{ "best_global_step": 49040, "best_metric": 0.13326387107372284, "best_model_checkpoint": "saves_multiple/ia3/llama-3-8b-instruct/train_multirc_123_1765143191/checkpoint-49040", "epoch": 20.0, "eval_steps": 6130, "global_step": 122600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008156606851549756, "grad_norm": 1.3724517822265625, "learning_rate": 1.631321370309951e-08, "loss": 1.8362, "num_input_tokens_seen": 8928, "step": 5 }, { "epoch": 0.0016313213703099511, "grad_norm": 8.012661933898926, "learning_rate": 3.67047308319739e-08, "loss": 1.1487, "num_input_tokens_seen": 20448, "step": 10 }, { "epoch": 0.0024469820554649264, "grad_norm": 3.141139268875122, "learning_rate": 5.709624796084829e-08, "loss": 0.5042, "num_input_tokens_seen": 31072, "step": 15 }, { "epoch": 0.0032626427406199023, "grad_norm": 9.434551239013672, "learning_rate": 7.748776508972267e-08, "loss": 1.0508, "num_input_tokens_seen": 42528, "step": 20 }, { "epoch": 0.004078303425774877, "grad_norm": 4.487924575805664, "learning_rate": 9.787928221859706e-08, "loss": 1.3011, "num_input_tokens_seen": 52576, "step": 25 }, { "epoch": 0.004893964110929853, "grad_norm": 7.248781204223633, "learning_rate": 1.1827079934747146e-07, "loss": 0.7666, "num_input_tokens_seen": 62944, "step": 30 }, { "epoch": 0.005709624796084829, "grad_norm": 2.4928395748138428, "learning_rate": 1.3866231647634585e-07, "loss": 0.7836, "num_input_tokens_seen": 73504, "step": 35 }, { "epoch": 0.0065252854812398045, "grad_norm": 7.441699981689453, "learning_rate": 1.5905383360522023e-07, "loss": 1.02, "num_input_tokens_seen": 84640, "step": 40 }, { "epoch": 0.00734094616639478, "grad_norm": 1.8872206211090088, "learning_rate": 1.7944535073409463e-07, "loss": 0.718, "num_input_tokens_seen": 96288, "step": 45 }, { "epoch": 0.008156606851549755, "grad_norm": 5.425693035125732, "learning_rate": 1.99836867862969e-07, "loss": 1.0312, "num_input_tokens_seen": 107360, "step": 50 }, { "epoch": 0.00897226753670473, "grad_norm": 6.1366753578186035, "learning_rate": 2.2022838499184342e-07, "loss": 0.8805, "num_input_tokens_seen": 118432, "step": 55 }, { "epoch": 0.009787928221859706, "grad_norm": 2.5215580463409424, "learning_rate": 2.406199021207178e-07, "loss": 1.0033, "num_input_tokens_seen": 128416, "step": 60 }, { "epoch": 0.010603588907014683, "grad_norm": 2.1202142238616943, "learning_rate": 2.6101141924959217e-07, "loss": 0.4411, "num_input_tokens_seen": 138432, "step": 65 }, { "epoch": 0.011419249592169658, "grad_norm": 8.061209678649902, "learning_rate": 2.814029363784666e-07, "loss": 0.6391, "num_input_tokens_seen": 149504, "step": 70 }, { "epoch": 0.012234910277324634, "grad_norm": 3.9366326332092285, "learning_rate": 3.01794453507341e-07, "loss": 0.6581, "num_input_tokens_seen": 160192, "step": 75 }, { "epoch": 0.013050570962479609, "grad_norm": 2.6036856174468994, "learning_rate": 3.2218597063621533e-07, "loss": 0.792, "num_input_tokens_seen": 170432, "step": 80 }, { "epoch": 0.013866231647634585, "grad_norm": 2.4476478099823, "learning_rate": 3.4257748776508974e-07, "loss": 0.6432, "num_input_tokens_seen": 180672, "step": 85 }, { "epoch": 0.01468189233278956, "grad_norm": 3.1391689777374268, "learning_rate": 3.6296900489396414e-07, "loss": 0.759, "num_input_tokens_seen": 192352, "step": 90 }, { "epoch": 0.015497553017944535, "grad_norm": 7.911917686462402, "learning_rate": 3.833605220228385e-07, "loss": 0.7326, "num_input_tokens_seen": 203040, "step": 95 }, { "epoch": 0.01631321370309951, "grad_norm": 5.05709171295166, "learning_rate": 4.037520391517129e-07, "loss": 1.0211, "num_input_tokens_seen": 214272, "step": 100 }, { "epoch": 0.017128874388254486, "grad_norm": 4.9513773918151855, "learning_rate": 4.241435562805873e-07, "loss": 0.6563, "num_input_tokens_seen": 225664, "step": 105 }, { "epoch": 0.01794453507340946, "grad_norm": 3.9999263286590576, "learning_rate": 4.4453507340946166e-07, "loss": 1.6699, "num_input_tokens_seen": 237216, "step": 110 }, { "epoch": 0.018760195758564437, "grad_norm": 7.440149307250977, "learning_rate": 4.649265905383361e-07, "loss": 0.7365, "num_input_tokens_seen": 246560, "step": 115 }, { "epoch": 0.01957585644371941, "grad_norm": 8.823700904846191, "learning_rate": 4.853181076672105e-07, "loss": 1.3923, "num_input_tokens_seen": 257184, "step": 120 }, { "epoch": 0.020391517128874388, "grad_norm": 6.814681053161621, "learning_rate": 5.057096247960849e-07, "loss": 0.7766, "num_input_tokens_seen": 268384, "step": 125 }, { "epoch": 0.021207177814029365, "grad_norm": 2.9726951122283936, "learning_rate": 5.261011419249592e-07, "loss": 0.5986, "num_input_tokens_seen": 279680, "step": 130 }, { "epoch": 0.02202283849918434, "grad_norm": 6.119279861450195, "learning_rate": 5.464926590538336e-07, "loss": 1.2881, "num_input_tokens_seen": 290688, "step": 135 }, { "epoch": 0.022838499184339316, "grad_norm": 7.167515754699707, "learning_rate": 5.66884176182708e-07, "loss": 0.9243, "num_input_tokens_seen": 301408, "step": 140 }, { "epoch": 0.02365415986949429, "grad_norm": 2.561433792114258, "learning_rate": 5.872756933115824e-07, "loss": 1.1213, "num_input_tokens_seen": 311968, "step": 145 }, { "epoch": 0.024469820554649267, "grad_norm": 5.748373031616211, "learning_rate": 6.076672104404568e-07, "loss": 1.4, "num_input_tokens_seen": 322080, "step": 150 }, { "epoch": 0.02528548123980424, "grad_norm": 5.343301773071289, "learning_rate": 6.280587275693313e-07, "loss": 1.2958, "num_input_tokens_seen": 333824, "step": 155 }, { "epoch": 0.026101141924959218, "grad_norm": 3.026973247528076, "learning_rate": 6.484502446982055e-07, "loss": 1.6995, "num_input_tokens_seen": 344896, "step": 160 }, { "epoch": 0.026916802610114192, "grad_norm": 2.9629669189453125, "learning_rate": 6.6884176182708e-07, "loss": 0.9158, "num_input_tokens_seen": 356192, "step": 165 }, { "epoch": 0.02773246329526917, "grad_norm": 3.135768413543701, "learning_rate": 6.892332789559543e-07, "loss": 0.6812, "num_input_tokens_seen": 367136, "step": 170 }, { "epoch": 0.028548123980424143, "grad_norm": 2.0323848724365234, "learning_rate": 7.096247960848288e-07, "loss": 0.7793, "num_input_tokens_seen": 376832, "step": 175 }, { "epoch": 0.02936378466557912, "grad_norm": 2.762800931930542, "learning_rate": 7.300163132137031e-07, "loss": 0.5227, "num_input_tokens_seen": 387552, "step": 180 }, { "epoch": 0.030179445350734094, "grad_norm": 9.03598403930664, "learning_rate": 7.504078303425776e-07, "loss": 1.0487, "num_input_tokens_seen": 398528, "step": 185 }, { "epoch": 0.03099510603588907, "grad_norm": 6.312353134155273, "learning_rate": 7.707993474714518e-07, "loss": 0.9109, "num_input_tokens_seen": 408896, "step": 190 }, { "epoch": 0.03181076672104405, "grad_norm": 2.2967073917388916, "learning_rate": 7.911908646003262e-07, "loss": 0.7836, "num_input_tokens_seen": 420416, "step": 195 }, { "epoch": 0.03262642740619902, "grad_norm": 9.12990951538086, "learning_rate": 8.115823817292006e-07, "loss": 0.6284, "num_input_tokens_seen": 430912, "step": 200 }, { "epoch": 0.033442088091353996, "grad_norm": 9.451518058776855, "learning_rate": 8.319738988580751e-07, "loss": 1.0304, "num_input_tokens_seen": 442176, "step": 205 }, { "epoch": 0.03425774877650897, "grad_norm": 5.518346309661865, "learning_rate": 8.523654159869495e-07, "loss": 0.6616, "num_input_tokens_seen": 454496, "step": 210 }, { "epoch": 0.03507340946166395, "grad_norm": 5.31643533706665, "learning_rate": 8.727569331158239e-07, "loss": 0.5149, "num_input_tokens_seen": 464896, "step": 215 }, { "epoch": 0.03588907014681892, "grad_norm": 3.912665843963623, "learning_rate": 8.931484502446981e-07, "loss": 1.2458, "num_input_tokens_seen": 477024, "step": 220 }, { "epoch": 0.0367047308319739, "grad_norm": 8.151066780090332, "learning_rate": 9.135399673735726e-07, "loss": 0.7705, "num_input_tokens_seen": 487776, "step": 225 }, { "epoch": 0.037520391517128875, "grad_norm": 1.733149766921997, "learning_rate": 9.339314845024471e-07, "loss": 0.7368, "num_input_tokens_seen": 498528, "step": 230 }, { "epoch": 0.03833605220228385, "grad_norm": 3.6186559200286865, "learning_rate": 9.543230016313214e-07, "loss": 0.4768, "num_input_tokens_seen": 509184, "step": 235 }, { "epoch": 0.03915171288743882, "grad_norm": 3.256819725036621, "learning_rate": 9.747145187601957e-07, "loss": 0.7518, "num_input_tokens_seen": 520224, "step": 240 }, { "epoch": 0.0399673735725938, "grad_norm": 7.918660640716553, "learning_rate": 9.951060358890701e-07, "loss": 1.7933, "num_input_tokens_seen": 530464, "step": 245 }, { "epoch": 0.040783034257748776, "grad_norm": 8.040850639343262, "learning_rate": 1.0154975530179446e-06, "loss": 1.3675, "num_input_tokens_seen": 541952, "step": 250 }, { "epoch": 0.041598694942903754, "grad_norm": 2.639321804046631, "learning_rate": 1.035889070146819e-06, "loss": 0.5919, "num_input_tokens_seen": 551424, "step": 255 }, { "epoch": 0.04241435562805873, "grad_norm": 5.112524032592773, "learning_rate": 1.0562805872756933e-06, "loss": 1.4905, "num_input_tokens_seen": 561440, "step": 260 }, { "epoch": 0.0432300163132137, "grad_norm": 6.7127532958984375, "learning_rate": 1.0766721044045677e-06, "loss": 1.1333, "num_input_tokens_seen": 573440, "step": 265 }, { "epoch": 0.04404567699836868, "grad_norm": 4.3217453956604, "learning_rate": 1.0970636215334422e-06, "loss": 0.5284, "num_input_tokens_seen": 584032, "step": 270 }, { "epoch": 0.044861337683523655, "grad_norm": 9.712747573852539, "learning_rate": 1.1174551386623167e-06, "loss": 0.9837, "num_input_tokens_seen": 595328, "step": 275 }, { "epoch": 0.04567699836867863, "grad_norm": 3.402402639389038, "learning_rate": 1.137846655791191e-06, "loss": 1.2619, "num_input_tokens_seen": 605760, "step": 280 }, { "epoch": 0.0464926590538336, "grad_norm": 4.555566787719727, "learning_rate": 1.1582381729200654e-06, "loss": 0.839, "num_input_tokens_seen": 616384, "step": 285 }, { "epoch": 0.04730831973898858, "grad_norm": 3.3675262928009033, "learning_rate": 1.1786296900489396e-06, "loss": 0.8221, "num_input_tokens_seen": 626880, "step": 290 }, { "epoch": 0.04812398042414356, "grad_norm": 1.739200472831726, "learning_rate": 1.199021207177814e-06, "loss": 1.0448, "num_input_tokens_seen": 637536, "step": 295 }, { "epoch": 0.048939641109298535, "grad_norm": 8.000541687011719, "learning_rate": 1.2194127243066883e-06, "loss": 0.9338, "num_input_tokens_seen": 648000, "step": 300 }, { "epoch": 0.049755301794453505, "grad_norm": 3.3771164417266846, "learning_rate": 1.2398042414355628e-06, "loss": 1.5579, "num_input_tokens_seen": 659648, "step": 305 }, { "epoch": 0.05057096247960848, "grad_norm": 3.4884636402130127, "learning_rate": 1.2601957585644372e-06, "loss": 0.773, "num_input_tokens_seen": 669920, "step": 310 }, { "epoch": 0.05138662316476346, "grad_norm": 4.067154884338379, "learning_rate": 1.2805872756933117e-06, "loss": 1.0225, "num_input_tokens_seen": 681216, "step": 315 }, { "epoch": 0.052202283849918436, "grad_norm": 4.723116397857666, "learning_rate": 1.3009787928221861e-06, "loss": 1.2867, "num_input_tokens_seen": 691872, "step": 320 }, { "epoch": 0.05301794453507341, "grad_norm": 6.65973424911499, "learning_rate": 1.3213703099510604e-06, "loss": 0.6387, "num_input_tokens_seen": 702208, "step": 325 }, { "epoch": 0.053833605220228384, "grad_norm": 5.462034225463867, "learning_rate": 1.3417618270799348e-06, "loss": 0.8364, "num_input_tokens_seen": 712288, "step": 330 }, { "epoch": 0.05464926590538336, "grad_norm": 10.062926292419434, "learning_rate": 1.3621533442088093e-06, "loss": 1.5123, "num_input_tokens_seen": 722976, "step": 335 }, { "epoch": 0.05546492659053834, "grad_norm": 2.9235305786132812, "learning_rate": 1.3825448613376838e-06, "loss": 0.7639, "num_input_tokens_seen": 733760, "step": 340 }, { "epoch": 0.05628058727569331, "grad_norm": 3.005859375, "learning_rate": 1.402936378466558e-06, "loss": 1.0048, "num_input_tokens_seen": 743520, "step": 345 }, { "epoch": 0.057096247960848286, "grad_norm": 2.3126766681671143, "learning_rate": 1.4233278955954323e-06, "loss": 0.2781, "num_input_tokens_seen": 753984, "step": 350 }, { "epoch": 0.05791190864600326, "grad_norm": 3.495432138442993, "learning_rate": 1.4437194127243067e-06, "loss": 0.6741, "num_input_tokens_seen": 765376, "step": 355 }, { "epoch": 0.05872756933115824, "grad_norm": 4.609643459320068, "learning_rate": 1.4641109298531812e-06, "loss": 1.2081, "num_input_tokens_seen": 775904, "step": 360 }, { "epoch": 0.05954323001631321, "grad_norm": 2.5028162002563477, "learning_rate": 1.4845024469820554e-06, "loss": 1.1012, "num_input_tokens_seen": 788000, "step": 365 }, { "epoch": 0.06035889070146819, "grad_norm": 7.756853103637695, "learning_rate": 1.5048939641109299e-06, "loss": 1.3682, "num_input_tokens_seen": 798816, "step": 370 }, { "epoch": 0.061174551386623165, "grad_norm": 2.2152316570281982, "learning_rate": 1.5252854812398043e-06, "loss": 0.5359, "num_input_tokens_seen": 809856, "step": 375 }, { "epoch": 0.06199021207177814, "grad_norm": 2.457235336303711, "learning_rate": 1.5456769983686788e-06, "loss": 0.8371, "num_input_tokens_seen": 820448, "step": 380 }, { "epoch": 0.06280587275693311, "grad_norm": 7.175193786621094, "learning_rate": 1.566068515497553e-06, "loss": 1.4792, "num_input_tokens_seen": 830624, "step": 385 }, { "epoch": 0.0636215334420881, "grad_norm": 2.560544729232788, "learning_rate": 1.5864600326264277e-06, "loss": 0.9351, "num_input_tokens_seen": 840224, "step": 390 }, { "epoch": 0.06443719412724307, "grad_norm": 6.841041088104248, "learning_rate": 1.606851549755302e-06, "loss": 0.9432, "num_input_tokens_seen": 850176, "step": 395 }, { "epoch": 0.06525285481239804, "grad_norm": 3.937791585922241, "learning_rate": 1.6272430668841762e-06, "loss": 0.6393, "num_input_tokens_seen": 861056, "step": 400 }, { "epoch": 0.06606851549755302, "grad_norm": 3.08048939704895, "learning_rate": 1.6476345840130507e-06, "loss": 1.173, "num_input_tokens_seen": 871360, "step": 405 }, { "epoch": 0.06688417618270799, "grad_norm": 10.277064323425293, "learning_rate": 1.668026101141925e-06, "loss": 2.0493, "num_input_tokens_seen": 882656, "step": 410 }, { "epoch": 0.06769983686786298, "grad_norm": 3.225813388824463, "learning_rate": 1.6884176182707994e-06, "loss": 0.8889, "num_input_tokens_seen": 893952, "step": 415 }, { "epoch": 0.06851549755301795, "grad_norm": 6.246204376220703, "learning_rate": 1.7088091353996738e-06, "loss": 0.8155, "num_input_tokens_seen": 905472, "step": 420 }, { "epoch": 0.06933115823817292, "grad_norm": 2.387099027633667, "learning_rate": 1.729200652528548e-06, "loss": 0.9512, "num_input_tokens_seen": 916000, "step": 425 }, { "epoch": 0.0701468189233279, "grad_norm": 3.699512004852295, "learning_rate": 1.7495921696574227e-06, "loss": 1.0717, "num_input_tokens_seen": 926944, "step": 430 }, { "epoch": 0.07096247960848287, "grad_norm": 3.728041410446167, "learning_rate": 1.769983686786297e-06, "loss": 1.1159, "num_input_tokens_seen": 938496, "step": 435 }, { "epoch": 0.07177814029363784, "grad_norm": 1.8079861402511597, "learning_rate": 1.7903752039151712e-06, "loss": 1.1045, "num_input_tokens_seen": 949472, "step": 440 }, { "epoch": 0.07259380097879282, "grad_norm": 8.16455364227295, "learning_rate": 1.810766721044046e-06, "loss": 1.1812, "num_input_tokens_seen": 959712, "step": 445 }, { "epoch": 0.0734094616639478, "grad_norm": 7.215249061584473, "learning_rate": 1.8311582381729201e-06, "loss": 1.5608, "num_input_tokens_seen": 969888, "step": 450 }, { "epoch": 0.07422512234910278, "grad_norm": 3.456813335418701, "learning_rate": 1.8515497553017948e-06, "loss": 1.218, "num_input_tokens_seen": 981024, "step": 455 }, { "epoch": 0.07504078303425775, "grad_norm": 8.460956573486328, "learning_rate": 1.871941272430669e-06, "loss": 0.6864, "num_input_tokens_seen": 991680, "step": 460 }, { "epoch": 0.07585644371941272, "grad_norm": 4.067637920379639, "learning_rate": 1.8923327895595433e-06, "loss": 1.3527, "num_input_tokens_seen": 1003008, "step": 465 }, { "epoch": 0.0766721044045677, "grad_norm": 2.275658130645752, "learning_rate": 1.9127243066884178e-06, "loss": 0.7279, "num_input_tokens_seen": 1014336, "step": 470 }, { "epoch": 0.07748776508972267, "grad_norm": 5.116060733795166, "learning_rate": 1.933115823817292e-06, "loss": 1.323, "num_input_tokens_seen": 1024960, "step": 475 }, { "epoch": 0.07830342577487764, "grad_norm": 2.831258535385132, "learning_rate": 1.9535073409461663e-06, "loss": 0.4815, "num_input_tokens_seen": 1034880, "step": 480 }, { "epoch": 0.07911908646003263, "grad_norm": 6.077630043029785, "learning_rate": 1.973898858075041e-06, "loss": 1.6396, "num_input_tokens_seen": 1046112, "step": 485 }, { "epoch": 0.0799347471451876, "grad_norm": 7.082154273986816, "learning_rate": 1.994290375203915e-06, "loss": 1.3603, "num_input_tokens_seen": 1056864, "step": 490 }, { "epoch": 0.08075040783034258, "grad_norm": 2.1560025215148926, "learning_rate": 2.01468189233279e-06, "loss": 0.8702, "num_input_tokens_seen": 1067072, "step": 495 }, { "epoch": 0.08156606851549755, "grad_norm": 6.233649253845215, "learning_rate": 2.035073409461664e-06, "loss": 0.8868, "num_input_tokens_seen": 1079136, "step": 500 }, { "epoch": 0.08238172920065252, "grad_norm": 3.2148211002349854, "learning_rate": 2.0554649265905383e-06, "loss": 1.2583, "num_input_tokens_seen": 1089408, "step": 505 }, { "epoch": 0.08319738988580751, "grad_norm": 4.365433692932129, "learning_rate": 2.075856443719413e-06, "loss": 0.618, "num_input_tokens_seen": 1099488, "step": 510 }, { "epoch": 0.08401305057096248, "grad_norm": 6.860179901123047, "learning_rate": 2.0962479608482872e-06, "loss": 1.2969, "num_input_tokens_seen": 1110464, "step": 515 }, { "epoch": 0.08482871125611746, "grad_norm": 7.741820812225342, "learning_rate": 2.1166394779771615e-06, "loss": 0.7165, "num_input_tokens_seen": 1121216, "step": 520 }, { "epoch": 0.08564437194127243, "grad_norm": 8.306597709655762, "learning_rate": 2.137030995106036e-06, "loss": 1.1827, "num_input_tokens_seen": 1132192, "step": 525 }, { "epoch": 0.0864600326264274, "grad_norm": 4.382352828979492, "learning_rate": 2.1574225122349104e-06, "loss": 1.2209, "num_input_tokens_seen": 1142944, "step": 530 }, { "epoch": 0.08727569331158239, "grad_norm": 6.661376953125, "learning_rate": 2.177814029363785e-06, "loss": 1.1577, "num_input_tokens_seen": 1154080, "step": 535 }, { "epoch": 0.08809135399673736, "grad_norm": 5.086134910583496, "learning_rate": 2.1982055464926593e-06, "loss": 1.0745, "num_input_tokens_seen": 1165120, "step": 540 }, { "epoch": 0.08890701468189233, "grad_norm": 4.300531387329102, "learning_rate": 2.2185970636215336e-06, "loss": 0.7636, "num_input_tokens_seen": 1176352, "step": 545 }, { "epoch": 0.08972267536704731, "grad_norm": 4.058936595916748, "learning_rate": 2.238988580750408e-06, "loss": 0.4356, "num_input_tokens_seen": 1188288, "step": 550 }, { "epoch": 0.09053833605220228, "grad_norm": 2.540249824523926, "learning_rate": 2.2593800978792825e-06, "loss": 1.0234, "num_input_tokens_seen": 1200800, "step": 555 }, { "epoch": 0.09135399673735727, "grad_norm": 6.594124794006348, "learning_rate": 2.2797716150081567e-06, "loss": 1.0266, "num_input_tokens_seen": 1211360, "step": 560 }, { "epoch": 0.09216965742251224, "grad_norm": 2.661931276321411, "learning_rate": 2.300163132137031e-06, "loss": 0.6731, "num_input_tokens_seen": 1221184, "step": 565 }, { "epoch": 0.0929853181076672, "grad_norm": 2.7605957984924316, "learning_rate": 2.3205546492659052e-06, "loss": 1.2521, "num_input_tokens_seen": 1231808, "step": 570 }, { "epoch": 0.09380097879282219, "grad_norm": 4.789207935333252, "learning_rate": 2.34094616639478e-06, "loss": 0.5523, "num_input_tokens_seen": 1242560, "step": 575 }, { "epoch": 0.09461663947797716, "grad_norm": 2.632214307785034, "learning_rate": 2.361337683523654e-06, "loss": 0.5343, "num_input_tokens_seen": 1253440, "step": 580 }, { "epoch": 0.09543230016313213, "grad_norm": 6.561474323272705, "learning_rate": 2.3817292006525284e-06, "loss": 1.5794, "num_input_tokens_seen": 1263616, "step": 585 }, { "epoch": 0.09624796084828711, "grad_norm": 5.505043983459473, "learning_rate": 2.402120717781403e-06, "loss": 0.5237, "num_input_tokens_seen": 1275488, "step": 590 }, { "epoch": 0.09706362153344208, "grad_norm": 2.9514360427856445, "learning_rate": 2.4225122349102773e-06, "loss": 0.8767, "num_input_tokens_seen": 1287200, "step": 595 }, { "epoch": 0.09787928221859707, "grad_norm": 3.3899738788604736, "learning_rate": 2.442903752039152e-06, "loss": 1.3958, "num_input_tokens_seen": 1298336, "step": 600 }, { "epoch": 0.09869494290375204, "grad_norm": 5.640759468078613, "learning_rate": 2.4632952691680262e-06, "loss": 1.0729, "num_input_tokens_seen": 1309056, "step": 605 }, { "epoch": 0.09951060358890701, "grad_norm": 6.04180908203125, "learning_rate": 2.4836867862969005e-06, "loss": 0.7382, "num_input_tokens_seen": 1320864, "step": 610 }, { "epoch": 0.100326264274062, "grad_norm": 4.79779577255249, "learning_rate": 2.504078303425775e-06, "loss": 1.3793, "num_input_tokens_seen": 1332448, "step": 615 }, { "epoch": 0.10114192495921696, "grad_norm": 4.414719104766846, "learning_rate": 2.5244698205546494e-06, "loss": 1.1724, "num_input_tokens_seen": 1342368, "step": 620 }, { "epoch": 0.10195758564437195, "grad_norm": 7.5571160316467285, "learning_rate": 2.5448613376835236e-06, "loss": 0.9557, "num_input_tokens_seen": 1353024, "step": 625 }, { "epoch": 0.10277324632952692, "grad_norm": 7.056685924530029, "learning_rate": 2.5652528548123983e-06, "loss": 0.9486, "num_input_tokens_seen": 1364000, "step": 630 }, { "epoch": 0.10358890701468189, "grad_norm": 6.69120979309082, "learning_rate": 2.5856443719412725e-06, "loss": 1.4974, "num_input_tokens_seen": 1374784, "step": 635 }, { "epoch": 0.10440456769983687, "grad_norm": 2.072876214981079, "learning_rate": 2.6060358890701472e-06, "loss": 0.8678, "num_input_tokens_seen": 1384192, "step": 640 }, { "epoch": 0.10522022838499184, "grad_norm": 3.144529104232788, "learning_rate": 2.6264274061990215e-06, "loss": 0.7582, "num_input_tokens_seen": 1395392, "step": 645 }, { "epoch": 0.10603588907014681, "grad_norm": 5.4254655838012695, "learning_rate": 2.6468189233278957e-06, "loss": 0.7403, "num_input_tokens_seen": 1406208, "step": 650 }, { "epoch": 0.1068515497553018, "grad_norm": 4.902912616729736, "learning_rate": 2.6672104404567704e-06, "loss": 0.6243, "num_input_tokens_seen": 1417024, "step": 655 }, { "epoch": 0.10766721044045677, "grad_norm": 3.5322487354278564, "learning_rate": 2.6876019575856446e-06, "loss": 0.9954, "num_input_tokens_seen": 1427808, "step": 660 }, { "epoch": 0.10848287112561175, "grad_norm": 2.258141040802002, "learning_rate": 2.707993474714519e-06, "loss": 0.5469, "num_input_tokens_seen": 1437344, "step": 665 }, { "epoch": 0.10929853181076672, "grad_norm": 5.760447978973389, "learning_rate": 2.728384991843393e-06, "loss": 1.0456, "num_input_tokens_seen": 1448352, "step": 670 }, { "epoch": 0.11011419249592169, "grad_norm": 3.7249224185943604, "learning_rate": 2.7487765089722678e-06, "loss": 0.5595, "num_input_tokens_seen": 1459520, "step": 675 }, { "epoch": 0.11092985318107668, "grad_norm": 4.291210174560547, "learning_rate": 2.769168026101142e-06, "loss": 0.7455, "num_input_tokens_seen": 1470432, "step": 680 }, { "epoch": 0.11174551386623165, "grad_norm": 2.9785103797912598, "learning_rate": 2.7895595432300163e-06, "loss": 0.7005, "num_input_tokens_seen": 1480544, "step": 685 }, { "epoch": 0.11256117455138662, "grad_norm": 6.005767822265625, "learning_rate": 2.8099510603588905e-06, "loss": 1.0932, "num_input_tokens_seen": 1490752, "step": 690 }, { "epoch": 0.1133768352365416, "grad_norm": 8.193732261657715, "learning_rate": 2.830342577487765e-06, "loss": 0.9863, "num_input_tokens_seen": 1502240, "step": 695 }, { "epoch": 0.11419249592169657, "grad_norm": 4.903045177459717, "learning_rate": 2.8507340946166394e-06, "loss": 1.2023, "num_input_tokens_seen": 1512256, "step": 700 }, { "epoch": 0.11500815660685156, "grad_norm": 2.055464029312134, "learning_rate": 2.871125611745514e-06, "loss": 1.0729, "num_input_tokens_seen": 1523168, "step": 705 }, { "epoch": 0.11582381729200653, "grad_norm": 2.8402328491210938, "learning_rate": 2.8915171288743884e-06, "loss": 0.5163, "num_input_tokens_seen": 1534976, "step": 710 }, { "epoch": 0.1166394779771615, "grad_norm": 2.7248482704162598, "learning_rate": 2.9119086460032626e-06, "loss": 1.1814, "num_input_tokens_seen": 1545856, "step": 715 }, { "epoch": 0.11745513866231648, "grad_norm": 2.1035971641540527, "learning_rate": 2.9323001631321373e-06, "loss": 0.3174, "num_input_tokens_seen": 1555680, "step": 720 }, { "epoch": 0.11827079934747145, "grad_norm": 1.4461320638656616, "learning_rate": 2.9526916802610115e-06, "loss": 0.3697, "num_input_tokens_seen": 1567328, "step": 725 }, { "epoch": 0.11908646003262642, "grad_norm": 4.433595180511475, "learning_rate": 2.9730831973898858e-06, "loss": 1.3134, "num_input_tokens_seen": 1577184, "step": 730 }, { "epoch": 0.1199021207177814, "grad_norm": 9.404504776000977, "learning_rate": 2.9934747145187604e-06, "loss": 1.1673, "num_input_tokens_seen": 1588544, "step": 735 }, { "epoch": 0.12071778140293637, "grad_norm": 3.795610189437866, "learning_rate": 3.0138662316476347e-06, "loss": 0.8974, "num_input_tokens_seen": 1599648, "step": 740 }, { "epoch": 0.12153344208809136, "grad_norm": 3.076533079147339, "learning_rate": 3.0342577487765094e-06, "loss": 1.2286, "num_input_tokens_seen": 1612096, "step": 745 }, { "epoch": 0.12234910277324633, "grad_norm": 2.6914947032928467, "learning_rate": 3.0546492659053836e-06, "loss": 0.6522, "num_input_tokens_seen": 1623584, "step": 750 }, { "epoch": 0.1231647634584013, "grad_norm": 6.979761600494385, "learning_rate": 3.075040783034258e-06, "loss": 1.1558, "num_input_tokens_seen": 1634688, "step": 755 }, { "epoch": 0.12398042414355628, "grad_norm": 3.603928804397583, "learning_rate": 3.0954323001631325e-06, "loss": 1.3778, "num_input_tokens_seen": 1645600, "step": 760 }, { "epoch": 0.12479608482871125, "grad_norm": 5.718084335327148, "learning_rate": 3.1158238172920068e-06, "loss": 0.5182, "num_input_tokens_seen": 1655968, "step": 765 }, { "epoch": 0.12561174551386622, "grad_norm": 4.596571922302246, "learning_rate": 3.1362153344208814e-06, "loss": 1.3224, "num_input_tokens_seen": 1666848, "step": 770 }, { "epoch": 0.1264274061990212, "grad_norm": 8.877181053161621, "learning_rate": 3.1566068515497553e-06, "loss": 1.0659, "num_input_tokens_seen": 1676512, "step": 775 }, { "epoch": 0.1272430668841762, "grad_norm": 2.707555055618286, "learning_rate": 3.17699836867863e-06, "loss": 1.0524, "num_input_tokens_seen": 1687488, "step": 780 }, { "epoch": 0.12805872756933115, "grad_norm": 3.1155269145965576, "learning_rate": 3.197389885807504e-06, "loss": 1.3103, "num_input_tokens_seen": 1697312, "step": 785 }, { "epoch": 0.12887438825448613, "grad_norm": 7.497936248779297, "learning_rate": 3.2177814029363784e-06, "loss": 1.1716, "num_input_tokens_seen": 1708096, "step": 790 }, { "epoch": 0.12969004893964112, "grad_norm": 2.5311765670776367, "learning_rate": 3.238172920065253e-06, "loss": 0.7693, "num_input_tokens_seen": 1717824, "step": 795 }, { "epoch": 0.13050570962479607, "grad_norm": 5.427221298217773, "learning_rate": 3.2585644371941273e-06, "loss": 1.7591, "num_input_tokens_seen": 1728608, "step": 800 }, { "epoch": 0.13132137030995106, "grad_norm": 8.121956825256348, "learning_rate": 3.278955954323002e-06, "loss": 1.1294, "num_input_tokens_seen": 1739456, "step": 805 }, { "epoch": 0.13213703099510604, "grad_norm": 4.399386882781982, "learning_rate": 3.299347471451876e-06, "loss": 1.0482, "num_input_tokens_seen": 1750304, "step": 810 }, { "epoch": 0.132952691680261, "grad_norm": 7.496412754058838, "learning_rate": 3.3197389885807505e-06, "loss": 0.7904, "num_input_tokens_seen": 1762496, "step": 815 }, { "epoch": 0.13376835236541598, "grad_norm": 4.141295909881592, "learning_rate": 3.340130505709625e-06, "loss": 1.2006, "num_input_tokens_seen": 1772640, "step": 820 }, { "epoch": 0.13458401305057097, "grad_norm": 2.7353579998016357, "learning_rate": 3.360522022838499e-06, "loss": 0.7774, "num_input_tokens_seen": 1782272, "step": 825 }, { "epoch": 0.13539967373572595, "grad_norm": 1.775003433227539, "learning_rate": 3.3809135399673737e-06, "loss": 0.8329, "num_input_tokens_seen": 1793120, "step": 830 }, { "epoch": 0.1362153344208809, "grad_norm": 2.9494428634643555, "learning_rate": 3.4013050570962483e-06, "loss": 1.004, "num_input_tokens_seen": 1803264, "step": 835 }, { "epoch": 0.1370309951060359, "grad_norm": 4.140613555908203, "learning_rate": 3.421696574225122e-06, "loss": 0.3828, "num_input_tokens_seen": 1813088, "step": 840 }, { "epoch": 0.13784665579119088, "grad_norm": 3.2531661987304688, "learning_rate": 3.442088091353997e-06, "loss": 1.1141, "num_input_tokens_seen": 1824032, "step": 845 }, { "epoch": 0.13866231647634583, "grad_norm": 5.305840015411377, "learning_rate": 3.4624796084828715e-06, "loss": 1.3558, "num_input_tokens_seen": 1836512, "step": 850 }, { "epoch": 0.13947797716150082, "grad_norm": 5.5439276695251465, "learning_rate": 3.4828711256117453e-06, "loss": 0.8258, "num_input_tokens_seen": 1848256, "step": 855 }, { "epoch": 0.1402936378466558, "grad_norm": 4.403939247131348, "learning_rate": 3.50326264274062e-06, "loss": 0.7032, "num_input_tokens_seen": 1859040, "step": 860 }, { "epoch": 0.14110929853181076, "grad_norm": 2.7750277519226074, "learning_rate": 3.5236541598694946e-06, "loss": 0.7897, "num_input_tokens_seen": 1869248, "step": 865 }, { "epoch": 0.14192495921696574, "grad_norm": 5.631011962890625, "learning_rate": 3.5440456769983693e-06, "loss": 1.4054, "num_input_tokens_seen": 1881184, "step": 870 }, { "epoch": 0.14274061990212072, "grad_norm": 2.605897903442383, "learning_rate": 3.564437194127243e-06, "loss": 0.9772, "num_input_tokens_seen": 1893760, "step": 875 }, { "epoch": 0.14355628058727568, "grad_norm": 4.393505096435547, "learning_rate": 3.584828711256118e-06, "loss": 0.823, "num_input_tokens_seen": 1904800, "step": 880 }, { "epoch": 0.14437194127243066, "grad_norm": 4.008682727813721, "learning_rate": 3.6052202283849925e-06, "loss": 0.7852, "num_input_tokens_seen": 1915296, "step": 885 }, { "epoch": 0.14518760195758565, "grad_norm": 11.65932559967041, "learning_rate": 3.6256117455138663e-06, "loss": 1.5265, "num_input_tokens_seen": 1925152, "step": 890 }, { "epoch": 0.14600326264274063, "grad_norm": 5.75033712387085, "learning_rate": 3.646003262642741e-06, "loss": 0.8923, "num_input_tokens_seen": 1935200, "step": 895 }, { "epoch": 0.1468189233278956, "grad_norm": 2.5531187057495117, "learning_rate": 3.6663947797716152e-06, "loss": 1.1944, "num_input_tokens_seen": 1945632, "step": 900 }, { "epoch": 0.14763458401305057, "grad_norm": 7.313605308532715, "learning_rate": 3.6867862969004895e-06, "loss": 1.6921, "num_input_tokens_seen": 1956992, "step": 905 }, { "epoch": 0.14845024469820556, "grad_norm": 3.625026226043701, "learning_rate": 3.707177814029364e-06, "loss": 1.2355, "num_input_tokens_seen": 1968800, "step": 910 }, { "epoch": 0.14926590538336051, "grad_norm": 3.149094343185425, "learning_rate": 3.7275693311582384e-06, "loss": 1.0466, "num_input_tokens_seen": 1979424, "step": 915 }, { "epoch": 0.1500815660685155, "grad_norm": 2.6502768993377686, "learning_rate": 3.7479608482871126e-06, "loss": 1.4593, "num_input_tokens_seen": 1990976, "step": 920 }, { "epoch": 0.15089722675367048, "grad_norm": 6.204927444458008, "learning_rate": 3.768352365415987e-06, "loss": 0.8367, "num_input_tokens_seen": 2002432, "step": 925 }, { "epoch": 0.15171288743882544, "grad_norm": 2.758460760116577, "learning_rate": 3.7887438825448615e-06, "loss": 0.8898, "num_input_tokens_seen": 2013632, "step": 930 }, { "epoch": 0.15252854812398042, "grad_norm": 6.596063613891602, "learning_rate": 3.8091353996737362e-06, "loss": 1.3758, "num_input_tokens_seen": 2024960, "step": 935 }, { "epoch": 0.1533442088091354, "grad_norm": 8.624046325683594, "learning_rate": 3.8295269168026105e-06, "loss": 1.0327, "num_input_tokens_seen": 2035712, "step": 940 }, { "epoch": 0.15415986949429036, "grad_norm": 2.8915867805480957, "learning_rate": 3.849918433931485e-06, "loss": 0.6374, "num_input_tokens_seen": 2046656, "step": 945 }, { "epoch": 0.15497553017944535, "grad_norm": 2.981114149093628, "learning_rate": 3.870309951060359e-06, "loss": 1.4447, "num_input_tokens_seen": 2058112, "step": 950 }, { "epoch": 0.15579119086460033, "grad_norm": 5.024288177490234, "learning_rate": 3.890701468189234e-06, "loss": 1.1959, "num_input_tokens_seen": 2068768, "step": 955 }, { "epoch": 0.1566068515497553, "grad_norm": 10.004108428955078, "learning_rate": 3.911092985318108e-06, "loss": 1.3193, "num_input_tokens_seen": 2080928, "step": 960 }, { "epoch": 0.15742251223491027, "grad_norm": 4.610642433166504, "learning_rate": 3.931484502446982e-06, "loss": 1.1185, "num_input_tokens_seen": 2090784, "step": 965 }, { "epoch": 0.15823817292006526, "grad_norm": 7.469626426696777, "learning_rate": 3.951876019575857e-06, "loss": 0.6389, "num_input_tokens_seen": 2100416, "step": 970 }, { "epoch": 0.15905383360522024, "grad_norm": 4.5558061599731445, "learning_rate": 3.972267536704731e-06, "loss": 0.5067, "num_input_tokens_seen": 2111424, "step": 975 }, { "epoch": 0.1598694942903752, "grad_norm": 2.2577614784240723, "learning_rate": 3.992659053833605e-06, "loss": 0.7008, "num_input_tokens_seen": 2122176, "step": 980 }, { "epoch": 0.16068515497553018, "grad_norm": 3.14097261428833, "learning_rate": 4.013050570962479e-06, "loss": 0.5742, "num_input_tokens_seen": 2132928, "step": 985 }, { "epoch": 0.16150081566068517, "grad_norm": 4.141494274139404, "learning_rate": 4.033442088091354e-06, "loss": 0.534, "num_input_tokens_seen": 2144480, "step": 990 }, { "epoch": 0.16231647634584012, "grad_norm": 3.573063850402832, "learning_rate": 4.0538336052202284e-06, "loss": 0.7764, "num_input_tokens_seen": 2154272, "step": 995 }, { "epoch": 0.1631321370309951, "grad_norm": 4.24932861328125, "learning_rate": 4.074225122349102e-06, "loss": 0.2635, "num_input_tokens_seen": 2163936, "step": 1000 }, { "epoch": 0.1639477977161501, "grad_norm": 3.2050375938415527, "learning_rate": 4.094616639477977e-06, "loss": 0.9814, "num_input_tokens_seen": 2175392, "step": 1005 }, { "epoch": 0.16476345840130505, "grad_norm": 2.442913293838501, "learning_rate": 4.115008156606852e-06, "loss": 1.0027, "num_input_tokens_seen": 2185120, "step": 1010 }, { "epoch": 0.16557911908646003, "grad_norm": 6.3474812507629395, "learning_rate": 4.135399673735726e-06, "loss": 1.4309, "num_input_tokens_seen": 2196864, "step": 1015 }, { "epoch": 0.16639477977161501, "grad_norm": 10.543128967285156, "learning_rate": 4.1557911908646e-06, "loss": 1.102, "num_input_tokens_seen": 2207744, "step": 1020 }, { "epoch": 0.16721044045676997, "grad_norm": 3.176244020462036, "learning_rate": 4.176182707993475e-06, "loss": 1.8282, "num_input_tokens_seen": 2218528, "step": 1025 }, { "epoch": 0.16802610114192496, "grad_norm": 11.791255950927734, "learning_rate": 4.1965742251223494e-06, "loss": 0.9121, "num_input_tokens_seen": 2228320, "step": 1030 }, { "epoch": 0.16884176182707994, "grad_norm": 5.114573955535889, "learning_rate": 4.216965742251223e-06, "loss": 1.0336, "num_input_tokens_seen": 2238464, "step": 1035 }, { "epoch": 0.16965742251223492, "grad_norm": 1.9803240299224854, "learning_rate": 4.237357259380098e-06, "loss": 0.6937, "num_input_tokens_seen": 2249440, "step": 1040 }, { "epoch": 0.17047308319738988, "grad_norm": 3.1710615158081055, "learning_rate": 4.257748776508973e-06, "loss": 0.9229, "num_input_tokens_seen": 2260576, "step": 1045 }, { "epoch": 0.17128874388254486, "grad_norm": 4.21212100982666, "learning_rate": 4.278140293637846e-06, "loss": 0.6676, "num_input_tokens_seen": 2271904, "step": 1050 }, { "epoch": 0.17210440456769985, "grad_norm": 3.8057854175567627, "learning_rate": 4.298531810766721e-06, "loss": 0.6131, "num_input_tokens_seen": 2281568, "step": 1055 }, { "epoch": 0.1729200652528548, "grad_norm": 3.4884297847747803, "learning_rate": 4.318923327895596e-06, "loss": 0.5354, "num_input_tokens_seen": 2292704, "step": 1060 }, { "epoch": 0.1737357259380098, "grad_norm": 2.7731664180755615, "learning_rate": 4.33931484502447e-06, "loss": 0.7002, "num_input_tokens_seen": 2303904, "step": 1065 }, { "epoch": 0.17455138662316477, "grad_norm": 3.13751482963562, "learning_rate": 4.359706362153344e-06, "loss": 0.4427, "num_input_tokens_seen": 2314336, "step": 1070 }, { "epoch": 0.17536704730831973, "grad_norm": 1.5175836086273193, "learning_rate": 4.380097879282219e-06, "loss": 0.461, "num_input_tokens_seen": 2325024, "step": 1075 }, { "epoch": 0.1761827079934747, "grad_norm": 2.0806100368499756, "learning_rate": 4.400489396411094e-06, "loss": 0.6967, "num_input_tokens_seen": 2336224, "step": 1080 }, { "epoch": 0.1769983686786297, "grad_norm": 2.5255484580993652, "learning_rate": 4.420880913539967e-06, "loss": 0.4496, "num_input_tokens_seen": 2346880, "step": 1085 }, { "epoch": 0.17781402936378465, "grad_norm": 2.264073133468628, "learning_rate": 4.441272430668842e-06, "loss": 0.5866, "num_input_tokens_seen": 2357632, "step": 1090 }, { "epoch": 0.17862969004893964, "grad_norm": 2.1738240718841553, "learning_rate": 4.461663947797717e-06, "loss": 0.5659, "num_input_tokens_seen": 2370592, "step": 1095 }, { "epoch": 0.17944535073409462, "grad_norm": 4.569738388061523, "learning_rate": 4.4820554649265906e-06, "loss": 0.9621, "num_input_tokens_seen": 2381024, "step": 1100 }, { "epoch": 0.1802610114192496, "grad_norm": 5.2099289894104, "learning_rate": 4.502446982055465e-06, "loss": 0.5808, "num_input_tokens_seen": 2392160, "step": 1105 }, { "epoch": 0.18107667210440456, "grad_norm": 3.414036512374878, "learning_rate": 4.52283849918434e-06, "loss": 1.1409, "num_input_tokens_seen": 2403616, "step": 1110 }, { "epoch": 0.18189233278955955, "grad_norm": 2.567964553833008, "learning_rate": 4.543230016313214e-06, "loss": 1.3296, "num_input_tokens_seen": 2416032, "step": 1115 }, { "epoch": 0.18270799347471453, "grad_norm": 1.3791857957839966, "learning_rate": 4.563621533442088e-06, "loss": 0.658, "num_input_tokens_seen": 2426112, "step": 1120 }, { "epoch": 0.1835236541598695, "grad_norm": 2.908480405807495, "learning_rate": 4.584013050570963e-06, "loss": 0.6889, "num_input_tokens_seen": 2437344, "step": 1125 }, { "epoch": 0.18433931484502447, "grad_norm": 1.3433001041412354, "learning_rate": 4.604404567699837e-06, "loss": 0.8567, "num_input_tokens_seen": 2448672, "step": 1130 }, { "epoch": 0.18515497553017946, "grad_norm": 1.2802082300186157, "learning_rate": 4.6247960848287116e-06, "loss": 0.5637, "num_input_tokens_seen": 2459552, "step": 1135 }, { "epoch": 0.1859706362153344, "grad_norm": 0.7948053479194641, "learning_rate": 4.645187601957586e-06, "loss": 0.5045, "num_input_tokens_seen": 2469184, "step": 1140 }, { "epoch": 0.1867862969004894, "grad_norm": 1.2662429809570312, "learning_rate": 4.66557911908646e-06, "loss": 0.319, "num_input_tokens_seen": 2481120, "step": 1145 }, { "epoch": 0.18760195758564438, "grad_norm": 0.8438661098480225, "learning_rate": 4.685970636215335e-06, "loss": 0.4469, "num_input_tokens_seen": 2490976, "step": 1150 }, { "epoch": 0.18841761827079934, "grad_norm": 4.365973472595215, "learning_rate": 4.706362153344209e-06, "loss": 0.7946, "num_input_tokens_seen": 2501024, "step": 1155 }, { "epoch": 0.18923327895595432, "grad_norm": 3.289198160171509, "learning_rate": 4.726753670473084e-06, "loss": 0.7222, "num_input_tokens_seen": 2511264, "step": 1160 }, { "epoch": 0.1900489396411093, "grad_norm": 1.3834810256958008, "learning_rate": 4.747145187601958e-06, "loss": 0.7106, "num_input_tokens_seen": 2521920, "step": 1165 }, { "epoch": 0.19086460032626426, "grad_norm": 2.887634515762329, "learning_rate": 4.7675367047308326e-06, "loss": 1.3633, "num_input_tokens_seen": 2533056, "step": 1170 }, { "epoch": 0.19168026101141925, "grad_norm": 1.6030365228652954, "learning_rate": 4.787928221859707e-06, "loss": 0.8119, "num_input_tokens_seen": 2544320, "step": 1175 }, { "epoch": 0.19249592169657423, "grad_norm": 3.9976539611816406, "learning_rate": 4.808319738988581e-06, "loss": 1.3252, "num_input_tokens_seen": 2555168, "step": 1180 }, { "epoch": 0.1933115823817292, "grad_norm": 1.4870409965515137, "learning_rate": 4.828711256117456e-06, "loss": 0.8723, "num_input_tokens_seen": 2566304, "step": 1185 }, { "epoch": 0.19412724306688417, "grad_norm": 0.7355818748474121, "learning_rate": 4.84910277324633e-06, "loss": 1.29, "num_input_tokens_seen": 2575936, "step": 1190 }, { "epoch": 0.19494290375203915, "grad_norm": 7.552244663238525, "learning_rate": 4.869494290375204e-06, "loss": 1.0293, "num_input_tokens_seen": 2586528, "step": 1195 }, { "epoch": 0.19575856443719414, "grad_norm": 4.816568374633789, "learning_rate": 4.889885807504079e-06, "loss": 0.5315, "num_input_tokens_seen": 2596064, "step": 1200 }, { "epoch": 0.1965742251223491, "grad_norm": 7.222052097320557, "learning_rate": 4.910277324632953e-06, "loss": 0.8695, "num_input_tokens_seen": 2606464, "step": 1205 }, { "epoch": 0.19738988580750408, "grad_norm": 5.016211986541748, "learning_rate": 4.930668841761827e-06, "loss": 1.0388, "num_input_tokens_seen": 2617568, "step": 1210 }, { "epoch": 0.19820554649265906, "grad_norm": 0.6027952432632446, "learning_rate": 4.951060358890701e-06, "loss": 0.5358, "num_input_tokens_seen": 2628928, "step": 1215 }, { "epoch": 0.19902120717781402, "grad_norm": 4.328365802764893, "learning_rate": 4.971451876019576e-06, "loss": 0.4948, "num_input_tokens_seen": 2641344, "step": 1220 }, { "epoch": 0.199836867862969, "grad_norm": 3.690885066986084, "learning_rate": 4.9918433931484505e-06, "loss": 0.2013, "num_input_tokens_seen": 2651584, "step": 1225 }, { "epoch": 0.200652528548124, "grad_norm": 0.4666721522808075, "learning_rate": 5.012234910277324e-06, "loss": 0.3626, "num_input_tokens_seen": 2663360, "step": 1230 }, { "epoch": 0.20146818923327894, "grad_norm": 2.7491722106933594, "learning_rate": 5.032626427406199e-06, "loss": 0.8468, "num_input_tokens_seen": 2675168, "step": 1235 }, { "epoch": 0.20228384991843393, "grad_norm": 4.988853454589844, "learning_rate": 5.053017944535074e-06, "loss": 0.7969, "num_input_tokens_seen": 2685952, "step": 1240 }, { "epoch": 0.2030995106035889, "grad_norm": 3.807046413421631, "learning_rate": 5.0734094616639475e-06, "loss": 0.7401, "num_input_tokens_seen": 2695488, "step": 1245 }, { "epoch": 0.2039151712887439, "grad_norm": 4.485692977905273, "learning_rate": 5.093800978792822e-06, "loss": 0.5898, "num_input_tokens_seen": 2706880, "step": 1250 }, { "epoch": 0.20473083197389885, "grad_norm": 3.1724348068237305, "learning_rate": 5.114192495921697e-06, "loss": 1.1997, "num_input_tokens_seen": 2717184, "step": 1255 }, { "epoch": 0.20554649265905384, "grad_norm": 0.5278952121734619, "learning_rate": 5.134584013050571e-06, "loss": 0.7323, "num_input_tokens_seen": 2729184, "step": 1260 }, { "epoch": 0.20636215334420882, "grad_norm": 5.658390045166016, "learning_rate": 5.154975530179445e-06, "loss": 0.306, "num_input_tokens_seen": 2740768, "step": 1265 }, { "epoch": 0.20717781402936378, "grad_norm": 0.46904081106185913, "learning_rate": 5.17536704730832e-06, "loss": 0.6571, "num_input_tokens_seen": 2751328, "step": 1270 }, { "epoch": 0.20799347471451876, "grad_norm": 1.1401277780532837, "learning_rate": 5.195758564437194e-06, "loss": 0.5627, "num_input_tokens_seen": 2762464, "step": 1275 }, { "epoch": 0.20880913539967375, "grad_norm": 0.37963372468948364, "learning_rate": 5.2161500815660685e-06, "loss": 0.3288, "num_input_tokens_seen": 2773824, "step": 1280 }, { "epoch": 0.2096247960848287, "grad_norm": 8.360880851745605, "learning_rate": 5.236541598694943e-06, "loss": 1.2493, "num_input_tokens_seen": 2784544, "step": 1285 }, { "epoch": 0.21044045676998369, "grad_norm": 0.3803986608982086, "learning_rate": 5.256933115823817e-06, "loss": 0.2077, "num_input_tokens_seen": 2796032, "step": 1290 }, { "epoch": 0.21125611745513867, "grad_norm": 0.3127916753292084, "learning_rate": 5.277324632952692e-06, "loss": 0.5234, "num_input_tokens_seen": 2808096, "step": 1295 }, { "epoch": 0.21207177814029363, "grad_norm": 3.715475082397461, "learning_rate": 5.297716150081566e-06, "loss": 0.4042, "num_input_tokens_seen": 2818080, "step": 1300 }, { "epoch": 0.2128874388254486, "grad_norm": 0.35196179151535034, "learning_rate": 5.318107667210441e-06, "loss": 0.9615, "num_input_tokens_seen": 2829248, "step": 1305 }, { "epoch": 0.2137030995106036, "grad_norm": 0.5670180916786194, "learning_rate": 5.338499184339315e-06, "loss": 0.512, "num_input_tokens_seen": 2840288, "step": 1310 }, { "epoch": 0.21451876019575855, "grad_norm": 5.983412265777588, "learning_rate": 5.3588907014681895e-06, "loss": 0.8243, "num_input_tokens_seen": 2849440, "step": 1315 }, { "epoch": 0.21533442088091354, "grad_norm": 0.17860674858093262, "learning_rate": 5.379282218597064e-06, "loss": 0.4741, "num_input_tokens_seen": 2861056, "step": 1320 }, { "epoch": 0.21615008156606852, "grad_norm": 0.2327325940132141, "learning_rate": 5.399673735725938e-06, "loss": 0.5862, "num_input_tokens_seen": 2872832, "step": 1325 }, { "epoch": 0.2169657422512235, "grad_norm": 5.927593231201172, "learning_rate": 5.420065252854813e-06, "loss": 1.1106, "num_input_tokens_seen": 2883968, "step": 1330 }, { "epoch": 0.21778140293637846, "grad_norm": 0.24360142648220062, "learning_rate": 5.440456769983687e-06, "loss": 0.45, "num_input_tokens_seen": 2894880, "step": 1335 }, { "epoch": 0.21859706362153344, "grad_norm": 3.1550445556640625, "learning_rate": 5.460848287112561e-06, "loss": 0.7361, "num_input_tokens_seen": 2905216, "step": 1340 }, { "epoch": 0.21941272430668843, "grad_norm": 2.715406894683838, "learning_rate": 5.481239804241436e-06, "loss": 0.1759, "num_input_tokens_seen": 2917152, "step": 1345 }, { "epoch": 0.22022838499184338, "grad_norm": 3.1345643997192383, "learning_rate": 5.5016313213703105e-06, "loss": 0.696, "num_input_tokens_seen": 2928448, "step": 1350 }, { "epoch": 0.22104404567699837, "grad_norm": 4.369110107421875, "learning_rate": 5.522022838499184e-06, "loss": 0.6746, "num_input_tokens_seen": 2938336, "step": 1355 }, { "epoch": 0.22185970636215335, "grad_norm": 2.2190358638763428, "learning_rate": 5.542414355628059e-06, "loss": 0.1963, "num_input_tokens_seen": 2950496, "step": 1360 }, { "epoch": 0.2226753670473083, "grad_norm": 5.229277610778809, "learning_rate": 5.562805872756934e-06, "loss": 1.1886, "num_input_tokens_seen": 2961024, "step": 1365 }, { "epoch": 0.2234910277324633, "grad_norm": 0.3433627188205719, "learning_rate": 5.583197389885808e-06, "loss": 0.2333, "num_input_tokens_seen": 2971744, "step": 1370 }, { "epoch": 0.22430668841761828, "grad_norm": 3.039000988006592, "learning_rate": 5.603588907014682e-06, "loss": 0.4717, "num_input_tokens_seen": 2984192, "step": 1375 }, { "epoch": 0.22512234910277323, "grad_norm": 0.33931997418403625, "learning_rate": 5.623980424143557e-06, "loss": 0.4531, "num_input_tokens_seen": 2994816, "step": 1380 }, { "epoch": 0.22593800978792822, "grad_norm": 0.18841548264026642, "learning_rate": 5.6443719412724315e-06, "loss": 0.3208, "num_input_tokens_seen": 3005568, "step": 1385 }, { "epoch": 0.2267536704730832, "grad_norm": 5.965118408203125, "learning_rate": 5.664763458401305e-06, "loss": 0.8437, "num_input_tokens_seen": 3015680, "step": 1390 }, { "epoch": 0.2275693311582382, "grad_norm": 3.281815528869629, "learning_rate": 5.68515497553018e-06, "loss": 0.363, "num_input_tokens_seen": 3026752, "step": 1395 }, { "epoch": 0.22838499184339314, "grad_norm": 0.5801479816436768, "learning_rate": 5.705546492659055e-06, "loss": 0.4606, "num_input_tokens_seen": 3036704, "step": 1400 }, { "epoch": 0.22920065252854813, "grad_norm": 2.8873565196990967, "learning_rate": 5.7259380097879285e-06, "loss": 0.6625, "num_input_tokens_seen": 3048960, "step": 1405 }, { "epoch": 0.2300163132137031, "grad_norm": 6.54552698135376, "learning_rate": 5.746329526916803e-06, "loss": 0.8532, "num_input_tokens_seen": 3059456, "step": 1410 }, { "epoch": 0.23083197389885807, "grad_norm": 0.11681745946407318, "learning_rate": 5.766721044045678e-06, "loss": 0.5241, "num_input_tokens_seen": 3070112, "step": 1415 }, { "epoch": 0.23164763458401305, "grad_norm": 0.20601288974285126, "learning_rate": 5.787112561174552e-06, "loss": 0.3032, "num_input_tokens_seen": 3080224, "step": 1420 }, { "epoch": 0.23246329526916804, "grad_norm": 2.613618850708008, "learning_rate": 5.807504078303426e-06, "loss": 0.7254, "num_input_tokens_seen": 3090304, "step": 1425 }, { "epoch": 0.233278955954323, "grad_norm": 3.315976858139038, "learning_rate": 5.827895595432301e-06, "loss": 0.319, "num_input_tokens_seen": 3099232, "step": 1430 }, { "epoch": 0.23409461663947798, "grad_norm": 3.312969923019409, "learning_rate": 5.848287112561175e-06, "loss": 0.8097, "num_input_tokens_seen": 3109696, "step": 1435 }, { "epoch": 0.23491027732463296, "grad_norm": 3.5534603595733643, "learning_rate": 5.8686786296900495e-06, "loss": 0.4577, "num_input_tokens_seen": 3120160, "step": 1440 }, { "epoch": 0.23572593800978792, "grad_norm": 0.1030135452747345, "learning_rate": 5.889070146818923e-06, "loss": 0.3082, "num_input_tokens_seen": 3130784, "step": 1445 }, { "epoch": 0.2365415986949429, "grad_norm": 6.506193161010742, "learning_rate": 5.909461663947798e-06, "loss": 0.8093, "num_input_tokens_seen": 3141536, "step": 1450 }, { "epoch": 0.23735725938009788, "grad_norm": 0.11771534383296967, "learning_rate": 5.929853181076672e-06, "loss": 0.561, "num_input_tokens_seen": 3152160, "step": 1455 }, { "epoch": 0.23817292006525284, "grad_norm": 2.6675424575805664, "learning_rate": 5.9502446982055465e-06, "loss": 0.21, "num_input_tokens_seen": 3162560, "step": 1460 }, { "epoch": 0.23898858075040783, "grad_norm": 1.3277784585952759, "learning_rate": 5.970636215334421e-06, "loss": 0.3041, "num_input_tokens_seen": 3175360, "step": 1465 }, { "epoch": 0.2398042414355628, "grad_norm": 4.747895240783691, "learning_rate": 5.991027732463295e-06, "loss": 0.3133, "num_input_tokens_seen": 3187936, "step": 1470 }, { "epoch": 0.2406199021207178, "grad_norm": 5.6028594970703125, "learning_rate": 6.01141924959217e-06, "loss": 0.8116, "num_input_tokens_seen": 3199424, "step": 1475 }, { "epoch": 0.24143556280587275, "grad_norm": 0.11191844195127487, "learning_rate": 6.031810766721044e-06, "loss": 0.7472, "num_input_tokens_seen": 3210272, "step": 1480 }, { "epoch": 0.24225122349102773, "grad_norm": 2.675246477127075, "learning_rate": 6.052202283849918e-06, "loss": 0.4052, "num_input_tokens_seen": 3219680, "step": 1485 }, { "epoch": 0.24306688417618272, "grad_norm": 0.05278303474187851, "learning_rate": 6.072593800978793e-06, "loss": 0.3686, "num_input_tokens_seen": 3229376, "step": 1490 }, { "epoch": 0.24388254486133767, "grad_norm": 4.547016620635986, "learning_rate": 6.0929853181076675e-06, "loss": 0.6032, "num_input_tokens_seen": 3240896, "step": 1495 }, { "epoch": 0.24469820554649266, "grad_norm": 5.967410087585449, "learning_rate": 6.113376835236541e-06, "loss": 1.1279, "num_input_tokens_seen": 3251808, "step": 1500 }, { "epoch": 0.24551386623164764, "grad_norm": 1.6293954849243164, "learning_rate": 6.133768352365416e-06, "loss": 0.2304, "num_input_tokens_seen": 3262688, "step": 1505 }, { "epoch": 0.2463295269168026, "grad_norm": 2.7731893062591553, "learning_rate": 6.154159869494291e-06, "loss": 0.219, "num_input_tokens_seen": 3273344, "step": 1510 }, { "epoch": 0.24714518760195758, "grad_norm": 3.9372148513793945, "learning_rate": 6.174551386623165e-06, "loss": 0.5326, "num_input_tokens_seen": 3283520, "step": 1515 }, { "epoch": 0.24796084828711257, "grad_norm": 0.16058208048343658, "learning_rate": 6.194942903752039e-06, "loss": 0.5517, "num_input_tokens_seen": 3294816, "step": 1520 }, { "epoch": 0.24877650897226752, "grad_norm": 0.7491521835327148, "learning_rate": 6.215334420880914e-06, "loss": 0.4364, "num_input_tokens_seen": 3305152, "step": 1525 }, { "epoch": 0.2495921696574225, "grad_norm": 0.06863339245319366, "learning_rate": 6.2357259380097885e-06, "loss": 0.3617, "num_input_tokens_seen": 3315264, "step": 1530 }, { "epoch": 0.25040783034257746, "grad_norm": 6.61660623550415, "learning_rate": 6.256117455138663e-06, "loss": 0.641, "num_input_tokens_seen": 3326208, "step": 1535 }, { "epoch": 0.25122349102773245, "grad_norm": 3.028336524963379, "learning_rate": 6.276508972267536e-06, "loss": 0.5309, "num_input_tokens_seen": 3337472, "step": 1540 }, { "epoch": 0.25203915171288743, "grad_norm": 4.0093231201171875, "learning_rate": 6.296900489396411e-06, "loss": 0.2363, "num_input_tokens_seen": 3348576, "step": 1545 }, { "epoch": 0.2528548123980424, "grad_norm": 5.024332046508789, "learning_rate": 6.3172920065252854e-06, "loss": 0.571, "num_input_tokens_seen": 3359200, "step": 1550 }, { "epoch": 0.2536704730831974, "grad_norm": 1.1903791427612305, "learning_rate": 6.33768352365416e-06, "loss": 0.1781, "num_input_tokens_seen": 3369696, "step": 1555 }, { "epoch": 0.2544861337683524, "grad_norm": 2.4607763290405273, "learning_rate": 6.358075040783035e-06, "loss": 0.4625, "num_input_tokens_seen": 3381088, "step": 1560 }, { "epoch": 0.2553017944535073, "grad_norm": 1.2783178091049194, "learning_rate": 6.3784665579119094e-06, "loss": 0.349, "num_input_tokens_seen": 3392704, "step": 1565 }, { "epoch": 0.2561174551386623, "grad_norm": 0.09664727747440338, "learning_rate": 6.398858075040784e-06, "loss": 0.7477, "num_input_tokens_seen": 3403136, "step": 1570 }, { "epoch": 0.2569331158238173, "grad_norm": 0.0862194299697876, "learning_rate": 6.419249592169657e-06, "loss": 0.3018, "num_input_tokens_seen": 3411968, "step": 1575 }, { "epoch": 0.25774877650897227, "grad_norm": 0.13956482708454132, "learning_rate": 6.439641109298532e-06, "loss": 0.2863, "num_input_tokens_seen": 3421216, "step": 1580 }, { "epoch": 0.25856443719412725, "grad_norm": 2.828794002532959, "learning_rate": 6.4600326264274064e-06, "loss": 0.6718, "num_input_tokens_seen": 3433120, "step": 1585 }, { "epoch": 0.25938009787928223, "grad_norm": 2.8362302780151367, "learning_rate": 6.480424143556281e-06, "loss": 0.3018, "num_input_tokens_seen": 3444960, "step": 1590 }, { "epoch": 0.2601957585644372, "grad_norm": 0.24622145295143127, "learning_rate": 6.500815660685156e-06, "loss": 0.2191, "num_input_tokens_seen": 3456512, "step": 1595 }, { "epoch": 0.26101141924959215, "grad_norm": 2.136747121810913, "learning_rate": 6.5212071778140304e-06, "loss": 0.1749, "num_input_tokens_seen": 3466880, "step": 1600 }, { "epoch": 0.26182707993474713, "grad_norm": 1.116094708442688, "learning_rate": 6.541598694942903e-06, "loss": 0.2532, "num_input_tokens_seen": 3478528, "step": 1605 }, { "epoch": 0.2626427406199021, "grad_norm": 0.6898568868637085, "learning_rate": 6.561990212071778e-06, "loss": 0.5224, "num_input_tokens_seen": 3490240, "step": 1610 }, { "epoch": 0.2634584013050571, "grad_norm": 5.0546441078186035, "learning_rate": 6.582381729200653e-06, "loss": 0.5808, "num_input_tokens_seen": 3501152, "step": 1615 }, { "epoch": 0.2642740619902121, "grad_norm": 0.05416727066040039, "learning_rate": 6.6027732463295274e-06, "loss": 0.4902, "num_input_tokens_seen": 3511360, "step": 1620 }, { "epoch": 0.26508972267536707, "grad_norm": 3.0003700256347656, "learning_rate": 6.623164763458402e-06, "loss": 0.1692, "num_input_tokens_seen": 3522880, "step": 1625 }, { "epoch": 0.265905383360522, "grad_norm": 0.037446435540914536, "learning_rate": 6.643556280587277e-06, "loss": 0.3091, "num_input_tokens_seen": 3534720, "step": 1630 }, { "epoch": 0.266721044045677, "grad_norm": 0.08614230155944824, "learning_rate": 6.6639477977161514e-06, "loss": 0.1405, "num_input_tokens_seen": 3544128, "step": 1635 }, { "epoch": 0.26753670473083196, "grad_norm": 2.603851556777954, "learning_rate": 6.684339314845024e-06, "loss": 0.2464, "num_input_tokens_seen": 3555552, "step": 1640 }, { "epoch": 0.26835236541598695, "grad_norm": 0.4188733398914337, "learning_rate": 6.704730831973899e-06, "loss": 0.1965, "num_input_tokens_seen": 3565824, "step": 1645 }, { "epoch": 0.26916802610114193, "grad_norm": 3.9753730297088623, "learning_rate": 6.725122349102774e-06, "loss": 0.1876, "num_input_tokens_seen": 3576864, "step": 1650 }, { "epoch": 0.2699836867862969, "grad_norm": 6.9343156814575195, "learning_rate": 6.745513866231648e-06, "loss": 0.245, "num_input_tokens_seen": 3588032, "step": 1655 }, { "epoch": 0.2707993474714519, "grad_norm": 3.945366859436035, "learning_rate": 6.765905383360522e-06, "loss": 0.2744, "num_input_tokens_seen": 3599488, "step": 1660 }, { "epoch": 0.27161500815660683, "grad_norm": 0.06378654390573502, "learning_rate": 6.786296900489397e-06, "loss": 0.0231, "num_input_tokens_seen": 3610368, "step": 1665 }, { "epoch": 0.2724306688417618, "grad_norm": 4.536157608032227, "learning_rate": 6.806688417618271e-06, "loss": 0.4946, "num_input_tokens_seen": 3620128, "step": 1670 }, { "epoch": 0.2732463295269168, "grad_norm": 1.5053355693817139, "learning_rate": 6.827079934747145e-06, "loss": 0.2589, "num_input_tokens_seen": 3631872, "step": 1675 }, { "epoch": 0.2740619902120718, "grad_norm": 5.326475620269775, "learning_rate": 6.84747145187602e-06, "loss": 0.7041, "num_input_tokens_seen": 3643296, "step": 1680 }, { "epoch": 0.27487765089722677, "grad_norm": 0.623485267162323, "learning_rate": 6.867862969004894e-06, "loss": 0.4755, "num_input_tokens_seen": 3654592, "step": 1685 }, { "epoch": 0.27569331158238175, "grad_norm": 6.395427227020264, "learning_rate": 6.8882544861337686e-06, "loss": 0.9392, "num_input_tokens_seen": 3665312, "step": 1690 }, { "epoch": 0.2765089722675367, "grad_norm": 5.373067378997803, "learning_rate": 6.908646003262643e-06, "loss": 0.2736, "num_input_tokens_seen": 3677024, "step": 1695 }, { "epoch": 0.27732463295269166, "grad_norm": 3.3822553157806396, "learning_rate": 6.929037520391518e-06, "loss": 0.5365, "num_input_tokens_seen": 3687904, "step": 1700 }, { "epoch": 0.27814029363784665, "grad_norm": 0.470096230506897, "learning_rate": 6.949429037520392e-06, "loss": 0.5298, "num_input_tokens_seen": 3698848, "step": 1705 }, { "epoch": 0.27895595432300163, "grad_norm": 4.26376485824585, "learning_rate": 6.9698205546492656e-06, "loss": 0.4405, "num_input_tokens_seen": 3709856, "step": 1710 }, { "epoch": 0.2797716150081566, "grad_norm": 0.839065432548523, "learning_rate": 6.99021207177814e-06, "loss": 0.3741, "num_input_tokens_seen": 3720736, "step": 1715 }, { "epoch": 0.2805872756933116, "grad_norm": 3.745051622390747, "learning_rate": 7.010603588907015e-06, "loss": 0.5432, "num_input_tokens_seen": 3731264, "step": 1720 }, { "epoch": 0.2814029363784666, "grad_norm": 4.85554313659668, "learning_rate": 7.0309951060358896e-06, "loss": 0.5617, "num_input_tokens_seen": 3743072, "step": 1725 }, { "epoch": 0.2822185970636215, "grad_norm": 2.7528631687164307, "learning_rate": 7.051386623164764e-06, "loss": 0.5109, "num_input_tokens_seen": 3754656, "step": 1730 }, { "epoch": 0.2830342577487765, "grad_norm": 2.8373639583587646, "learning_rate": 7.071778140293637e-06, "loss": 0.2639, "num_input_tokens_seen": 3765248, "step": 1735 }, { "epoch": 0.2838499184339315, "grad_norm": 3.3679959774017334, "learning_rate": 7.092169657422512e-06, "loss": 0.6553, "num_input_tokens_seen": 3774464, "step": 1740 }, { "epoch": 0.28466557911908646, "grad_norm": 5.498345851898193, "learning_rate": 7.1125611745513865e-06, "loss": 0.4469, "num_input_tokens_seen": 3784608, "step": 1745 }, { "epoch": 0.28548123980424145, "grad_norm": 0.5953945517539978, "learning_rate": 7.132952691680261e-06, "loss": 0.3063, "num_input_tokens_seen": 3796672, "step": 1750 }, { "epoch": 0.28629690048939643, "grad_norm": 1.0803401470184326, "learning_rate": 7.153344208809136e-06, "loss": 0.2076, "num_input_tokens_seen": 3807712, "step": 1755 }, { "epoch": 0.28711256117455136, "grad_norm": 2.54874849319458, "learning_rate": 7.1737357259380106e-06, "loss": 0.26, "num_input_tokens_seen": 3817984, "step": 1760 }, { "epoch": 0.28792822185970635, "grad_norm": 1.499727725982666, "learning_rate": 7.194127243066885e-06, "loss": 0.0456, "num_input_tokens_seen": 3829152, "step": 1765 }, { "epoch": 0.28874388254486133, "grad_norm": 4.234437942504883, "learning_rate": 7.214518760195758e-06, "loss": 0.6324, "num_input_tokens_seen": 3840352, "step": 1770 }, { "epoch": 0.2895595432300163, "grad_norm": 0.05717654526233673, "learning_rate": 7.234910277324633e-06, "loss": 0.4965, "num_input_tokens_seen": 3851456, "step": 1775 }, { "epoch": 0.2903752039151713, "grad_norm": 1.2248029708862305, "learning_rate": 7.2553017944535075e-06, "loss": 0.0458, "num_input_tokens_seen": 3863040, "step": 1780 }, { "epoch": 0.2911908646003263, "grad_norm": 2.919182300567627, "learning_rate": 7.275693311582382e-06, "loss": 0.1017, "num_input_tokens_seen": 3874176, "step": 1785 }, { "epoch": 0.29200652528548127, "grad_norm": 0.058215513825416565, "learning_rate": 7.296084828711257e-06, "loss": 0.4597, "num_input_tokens_seen": 3884736, "step": 1790 }, { "epoch": 0.2928221859706362, "grad_norm": 2.5698227882385254, "learning_rate": 7.3164763458401316e-06, "loss": 0.2325, "num_input_tokens_seen": 3895584, "step": 1795 }, { "epoch": 0.2936378466557912, "grad_norm": 0.06556704640388489, "learning_rate": 7.3368678629690045e-06, "loss": 0.367, "num_input_tokens_seen": 3907584, "step": 1800 }, { "epoch": 0.29445350734094616, "grad_norm": 2.205702066421509, "learning_rate": 7.357259380097879e-06, "loss": 0.2881, "num_input_tokens_seen": 3919424, "step": 1805 }, { "epoch": 0.29526916802610115, "grad_norm": 3.189990758895874, "learning_rate": 7.377650897226754e-06, "loss": 0.2354, "num_input_tokens_seen": 3929152, "step": 1810 }, { "epoch": 0.29608482871125613, "grad_norm": 2.8307888507843018, "learning_rate": 7.3980424143556285e-06, "loss": 0.3228, "num_input_tokens_seen": 3940928, "step": 1815 }, { "epoch": 0.2969004893964111, "grad_norm": 2.54551100730896, "learning_rate": 7.418433931484503e-06, "loss": 0.1687, "num_input_tokens_seen": 3949568, "step": 1820 }, { "epoch": 0.29771615008156604, "grad_norm": 0.5649632811546326, "learning_rate": 7.438825448613378e-06, "loss": 0.2202, "num_input_tokens_seen": 3961088, "step": 1825 }, { "epoch": 0.29853181076672103, "grad_norm": 0.19162696599960327, "learning_rate": 7.4592169657422525e-06, "loss": 0.4944, "num_input_tokens_seen": 3973088, "step": 1830 }, { "epoch": 0.299347471451876, "grad_norm": 0.11915277689695358, "learning_rate": 7.4796084828711255e-06, "loss": 0.2503, "num_input_tokens_seen": 3982976, "step": 1835 }, { "epoch": 0.300163132137031, "grad_norm": 2.7708964347839355, "learning_rate": 7.5e-06, "loss": 0.4938, "num_input_tokens_seen": 3993440, "step": 1840 }, { "epoch": 0.300978792822186, "grad_norm": 0.3245686888694763, "learning_rate": 7.520391517128875e-06, "loss": 0.0841, "num_input_tokens_seen": 4002720, "step": 1845 }, { "epoch": 0.30179445350734097, "grad_norm": 1.6240615844726562, "learning_rate": 7.5407830342577495e-06, "loss": 0.7429, "num_input_tokens_seen": 4012896, "step": 1850 }, { "epoch": 0.30261011419249595, "grad_norm": 0.2792931795120239, "learning_rate": 7.561174551386624e-06, "loss": 0.1405, "num_input_tokens_seen": 4023680, "step": 1855 }, { "epoch": 0.3034257748776509, "grad_norm": 0.07334275543689728, "learning_rate": 7.581566068515499e-06, "loss": 0.0978, "num_input_tokens_seen": 4033760, "step": 1860 }, { "epoch": 0.30424143556280586, "grad_norm": 3.937211275100708, "learning_rate": 7.601957585644372e-06, "loss": 0.3304, "num_input_tokens_seen": 4044192, "step": 1865 }, { "epoch": 0.30505709624796085, "grad_norm": 2.4421064853668213, "learning_rate": 7.6223491027732465e-06, "loss": 0.3055, "num_input_tokens_seen": 4055520, "step": 1870 }, { "epoch": 0.30587275693311583, "grad_norm": 0.09250957518815994, "learning_rate": 7.642740619902121e-06, "loss": 0.1572, "num_input_tokens_seen": 4067264, "step": 1875 }, { "epoch": 0.3066884176182708, "grad_norm": 3.148164987564087, "learning_rate": 7.663132137030995e-06, "loss": 0.4497, "num_input_tokens_seen": 4078016, "step": 1880 }, { "epoch": 0.3075040783034258, "grad_norm": 0.03364921733736992, "learning_rate": 7.68352365415987e-06, "loss": 0.2322, "num_input_tokens_seen": 4088352, "step": 1885 }, { "epoch": 0.3083197389885807, "grad_norm": 4.356660842895508, "learning_rate": 7.703915171288744e-06, "loss": 0.2785, "num_input_tokens_seen": 4100416, "step": 1890 }, { "epoch": 0.3091353996737357, "grad_norm": 2.665492534637451, "learning_rate": 7.724306688417618e-06, "loss": 0.3426, "num_input_tokens_seen": 4110080, "step": 1895 }, { "epoch": 0.3099510603588907, "grad_norm": 4.878696441650391, "learning_rate": 7.744698205546492e-06, "loss": 0.7649, "num_input_tokens_seen": 4120736, "step": 1900 }, { "epoch": 0.3107667210440457, "grad_norm": 0.40482279658317566, "learning_rate": 7.765089722675368e-06, "loss": 0.191, "num_input_tokens_seen": 4132736, "step": 1905 }, { "epoch": 0.31158238172920066, "grad_norm": 5.901864051818848, "learning_rate": 7.785481239804241e-06, "loss": 0.4255, "num_input_tokens_seen": 4143808, "step": 1910 }, { "epoch": 0.31239804241435565, "grad_norm": 3.2325332164764404, "learning_rate": 7.805872756933117e-06, "loss": 0.2782, "num_input_tokens_seen": 4154976, "step": 1915 }, { "epoch": 0.3132137030995106, "grad_norm": 2.989654302597046, "learning_rate": 7.82626427406199e-06, "loss": 0.4086, "num_input_tokens_seen": 4165504, "step": 1920 }, { "epoch": 0.31402936378466556, "grad_norm": 5.6098456382751465, "learning_rate": 7.846655791190866e-06, "loss": 0.361, "num_input_tokens_seen": 4177184, "step": 1925 }, { "epoch": 0.31484502446982054, "grad_norm": 3.0970447063446045, "learning_rate": 7.867047308319738e-06, "loss": 0.3303, "num_input_tokens_seen": 4189088, "step": 1930 }, { "epoch": 0.31566068515497553, "grad_norm": 0.13851726055145264, "learning_rate": 7.887438825448614e-06, "loss": 0.075, "num_input_tokens_seen": 4200064, "step": 1935 }, { "epoch": 0.3164763458401305, "grad_norm": 0.5572215914726257, "learning_rate": 7.907830342577488e-06, "loss": 0.3072, "num_input_tokens_seen": 4211936, "step": 1940 }, { "epoch": 0.3172920065252855, "grad_norm": 1.0190293788909912, "learning_rate": 7.928221859706363e-06, "loss": 0.2618, "num_input_tokens_seen": 4221952, "step": 1945 }, { "epoch": 0.3181076672104405, "grad_norm": 0.06778541207313538, "learning_rate": 7.948613376835237e-06, "loss": 0.4715, "num_input_tokens_seen": 4233024, "step": 1950 }, { "epoch": 0.3189233278955954, "grad_norm": 2.1068990230560303, "learning_rate": 7.969004893964113e-06, "loss": 0.1094, "num_input_tokens_seen": 4244480, "step": 1955 }, { "epoch": 0.3197389885807504, "grad_norm": 0.035436857491731644, "learning_rate": 7.989396411092985e-06, "loss": 0.3445, "num_input_tokens_seen": 4255904, "step": 1960 }, { "epoch": 0.3205546492659054, "grad_norm": 2.5638551712036133, "learning_rate": 8.00978792822186e-06, "loss": 0.7167, "num_input_tokens_seen": 4265792, "step": 1965 }, { "epoch": 0.32137030995106036, "grad_norm": 0.03428907319903374, "learning_rate": 8.030179445350734e-06, "loss": 0.0966, "num_input_tokens_seen": 4276160, "step": 1970 }, { "epoch": 0.32218597063621535, "grad_norm": 3.168548345565796, "learning_rate": 8.05057096247961e-06, "loss": 0.4224, "num_input_tokens_seen": 4286624, "step": 1975 }, { "epoch": 0.32300163132137033, "grad_norm": 2.9012691974639893, "learning_rate": 8.070962479608483e-06, "loss": 0.1887, "num_input_tokens_seen": 4297600, "step": 1980 }, { "epoch": 0.32381729200652526, "grad_norm": 3.8062045574188232, "learning_rate": 8.091353996737359e-06, "loss": 0.3318, "num_input_tokens_seen": 4308352, "step": 1985 }, { "epoch": 0.32463295269168024, "grad_norm": 4.031979084014893, "learning_rate": 8.111745513866233e-06, "loss": 0.3892, "num_input_tokens_seen": 4319360, "step": 1990 }, { "epoch": 0.3254486133768352, "grad_norm": 3.0761168003082275, "learning_rate": 8.132137030995106e-06, "loss": 0.1236, "num_input_tokens_seen": 4331552, "step": 1995 }, { "epoch": 0.3262642740619902, "grad_norm": 0.1398240178823471, "learning_rate": 8.15252854812398e-06, "loss": 0.5698, "num_input_tokens_seen": 4343744, "step": 2000 }, { "epoch": 0.3270799347471452, "grad_norm": 0.04928942024707794, "learning_rate": 8.172920065252856e-06, "loss": 0.3382, "num_input_tokens_seen": 4354304, "step": 2005 }, { "epoch": 0.3278955954323002, "grad_norm": 2.76121187210083, "learning_rate": 8.19331158238173e-06, "loss": 0.2151, "num_input_tokens_seen": 4365696, "step": 2010 }, { "epoch": 0.32871125611745516, "grad_norm": 8.213173866271973, "learning_rate": 8.213703099510603e-06, "loss": 0.5117, "num_input_tokens_seen": 4376736, "step": 2015 }, { "epoch": 0.3295269168026101, "grad_norm": 0.14160363376140594, "learning_rate": 8.234094616639479e-06, "loss": 0.2806, "num_input_tokens_seen": 4388064, "step": 2020 }, { "epoch": 0.3303425774877651, "grad_norm": 8.041695594787598, "learning_rate": 8.254486133768353e-06, "loss": 0.2341, "num_input_tokens_seen": 4398912, "step": 2025 }, { "epoch": 0.33115823817292006, "grad_norm": 3.6900603771209717, "learning_rate": 8.274877650897227e-06, "loss": 0.3317, "num_input_tokens_seen": 4409856, "step": 2030 }, { "epoch": 0.33197389885807504, "grad_norm": 2.366342306137085, "learning_rate": 8.295269168026102e-06, "loss": 0.2103, "num_input_tokens_seen": 4419552, "step": 2035 }, { "epoch": 0.33278955954323003, "grad_norm": 0.08505433052778244, "learning_rate": 8.315660685154976e-06, "loss": 0.245, "num_input_tokens_seen": 4430080, "step": 2040 }, { "epoch": 0.333605220228385, "grad_norm": 2.5287070274353027, "learning_rate": 8.33605220228385e-06, "loss": 0.3979, "num_input_tokens_seen": 4441568, "step": 2045 }, { "epoch": 0.33442088091353994, "grad_norm": 4.5702009201049805, "learning_rate": 8.356443719412725e-06, "loss": 0.5355, "num_input_tokens_seen": 4451200, "step": 2050 }, { "epoch": 0.3352365415986949, "grad_norm": 2.2750282287597656, "learning_rate": 8.376835236541599e-06, "loss": 0.3409, "num_input_tokens_seen": 4461888, "step": 2055 }, { "epoch": 0.3360522022838499, "grad_norm": 2.853708505630493, "learning_rate": 8.397226753670473e-06, "loss": 0.1644, "num_input_tokens_seen": 4471392, "step": 2060 }, { "epoch": 0.3368678629690049, "grad_norm": 1.5267902612686157, "learning_rate": 8.417618270799347e-06, "loss": 0.1741, "num_input_tokens_seen": 4482656, "step": 2065 }, { "epoch": 0.3376835236541599, "grad_norm": 2.402932643890381, "learning_rate": 8.438009787928222e-06, "loss": 0.1922, "num_input_tokens_seen": 4494336, "step": 2070 }, { "epoch": 0.33849918433931486, "grad_norm": 3.3514010906219482, "learning_rate": 8.458401305057096e-06, "loss": 0.4428, "num_input_tokens_seen": 4505760, "step": 2075 }, { "epoch": 0.33931484502446985, "grad_norm": 3.7235231399536133, "learning_rate": 8.478792822185972e-06, "loss": 0.3476, "num_input_tokens_seen": 4514976, "step": 2080 }, { "epoch": 0.3401305057096248, "grad_norm": 0.02181488834321499, "learning_rate": 8.499184339314845e-06, "loss": 0.1375, "num_input_tokens_seen": 4524928, "step": 2085 }, { "epoch": 0.34094616639477976, "grad_norm": 2.304445743560791, "learning_rate": 8.51957585644372e-06, "loss": 0.3217, "num_input_tokens_seen": 4536192, "step": 2090 }, { "epoch": 0.34176182707993474, "grad_norm": 0.07463689148426056, "learning_rate": 8.539967373572593e-06, "loss": 0.1745, "num_input_tokens_seen": 4548160, "step": 2095 }, { "epoch": 0.3425774877650897, "grad_norm": 1.6991610527038574, "learning_rate": 8.560358890701469e-06, "loss": 0.3299, "num_input_tokens_seen": 4558560, "step": 2100 }, { "epoch": 0.3433931484502447, "grad_norm": 3.3701062202453613, "learning_rate": 8.580750407830342e-06, "loss": 0.3961, "num_input_tokens_seen": 4570272, "step": 2105 }, { "epoch": 0.3442088091353997, "grad_norm": 0.49982950091362, "learning_rate": 8.601141924959218e-06, "loss": 0.3544, "num_input_tokens_seen": 4580608, "step": 2110 }, { "epoch": 0.3450244698205546, "grad_norm": 1.700905442237854, "learning_rate": 8.621533442088092e-06, "loss": 0.1665, "num_input_tokens_seen": 4591072, "step": 2115 }, { "epoch": 0.3458401305057096, "grad_norm": 1.3512170314788818, "learning_rate": 8.641924959216967e-06, "loss": 0.174, "num_input_tokens_seen": 4602368, "step": 2120 }, { "epoch": 0.3466557911908646, "grad_norm": 2.9085943698883057, "learning_rate": 8.66231647634584e-06, "loss": 0.188, "num_input_tokens_seen": 4613600, "step": 2125 }, { "epoch": 0.3474714518760196, "grad_norm": 0.030118437483906746, "learning_rate": 8.682707993474715e-06, "loss": 0.2258, "num_input_tokens_seen": 4623872, "step": 2130 }, { "epoch": 0.34828711256117456, "grad_norm": 2.613267660140991, "learning_rate": 8.703099510603589e-06, "loss": 0.4417, "num_input_tokens_seen": 4635136, "step": 2135 }, { "epoch": 0.34910277324632955, "grad_norm": 0.053674302995204926, "learning_rate": 8.723491027732464e-06, "loss": 0.0318, "num_input_tokens_seen": 4645152, "step": 2140 }, { "epoch": 0.34991843393148453, "grad_norm": 3.231687068939209, "learning_rate": 8.743882544861338e-06, "loss": 0.2682, "num_input_tokens_seen": 4654240, "step": 2145 }, { "epoch": 0.35073409461663946, "grad_norm": 0.5705206394195557, "learning_rate": 8.764274061990214e-06, "loss": 0.246, "num_input_tokens_seen": 4666240, "step": 2150 }, { "epoch": 0.35154975530179444, "grad_norm": 2.0118424892425537, "learning_rate": 8.784665579119086e-06, "loss": 0.1963, "num_input_tokens_seen": 4678528, "step": 2155 }, { "epoch": 0.3523654159869494, "grad_norm": 3.380246877670288, "learning_rate": 8.805057096247961e-06, "loss": 0.3466, "num_input_tokens_seen": 4688768, "step": 2160 }, { "epoch": 0.3531810766721044, "grad_norm": 1.8953642845153809, "learning_rate": 8.825448613376835e-06, "loss": 0.1565, "num_input_tokens_seen": 4698496, "step": 2165 }, { "epoch": 0.3539967373572594, "grad_norm": 1.275225281715393, "learning_rate": 8.84584013050571e-06, "loss": 0.1101, "num_input_tokens_seen": 4710208, "step": 2170 }, { "epoch": 0.3548123980424144, "grad_norm": 0.9144735932350159, "learning_rate": 8.866231647634584e-06, "loss": 0.3345, "num_input_tokens_seen": 4720480, "step": 2175 }, { "epoch": 0.3556280587275693, "grad_norm": 0.37059950828552246, "learning_rate": 8.88662316476346e-06, "loss": 0.5494, "num_input_tokens_seen": 4730560, "step": 2180 }, { "epoch": 0.3564437194127243, "grad_norm": 0.8306304216384888, "learning_rate": 8.907014681892334e-06, "loss": 0.0332, "num_input_tokens_seen": 4740768, "step": 2185 }, { "epoch": 0.3572593800978793, "grad_norm": 0.646552562713623, "learning_rate": 8.927406199021208e-06, "loss": 0.6973, "num_input_tokens_seen": 4751040, "step": 2190 }, { "epoch": 0.35807504078303426, "grad_norm": 2.9322378635406494, "learning_rate": 8.947797716150081e-06, "loss": 0.1786, "num_input_tokens_seen": 4762016, "step": 2195 }, { "epoch": 0.35889070146818924, "grad_norm": 2.324927806854248, "learning_rate": 8.968189233278957e-06, "loss": 0.2195, "num_input_tokens_seen": 4771648, "step": 2200 }, { "epoch": 0.35970636215334423, "grad_norm": 3.080955982208252, "learning_rate": 8.98858075040783e-06, "loss": 0.2572, "num_input_tokens_seen": 4781440, "step": 2205 }, { "epoch": 0.3605220228384992, "grad_norm": 1.9486790895462036, "learning_rate": 9.008972267536706e-06, "loss": 0.2168, "num_input_tokens_seen": 4793312, "step": 2210 }, { "epoch": 0.36133768352365414, "grad_norm": 2.805945873260498, "learning_rate": 9.02936378466558e-06, "loss": 0.1372, "num_input_tokens_seen": 4804288, "step": 2215 }, { "epoch": 0.3621533442088091, "grad_norm": 5.330896854400635, "learning_rate": 9.049755301794454e-06, "loss": 0.5373, "num_input_tokens_seen": 4814784, "step": 2220 }, { "epoch": 0.3629690048939641, "grad_norm": 0.08147232234477997, "learning_rate": 9.070146818923328e-06, "loss": 0.6647, "num_input_tokens_seen": 4825152, "step": 2225 }, { "epoch": 0.3637846655791191, "grad_norm": 0.1762763112783432, "learning_rate": 9.090538336052203e-06, "loss": 0.3815, "num_input_tokens_seen": 4835776, "step": 2230 }, { "epoch": 0.3646003262642741, "grad_norm": 0.10870002210140228, "learning_rate": 9.110929853181077e-06, "loss": 0.1083, "num_input_tokens_seen": 4846848, "step": 2235 }, { "epoch": 0.36541598694942906, "grad_norm": 0.1511486917734146, "learning_rate": 9.131321370309953e-06, "loss": 0.0096, "num_input_tokens_seen": 4857664, "step": 2240 }, { "epoch": 0.366231647634584, "grad_norm": 4.95281457901001, "learning_rate": 9.151712887438826e-06, "loss": 0.3865, "num_input_tokens_seen": 4868800, "step": 2245 }, { "epoch": 0.367047308319739, "grad_norm": 0.04483659937977791, "learning_rate": 9.1721044045677e-06, "loss": 0.3825, "num_input_tokens_seen": 4879776, "step": 2250 }, { "epoch": 0.36786296900489396, "grad_norm": 0.16324496269226074, "learning_rate": 9.192495921696574e-06, "loss": 0.3502, "num_input_tokens_seen": 4890816, "step": 2255 }, { "epoch": 0.36867862969004894, "grad_norm": 2.3887641429901123, "learning_rate": 9.21288743882545e-06, "loss": 0.2848, "num_input_tokens_seen": 4899136, "step": 2260 }, { "epoch": 0.3694942903752039, "grad_norm": 0.03572484105825424, "learning_rate": 9.233278955954323e-06, "loss": 0.1367, "num_input_tokens_seen": 4910112, "step": 2265 }, { "epoch": 0.3703099510603589, "grad_norm": 2.8488998413085938, "learning_rate": 9.253670473083197e-06, "loss": 0.4234, "num_input_tokens_seen": 4920032, "step": 2270 }, { "epoch": 0.37112561174551384, "grad_norm": 2.313248634338379, "learning_rate": 9.274061990212073e-06, "loss": 0.2471, "num_input_tokens_seen": 4930752, "step": 2275 }, { "epoch": 0.3719412724306688, "grad_norm": 0.04822679981589317, "learning_rate": 9.294453507340947e-06, "loss": 0.2174, "num_input_tokens_seen": 4940960, "step": 2280 }, { "epoch": 0.3727569331158238, "grad_norm": 2.1716558933258057, "learning_rate": 9.31484502446982e-06, "loss": 0.2848, "num_input_tokens_seen": 4952128, "step": 2285 }, { "epoch": 0.3735725938009788, "grad_norm": 1.4830292463302612, "learning_rate": 9.335236541598694e-06, "loss": 0.5019, "num_input_tokens_seen": 4962848, "step": 2290 }, { "epoch": 0.3743882544861338, "grad_norm": 1.999653935432434, "learning_rate": 9.35562805872757e-06, "loss": 0.0659, "num_input_tokens_seen": 4972448, "step": 2295 }, { "epoch": 0.37520391517128876, "grad_norm": 0.4622013568878174, "learning_rate": 9.376019575856444e-06, "loss": 0.2185, "num_input_tokens_seen": 4982432, "step": 2300 }, { "epoch": 0.37601957585644374, "grad_norm": 0.03528936952352524, "learning_rate": 9.396411092985319e-06, "loss": 0.0939, "num_input_tokens_seen": 4992800, "step": 2305 }, { "epoch": 0.3768352365415987, "grad_norm": 0.6311270594596863, "learning_rate": 9.416802610114193e-06, "loss": 0.0402, "num_input_tokens_seen": 5004352, "step": 2310 }, { "epoch": 0.37765089722675366, "grad_norm": 0.11481994390487671, "learning_rate": 9.437194127243067e-06, "loss": 0.2118, "num_input_tokens_seen": 5015968, "step": 2315 }, { "epoch": 0.37846655791190864, "grad_norm": 1.3783618211746216, "learning_rate": 9.45758564437194e-06, "loss": 0.0666, "num_input_tokens_seen": 5026624, "step": 2320 }, { "epoch": 0.3792822185970636, "grad_norm": 3.8794796466827393, "learning_rate": 9.477977161500816e-06, "loss": 0.1838, "num_input_tokens_seen": 5037312, "step": 2325 }, { "epoch": 0.3800978792822186, "grad_norm": 0.16678617894649506, "learning_rate": 9.49836867862969e-06, "loss": 0.2209, "num_input_tokens_seen": 5048544, "step": 2330 }, { "epoch": 0.3809135399673736, "grad_norm": 1.075762152671814, "learning_rate": 9.518760195758565e-06, "loss": 0.1917, "num_input_tokens_seen": 5059584, "step": 2335 }, { "epoch": 0.3817292006525285, "grad_norm": 2.2040605545043945, "learning_rate": 9.53915171288744e-06, "loss": 0.3295, "num_input_tokens_seen": 5072736, "step": 2340 }, { "epoch": 0.3825448613376835, "grad_norm": 1.067184329032898, "learning_rate": 9.559543230016315e-06, "loss": 0.6389, "num_input_tokens_seen": 5083456, "step": 2345 }, { "epoch": 0.3833605220228385, "grad_norm": 0.6784358024597168, "learning_rate": 9.579934747145187e-06, "loss": 0.1956, "num_input_tokens_seen": 5094240, "step": 2350 }, { "epoch": 0.3841761827079935, "grad_norm": 3.646688938140869, "learning_rate": 9.600326264274062e-06, "loss": 0.1972, "num_input_tokens_seen": 5103744, "step": 2355 }, { "epoch": 0.38499184339314846, "grad_norm": 2.430851936340332, "learning_rate": 9.620717781402936e-06, "loss": 0.2742, "num_input_tokens_seen": 5114048, "step": 2360 }, { "epoch": 0.38580750407830344, "grad_norm": 2.50156831741333, "learning_rate": 9.641109298531812e-06, "loss": 0.5062, "num_input_tokens_seen": 5123744, "step": 2365 }, { "epoch": 0.3866231647634584, "grad_norm": 2.3581511974334717, "learning_rate": 9.661500815660686e-06, "loss": 0.2739, "num_input_tokens_seen": 5133888, "step": 2370 }, { "epoch": 0.38743882544861336, "grad_norm": 0.025238124653697014, "learning_rate": 9.681892332789561e-06, "loss": 0.2247, "num_input_tokens_seen": 5145024, "step": 2375 }, { "epoch": 0.38825448613376834, "grad_norm": 0.9830336570739746, "learning_rate": 9.702283849918433e-06, "loss": 0.0302, "num_input_tokens_seen": 5156544, "step": 2380 }, { "epoch": 0.3890701468189233, "grad_norm": 2.4630117416381836, "learning_rate": 9.722675367047309e-06, "loss": 0.439, "num_input_tokens_seen": 5168192, "step": 2385 }, { "epoch": 0.3898858075040783, "grad_norm": 0.879244863986969, "learning_rate": 9.743066884176183e-06, "loss": 0.3195, "num_input_tokens_seen": 5178240, "step": 2390 }, { "epoch": 0.3907014681892333, "grad_norm": 0.06460980325937271, "learning_rate": 9.763458401305058e-06, "loss": 0.0923, "num_input_tokens_seen": 5189248, "step": 2395 }, { "epoch": 0.3915171288743883, "grad_norm": 1.3311941623687744, "learning_rate": 9.783849918433932e-06, "loss": 0.1188, "num_input_tokens_seen": 5199392, "step": 2400 }, { "epoch": 0.3923327895595432, "grad_norm": 3.32798433303833, "learning_rate": 9.804241435562807e-06, "loss": 0.2752, "num_input_tokens_seen": 5210240, "step": 2405 }, { "epoch": 0.3931484502446982, "grad_norm": 4.680781364440918, "learning_rate": 9.824632952691681e-06, "loss": 0.3948, "num_input_tokens_seen": 5220000, "step": 2410 }, { "epoch": 0.3939641109298532, "grad_norm": 2.3756251335144043, "learning_rate": 9.845024469820555e-06, "loss": 0.0973, "num_input_tokens_seen": 5231520, "step": 2415 }, { "epoch": 0.39477977161500816, "grad_norm": 3.2637367248535156, "learning_rate": 9.865415986949429e-06, "loss": 0.3451, "num_input_tokens_seen": 5243200, "step": 2420 }, { "epoch": 0.39559543230016314, "grad_norm": 2.632678747177124, "learning_rate": 9.885807504078304e-06, "loss": 0.1705, "num_input_tokens_seen": 5253632, "step": 2425 }, { "epoch": 0.3964110929853181, "grad_norm": 0.29987868666648865, "learning_rate": 9.906199021207178e-06, "loss": 0.1888, "num_input_tokens_seen": 5264736, "step": 2430 }, { "epoch": 0.3972267536704731, "grad_norm": 0.6762347221374512, "learning_rate": 9.926590538336054e-06, "loss": 0.276, "num_input_tokens_seen": 5276192, "step": 2435 }, { "epoch": 0.39804241435562804, "grad_norm": 0.06295319646596909, "learning_rate": 9.946982055464928e-06, "loss": 0.2301, "num_input_tokens_seen": 5287296, "step": 2440 }, { "epoch": 0.398858075040783, "grad_norm": 1.109140396118164, "learning_rate": 9.967373572593801e-06, "loss": 0.312, "num_input_tokens_seen": 5298336, "step": 2445 }, { "epoch": 0.399673735725938, "grad_norm": 0.17041422426700592, "learning_rate": 9.987765089722675e-06, "loss": 0.1243, "num_input_tokens_seen": 5309248, "step": 2450 }, { "epoch": 0.400489396411093, "grad_norm": 0.40360745787620544, "learning_rate": 1.000815660685155e-05, "loss": 0.5038, "num_input_tokens_seen": 5320384, "step": 2455 }, { "epoch": 0.401305057096248, "grad_norm": 0.15627728402614594, "learning_rate": 1.0028548123980425e-05, "loss": 0.0403, "num_input_tokens_seen": 5331904, "step": 2460 }, { "epoch": 0.40212071778140296, "grad_norm": 0.09619128704071045, "learning_rate": 1.00489396411093e-05, "loss": 0.2566, "num_input_tokens_seen": 5343904, "step": 2465 }, { "epoch": 0.4029363784665579, "grad_norm": 0.7849164605140686, "learning_rate": 1.0069331158238174e-05, "loss": 0.0988, "num_input_tokens_seen": 5354336, "step": 2470 }, { "epoch": 0.40375203915171287, "grad_norm": 0.3961533308029175, "learning_rate": 1.0089722675367048e-05, "loss": 0.32, "num_input_tokens_seen": 5366144, "step": 2475 }, { "epoch": 0.40456769983686786, "grad_norm": 3.5211946964263916, "learning_rate": 1.0110114192495921e-05, "loss": 0.1461, "num_input_tokens_seen": 5377344, "step": 2480 }, { "epoch": 0.40538336052202284, "grad_norm": 2.0909061431884766, "learning_rate": 1.0130505709624797e-05, "loss": 0.1045, "num_input_tokens_seen": 5386752, "step": 2485 }, { "epoch": 0.4061990212071778, "grad_norm": 0.07640524953603745, "learning_rate": 1.015089722675367e-05, "loss": 0.096, "num_input_tokens_seen": 5396384, "step": 2490 }, { "epoch": 0.4070146818923328, "grad_norm": 3.2305779457092285, "learning_rate": 1.0171288743882545e-05, "loss": 0.2788, "num_input_tokens_seen": 5408256, "step": 2495 }, { "epoch": 0.4078303425774878, "grad_norm": 0.15801499783992767, "learning_rate": 1.019168026101142e-05, "loss": 0.2226, "num_input_tokens_seen": 5418752, "step": 2500 }, { "epoch": 0.4086460032626427, "grad_norm": 0.03341606259346008, "learning_rate": 1.0212071778140294e-05, "loss": 0.1557, "num_input_tokens_seen": 5429792, "step": 2505 }, { "epoch": 0.4094616639477977, "grad_norm": 3.0574679374694824, "learning_rate": 1.0232463295269168e-05, "loss": 0.3616, "num_input_tokens_seen": 5440608, "step": 2510 }, { "epoch": 0.4102773246329527, "grad_norm": 1.80524480342865, "learning_rate": 1.0252854812398043e-05, "loss": 0.3613, "num_input_tokens_seen": 5450016, "step": 2515 }, { "epoch": 0.4110929853181077, "grad_norm": 6.020650863647461, "learning_rate": 1.0273246329526917e-05, "loss": 0.6025, "num_input_tokens_seen": 5461120, "step": 2520 }, { "epoch": 0.41190864600326266, "grad_norm": 3.5118343830108643, "learning_rate": 1.0293637846655791e-05, "loss": 0.0903, "num_input_tokens_seen": 5471648, "step": 2525 }, { "epoch": 0.41272430668841764, "grad_norm": 0.3511371910572052, "learning_rate": 1.0314029363784666e-05, "loss": 0.3465, "num_input_tokens_seen": 5482656, "step": 2530 }, { "epoch": 0.41353996737357257, "grad_norm": 2.4649465084075928, "learning_rate": 1.033442088091354e-05, "loss": 0.0684, "num_input_tokens_seen": 5491168, "step": 2535 }, { "epoch": 0.41435562805872755, "grad_norm": 0.6195061206817627, "learning_rate": 1.0354812398042416e-05, "loss": 0.3827, "num_input_tokens_seen": 5501792, "step": 2540 }, { "epoch": 0.41517128874388254, "grad_norm": 2.1570937633514404, "learning_rate": 1.0375203915171288e-05, "loss": 0.4043, "num_input_tokens_seen": 5513152, "step": 2545 }, { "epoch": 0.4159869494290375, "grad_norm": 0.4406980276107788, "learning_rate": 1.0395595432300163e-05, "loss": 0.0484, "num_input_tokens_seen": 5524480, "step": 2550 }, { "epoch": 0.4168026101141925, "grad_norm": 0.03811057284474373, "learning_rate": 1.0415986949429037e-05, "loss": 0.1424, "num_input_tokens_seen": 5535008, "step": 2555 }, { "epoch": 0.4176182707993475, "grad_norm": 0.5462912917137146, "learning_rate": 1.0436378466557913e-05, "loss": 0.108, "num_input_tokens_seen": 5546048, "step": 2560 }, { "epoch": 0.4184339314845024, "grad_norm": 4.163430690765381, "learning_rate": 1.0456769983686787e-05, "loss": 0.4851, "num_input_tokens_seen": 5557760, "step": 2565 }, { "epoch": 0.4192495921696574, "grad_norm": 0.2540128827095032, "learning_rate": 1.0477161500815662e-05, "loss": 0.312, "num_input_tokens_seen": 5567488, "step": 2570 }, { "epoch": 0.4200652528548124, "grad_norm": 2.148900270462036, "learning_rate": 1.0497553017944534e-05, "loss": 0.3677, "num_input_tokens_seen": 5578592, "step": 2575 }, { "epoch": 0.42088091353996737, "grad_norm": 1.6073061227798462, "learning_rate": 1.051794453507341e-05, "loss": 0.0575, "num_input_tokens_seen": 5588576, "step": 2580 }, { "epoch": 0.42169657422512236, "grad_norm": 1.1934785842895508, "learning_rate": 1.0538336052202284e-05, "loss": 0.1166, "num_input_tokens_seen": 5599264, "step": 2585 }, { "epoch": 0.42251223491027734, "grad_norm": 0.2245580404996872, "learning_rate": 1.0558727569331159e-05, "loss": 0.1589, "num_input_tokens_seen": 5609504, "step": 2590 }, { "epoch": 0.4233278955954323, "grad_norm": 5.02882194519043, "learning_rate": 1.0579119086460033e-05, "loss": 0.3561, "num_input_tokens_seen": 5620256, "step": 2595 }, { "epoch": 0.42414355628058725, "grad_norm": 2.394925594329834, "learning_rate": 1.0599510603588908e-05, "loss": 0.5994, "num_input_tokens_seen": 5630592, "step": 2600 }, { "epoch": 0.42495921696574224, "grad_norm": 3.714993953704834, "learning_rate": 1.061990212071778e-05, "loss": 0.4714, "num_input_tokens_seen": 5641536, "step": 2605 }, { "epoch": 0.4257748776508972, "grad_norm": 0.16938544809818268, "learning_rate": 1.0640293637846656e-05, "loss": 0.0271, "num_input_tokens_seen": 5652416, "step": 2610 }, { "epoch": 0.4265905383360522, "grad_norm": 2.330188035964966, "learning_rate": 1.066068515497553e-05, "loss": 0.2538, "num_input_tokens_seen": 5662528, "step": 2615 }, { "epoch": 0.4274061990212072, "grad_norm": 2.211507797241211, "learning_rate": 1.0681076672104405e-05, "loss": 0.223, "num_input_tokens_seen": 5672288, "step": 2620 }, { "epoch": 0.4282218597063622, "grad_norm": 0.9671165347099304, "learning_rate": 1.070146818923328e-05, "loss": 0.0976, "num_input_tokens_seen": 5682976, "step": 2625 }, { "epoch": 0.4290375203915171, "grad_norm": 0.382793128490448, "learning_rate": 1.0721859706362155e-05, "loss": 0.105, "num_input_tokens_seen": 5694240, "step": 2630 }, { "epoch": 0.4298531810766721, "grad_norm": 0.0711759701371193, "learning_rate": 1.0742251223491029e-05, "loss": 0.1578, "num_input_tokens_seen": 5704192, "step": 2635 }, { "epoch": 0.43066884176182707, "grad_norm": 0.16581550240516663, "learning_rate": 1.0762642740619902e-05, "loss": 0.0991, "num_input_tokens_seen": 5715968, "step": 2640 }, { "epoch": 0.43148450244698205, "grad_norm": 0.2977418005466461, "learning_rate": 1.0783034257748776e-05, "loss": 0.078, "num_input_tokens_seen": 5726784, "step": 2645 }, { "epoch": 0.43230016313213704, "grad_norm": 3.519282817840576, "learning_rate": 1.0803425774877652e-05, "loss": 0.4613, "num_input_tokens_seen": 5737952, "step": 2650 }, { "epoch": 0.433115823817292, "grad_norm": 0.6504804491996765, "learning_rate": 1.0823817292006526e-05, "loss": 0.1405, "num_input_tokens_seen": 5748512, "step": 2655 }, { "epoch": 0.433931484502447, "grad_norm": 0.11351238936185837, "learning_rate": 1.0844208809135401e-05, "loss": 0.2177, "num_input_tokens_seen": 5758944, "step": 2660 }, { "epoch": 0.43474714518760194, "grad_norm": 1.3033699989318848, "learning_rate": 1.0864600326264275e-05, "loss": 0.326, "num_input_tokens_seen": 5769632, "step": 2665 }, { "epoch": 0.4355628058727569, "grad_norm": 2.032930374145508, "learning_rate": 1.0884991843393149e-05, "loss": 0.15, "num_input_tokens_seen": 5781248, "step": 2670 }, { "epoch": 0.4363784665579119, "grad_norm": 0.4192090630531311, "learning_rate": 1.0905383360522023e-05, "loss": 0.2446, "num_input_tokens_seen": 5792192, "step": 2675 }, { "epoch": 0.4371941272430669, "grad_norm": 1.0949187278747559, "learning_rate": 1.0925774877650898e-05, "loss": 0.0698, "num_input_tokens_seen": 5803264, "step": 2680 }, { "epoch": 0.43800978792822187, "grad_norm": 0.02187405154109001, "learning_rate": 1.0946166394779772e-05, "loss": 0.074, "num_input_tokens_seen": 5815072, "step": 2685 }, { "epoch": 0.43882544861337686, "grad_norm": 0.09475145488977432, "learning_rate": 1.0966557911908647e-05, "loss": 0.3131, "num_input_tokens_seen": 5825248, "step": 2690 }, { "epoch": 0.4396411092985318, "grad_norm": 3.4036996364593506, "learning_rate": 1.0986949429037521e-05, "loss": 0.3418, "num_input_tokens_seen": 5835904, "step": 2695 }, { "epoch": 0.44045676998368677, "grad_norm": 2.0667238235473633, "learning_rate": 1.1007340946166395e-05, "loss": 0.2068, "num_input_tokens_seen": 5844736, "step": 2700 }, { "epoch": 0.44127243066884175, "grad_norm": 0.12551303207874298, "learning_rate": 1.1027732463295269e-05, "loss": 0.4, "num_input_tokens_seen": 5856672, "step": 2705 }, { "epoch": 0.44208809135399674, "grad_norm": 0.03617444634437561, "learning_rate": 1.1048123980424144e-05, "loss": 0.182, "num_input_tokens_seen": 5866848, "step": 2710 }, { "epoch": 0.4429037520391517, "grad_norm": 0.15304698050022125, "learning_rate": 1.1068515497553018e-05, "loss": 0.0394, "num_input_tokens_seen": 5877856, "step": 2715 }, { "epoch": 0.4437194127243067, "grad_norm": 0.08448563516139984, "learning_rate": 1.1088907014681894e-05, "loss": 0.1898, "num_input_tokens_seen": 5888896, "step": 2720 }, { "epoch": 0.4445350734094617, "grad_norm": 0.2472865879535675, "learning_rate": 1.1109298531810768e-05, "loss": 0.2505, "num_input_tokens_seen": 5899872, "step": 2725 }, { "epoch": 0.4453507340946166, "grad_norm": 2.9898886680603027, "learning_rate": 1.1129690048939641e-05, "loss": 0.1542, "num_input_tokens_seen": 5910272, "step": 2730 }, { "epoch": 0.4461663947797716, "grad_norm": 2.7423582077026367, "learning_rate": 1.1150081566068515e-05, "loss": 0.2802, "num_input_tokens_seen": 5921312, "step": 2735 }, { "epoch": 0.4469820554649266, "grad_norm": 0.978839635848999, "learning_rate": 1.117047308319739e-05, "loss": 0.1235, "num_input_tokens_seen": 5932032, "step": 2740 }, { "epoch": 0.44779771615008157, "grad_norm": 0.3982903063297272, "learning_rate": 1.1190864600326265e-05, "loss": 0.1224, "num_input_tokens_seen": 5942176, "step": 2745 }, { "epoch": 0.44861337683523655, "grad_norm": 4.047948360443115, "learning_rate": 1.1211256117455138e-05, "loss": 0.6411, "num_input_tokens_seen": 5954336, "step": 2750 }, { "epoch": 0.44942903752039154, "grad_norm": 6.068945407867432, "learning_rate": 1.1231647634584014e-05, "loss": 0.4493, "num_input_tokens_seen": 5965152, "step": 2755 }, { "epoch": 0.45024469820554647, "grad_norm": 1.5082937479019165, "learning_rate": 1.1252039151712888e-05, "loss": 0.3033, "num_input_tokens_seen": 5977216, "step": 2760 }, { "epoch": 0.45106035889070145, "grad_norm": 0.8583357930183411, "learning_rate": 1.1272430668841763e-05, "loss": 0.2326, "num_input_tokens_seen": 5988000, "step": 2765 }, { "epoch": 0.45187601957585644, "grad_norm": 3.6348094940185547, "learning_rate": 1.1292822185970635e-05, "loss": 0.1897, "num_input_tokens_seen": 5998400, "step": 2770 }, { "epoch": 0.4526916802610114, "grad_norm": 1.8462319374084473, "learning_rate": 1.1313213703099511e-05, "loss": 0.2611, "num_input_tokens_seen": 6010176, "step": 2775 }, { "epoch": 0.4535073409461664, "grad_norm": 2.8028063774108887, "learning_rate": 1.1333605220228385e-05, "loss": 0.1845, "num_input_tokens_seen": 6021312, "step": 2780 }, { "epoch": 0.4543230016313214, "grad_norm": 2.7420802116394043, "learning_rate": 1.135399673735726e-05, "loss": 0.2288, "num_input_tokens_seen": 6033024, "step": 2785 }, { "epoch": 0.4551386623164764, "grad_norm": 0.051115527749061584, "learning_rate": 1.1374388254486134e-05, "loss": 0.1196, "num_input_tokens_seen": 6042432, "step": 2790 }, { "epoch": 0.4559543230016313, "grad_norm": 2.1143031120300293, "learning_rate": 1.139477977161501e-05, "loss": 0.27, "num_input_tokens_seen": 6054304, "step": 2795 }, { "epoch": 0.4567699836867863, "grad_norm": 1.533827543258667, "learning_rate": 1.1415171288743882e-05, "loss": 0.1915, "num_input_tokens_seen": 6065312, "step": 2800 }, { "epoch": 0.45758564437194127, "grad_norm": 2.405583381652832, "learning_rate": 1.1435562805872757e-05, "loss": 0.4966, "num_input_tokens_seen": 6075072, "step": 2805 }, { "epoch": 0.45840130505709625, "grad_norm": 2.0803134441375732, "learning_rate": 1.1455954323001631e-05, "loss": 0.0717, "num_input_tokens_seen": 6084992, "step": 2810 }, { "epoch": 0.45921696574225124, "grad_norm": 4.4311676025390625, "learning_rate": 1.1476345840130507e-05, "loss": 0.1337, "num_input_tokens_seen": 6095424, "step": 2815 }, { "epoch": 0.4600326264274062, "grad_norm": 1.7138910293579102, "learning_rate": 1.149673735725938e-05, "loss": 0.1333, "num_input_tokens_seen": 6107328, "step": 2820 }, { "epoch": 0.46084828711256115, "grad_norm": 0.372507244348526, "learning_rate": 1.1517128874388256e-05, "loss": 0.0131, "num_input_tokens_seen": 6118048, "step": 2825 }, { "epoch": 0.46166394779771613, "grad_norm": 2.5327861309051514, "learning_rate": 1.153752039151713e-05, "loss": 0.4867, "num_input_tokens_seen": 6128512, "step": 2830 }, { "epoch": 0.4624796084828711, "grad_norm": 0.5594280958175659, "learning_rate": 1.1557911908646004e-05, "loss": 0.0464, "num_input_tokens_seen": 6138720, "step": 2835 }, { "epoch": 0.4632952691680261, "grad_norm": 4.3226165771484375, "learning_rate": 1.1578303425774877e-05, "loss": 0.3125, "num_input_tokens_seen": 6149600, "step": 2840 }, { "epoch": 0.4641109298531811, "grad_norm": 3.957191228866577, "learning_rate": 1.1598694942903753e-05, "loss": 0.1762, "num_input_tokens_seen": 6160608, "step": 2845 }, { "epoch": 0.46492659053833607, "grad_norm": 1.8001214265823364, "learning_rate": 1.1619086460032627e-05, "loss": 0.282, "num_input_tokens_seen": 6171968, "step": 2850 }, { "epoch": 0.46574225122349105, "grad_norm": 2.0602316856384277, "learning_rate": 1.1639477977161502e-05, "loss": 0.1962, "num_input_tokens_seen": 6183360, "step": 2855 }, { "epoch": 0.466557911908646, "grad_norm": 1.9378005266189575, "learning_rate": 1.1659869494290376e-05, "loss": 0.0824, "num_input_tokens_seen": 6193664, "step": 2860 }, { "epoch": 0.46737357259380097, "grad_norm": 0.623241126537323, "learning_rate": 1.168026101141925e-05, "loss": 0.0409, "num_input_tokens_seen": 6204320, "step": 2865 }, { "epoch": 0.46818923327895595, "grad_norm": 2.3501925468444824, "learning_rate": 1.1700652528548124e-05, "loss": 0.3969, "num_input_tokens_seen": 6214560, "step": 2870 }, { "epoch": 0.46900489396411094, "grad_norm": 0.9982662796974182, "learning_rate": 1.1721044045677e-05, "loss": 0.3594, "num_input_tokens_seen": 6226176, "step": 2875 }, { "epoch": 0.4698205546492659, "grad_norm": 0.9709409475326538, "learning_rate": 1.1741435562805873e-05, "loss": 0.6178, "num_input_tokens_seen": 6236768, "step": 2880 }, { "epoch": 0.4706362153344209, "grad_norm": 3.643327236175537, "learning_rate": 1.1761827079934749e-05, "loss": 0.2086, "num_input_tokens_seen": 6247424, "step": 2885 }, { "epoch": 0.47145187601957583, "grad_norm": 0.42564088106155396, "learning_rate": 1.1782218597063622e-05, "loss": 0.0575, "num_input_tokens_seen": 6257280, "step": 2890 }, { "epoch": 0.4722675367047308, "grad_norm": 2.2364091873168945, "learning_rate": 1.1802610114192498e-05, "loss": 0.2357, "num_input_tokens_seen": 6268160, "step": 2895 }, { "epoch": 0.4730831973898858, "grad_norm": 0.14197373390197754, "learning_rate": 1.182300163132137e-05, "loss": 0.0907, "num_input_tokens_seen": 6278976, "step": 2900 }, { "epoch": 0.4738988580750408, "grad_norm": 0.03590124472975731, "learning_rate": 1.1843393148450246e-05, "loss": 0.2266, "num_input_tokens_seen": 6290144, "step": 2905 }, { "epoch": 0.47471451876019577, "grad_norm": 0.9708537459373474, "learning_rate": 1.186378466557912e-05, "loss": 0.1051, "num_input_tokens_seen": 6301760, "step": 2910 }, { "epoch": 0.47553017944535075, "grad_norm": 0.06616406887769699, "learning_rate": 1.1884176182707995e-05, "loss": 0.5611, "num_input_tokens_seen": 6312416, "step": 2915 }, { "epoch": 0.4763458401305057, "grad_norm": 0.41756701469421387, "learning_rate": 1.1904567699836869e-05, "loss": 0.1318, "num_input_tokens_seen": 6323488, "step": 2920 }, { "epoch": 0.47716150081566067, "grad_norm": 2.489903688430786, "learning_rate": 1.1924959216965744e-05, "loss": 0.3946, "num_input_tokens_seen": 6334496, "step": 2925 }, { "epoch": 0.47797716150081565, "grad_norm": 1.6376978158950806, "learning_rate": 1.1945350734094616e-05, "loss": 0.4419, "num_input_tokens_seen": 6344320, "step": 2930 }, { "epoch": 0.47879282218597063, "grad_norm": 2.681023597717285, "learning_rate": 1.1965742251223492e-05, "loss": 0.3277, "num_input_tokens_seen": 6355904, "step": 2935 }, { "epoch": 0.4796084828711256, "grad_norm": 2.495313882827759, "learning_rate": 1.1986133768352366e-05, "loss": 0.4008, "num_input_tokens_seen": 6366016, "step": 2940 }, { "epoch": 0.4804241435562806, "grad_norm": 2.148643970489502, "learning_rate": 1.2006525285481241e-05, "loss": 0.081, "num_input_tokens_seen": 6375712, "step": 2945 }, { "epoch": 0.4812398042414356, "grad_norm": 0.08061229437589645, "learning_rate": 1.2026916802610115e-05, "loss": 0.1307, "num_input_tokens_seen": 6386720, "step": 2950 }, { "epoch": 0.4820554649265905, "grad_norm": 0.0381893664598465, "learning_rate": 1.2047308319738989e-05, "loss": 0.1092, "num_input_tokens_seen": 6397536, "step": 2955 }, { "epoch": 0.4828711256117455, "grad_norm": 2.440633773803711, "learning_rate": 1.2067699836867863e-05, "loss": 0.2775, "num_input_tokens_seen": 6408384, "step": 2960 }, { "epoch": 0.4836867862969005, "grad_norm": 1.6051976680755615, "learning_rate": 1.2088091353996738e-05, "loss": 0.1411, "num_input_tokens_seen": 6420640, "step": 2965 }, { "epoch": 0.48450244698205547, "grad_norm": 0.13026829063892365, "learning_rate": 1.2108482871125612e-05, "loss": 0.208, "num_input_tokens_seen": 6431680, "step": 2970 }, { "epoch": 0.48531810766721045, "grad_norm": 0.7724660038948059, "learning_rate": 1.2128874388254486e-05, "loss": 0.0776, "num_input_tokens_seen": 6442368, "step": 2975 }, { "epoch": 0.48613376835236544, "grad_norm": 0.009828317910432816, "learning_rate": 1.2149265905383361e-05, "loss": 0.1199, "num_input_tokens_seen": 6453856, "step": 2980 }, { "epoch": 0.48694942903752036, "grad_norm": 0.9046329855918884, "learning_rate": 1.2169657422512235e-05, "loss": 0.3692, "num_input_tokens_seen": 6464640, "step": 2985 }, { "epoch": 0.48776508972267535, "grad_norm": 4.113231182098389, "learning_rate": 1.219004893964111e-05, "loss": 0.2179, "num_input_tokens_seen": 6475392, "step": 2990 }, { "epoch": 0.48858075040783033, "grad_norm": 1.3222898244857788, "learning_rate": 1.2210440456769985e-05, "loss": 0.1634, "num_input_tokens_seen": 6486880, "step": 2995 }, { "epoch": 0.4893964110929853, "grad_norm": 0.09175632148981094, "learning_rate": 1.2230831973898858e-05, "loss": 0.14, "num_input_tokens_seen": 6497216, "step": 3000 }, { "epoch": 0.4902120717781403, "grad_norm": 1.9849364757537842, "learning_rate": 1.2251223491027732e-05, "loss": 0.2846, "num_input_tokens_seen": 6508320, "step": 3005 }, { "epoch": 0.4910277324632953, "grad_norm": 2.906438112258911, "learning_rate": 1.2271615008156608e-05, "loss": 0.3147, "num_input_tokens_seen": 6519520, "step": 3010 }, { "epoch": 0.49184339314845027, "grad_norm": 1.2984158992767334, "learning_rate": 1.2292006525285482e-05, "loss": 0.2018, "num_input_tokens_seen": 6530240, "step": 3015 }, { "epoch": 0.4926590538336052, "grad_norm": 2.2028419971466064, "learning_rate": 1.2312398042414357e-05, "loss": 0.4027, "num_input_tokens_seen": 6540192, "step": 3020 }, { "epoch": 0.4934747145187602, "grad_norm": 0.28005537390708923, "learning_rate": 1.233278955954323e-05, "loss": 0.1055, "num_input_tokens_seen": 6550016, "step": 3025 }, { "epoch": 0.49429037520391517, "grad_norm": 0.18810728192329407, "learning_rate": 1.2353181076672105e-05, "loss": 0.1489, "num_input_tokens_seen": 6560672, "step": 3030 }, { "epoch": 0.49510603588907015, "grad_norm": 0.2771707773208618, "learning_rate": 1.2373572593800978e-05, "loss": 0.0875, "num_input_tokens_seen": 6571808, "step": 3035 }, { "epoch": 0.49592169657422513, "grad_norm": 1.215230107307434, "learning_rate": 1.2393964110929854e-05, "loss": 0.3464, "num_input_tokens_seen": 6582624, "step": 3040 }, { "epoch": 0.4967373572593801, "grad_norm": 0.4815494418144226, "learning_rate": 1.2414355628058728e-05, "loss": 0.3052, "num_input_tokens_seen": 6595008, "step": 3045 }, { "epoch": 0.49755301794453505, "grad_norm": 1.9709105491638184, "learning_rate": 1.2434747145187603e-05, "loss": 0.1467, "num_input_tokens_seen": 6606976, "step": 3050 }, { "epoch": 0.49836867862969003, "grad_norm": 2.6331074237823486, "learning_rate": 1.2455138662316477e-05, "loss": 0.1543, "num_input_tokens_seen": 6617760, "step": 3055 }, { "epoch": 0.499184339314845, "grad_norm": 3.6393673419952393, "learning_rate": 1.2475530179445351e-05, "loss": 0.1359, "num_input_tokens_seen": 6628896, "step": 3060 }, { "epoch": 0.5, "grad_norm": 3.94649600982666, "learning_rate": 1.2495921696574225e-05, "loss": 0.3561, "num_input_tokens_seen": 6639424, "step": 3065 }, { "epoch": 0.5008156606851549, "grad_norm": 1.7874926328659058, "learning_rate": 1.2516313213703102e-05, "loss": 0.302, "num_input_tokens_seen": 6649280, "step": 3070 }, { "epoch": 0.50163132137031, "grad_norm": 0.7427738308906555, "learning_rate": 1.2536704730831974e-05, "loss": 0.1157, "num_input_tokens_seen": 6659008, "step": 3075 }, { "epoch": 0.5024469820554649, "grad_norm": 2.215290069580078, "learning_rate": 1.2557096247960848e-05, "loss": 0.1948, "num_input_tokens_seen": 6669312, "step": 3080 }, { "epoch": 0.5032626427406199, "grad_norm": 2.2863759994506836, "learning_rate": 1.2577487765089723e-05, "loss": 0.4479, "num_input_tokens_seen": 6679968, "step": 3085 }, { "epoch": 0.5040783034257749, "grad_norm": 3.063027858734131, "learning_rate": 1.2597879282218597e-05, "loss": 0.26, "num_input_tokens_seen": 6689728, "step": 3090 }, { "epoch": 0.5048939641109299, "grad_norm": 2.4226479530334473, "learning_rate": 1.2618270799347473e-05, "loss": 0.6018, "num_input_tokens_seen": 6700448, "step": 3095 }, { "epoch": 0.5057096247960848, "grad_norm": 2.2188169956207275, "learning_rate": 1.2638662316476347e-05, "loss": 0.1564, "num_input_tokens_seen": 6711424, "step": 3100 }, { "epoch": 0.5065252854812398, "grad_norm": 1.2756606340408325, "learning_rate": 1.2659053833605219e-05, "loss": 0.0642, "num_input_tokens_seen": 6722752, "step": 3105 }, { "epoch": 0.5073409461663948, "grad_norm": 0.5937644243240356, "learning_rate": 1.2679445350734096e-05, "loss": 0.4945, "num_input_tokens_seen": 6734080, "step": 3110 }, { "epoch": 0.5081566068515497, "grad_norm": 0.41466352343559265, "learning_rate": 1.2699836867862968e-05, "loss": 0.0787, "num_input_tokens_seen": 6744672, "step": 3115 }, { "epoch": 0.5089722675367048, "grad_norm": 2.3938405513763428, "learning_rate": 1.2720228384991845e-05, "loss": 0.2613, "num_input_tokens_seen": 6755104, "step": 3120 }, { "epoch": 0.5097879282218597, "grad_norm": 1.7781873941421509, "learning_rate": 1.2740619902120717e-05, "loss": 0.2699, "num_input_tokens_seen": 6767168, "step": 3125 }, { "epoch": 0.5106035889070146, "grad_norm": 0.09250271320343018, "learning_rate": 1.2761011419249595e-05, "loss": 0.2118, "num_input_tokens_seen": 6777472, "step": 3130 }, { "epoch": 0.5114192495921697, "grad_norm": 3.964479684829712, "learning_rate": 1.2781402936378467e-05, "loss": 0.3344, "num_input_tokens_seen": 6787776, "step": 3135 }, { "epoch": 0.5122349102773246, "grad_norm": 1.8454570770263672, "learning_rate": 1.280179445350734e-05, "loss": 0.3943, "num_input_tokens_seen": 6799008, "step": 3140 }, { "epoch": 0.5130505709624796, "grad_norm": 0.08018161356449127, "learning_rate": 1.2822185970636216e-05, "loss": 0.1362, "num_input_tokens_seen": 6809440, "step": 3145 }, { "epoch": 0.5138662316476346, "grad_norm": 0.05553530156612396, "learning_rate": 1.284257748776509e-05, "loss": 0.0624, "num_input_tokens_seen": 6819456, "step": 3150 }, { "epoch": 0.5146818923327896, "grad_norm": 2.3732383251190186, "learning_rate": 1.2862969004893965e-05, "loss": 0.1007, "num_input_tokens_seen": 6829312, "step": 3155 }, { "epoch": 0.5154975530179445, "grad_norm": 1.6976927518844604, "learning_rate": 1.288336052202284e-05, "loss": 0.4687, "num_input_tokens_seen": 6839456, "step": 3160 }, { "epoch": 0.5163132137030995, "grad_norm": 0.2698207497596741, "learning_rate": 1.2903752039151715e-05, "loss": 0.3085, "num_input_tokens_seen": 6850752, "step": 3165 }, { "epoch": 0.5171288743882545, "grad_norm": 1.0551974773406982, "learning_rate": 1.2924143556280589e-05, "loss": 0.1192, "num_input_tokens_seen": 6862144, "step": 3170 }, { "epoch": 0.5179445350734094, "grad_norm": 3.5197205543518066, "learning_rate": 1.294453507340946e-05, "loss": 0.2326, "num_input_tokens_seen": 6872704, "step": 3175 }, { "epoch": 0.5187601957585645, "grad_norm": 3.3070504665374756, "learning_rate": 1.2964926590538336e-05, "loss": 0.1858, "num_input_tokens_seen": 6883232, "step": 3180 }, { "epoch": 0.5195758564437194, "grad_norm": 2.3119945526123047, "learning_rate": 1.298531810766721e-05, "loss": 0.3418, "num_input_tokens_seen": 6894368, "step": 3185 }, { "epoch": 0.5203915171288744, "grad_norm": 2.1621358394622803, "learning_rate": 1.3005709624796086e-05, "loss": 0.212, "num_input_tokens_seen": 6906336, "step": 3190 }, { "epoch": 0.5212071778140294, "grad_norm": 0.1250164657831192, "learning_rate": 1.302610114192496e-05, "loss": 0.1273, "num_input_tokens_seen": 6917440, "step": 3195 }, { "epoch": 0.5220228384991843, "grad_norm": 1.969495415687561, "learning_rate": 1.3046492659053835e-05, "loss": 0.4718, "num_input_tokens_seen": 6928736, "step": 3200 }, { "epoch": 0.5228384991843393, "grad_norm": 1.7941052913665771, "learning_rate": 1.3066884176182709e-05, "loss": 0.2197, "num_input_tokens_seen": 6939488, "step": 3205 }, { "epoch": 0.5236541598694943, "grad_norm": 0.3889951705932617, "learning_rate": 1.3087275693311583e-05, "loss": 0.046, "num_input_tokens_seen": 6950016, "step": 3210 }, { "epoch": 0.5244698205546493, "grad_norm": 2.690838098526001, "learning_rate": 1.3107667210440458e-05, "loss": 0.3526, "num_input_tokens_seen": 6961280, "step": 3215 }, { "epoch": 0.5252854812398042, "grad_norm": 1.2496626377105713, "learning_rate": 1.3128058727569332e-05, "loss": 0.0987, "num_input_tokens_seen": 6971360, "step": 3220 }, { "epoch": 0.5261011419249593, "grad_norm": 2.2789146900177, "learning_rate": 1.3148450244698207e-05, "loss": 0.52, "num_input_tokens_seen": 6982528, "step": 3225 }, { "epoch": 0.5269168026101142, "grad_norm": 0.6820924878120422, "learning_rate": 1.316884176182708e-05, "loss": 0.0967, "num_input_tokens_seen": 6992704, "step": 3230 }, { "epoch": 0.5277324632952691, "grad_norm": 1.7313612699508667, "learning_rate": 1.3189233278955953e-05, "loss": 0.3483, "num_input_tokens_seen": 7003072, "step": 3235 }, { "epoch": 0.5285481239804242, "grad_norm": 1.2402286529541016, "learning_rate": 1.3209624796084829e-05, "loss": 0.1927, "num_input_tokens_seen": 7013728, "step": 3240 }, { "epoch": 0.5293637846655791, "grad_norm": 2.2399699687957764, "learning_rate": 1.3230016313213703e-05, "loss": 0.1035, "num_input_tokens_seen": 7024256, "step": 3245 }, { "epoch": 0.5301794453507341, "grad_norm": 1.103711724281311, "learning_rate": 1.3250407830342578e-05, "loss": 0.4672, "num_input_tokens_seen": 7034688, "step": 3250 }, { "epoch": 0.5309951060358891, "grad_norm": 1.9034101963043213, "learning_rate": 1.3270799347471452e-05, "loss": 0.1605, "num_input_tokens_seen": 7046080, "step": 3255 }, { "epoch": 0.531810766721044, "grad_norm": 1.989866852760315, "learning_rate": 1.3291190864600328e-05, "loss": 0.2109, "num_input_tokens_seen": 7056032, "step": 3260 }, { "epoch": 0.532626427406199, "grad_norm": 2.013803482055664, "learning_rate": 1.3311582381729201e-05, "loss": 0.2165, "num_input_tokens_seen": 7066880, "step": 3265 }, { "epoch": 0.533442088091354, "grad_norm": 2.686363935470581, "learning_rate": 1.3331973898858075e-05, "loss": 0.374, "num_input_tokens_seen": 7077664, "step": 3270 }, { "epoch": 0.534257748776509, "grad_norm": 3.062865972518921, "learning_rate": 1.335236541598695e-05, "loss": 0.1719, "num_input_tokens_seen": 7088928, "step": 3275 }, { "epoch": 0.5350734094616639, "grad_norm": 0.7033872008323669, "learning_rate": 1.3372756933115823e-05, "loss": 0.2315, "num_input_tokens_seen": 7100800, "step": 3280 }, { "epoch": 0.535889070146819, "grad_norm": 1.7542067766189575, "learning_rate": 1.33931484502447e-05, "loss": 0.1882, "num_input_tokens_seen": 7112160, "step": 3285 }, { "epoch": 0.5367047308319739, "grad_norm": 0.8725832104682922, "learning_rate": 1.3413539967373572e-05, "loss": 0.2084, "num_input_tokens_seen": 7122848, "step": 3290 }, { "epoch": 0.5375203915171288, "grad_norm": 1.969261884689331, "learning_rate": 1.343393148450245e-05, "loss": 0.2348, "num_input_tokens_seen": 7132800, "step": 3295 }, { "epoch": 0.5383360522022839, "grad_norm": 0.3499806821346283, "learning_rate": 1.3454323001631322e-05, "loss": 0.142, "num_input_tokens_seen": 7144064, "step": 3300 }, { "epoch": 0.5391517128874388, "grad_norm": 1.450896978378296, "learning_rate": 1.3474714518760195e-05, "loss": 0.2005, "num_input_tokens_seen": 7155744, "step": 3305 }, { "epoch": 0.5399673735725938, "grad_norm": 0.18681150674819946, "learning_rate": 1.3495106035889071e-05, "loss": 0.0384, "num_input_tokens_seen": 7166336, "step": 3310 }, { "epoch": 0.5407830342577488, "grad_norm": 0.05888867750763893, "learning_rate": 1.3515497553017945e-05, "loss": 0.287, "num_input_tokens_seen": 7176832, "step": 3315 }, { "epoch": 0.5415986949429038, "grad_norm": 2.042929172515869, "learning_rate": 1.353588907014682e-05, "loss": 0.2365, "num_input_tokens_seen": 7187616, "step": 3320 }, { "epoch": 0.5424143556280587, "grad_norm": 0.04058634117245674, "learning_rate": 1.3556280587275694e-05, "loss": 0.205, "num_input_tokens_seen": 7198144, "step": 3325 }, { "epoch": 0.5432300163132137, "grad_norm": 1.8447086811065674, "learning_rate": 1.357667210440457e-05, "loss": 0.1434, "num_input_tokens_seen": 7208960, "step": 3330 }, { "epoch": 0.5440456769983687, "grad_norm": 0.04508940875530243, "learning_rate": 1.3597063621533443e-05, "loss": 0.4265, "num_input_tokens_seen": 7220064, "step": 3335 }, { "epoch": 0.5448613376835236, "grad_norm": 0.45537418127059937, "learning_rate": 1.3617455138662316e-05, "loss": 0.2715, "num_input_tokens_seen": 7231008, "step": 3340 }, { "epoch": 0.5456769983686787, "grad_norm": 1.7218809127807617, "learning_rate": 1.3637846655791193e-05, "loss": 0.4378, "num_input_tokens_seen": 7242144, "step": 3345 }, { "epoch": 0.5464926590538336, "grad_norm": 0.8339822888374329, "learning_rate": 1.3658238172920065e-05, "loss": 0.3236, "num_input_tokens_seen": 7252448, "step": 3350 }, { "epoch": 0.5473083197389886, "grad_norm": 0.20142516493797302, "learning_rate": 1.3678629690048942e-05, "loss": 0.181, "num_input_tokens_seen": 7261856, "step": 3355 }, { "epoch": 0.5481239804241436, "grad_norm": 0.24802237749099731, "learning_rate": 1.3699021207177814e-05, "loss": 0.1057, "num_input_tokens_seen": 7272800, "step": 3360 }, { "epoch": 0.5489396411092985, "grad_norm": 2.35129451751709, "learning_rate": 1.3719412724306688e-05, "loss": 0.2415, "num_input_tokens_seen": 7283808, "step": 3365 }, { "epoch": 0.5497553017944535, "grad_norm": 0.7507189512252808, "learning_rate": 1.3739804241435564e-05, "loss": 0.0459, "num_input_tokens_seen": 7294976, "step": 3370 }, { "epoch": 0.5505709624796085, "grad_norm": 0.9412408471107483, "learning_rate": 1.3760195758564437e-05, "loss": 0.1956, "num_input_tokens_seen": 7306560, "step": 3375 }, { "epoch": 0.5513866231647635, "grad_norm": 0.25092750787734985, "learning_rate": 1.3780587275693313e-05, "loss": 0.0288, "num_input_tokens_seen": 7317600, "step": 3380 }, { "epoch": 0.5522022838499184, "grad_norm": 2.204451084136963, "learning_rate": 1.3800978792822187e-05, "loss": 0.104, "num_input_tokens_seen": 7330144, "step": 3385 }, { "epoch": 0.5530179445350734, "grad_norm": 2.3068387508392334, "learning_rate": 1.3821370309951062e-05, "loss": 0.2228, "num_input_tokens_seen": 7341248, "step": 3390 }, { "epoch": 0.5538336052202284, "grad_norm": 2.147447109222412, "learning_rate": 1.3841761827079936e-05, "loss": 0.2886, "num_input_tokens_seen": 7351200, "step": 3395 }, { "epoch": 0.5546492659053833, "grad_norm": 0.9868392944335938, "learning_rate": 1.3862153344208808e-05, "loss": 0.04, "num_input_tokens_seen": 7361792, "step": 3400 }, { "epoch": 0.5554649265905384, "grad_norm": 2.1145732402801514, "learning_rate": 1.3882544861337685e-05, "loss": 0.1777, "num_input_tokens_seen": 7372160, "step": 3405 }, { "epoch": 0.5562805872756933, "grad_norm": 0.2551671862602234, "learning_rate": 1.3902936378466558e-05, "loss": 0.0734, "num_input_tokens_seen": 7381408, "step": 3410 }, { "epoch": 0.5570962479608483, "grad_norm": 3.229334592819214, "learning_rate": 1.3923327895595433e-05, "loss": 0.2951, "num_input_tokens_seen": 7392512, "step": 3415 }, { "epoch": 0.5579119086460033, "grad_norm": 1.613127589225769, "learning_rate": 1.3943719412724307e-05, "loss": 0.277, "num_input_tokens_seen": 7404192, "step": 3420 }, { "epoch": 0.5587275693311582, "grad_norm": 0.012924741953611374, "learning_rate": 1.3964110929853182e-05, "loss": 0.2104, "num_input_tokens_seen": 7415008, "step": 3425 }, { "epoch": 0.5595432300163132, "grad_norm": 1.5130163431167603, "learning_rate": 1.3984502446982056e-05, "loss": 0.1257, "num_input_tokens_seen": 7425088, "step": 3430 }, { "epoch": 0.5603588907014682, "grad_norm": 1.482914924621582, "learning_rate": 1.400489396411093e-05, "loss": 0.3503, "num_input_tokens_seen": 7436608, "step": 3435 }, { "epoch": 0.5611745513866232, "grad_norm": 0.5588006377220154, "learning_rate": 1.4025285481239806e-05, "loss": 0.2086, "num_input_tokens_seen": 7446784, "step": 3440 }, { "epoch": 0.5619902120717781, "grad_norm": 0.2482733428478241, "learning_rate": 1.404567699836868e-05, "loss": 0.2149, "num_input_tokens_seen": 7456640, "step": 3445 }, { "epoch": 0.5628058727569332, "grad_norm": 0.4095010757446289, "learning_rate": 1.4066068515497555e-05, "loss": 0.2954, "num_input_tokens_seen": 7468096, "step": 3450 }, { "epoch": 0.5636215334420881, "grad_norm": 1.7926872968673706, "learning_rate": 1.4086460032626429e-05, "loss": 0.2578, "num_input_tokens_seen": 7480096, "step": 3455 }, { "epoch": 0.564437194127243, "grad_norm": 0.7732520699501038, "learning_rate": 1.4106851549755301e-05, "loss": 0.1368, "num_input_tokens_seen": 7491136, "step": 3460 }, { "epoch": 0.5652528548123981, "grad_norm": 0.16162295639514923, "learning_rate": 1.4127243066884176e-05, "loss": 0.0962, "num_input_tokens_seen": 7502304, "step": 3465 }, { "epoch": 0.566068515497553, "grad_norm": 0.5016602873802185, "learning_rate": 1.414763458401305e-05, "loss": 0.1592, "num_input_tokens_seen": 7512928, "step": 3470 }, { "epoch": 0.566884176182708, "grad_norm": 0.1605583131313324, "learning_rate": 1.4168026101141926e-05, "loss": 0.2737, "num_input_tokens_seen": 7523904, "step": 3475 }, { "epoch": 0.567699836867863, "grad_norm": 2.4818360805511475, "learning_rate": 1.41884176182708e-05, "loss": 0.4419, "num_input_tokens_seen": 7534496, "step": 3480 }, { "epoch": 0.5685154975530179, "grad_norm": 2.3941597938537598, "learning_rate": 1.4208809135399675e-05, "loss": 0.4877, "num_input_tokens_seen": 7545120, "step": 3485 }, { "epoch": 0.5693311582381729, "grad_norm": 1.5087777376174927, "learning_rate": 1.4229200652528549e-05, "loss": 0.1305, "num_input_tokens_seen": 7555712, "step": 3490 }, { "epoch": 0.5701468189233279, "grad_norm": 0.445043683052063, "learning_rate": 1.4249592169657423e-05, "loss": 0.0639, "num_input_tokens_seen": 7567200, "step": 3495 }, { "epoch": 0.5709624796084829, "grad_norm": 2.477911949157715, "learning_rate": 1.4269983686786298e-05, "loss": 0.308, "num_input_tokens_seen": 7578464, "step": 3500 }, { "epoch": 0.5717781402936378, "grad_norm": 1.5432686805725098, "learning_rate": 1.429037520391517e-05, "loss": 0.2254, "num_input_tokens_seen": 7588256, "step": 3505 }, { "epoch": 0.5725938009787929, "grad_norm": 1.6002298593521118, "learning_rate": 1.4310766721044048e-05, "loss": 0.3492, "num_input_tokens_seen": 7598848, "step": 3510 }, { "epoch": 0.5734094616639478, "grad_norm": 1.9020211696624756, "learning_rate": 1.433115823817292e-05, "loss": 0.1528, "num_input_tokens_seen": 7609696, "step": 3515 }, { "epoch": 0.5742251223491027, "grad_norm": 0.48589545488357544, "learning_rate": 1.4351549755301797e-05, "loss": 0.0893, "num_input_tokens_seen": 7620864, "step": 3520 }, { "epoch": 0.5750407830342578, "grad_norm": 2.2743475437164307, "learning_rate": 1.4371941272430669e-05, "loss": 0.0906, "num_input_tokens_seen": 7631392, "step": 3525 }, { "epoch": 0.5758564437194127, "grad_norm": 0.32751768827438354, "learning_rate": 1.4392332789559543e-05, "loss": 0.1368, "num_input_tokens_seen": 7643232, "step": 3530 }, { "epoch": 0.5766721044045677, "grad_norm": 0.22598782181739807, "learning_rate": 1.4412724306688418e-05, "loss": 0.2441, "num_input_tokens_seen": 7654944, "step": 3535 }, { "epoch": 0.5774877650897227, "grad_norm": 1.9328452348709106, "learning_rate": 1.4433115823817292e-05, "loss": 0.121, "num_input_tokens_seen": 7665760, "step": 3540 }, { "epoch": 0.5783034257748777, "grad_norm": 0.18304021656513214, "learning_rate": 1.4453507340946168e-05, "loss": 0.0997, "num_input_tokens_seen": 7676576, "step": 3545 }, { "epoch": 0.5791190864600326, "grad_norm": 0.7518135905265808, "learning_rate": 1.4473898858075042e-05, "loss": 0.369, "num_input_tokens_seen": 7687104, "step": 3550 }, { "epoch": 0.5799347471451876, "grad_norm": 2.1949565410614014, "learning_rate": 1.4494290375203917e-05, "loss": 0.1137, "num_input_tokens_seen": 7698080, "step": 3555 }, { "epoch": 0.5807504078303426, "grad_norm": 1.3888131380081177, "learning_rate": 1.4514681892332791e-05, "loss": 0.2795, "num_input_tokens_seen": 7709632, "step": 3560 }, { "epoch": 0.5815660685154975, "grad_norm": 0.7627912759780884, "learning_rate": 1.4535073409461663e-05, "loss": 0.0711, "num_input_tokens_seen": 7719936, "step": 3565 }, { "epoch": 0.5823817292006526, "grad_norm": 0.5102983713150024, "learning_rate": 1.455546492659054e-05, "loss": 0.0991, "num_input_tokens_seen": 7730080, "step": 3570 }, { "epoch": 0.5831973898858075, "grad_norm": 1.226204752922058, "learning_rate": 1.4575856443719412e-05, "loss": 0.1623, "num_input_tokens_seen": 7740576, "step": 3575 }, { "epoch": 0.5840130505709625, "grad_norm": 0.039720240980386734, "learning_rate": 1.459624796084829e-05, "loss": 0.2797, "num_input_tokens_seen": 7751040, "step": 3580 }, { "epoch": 0.5848287112561175, "grad_norm": 0.024539321660995483, "learning_rate": 1.4616639477977162e-05, "loss": 0.0735, "num_input_tokens_seen": 7760768, "step": 3585 }, { "epoch": 0.5856443719412724, "grad_norm": 0.16835741698741913, "learning_rate": 1.4637030995106035e-05, "loss": 0.0824, "num_input_tokens_seen": 7771680, "step": 3590 }, { "epoch": 0.5864600326264274, "grad_norm": 0.2900225818157196, "learning_rate": 1.4657422512234911e-05, "loss": 0.1187, "num_input_tokens_seen": 7783072, "step": 3595 }, { "epoch": 0.5872756933115824, "grad_norm": 0.20034745335578918, "learning_rate": 1.4677814029363785e-05, "loss": 0.1112, "num_input_tokens_seen": 7792608, "step": 3600 }, { "epoch": 0.5880913539967374, "grad_norm": 0.1635698527097702, "learning_rate": 1.469820554649266e-05, "loss": 0.2973, "num_input_tokens_seen": 7803744, "step": 3605 }, { "epoch": 0.5889070146818923, "grad_norm": 0.9617367386817932, "learning_rate": 1.4718597063621534e-05, "loss": 0.1352, "num_input_tokens_seen": 7815008, "step": 3610 }, { "epoch": 0.5897226753670473, "grad_norm": 1.4414182901382446, "learning_rate": 1.473898858075041e-05, "loss": 0.3407, "num_input_tokens_seen": 7825536, "step": 3615 }, { "epoch": 0.5905383360522023, "grad_norm": 0.10963639616966248, "learning_rate": 1.4759380097879284e-05, "loss": 0.1972, "num_input_tokens_seen": 7836160, "step": 3620 }, { "epoch": 0.5913539967373572, "grad_norm": 1.5366636514663696, "learning_rate": 1.4779771615008156e-05, "loss": 0.0973, "num_input_tokens_seen": 7848864, "step": 3625 }, { "epoch": 0.5921696574225123, "grad_norm": 2.218977689743042, "learning_rate": 1.4800163132137033e-05, "loss": 0.1583, "num_input_tokens_seen": 7859488, "step": 3630 }, { "epoch": 0.5929853181076672, "grad_norm": 0.8146871328353882, "learning_rate": 1.4820554649265905e-05, "loss": 0.1804, "num_input_tokens_seen": 7869600, "step": 3635 }, { "epoch": 0.5938009787928222, "grad_norm": 1.6079431772232056, "learning_rate": 1.484094616639478e-05, "loss": 0.2618, "num_input_tokens_seen": 7879840, "step": 3640 }, { "epoch": 0.5946166394779772, "grad_norm": 0.0374605767428875, "learning_rate": 1.4861337683523654e-05, "loss": 0.102, "num_input_tokens_seen": 7890272, "step": 3645 }, { "epoch": 0.5954323001631321, "grad_norm": 3.541083574295044, "learning_rate": 1.488172920065253e-05, "loss": 0.3684, "num_input_tokens_seen": 7901120, "step": 3650 }, { "epoch": 0.5962479608482871, "grad_norm": 1.3141701221466064, "learning_rate": 1.4902120717781404e-05, "loss": 0.0699, "num_input_tokens_seen": 7912608, "step": 3655 }, { "epoch": 0.5970636215334421, "grad_norm": 1.2000682353973389, "learning_rate": 1.4922512234910277e-05, "loss": 0.2962, "num_input_tokens_seen": 7923936, "step": 3660 }, { "epoch": 0.5978792822185971, "grad_norm": 2.205348491668701, "learning_rate": 1.4942903752039153e-05, "loss": 0.2931, "num_input_tokens_seen": 7934496, "step": 3665 }, { "epoch": 0.598694942903752, "grad_norm": 1.8807446956634521, "learning_rate": 1.4963295269168027e-05, "loss": 0.2266, "num_input_tokens_seen": 7945728, "step": 3670 }, { "epoch": 0.5995106035889071, "grad_norm": 0.2155112773180008, "learning_rate": 1.4983686786296902e-05, "loss": 0.163, "num_input_tokens_seen": 7956960, "step": 3675 }, { "epoch": 0.600326264274062, "grad_norm": 1.4297126531600952, "learning_rate": 1.5004078303425776e-05, "loss": 0.1721, "num_input_tokens_seen": 7969440, "step": 3680 }, { "epoch": 0.6011419249592169, "grad_norm": 1.5855439901351929, "learning_rate": 1.5024469820554652e-05, "loss": 0.1716, "num_input_tokens_seen": 7978560, "step": 3685 }, { "epoch": 0.601957585644372, "grad_norm": 1.1050636768341064, "learning_rate": 1.5044861337683524e-05, "loss": 0.2924, "num_input_tokens_seen": 7990400, "step": 3690 }, { "epoch": 0.6027732463295269, "grad_norm": 1.8125606775283813, "learning_rate": 1.5065252854812398e-05, "loss": 0.1883, "num_input_tokens_seen": 8000128, "step": 3695 }, { "epoch": 0.6035889070146819, "grad_norm": 0.06071191653609276, "learning_rate": 1.5085644371941273e-05, "loss": 0.1324, "num_input_tokens_seen": 8010592, "step": 3700 }, { "epoch": 0.6044045676998369, "grad_norm": 1.7421069145202637, "learning_rate": 1.5106035889070147e-05, "loss": 0.2703, "num_input_tokens_seen": 8022240, "step": 3705 }, { "epoch": 0.6052202283849919, "grad_norm": 0.32061806321144104, "learning_rate": 1.5126427406199022e-05, "loss": 0.2663, "num_input_tokens_seen": 8032096, "step": 3710 }, { "epoch": 0.6060358890701468, "grad_norm": 3.248082160949707, "learning_rate": 1.5146818923327896e-05, "loss": 0.3208, "num_input_tokens_seen": 8042400, "step": 3715 }, { "epoch": 0.6068515497553018, "grad_norm": 0.4243759214878082, "learning_rate": 1.516721044045677e-05, "loss": 0.0799, "num_input_tokens_seen": 8052672, "step": 3720 }, { "epoch": 0.6076672104404568, "grad_norm": 0.865230143070221, "learning_rate": 1.5187601957585646e-05, "loss": 0.1629, "num_input_tokens_seen": 8063712, "step": 3725 }, { "epoch": 0.6084828711256117, "grad_norm": 1.7629679441452026, "learning_rate": 1.520799347471452e-05, "loss": 0.1227, "num_input_tokens_seen": 8073600, "step": 3730 }, { "epoch": 0.6092985318107668, "grad_norm": 0.33555343747138977, "learning_rate": 1.5228384991843395e-05, "loss": 0.1897, "num_input_tokens_seen": 8085664, "step": 3735 }, { "epoch": 0.6101141924959217, "grad_norm": 1.7863975763320923, "learning_rate": 1.5248776508972267e-05, "loss": 0.2321, "num_input_tokens_seen": 8097152, "step": 3740 }, { "epoch": 0.6109298531810766, "grad_norm": 1.6569404602050781, "learning_rate": 1.5269168026101143e-05, "loss": 0.1143, "num_input_tokens_seen": 8107648, "step": 3745 }, { "epoch": 0.6117455138662317, "grad_norm": 2.12369704246521, "learning_rate": 1.5289559543230016e-05, "loss": 0.3896, "num_input_tokens_seen": 8118272, "step": 3750 }, { "epoch": 0.6125611745513866, "grad_norm": 1.1568551063537598, "learning_rate": 1.530995106035889e-05, "loss": 0.1129, "num_input_tokens_seen": 8129120, "step": 3755 }, { "epoch": 0.6133768352365416, "grad_norm": 0.13752444088459015, "learning_rate": 1.5330342577487767e-05, "loss": 0.1997, "num_input_tokens_seen": 8139200, "step": 3760 }, { "epoch": 0.6141924959216966, "grad_norm": 1.1652982234954834, "learning_rate": 1.5350734094616638e-05, "loss": 0.1545, "num_input_tokens_seen": 8150272, "step": 3765 }, { "epoch": 0.6150081566068516, "grad_norm": 3.1869072914123535, "learning_rate": 1.5371125611745515e-05, "loss": 0.3661, "num_input_tokens_seen": 8162496, "step": 3770 }, { "epoch": 0.6158238172920065, "grad_norm": 0.4093005061149597, "learning_rate": 1.539151712887439e-05, "loss": 0.1918, "num_input_tokens_seen": 8173664, "step": 3775 }, { "epoch": 0.6166394779771615, "grad_norm": 0.46528494358062744, "learning_rate": 1.5411908646003266e-05, "loss": 0.3646, "num_input_tokens_seen": 8184512, "step": 3780 }, { "epoch": 0.6174551386623165, "grad_norm": 2.139164924621582, "learning_rate": 1.5432300163132137e-05, "loss": 0.2594, "num_input_tokens_seen": 8194432, "step": 3785 }, { "epoch": 0.6182707993474714, "grad_norm": 2.2626473903656006, "learning_rate": 1.545269168026101e-05, "loss": 0.1057, "num_input_tokens_seen": 8205152, "step": 3790 }, { "epoch": 0.6190864600326265, "grad_norm": 0.24821190536022186, "learning_rate": 1.5473083197389888e-05, "loss": 0.2131, "num_input_tokens_seen": 8215360, "step": 3795 }, { "epoch": 0.6199021207177814, "grad_norm": 0.21795476973056793, "learning_rate": 1.549347471451876e-05, "loss": 0.1355, "num_input_tokens_seen": 8225216, "step": 3800 }, { "epoch": 0.6207177814029364, "grad_norm": 2.6953072547912598, "learning_rate": 1.5513866231647635e-05, "loss": 0.239, "num_input_tokens_seen": 8235488, "step": 3805 }, { "epoch": 0.6215334420880914, "grad_norm": 0.847017765045166, "learning_rate": 1.553425774877651e-05, "loss": 0.1525, "num_input_tokens_seen": 8246208, "step": 3810 }, { "epoch": 0.6223491027732463, "grad_norm": 1.1698718070983887, "learning_rate": 1.5554649265905383e-05, "loss": 0.1828, "num_input_tokens_seen": 8258048, "step": 3815 }, { "epoch": 0.6231647634584013, "grad_norm": 2.2529196739196777, "learning_rate": 1.557504078303426e-05, "loss": 0.1377, "num_input_tokens_seen": 8269472, "step": 3820 }, { "epoch": 0.6239804241435563, "grad_norm": 3.3745269775390625, "learning_rate": 1.559543230016313e-05, "loss": 0.3828, "num_input_tokens_seen": 8280512, "step": 3825 }, { "epoch": 0.6247960848287113, "grad_norm": 0.8120728731155396, "learning_rate": 1.5615823817292008e-05, "loss": 0.225, "num_input_tokens_seen": 8291360, "step": 3830 }, { "epoch": 0.6256117455138662, "grad_norm": 0.1976441591978073, "learning_rate": 1.563621533442088e-05, "loss": 0.0446, "num_input_tokens_seen": 8301440, "step": 3835 }, { "epoch": 0.6264274061990212, "grad_norm": 2.212717056274414, "learning_rate": 1.565660685154976e-05, "loss": 0.3302, "num_input_tokens_seen": 8313472, "step": 3840 }, { "epoch": 0.6272430668841762, "grad_norm": 0.0879954919219017, "learning_rate": 1.567699836867863e-05, "loss": 0.0285, "num_input_tokens_seen": 8323488, "step": 3845 }, { "epoch": 0.6280587275693311, "grad_norm": 0.5916217565536499, "learning_rate": 1.5697389885807503e-05, "loss": 0.0552, "num_input_tokens_seen": 8336096, "step": 3850 }, { "epoch": 0.6288743882544862, "grad_norm": 2.017768144607544, "learning_rate": 1.571778140293638e-05, "loss": 0.2791, "num_input_tokens_seen": 8344736, "step": 3855 }, { "epoch": 0.6296900489396411, "grad_norm": 0.19952479004859924, "learning_rate": 1.5738172920065254e-05, "loss": 0.1276, "num_input_tokens_seen": 8356416, "step": 3860 }, { "epoch": 0.6305057096247961, "grad_norm": 0.306976318359375, "learning_rate": 1.5758564437194128e-05, "loss": 0.2479, "num_input_tokens_seen": 8367712, "step": 3865 }, { "epoch": 0.6313213703099511, "grad_norm": 0.13040493428707123, "learning_rate": 1.5778955954323002e-05, "loss": 0.2297, "num_input_tokens_seen": 8378752, "step": 3870 }, { "epoch": 0.632137030995106, "grad_norm": 1.9690895080566406, "learning_rate": 1.579934747145188e-05, "loss": 0.251, "num_input_tokens_seen": 8389472, "step": 3875 }, { "epoch": 0.632952691680261, "grad_norm": 0.12379740178585052, "learning_rate": 1.5819738988580753e-05, "loss": 0.2196, "num_input_tokens_seen": 8400064, "step": 3880 }, { "epoch": 0.633768352365416, "grad_norm": 0.5652022361755371, "learning_rate": 1.5840130505709623e-05, "loss": 0.1061, "num_input_tokens_seen": 8410176, "step": 3885 }, { "epoch": 0.634584013050571, "grad_norm": 0.977329671382904, "learning_rate": 1.58605220228385e-05, "loss": 0.3241, "num_input_tokens_seen": 8420256, "step": 3890 }, { "epoch": 0.6353996737357259, "grad_norm": 0.14305181801319122, "learning_rate": 1.5880913539967374e-05, "loss": 0.1872, "num_input_tokens_seen": 8431808, "step": 3895 }, { "epoch": 0.636215334420881, "grad_norm": 0.6989423036575317, "learning_rate": 1.5901305057096248e-05, "loss": 0.077, "num_input_tokens_seen": 8444064, "step": 3900 }, { "epoch": 0.6370309951060359, "grad_norm": 1.8678274154663086, "learning_rate": 1.5921696574225122e-05, "loss": 0.3273, "num_input_tokens_seen": 8454944, "step": 3905 }, { "epoch": 0.6378466557911908, "grad_norm": 0.4749884307384491, "learning_rate": 1.5942088091354e-05, "loss": 0.0679, "num_input_tokens_seen": 8466208, "step": 3910 }, { "epoch": 0.6386623164763459, "grad_norm": 0.36379274725914, "learning_rate": 1.5962479608482873e-05, "loss": 0.1407, "num_input_tokens_seen": 8476480, "step": 3915 }, { "epoch": 0.6394779771615008, "grad_norm": 2.0704195499420166, "learning_rate": 1.5982871125611747e-05, "loss": 0.222, "num_input_tokens_seen": 8486688, "step": 3920 }, { "epoch": 0.6402936378466558, "grad_norm": 0.1751447319984436, "learning_rate": 1.600326264274062e-05, "loss": 0.2238, "num_input_tokens_seen": 8499200, "step": 3925 }, { "epoch": 0.6411092985318108, "grad_norm": 1.4482629299163818, "learning_rate": 1.6023654159869494e-05, "loss": 0.1703, "num_input_tokens_seen": 8509728, "step": 3930 }, { "epoch": 0.6419249592169658, "grad_norm": 2.5437512397766113, "learning_rate": 1.604404567699837e-05, "loss": 0.1859, "num_input_tokens_seen": 8520928, "step": 3935 }, { "epoch": 0.6427406199021207, "grad_norm": 0.07707276940345764, "learning_rate": 1.6064437194127242e-05, "loss": 0.1996, "num_input_tokens_seen": 8532448, "step": 3940 }, { "epoch": 0.6435562805872757, "grad_norm": 0.4778570830821991, "learning_rate": 1.6084828711256116e-05, "loss": 0.2494, "num_input_tokens_seen": 8543360, "step": 3945 }, { "epoch": 0.6443719412724307, "grad_norm": 0.47916996479034424, "learning_rate": 1.6105220228384993e-05, "loss": 0.2239, "num_input_tokens_seen": 8554176, "step": 3950 }, { "epoch": 0.6451876019575856, "grad_norm": 0.4552326798439026, "learning_rate": 1.6125611745513867e-05, "loss": 0.1445, "num_input_tokens_seen": 8565376, "step": 3955 }, { "epoch": 0.6460032626427407, "grad_norm": 0.23558256030082703, "learning_rate": 1.614600326264274e-05, "loss": 0.3292, "num_input_tokens_seen": 8576480, "step": 3960 }, { "epoch": 0.6468189233278956, "grad_norm": 0.21800675988197327, "learning_rate": 1.6166394779771615e-05, "loss": 0.1125, "num_input_tokens_seen": 8586944, "step": 3965 }, { "epoch": 0.6476345840130505, "grad_norm": 0.21565359830856323, "learning_rate": 1.6186786296900492e-05, "loss": 0.1776, "num_input_tokens_seen": 8596640, "step": 3970 }, { "epoch": 0.6484502446982056, "grad_norm": 0.3588792085647583, "learning_rate": 1.6207177814029366e-05, "loss": 0.0941, "num_input_tokens_seen": 8608256, "step": 3975 }, { "epoch": 0.6492659053833605, "grad_norm": 1.4087555408477783, "learning_rate": 1.622756933115824e-05, "loss": 0.1917, "num_input_tokens_seen": 8618752, "step": 3980 }, { "epoch": 0.6500815660685155, "grad_norm": 0.055870894342660904, "learning_rate": 1.6247960848287113e-05, "loss": 0.1677, "num_input_tokens_seen": 8629824, "step": 3985 }, { "epoch": 0.6508972267536705, "grad_norm": 0.13194610178470612, "learning_rate": 1.6268352365415987e-05, "loss": 0.2197, "num_input_tokens_seen": 8640832, "step": 3990 }, { "epoch": 0.6517128874388255, "grad_norm": 0.12435001879930496, "learning_rate": 1.6288743882544864e-05, "loss": 0.2218, "num_input_tokens_seen": 8651104, "step": 3995 }, { "epoch": 0.6525285481239804, "grad_norm": 2.0014560222625732, "learning_rate": 1.6309135399673735e-05, "loss": 0.2313, "num_input_tokens_seen": 8662112, "step": 4000 }, { "epoch": 0.6533442088091354, "grad_norm": 2.080409288406372, "learning_rate": 1.6329526916802612e-05, "loss": 0.1173, "num_input_tokens_seen": 8672960, "step": 4005 }, { "epoch": 0.6541598694942904, "grad_norm": 0.10733157396316528, "learning_rate": 1.6349918433931486e-05, "loss": 0.1281, "num_input_tokens_seen": 8683968, "step": 4010 }, { "epoch": 0.6549755301794453, "grad_norm": 2.5118257999420166, "learning_rate": 1.637030995106036e-05, "loss": 0.1686, "num_input_tokens_seen": 8695200, "step": 4015 }, { "epoch": 0.6557911908646004, "grad_norm": 1.9188458919525146, "learning_rate": 1.6390701468189233e-05, "loss": 0.2519, "num_input_tokens_seen": 8705408, "step": 4020 }, { "epoch": 0.6566068515497553, "grad_norm": 2.27872371673584, "learning_rate": 1.6411092985318107e-05, "loss": 0.1461, "num_input_tokens_seen": 8716128, "step": 4025 }, { "epoch": 0.6574225122349103, "grad_norm": 0.9074472188949585, "learning_rate": 1.6431484502446984e-05, "loss": 0.3905, "num_input_tokens_seen": 8727744, "step": 4030 }, { "epoch": 0.6582381729200653, "grad_norm": 1.00557279586792, "learning_rate": 1.6451876019575858e-05, "loss": 0.2071, "num_input_tokens_seen": 8740128, "step": 4035 }, { "epoch": 0.6590538336052202, "grad_norm": 3.216709613800049, "learning_rate": 1.6472267536704732e-05, "loss": 0.2677, "num_input_tokens_seen": 8752320, "step": 4040 }, { "epoch": 0.6598694942903752, "grad_norm": 0.5466094613075256, "learning_rate": 1.6492659053833606e-05, "loss": 0.1923, "num_input_tokens_seen": 8762784, "step": 4045 }, { "epoch": 0.6606851549755302, "grad_norm": 0.1793988049030304, "learning_rate": 1.651305057096248e-05, "loss": 0.1534, "num_input_tokens_seen": 8773312, "step": 4050 }, { "epoch": 0.6615008156606852, "grad_norm": 0.5032705664634705, "learning_rate": 1.6533442088091357e-05, "loss": 0.2183, "num_input_tokens_seen": 8783744, "step": 4055 }, { "epoch": 0.6623164763458401, "grad_norm": 0.30762073397636414, "learning_rate": 1.6553833605220227e-05, "loss": 0.0714, "num_input_tokens_seen": 8795104, "step": 4060 }, { "epoch": 0.6631321370309952, "grad_norm": 0.6898226141929626, "learning_rate": 1.6574225122349105e-05, "loss": 0.131, "num_input_tokens_seen": 8805760, "step": 4065 }, { "epoch": 0.6639477977161501, "grad_norm": 2.498847246170044, "learning_rate": 1.659461663947798e-05, "loss": 0.1016, "num_input_tokens_seen": 8817536, "step": 4070 }, { "epoch": 0.664763458401305, "grad_norm": 1.0033594369888306, "learning_rate": 1.6615008156606852e-05, "loss": 0.0403, "num_input_tokens_seen": 8829184, "step": 4075 }, { "epoch": 0.6655791190864601, "grad_norm": 0.07174458354711533, "learning_rate": 1.6635399673735726e-05, "loss": 0.1116, "num_input_tokens_seen": 8839328, "step": 4080 }, { "epoch": 0.666394779771615, "grad_norm": 1.3977618217468262, "learning_rate": 1.66557911908646e-05, "loss": 0.1802, "num_input_tokens_seen": 8850336, "step": 4085 }, { "epoch": 0.66721044045677, "grad_norm": 0.8105493783950806, "learning_rate": 1.6676182707993477e-05, "loss": 0.2197, "num_input_tokens_seen": 8859392, "step": 4090 }, { "epoch": 0.668026101141925, "grad_norm": 0.11732974648475647, "learning_rate": 1.669657422512235e-05, "loss": 0.1303, "num_input_tokens_seen": 8871008, "step": 4095 }, { "epoch": 0.6688417618270799, "grad_norm": 0.10266581922769547, "learning_rate": 1.6716965742251225e-05, "loss": 0.2302, "num_input_tokens_seen": 8881760, "step": 4100 }, { "epoch": 0.6696574225122349, "grad_norm": 1.9716567993164062, "learning_rate": 1.67373572593801e-05, "loss": 0.39, "num_input_tokens_seen": 8892544, "step": 4105 }, { "epoch": 0.6704730831973899, "grad_norm": 2.074406385421753, "learning_rate": 1.6757748776508972e-05, "loss": 0.3103, "num_input_tokens_seen": 8903936, "step": 4110 }, { "epoch": 0.6712887438825449, "grad_norm": 0.20378755033016205, "learning_rate": 1.677814029363785e-05, "loss": 0.1904, "num_input_tokens_seen": 8913952, "step": 4115 }, { "epoch": 0.6721044045676998, "grad_norm": 2.4414570331573486, "learning_rate": 1.679853181076672e-05, "loss": 0.264, "num_input_tokens_seen": 8925120, "step": 4120 }, { "epoch": 0.6729200652528549, "grad_norm": 0.09439627081155777, "learning_rate": 1.6818923327895597e-05, "loss": 0.125, "num_input_tokens_seen": 8935872, "step": 4125 }, { "epoch": 0.6737357259380098, "grad_norm": 0.1845470368862152, "learning_rate": 1.683931484502447e-05, "loss": 0.1876, "num_input_tokens_seen": 8946624, "step": 4130 }, { "epoch": 0.6745513866231647, "grad_norm": 0.28252995014190674, "learning_rate": 1.6859706362153345e-05, "loss": 0.1155, "num_input_tokens_seen": 8957568, "step": 4135 }, { "epoch": 0.6753670473083198, "grad_norm": 1.671524167060852, "learning_rate": 1.688009787928222e-05, "loss": 0.2617, "num_input_tokens_seen": 8968160, "step": 4140 }, { "epoch": 0.6761827079934747, "grad_norm": 0.9018524289131165, "learning_rate": 1.6900489396411092e-05, "loss": 0.163, "num_input_tokens_seen": 8978560, "step": 4145 }, { "epoch": 0.6769983686786297, "grad_norm": 1.68541419506073, "learning_rate": 1.692088091353997e-05, "loss": 0.1845, "num_input_tokens_seen": 8988896, "step": 4150 }, { "epoch": 0.6778140293637847, "grad_norm": 1.5294511318206787, "learning_rate": 1.6941272430668844e-05, "loss": 0.2085, "num_input_tokens_seen": 9000928, "step": 4155 }, { "epoch": 0.6786296900489397, "grad_norm": 0.22490760684013367, "learning_rate": 1.6961663947797717e-05, "loss": 0.12, "num_input_tokens_seen": 9011712, "step": 4160 }, { "epoch": 0.6794453507340946, "grad_norm": 1.1116102933883667, "learning_rate": 1.698205546492659e-05, "loss": 0.2304, "num_input_tokens_seen": 9022624, "step": 4165 }, { "epoch": 0.6802610114192496, "grad_norm": 0.054856423288583755, "learning_rate": 1.7002446982055465e-05, "loss": 0.2156, "num_input_tokens_seen": 9033088, "step": 4170 }, { "epoch": 0.6810766721044046, "grad_norm": 3.493870735168457, "learning_rate": 1.702283849918434e-05, "loss": 0.3067, "num_input_tokens_seen": 9043680, "step": 4175 }, { "epoch": 0.6818923327895595, "grad_norm": 1.694981336593628, "learning_rate": 1.7043230016313213e-05, "loss": 0.1854, "num_input_tokens_seen": 9055424, "step": 4180 }, { "epoch": 0.6827079934747146, "grad_norm": 1.5325381755828857, "learning_rate": 1.706362153344209e-05, "loss": 0.0725, "num_input_tokens_seen": 9066816, "step": 4185 }, { "epoch": 0.6835236541598695, "grad_norm": 0.38139835000038147, "learning_rate": 1.7084013050570964e-05, "loss": 0.2072, "num_input_tokens_seen": 9077792, "step": 4190 }, { "epoch": 0.6843393148450244, "grad_norm": 0.15412074327468872, "learning_rate": 1.7104404567699837e-05, "loss": 0.2716, "num_input_tokens_seen": 9088160, "step": 4195 }, { "epoch": 0.6851549755301795, "grad_norm": 0.08613083511590958, "learning_rate": 1.712479608482871e-05, "loss": 0.0526, "num_input_tokens_seen": 9099264, "step": 4200 }, { "epoch": 0.6859706362153344, "grad_norm": 1.7397295236587524, "learning_rate": 1.7145187601957585e-05, "loss": 0.2833, "num_input_tokens_seen": 9110336, "step": 4205 }, { "epoch": 0.6867862969004894, "grad_norm": 1.5407311916351318, "learning_rate": 1.7165579119086462e-05, "loss": 0.0624, "num_input_tokens_seen": 9121248, "step": 4210 }, { "epoch": 0.6876019575856444, "grad_norm": 1.9683964252471924, "learning_rate": 1.7185970636215333e-05, "loss": 0.1335, "num_input_tokens_seen": 9132896, "step": 4215 }, { "epoch": 0.6884176182707994, "grad_norm": 0.07209761440753937, "learning_rate": 1.720636215334421e-05, "loss": 0.131, "num_input_tokens_seen": 9143968, "step": 4220 }, { "epoch": 0.6892332789559543, "grad_norm": 0.08131467550992966, "learning_rate": 1.7226753670473084e-05, "loss": 0.1222, "num_input_tokens_seen": 9154848, "step": 4225 }, { "epoch": 0.6900489396411092, "grad_norm": 0.07987446337938309, "learning_rate": 1.724714518760196e-05, "loss": 0.2354, "num_input_tokens_seen": 9167264, "step": 4230 }, { "epoch": 0.6908646003262643, "grad_norm": 0.10958471894264221, "learning_rate": 1.726753670473083e-05, "loss": 0.0888, "num_input_tokens_seen": 9178016, "step": 4235 }, { "epoch": 0.6916802610114192, "grad_norm": 0.46088549494743347, "learning_rate": 1.7287928221859705e-05, "loss": 0.1393, "num_input_tokens_seen": 9188224, "step": 4240 }, { "epoch": 0.6924959216965743, "grad_norm": 0.0237217228859663, "learning_rate": 1.7308319738988583e-05, "loss": 0.042, "num_input_tokens_seen": 9198208, "step": 4245 }, { "epoch": 0.6933115823817292, "grad_norm": 0.5368531346321106, "learning_rate": 1.7328711256117456e-05, "loss": 0.1658, "num_input_tokens_seen": 9208128, "step": 4250 }, { "epoch": 0.6941272430668842, "grad_norm": 0.0930468961596489, "learning_rate": 1.734910277324633e-05, "loss": 0.081, "num_input_tokens_seen": 9218080, "step": 4255 }, { "epoch": 0.6949429037520392, "grad_norm": 0.17482717335224152, "learning_rate": 1.7369494290375204e-05, "loss": 0.021, "num_input_tokens_seen": 9228416, "step": 4260 }, { "epoch": 0.6957585644371941, "grad_norm": 1.3625614643096924, "learning_rate": 1.738988580750408e-05, "loss": 0.279, "num_input_tokens_seen": 9239296, "step": 4265 }, { "epoch": 0.6965742251223491, "grad_norm": 0.7443859577178955, "learning_rate": 1.7410277324632955e-05, "loss": 0.254, "num_input_tokens_seen": 9249472, "step": 4270 }, { "epoch": 0.697389885807504, "grad_norm": 1.9904735088348389, "learning_rate": 1.7430668841761825e-05, "loss": 0.1752, "num_input_tokens_seen": 9259616, "step": 4275 }, { "epoch": 0.6982055464926591, "grad_norm": 0.9749678373336792, "learning_rate": 1.7451060358890703e-05, "loss": 0.1025, "num_input_tokens_seen": 9271360, "step": 4280 }, { "epoch": 0.699021207177814, "grad_norm": 0.35825878381729126, "learning_rate": 1.7471451876019576e-05, "loss": 0.1817, "num_input_tokens_seen": 9281952, "step": 4285 }, { "epoch": 0.6998368678629691, "grad_norm": 3.153740406036377, "learning_rate": 1.7491843393148454e-05, "loss": 0.2328, "num_input_tokens_seen": 9292480, "step": 4290 }, { "epoch": 0.700652528548124, "grad_norm": 0.12001180648803711, "learning_rate": 1.7512234910277324e-05, "loss": 0.3547, "num_input_tokens_seen": 9301696, "step": 4295 }, { "epoch": 0.7014681892332789, "grad_norm": 1.1591695547103882, "learning_rate": 1.7532626427406198e-05, "loss": 0.0847, "num_input_tokens_seen": 9311360, "step": 4300 }, { "epoch": 0.702283849918434, "grad_norm": 1.4829282760620117, "learning_rate": 1.7553017944535075e-05, "loss": 0.3086, "num_input_tokens_seen": 9321504, "step": 4305 }, { "epoch": 0.7030995106035889, "grad_norm": 1.1955201625823975, "learning_rate": 1.757340946166395e-05, "loss": 0.1991, "num_input_tokens_seen": 9333024, "step": 4310 }, { "epoch": 0.7039151712887439, "grad_norm": 1.1234995126724243, "learning_rate": 1.7593800978792823e-05, "loss": 0.1336, "num_input_tokens_seen": 9343424, "step": 4315 }, { "epoch": 0.7047308319738989, "grad_norm": 0.028532184660434723, "learning_rate": 1.7614192495921697e-05, "loss": 0.0734, "num_input_tokens_seen": 9354016, "step": 4320 }, { "epoch": 0.7055464926590538, "grad_norm": 0.10510175675153732, "learning_rate": 1.7634584013050574e-05, "loss": 0.2438, "num_input_tokens_seen": 9365600, "step": 4325 }, { "epoch": 0.7063621533442088, "grad_norm": 0.17173944413661957, "learning_rate": 1.7654975530179448e-05, "loss": 0.1938, "num_input_tokens_seen": 9377216, "step": 4330 }, { "epoch": 0.7071778140293637, "grad_norm": 0.8825304508209229, "learning_rate": 1.7675367047308318e-05, "loss": 0.136, "num_input_tokens_seen": 9388000, "step": 4335 }, { "epoch": 0.7079934747145188, "grad_norm": 1.4820185899734497, "learning_rate": 1.7695758564437195e-05, "loss": 0.2438, "num_input_tokens_seen": 9399616, "step": 4340 }, { "epoch": 0.7088091353996737, "grad_norm": 1.2415467500686646, "learning_rate": 1.771615008156607e-05, "loss": 0.1581, "num_input_tokens_seen": 9411872, "step": 4345 }, { "epoch": 0.7096247960848288, "grad_norm": 1.268717646598816, "learning_rate": 1.7736541598694943e-05, "loss": 0.2257, "num_input_tokens_seen": 9422912, "step": 4350 }, { "epoch": 0.7104404567699837, "grad_norm": 0.7482898831367493, "learning_rate": 1.7756933115823817e-05, "loss": 0.0386, "num_input_tokens_seen": 9434176, "step": 4355 }, { "epoch": 0.7112561174551386, "grad_norm": 0.6515147089958191, "learning_rate": 1.7777324632952694e-05, "loss": 0.175, "num_input_tokens_seen": 9443936, "step": 4360 }, { "epoch": 0.7120717781402937, "grad_norm": 0.1514621078968048, "learning_rate": 1.7797716150081568e-05, "loss": 0.0986, "num_input_tokens_seen": 9453984, "step": 4365 }, { "epoch": 0.7128874388254486, "grad_norm": 0.2661975026130676, "learning_rate": 1.781810766721044e-05, "loss": 0.1973, "num_input_tokens_seen": 9465408, "step": 4370 }, { "epoch": 0.7137030995106036, "grad_norm": 0.9722493886947632, "learning_rate": 1.7838499184339315e-05, "loss": 0.1956, "num_input_tokens_seen": 9475392, "step": 4375 }, { "epoch": 0.7145187601957586, "grad_norm": 1.6432887315750122, "learning_rate": 1.785889070146819e-05, "loss": 0.2474, "num_input_tokens_seen": 9485280, "step": 4380 }, { "epoch": 0.7153344208809136, "grad_norm": 1.083878755569458, "learning_rate": 1.7879282218597066e-05, "loss": 0.0946, "num_input_tokens_seen": 9496224, "step": 4385 }, { "epoch": 0.7161500815660685, "grad_norm": 1.0341323614120483, "learning_rate": 1.789967373572594e-05, "loss": 0.1322, "num_input_tokens_seen": 9507456, "step": 4390 }, { "epoch": 0.7169657422512234, "grad_norm": 2.006626605987549, "learning_rate": 1.7920065252854814e-05, "loss": 0.3969, "num_input_tokens_seen": 9517952, "step": 4395 }, { "epoch": 0.7177814029363785, "grad_norm": 2.1099724769592285, "learning_rate": 1.7940456769983688e-05, "loss": 0.1661, "num_input_tokens_seen": 9528672, "step": 4400 }, { "epoch": 0.7185970636215334, "grad_norm": 1.0293619632720947, "learning_rate": 1.7960848287112562e-05, "loss": 0.0686, "num_input_tokens_seen": 9539552, "step": 4405 }, { "epoch": 0.7194127243066885, "grad_norm": 1.6753711700439453, "learning_rate": 1.7981239804241436e-05, "loss": 0.2639, "num_input_tokens_seen": 9550112, "step": 4410 }, { "epoch": 0.7202283849918434, "grad_norm": 0.4701779782772064, "learning_rate": 1.800163132137031e-05, "loss": 0.2097, "num_input_tokens_seen": 9560544, "step": 4415 }, { "epoch": 0.7210440456769984, "grad_norm": 3.6500895023345947, "learning_rate": 1.8022022838499187e-05, "loss": 0.2592, "num_input_tokens_seen": 9571232, "step": 4420 }, { "epoch": 0.7218597063621534, "grad_norm": 0.21135959029197693, "learning_rate": 1.804241435562806e-05, "loss": 0.0671, "num_input_tokens_seen": 9581920, "step": 4425 }, { "epoch": 0.7226753670473083, "grad_norm": 0.8600396513938904, "learning_rate": 1.8062805872756934e-05, "loss": 0.0675, "num_input_tokens_seen": 9592448, "step": 4430 }, { "epoch": 0.7234910277324633, "grad_norm": 0.3887813985347748, "learning_rate": 1.8083197389885808e-05, "loss": 0.2023, "num_input_tokens_seen": 9603360, "step": 4435 }, { "epoch": 0.7243066884176182, "grad_norm": 0.7580736875534058, "learning_rate": 1.8103588907014682e-05, "loss": 0.1619, "num_input_tokens_seen": 9614336, "step": 4440 }, { "epoch": 0.7251223491027733, "grad_norm": 0.19309353828430176, "learning_rate": 1.812398042414356e-05, "loss": 0.1385, "num_input_tokens_seen": 9624512, "step": 4445 }, { "epoch": 0.7259380097879282, "grad_norm": 0.18695828318595886, "learning_rate": 1.814437194127243e-05, "loss": 0.2617, "num_input_tokens_seen": 9635168, "step": 4450 }, { "epoch": 0.7267536704730831, "grad_norm": 1.7266050577163696, "learning_rate": 1.8164763458401307e-05, "loss": 0.2748, "num_input_tokens_seen": 9645280, "step": 4455 }, { "epoch": 0.7275693311582382, "grad_norm": 0.6974174976348877, "learning_rate": 1.818515497553018e-05, "loss": 0.1309, "num_input_tokens_seen": 9656032, "step": 4460 }, { "epoch": 0.7283849918433931, "grad_norm": 0.1553499847650528, "learning_rate": 1.8205546492659054e-05, "loss": 0.2864, "num_input_tokens_seen": 9667968, "step": 4465 }, { "epoch": 0.7292006525285482, "grad_norm": 1.4776428937911987, "learning_rate": 1.8225938009787928e-05, "loss": 0.0962, "num_input_tokens_seen": 9680032, "step": 4470 }, { "epoch": 0.7300163132137031, "grad_norm": 1.5076990127563477, "learning_rate": 1.8246329526916802e-05, "loss": 0.2624, "num_input_tokens_seen": 9691424, "step": 4475 }, { "epoch": 0.7308319738988581, "grad_norm": 1.577380895614624, "learning_rate": 1.826672104404568e-05, "loss": 0.2629, "num_input_tokens_seen": 9702400, "step": 4480 }, { "epoch": 0.731647634584013, "grad_norm": 0.17113642394542694, "learning_rate": 1.8287112561174553e-05, "loss": 0.0727, "num_input_tokens_seen": 9712928, "step": 4485 }, { "epoch": 0.732463295269168, "grad_norm": 2.573993444442749, "learning_rate": 1.8307504078303427e-05, "loss": 0.5759, "num_input_tokens_seen": 9723360, "step": 4490 }, { "epoch": 0.733278955954323, "grad_norm": 1.3356900215148926, "learning_rate": 1.83278955954323e-05, "loss": 0.121, "num_input_tokens_seen": 9734496, "step": 4495 }, { "epoch": 0.734094616639478, "grad_norm": 1.5720560550689697, "learning_rate": 1.8348287112561175e-05, "loss": 0.2252, "num_input_tokens_seen": 9744288, "step": 4500 }, { "epoch": 0.734910277324633, "grad_norm": 2.323415517807007, "learning_rate": 1.8368678629690052e-05, "loss": 0.308, "num_input_tokens_seen": 9755072, "step": 4505 }, { "epoch": 0.7357259380097879, "grad_norm": 0.32190826535224915, "learning_rate": 1.8389070146818922e-05, "loss": 0.1591, "num_input_tokens_seen": 9764192, "step": 4510 }, { "epoch": 0.736541598694943, "grad_norm": 2.937070846557617, "learning_rate": 1.84094616639478e-05, "loss": 0.3258, "num_input_tokens_seen": 9775392, "step": 4515 }, { "epoch": 0.7373572593800979, "grad_norm": 0.7282078862190247, "learning_rate": 1.8429853181076673e-05, "loss": 0.1217, "num_input_tokens_seen": 9784864, "step": 4520 }, { "epoch": 0.7381729200652528, "grad_norm": 0.7463207840919495, "learning_rate": 1.8450244698205547e-05, "loss": 0.0615, "num_input_tokens_seen": 9796512, "step": 4525 }, { "epoch": 0.7389885807504079, "grad_norm": 1.4708690643310547, "learning_rate": 1.847063621533442e-05, "loss": 0.122, "num_input_tokens_seen": 9808416, "step": 4530 }, { "epoch": 0.7398042414355628, "grad_norm": 2.0893545150756836, "learning_rate": 1.8491027732463295e-05, "loss": 0.1742, "num_input_tokens_seen": 9819552, "step": 4535 }, { "epoch": 0.7406199021207178, "grad_norm": 0.35801175236701965, "learning_rate": 1.8511419249592172e-05, "loss": 0.1021, "num_input_tokens_seen": 9831168, "step": 4540 }, { "epoch": 0.7414355628058727, "grad_norm": 1.438406229019165, "learning_rate": 1.8531810766721046e-05, "loss": 0.0776, "num_input_tokens_seen": 9841280, "step": 4545 }, { "epoch": 0.7422512234910277, "grad_norm": 1.2709296941757202, "learning_rate": 1.855220228384992e-05, "loss": 0.2205, "num_input_tokens_seen": 9852224, "step": 4550 }, { "epoch": 0.7430668841761827, "grad_norm": 0.2960064113140106, "learning_rate": 1.8572593800978793e-05, "loss": 0.3931, "num_input_tokens_seen": 9862656, "step": 4555 }, { "epoch": 0.7438825448613376, "grad_norm": 0.22023937106132507, "learning_rate": 1.8592985318107667e-05, "loss": 0.0372, "num_input_tokens_seen": 9874304, "step": 4560 }, { "epoch": 0.7446982055464927, "grad_norm": 0.9494736194610596, "learning_rate": 1.8613376835236544e-05, "loss": 0.1286, "num_input_tokens_seen": 9884128, "step": 4565 }, { "epoch": 0.7455138662316476, "grad_norm": 2.1696295738220215, "learning_rate": 1.8633768352365415e-05, "loss": 0.1248, "num_input_tokens_seen": 9894208, "step": 4570 }, { "epoch": 0.7463295269168027, "grad_norm": 2.8061153888702393, "learning_rate": 1.8654159869494292e-05, "loss": 0.2435, "num_input_tokens_seen": 9904864, "step": 4575 }, { "epoch": 0.7471451876019576, "grad_norm": 1.4444061517715454, "learning_rate": 1.8674551386623166e-05, "loss": 0.1838, "num_input_tokens_seen": 9915904, "step": 4580 }, { "epoch": 0.7479608482871125, "grad_norm": 0.10839609801769257, "learning_rate": 1.869494290375204e-05, "loss": 0.2423, "num_input_tokens_seen": 9926048, "step": 4585 }, { "epoch": 0.7487765089722676, "grad_norm": 0.37083321809768677, "learning_rate": 1.8715334420880914e-05, "loss": 0.0807, "num_input_tokens_seen": 9936736, "step": 4590 }, { "epoch": 0.7495921696574225, "grad_norm": 2.1850292682647705, "learning_rate": 1.8735725938009787e-05, "loss": 0.1935, "num_input_tokens_seen": 9949408, "step": 4595 }, { "epoch": 0.7504078303425775, "grad_norm": 0.5800833106040955, "learning_rate": 1.8756117455138665e-05, "loss": 0.0972, "num_input_tokens_seen": 9960160, "step": 4600 }, { "epoch": 0.7512234910277324, "grad_norm": 1.6647318601608276, "learning_rate": 1.877650897226754e-05, "loss": 0.227, "num_input_tokens_seen": 9970624, "step": 4605 }, { "epoch": 0.7520391517128875, "grad_norm": 1.0460528135299683, "learning_rate": 1.8796900489396412e-05, "loss": 0.1586, "num_input_tokens_seen": 9982080, "step": 4610 }, { "epoch": 0.7528548123980424, "grad_norm": 0.32497119903564453, "learning_rate": 1.8817292006525286e-05, "loss": 0.0266, "num_input_tokens_seen": 9992480, "step": 4615 }, { "epoch": 0.7536704730831973, "grad_norm": 0.4186214506626129, "learning_rate": 1.8837683523654163e-05, "loss": 0.217, "num_input_tokens_seen": 10002496, "step": 4620 }, { "epoch": 0.7544861337683524, "grad_norm": 1.2214596271514893, "learning_rate": 1.8858075040783034e-05, "loss": 0.3279, "num_input_tokens_seen": 10013408, "step": 4625 }, { "epoch": 0.7553017944535073, "grad_norm": 0.9684097170829773, "learning_rate": 1.8878466557911908e-05, "loss": 0.1808, "num_input_tokens_seen": 10022880, "step": 4630 }, { "epoch": 0.7561174551386624, "grad_norm": 1.6602609157562256, "learning_rate": 1.8898858075040785e-05, "loss": 0.108, "num_input_tokens_seen": 10034208, "step": 4635 }, { "epoch": 0.7569331158238173, "grad_norm": 1.4203039407730103, "learning_rate": 1.891924959216966e-05, "loss": 0.2018, "num_input_tokens_seen": 10044832, "step": 4640 }, { "epoch": 0.7577487765089723, "grad_norm": 0.3482493460178375, "learning_rate": 1.8939641109298532e-05, "loss": 0.1811, "num_input_tokens_seen": 10055712, "step": 4645 }, { "epoch": 0.7585644371941273, "grad_norm": 1.8664467334747314, "learning_rate": 1.8960032626427406e-05, "loss": 0.1927, "num_input_tokens_seen": 10066976, "step": 4650 }, { "epoch": 0.7593800978792822, "grad_norm": 1.3321412801742554, "learning_rate": 1.898042414355628e-05, "loss": 0.244, "num_input_tokens_seen": 10078304, "step": 4655 }, { "epoch": 0.7601957585644372, "grad_norm": 3.3312623500823975, "learning_rate": 1.9000815660685157e-05, "loss": 0.369, "num_input_tokens_seen": 10089056, "step": 4660 }, { "epoch": 0.7610114192495921, "grad_norm": 0.1644737422466278, "learning_rate": 1.902120717781403e-05, "loss": 0.2619, "num_input_tokens_seen": 10099904, "step": 4665 }, { "epoch": 0.7618270799347472, "grad_norm": 1.4639757871627808, "learning_rate": 1.9041598694942905e-05, "loss": 0.0881, "num_input_tokens_seen": 10111552, "step": 4670 }, { "epoch": 0.7626427406199021, "grad_norm": 2.3878066539764404, "learning_rate": 1.906199021207178e-05, "loss": 0.2715, "num_input_tokens_seen": 10121568, "step": 4675 }, { "epoch": 0.763458401305057, "grad_norm": 0.6350622773170471, "learning_rate": 1.9082381729200656e-05, "loss": 0.2534, "num_input_tokens_seen": 10132000, "step": 4680 }, { "epoch": 0.7642740619902121, "grad_norm": 1.7538223266601562, "learning_rate": 1.9102773246329526e-05, "loss": 0.1696, "num_input_tokens_seen": 10141952, "step": 4685 }, { "epoch": 0.765089722675367, "grad_norm": 0.3280741274356842, "learning_rate": 1.91231647634584e-05, "loss": 0.1331, "num_input_tokens_seen": 10151776, "step": 4690 }, { "epoch": 0.765905383360522, "grad_norm": 1.0147438049316406, "learning_rate": 1.9143556280587277e-05, "loss": 0.1812, "num_input_tokens_seen": 10162688, "step": 4695 }, { "epoch": 0.766721044045677, "grad_norm": 0.11201968789100647, "learning_rate": 1.916394779771615e-05, "loss": 0.0239, "num_input_tokens_seen": 10174624, "step": 4700 }, { "epoch": 0.767536704730832, "grad_norm": 0.49887627363204956, "learning_rate": 1.9184339314845025e-05, "loss": 0.0914, "num_input_tokens_seen": 10185248, "step": 4705 }, { "epoch": 0.768352365415987, "grad_norm": 0.05005313456058502, "learning_rate": 1.92047308319739e-05, "loss": 0.0583, "num_input_tokens_seen": 10196768, "step": 4710 }, { "epoch": 0.7691680261011419, "grad_norm": 0.4460125267505646, "learning_rate": 1.9225122349102776e-05, "loss": 0.0826, "num_input_tokens_seen": 10206816, "step": 4715 }, { "epoch": 0.7699836867862969, "grad_norm": 0.3714996874332428, "learning_rate": 1.924551386623165e-05, "loss": 0.1108, "num_input_tokens_seen": 10217600, "step": 4720 }, { "epoch": 0.7707993474714518, "grad_norm": 0.9360318183898926, "learning_rate": 1.926590538336052e-05, "loss": 0.1994, "num_input_tokens_seen": 10228544, "step": 4725 }, { "epoch": 0.7716150081566069, "grad_norm": 0.13271303474903107, "learning_rate": 1.9286296900489398e-05, "loss": 0.1391, "num_input_tokens_seen": 10239168, "step": 4730 }, { "epoch": 0.7724306688417618, "grad_norm": 0.6708807349205017, "learning_rate": 1.930668841761827e-05, "loss": 0.1631, "num_input_tokens_seen": 10250784, "step": 4735 }, { "epoch": 0.7732463295269169, "grad_norm": 1.0529173612594604, "learning_rate": 1.932707993474715e-05, "loss": 0.0676, "num_input_tokens_seen": 10260480, "step": 4740 }, { "epoch": 0.7740619902120718, "grad_norm": 2.0507497787475586, "learning_rate": 1.934747145187602e-05, "loss": 0.2181, "num_input_tokens_seen": 10272064, "step": 4745 }, { "epoch": 0.7748776508972267, "grad_norm": 0.08762767165899277, "learning_rate": 1.9367862969004896e-05, "loss": 0.0817, "num_input_tokens_seen": 10281472, "step": 4750 }, { "epoch": 0.7756933115823818, "grad_norm": 1.0556964874267578, "learning_rate": 1.938825448613377e-05, "loss": 0.1089, "num_input_tokens_seen": 10293536, "step": 4755 }, { "epoch": 0.7765089722675367, "grad_norm": 0.22250208258628845, "learning_rate": 1.9408646003262644e-05, "loss": 0.1255, "num_input_tokens_seen": 10303968, "step": 4760 }, { "epoch": 0.7773246329526917, "grad_norm": 0.19301114976406097, "learning_rate": 1.9429037520391518e-05, "loss": 0.085, "num_input_tokens_seen": 10316064, "step": 4765 }, { "epoch": 0.7781402936378466, "grad_norm": 0.3299367129802704, "learning_rate": 1.944942903752039e-05, "loss": 0.1315, "num_input_tokens_seen": 10327328, "step": 4770 }, { "epoch": 0.7789559543230016, "grad_norm": 1.667073369026184, "learning_rate": 1.946982055464927e-05, "loss": 0.273, "num_input_tokens_seen": 10337088, "step": 4775 }, { "epoch": 0.7797716150081566, "grad_norm": 1.6375466585159302, "learning_rate": 1.9490212071778143e-05, "loss": 0.3135, "num_input_tokens_seen": 10347488, "step": 4780 }, { "epoch": 0.7805872756933115, "grad_norm": 1.7796368598937988, "learning_rate": 1.9510603588907013e-05, "loss": 0.3207, "num_input_tokens_seen": 10358656, "step": 4785 }, { "epoch": 0.7814029363784666, "grad_norm": 0.9365811347961426, "learning_rate": 1.953099510603589e-05, "loss": 0.172, "num_input_tokens_seen": 10370816, "step": 4790 }, { "epoch": 0.7822185970636215, "grad_norm": 0.4352564811706543, "learning_rate": 1.9551386623164764e-05, "loss": 0.0673, "num_input_tokens_seen": 10381856, "step": 4795 }, { "epoch": 0.7830342577487766, "grad_norm": 1.9763329029083252, "learning_rate": 1.957177814029364e-05, "loss": 0.2123, "num_input_tokens_seen": 10392320, "step": 4800 }, { "epoch": 0.7838499184339315, "grad_norm": 1.2199801206588745, "learning_rate": 1.959216965742251e-05, "loss": 0.284, "num_input_tokens_seen": 10401568, "step": 4805 }, { "epoch": 0.7846655791190864, "grad_norm": 0.6362788677215576, "learning_rate": 1.961256117455139e-05, "loss": 0.1634, "num_input_tokens_seen": 10413248, "step": 4810 }, { "epoch": 0.7854812398042414, "grad_norm": 1.0286747217178345, "learning_rate": 1.9632952691680263e-05, "loss": 0.0796, "num_input_tokens_seen": 10423168, "step": 4815 }, { "epoch": 0.7862969004893964, "grad_norm": 0.38998374342918396, "learning_rate": 1.9653344208809136e-05, "loss": 0.1722, "num_input_tokens_seen": 10433792, "step": 4820 }, { "epoch": 0.7871125611745514, "grad_norm": 1.6692233085632324, "learning_rate": 1.967373572593801e-05, "loss": 0.1737, "num_input_tokens_seen": 10445120, "step": 4825 }, { "epoch": 0.7879282218597063, "grad_norm": 0.7829678654670715, "learning_rate": 1.9694127243066884e-05, "loss": 0.2057, "num_input_tokens_seen": 10455584, "step": 4830 }, { "epoch": 0.7887438825448614, "grad_norm": 0.1044018492102623, "learning_rate": 1.971451876019576e-05, "loss": 0.089, "num_input_tokens_seen": 10466752, "step": 4835 }, { "epoch": 0.7895595432300163, "grad_norm": 1.4393038749694824, "learning_rate": 1.9734910277324635e-05, "loss": 0.2318, "num_input_tokens_seen": 10477376, "step": 4840 }, { "epoch": 0.7903752039151712, "grad_norm": 2.1251561641693115, "learning_rate": 1.975530179445351e-05, "loss": 0.1767, "num_input_tokens_seen": 10487392, "step": 4845 }, { "epoch": 0.7911908646003263, "grad_norm": 1.8253865242004395, "learning_rate": 1.9775693311582383e-05, "loss": 0.3594, "num_input_tokens_seen": 10498208, "step": 4850 }, { "epoch": 0.7920065252854812, "grad_norm": 0.8156093955039978, "learning_rate": 1.9796084828711257e-05, "loss": 0.1194, "num_input_tokens_seen": 10507200, "step": 4855 }, { "epoch": 0.7928221859706363, "grad_norm": 0.28762689232826233, "learning_rate": 1.981647634584013e-05, "loss": 0.1535, "num_input_tokens_seen": 10516416, "step": 4860 }, { "epoch": 0.7936378466557912, "grad_norm": 1.4188666343688965, "learning_rate": 1.9836867862969004e-05, "loss": 0.2233, "num_input_tokens_seen": 10527648, "step": 4865 }, { "epoch": 0.7944535073409462, "grad_norm": 0.5241157412528992, "learning_rate": 1.985725938009788e-05, "loss": 0.0481, "num_input_tokens_seen": 10537952, "step": 4870 }, { "epoch": 0.7952691680261011, "grad_norm": 2.4140093326568604, "learning_rate": 1.9877650897226755e-05, "loss": 0.1564, "num_input_tokens_seen": 10548320, "step": 4875 }, { "epoch": 0.7960848287112561, "grad_norm": 0.961135745048523, "learning_rate": 1.989804241435563e-05, "loss": 0.164, "num_input_tokens_seen": 10557824, "step": 4880 }, { "epoch": 0.7969004893964111, "grad_norm": 0.30531221628189087, "learning_rate": 1.9918433931484503e-05, "loss": 0.0413, "num_input_tokens_seen": 10568896, "step": 4885 }, { "epoch": 0.797716150081566, "grad_norm": 1.2962751388549805, "learning_rate": 1.9938825448613377e-05, "loss": 0.3596, "num_input_tokens_seen": 10580064, "step": 4890 }, { "epoch": 0.7985318107667211, "grad_norm": 0.07929970324039459, "learning_rate": 1.9959216965742254e-05, "loss": 0.1764, "num_input_tokens_seen": 10590976, "step": 4895 }, { "epoch": 0.799347471451876, "grad_norm": 1.6059187650680542, "learning_rate": 1.9979608482871124e-05, "loss": 0.1711, "num_input_tokens_seen": 10600448, "step": 4900 }, { "epoch": 0.8001631321370309, "grad_norm": 0.9581203460693359, "learning_rate": 2e-05, "loss": 0.2006, "num_input_tokens_seen": 10611904, "step": 4905 }, { "epoch": 0.800978792822186, "grad_norm": 1.7088154554367065, "learning_rate": 2.0020391517128875e-05, "loss": 0.1323, "num_input_tokens_seen": 10622240, "step": 4910 }, { "epoch": 0.8017944535073409, "grad_norm": 4.041092395782471, "learning_rate": 2.004078303425775e-05, "loss": 0.3223, "num_input_tokens_seen": 10632864, "step": 4915 }, { "epoch": 0.802610114192496, "grad_norm": 0.3624032735824585, "learning_rate": 2.0061174551386623e-05, "loss": 0.0925, "num_input_tokens_seen": 10642464, "step": 4920 }, { "epoch": 0.8034257748776509, "grad_norm": 0.932833194732666, "learning_rate": 2.0081566068515497e-05, "loss": 0.171, "num_input_tokens_seen": 10653024, "step": 4925 }, { "epoch": 0.8042414355628059, "grad_norm": 0.4085087776184082, "learning_rate": 2.0101957585644374e-05, "loss": 0.1613, "num_input_tokens_seen": 10664384, "step": 4930 }, { "epoch": 0.8050570962479608, "grad_norm": 2.1656086444854736, "learning_rate": 2.0122349102773248e-05, "loss": 0.3286, "num_input_tokens_seen": 10675360, "step": 4935 }, { "epoch": 0.8058727569331158, "grad_norm": 1.1133102178573608, "learning_rate": 2.0142740619902122e-05, "loss": 0.1887, "num_input_tokens_seen": 10685824, "step": 4940 }, { "epoch": 0.8066884176182708, "grad_norm": 1.9837539196014404, "learning_rate": 2.0163132137030996e-05, "loss": 0.2242, "num_input_tokens_seen": 10697536, "step": 4945 }, { "epoch": 0.8075040783034257, "grad_norm": 0.28076624870300293, "learning_rate": 2.018352365415987e-05, "loss": 0.0984, "num_input_tokens_seen": 10709888, "step": 4950 }, { "epoch": 0.8083197389885808, "grad_norm": 0.06450529396533966, "learning_rate": 2.0203915171288747e-05, "loss": 0.1, "num_input_tokens_seen": 10721216, "step": 4955 }, { "epoch": 0.8091353996737357, "grad_norm": 0.2811688780784607, "learning_rate": 2.0224306688417617e-05, "loss": 0.0954, "num_input_tokens_seen": 10729952, "step": 4960 }, { "epoch": 0.8099510603588908, "grad_norm": 1.1830421686172485, "learning_rate": 2.0244698205546494e-05, "loss": 0.1188, "num_input_tokens_seen": 10741216, "step": 4965 }, { "epoch": 0.8107667210440457, "grad_norm": 0.2527272403240204, "learning_rate": 2.0265089722675368e-05, "loss": 0.1811, "num_input_tokens_seen": 10752544, "step": 4970 }, { "epoch": 0.8115823817292006, "grad_norm": 0.18759939074516296, "learning_rate": 2.0285481239804245e-05, "loss": 0.2101, "num_input_tokens_seen": 10764960, "step": 4975 }, { "epoch": 0.8123980424143556, "grad_norm": 1.3062570095062256, "learning_rate": 2.0305872756933116e-05, "loss": 0.0936, "num_input_tokens_seen": 10775296, "step": 4980 }, { "epoch": 0.8132137030995106, "grad_norm": 0.9901962876319885, "learning_rate": 2.032626427406199e-05, "loss": 0.1795, "num_input_tokens_seen": 10786112, "step": 4985 }, { "epoch": 0.8140293637846656, "grad_norm": 0.528945803642273, "learning_rate": 2.0346655791190867e-05, "loss": 0.1646, "num_input_tokens_seen": 10797152, "step": 4990 }, { "epoch": 0.8148450244698205, "grad_norm": 2.1381211280822754, "learning_rate": 2.036704730831974e-05, "loss": 0.2062, "num_input_tokens_seen": 10808480, "step": 4995 }, { "epoch": 0.8156606851549756, "grad_norm": 1.7115509510040283, "learning_rate": 2.0387438825448614e-05, "loss": 0.0847, "num_input_tokens_seen": 10819424, "step": 5000 }, { "epoch": 0.8164763458401305, "grad_norm": 0.722379207611084, "learning_rate": 2.0407830342577488e-05, "loss": 0.1871, "num_input_tokens_seen": 10830048, "step": 5005 }, { "epoch": 0.8172920065252854, "grad_norm": 0.4597250521183014, "learning_rate": 2.0428221859706362e-05, "loss": 0.2137, "num_input_tokens_seen": 10840768, "step": 5010 }, { "epoch": 0.8181076672104405, "grad_norm": 0.15833978354930878, "learning_rate": 2.044861337683524e-05, "loss": 0.1949, "num_input_tokens_seen": 10849792, "step": 5015 }, { "epoch": 0.8189233278955954, "grad_norm": 0.1406390517950058, "learning_rate": 2.046900489396411e-05, "loss": 0.0341, "num_input_tokens_seen": 10861472, "step": 5020 }, { "epoch": 0.8197389885807504, "grad_norm": 1.9348084926605225, "learning_rate": 2.0489396411092987e-05, "loss": 0.176, "num_input_tokens_seen": 10871968, "step": 5025 }, { "epoch": 0.8205546492659054, "grad_norm": 1.2461206912994385, "learning_rate": 2.050978792822186e-05, "loss": 0.222, "num_input_tokens_seen": 10882560, "step": 5030 }, { "epoch": 0.8213703099510603, "grad_norm": 1.6845827102661133, "learning_rate": 2.0530179445350735e-05, "loss": 0.1755, "num_input_tokens_seen": 10894048, "step": 5035 }, { "epoch": 0.8221859706362153, "grad_norm": 2.3540263175964355, "learning_rate": 2.055057096247961e-05, "loss": 0.2418, "num_input_tokens_seen": 10905408, "step": 5040 }, { "epoch": 0.8230016313213703, "grad_norm": 1.704645037651062, "learning_rate": 2.0570962479608482e-05, "loss": 0.1268, "num_input_tokens_seen": 10915648, "step": 5045 }, { "epoch": 0.8238172920065253, "grad_norm": 1.3966140747070312, "learning_rate": 2.059135399673736e-05, "loss": 0.1598, "num_input_tokens_seen": 10926880, "step": 5050 }, { "epoch": 0.8246329526916802, "grad_norm": 0.9364606738090515, "learning_rate": 2.0611745513866233e-05, "loss": 0.2375, "num_input_tokens_seen": 10938432, "step": 5055 }, { "epoch": 0.8254486133768353, "grad_norm": 1.1450631618499756, "learning_rate": 2.0632137030995107e-05, "loss": 0.2654, "num_input_tokens_seen": 10948992, "step": 5060 }, { "epoch": 0.8262642740619902, "grad_norm": 3.3790876865386963, "learning_rate": 2.065252854812398e-05, "loss": 0.2703, "num_input_tokens_seen": 10958272, "step": 5065 }, { "epoch": 0.8270799347471451, "grad_norm": 0.9768489003181458, "learning_rate": 2.0672920065252858e-05, "loss": 0.125, "num_input_tokens_seen": 10968512, "step": 5070 }, { "epoch": 0.8278955954323002, "grad_norm": 0.7118699550628662, "learning_rate": 2.0693311582381732e-05, "loss": 0.1416, "num_input_tokens_seen": 10979072, "step": 5075 }, { "epoch": 0.8287112561174551, "grad_norm": 1.26682448387146, "learning_rate": 2.0713703099510602e-05, "loss": 0.1521, "num_input_tokens_seen": 10988736, "step": 5080 }, { "epoch": 0.8295269168026101, "grad_norm": 1.0427632331848145, "learning_rate": 2.073409461663948e-05, "loss": 0.1597, "num_input_tokens_seen": 11000512, "step": 5085 }, { "epoch": 0.8303425774877651, "grad_norm": 1.403535008430481, "learning_rate": 2.0754486133768353e-05, "loss": 0.2225, "num_input_tokens_seen": 11010240, "step": 5090 }, { "epoch": 0.8311582381729201, "grad_norm": 0.23283468186855316, "learning_rate": 2.0774877650897227e-05, "loss": 0.2447, "num_input_tokens_seen": 11020672, "step": 5095 }, { "epoch": 0.831973898858075, "grad_norm": 0.19987201690673828, "learning_rate": 2.07952691680261e-05, "loss": 0.11, "num_input_tokens_seen": 11031808, "step": 5100 }, { "epoch": 0.83278955954323, "grad_norm": 1.0381370782852173, "learning_rate": 2.0815660685154978e-05, "loss": 0.1357, "num_input_tokens_seen": 11042144, "step": 5105 }, { "epoch": 0.833605220228385, "grad_norm": 0.2936075031757355, "learning_rate": 2.0836052202283852e-05, "loss": 0.1225, "num_input_tokens_seen": 11053728, "step": 5110 }, { "epoch": 0.8344208809135399, "grad_norm": 0.793506383895874, "learning_rate": 2.0856443719412726e-05, "loss": 0.2429, "num_input_tokens_seen": 11064032, "step": 5115 }, { "epoch": 0.835236541598695, "grad_norm": 2.00640606880188, "learning_rate": 2.08768352365416e-05, "loss": 0.2549, "num_input_tokens_seen": 11073792, "step": 5120 }, { "epoch": 0.8360522022838499, "grad_norm": 0.19981320202350616, "learning_rate": 2.0897226753670474e-05, "loss": 0.096, "num_input_tokens_seen": 11085216, "step": 5125 }, { "epoch": 0.8368678629690048, "grad_norm": 0.4166668951511383, "learning_rate": 2.091761827079935e-05, "loss": 0.2858, "num_input_tokens_seen": 11096288, "step": 5130 }, { "epoch": 0.8376835236541599, "grad_norm": 0.6684190034866333, "learning_rate": 2.093800978792822e-05, "loss": 0.1461, "num_input_tokens_seen": 11107584, "step": 5135 }, { "epoch": 0.8384991843393148, "grad_norm": 0.7018530368804932, "learning_rate": 2.0958401305057095e-05, "loss": 0.092, "num_input_tokens_seen": 11118752, "step": 5140 }, { "epoch": 0.8393148450244698, "grad_norm": 1.134337306022644, "learning_rate": 2.0978792822185972e-05, "loss": 0.1345, "num_input_tokens_seen": 11127840, "step": 5145 }, { "epoch": 0.8401305057096248, "grad_norm": 0.059280022978782654, "learning_rate": 2.0999184339314846e-05, "loss": 0.1755, "num_input_tokens_seen": 11137952, "step": 5150 }, { "epoch": 0.8409461663947798, "grad_norm": 0.2551420331001282, "learning_rate": 2.101957585644372e-05, "loss": 0.1542, "num_input_tokens_seen": 11147008, "step": 5155 }, { "epoch": 0.8417618270799347, "grad_norm": 2.0133302211761475, "learning_rate": 2.1039967373572594e-05, "loss": 0.1156, "num_input_tokens_seen": 11158912, "step": 5160 }, { "epoch": 0.8425774877650897, "grad_norm": 0.9029374718666077, "learning_rate": 2.106035889070147e-05, "loss": 0.1068, "num_input_tokens_seen": 11169440, "step": 5165 }, { "epoch": 0.8433931484502447, "grad_norm": 0.3823668360710144, "learning_rate": 2.1080750407830345e-05, "loss": 0.2052, "num_input_tokens_seen": 11179488, "step": 5170 }, { "epoch": 0.8442088091353996, "grad_norm": 0.970924973487854, "learning_rate": 2.1101141924959215e-05, "loss": 0.2445, "num_input_tokens_seen": 11190912, "step": 5175 }, { "epoch": 0.8450244698205547, "grad_norm": 0.788021981716156, "learning_rate": 2.1121533442088092e-05, "loss": 0.1913, "num_input_tokens_seen": 11202208, "step": 5180 }, { "epoch": 0.8458401305057096, "grad_norm": 1.7398823499679565, "learning_rate": 2.1141924959216966e-05, "loss": 0.3169, "num_input_tokens_seen": 11212448, "step": 5185 }, { "epoch": 0.8466557911908646, "grad_norm": 0.7785069346427917, "learning_rate": 2.1162316476345843e-05, "loss": 0.1771, "num_input_tokens_seen": 11223648, "step": 5190 }, { "epoch": 0.8474714518760196, "grad_norm": 0.12561003863811493, "learning_rate": 2.1182707993474714e-05, "loss": 0.2278, "num_input_tokens_seen": 11234880, "step": 5195 }, { "epoch": 0.8482871125611745, "grad_norm": 0.21728695929050446, "learning_rate": 2.120309951060359e-05, "loss": 0.1136, "num_input_tokens_seen": 11245184, "step": 5200 }, { "epoch": 0.8491027732463295, "grad_norm": 1.4102331399917603, "learning_rate": 2.1223491027732465e-05, "loss": 0.1236, "num_input_tokens_seen": 11256288, "step": 5205 }, { "epoch": 0.8499184339314845, "grad_norm": 0.7906144857406616, "learning_rate": 2.124388254486134e-05, "loss": 0.1647, "num_input_tokens_seen": 11265728, "step": 5210 }, { "epoch": 0.8507340946166395, "grad_norm": 0.23342232406139374, "learning_rate": 2.1264274061990213e-05, "loss": 0.1402, "num_input_tokens_seen": 11275712, "step": 5215 }, { "epoch": 0.8515497553017944, "grad_norm": 1.0328246355056763, "learning_rate": 2.1284665579119086e-05, "loss": 0.1226, "num_input_tokens_seen": 11287776, "step": 5220 }, { "epoch": 0.8523654159869495, "grad_norm": 1.7023743391036987, "learning_rate": 2.1305057096247964e-05, "loss": 0.1594, "num_input_tokens_seen": 11298528, "step": 5225 }, { "epoch": 0.8531810766721044, "grad_norm": 0.8246278762817383, "learning_rate": 2.1325448613376837e-05, "loss": 0.2644, "num_input_tokens_seen": 11309536, "step": 5230 }, { "epoch": 0.8539967373572593, "grad_norm": 0.8582363128662109, "learning_rate": 2.1345840130505708e-05, "loss": 0.3404, "num_input_tokens_seen": 11319200, "step": 5235 }, { "epoch": 0.8548123980424144, "grad_norm": 0.21193242073059082, "learning_rate": 2.1366231647634585e-05, "loss": 0.1437, "num_input_tokens_seen": 11329120, "step": 5240 }, { "epoch": 0.8556280587275693, "grad_norm": 0.21623507142066956, "learning_rate": 2.138662316476346e-05, "loss": 0.1569, "num_input_tokens_seen": 11339360, "step": 5245 }, { "epoch": 0.8564437194127243, "grad_norm": 2.636643171310425, "learning_rate": 2.1407014681892336e-05, "loss": 0.2654, "num_input_tokens_seen": 11350848, "step": 5250 }, { "epoch": 0.8572593800978793, "grad_norm": 0.3828139007091522, "learning_rate": 2.1427406199021206e-05, "loss": 0.16, "num_input_tokens_seen": 11360704, "step": 5255 }, { "epoch": 0.8580750407830342, "grad_norm": 1.4550957679748535, "learning_rate": 2.1447797716150084e-05, "loss": 0.1137, "num_input_tokens_seen": 11370816, "step": 5260 }, { "epoch": 0.8588907014681892, "grad_norm": 1.614835500717163, "learning_rate": 2.1468189233278958e-05, "loss": 0.1757, "num_input_tokens_seen": 11382080, "step": 5265 }, { "epoch": 0.8597063621533442, "grad_norm": 0.2329808920621872, "learning_rate": 2.148858075040783e-05, "loss": 0.2735, "num_input_tokens_seen": 11392736, "step": 5270 }, { "epoch": 0.8605220228384992, "grad_norm": 0.08891406655311584, "learning_rate": 2.1508972267536705e-05, "loss": 0.1726, "num_input_tokens_seen": 11402720, "step": 5275 }, { "epoch": 0.8613376835236541, "grad_norm": 0.8000302314758301, "learning_rate": 2.152936378466558e-05, "loss": 0.0706, "num_input_tokens_seen": 11413984, "step": 5280 }, { "epoch": 0.8621533442088092, "grad_norm": 1.9905157089233398, "learning_rate": 2.1549755301794456e-05, "loss": 0.24, "num_input_tokens_seen": 11424832, "step": 5285 }, { "epoch": 0.8629690048939641, "grad_norm": 0.8786543607711792, "learning_rate": 2.157014681892333e-05, "loss": 0.106, "num_input_tokens_seen": 11436128, "step": 5290 }, { "epoch": 0.863784665579119, "grad_norm": 0.05087887495756149, "learning_rate": 2.1590538336052204e-05, "loss": 0.1991, "num_input_tokens_seen": 11448448, "step": 5295 }, { "epoch": 0.8646003262642741, "grad_norm": 0.7941170930862427, "learning_rate": 2.1610929853181078e-05, "loss": 0.1511, "num_input_tokens_seen": 11458176, "step": 5300 }, { "epoch": 0.865415986949429, "grad_norm": 1.3135331869125366, "learning_rate": 2.163132137030995e-05, "loss": 0.1642, "num_input_tokens_seen": 11468480, "step": 5305 }, { "epoch": 0.866231647634584, "grad_norm": 1.2193794250488281, "learning_rate": 2.1651712887438825e-05, "loss": 0.2098, "num_input_tokens_seen": 11478880, "step": 5310 }, { "epoch": 0.867047308319739, "grad_norm": 0.75066077709198, "learning_rate": 2.16721044045677e-05, "loss": 0.2109, "num_input_tokens_seen": 11490176, "step": 5315 }, { "epoch": 0.867862969004894, "grad_norm": 0.9323612451553345, "learning_rate": 2.1692495921696576e-05, "loss": 0.1837, "num_input_tokens_seen": 11500992, "step": 5320 }, { "epoch": 0.8686786296900489, "grad_norm": 0.45937904715538025, "learning_rate": 2.171288743882545e-05, "loss": 0.0619, "num_input_tokens_seen": 11512288, "step": 5325 }, { "epoch": 0.8694942903752039, "grad_norm": 2.6386826038360596, "learning_rate": 2.1733278955954324e-05, "loss": 0.2689, "num_input_tokens_seen": 11521664, "step": 5330 }, { "epoch": 0.8703099510603589, "grad_norm": 0.7031424045562744, "learning_rate": 2.1753670473083198e-05, "loss": 0.0521, "num_input_tokens_seen": 11532608, "step": 5335 }, { "epoch": 0.8711256117455138, "grad_norm": 2.190650701522827, "learning_rate": 2.177406199021207e-05, "loss": 0.2486, "num_input_tokens_seen": 11543072, "step": 5340 }, { "epoch": 0.8719412724306689, "grad_norm": 0.5880157351493835, "learning_rate": 2.179445350734095e-05, "loss": 0.0971, "num_input_tokens_seen": 11554144, "step": 5345 }, { "epoch": 0.8727569331158238, "grad_norm": 0.2194947898387909, "learning_rate": 2.1814845024469823e-05, "loss": 0.1113, "num_input_tokens_seen": 11565376, "step": 5350 }, { "epoch": 0.8735725938009788, "grad_norm": 1.6957155466079712, "learning_rate": 2.1835236541598697e-05, "loss": 0.3152, "num_input_tokens_seen": 11576128, "step": 5355 }, { "epoch": 0.8743882544861338, "grad_norm": 0.13261868059635162, "learning_rate": 2.185562805872757e-05, "loss": 0.1034, "num_input_tokens_seen": 11588000, "step": 5360 }, { "epoch": 0.8752039151712887, "grad_norm": 0.49597373604774475, "learning_rate": 2.1876019575856444e-05, "loss": 0.0994, "num_input_tokens_seen": 11599520, "step": 5365 }, { "epoch": 0.8760195758564437, "grad_norm": 0.9599140286445618, "learning_rate": 2.1896411092985318e-05, "loss": 0.1263, "num_input_tokens_seen": 11611168, "step": 5370 }, { "epoch": 0.8768352365415987, "grad_norm": 0.44484588503837585, "learning_rate": 2.1916802610114192e-05, "loss": 0.0807, "num_input_tokens_seen": 11621632, "step": 5375 }, { "epoch": 0.8776508972267537, "grad_norm": 2.3959906101226807, "learning_rate": 2.193719412724307e-05, "loss": 0.3897, "num_input_tokens_seen": 11632160, "step": 5380 }, { "epoch": 0.8784665579119086, "grad_norm": 0.20547328889369965, "learning_rate": 2.1957585644371943e-05, "loss": 0.2004, "num_input_tokens_seen": 11641344, "step": 5385 }, { "epoch": 0.8792822185970636, "grad_norm": 0.17874668538570404, "learning_rate": 2.1977977161500817e-05, "loss": 0.1352, "num_input_tokens_seen": 11652128, "step": 5390 }, { "epoch": 0.8800978792822186, "grad_norm": 0.15838871896266937, "learning_rate": 2.199836867862969e-05, "loss": 0.1029, "num_input_tokens_seen": 11662528, "step": 5395 }, { "epoch": 0.8809135399673735, "grad_norm": 0.4664207994937897, "learning_rate": 2.2018760195758564e-05, "loss": 0.1681, "num_input_tokens_seen": 11672544, "step": 5400 }, { "epoch": 0.8817292006525286, "grad_norm": 0.9918261170387268, "learning_rate": 2.203915171288744e-05, "loss": 0.1449, "num_input_tokens_seen": 11683552, "step": 5405 }, { "epoch": 0.8825448613376835, "grad_norm": 1.2541193962097168, "learning_rate": 2.2059543230016312e-05, "loss": 0.1559, "num_input_tokens_seen": 11693408, "step": 5410 }, { "epoch": 0.8833605220228385, "grad_norm": 0.6047132611274719, "learning_rate": 2.207993474714519e-05, "loss": 0.1528, "num_input_tokens_seen": 11704160, "step": 5415 }, { "epoch": 0.8841761827079935, "grad_norm": 0.23786024749279022, "learning_rate": 2.2100326264274063e-05, "loss": 0.1504, "num_input_tokens_seen": 11714752, "step": 5420 }, { "epoch": 0.8849918433931484, "grad_norm": 0.25055745244026184, "learning_rate": 2.212071778140294e-05, "loss": 0.08, "num_input_tokens_seen": 11725408, "step": 5425 }, { "epoch": 0.8858075040783034, "grad_norm": 1.0264168977737427, "learning_rate": 2.214110929853181e-05, "loss": 0.2061, "num_input_tokens_seen": 11736640, "step": 5430 }, { "epoch": 0.8866231647634584, "grad_norm": 1.8801883459091187, "learning_rate": 2.2161500815660684e-05, "loss": 0.1487, "num_input_tokens_seen": 11747008, "step": 5435 }, { "epoch": 0.8874388254486134, "grad_norm": 1.2049992084503174, "learning_rate": 2.218189233278956e-05, "loss": 0.1275, "num_input_tokens_seen": 11757728, "step": 5440 }, { "epoch": 0.8882544861337683, "grad_norm": 0.8459743857383728, "learning_rate": 2.2202283849918435e-05, "loss": 0.1846, "num_input_tokens_seen": 11768704, "step": 5445 }, { "epoch": 0.8890701468189234, "grad_norm": 3.5850913524627686, "learning_rate": 2.222267536704731e-05, "loss": 0.2984, "num_input_tokens_seen": 11780288, "step": 5450 }, { "epoch": 0.8898858075040783, "grad_norm": 1.8750990629196167, "learning_rate": 2.2243066884176183e-05, "loss": 0.3289, "num_input_tokens_seen": 11790080, "step": 5455 }, { "epoch": 0.8907014681892332, "grad_norm": 1.671177625656128, "learning_rate": 2.2263458401305057e-05, "loss": 0.3891, "num_input_tokens_seen": 11801952, "step": 5460 }, { "epoch": 0.8915171288743883, "grad_norm": 1.6500853300094604, "learning_rate": 2.2283849918433934e-05, "loss": 0.1253, "num_input_tokens_seen": 11811520, "step": 5465 }, { "epoch": 0.8923327895595432, "grad_norm": 0.5703606009483337, "learning_rate": 2.2304241435562805e-05, "loss": 0.2709, "num_input_tokens_seen": 11822240, "step": 5470 }, { "epoch": 0.8931484502446982, "grad_norm": 1.8431891202926636, "learning_rate": 2.2324632952691682e-05, "loss": 0.2641, "num_input_tokens_seen": 11833632, "step": 5475 }, { "epoch": 0.8939641109298532, "grad_norm": 0.057165827602148056, "learning_rate": 2.2345024469820556e-05, "loss": 0.0883, "num_input_tokens_seen": 11844640, "step": 5480 }, { "epoch": 0.8947797716150081, "grad_norm": 1.6453362703323364, "learning_rate": 2.2365415986949433e-05, "loss": 0.1979, "num_input_tokens_seen": 11855776, "step": 5485 }, { "epoch": 0.8955954323001631, "grad_norm": 0.7937455177307129, "learning_rate": 2.2385807504078303e-05, "loss": 0.3515, "num_input_tokens_seen": 11865856, "step": 5490 }, { "epoch": 0.8964110929853181, "grad_norm": 0.997843325138092, "learning_rate": 2.2406199021207177e-05, "loss": 0.1442, "num_input_tokens_seen": 11877504, "step": 5495 }, { "epoch": 0.8972267536704731, "grad_norm": 0.1603955179452896, "learning_rate": 2.2426590538336054e-05, "loss": 0.2518, "num_input_tokens_seen": 11888320, "step": 5500 }, { "epoch": 0.898042414355628, "grad_norm": 0.32236871123313904, "learning_rate": 2.2446982055464928e-05, "loss": 0.1775, "num_input_tokens_seen": 11899456, "step": 5505 }, { "epoch": 0.8988580750407831, "grad_norm": 1.3071986436843872, "learning_rate": 2.2467373572593802e-05, "loss": 0.1723, "num_input_tokens_seen": 11910240, "step": 5510 }, { "epoch": 0.899673735725938, "grad_norm": 1.4625436067581177, "learning_rate": 2.2487765089722676e-05, "loss": 0.0773, "num_input_tokens_seen": 11920512, "step": 5515 }, { "epoch": 0.9004893964110929, "grad_norm": 0.2675362825393677, "learning_rate": 2.2508156606851553e-05, "loss": 0.2462, "num_input_tokens_seen": 11930592, "step": 5520 }, { "epoch": 0.901305057096248, "grad_norm": 0.6692772507667542, "learning_rate": 2.2528548123980427e-05, "loss": 0.328, "num_input_tokens_seen": 11941088, "step": 5525 }, { "epoch": 0.9021207177814029, "grad_norm": 0.4876037538051605, "learning_rate": 2.2548939641109297e-05, "loss": 0.1576, "num_input_tokens_seen": 11952320, "step": 5530 }, { "epoch": 0.9029363784665579, "grad_norm": 0.5066727995872498, "learning_rate": 2.2569331158238174e-05, "loss": 0.1925, "num_input_tokens_seen": 11962880, "step": 5535 }, { "epoch": 0.9037520391517129, "grad_norm": 1.300635814666748, "learning_rate": 2.2589722675367048e-05, "loss": 0.1056, "num_input_tokens_seen": 11973472, "step": 5540 }, { "epoch": 0.9045676998368679, "grad_norm": 1.3065646886825562, "learning_rate": 2.2610114192495922e-05, "loss": 0.3671, "num_input_tokens_seen": 11984096, "step": 5545 }, { "epoch": 0.9053833605220228, "grad_norm": 0.9948182106018066, "learning_rate": 2.2630505709624796e-05, "loss": 0.1178, "num_input_tokens_seen": 11995488, "step": 5550 }, { "epoch": 0.9061990212071778, "grad_norm": 2.8675906658172607, "learning_rate": 2.2650897226753673e-05, "loss": 0.1664, "num_input_tokens_seen": 12007104, "step": 5555 }, { "epoch": 0.9070146818923328, "grad_norm": 0.5892194509506226, "learning_rate": 2.2671288743882547e-05, "loss": 0.2129, "num_input_tokens_seen": 12018688, "step": 5560 }, { "epoch": 0.9078303425774877, "grad_norm": 0.23121559619903564, "learning_rate": 2.269168026101142e-05, "loss": 0.0707, "num_input_tokens_seen": 12028096, "step": 5565 }, { "epoch": 0.9086460032626428, "grad_norm": 1.373759388923645, "learning_rate": 2.2712071778140295e-05, "loss": 0.1553, "num_input_tokens_seen": 12038976, "step": 5570 }, { "epoch": 0.9094616639477977, "grad_norm": 0.9833641648292542, "learning_rate": 2.273246329526917e-05, "loss": 0.2604, "num_input_tokens_seen": 12050816, "step": 5575 }, { "epoch": 0.9102773246329527, "grad_norm": 0.1182592436671257, "learning_rate": 2.2752854812398046e-05, "loss": 0.1887, "num_input_tokens_seen": 12062336, "step": 5580 }, { "epoch": 0.9110929853181077, "grad_norm": 2.1094250679016113, "learning_rate": 2.2773246329526916e-05, "loss": 0.1854, "num_input_tokens_seen": 12072096, "step": 5585 }, { "epoch": 0.9119086460032626, "grad_norm": 0.4575773775577545, "learning_rate": 2.279363784665579e-05, "loss": 0.1105, "num_input_tokens_seen": 12083136, "step": 5590 }, { "epoch": 0.9127243066884176, "grad_norm": 0.5437511205673218, "learning_rate": 2.2814029363784667e-05, "loss": 0.2371, "num_input_tokens_seen": 12094272, "step": 5595 }, { "epoch": 0.9135399673735726, "grad_norm": 0.33670419454574585, "learning_rate": 2.283442088091354e-05, "loss": 0.2442, "num_input_tokens_seen": 12105280, "step": 5600 }, { "epoch": 0.9143556280587276, "grad_norm": 0.9186738133430481, "learning_rate": 2.2854812398042415e-05, "loss": 0.2279, "num_input_tokens_seen": 12115968, "step": 5605 }, { "epoch": 0.9151712887438825, "grad_norm": 1.5646605491638184, "learning_rate": 2.287520391517129e-05, "loss": 0.171, "num_input_tokens_seen": 12127136, "step": 5610 }, { "epoch": 0.9159869494290375, "grad_norm": 2.6565909385681152, "learning_rate": 2.2895595432300166e-05, "loss": 0.151, "num_input_tokens_seen": 12136704, "step": 5615 }, { "epoch": 0.9168026101141925, "grad_norm": 0.25066646933555603, "learning_rate": 2.291598694942904e-05, "loss": 0.0754, "num_input_tokens_seen": 12146784, "step": 5620 }, { "epoch": 0.9176182707993474, "grad_norm": 1.0960850715637207, "learning_rate": 2.2936378466557913e-05, "loss": 0.0868, "num_input_tokens_seen": 12157248, "step": 5625 }, { "epoch": 0.9184339314845025, "grad_norm": 1.156923770904541, "learning_rate": 2.2956769983686787e-05, "loss": 0.1813, "num_input_tokens_seen": 12168256, "step": 5630 }, { "epoch": 0.9192495921696574, "grad_norm": 1.5387502908706665, "learning_rate": 2.297716150081566e-05, "loss": 0.1507, "num_input_tokens_seen": 12177760, "step": 5635 }, { "epoch": 0.9200652528548124, "grad_norm": 0.8192406892776489, "learning_rate": 2.2997553017944538e-05, "loss": 0.0836, "num_input_tokens_seen": 12188192, "step": 5640 }, { "epoch": 0.9208809135399674, "grad_norm": 1.582208514213562, "learning_rate": 2.301794453507341e-05, "loss": 0.2373, "num_input_tokens_seen": 12198880, "step": 5645 }, { "epoch": 0.9216965742251223, "grad_norm": 1.3057328462600708, "learning_rate": 2.3038336052202286e-05, "loss": 0.2172, "num_input_tokens_seen": 12208800, "step": 5650 }, { "epoch": 0.9225122349102773, "grad_norm": 0.6589564681053162, "learning_rate": 2.305872756933116e-05, "loss": 0.2169, "num_input_tokens_seen": 12219360, "step": 5655 }, { "epoch": 0.9233278955954323, "grad_norm": 2.207927703857422, "learning_rate": 2.3079119086460034e-05, "loss": 0.1324, "num_input_tokens_seen": 12229376, "step": 5660 }, { "epoch": 0.9241435562805873, "grad_norm": 1.4825502634048462, "learning_rate": 2.3099510603588907e-05, "loss": 0.2289, "num_input_tokens_seen": 12241152, "step": 5665 }, { "epoch": 0.9249592169657422, "grad_norm": 0.4954742193222046, "learning_rate": 2.311990212071778e-05, "loss": 0.2789, "num_input_tokens_seen": 12252192, "step": 5670 }, { "epoch": 0.9257748776508973, "grad_norm": 0.918637216091156, "learning_rate": 2.314029363784666e-05, "loss": 0.2238, "num_input_tokens_seen": 12262848, "step": 5675 }, { "epoch": 0.9265905383360522, "grad_norm": 1.2427233457565308, "learning_rate": 2.3160685154975532e-05, "loss": 0.2379, "num_input_tokens_seen": 12273728, "step": 5680 }, { "epoch": 0.9274061990212071, "grad_norm": 0.6488909125328064, "learning_rate": 2.3181076672104406e-05, "loss": 0.1383, "num_input_tokens_seen": 12284512, "step": 5685 }, { "epoch": 0.9282218597063622, "grad_norm": 1.6316263675689697, "learning_rate": 2.320146818923328e-05, "loss": 0.1962, "num_input_tokens_seen": 12295968, "step": 5690 }, { "epoch": 0.9290375203915171, "grad_norm": 0.5079619884490967, "learning_rate": 2.3221859706362154e-05, "loss": 0.1461, "num_input_tokens_seen": 12307616, "step": 5695 }, { "epoch": 0.9298531810766721, "grad_norm": 0.6208934783935547, "learning_rate": 2.324225122349103e-05, "loss": 0.1539, "num_input_tokens_seen": 12318400, "step": 5700 }, { "epoch": 0.9306688417618271, "grad_norm": 0.5709812641143799, "learning_rate": 2.32626427406199e-05, "loss": 0.114, "num_input_tokens_seen": 12329472, "step": 5705 }, { "epoch": 0.9314845024469821, "grad_norm": 0.8119657635688782, "learning_rate": 2.328303425774878e-05, "loss": 0.1043, "num_input_tokens_seen": 12340032, "step": 5710 }, { "epoch": 0.932300163132137, "grad_norm": 0.6785255670547485, "learning_rate": 2.3303425774877652e-05, "loss": 0.0954, "num_input_tokens_seen": 12351584, "step": 5715 }, { "epoch": 0.933115823817292, "grad_norm": 1.1752477884292603, "learning_rate": 2.3323817292006526e-05, "loss": 0.3214, "num_input_tokens_seen": 12362752, "step": 5720 }, { "epoch": 0.933931484502447, "grad_norm": 1.0111322402954102, "learning_rate": 2.33442088091354e-05, "loss": 0.2946, "num_input_tokens_seen": 12374720, "step": 5725 }, { "epoch": 0.9347471451876019, "grad_norm": 0.338679701089859, "learning_rate": 2.3364600326264274e-05, "loss": 0.111, "num_input_tokens_seen": 12384896, "step": 5730 }, { "epoch": 0.935562805872757, "grad_norm": 0.8557340502738953, "learning_rate": 2.338499184339315e-05, "loss": 0.1392, "num_input_tokens_seen": 12395808, "step": 5735 }, { "epoch": 0.9363784665579119, "grad_norm": 1.5960032939910889, "learning_rate": 2.3405383360522025e-05, "loss": 0.1845, "num_input_tokens_seen": 12406880, "step": 5740 }, { "epoch": 0.9371941272430668, "grad_norm": 0.6839702725410461, "learning_rate": 2.34257748776509e-05, "loss": 0.2687, "num_input_tokens_seen": 12418240, "step": 5745 }, { "epoch": 0.9380097879282219, "grad_norm": 0.8646183013916016, "learning_rate": 2.3446166394779773e-05, "loss": 0.2047, "num_input_tokens_seen": 12428896, "step": 5750 }, { "epoch": 0.9388254486133768, "grad_norm": 0.2707282304763794, "learning_rate": 2.3466557911908646e-05, "loss": 0.0551, "num_input_tokens_seen": 12439872, "step": 5755 }, { "epoch": 0.9396411092985318, "grad_norm": 0.24920654296875, "learning_rate": 2.3486949429037524e-05, "loss": 0.0868, "num_input_tokens_seen": 12451840, "step": 5760 }, { "epoch": 0.9404567699836868, "grad_norm": 0.6038890480995178, "learning_rate": 2.3507340946166394e-05, "loss": 0.1803, "num_input_tokens_seen": 12463328, "step": 5765 }, { "epoch": 0.9412724306688418, "grad_norm": 0.5356212854385376, "learning_rate": 2.352773246329527e-05, "loss": 0.2671, "num_input_tokens_seen": 12474080, "step": 5770 }, { "epoch": 0.9420880913539967, "grad_norm": 0.3453023135662079, "learning_rate": 2.3548123980424145e-05, "loss": 0.3396, "num_input_tokens_seen": 12484960, "step": 5775 }, { "epoch": 0.9429037520391517, "grad_norm": 0.14999543130397797, "learning_rate": 2.356851549755302e-05, "loss": 0.0783, "num_input_tokens_seen": 12495584, "step": 5780 }, { "epoch": 0.9437194127243067, "grad_norm": 0.51494961977005, "learning_rate": 2.3588907014681893e-05, "loss": 0.1999, "num_input_tokens_seen": 12505568, "step": 5785 }, { "epoch": 0.9445350734094616, "grad_norm": 0.269064337015152, "learning_rate": 2.3609298531810767e-05, "loss": 0.2111, "num_input_tokens_seen": 12516480, "step": 5790 }, { "epoch": 0.9453507340946167, "grad_norm": 1.7296295166015625, "learning_rate": 2.3629690048939644e-05, "loss": 0.3659, "num_input_tokens_seen": 12527136, "step": 5795 }, { "epoch": 0.9461663947797716, "grad_norm": 0.1266348659992218, "learning_rate": 2.3650081566068518e-05, "loss": 0.3405, "num_input_tokens_seen": 12537568, "step": 5800 }, { "epoch": 0.9469820554649266, "grad_norm": 0.34620264172554016, "learning_rate": 2.367047308319739e-05, "loss": 0.1261, "num_input_tokens_seen": 12548736, "step": 5805 }, { "epoch": 0.9477977161500816, "grad_norm": 0.10990728437900543, "learning_rate": 2.3690864600326265e-05, "loss": 0.122, "num_input_tokens_seen": 12560192, "step": 5810 }, { "epoch": 0.9486133768352365, "grad_norm": 0.4822218418121338, "learning_rate": 2.371125611745514e-05, "loss": 0.1114, "num_input_tokens_seen": 12570816, "step": 5815 }, { "epoch": 0.9494290375203915, "grad_norm": 2.9297780990600586, "learning_rate": 2.3731647634584013e-05, "loss": 0.2757, "num_input_tokens_seen": 12582272, "step": 5820 }, { "epoch": 0.9502446982055465, "grad_norm": 0.6797144412994385, "learning_rate": 2.3752039151712887e-05, "loss": 0.2501, "num_input_tokens_seen": 12594432, "step": 5825 }, { "epoch": 0.9510603588907015, "grad_norm": 0.5840714573860168, "learning_rate": 2.3772430668841764e-05, "loss": 0.0541, "num_input_tokens_seen": 12606048, "step": 5830 }, { "epoch": 0.9518760195758564, "grad_norm": 1.1621571779251099, "learning_rate": 2.3792822185970638e-05, "loss": 0.3241, "num_input_tokens_seen": 12617088, "step": 5835 }, { "epoch": 0.9526916802610114, "grad_norm": 1.6641908884048462, "learning_rate": 2.381321370309951e-05, "loss": 0.3314, "num_input_tokens_seen": 12627808, "step": 5840 }, { "epoch": 0.9535073409461664, "grad_norm": 0.43076133728027344, "learning_rate": 2.3833605220228385e-05, "loss": 0.0808, "num_input_tokens_seen": 12638720, "step": 5845 }, { "epoch": 0.9543230016313213, "grad_norm": 0.6191142797470093, "learning_rate": 2.385399673735726e-05, "loss": 0.1142, "num_input_tokens_seen": 12648608, "step": 5850 }, { "epoch": 0.9551386623164764, "grad_norm": 0.6124065518379211, "learning_rate": 2.3874388254486136e-05, "loss": 0.1, "num_input_tokens_seen": 12658944, "step": 5855 }, { "epoch": 0.9559543230016313, "grad_norm": 2.1615841388702393, "learning_rate": 2.3894779771615007e-05, "loss": 0.1741, "num_input_tokens_seen": 12669856, "step": 5860 }, { "epoch": 0.9567699836867863, "grad_norm": 0.9297847151756287, "learning_rate": 2.3915171288743884e-05, "loss": 0.2221, "num_input_tokens_seen": 12680096, "step": 5865 }, { "epoch": 0.9575856443719413, "grad_norm": 0.10183943063020706, "learning_rate": 2.3935562805872758e-05, "loss": 0.0489, "num_input_tokens_seen": 12691040, "step": 5870 }, { "epoch": 0.9584013050570962, "grad_norm": 0.20934873819351196, "learning_rate": 2.3955954323001635e-05, "loss": 0.1704, "num_input_tokens_seen": 12701728, "step": 5875 }, { "epoch": 0.9592169657422512, "grad_norm": 0.2443799078464508, "learning_rate": 2.3976345840130505e-05, "loss": 0.1481, "num_input_tokens_seen": 12712224, "step": 5880 }, { "epoch": 0.9600326264274062, "grad_norm": 1.216274380683899, "learning_rate": 2.399673735725938e-05, "loss": 0.2615, "num_input_tokens_seen": 12723264, "step": 5885 }, { "epoch": 0.9608482871125612, "grad_norm": 0.8208592534065247, "learning_rate": 2.4017128874388257e-05, "loss": 0.2007, "num_input_tokens_seen": 12734400, "step": 5890 }, { "epoch": 0.9616639477977161, "grad_norm": 0.08039859682321548, "learning_rate": 2.403752039151713e-05, "loss": 0.1548, "num_input_tokens_seen": 12745504, "step": 5895 }, { "epoch": 0.9624796084828712, "grad_norm": 0.7841009497642517, "learning_rate": 2.4057911908646004e-05, "loss": 0.2637, "num_input_tokens_seen": 12755520, "step": 5900 }, { "epoch": 0.9632952691680261, "grad_norm": 0.8188749551773071, "learning_rate": 2.4078303425774878e-05, "loss": 0.0647, "num_input_tokens_seen": 12766336, "step": 5905 }, { "epoch": 0.964110929853181, "grad_norm": 1.5803637504577637, "learning_rate": 2.4098694942903755e-05, "loss": 0.1637, "num_input_tokens_seen": 12776704, "step": 5910 }, { "epoch": 0.9649265905383361, "grad_norm": 0.7092013955116272, "learning_rate": 2.411908646003263e-05, "loss": 0.255, "num_input_tokens_seen": 12787872, "step": 5915 }, { "epoch": 0.965742251223491, "grad_norm": 0.4554656744003296, "learning_rate": 2.41394779771615e-05, "loss": 0.0978, "num_input_tokens_seen": 12798880, "step": 5920 }, { "epoch": 0.966557911908646, "grad_norm": 1.5387128591537476, "learning_rate": 2.4159869494290377e-05, "loss": 0.1887, "num_input_tokens_seen": 12809696, "step": 5925 }, { "epoch": 0.967373572593801, "grad_norm": 0.3342445194721222, "learning_rate": 2.418026101141925e-05, "loss": 0.1373, "num_input_tokens_seen": 12820192, "step": 5930 }, { "epoch": 0.968189233278956, "grad_norm": 2.2452731132507324, "learning_rate": 2.4200652528548128e-05, "loss": 0.0635, "num_input_tokens_seen": 12832384, "step": 5935 }, { "epoch": 0.9690048939641109, "grad_norm": 0.2392226606607437, "learning_rate": 2.4221044045676998e-05, "loss": 0.111, "num_input_tokens_seen": 12843936, "step": 5940 }, { "epoch": 0.9698205546492659, "grad_norm": 0.7011986374855042, "learning_rate": 2.4241435562805872e-05, "loss": 0.1334, "num_input_tokens_seen": 12854752, "step": 5945 }, { "epoch": 0.9706362153344209, "grad_norm": 0.7147500514984131, "learning_rate": 2.426182707993475e-05, "loss": 0.0807, "num_input_tokens_seen": 12865248, "step": 5950 }, { "epoch": 0.9714518760195758, "grad_norm": 2.0301167964935303, "learning_rate": 2.4282218597063623e-05, "loss": 0.1781, "num_input_tokens_seen": 12876128, "step": 5955 }, { "epoch": 0.9722675367047309, "grad_norm": 1.3803895711898804, "learning_rate": 2.4302610114192497e-05, "loss": 0.2655, "num_input_tokens_seen": 12887008, "step": 5960 }, { "epoch": 0.9730831973898858, "grad_norm": 0.09043998271226883, "learning_rate": 2.432300163132137e-05, "loss": 0.062, "num_input_tokens_seen": 12898624, "step": 5965 }, { "epoch": 0.9738988580750407, "grad_norm": 0.3484722673892975, "learning_rate": 2.4343393148450248e-05, "loss": 0.1135, "num_input_tokens_seen": 12909888, "step": 5970 }, { "epoch": 0.9747145187601958, "grad_norm": 1.9315521717071533, "learning_rate": 2.436378466557912e-05, "loss": 0.2772, "num_input_tokens_seen": 12920032, "step": 5975 }, { "epoch": 0.9755301794453507, "grad_norm": 2.3831372261047363, "learning_rate": 2.4384176182707992e-05, "loss": 0.3602, "num_input_tokens_seen": 12931392, "step": 5980 }, { "epoch": 0.9763458401305057, "grad_norm": 1.1407262086868286, "learning_rate": 2.440456769983687e-05, "loss": 0.1633, "num_input_tokens_seen": 12941472, "step": 5985 }, { "epoch": 0.9771615008156607, "grad_norm": 0.745768666267395, "learning_rate": 2.4424959216965743e-05, "loss": 0.0981, "num_input_tokens_seen": 12952192, "step": 5990 }, { "epoch": 0.9779771615008157, "grad_norm": 1.7681684494018555, "learning_rate": 2.4445350734094617e-05, "loss": 0.2383, "num_input_tokens_seen": 12963200, "step": 5995 }, { "epoch": 0.9787928221859706, "grad_norm": 0.8427234888076782, "learning_rate": 2.446574225122349e-05, "loss": 0.2022, "num_input_tokens_seen": 12972768, "step": 6000 }, { "epoch": 0.9796084828711256, "grad_norm": 1.0506258010864258, "learning_rate": 2.4486133768352368e-05, "loss": 0.0935, "num_input_tokens_seen": 12984064, "step": 6005 }, { "epoch": 0.9804241435562806, "grad_norm": 0.5907758474349976, "learning_rate": 2.4506525285481242e-05, "loss": 0.1821, "num_input_tokens_seen": 12994336, "step": 6010 }, { "epoch": 0.9812398042414355, "grad_norm": 0.1557426005601883, "learning_rate": 2.4526916802610116e-05, "loss": 0.067, "num_input_tokens_seen": 13004096, "step": 6015 }, { "epoch": 0.9820554649265906, "grad_norm": 0.28201887011528015, "learning_rate": 2.454730831973899e-05, "loss": 0.0523, "num_input_tokens_seen": 13015168, "step": 6020 }, { "epoch": 0.9828711256117455, "grad_norm": 0.12409595400094986, "learning_rate": 2.4567699836867863e-05, "loss": 0.0606, "num_input_tokens_seen": 13026080, "step": 6025 }, { "epoch": 0.9836867862969005, "grad_norm": 1.8593138456344604, "learning_rate": 2.458809135399674e-05, "loss": 0.2877, "num_input_tokens_seen": 13037888, "step": 6030 }, { "epoch": 0.9845024469820555, "grad_norm": 2.603433847427368, "learning_rate": 2.4608482871125614e-05, "loss": 0.3981, "num_input_tokens_seen": 13049440, "step": 6035 }, { "epoch": 0.9853181076672104, "grad_norm": 0.38098427653312683, "learning_rate": 2.4628874388254488e-05, "loss": 0.0789, "num_input_tokens_seen": 13060480, "step": 6040 }, { "epoch": 0.9861337683523654, "grad_norm": 0.8253401517868042, "learning_rate": 2.4649265905383362e-05, "loss": 0.1231, "num_input_tokens_seen": 13071584, "step": 6045 }, { "epoch": 0.9869494290375204, "grad_norm": 0.7708802223205566, "learning_rate": 2.4669657422512236e-05, "loss": 0.1513, "num_input_tokens_seen": 13081376, "step": 6050 }, { "epoch": 0.9877650897226754, "grad_norm": 0.08679323643445969, "learning_rate": 2.469004893964111e-05, "loss": 0.0877, "num_input_tokens_seen": 13092768, "step": 6055 }, { "epoch": 0.9885807504078303, "grad_norm": 0.5472085475921631, "learning_rate": 2.4710440456769983e-05, "loss": 0.2423, "num_input_tokens_seen": 13104096, "step": 6060 }, { "epoch": 0.9893964110929854, "grad_norm": 0.21649034321308136, "learning_rate": 2.473083197389886e-05, "loss": 0.1011, "num_input_tokens_seen": 13115488, "step": 6065 }, { "epoch": 0.9902120717781403, "grad_norm": 0.10802499204874039, "learning_rate": 2.4751223491027734e-05, "loss": 0.0866, "num_input_tokens_seen": 13126752, "step": 6070 }, { "epoch": 0.9910277324632952, "grad_norm": 0.2356015294790268, "learning_rate": 2.4771615008156608e-05, "loss": 0.1264, "num_input_tokens_seen": 13137920, "step": 6075 }, { "epoch": 0.9918433931484503, "grad_norm": 1.3174539804458618, "learning_rate": 2.4792006525285482e-05, "loss": 0.1739, "num_input_tokens_seen": 13148416, "step": 6080 }, { "epoch": 0.9926590538336052, "grad_norm": 1.396807312965393, "learning_rate": 2.4812398042414356e-05, "loss": 0.0809, "num_input_tokens_seen": 13157920, "step": 6085 }, { "epoch": 0.9934747145187602, "grad_norm": 0.34657758474349976, "learning_rate": 2.4832789559543233e-05, "loss": 0.0802, "num_input_tokens_seen": 13167968, "step": 6090 }, { "epoch": 0.9942903752039152, "grad_norm": 1.412208080291748, "learning_rate": 2.4853181076672104e-05, "loss": 0.2245, "num_input_tokens_seen": 13179648, "step": 6095 }, { "epoch": 0.9951060358890701, "grad_norm": 0.1649756133556366, "learning_rate": 2.487357259380098e-05, "loss": 0.1574, "num_input_tokens_seen": 13190656, "step": 6100 }, { "epoch": 0.9959216965742251, "grad_norm": 0.3353521227836609, "learning_rate": 2.4893964110929855e-05, "loss": 0.1946, "num_input_tokens_seen": 13201120, "step": 6105 }, { "epoch": 0.9967373572593801, "grad_norm": 0.475442111492157, "learning_rate": 2.491435562805873e-05, "loss": 0.0936, "num_input_tokens_seen": 13211744, "step": 6110 }, { "epoch": 0.9975530179445351, "grad_norm": 0.042498793452978134, "learning_rate": 2.4934747145187602e-05, "loss": 0.198, "num_input_tokens_seen": 13222752, "step": 6115 }, { "epoch": 0.99836867862969, "grad_norm": 0.3564106225967407, "learning_rate": 2.4955138662316476e-05, "loss": 0.2443, "num_input_tokens_seen": 13233696, "step": 6120 }, { "epoch": 0.9991843393148451, "grad_norm": 0.6613544821739197, "learning_rate": 2.4975530179445353e-05, "loss": 0.1975, "num_input_tokens_seen": 13245504, "step": 6125 }, { "epoch": 1.0, "grad_norm": 0.05797780305147171, "learning_rate": 2.4995921696574227e-05, "loss": 0.2557, "num_input_tokens_seen": 13255424, "step": 6130 }, { "epoch": 1.0, "eval_loss": 0.16971780359745026, "eval_runtime": 90.6185, "eval_samples_per_second": 30.071, "eval_steps_per_second": 7.526, "num_input_tokens_seen": 13255424, "step": 6130 }, { "epoch": 1.000815660685155, "grad_norm": 0.8408638834953308, "learning_rate": 2.50163132137031e-05, "loss": 0.1987, "num_input_tokens_seen": 13265952, "step": 6135 }, { "epoch": 1.0016313213703099, "grad_norm": 1.003691554069519, "learning_rate": 2.503670473083197e-05, "loss": 0.1147, "num_input_tokens_seen": 13276768, "step": 6140 }, { "epoch": 1.002446982055465, "grad_norm": 0.36370348930358887, "learning_rate": 2.505709624796085e-05, "loss": 0.1287, "num_input_tokens_seen": 13287776, "step": 6145 }, { "epoch": 1.00326264274062, "grad_norm": 0.25202417373657227, "learning_rate": 2.5077487765089726e-05, "loss": 0.2395, "num_input_tokens_seen": 13299488, "step": 6150 }, { "epoch": 1.004078303425775, "grad_norm": 0.6363725066184998, "learning_rate": 2.50978792822186e-05, "loss": 0.1067, "num_input_tokens_seen": 13310208, "step": 6155 }, { "epoch": 1.0048939641109298, "grad_norm": 0.07952968776226044, "learning_rate": 2.511827079934747e-05, "loss": 0.087, "num_input_tokens_seen": 13319904, "step": 6160 }, { "epoch": 1.0057096247960848, "grad_norm": 0.4860723912715912, "learning_rate": 2.5138662316476347e-05, "loss": 0.1636, "num_input_tokens_seen": 13330528, "step": 6165 }, { "epoch": 1.0065252854812399, "grad_norm": 0.2687889337539673, "learning_rate": 2.5159053833605224e-05, "loss": 0.1261, "num_input_tokens_seen": 13341760, "step": 6170 }, { "epoch": 1.0073409461663947, "grad_norm": 0.04493793472647667, "learning_rate": 2.5179445350734095e-05, "loss": 0.2071, "num_input_tokens_seen": 13352448, "step": 6175 }, { "epoch": 1.0081566068515497, "grad_norm": 1.0383867025375366, "learning_rate": 2.519983686786297e-05, "loss": 0.2695, "num_input_tokens_seen": 13363296, "step": 6180 }, { "epoch": 1.0089722675367048, "grad_norm": 1.2102408409118652, "learning_rate": 2.5220228384991846e-05, "loss": 0.101, "num_input_tokens_seen": 13374368, "step": 6185 }, { "epoch": 1.0097879282218598, "grad_norm": 1.0517953634262085, "learning_rate": 2.524061990212072e-05, "loss": 0.1728, "num_input_tokens_seen": 13385184, "step": 6190 }, { "epoch": 1.0106035889070146, "grad_norm": 1.4415392875671387, "learning_rate": 2.526101141924959e-05, "loss": 0.3058, "num_input_tokens_seen": 13395104, "step": 6195 }, { "epoch": 1.0114192495921697, "grad_norm": 1.0897414684295654, "learning_rate": 2.5281402936378467e-05, "loss": 0.1554, "num_input_tokens_seen": 13406080, "step": 6200 }, { "epoch": 1.0122349102773247, "grad_norm": 0.4400215446949005, "learning_rate": 2.5301794453507345e-05, "loss": 0.1953, "num_input_tokens_seen": 13417952, "step": 6205 }, { "epoch": 1.0130505709624795, "grad_norm": 1.6920990943908691, "learning_rate": 2.5322185970636215e-05, "loss": 0.1561, "num_input_tokens_seen": 13428512, "step": 6210 }, { "epoch": 1.0138662316476346, "grad_norm": 0.20910201966762543, "learning_rate": 2.534257748776509e-05, "loss": 0.0723, "num_input_tokens_seen": 13439648, "step": 6215 }, { "epoch": 1.0146818923327896, "grad_norm": 0.20997019112110138, "learning_rate": 2.5362969004893966e-05, "loss": 0.1331, "num_input_tokens_seen": 13451520, "step": 6220 }, { "epoch": 1.0154975530179446, "grad_norm": 1.7941372394561768, "learning_rate": 2.5383360522022843e-05, "loss": 0.1145, "num_input_tokens_seen": 13460480, "step": 6225 }, { "epoch": 1.0163132137030995, "grad_norm": 0.36920154094696045, "learning_rate": 2.5403752039151714e-05, "loss": 0.1116, "num_input_tokens_seen": 13471328, "step": 6230 }, { "epoch": 1.0171288743882545, "grad_norm": 1.4522379636764526, "learning_rate": 2.5424143556280588e-05, "loss": 0.1334, "num_input_tokens_seen": 13482048, "step": 6235 }, { "epoch": 1.0179445350734095, "grad_norm": 0.5432682037353516, "learning_rate": 2.5444535073409465e-05, "loss": 0.2269, "num_input_tokens_seen": 13492512, "step": 6240 }, { "epoch": 1.0187601957585644, "grad_norm": 0.7726547718048096, "learning_rate": 2.5464926590538335e-05, "loss": 0.1539, "num_input_tokens_seen": 13504000, "step": 6245 }, { "epoch": 1.0195758564437194, "grad_norm": 0.3767243027687073, "learning_rate": 2.5485318107667212e-05, "loss": 0.1267, "num_input_tokens_seen": 13514784, "step": 6250 }, { "epoch": 1.0203915171288744, "grad_norm": 0.925632655620575, "learning_rate": 2.5505709624796086e-05, "loss": 0.1331, "num_input_tokens_seen": 13525376, "step": 6255 }, { "epoch": 1.0212071778140293, "grad_norm": 0.35194873809814453, "learning_rate": 2.5526101141924963e-05, "loss": 0.14, "num_input_tokens_seen": 13535872, "step": 6260 }, { "epoch": 1.0220228384991843, "grad_norm": 0.7817695140838623, "learning_rate": 2.5546492659053834e-05, "loss": 0.107, "num_input_tokens_seen": 13546464, "step": 6265 }, { "epoch": 1.0228384991843393, "grad_norm": 0.5599012970924377, "learning_rate": 2.5566884176182708e-05, "loss": 0.2557, "num_input_tokens_seen": 13555936, "step": 6270 }, { "epoch": 1.0236541598694944, "grad_norm": 0.4872044026851654, "learning_rate": 2.5587275693311585e-05, "loss": 0.0317, "num_input_tokens_seen": 13566528, "step": 6275 }, { "epoch": 1.0244698205546492, "grad_norm": 0.1385488063097, "learning_rate": 2.5607667210440455e-05, "loss": 0.1703, "num_input_tokens_seen": 13577152, "step": 6280 }, { "epoch": 1.0252854812398042, "grad_norm": 1.4723848104476929, "learning_rate": 2.5628058727569333e-05, "loss": 0.1181, "num_input_tokens_seen": 13588704, "step": 6285 }, { "epoch": 1.0261011419249593, "grad_norm": 1.5383245944976807, "learning_rate": 2.5648450244698206e-05, "loss": 0.0744, "num_input_tokens_seen": 13599552, "step": 6290 }, { "epoch": 1.026916802610114, "grad_norm": 0.17126432061195374, "learning_rate": 2.5668841761827084e-05, "loss": 0.0462, "num_input_tokens_seen": 13611264, "step": 6295 }, { "epoch": 1.0277324632952691, "grad_norm": 0.7245639562606812, "learning_rate": 2.5689233278955954e-05, "loss": 0.2077, "num_input_tokens_seen": 13621760, "step": 6300 }, { "epoch": 1.0285481239804242, "grad_norm": 1.5111762285232544, "learning_rate": 2.570962479608483e-05, "loss": 0.2515, "num_input_tokens_seen": 13632672, "step": 6305 }, { "epoch": 1.0293637846655792, "grad_norm": 0.2231024205684662, "learning_rate": 2.5730016313213705e-05, "loss": 0.043, "num_input_tokens_seen": 13643584, "step": 6310 }, { "epoch": 1.030179445350734, "grad_norm": 0.33908364176750183, "learning_rate": 2.5750407830342575e-05, "loss": 0.2919, "num_input_tokens_seen": 13655488, "step": 6315 }, { "epoch": 1.030995106035889, "grad_norm": 1.184804081916809, "learning_rate": 2.5770799347471453e-05, "loss": 0.1723, "num_input_tokens_seen": 13666368, "step": 6320 }, { "epoch": 1.031810766721044, "grad_norm": 0.35690420866012573, "learning_rate": 2.579119086460033e-05, "loss": 0.3247, "num_input_tokens_seen": 13678048, "step": 6325 }, { "epoch": 1.032626427406199, "grad_norm": 2.067239999771118, "learning_rate": 2.5811582381729204e-05, "loss": 0.1153, "num_input_tokens_seen": 13689024, "step": 6330 }, { "epoch": 1.033442088091354, "grad_norm": 0.8257416486740112, "learning_rate": 2.5831973898858074e-05, "loss": 0.2088, "num_input_tokens_seen": 13699744, "step": 6335 }, { "epoch": 1.034257748776509, "grad_norm": 0.44281765818595886, "learning_rate": 2.585236541598695e-05, "loss": 0.1106, "num_input_tokens_seen": 13710016, "step": 6340 }, { "epoch": 1.035073409461664, "grad_norm": 0.6575015783309937, "learning_rate": 2.587275693311583e-05, "loss": 0.1574, "num_input_tokens_seen": 13721664, "step": 6345 }, { "epoch": 1.0358890701468189, "grad_norm": 1.13430917263031, "learning_rate": 2.58931484502447e-05, "loss": 0.1063, "num_input_tokens_seen": 13734752, "step": 6350 }, { "epoch": 1.036704730831974, "grad_norm": 0.34053677320480347, "learning_rate": 2.5913539967373573e-05, "loss": 0.1118, "num_input_tokens_seen": 13745504, "step": 6355 }, { "epoch": 1.037520391517129, "grad_norm": 3.3207836151123047, "learning_rate": 2.593393148450245e-05, "loss": 0.2683, "num_input_tokens_seen": 13755968, "step": 6360 }, { "epoch": 1.0383360522022838, "grad_norm": 1.6118868589401245, "learning_rate": 2.595432300163132e-05, "loss": 0.1772, "num_input_tokens_seen": 13767296, "step": 6365 }, { "epoch": 1.0391517128874388, "grad_norm": 0.13146935403347015, "learning_rate": 2.5974714518760194e-05, "loss": 0.2154, "num_input_tokens_seen": 13778656, "step": 6370 }, { "epoch": 1.0399673735725938, "grad_norm": 1.8729017972946167, "learning_rate": 2.599510603588907e-05, "loss": 0.1417, "num_input_tokens_seen": 13789216, "step": 6375 }, { "epoch": 1.0407830342577489, "grad_norm": 0.6877574920654297, "learning_rate": 2.601549755301795e-05, "loss": 0.143, "num_input_tokens_seen": 13799680, "step": 6380 }, { "epoch": 1.0415986949429037, "grad_norm": 0.29747068881988525, "learning_rate": 2.603588907014682e-05, "loss": 0.1555, "num_input_tokens_seen": 13810720, "step": 6385 }, { "epoch": 1.0424143556280587, "grad_norm": 0.17228420078754425, "learning_rate": 2.6056280587275693e-05, "loss": 0.144, "num_input_tokens_seen": 13821856, "step": 6390 }, { "epoch": 1.0432300163132138, "grad_norm": 0.49469250440597534, "learning_rate": 2.607667210440457e-05, "loss": 0.1837, "num_input_tokens_seen": 13833984, "step": 6395 }, { "epoch": 1.0440456769983686, "grad_norm": 0.840440034866333, "learning_rate": 2.609706362153344e-05, "loss": 0.2322, "num_input_tokens_seen": 13844928, "step": 6400 }, { "epoch": 1.0448613376835236, "grad_norm": 2.4371728897094727, "learning_rate": 2.6117455138662318e-05, "loss": 0.2051, "num_input_tokens_seen": 13856384, "step": 6405 }, { "epoch": 1.0456769983686787, "grad_norm": 0.39406827092170715, "learning_rate": 2.613784665579119e-05, "loss": 0.1121, "num_input_tokens_seen": 13867456, "step": 6410 }, { "epoch": 1.0464926590538337, "grad_norm": 0.23891323804855347, "learning_rate": 2.615823817292007e-05, "loss": 0.0415, "num_input_tokens_seen": 13877216, "step": 6415 }, { "epoch": 1.0473083197389885, "grad_norm": 1.4919840097427368, "learning_rate": 2.617862969004894e-05, "loss": 0.372, "num_input_tokens_seen": 13888192, "step": 6420 }, { "epoch": 1.0481239804241436, "grad_norm": 0.7874672412872314, "learning_rate": 2.6199021207177817e-05, "loss": 0.044, "num_input_tokens_seen": 13899072, "step": 6425 }, { "epoch": 1.0489396411092986, "grad_norm": 2.1759941577911377, "learning_rate": 2.621941272430669e-05, "loss": 0.2085, "num_input_tokens_seen": 13909792, "step": 6430 }, { "epoch": 1.0497553017944534, "grad_norm": 0.9163088798522949, "learning_rate": 2.623980424143556e-05, "loss": 0.1719, "num_input_tokens_seen": 13920800, "step": 6435 }, { "epoch": 1.0505709624796085, "grad_norm": 0.8951594829559326, "learning_rate": 2.6260195758564438e-05, "loss": 0.1025, "num_input_tokens_seen": 13930976, "step": 6440 }, { "epoch": 1.0513866231647635, "grad_norm": 0.7039554119110107, "learning_rate": 2.6280587275693315e-05, "loss": 0.0704, "num_input_tokens_seen": 13942080, "step": 6445 }, { "epoch": 1.0522022838499185, "grad_norm": 1.3319934606552124, "learning_rate": 2.630097879282219e-05, "loss": 0.1026, "num_input_tokens_seen": 13952736, "step": 6450 }, { "epoch": 1.0530179445350734, "grad_norm": 1.2572267055511475, "learning_rate": 2.632137030995106e-05, "loss": 0.137, "num_input_tokens_seen": 13962400, "step": 6455 }, { "epoch": 1.0538336052202284, "grad_norm": 0.06527839601039886, "learning_rate": 2.6341761827079937e-05, "loss": 0.0389, "num_input_tokens_seen": 13974304, "step": 6460 }, { "epoch": 1.0546492659053834, "grad_norm": 0.5592102408409119, "learning_rate": 2.636215334420881e-05, "loss": 0.1336, "num_input_tokens_seen": 13985248, "step": 6465 }, { "epoch": 1.0554649265905383, "grad_norm": 1.8896276950836182, "learning_rate": 2.638254486133768e-05, "loss": 0.1344, "num_input_tokens_seen": 13995840, "step": 6470 }, { "epoch": 1.0562805872756933, "grad_norm": 0.16912586987018585, "learning_rate": 2.6402936378466558e-05, "loss": 0.109, "num_input_tokens_seen": 14007104, "step": 6475 }, { "epoch": 1.0570962479608483, "grad_norm": 1.3286287784576416, "learning_rate": 2.6423327895595435e-05, "loss": 0.1119, "num_input_tokens_seen": 14018144, "step": 6480 }, { "epoch": 1.0579119086460032, "grad_norm": 0.3749818503856659, "learning_rate": 2.644371941272431e-05, "loss": 0.1245, "num_input_tokens_seen": 14029024, "step": 6485 }, { "epoch": 1.0587275693311582, "grad_norm": 0.09329988062381744, "learning_rate": 2.646411092985318e-05, "loss": 0.2473, "num_input_tokens_seen": 14040480, "step": 6490 }, { "epoch": 1.0595432300163132, "grad_norm": 0.26970794796943665, "learning_rate": 2.6484502446982057e-05, "loss": 0.0164, "num_input_tokens_seen": 14050976, "step": 6495 }, { "epoch": 1.0603588907014683, "grad_norm": 0.8618782162666321, "learning_rate": 2.6504893964110934e-05, "loss": 0.0291, "num_input_tokens_seen": 14060352, "step": 6500 }, { "epoch": 1.061174551386623, "grad_norm": 1.620761513710022, "learning_rate": 2.6525285481239804e-05, "loss": 0.0853, "num_input_tokens_seen": 14071680, "step": 6505 }, { "epoch": 1.0619902120717781, "grad_norm": 0.8047590255737305, "learning_rate": 2.6545676998368678e-05, "loss": 0.1481, "num_input_tokens_seen": 14083296, "step": 6510 }, { "epoch": 1.0628058727569332, "grad_norm": 2.054839849472046, "learning_rate": 2.6566068515497556e-05, "loss": 0.1436, "num_input_tokens_seen": 14094752, "step": 6515 }, { "epoch": 1.0636215334420882, "grad_norm": 2.2537803649902344, "learning_rate": 2.6586460032626433e-05, "loss": 0.3233, "num_input_tokens_seen": 14105088, "step": 6520 }, { "epoch": 1.064437194127243, "grad_norm": 3.3131163120269775, "learning_rate": 2.6606851549755303e-05, "loss": 0.2296, "num_input_tokens_seen": 14114976, "step": 6525 }, { "epoch": 1.065252854812398, "grad_norm": 1.4047459363937378, "learning_rate": 2.6627243066884177e-05, "loss": 0.1501, "num_input_tokens_seen": 14126592, "step": 6530 }, { "epoch": 1.066068515497553, "grad_norm": 1.6666779518127441, "learning_rate": 2.6647634584013054e-05, "loss": 0.197, "num_input_tokens_seen": 14138048, "step": 6535 }, { "epoch": 1.066884176182708, "grad_norm": 0.319205641746521, "learning_rate": 2.6668026101141925e-05, "loss": 0.1208, "num_input_tokens_seen": 14148128, "step": 6540 }, { "epoch": 1.067699836867863, "grad_norm": 0.9152668714523315, "learning_rate": 2.66884176182708e-05, "loss": 0.1333, "num_input_tokens_seen": 14160096, "step": 6545 }, { "epoch": 1.068515497553018, "grad_norm": 0.30633482336997986, "learning_rate": 2.6708809135399676e-05, "loss": 0.0995, "num_input_tokens_seen": 14170752, "step": 6550 }, { "epoch": 1.0693311582381728, "grad_norm": 0.11097779870033264, "learning_rate": 2.6729200652528553e-05, "loss": 0.3873, "num_input_tokens_seen": 14181952, "step": 6555 }, { "epoch": 1.0701468189233279, "grad_norm": 0.2780572772026062, "learning_rate": 2.6749592169657423e-05, "loss": 0.2277, "num_input_tokens_seen": 14192384, "step": 6560 }, { "epoch": 1.070962479608483, "grad_norm": 0.10025517642498016, "learning_rate": 2.6769983686786297e-05, "loss": 0.0462, "num_input_tokens_seen": 14204384, "step": 6565 }, { "epoch": 1.071778140293638, "grad_norm": 0.35778772830963135, "learning_rate": 2.6790375203915174e-05, "loss": 0.1434, "num_input_tokens_seen": 14215776, "step": 6570 }, { "epoch": 1.0725938009787928, "grad_norm": 0.3651520609855652, "learning_rate": 2.6810766721044045e-05, "loss": 0.1406, "num_input_tokens_seen": 14227520, "step": 6575 }, { "epoch": 1.0734094616639478, "grad_norm": 0.25669023394584656, "learning_rate": 2.6831158238172922e-05, "loss": 0.185, "num_input_tokens_seen": 14238976, "step": 6580 }, { "epoch": 1.0742251223491028, "grad_norm": 0.03840647265315056, "learning_rate": 2.6851549755301796e-05, "loss": 0.1571, "num_input_tokens_seen": 14249344, "step": 6585 }, { "epoch": 1.0750407830342577, "grad_norm": 0.6657704710960388, "learning_rate": 2.6871941272430666e-05, "loss": 0.0446, "num_input_tokens_seen": 14259840, "step": 6590 }, { "epoch": 1.0758564437194127, "grad_norm": 0.6102529168128967, "learning_rate": 2.6892332789559543e-05, "loss": 0.212, "num_input_tokens_seen": 14271456, "step": 6595 }, { "epoch": 1.0766721044045677, "grad_norm": 1.4916627407073975, "learning_rate": 2.691272430668842e-05, "loss": 0.1118, "num_input_tokens_seen": 14282688, "step": 6600 }, { "epoch": 1.0774877650897228, "grad_norm": 0.7622233033180237, "learning_rate": 2.6933115823817294e-05, "loss": 0.0786, "num_input_tokens_seen": 14293920, "step": 6605 }, { "epoch": 1.0783034257748776, "grad_norm": 2.8302083015441895, "learning_rate": 2.6953507340946165e-05, "loss": 0.0792, "num_input_tokens_seen": 14304512, "step": 6610 }, { "epoch": 1.0791190864600326, "grad_norm": 0.8392570614814758, "learning_rate": 2.6973898858075042e-05, "loss": 0.0645, "num_input_tokens_seen": 14315616, "step": 6615 }, { "epoch": 1.0799347471451877, "grad_norm": 0.22010819613933563, "learning_rate": 2.699429037520392e-05, "loss": 0.0614, "num_input_tokens_seen": 14326400, "step": 6620 }, { "epoch": 1.0807504078303425, "grad_norm": 0.049191996455192566, "learning_rate": 2.701468189233279e-05, "loss": 0.1748, "num_input_tokens_seen": 14336288, "step": 6625 }, { "epoch": 1.0815660685154975, "grad_norm": 0.8809645175933838, "learning_rate": 2.7035073409461664e-05, "loss": 0.1262, "num_input_tokens_seen": 14347616, "step": 6630 }, { "epoch": 1.0823817292006526, "grad_norm": 0.31376710534095764, "learning_rate": 2.705546492659054e-05, "loss": 0.1177, "num_input_tokens_seen": 14357664, "step": 6635 }, { "epoch": 1.0831973898858076, "grad_norm": 1.3577420711517334, "learning_rate": 2.7075856443719415e-05, "loss": 0.3389, "num_input_tokens_seen": 14367232, "step": 6640 }, { "epoch": 1.0840130505709624, "grad_norm": 0.13520829379558563, "learning_rate": 2.7096247960848285e-05, "loss": 0.0861, "num_input_tokens_seen": 14378048, "step": 6645 }, { "epoch": 1.0848287112561175, "grad_norm": 1.3878653049468994, "learning_rate": 2.7116639477977162e-05, "loss": 0.2148, "num_input_tokens_seen": 14389248, "step": 6650 }, { "epoch": 1.0856443719412725, "grad_norm": 0.4986162781715393, "learning_rate": 2.713703099510604e-05, "loss": 0.1043, "num_input_tokens_seen": 14399616, "step": 6655 }, { "epoch": 1.0864600326264273, "grad_norm": 2.296828508377075, "learning_rate": 2.715742251223491e-05, "loss": 0.3412, "num_input_tokens_seen": 14412000, "step": 6660 }, { "epoch": 1.0872756933115824, "grad_norm": 0.24937503039836884, "learning_rate": 2.7177814029363784e-05, "loss": 0.2364, "num_input_tokens_seen": 14423392, "step": 6665 }, { "epoch": 1.0880913539967374, "grad_norm": 0.5714582800865173, "learning_rate": 2.719820554649266e-05, "loss": 0.2719, "num_input_tokens_seen": 14433792, "step": 6670 }, { "epoch": 1.0889070146818924, "grad_norm": 0.6201543211936951, "learning_rate": 2.7218597063621538e-05, "loss": 0.1224, "num_input_tokens_seen": 14445152, "step": 6675 }, { "epoch": 1.0897226753670473, "grad_norm": 1.1842833757400513, "learning_rate": 2.723898858075041e-05, "loss": 0.1394, "num_input_tokens_seen": 14455968, "step": 6680 }, { "epoch": 1.0905383360522023, "grad_norm": 0.33757251501083374, "learning_rate": 2.7259380097879282e-05, "loss": 0.0547, "num_input_tokens_seen": 14465824, "step": 6685 }, { "epoch": 1.0913539967373573, "grad_norm": 2.088867425918579, "learning_rate": 2.727977161500816e-05, "loss": 0.137, "num_input_tokens_seen": 14476480, "step": 6690 }, { "epoch": 1.0921696574225122, "grad_norm": 1.8245424032211304, "learning_rate": 2.730016313213703e-05, "loss": 0.1638, "num_input_tokens_seen": 14486144, "step": 6695 }, { "epoch": 1.0929853181076672, "grad_norm": 0.8466487526893616, "learning_rate": 2.7320554649265907e-05, "loss": 0.2401, "num_input_tokens_seen": 14497664, "step": 6700 }, { "epoch": 1.0938009787928222, "grad_norm": 2.0088562965393066, "learning_rate": 2.734094616639478e-05, "loss": 0.1662, "num_input_tokens_seen": 14509888, "step": 6705 }, { "epoch": 1.094616639477977, "grad_norm": 0.5821710824966431, "learning_rate": 2.736133768352366e-05, "loss": 0.2094, "num_input_tokens_seen": 14521664, "step": 6710 }, { "epoch": 1.095432300163132, "grad_norm": 1.4707576036453247, "learning_rate": 2.738172920065253e-05, "loss": 0.2005, "num_input_tokens_seen": 14532768, "step": 6715 }, { "epoch": 1.0962479608482871, "grad_norm": 1.170255422592163, "learning_rate": 2.7402120717781406e-05, "loss": 0.1756, "num_input_tokens_seen": 14544160, "step": 6720 }, { "epoch": 1.0970636215334422, "grad_norm": 1.3005447387695312, "learning_rate": 2.742251223491028e-05, "loss": 0.0854, "num_input_tokens_seen": 14554112, "step": 6725 }, { "epoch": 1.097879282218597, "grad_norm": 0.9079923033714294, "learning_rate": 2.744290375203915e-05, "loss": 0.0927, "num_input_tokens_seen": 14564096, "step": 6730 }, { "epoch": 1.098694942903752, "grad_norm": 0.2865547239780426, "learning_rate": 2.7463295269168027e-05, "loss": 0.1419, "num_input_tokens_seen": 14574592, "step": 6735 }, { "epoch": 1.099510603588907, "grad_norm": 0.0975954458117485, "learning_rate": 2.74836867862969e-05, "loss": 0.0392, "num_input_tokens_seen": 14585728, "step": 6740 }, { "epoch": 1.100326264274062, "grad_norm": 0.5237464904785156, "learning_rate": 2.750407830342578e-05, "loss": 0.213, "num_input_tokens_seen": 14594464, "step": 6745 }, { "epoch": 1.101141924959217, "grad_norm": 1.1424612998962402, "learning_rate": 2.752446982055465e-05, "loss": 0.1355, "num_input_tokens_seen": 14604480, "step": 6750 }, { "epoch": 1.101957585644372, "grad_norm": 1.8097118139266968, "learning_rate": 2.7544861337683526e-05, "loss": 0.1355, "num_input_tokens_seen": 14616128, "step": 6755 }, { "epoch": 1.102773246329527, "grad_norm": 0.05354773625731468, "learning_rate": 2.75652528548124e-05, "loss": 0.0611, "num_input_tokens_seen": 14627008, "step": 6760 }, { "epoch": 1.1035889070146818, "grad_norm": 0.6302398443222046, "learning_rate": 2.758564437194127e-05, "loss": 0.2242, "num_input_tokens_seen": 14637280, "step": 6765 }, { "epoch": 1.1044045676998369, "grad_norm": 2.2300596237182617, "learning_rate": 2.7606035889070148e-05, "loss": 0.2154, "num_input_tokens_seen": 14646976, "step": 6770 }, { "epoch": 1.105220228384992, "grad_norm": 5.367934703826904, "learning_rate": 2.7626427406199025e-05, "loss": 0.1468, "num_input_tokens_seen": 14657152, "step": 6775 }, { "epoch": 1.1060358890701467, "grad_norm": 0.6736247539520264, "learning_rate": 2.76468189233279e-05, "loss": 0.3006, "num_input_tokens_seen": 14667488, "step": 6780 }, { "epoch": 1.1068515497553018, "grad_norm": 0.6653191447257996, "learning_rate": 2.766721044045677e-05, "loss": 0.2055, "num_input_tokens_seen": 14678112, "step": 6785 }, { "epoch": 1.1076672104404568, "grad_norm": 0.05840279161930084, "learning_rate": 2.7687601957585646e-05, "loss": 0.066, "num_input_tokens_seen": 14689312, "step": 6790 }, { "epoch": 1.1084828711256118, "grad_norm": 0.1236935704946518, "learning_rate": 2.7707993474714523e-05, "loss": 0.2697, "num_input_tokens_seen": 14700032, "step": 6795 }, { "epoch": 1.1092985318107667, "grad_norm": 0.6272677183151245, "learning_rate": 2.7728384991843394e-05, "loss": 0.2073, "num_input_tokens_seen": 14711008, "step": 6800 }, { "epoch": 1.1101141924959217, "grad_norm": 0.0422775074839592, "learning_rate": 2.7748776508972268e-05, "loss": 0.1044, "num_input_tokens_seen": 14720448, "step": 6805 }, { "epoch": 1.1109298531810767, "grad_norm": 0.1113593801856041, "learning_rate": 2.7769168026101145e-05, "loss": 0.1326, "num_input_tokens_seen": 14732128, "step": 6810 }, { "epoch": 1.1117455138662315, "grad_norm": 0.10868604481220245, "learning_rate": 2.7789559543230015e-05, "loss": 0.1223, "num_input_tokens_seen": 14743200, "step": 6815 }, { "epoch": 1.1125611745513866, "grad_norm": 1.9100013971328735, "learning_rate": 2.780995106035889e-05, "loss": 0.2658, "num_input_tokens_seen": 14754464, "step": 6820 }, { "epoch": 1.1133768352365416, "grad_norm": 0.8125177025794983, "learning_rate": 2.7830342577487766e-05, "loss": 0.278, "num_input_tokens_seen": 14763840, "step": 6825 }, { "epoch": 1.1141924959216967, "grad_norm": 0.40633073449134827, "learning_rate": 2.7850734094616644e-05, "loss": 0.1304, "num_input_tokens_seen": 14774304, "step": 6830 }, { "epoch": 1.1150081566068515, "grad_norm": 1.1955792903900146, "learning_rate": 2.7871125611745514e-05, "loss": 0.0778, "num_input_tokens_seen": 14783456, "step": 6835 }, { "epoch": 1.1158238172920065, "grad_norm": 0.049226462841033936, "learning_rate": 2.7891517128874388e-05, "loss": 0.0391, "num_input_tokens_seen": 14793664, "step": 6840 }, { "epoch": 1.1166394779771616, "grad_norm": 2.0709164142608643, "learning_rate": 2.7911908646003265e-05, "loss": 0.1723, "num_input_tokens_seen": 14804256, "step": 6845 }, { "epoch": 1.1174551386623164, "grad_norm": 0.42388400435447693, "learning_rate": 2.7932300163132136e-05, "loss": 0.2685, "num_input_tokens_seen": 14815744, "step": 6850 }, { "epoch": 1.1182707993474714, "grad_norm": 0.23291608691215515, "learning_rate": 2.7952691680261013e-05, "loss": 0.0465, "num_input_tokens_seen": 14826304, "step": 6855 }, { "epoch": 1.1190864600326265, "grad_norm": 0.8871657252311707, "learning_rate": 2.7973083197389887e-05, "loss": 0.2488, "num_input_tokens_seen": 14838016, "step": 6860 }, { "epoch": 1.1199021207177815, "grad_norm": 1.2391330003738403, "learning_rate": 2.7993474714518764e-05, "loss": 0.0879, "num_input_tokens_seen": 14849344, "step": 6865 }, { "epoch": 1.1207177814029363, "grad_norm": 0.7289042472839355, "learning_rate": 2.8013866231647634e-05, "loss": 0.2311, "num_input_tokens_seen": 14860608, "step": 6870 }, { "epoch": 1.1215334420880914, "grad_norm": 1.7424829006195068, "learning_rate": 2.803425774877651e-05, "loss": 0.1064, "num_input_tokens_seen": 14870976, "step": 6875 }, { "epoch": 1.1223491027732464, "grad_norm": 1.2605844736099243, "learning_rate": 2.8054649265905385e-05, "loss": 0.1233, "num_input_tokens_seen": 14880960, "step": 6880 }, { "epoch": 1.1231647634584012, "grad_norm": 0.036656107753515244, "learning_rate": 2.8075040783034256e-05, "loss": 0.157, "num_input_tokens_seen": 14892256, "step": 6885 }, { "epoch": 1.1239804241435563, "grad_norm": 1.9853692054748535, "learning_rate": 2.8095432300163133e-05, "loss": 0.3496, "num_input_tokens_seen": 14902304, "step": 6890 }, { "epoch": 1.1247960848287113, "grad_norm": 0.17855894565582275, "learning_rate": 2.811582381729201e-05, "loss": 0.1411, "num_input_tokens_seen": 14913536, "step": 6895 }, { "epoch": 1.1256117455138663, "grad_norm": 1.2110531330108643, "learning_rate": 2.8136215334420884e-05, "loss": 0.1931, "num_input_tokens_seen": 14924480, "step": 6900 }, { "epoch": 1.1264274061990212, "grad_norm": 0.8528078198432922, "learning_rate": 2.8156606851549754e-05, "loss": 0.1288, "num_input_tokens_seen": 14935264, "step": 6905 }, { "epoch": 1.1272430668841762, "grad_norm": 1.6221421957015991, "learning_rate": 2.817699836867863e-05, "loss": 0.128, "num_input_tokens_seen": 14947072, "step": 6910 }, { "epoch": 1.1280587275693312, "grad_norm": 0.5562580823898315, "learning_rate": 2.8197389885807505e-05, "loss": 0.2, "num_input_tokens_seen": 14957888, "step": 6915 }, { "epoch": 1.128874388254486, "grad_norm": 1.3447675704956055, "learning_rate": 2.8217781402936376e-05, "loss": 0.2254, "num_input_tokens_seen": 14967424, "step": 6920 }, { "epoch": 1.129690048939641, "grad_norm": 2.2802469730377197, "learning_rate": 2.8238172920065253e-05, "loss": 0.1347, "num_input_tokens_seen": 14977536, "step": 6925 }, { "epoch": 1.1305057096247961, "grad_norm": 0.238326296210289, "learning_rate": 2.825856443719413e-05, "loss": 0.0432, "num_input_tokens_seen": 14987008, "step": 6930 }, { "epoch": 1.131321370309951, "grad_norm": 0.20491929352283478, "learning_rate": 2.8278955954323004e-05, "loss": 0.3715, "num_input_tokens_seen": 14998272, "step": 6935 }, { "epoch": 1.132137030995106, "grad_norm": 1.9756356477737427, "learning_rate": 2.8299347471451874e-05, "loss": 0.2898, "num_input_tokens_seen": 15007392, "step": 6940 }, { "epoch": 1.132952691680261, "grad_norm": 2.1323964595794678, "learning_rate": 2.831973898858075e-05, "loss": 0.3462, "num_input_tokens_seen": 15016608, "step": 6945 }, { "epoch": 1.133768352365416, "grad_norm": 1.4048857688903809, "learning_rate": 2.834013050570963e-05, "loss": 0.2041, "num_input_tokens_seen": 15027712, "step": 6950 }, { "epoch": 1.1345840130505709, "grad_norm": 0.7563011646270752, "learning_rate": 2.83605220228385e-05, "loss": 0.1321, "num_input_tokens_seen": 15038080, "step": 6955 }, { "epoch": 1.135399673735726, "grad_norm": 0.3050967752933502, "learning_rate": 2.8380913539967373e-05, "loss": 0.1889, "num_input_tokens_seen": 15050432, "step": 6960 }, { "epoch": 1.136215334420881, "grad_norm": 0.3302226662635803, "learning_rate": 2.840130505709625e-05, "loss": 0.1965, "num_input_tokens_seen": 15060992, "step": 6965 }, { "epoch": 1.137030995106036, "grad_norm": 0.9965848922729492, "learning_rate": 2.8421696574225128e-05, "loss": 0.1258, "num_input_tokens_seen": 15070816, "step": 6970 }, { "epoch": 1.1378466557911908, "grad_norm": 0.4260268807411194, "learning_rate": 2.8442088091353998e-05, "loss": 0.1379, "num_input_tokens_seen": 15080128, "step": 6975 }, { "epoch": 1.1386623164763459, "grad_norm": 0.25361913442611694, "learning_rate": 2.8462479608482872e-05, "loss": 0.1253, "num_input_tokens_seen": 15090464, "step": 6980 }, { "epoch": 1.139477977161501, "grad_norm": 1.076406478881836, "learning_rate": 2.848287112561175e-05, "loss": 0.3029, "num_input_tokens_seen": 15101472, "step": 6985 }, { "epoch": 1.1402936378466557, "grad_norm": 1.3645461797714233, "learning_rate": 2.850326264274062e-05, "loss": 0.1498, "num_input_tokens_seen": 15111616, "step": 6990 }, { "epoch": 1.1411092985318108, "grad_norm": 1.7661687135696411, "learning_rate": 2.8523654159869497e-05, "loss": 0.2919, "num_input_tokens_seen": 15122496, "step": 6995 }, { "epoch": 1.1419249592169658, "grad_norm": 0.1841251701116562, "learning_rate": 2.854404567699837e-05, "loss": 0.1428, "num_input_tokens_seen": 15134112, "step": 7000 }, { "epoch": 1.1427406199021206, "grad_norm": 1.3237230777740479, "learning_rate": 2.8564437194127248e-05, "loss": 0.1219, "num_input_tokens_seen": 15144352, "step": 7005 }, { "epoch": 1.1435562805872757, "grad_norm": 1.1808927059173584, "learning_rate": 2.8584828711256118e-05, "loss": 0.1678, "num_input_tokens_seen": 15154304, "step": 7010 }, { "epoch": 1.1443719412724307, "grad_norm": 0.7123340964317322, "learning_rate": 2.8605220228384992e-05, "loss": 0.1145, "num_input_tokens_seen": 15165024, "step": 7015 }, { "epoch": 1.1451876019575857, "grad_norm": 1.204732060432434, "learning_rate": 2.862561174551387e-05, "loss": 0.182, "num_input_tokens_seen": 15176224, "step": 7020 }, { "epoch": 1.1460032626427405, "grad_norm": 1.7972949743270874, "learning_rate": 2.864600326264274e-05, "loss": 0.1117, "num_input_tokens_seen": 15186464, "step": 7025 }, { "epoch": 1.1468189233278956, "grad_norm": 1.1805990934371948, "learning_rate": 2.8666394779771617e-05, "loss": 0.1871, "num_input_tokens_seen": 15197408, "step": 7030 }, { "epoch": 1.1476345840130506, "grad_norm": 1.1515229940414429, "learning_rate": 2.868678629690049e-05, "loss": 0.14, "num_input_tokens_seen": 15209504, "step": 7035 }, { "epoch": 1.1484502446982057, "grad_norm": 1.6585588455200195, "learning_rate": 2.8707177814029368e-05, "loss": 0.2594, "num_input_tokens_seen": 15220576, "step": 7040 }, { "epoch": 1.1492659053833605, "grad_norm": 0.7679675817489624, "learning_rate": 2.872756933115824e-05, "loss": 0.1071, "num_input_tokens_seen": 15230752, "step": 7045 }, { "epoch": 1.1500815660685155, "grad_norm": 1.5913883447647095, "learning_rate": 2.8747960848287116e-05, "loss": 0.0895, "num_input_tokens_seen": 15241312, "step": 7050 }, { "epoch": 1.1508972267536706, "grad_norm": 1.1476140022277832, "learning_rate": 2.876835236541599e-05, "loss": 0.15, "num_input_tokens_seen": 15252512, "step": 7055 }, { "epoch": 1.1517128874388254, "grad_norm": 1.0307979583740234, "learning_rate": 2.878874388254486e-05, "loss": 0.0928, "num_input_tokens_seen": 15262400, "step": 7060 }, { "epoch": 1.1525285481239804, "grad_norm": 1.5060549974441528, "learning_rate": 2.8809135399673737e-05, "loss": 0.25, "num_input_tokens_seen": 15274816, "step": 7065 }, { "epoch": 1.1533442088091355, "grad_norm": 0.08758176863193512, "learning_rate": 2.8829526916802614e-05, "loss": 0.1156, "num_input_tokens_seen": 15285248, "step": 7070 }, { "epoch": 1.1541598694942903, "grad_norm": 2.070138931274414, "learning_rate": 2.8849918433931485e-05, "loss": 0.0899, "num_input_tokens_seen": 15295552, "step": 7075 }, { "epoch": 1.1549755301794453, "grad_norm": 0.7197743654251099, "learning_rate": 2.887030995106036e-05, "loss": 0.1075, "num_input_tokens_seen": 15305248, "step": 7080 }, { "epoch": 1.1557911908646004, "grad_norm": 0.20354434847831726, "learning_rate": 2.8890701468189236e-05, "loss": 0.0531, "num_input_tokens_seen": 15316576, "step": 7085 }, { "epoch": 1.1566068515497552, "grad_norm": 0.06521645933389664, "learning_rate": 2.891109298531811e-05, "loss": 0.1562, "num_input_tokens_seen": 15328256, "step": 7090 }, { "epoch": 1.1574225122349102, "grad_norm": 0.8838436603546143, "learning_rate": 2.893148450244698e-05, "loss": 0.2483, "num_input_tokens_seen": 15338752, "step": 7095 }, { "epoch": 1.1582381729200653, "grad_norm": 0.2383032739162445, "learning_rate": 2.8951876019575857e-05, "loss": 0.0581, "num_input_tokens_seen": 15348224, "step": 7100 }, { "epoch": 1.1590538336052203, "grad_norm": 0.4287551939487457, "learning_rate": 2.8972267536704734e-05, "loss": 0.1006, "num_input_tokens_seen": 15359008, "step": 7105 }, { "epoch": 1.1598694942903751, "grad_norm": 0.2017919421195984, "learning_rate": 2.8992659053833605e-05, "loss": 0.0543, "num_input_tokens_seen": 15369408, "step": 7110 }, { "epoch": 1.1606851549755302, "grad_norm": 0.4012167155742645, "learning_rate": 2.901305057096248e-05, "loss": 0.021, "num_input_tokens_seen": 15379392, "step": 7115 }, { "epoch": 1.1615008156606852, "grad_norm": 0.7069433927536011, "learning_rate": 2.9033442088091356e-05, "loss": 0.1629, "num_input_tokens_seen": 15389920, "step": 7120 }, { "epoch": 1.1623164763458402, "grad_norm": 0.3904680013656616, "learning_rate": 2.9053833605220233e-05, "loss": 0.1498, "num_input_tokens_seen": 15400896, "step": 7125 }, { "epoch": 1.163132137030995, "grad_norm": 0.6737809181213379, "learning_rate": 2.9074225122349103e-05, "loss": 0.0806, "num_input_tokens_seen": 15412320, "step": 7130 }, { "epoch": 1.16394779771615, "grad_norm": 0.08426140993833542, "learning_rate": 2.9094616639477977e-05, "loss": 0.1136, "num_input_tokens_seen": 15423680, "step": 7135 }, { "epoch": 1.1647634584013051, "grad_norm": 0.32010701298713684, "learning_rate": 2.9115008156606854e-05, "loss": 0.1418, "num_input_tokens_seen": 15433952, "step": 7140 }, { "epoch": 1.16557911908646, "grad_norm": 0.8833522200584412, "learning_rate": 2.9135399673735725e-05, "loss": 0.1985, "num_input_tokens_seen": 15444896, "step": 7145 }, { "epoch": 1.166394779771615, "grad_norm": 0.255380779504776, "learning_rate": 2.9155791190864602e-05, "loss": 0.1035, "num_input_tokens_seen": 15456192, "step": 7150 }, { "epoch": 1.16721044045677, "grad_norm": 2.1248409748077393, "learning_rate": 2.9176182707993476e-05, "loss": 0.2697, "num_input_tokens_seen": 15465312, "step": 7155 }, { "epoch": 1.1680261011419248, "grad_norm": 0.34027308225631714, "learning_rate": 2.9196574225122353e-05, "loss": 0.1202, "num_input_tokens_seen": 15474496, "step": 7160 }, { "epoch": 1.1688417618270799, "grad_norm": 1.8906432390213013, "learning_rate": 2.9216965742251224e-05, "loss": 0.1601, "num_input_tokens_seen": 15484992, "step": 7165 }, { "epoch": 1.169657422512235, "grad_norm": 0.331583708524704, "learning_rate": 2.92373572593801e-05, "loss": 0.0853, "num_input_tokens_seen": 15495232, "step": 7170 }, { "epoch": 1.17047308319739, "grad_norm": 0.24566514790058136, "learning_rate": 2.9257748776508975e-05, "loss": 0.144, "num_input_tokens_seen": 15506304, "step": 7175 }, { "epoch": 1.1712887438825448, "grad_norm": 0.47540733218193054, "learning_rate": 2.9278140293637845e-05, "loss": 0.2057, "num_input_tokens_seen": 15517248, "step": 7180 }, { "epoch": 1.1721044045676998, "grad_norm": 0.8313619494438171, "learning_rate": 2.9298531810766722e-05, "loss": 0.0776, "num_input_tokens_seen": 15527040, "step": 7185 }, { "epoch": 1.1729200652528549, "grad_norm": 1.051429033279419, "learning_rate": 2.9318923327895596e-05, "loss": 0.2276, "num_input_tokens_seen": 15538368, "step": 7190 }, { "epoch": 1.17373572593801, "grad_norm": 0.35272401571273804, "learning_rate": 2.9339314845024473e-05, "loss": 0.0523, "num_input_tokens_seen": 15548608, "step": 7195 }, { "epoch": 1.1745513866231647, "grad_norm": 0.33632194995880127, "learning_rate": 2.9359706362153344e-05, "loss": 0.1804, "num_input_tokens_seen": 15560032, "step": 7200 }, { "epoch": 1.1753670473083198, "grad_norm": 0.834802508354187, "learning_rate": 2.938009787928222e-05, "loss": 0.1683, "num_input_tokens_seen": 15571616, "step": 7205 }, { "epoch": 1.1761827079934748, "grad_norm": 0.26060569286346436, "learning_rate": 2.9400489396411095e-05, "loss": 0.122, "num_input_tokens_seen": 15583392, "step": 7210 }, { "epoch": 1.1769983686786296, "grad_norm": 0.4072590470314026, "learning_rate": 2.9420880913539965e-05, "loss": 0.2684, "num_input_tokens_seen": 15593504, "step": 7215 }, { "epoch": 1.1778140293637847, "grad_norm": 0.6428176164627075, "learning_rate": 2.9441272430668842e-05, "loss": 0.0911, "num_input_tokens_seen": 15604928, "step": 7220 }, { "epoch": 1.1786296900489397, "grad_norm": 1.5707155466079712, "learning_rate": 2.946166394779772e-05, "loss": 0.161, "num_input_tokens_seen": 15615200, "step": 7225 }, { "epoch": 1.1794453507340945, "grad_norm": 0.7110115885734558, "learning_rate": 2.9482055464926593e-05, "loss": 0.0704, "num_input_tokens_seen": 15626176, "step": 7230 }, { "epoch": 1.1802610114192496, "grad_norm": 0.6847907900810242, "learning_rate": 2.9502446982055464e-05, "loss": 0.114, "num_input_tokens_seen": 15636768, "step": 7235 }, { "epoch": 1.1810766721044046, "grad_norm": 1.101696491241455, "learning_rate": 2.952283849918434e-05, "loss": 0.2116, "num_input_tokens_seen": 15648512, "step": 7240 }, { "epoch": 1.1818923327895596, "grad_norm": 0.8009437918663025, "learning_rate": 2.954323001631322e-05, "loss": 0.0756, "num_input_tokens_seen": 15659328, "step": 7245 }, { "epoch": 1.1827079934747144, "grad_norm": 0.4317280352115631, "learning_rate": 2.956362153344209e-05, "loss": 0.0584, "num_input_tokens_seen": 15669920, "step": 7250 }, { "epoch": 1.1835236541598695, "grad_norm": 0.2741183638572693, "learning_rate": 2.9584013050570963e-05, "loss": 0.2639, "num_input_tokens_seen": 15681152, "step": 7255 }, { "epoch": 1.1843393148450245, "grad_norm": 1.6365232467651367, "learning_rate": 2.960440456769984e-05, "loss": 0.1199, "num_input_tokens_seen": 15690912, "step": 7260 }, { "epoch": 1.1851549755301796, "grad_norm": 1.5930060148239136, "learning_rate": 2.9624796084828717e-05, "loss": 0.1845, "num_input_tokens_seen": 15701216, "step": 7265 }, { "epoch": 1.1859706362153344, "grad_norm": 0.9978070259094238, "learning_rate": 2.9645187601957587e-05, "loss": 0.1654, "num_input_tokens_seen": 15712000, "step": 7270 }, { "epoch": 1.1867862969004894, "grad_norm": 1.4299677610397339, "learning_rate": 2.966557911908646e-05, "loss": 0.2702, "num_input_tokens_seen": 15722848, "step": 7275 }, { "epoch": 1.1876019575856445, "grad_norm": 0.377249538898468, "learning_rate": 2.968597063621534e-05, "loss": 0.2084, "num_input_tokens_seen": 15734816, "step": 7280 }, { "epoch": 1.1884176182707993, "grad_norm": 0.8580963611602783, "learning_rate": 2.970636215334421e-05, "loss": 0.3254, "num_input_tokens_seen": 15745664, "step": 7285 }, { "epoch": 1.1892332789559543, "grad_norm": 2.7409276962280273, "learning_rate": 2.9726753670473083e-05, "loss": 0.1948, "num_input_tokens_seen": 15756896, "step": 7290 }, { "epoch": 1.1900489396411094, "grad_norm": 0.487038254737854, "learning_rate": 2.974714518760196e-05, "loss": 0.1624, "num_input_tokens_seen": 15766656, "step": 7295 }, { "epoch": 1.1908646003262642, "grad_norm": 1.8791967630386353, "learning_rate": 2.976753670473083e-05, "loss": 0.2791, "num_input_tokens_seen": 15777792, "step": 7300 }, { "epoch": 1.1916802610114192, "grad_norm": 0.6685689091682434, "learning_rate": 2.9787928221859708e-05, "loss": 0.2444, "num_input_tokens_seen": 15787968, "step": 7305 }, { "epoch": 1.1924959216965743, "grad_norm": 1.1044747829437256, "learning_rate": 2.980831973898858e-05, "loss": 0.2765, "num_input_tokens_seen": 15797920, "step": 7310 }, { "epoch": 1.1933115823817293, "grad_norm": 0.6103065013885498, "learning_rate": 2.982871125611746e-05, "loss": 0.1839, "num_input_tokens_seen": 15809376, "step": 7315 }, { "epoch": 1.1941272430668841, "grad_norm": 3.1740643978118896, "learning_rate": 2.984910277324633e-05, "loss": 0.1623, "num_input_tokens_seen": 15821408, "step": 7320 }, { "epoch": 1.1949429037520392, "grad_norm": 0.24268551170825958, "learning_rate": 2.9869494290375206e-05, "loss": 0.1232, "num_input_tokens_seen": 15830816, "step": 7325 }, { "epoch": 1.1957585644371942, "grad_norm": 0.5286183953285217, "learning_rate": 2.988988580750408e-05, "loss": 0.0659, "num_input_tokens_seen": 15841792, "step": 7330 }, { "epoch": 1.196574225122349, "grad_norm": 0.24089014530181885, "learning_rate": 2.991027732463295e-05, "loss": 0.3298, "num_input_tokens_seen": 15852768, "step": 7335 }, { "epoch": 1.197389885807504, "grad_norm": 0.38423898816108704, "learning_rate": 2.9930668841761828e-05, "loss": 0.0972, "num_input_tokens_seen": 15863680, "step": 7340 }, { "epoch": 1.198205546492659, "grad_norm": 0.7160916924476624, "learning_rate": 2.9951060358890705e-05, "loss": 0.1406, "num_input_tokens_seen": 15874976, "step": 7345 }, { "epoch": 1.1990212071778141, "grad_norm": 0.19556842744350433, "learning_rate": 2.997145187601958e-05, "loss": 0.0543, "num_input_tokens_seen": 15885408, "step": 7350 }, { "epoch": 1.199836867862969, "grad_norm": 1.4999905824661255, "learning_rate": 2.999184339314845e-05, "loss": 0.1755, "num_input_tokens_seen": 15895168, "step": 7355 }, { "epoch": 1.200652528548124, "grad_norm": 1.1424049139022827, "learning_rate": 3.0012234910277326e-05, "loss": 0.1409, "num_input_tokens_seen": 15906240, "step": 7360 }, { "epoch": 1.201468189233279, "grad_norm": 2.0464696884155273, "learning_rate": 3.00326264274062e-05, "loss": 0.1658, "num_input_tokens_seen": 15917024, "step": 7365 }, { "epoch": 1.2022838499184338, "grad_norm": 2.6473259925842285, "learning_rate": 3.005301794453507e-05, "loss": 0.3089, "num_input_tokens_seen": 15926496, "step": 7370 }, { "epoch": 1.2030995106035889, "grad_norm": 1.0588749647140503, "learning_rate": 3.0073409461663948e-05, "loss": 0.0961, "num_input_tokens_seen": 15937600, "step": 7375 }, { "epoch": 1.203915171288744, "grad_norm": 1.455744981765747, "learning_rate": 3.0093800978792825e-05, "loss": 0.0868, "num_input_tokens_seen": 15948064, "step": 7380 }, { "epoch": 1.2047308319738987, "grad_norm": 0.12223126739263535, "learning_rate": 3.01141924959217e-05, "loss": 0.1041, "num_input_tokens_seen": 15958752, "step": 7385 }, { "epoch": 1.2055464926590538, "grad_norm": 2.0572476387023926, "learning_rate": 3.013458401305057e-05, "loss": 0.2759, "num_input_tokens_seen": 15970080, "step": 7390 }, { "epoch": 1.2063621533442088, "grad_norm": 0.3673534095287323, "learning_rate": 3.0154975530179447e-05, "loss": 0.1128, "num_input_tokens_seen": 15982080, "step": 7395 }, { "epoch": 1.2071778140293639, "grad_norm": 0.4116165339946747, "learning_rate": 3.0175367047308324e-05, "loss": 0.0602, "num_input_tokens_seen": 15993056, "step": 7400 }, { "epoch": 1.2079934747145187, "grad_norm": 1.7334603071212769, "learning_rate": 3.0195758564437194e-05, "loss": 0.0935, "num_input_tokens_seen": 16003584, "step": 7405 }, { "epoch": 1.2088091353996737, "grad_norm": 1.7155135869979858, "learning_rate": 3.0216150081566068e-05, "loss": 0.177, "num_input_tokens_seen": 16014912, "step": 7410 }, { "epoch": 1.2096247960848288, "grad_norm": 0.49742481112480164, "learning_rate": 3.0236541598694945e-05, "loss": 0.1224, "num_input_tokens_seen": 16026304, "step": 7415 }, { "epoch": 1.2104404567699838, "grad_norm": 0.5793755054473877, "learning_rate": 3.0256933115823822e-05, "loss": 0.1081, "num_input_tokens_seen": 16038464, "step": 7420 }, { "epoch": 1.2112561174551386, "grad_norm": 1.937647819519043, "learning_rate": 3.0277324632952693e-05, "loss": 0.1348, "num_input_tokens_seen": 16049728, "step": 7425 }, { "epoch": 1.2120717781402937, "grad_norm": 2.218933582305908, "learning_rate": 3.0297716150081567e-05, "loss": 0.4383, "num_input_tokens_seen": 16060032, "step": 7430 }, { "epoch": 1.2128874388254487, "grad_norm": 2.6830461025238037, "learning_rate": 3.0318107667210444e-05, "loss": 0.1073, "num_input_tokens_seen": 16072064, "step": 7435 }, { "epoch": 1.2137030995106035, "grad_norm": 1.2715826034545898, "learning_rate": 3.0338499184339314e-05, "loss": 0.1609, "num_input_tokens_seen": 16082112, "step": 7440 }, { "epoch": 1.2145187601957586, "grad_norm": 0.32597097754478455, "learning_rate": 3.035889070146819e-05, "loss": 0.2151, "num_input_tokens_seen": 16093280, "step": 7445 }, { "epoch": 1.2153344208809136, "grad_norm": 1.9027576446533203, "learning_rate": 3.0379282218597065e-05, "loss": 0.2871, "num_input_tokens_seen": 16104352, "step": 7450 }, { "epoch": 1.2161500815660684, "grad_norm": 0.5636919140815735, "learning_rate": 3.0399673735725943e-05, "loss": 0.0849, "num_input_tokens_seen": 16114784, "step": 7455 }, { "epoch": 1.2169657422512234, "grad_norm": 0.5795993804931641, "learning_rate": 3.0420065252854813e-05, "loss": 0.1167, "num_input_tokens_seen": 16126080, "step": 7460 }, { "epoch": 1.2177814029363785, "grad_norm": 1.1142092943191528, "learning_rate": 3.0440456769983687e-05, "loss": 0.2593, "num_input_tokens_seen": 16136480, "step": 7465 }, { "epoch": 1.2185970636215335, "grad_norm": 2.1701393127441406, "learning_rate": 3.0460848287112564e-05, "loss": 0.3473, "num_input_tokens_seen": 16147424, "step": 7470 }, { "epoch": 1.2194127243066883, "grad_norm": 0.47016122937202454, "learning_rate": 3.0481239804241434e-05, "loss": 0.2303, "num_input_tokens_seen": 16157280, "step": 7475 }, { "epoch": 1.2202283849918434, "grad_norm": 0.9304462671279907, "learning_rate": 3.0501631321370312e-05, "loss": 0.1307, "num_input_tokens_seen": 16168320, "step": 7480 }, { "epoch": 1.2210440456769984, "grad_norm": 0.16175755858421326, "learning_rate": 3.052202283849919e-05, "loss": 0.0868, "num_input_tokens_seen": 16178976, "step": 7485 }, { "epoch": 1.2218597063621535, "grad_norm": 0.06778473407030106, "learning_rate": 3.054241435562806e-05, "loss": 0.133, "num_input_tokens_seen": 16190208, "step": 7490 }, { "epoch": 1.2226753670473083, "grad_norm": 0.5435280799865723, "learning_rate": 3.0562805872756937e-05, "loss": 0.2155, "num_input_tokens_seen": 16199488, "step": 7495 }, { "epoch": 1.2234910277324633, "grad_norm": 0.30480268597602844, "learning_rate": 3.058319738988581e-05, "loss": 0.2149, "num_input_tokens_seen": 16210432, "step": 7500 }, { "epoch": 1.2243066884176184, "grad_norm": 1.3688687086105347, "learning_rate": 3.0603588907014684e-05, "loss": 0.1041, "num_input_tokens_seen": 16221664, "step": 7505 }, { "epoch": 1.2251223491027732, "grad_norm": 0.11699343472719193, "learning_rate": 3.062398042414356e-05, "loss": 0.2082, "num_input_tokens_seen": 16232064, "step": 7510 }, { "epoch": 1.2259380097879282, "grad_norm": 0.11201776564121246, "learning_rate": 3.064437194127243e-05, "loss": 0.1453, "num_input_tokens_seen": 16242976, "step": 7515 }, { "epoch": 1.2267536704730833, "grad_norm": 0.11423972994089127, "learning_rate": 3.0664763458401306e-05, "loss": 0.2385, "num_input_tokens_seen": 16253440, "step": 7520 }, { "epoch": 1.227569331158238, "grad_norm": 0.6397272944450378, "learning_rate": 3.068515497553018e-05, "loss": 0.1287, "num_input_tokens_seen": 16263680, "step": 7525 }, { "epoch": 1.2283849918433931, "grad_norm": 0.15702597796916962, "learning_rate": 3.070554649265905e-05, "loss": 0.2791, "num_input_tokens_seen": 16274944, "step": 7530 }, { "epoch": 1.2292006525285482, "grad_norm": 0.5898988842964172, "learning_rate": 3.072593800978793e-05, "loss": 0.0849, "num_input_tokens_seen": 16285376, "step": 7535 }, { "epoch": 1.2300163132137032, "grad_norm": 1.3647980690002441, "learning_rate": 3.074632952691681e-05, "loss": 0.1234, "num_input_tokens_seen": 16296544, "step": 7540 }, { "epoch": 1.230831973898858, "grad_norm": 0.39639848470687866, "learning_rate": 3.0766721044045675e-05, "loss": 0.22, "num_input_tokens_seen": 16306752, "step": 7545 }, { "epoch": 1.231647634584013, "grad_norm": 1.6631537675857544, "learning_rate": 3.0787112561174555e-05, "loss": 0.1648, "num_input_tokens_seen": 16317536, "step": 7550 }, { "epoch": 1.232463295269168, "grad_norm": 2.0583062171936035, "learning_rate": 3.080750407830343e-05, "loss": 0.1614, "num_input_tokens_seen": 16328640, "step": 7555 }, { "epoch": 1.233278955954323, "grad_norm": 1.600994348526001, "learning_rate": 3.0827895595432296e-05, "loss": 0.2178, "num_input_tokens_seen": 16340032, "step": 7560 }, { "epoch": 1.234094616639478, "grad_norm": 1.458606481552124, "learning_rate": 3.084828711256118e-05, "loss": 0.2438, "num_input_tokens_seen": 16351328, "step": 7565 }, { "epoch": 1.234910277324633, "grad_norm": 0.7670322060585022, "learning_rate": 3.086867862969005e-05, "loss": 0.1329, "num_input_tokens_seen": 16360832, "step": 7570 }, { "epoch": 1.235725938009788, "grad_norm": 0.18479403853416443, "learning_rate": 3.0889070146818925e-05, "loss": 0.0486, "num_input_tokens_seen": 16372640, "step": 7575 }, { "epoch": 1.2365415986949428, "grad_norm": 0.3187597393989563, "learning_rate": 3.09094616639478e-05, "loss": 0.0807, "num_input_tokens_seen": 16383360, "step": 7580 }, { "epoch": 1.2373572593800979, "grad_norm": 0.8869293928146362, "learning_rate": 3.092985318107667e-05, "loss": 0.181, "num_input_tokens_seen": 16393728, "step": 7585 }, { "epoch": 1.238172920065253, "grad_norm": 2.2427151203155518, "learning_rate": 3.095024469820555e-05, "loss": 0.2877, "num_input_tokens_seen": 16405440, "step": 7590 }, { "epoch": 1.2389885807504077, "grad_norm": 1.400766372680664, "learning_rate": 3.097063621533442e-05, "loss": 0.256, "num_input_tokens_seen": 16416672, "step": 7595 }, { "epoch": 1.2398042414355628, "grad_norm": 1.1911576986312866, "learning_rate": 3.0991027732463294e-05, "loss": 0.2328, "num_input_tokens_seen": 16427136, "step": 7600 }, { "epoch": 1.2406199021207178, "grad_norm": 0.5163933634757996, "learning_rate": 3.1011419249592174e-05, "loss": 0.1207, "num_input_tokens_seen": 16437984, "step": 7605 }, { "epoch": 1.2414355628058726, "grad_norm": 0.35601142048835754, "learning_rate": 3.103181076672105e-05, "loss": 0.1717, "num_input_tokens_seen": 16448640, "step": 7610 }, { "epoch": 1.2422512234910277, "grad_norm": 1.2479281425476074, "learning_rate": 3.1052202283849915e-05, "loss": 0.2609, "num_input_tokens_seen": 16457856, "step": 7615 }, { "epoch": 1.2430668841761827, "grad_norm": 0.08827143162488937, "learning_rate": 3.1072593800978796e-05, "loss": 0.0773, "num_input_tokens_seen": 16469056, "step": 7620 }, { "epoch": 1.2438825448613378, "grad_norm": 0.2106606364250183, "learning_rate": 3.109298531810767e-05, "loss": 0.1358, "num_input_tokens_seen": 16479616, "step": 7625 }, { "epoch": 1.2446982055464926, "grad_norm": 0.749669075012207, "learning_rate": 3.111337683523654e-05, "loss": 0.0806, "num_input_tokens_seen": 16489952, "step": 7630 }, { "epoch": 1.2455138662316476, "grad_norm": 0.3544733226299286, "learning_rate": 3.113376835236542e-05, "loss": 0.1154, "num_input_tokens_seen": 16501152, "step": 7635 }, { "epoch": 1.2463295269168027, "grad_norm": 0.4773344397544861, "learning_rate": 3.115415986949429e-05, "loss": 0.0426, "num_input_tokens_seen": 16512608, "step": 7640 }, { "epoch": 1.2471451876019577, "grad_norm": 1.6789281368255615, "learning_rate": 3.117455138662317e-05, "loss": 0.1749, "num_input_tokens_seen": 16524064, "step": 7645 }, { "epoch": 1.2479608482871125, "grad_norm": 1.1747148036956787, "learning_rate": 3.119494290375204e-05, "loss": 0.1051, "num_input_tokens_seen": 16534688, "step": 7650 }, { "epoch": 1.2487765089722676, "grad_norm": 1.8327245712280273, "learning_rate": 3.121533442088091e-05, "loss": 0.1476, "num_input_tokens_seen": 16544928, "step": 7655 }, { "epoch": 1.2495921696574226, "grad_norm": 1.502790093421936, "learning_rate": 3.123572593800979e-05, "loss": 0.2597, "num_input_tokens_seen": 16555200, "step": 7660 }, { "epoch": 1.2504078303425774, "grad_norm": 1.881581425666809, "learning_rate": 3.125611745513866e-05, "loss": 0.2831, "num_input_tokens_seen": 16566304, "step": 7665 }, { "epoch": 1.2512234910277324, "grad_norm": 1.2956181764602661, "learning_rate": 3.127650897226754e-05, "loss": 0.2441, "num_input_tokens_seen": 16578560, "step": 7670 }, { "epoch": 1.2520391517128875, "grad_norm": 0.31293419003486633, "learning_rate": 3.1296900489396415e-05, "loss": 0.0902, "num_input_tokens_seen": 16588704, "step": 7675 }, { "epoch": 1.2528548123980423, "grad_norm": 0.9572903513908386, "learning_rate": 3.131729200652529e-05, "loss": 0.0462, "num_input_tokens_seen": 16599776, "step": 7680 }, { "epoch": 1.2536704730831973, "grad_norm": 0.3089185655117035, "learning_rate": 3.133768352365416e-05, "loss": 0.1028, "num_input_tokens_seen": 16609824, "step": 7685 }, { "epoch": 1.2544861337683524, "grad_norm": 2.016958475112915, "learning_rate": 3.1358075040783036e-05, "loss": 0.3465, "num_input_tokens_seen": 16621280, "step": 7690 }, { "epoch": 1.2553017944535072, "grad_norm": 0.49985840916633606, "learning_rate": 3.137846655791191e-05, "loss": 0.1472, "num_input_tokens_seen": 16631776, "step": 7695 }, { "epoch": 1.2561174551386622, "grad_norm": 2.227165937423706, "learning_rate": 3.1398858075040784e-05, "loss": 0.1313, "num_input_tokens_seen": 16643584, "step": 7700 }, { "epoch": 1.2569331158238173, "grad_norm": 0.08662078529596329, "learning_rate": 3.141924959216966e-05, "loss": 0.0865, "num_input_tokens_seen": 16654432, "step": 7705 }, { "epoch": 1.2577487765089723, "grad_norm": 0.13557977974414825, "learning_rate": 3.143964110929853e-05, "loss": 0.1257, "num_input_tokens_seen": 16664064, "step": 7710 }, { "epoch": 1.2585644371941274, "grad_norm": 0.5007374286651611, "learning_rate": 3.146003262642741e-05, "loss": 0.1473, "num_input_tokens_seen": 16674432, "step": 7715 }, { "epoch": 1.2593800978792822, "grad_norm": 1.6796844005584717, "learning_rate": 3.148042414355628e-05, "loss": 0.1311, "num_input_tokens_seen": 16685920, "step": 7720 }, { "epoch": 1.2601957585644372, "grad_norm": 0.7040912508964539, "learning_rate": 3.150081566068516e-05, "loss": 0.0949, "num_input_tokens_seen": 16696320, "step": 7725 }, { "epoch": 1.2610114192495923, "grad_norm": 0.27375131845474243, "learning_rate": 3.152120717781403e-05, "loss": 0.0962, "num_input_tokens_seen": 16707040, "step": 7730 }, { "epoch": 1.261827079934747, "grad_norm": 1.877291202545166, "learning_rate": 3.15415986949429e-05, "loss": 0.3615, "num_input_tokens_seen": 16718208, "step": 7735 }, { "epoch": 1.2626427406199021, "grad_norm": 1.3889222145080566, "learning_rate": 3.156199021207178e-05, "loss": 0.0953, "num_input_tokens_seen": 16728480, "step": 7740 }, { "epoch": 1.2634584013050572, "grad_norm": 0.27797967195510864, "learning_rate": 3.1582381729200655e-05, "loss": 0.139, "num_input_tokens_seen": 16740384, "step": 7745 }, { "epoch": 1.264274061990212, "grad_norm": 1.5987427234649658, "learning_rate": 3.160277324632953e-05, "loss": 0.2713, "num_input_tokens_seen": 16749888, "step": 7750 }, { "epoch": 1.265089722675367, "grad_norm": 1.6360993385314941, "learning_rate": 3.16231647634584e-05, "loss": 0.2781, "num_input_tokens_seen": 16761024, "step": 7755 }, { "epoch": 1.265905383360522, "grad_norm": 0.0920514389872551, "learning_rate": 3.1643556280587276e-05, "loss": 0.0471, "num_input_tokens_seen": 16771296, "step": 7760 }, { "epoch": 1.2667210440456769, "grad_norm": 0.9642395973205566, "learning_rate": 3.166394779771616e-05, "loss": 0.0873, "num_input_tokens_seen": 16781984, "step": 7765 }, { "epoch": 1.267536704730832, "grad_norm": 1.0591163635253906, "learning_rate": 3.1684339314845024e-05, "loss": 0.1532, "num_input_tokens_seen": 16793056, "step": 7770 }, { "epoch": 1.268352365415987, "grad_norm": 0.6488621234893799, "learning_rate": 3.17047308319739e-05, "loss": 0.1091, "num_input_tokens_seen": 16805568, "step": 7775 }, { "epoch": 1.269168026101142, "grad_norm": 0.30986106395721436, "learning_rate": 3.172512234910278e-05, "loss": 0.1217, "num_input_tokens_seen": 16815808, "step": 7780 }, { "epoch": 1.269983686786297, "grad_norm": 0.21548645198345184, "learning_rate": 3.1745513866231645e-05, "loss": 0.2145, "num_input_tokens_seen": 16826272, "step": 7785 }, { "epoch": 1.2707993474714518, "grad_norm": 2.241647958755493, "learning_rate": 3.176590538336052e-05, "loss": 0.2062, "num_input_tokens_seen": 16836576, "step": 7790 }, { "epoch": 1.2716150081566069, "grad_norm": 0.48969966173171997, "learning_rate": 3.17862969004894e-05, "loss": 0.0483, "num_input_tokens_seen": 16846720, "step": 7795 }, { "epoch": 1.272430668841762, "grad_norm": 1.084110975265503, "learning_rate": 3.1806688417618274e-05, "loss": 0.1851, "num_input_tokens_seen": 16856960, "step": 7800 }, { "epoch": 1.2732463295269167, "grad_norm": 0.5499045252799988, "learning_rate": 3.182707993474715e-05, "loss": 0.1738, "num_input_tokens_seen": 16868512, "step": 7805 }, { "epoch": 1.2740619902120718, "grad_norm": 0.21771444380283356, "learning_rate": 3.184747145187602e-05, "loss": 0.1321, "num_input_tokens_seen": 16879744, "step": 7810 }, { "epoch": 1.2748776508972268, "grad_norm": 0.9427559971809387, "learning_rate": 3.1867862969004895e-05, "loss": 0.1631, "num_input_tokens_seen": 16889440, "step": 7815 }, { "epoch": 1.2756933115823816, "grad_norm": 0.4599268138408661, "learning_rate": 3.188825448613377e-05, "loss": 0.1804, "num_input_tokens_seen": 16900768, "step": 7820 }, { "epoch": 1.2765089722675367, "grad_norm": 0.36112305521965027, "learning_rate": 3.190864600326264e-05, "loss": 0.1338, "num_input_tokens_seen": 16910240, "step": 7825 }, { "epoch": 1.2773246329526917, "grad_norm": 0.3238697648048401, "learning_rate": 3.1929037520391517e-05, "loss": 0.07, "num_input_tokens_seen": 16921600, "step": 7830 }, { "epoch": 1.2781402936378465, "grad_norm": 1.455399751663208, "learning_rate": 3.19494290375204e-05, "loss": 0.2254, "num_input_tokens_seen": 16933344, "step": 7835 }, { "epoch": 1.2789559543230016, "grad_norm": 1.5067270994186401, "learning_rate": 3.1969820554649264e-05, "loss": 0.2926, "num_input_tokens_seen": 16944416, "step": 7840 }, { "epoch": 1.2797716150081566, "grad_norm": 1.8554586172103882, "learning_rate": 3.1990212071778145e-05, "loss": 0.2593, "num_input_tokens_seen": 16954816, "step": 7845 }, { "epoch": 1.2805872756933117, "grad_norm": 0.8477010130882263, "learning_rate": 3.201060358890702e-05, "loss": 0.1079, "num_input_tokens_seen": 16964992, "step": 7850 }, { "epoch": 1.2814029363784667, "grad_norm": 0.13338704407215118, "learning_rate": 3.2030995106035886e-05, "loss": 0.0633, "num_input_tokens_seen": 16975200, "step": 7855 }, { "epoch": 1.2822185970636215, "grad_norm": 0.8327591419219971, "learning_rate": 3.2051386623164766e-05, "loss": 0.1089, "num_input_tokens_seen": 16985984, "step": 7860 }, { "epoch": 1.2830342577487766, "grad_norm": 0.19897039234638214, "learning_rate": 3.207177814029364e-05, "loss": 0.1532, "num_input_tokens_seen": 16997504, "step": 7865 }, { "epoch": 1.2838499184339316, "grad_norm": 0.03951404616236687, "learning_rate": 3.2092169657422514e-05, "loss": 0.0729, "num_input_tokens_seen": 17007296, "step": 7870 }, { "epoch": 1.2846655791190864, "grad_norm": 1.0904587507247925, "learning_rate": 3.211256117455139e-05, "loss": 0.1863, "num_input_tokens_seen": 17017792, "step": 7875 }, { "epoch": 1.2854812398042414, "grad_norm": 0.5251303911209106, "learning_rate": 3.213295269168026e-05, "loss": 0.1301, "num_input_tokens_seen": 17029472, "step": 7880 }, { "epoch": 1.2862969004893965, "grad_norm": 0.16585302352905273, "learning_rate": 3.2153344208809135e-05, "loss": 0.0669, "num_input_tokens_seen": 17039808, "step": 7885 }, { "epoch": 1.2871125611745513, "grad_norm": 0.32625213265419006, "learning_rate": 3.217373572593801e-05, "loss": 0.0996, "num_input_tokens_seen": 17050720, "step": 7890 }, { "epoch": 1.2879282218597063, "grad_norm": 0.8665242195129395, "learning_rate": 3.219412724306688e-05, "loss": 0.2183, "num_input_tokens_seen": 17061792, "step": 7895 }, { "epoch": 1.2887438825448614, "grad_norm": 0.9900668263435364, "learning_rate": 3.2214518760195764e-05, "loss": 0.1118, "num_input_tokens_seen": 17072512, "step": 7900 }, { "epoch": 1.2895595432300162, "grad_norm": 0.7027276158332825, "learning_rate": 3.223491027732464e-05, "loss": 0.1445, "num_input_tokens_seen": 17084288, "step": 7905 }, { "epoch": 1.2903752039151712, "grad_norm": 1.3030767440795898, "learning_rate": 3.2255301794453505e-05, "loss": 0.3313, "num_input_tokens_seen": 17095648, "step": 7910 }, { "epoch": 1.2911908646003263, "grad_norm": 0.7478718161582947, "learning_rate": 3.2275693311582385e-05, "loss": 0.1867, "num_input_tokens_seen": 17106976, "step": 7915 }, { "epoch": 1.2920065252854813, "grad_norm": 0.9552913308143616, "learning_rate": 3.229608482871126e-05, "loss": 0.1025, "num_input_tokens_seen": 17116640, "step": 7920 }, { "epoch": 1.2928221859706361, "grad_norm": 0.36810198426246643, "learning_rate": 3.231647634584013e-05, "loss": 0.2272, "num_input_tokens_seen": 17127584, "step": 7925 }, { "epoch": 1.2936378466557912, "grad_norm": 0.025508427992463112, "learning_rate": 3.2336867862969007e-05, "loss": 0.0725, "num_input_tokens_seen": 17137760, "step": 7930 }, { "epoch": 1.2944535073409462, "grad_norm": 1.6763790845870972, "learning_rate": 3.235725938009788e-05, "loss": 0.3411, "num_input_tokens_seen": 17149056, "step": 7935 }, { "epoch": 1.2952691680261013, "grad_norm": 0.08741169422864914, "learning_rate": 3.237765089722676e-05, "loss": 0.1705, "num_input_tokens_seen": 17160096, "step": 7940 }, { "epoch": 1.296084828711256, "grad_norm": 1.4976335763931274, "learning_rate": 3.239804241435563e-05, "loss": 0.1429, "num_input_tokens_seen": 17170592, "step": 7945 }, { "epoch": 1.2969004893964111, "grad_norm": 1.947638988494873, "learning_rate": 3.24184339314845e-05, "loss": 0.3321, "num_input_tokens_seen": 17181760, "step": 7950 }, { "epoch": 1.2977161500815662, "grad_norm": 2.5989532470703125, "learning_rate": 3.243882544861338e-05, "loss": 0.2903, "num_input_tokens_seen": 17192416, "step": 7955 }, { "epoch": 1.298531810766721, "grad_norm": 1.2122973203659058, "learning_rate": 3.245921696574225e-05, "loss": 0.1029, "num_input_tokens_seen": 17204256, "step": 7960 }, { "epoch": 1.299347471451876, "grad_norm": 0.8616533279418945, "learning_rate": 3.247960848287112e-05, "loss": 0.2318, "num_input_tokens_seen": 17215520, "step": 7965 }, { "epoch": 1.300163132137031, "grad_norm": 1.6566909551620483, "learning_rate": 3.2500000000000004e-05, "loss": 0.2187, "num_input_tokens_seen": 17226560, "step": 7970 }, { "epoch": 1.3009787928221859, "grad_norm": 0.2148759663105011, "learning_rate": 3.252039151712888e-05, "loss": 0.1185, "num_input_tokens_seen": 17236864, "step": 7975 }, { "epoch": 1.301794453507341, "grad_norm": 1.0688302516937256, "learning_rate": 3.254078303425775e-05, "loss": 0.1782, "num_input_tokens_seen": 17247424, "step": 7980 }, { "epoch": 1.302610114192496, "grad_norm": 1.355430006980896, "learning_rate": 3.2561174551386625e-05, "loss": 0.0952, "num_input_tokens_seen": 17258208, "step": 7985 }, { "epoch": 1.3034257748776508, "grad_norm": 1.2531392574310303, "learning_rate": 3.25815660685155e-05, "loss": 0.1455, "num_input_tokens_seen": 17269280, "step": 7990 }, { "epoch": 1.3042414355628058, "grad_norm": 0.7680014967918396, "learning_rate": 3.260195758564437e-05, "loss": 0.056, "num_input_tokens_seen": 17278816, "step": 7995 }, { "epoch": 1.3050570962479608, "grad_norm": 0.7367815971374512, "learning_rate": 3.262234910277325e-05, "loss": 0.0501, "num_input_tokens_seen": 17289824, "step": 8000 }, { "epoch": 1.3058727569331159, "grad_norm": 0.5298978090286255, "learning_rate": 3.264274061990212e-05, "loss": 0.101, "num_input_tokens_seen": 17300448, "step": 8005 }, { "epoch": 1.306688417618271, "grad_norm": 1.5037504434585571, "learning_rate": 3.2663132137030995e-05, "loss": 0.2604, "num_input_tokens_seen": 17310592, "step": 8010 }, { "epoch": 1.3075040783034257, "grad_norm": 0.20582814514636993, "learning_rate": 3.268352365415987e-05, "loss": 0.1283, "num_input_tokens_seen": 17321632, "step": 8015 }, { "epoch": 1.3083197389885808, "grad_norm": 0.506993293762207, "learning_rate": 3.270391517128875e-05, "loss": 0.156, "num_input_tokens_seen": 17332960, "step": 8020 }, { "epoch": 1.3091353996737358, "grad_norm": 2.880558729171753, "learning_rate": 3.272430668841762e-05, "loss": 0.3439, "num_input_tokens_seen": 17343808, "step": 8025 }, { "epoch": 1.3099510603588906, "grad_norm": 0.16281837224960327, "learning_rate": 3.274469820554649e-05, "loss": 0.1749, "num_input_tokens_seen": 17353696, "step": 8030 }, { "epoch": 1.3107667210440457, "grad_norm": 0.49560174345970154, "learning_rate": 3.276508972267537e-05, "loss": 0.1757, "num_input_tokens_seen": 17364000, "step": 8035 }, { "epoch": 1.3115823817292007, "grad_norm": 0.4752032458782196, "learning_rate": 3.2785481239804244e-05, "loss": 0.1416, "num_input_tokens_seen": 17371936, "step": 8040 }, { "epoch": 1.3123980424143555, "grad_norm": 0.8630049824714661, "learning_rate": 3.280587275693312e-05, "loss": 0.1554, "num_input_tokens_seen": 17382304, "step": 8045 }, { "epoch": 1.3132137030995106, "grad_norm": 0.5562887191772461, "learning_rate": 3.282626427406199e-05, "loss": 0.0551, "num_input_tokens_seen": 17392000, "step": 8050 }, { "epoch": 1.3140293637846656, "grad_norm": 0.41155919432640076, "learning_rate": 3.2846655791190866e-05, "loss": 0.0992, "num_input_tokens_seen": 17402528, "step": 8055 }, { "epoch": 1.3148450244698204, "grad_norm": 0.15330134332180023, "learning_rate": 3.286704730831974e-05, "loss": 0.2084, "num_input_tokens_seen": 17412480, "step": 8060 }, { "epoch": 1.3156606851549755, "grad_norm": 0.4071227014064789, "learning_rate": 3.288743882544861e-05, "loss": 0.0965, "num_input_tokens_seen": 17423520, "step": 8065 }, { "epoch": 1.3164763458401305, "grad_norm": 0.6776436567306519, "learning_rate": 3.290783034257749e-05, "loss": 0.1242, "num_input_tokens_seen": 17434176, "step": 8070 }, { "epoch": 1.3172920065252856, "grad_norm": 1.954531192779541, "learning_rate": 3.292822185970637e-05, "loss": 0.2044, "num_input_tokens_seen": 17445504, "step": 8075 }, { "epoch": 1.3181076672104406, "grad_norm": 0.27457737922668457, "learning_rate": 3.2948613376835235e-05, "loss": 0.0765, "num_input_tokens_seen": 17456544, "step": 8080 }, { "epoch": 1.3189233278955954, "grad_norm": 0.9595165848731995, "learning_rate": 3.296900489396411e-05, "loss": 0.13, "num_input_tokens_seen": 17466432, "step": 8085 }, { "epoch": 1.3197389885807504, "grad_norm": 1.2455354928970337, "learning_rate": 3.298939641109299e-05, "loss": 0.08, "num_input_tokens_seen": 17477472, "step": 8090 }, { "epoch": 1.3205546492659055, "grad_norm": 0.4855937957763672, "learning_rate": 3.300978792822186e-05, "loss": 0.0946, "num_input_tokens_seen": 17487360, "step": 8095 }, { "epoch": 1.3213703099510603, "grad_norm": 0.37317991256713867, "learning_rate": 3.303017944535074e-05, "loss": 0.0797, "num_input_tokens_seen": 17497568, "step": 8100 }, { "epoch": 1.3221859706362153, "grad_norm": 0.683054506778717, "learning_rate": 3.305057096247961e-05, "loss": 0.0791, "num_input_tokens_seen": 17508768, "step": 8105 }, { "epoch": 1.3230016313213704, "grad_norm": 0.31551891565322876, "learning_rate": 3.3070962479608485e-05, "loss": 0.1243, "num_input_tokens_seen": 17518368, "step": 8110 }, { "epoch": 1.3238172920065252, "grad_norm": 2.6679718494415283, "learning_rate": 3.309135399673736e-05, "loss": 0.3612, "num_input_tokens_seen": 17528416, "step": 8115 }, { "epoch": 1.3246329526916802, "grad_norm": 0.42504215240478516, "learning_rate": 3.311174551386623e-05, "loss": 0.2889, "num_input_tokens_seen": 17537600, "step": 8120 }, { "epoch": 1.3254486133768353, "grad_norm": 0.4772164225578308, "learning_rate": 3.3132137030995106e-05, "loss": 0.0854, "num_input_tokens_seen": 17548736, "step": 8125 }, { "epoch": 1.32626427406199, "grad_norm": 0.7715117335319519, "learning_rate": 3.3152528548123987e-05, "loss": 0.2917, "num_input_tokens_seen": 17558336, "step": 8130 }, { "epoch": 1.3270799347471451, "grad_norm": 0.4470285475254059, "learning_rate": 3.3172920065252854e-05, "loss": 0.0419, "num_input_tokens_seen": 17570272, "step": 8135 }, { "epoch": 1.3278955954323002, "grad_norm": 0.8129357695579529, "learning_rate": 3.3193311582381734e-05, "loss": 0.1084, "num_input_tokens_seen": 17580960, "step": 8140 }, { "epoch": 1.3287112561174552, "grad_norm": 0.20494697988033295, "learning_rate": 3.321370309951061e-05, "loss": 0.1221, "num_input_tokens_seen": 17590528, "step": 8145 }, { "epoch": 1.32952691680261, "grad_norm": 1.00864577293396, "learning_rate": 3.3234094616639475e-05, "loss": 0.1713, "num_input_tokens_seen": 17600672, "step": 8150 }, { "epoch": 1.330342577487765, "grad_norm": 0.5659168362617493, "learning_rate": 3.3254486133768356e-05, "loss": 0.2446, "num_input_tokens_seen": 17611648, "step": 8155 }, { "epoch": 1.3311582381729201, "grad_norm": 0.6662846803665161, "learning_rate": 3.327487765089723e-05, "loss": 0.2215, "num_input_tokens_seen": 17622656, "step": 8160 }, { "epoch": 1.3319738988580752, "grad_norm": 0.28888851404190063, "learning_rate": 3.32952691680261e-05, "loss": 0.0977, "num_input_tokens_seen": 17632864, "step": 8165 }, { "epoch": 1.33278955954323, "grad_norm": 0.14157478511333466, "learning_rate": 3.331566068515498e-05, "loss": 0.1448, "num_input_tokens_seen": 17643968, "step": 8170 }, { "epoch": 1.333605220228385, "grad_norm": 1.013179063796997, "learning_rate": 3.333605220228385e-05, "loss": 0.1658, "num_input_tokens_seen": 17654272, "step": 8175 }, { "epoch": 1.33442088091354, "grad_norm": 0.46575406193733215, "learning_rate": 3.3356443719412725e-05, "loss": 0.1332, "num_input_tokens_seen": 17665728, "step": 8180 }, { "epoch": 1.3352365415986949, "grad_norm": 0.29163211584091187, "learning_rate": 3.33768352365416e-05, "loss": 0.2627, "num_input_tokens_seen": 17677856, "step": 8185 }, { "epoch": 1.33605220228385, "grad_norm": 0.15699200332164764, "learning_rate": 3.339722675367047e-05, "loss": 0.1239, "num_input_tokens_seen": 17688672, "step": 8190 }, { "epoch": 1.336867862969005, "grad_norm": 0.4692433178424835, "learning_rate": 3.341761827079935e-05, "loss": 0.1539, "num_input_tokens_seen": 17699456, "step": 8195 }, { "epoch": 1.3376835236541598, "grad_norm": 1.7604652643203735, "learning_rate": 3.343800978792823e-05, "loss": 0.1913, "num_input_tokens_seen": 17710976, "step": 8200 }, { "epoch": 1.3384991843393148, "grad_norm": 0.9676280617713928, "learning_rate": 3.3458401305057094e-05, "loss": 0.0806, "num_input_tokens_seen": 17721152, "step": 8205 }, { "epoch": 1.3393148450244698, "grad_norm": 0.8670285940170288, "learning_rate": 3.3478792822185975e-05, "loss": 0.1621, "num_input_tokens_seen": 17731200, "step": 8210 }, { "epoch": 1.3401305057096247, "grad_norm": 1.50360107421875, "learning_rate": 3.349918433931485e-05, "loss": 0.1977, "num_input_tokens_seen": 17741760, "step": 8215 }, { "epoch": 1.3409461663947797, "grad_norm": 1.496700644493103, "learning_rate": 3.351957585644372e-05, "loss": 0.175, "num_input_tokens_seen": 17754368, "step": 8220 }, { "epoch": 1.3417618270799347, "grad_norm": 1.076671838760376, "learning_rate": 3.3539967373572596e-05, "loss": 0.2158, "num_input_tokens_seen": 17764096, "step": 8225 }, { "epoch": 1.3425774877650898, "grad_norm": 0.7248032689094543, "learning_rate": 3.356035889070147e-05, "loss": 0.1316, "num_input_tokens_seen": 17774784, "step": 8230 }, { "epoch": 1.3433931484502448, "grad_norm": 0.16087286174297333, "learning_rate": 3.3580750407830344e-05, "loss": 0.0466, "num_input_tokens_seen": 17786112, "step": 8235 }, { "epoch": 1.3442088091353996, "grad_norm": 1.5406805276870728, "learning_rate": 3.360114192495922e-05, "loss": 0.0832, "num_input_tokens_seen": 17797120, "step": 8240 }, { "epoch": 1.3450244698205547, "grad_norm": 1.1945672035217285, "learning_rate": 3.362153344208809e-05, "loss": 0.158, "num_input_tokens_seen": 17808032, "step": 8245 }, { "epoch": 1.3458401305057097, "grad_norm": 0.28966668248176575, "learning_rate": 3.364192495921697e-05, "loss": 0.0499, "num_input_tokens_seen": 17818144, "step": 8250 }, { "epoch": 1.3466557911908645, "grad_norm": 0.18572062253952026, "learning_rate": 3.366231647634584e-05, "loss": 0.0536, "num_input_tokens_seen": 17828032, "step": 8255 }, { "epoch": 1.3474714518760196, "grad_norm": 0.8240967988967896, "learning_rate": 3.368270799347471e-05, "loss": 0.1284, "num_input_tokens_seen": 17839232, "step": 8260 }, { "epoch": 1.3482871125611746, "grad_norm": 0.6289734840393066, "learning_rate": 3.370309951060359e-05, "loss": 0.0493, "num_input_tokens_seen": 17849088, "step": 8265 }, { "epoch": 1.3491027732463294, "grad_norm": 0.14027953147888184, "learning_rate": 3.372349102773246e-05, "loss": 0.064, "num_input_tokens_seen": 17860512, "step": 8270 }, { "epoch": 1.3499184339314845, "grad_norm": 0.41360947489738464, "learning_rate": 3.374388254486134e-05, "loss": 0.3605, "num_input_tokens_seen": 17872000, "step": 8275 }, { "epoch": 1.3507340946166395, "grad_norm": 0.40547481179237366, "learning_rate": 3.3764274061990215e-05, "loss": 0.1576, "num_input_tokens_seen": 17883712, "step": 8280 }, { "epoch": 1.3515497553017943, "grad_norm": 1.5238080024719238, "learning_rate": 3.378466557911909e-05, "loss": 0.1259, "num_input_tokens_seen": 17894240, "step": 8285 }, { "epoch": 1.3523654159869494, "grad_norm": 0.12662386894226074, "learning_rate": 3.380505709624796e-05, "loss": 0.1517, "num_input_tokens_seen": 17903904, "step": 8290 }, { "epoch": 1.3531810766721044, "grad_norm": 0.4738956093788147, "learning_rate": 3.3825448613376836e-05, "loss": 0.1555, "num_input_tokens_seen": 17914208, "step": 8295 }, { "epoch": 1.3539967373572595, "grad_norm": 0.04475792497396469, "learning_rate": 3.384584013050571e-05, "loss": 0.0204, "num_input_tokens_seen": 17924640, "step": 8300 }, { "epoch": 1.3548123980424145, "grad_norm": 0.15591184794902802, "learning_rate": 3.3866231647634584e-05, "loss": 0.0941, "num_input_tokens_seen": 17935520, "step": 8305 }, { "epoch": 1.3556280587275693, "grad_norm": 0.7797941565513611, "learning_rate": 3.388662316476346e-05, "loss": 0.1196, "num_input_tokens_seen": 17946848, "step": 8310 }, { "epoch": 1.3564437194127243, "grad_norm": 0.7514940500259399, "learning_rate": 3.390701468189234e-05, "loss": 0.0952, "num_input_tokens_seen": 17957568, "step": 8315 }, { "epoch": 1.3572593800978794, "grad_norm": 2.0413243770599365, "learning_rate": 3.392740619902121e-05, "loss": 0.0981, "num_input_tokens_seen": 17967776, "step": 8320 }, { "epoch": 1.3580750407830342, "grad_norm": 1.4346760511398315, "learning_rate": 3.394779771615008e-05, "loss": 0.08, "num_input_tokens_seen": 17978688, "step": 8325 }, { "epoch": 1.3588907014681892, "grad_norm": 0.24302583932876587, "learning_rate": 3.396818923327896e-05, "loss": 0.1853, "num_input_tokens_seen": 17990432, "step": 8330 }, { "epoch": 1.3597063621533443, "grad_norm": 2.5725717544555664, "learning_rate": 3.3988580750407834e-05, "loss": 0.2179, "num_input_tokens_seen": 18001888, "step": 8335 }, { "epoch": 1.360522022838499, "grad_norm": 0.13523806631565094, "learning_rate": 3.40089722675367e-05, "loss": 0.0979, "num_input_tokens_seen": 18013536, "step": 8340 }, { "epoch": 1.3613376835236541, "grad_norm": 2.8524084091186523, "learning_rate": 3.402936378466558e-05, "loss": 0.3174, "num_input_tokens_seen": 18024864, "step": 8345 }, { "epoch": 1.3621533442088092, "grad_norm": 1.8659899234771729, "learning_rate": 3.4049755301794455e-05, "loss": 0.0823, "num_input_tokens_seen": 18036192, "step": 8350 }, { "epoch": 1.362969004893964, "grad_norm": 1.7079176902770996, "learning_rate": 3.407014681892333e-05, "loss": 0.1646, "num_input_tokens_seen": 18044864, "step": 8355 }, { "epoch": 1.363784665579119, "grad_norm": 0.3070336878299713, "learning_rate": 3.40905383360522e-05, "loss": 0.2309, "num_input_tokens_seen": 18056192, "step": 8360 }, { "epoch": 1.364600326264274, "grad_norm": 0.11936453729867935, "learning_rate": 3.4110929853181077e-05, "loss": 0.1283, "num_input_tokens_seen": 18066592, "step": 8365 }, { "epoch": 1.3654159869494291, "grad_norm": 1.1823725700378418, "learning_rate": 3.413132137030996e-05, "loss": 0.1598, "num_input_tokens_seen": 18077408, "step": 8370 }, { "epoch": 1.366231647634584, "grad_norm": 1.1475764513015747, "learning_rate": 3.4151712887438824e-05, "loss": 0.1296, "num_input_tokens_seen": 18088544, "step": 8375 }, { "epoch": 1.367047308319739, "grad_norm": 0.4997105598449707, "learning_rate": 3.41721044045677e-05, "loss": 0.2008, "num_input_tokens_seen": 18100192, "step": 8380 }, { "epoch": 1.367862969004894, "grad_norm": 0.7039017677307129, "learning_rate": 3.419249592169658e-05, "loss": 0.3041, "num_input_tokens_seen": 18110432, "step": 8385 }, { "epoch": 1.368678629690049, "grad_norm": 0.35550418496131897, "learning_rate": 3.421288743882545e-05, "loss": 0.2078, "num_input_tokens_seen": 18120928, "step": 8390 }, { "epoch": 1.3694942903752039, "grad_norm": 0.6171401143074036, "learning_rate": 3.4233278955954326e-05, "loss": 0.1996, "num_input_tokens_seen": 18131104, "step": 8395 }, { "epoch": 1.370309951060359, "grad_norm": 0.04249635338783264, "learning_rate": 3.42536704730832e-05, "loss": 0.0531, "num_input_tokens_seen": 18142720, "step": 8400 }, { "epoch": 1.371125611745514, "grad_norm": 0.15382078289985657, "learning_rate": 3.4274061990212074e-05, "loss": 0.0837, "num_input_tokens_seen": 18153664, "step": 8405 }, { "epoch": 1.3719412724306688, "grad_norm": 1.9465186595916748, "learning_rate": 3.429445350734095e-05, "loss": 0.4807, "num_input_tokens_seen": 18164512, "step": 8410 }, { "epoch": 1.3727569331158238, "grad_norm": 0.5409563183784485, "learning_rate": 3.431484502446982e-05, "loss": 0.2967, "num_input_tokens_seen": 18174368, "step": 8415 }, { "epoch": 1.3735725938009788, "grad_norm": 1.0447885990142822, "learning_rate": 3.4335236541598695e-05, "loss": 0.0931, "num_input_tokens_seen": 18185216, "step": 8420 }, { "epoch": 1.3743882544861337, "grad_norm": 0.8947162628173828, "learning_rate": 3.4355628058727576e-05, "loss": 0.1174, "num_input_tokens_seen": 18196256, "step": 8425 }, { "epoch": 1.3752039151712887, "grad_norm": 0.9181270003318787, "learning_rate": 3.437601957585644e-05, "loss": 0.0887, "num_input_tokens_seen": 18207264, "step": 8430 }, { "epoch": 1.3760195758564437, "grad_norm": 2.1099917888641357, "learning_rate": 3.439641109298532e-05, "loss": 0.2689, "num_input_tokens_seen": 18217760, "step": 8435 }, { "epoch": 1.3768352365415986, "grad_norm": 0.12338656932115555, "learning_rate": 3.44168026101142e-05, "loss": 0.1251, "num_input_tokens_seen": 18228608, "step": 8440 }, { "epoch": 1.3776508972267536, "grad_norm": 2.0169379711151123, "learning_rate": 3.4437194127243065e-05, "loss": 0.2538, "num_input_tokens_seen": 18239200, "step": 8445 }, { "epoch": 1.3784665579119086, "grad_norm": 3.409749984741211, "learning_rate": 3.4457585644371945e-05, "loss": 0.2729, "num_input_tokens_seen": 18251200, "step": 8450 }, { "epoch": 1.3792822185970637, "grad_norm": 1.4616402387619019, "learning_rate": 3.447797716150082e-05, "loss": 0.0725, "num_input_tokens_seen": 18261856, "step": 8455 }, { "epoch": 1.3800978792822187, "grad_norm": 0.5608766674995422, "learning_rate": 3.449836867862969e-05, "loss": 0.054, "num_input_tokens_seen": 18273152, "step": 8460 }, { "epoch": 1.3809135399673735, "grad_norm": 0.1965847760438919, "learning_rate": 3.4518760195758567e-05, "loss": 0.0717, "num_input_tokens_seen": 18284896, "step": 8465 }, { "epoch": 1.3817292006525286, "grad_norm": 0.23029837012290955, "learning_rate": 3.453915171288744e-05, "loss": 0.1416, "num_input_tokens_seen": 18294912, "step": 8470 }, { "epoch": 1.3825448613376836, "grad_norm": 0.7230048179626465, "learning_rate": 3.4559543230016314e-05, "loss": 0.1324, "num_input_tokens_seen": 18304672, "step": 8475 }, { "epoch": 1.3833605220228384, "grad_norm": 0.45215579867362976, "learning_rate": 3.457993474714519e-05, "loss": 0.1576, "num_input_tokens_seen": 18316256, "step": 8480 }, { "epoch": 1.3841761827079935, "grad_norm": 0.17936651408672333, "learning_rate": 3.460032626427406e-05, "loss": 0.2042, "num_input_tokens_seen": 18327904, "step": 8485 }, { "epoch": 1.3849918433931485, "grad_norm": 0.08195719867944717, "learning_rate": 3.462071778140294e-05, "loss": 0.0537, "num_input_tokens_seen": 18338752, "step": 8490 }, { "epoch": 1.3858075040783033, "grad_norm": 0.19731712341308594, "learning_rate": 3.464110929853181e-05, "loss": 0.1163, "num_input_tokens_seen": 18350464, "step": 8495 }, { "epoch": 1.3866231647634584, "grad_norm": 0.9131155014038086, "learning_rate": 3.466150081566068e-05, "loss": 0.182, "num_input_tokens_seen": 18360832, "step": 8500 }, { "epoch": 1.3874388254486134, "grad_norm": 1.632011890411377, "learning_rate": 3.4681892332789564e-05, "loss": 0.1753, "num_input_tokens_seen": 18370976, "step": 8505 }, { "epoch": 1.3882544861337682, "grad_norm": 0.1228082999587059, "learning_rate": 3.470228384991844e-05, "loss": 0.1583, "num_input_tokens_seen": 18381920, "step": 8510 }, { "epoch": 1.3890701468189233, "grad_norm": 0.4466029107570648, "learning_rate": 3.4722675367047305e-05, "loss": 0.1995, "num_input_tokens_seen": 18393152, "step": 8515 }, { "epoch": 1.3898858075040783, "grad_norm": 0.8562221527099609, "learning_rate": 3.4743066884176185e-05, "loss": 0.1103, "num_input_tokens_seen": 18403168, "step": 8520 }, { "epoch": 1.3907014681892333, "grad_norm": 0.13069143891334534, "learning_rate": 3.476345840130506e-05, "loss": 0.2047, "num_input_tokens_seen": 18413792, "step": 8525 }, { "epoch": 1.3915171288743884, "grad_norm": 1.780239462852478, "learning_rate": 3.478384991843393e-05, "loss": 0.1391, "num_input_tokens_seen": 18424864, "step": 8530 }, { "epoch": 1.3923327895595432, "grad_norm": 1.1525864601135254, "learning_rate": 3.480424143556281e-05, "loss": 0.0666, "num_input_tokens_seen": 18434144, "step": 8535 }, { "epoch": 1.3931484502446982, "grad_norm": 1.208383321762085, "learning_rate": 3.482463295269168e-05, "loss": 0.1968, "num_input_tokens_seen": 18444576, "step": 8540 }, { "epoch": 1.3939641109298533, "grad_norm": 0.40610769391059875, "learning_rate": 3.484502446982056e-05, "loss": 0.2383, "num_input_tokens_seen": 18455808, "step": 8545 }, { "epoch": 1.394779771615008, "grad_norm": 0.7057903409004211, "learning_rate": 3.486541598694943e-05, "loss": 0.0761, "num_input_tokens_seen": 18466848, "step": 8550 }, { "epoch": 1.3955954323001631, "grad_norm": 0.6883257627487183, "learning_rate": 3.48858075040783e-05, "loss": 0.2245, "num_input_tokens_seen": 18479040, "step": 8555 }, { "epoch": 1.3964110929853182, "grad_norm": 0.6603574752807617, "learning_rate": 3.490619902120718e-05, "loss": 0.138, "num_input_tokens_seen": 18490080, "step": 8560 }, { "epoch": 1.397226753670473, "grad_norm": 1.8832846879959106, "learning_rate": 3.492659053833605e-05, "loss": 0.2694, "num_input_tokens_seen": 18501632, "step": 8565 }, { "epoch": 1.398042414355628, "grad_norm": 0.05719039589166641, "learning_rate": 3.494698205546493e-05, "loss": 0.154, "num_input_tokens_seen": 18513056, "step": 8570 }, { "epoch": 1.398858075040783, "grad_norm": 0.4685496985912323, "learning_rate": 3.4967373572593804e-05, "loss": 0.0193, "num_input_tokens_seen": 18523616, "step": 8575 }, { "epoch": 1.399673735725938, "grad_norm": 0.4328828454017639, "learning_rate": 3.498776508972268e-05, "loss": 0.1549, "num_input_tokens_seen": 18534688, "step": 8580 }, { "epoch": 1.400489396411093, "grad_norm": 0.2567574679851532, "learning_rate": 3.500815660685155e-05, "loss": 0.2083, "num_input_tokens_seen": 18545120, "step": 8585 }, { "epoch": 1.401305057096248, "grad_norm": 0.38120144605636597, "learning_rate": 3.5028548123980426e-05, "loss": 0.0935, "num_input_tokens_seen": 18556224, "step": 8590 }, { "epoch": 1.402120717781403, "grad_norm": 0.8701684474945068, "learning_rate": 3.50489396411093e-05, "loss": 0.4595, "num_input_tokens_seen": 18566272, "step": 8595 }, { "epoch": 1.4029363784665578, "grad_norm": 0.4132962226867676, "learning_rate": 3.506933115823817e-05, "loss": 0.2199, "num_input_tokens_seen": 18577216, "step": 8600 }, { "epoch": 1.4037520391517129, "grad_norm": 0.19861069321632385, "learning_rate": 3.508972267536705e-05, "loss": 0.337, "num_input_tokens_seen": 18587072, "step": 8605 }, { "epoch": 1.404567699836868, "grad_norm": 1.2774569988250732, "learning_rate": 3.511011419249592e-05, "loss": 0.1076, "num_input_tokens_seen": 18598112, "step": 8610 }, { "epoch": 1.405383360522023, "grad_norm": 0.8359360694885254, "learning_rate": 3.51305057096248e-05, "loss": 0.2002, "num_input_tokens_seen": 18608384, "step": 8615 }, { "epoch": 1.4061990212071778, "grad_norm": 0.7352054119110107, "learning_rate": 3.515089722675367e-05, "loss": 0.0749, "num_input_tokens_seen": 18618944, "step": 8620 }, { "epoch": 1.4070146818923328, "grad_norm": 0.2556878626346588, "learning_rate": 3.517128874388255e-05, "loss": 0.1152, "num_input_tokens_seen": 18629248, "step": 8625 }, { "epoch": 1.4078303425774878, "grad_norm": 0.5232822895050049, "learning_rate": 3.519168026101142e-05, "loss": 0.0444, "num_input_tokens_seen": 18641248, "step": 8630 }, { "epoch": 1.4086460032626427, "grad_norm": 0.18315750360488892, "learning_rate": 3.521207177814029e-05, "loss": 0.0327, "num_input_tokens_seen": 18651520, "step": 8635 }, { "epoch": 1.4094616639477977, "grad_norm": 0.7163806557655334, "learning_rate": 3.523246329526917e-05, "loss": 0.2009, "num_input_tokens_seen": 18662304, "step": 8640 }, { "epoch": 1.4102773246329527, "grad_norm": 0.0265289805829525, "learning_rate": 3.5252854812398045e-05, "loss": 0.1114, "num_input_tokens_seen": 18672416, "step": 8645 }, { "epoch": 1.4110929853181076, "grad_norm": 1.1695860624313354, "learning_rate": 3.527324632952692e-05, "loss": 0.1026, "num_input_tokens_seen": 18683136, "step": 8650 }, { "epoch": 1.4119086460032626, "grad_norm": 0.17457425594329834, "learning_rate": 3.529363784665579e-05, "loss": 0.2579, "num_input_tokens_seen": 18693472, "step": 8655 }, { "epoch": 1.4127243066884176, "grad_norm": 0.3990385830402374, "learning_rate": 3.5314029363784666e-05, "loss": 0.0672, "num_input_tokens_seen": 18704128, "step": 8660 }, { "epoch": 1.4135399673735725, "grad_norm": 0.5275196433067322, "learning_rate": 3.5334420880913547e-05, "loss": 0.0732, "num_input_tokens_seen": 18715456, "step": 8665 }, { "epoch": 1.4143556280587275, "grad_norm": 0.5347080230712891, "learning_rate": 3.5354812398042414e-05, "loss": 0.1865, "num_input_tokens_seen": 18726240, "step": 8670 }, { "epoch": 1.4151712887438825, "grad_norm": 0.2040901482105255, "learning_rate": 3.537520391517129e-05, "loss": 0.0599, "num_input_tokens_seen": 18736992, "step": 8675 }, { "epoch": 1.4159869494290376, "grad_norm": 0.49543654918670654, "learning_rate": 3.539559543230017e-05, "loss": 0.0787, "num_input_tokens_seen": 18749216, "step": 8680 }, { "epoch": 1.4168026101141926, "grad_norm": 0.06324899941682816, "learning_rate": 3.541598694942904e-05, "loss": 0.0951, "num_input_tokens_seen": 18760352, "step": 8685 }, { "epoch": 1.4176182707993474, "grad_norm": 0.37602710723876953, "learning_rate": 3.5436378466557916e-05, "loss": 0.1121, "num_input_tokens_seen": 18769792, "step": 8690 }, { "epoch": 1.4184339314845025, "grad_norm": 0.168251171708107, "learning_rate": 3.545676998368679e-05, "loss": 0.2515, "num_input_tokens_seen": 18781696, "step": 8695 }, { "epoch": 1.4192495921696575, "grad_norm": 1.0763535499572754, "learning_rate": 3.547716150081566e-05, "loss": 0.1726, "num_input_tokens_seen": 18790912, "step": 8700 }, { "epoch": 1.4200652528548123, "grad_norm": 1.7877050638198853, "learning_rate": 3.549755301794454e-05, "loss": 0.1832, "num_input_tokens_seen": 18800672, "step": 8705 }, { "epoch": 1.4208809135399674, "grad_norm": 2.7680394649505615, "learning_rate": 3.551794453507341e-05, "loss": 0.205, "num_input_tokens_seen": 18812032, "step": 8710 }, { "epoch": 1.4216965742251224, "grad_norm": 1.6999238729476929, "learning_rate": 3.5538336052202285e-05, "loss": 0.2441, "num_input_tokens_seen": 18823328, "step": 8715 }, { "epoch": 1.4225122349102772, "grad_norm": 1.593079924583435, "learning_rate": 3.555872756933116e-05, "loss": 0.2392, "num_input_tokens_seen": 18834048, "step": 8720 }, { "epoch": 1.4233278955954323, "grad_norm": 1.0210304260253906, "learning_rate": 3.557911908646003e-05, "loss": 0.1676, "num_input_tokens_seen": 18845216, "step": 8725 }, { "epoch": 1.4241435562805873, "grad_norm": 0.12143933773040771, "learning_rate": 3.5599510603588906e-05, "loss": 0.1691, "num_input_tokens_seen": 18854848, "step": 8730 }, { "epoch": 1.4249592169657421, "grad_norm": 1.4706982374191284, "learning_rate": 3.561990212071779e-05, "loss": 0.1372, "num_input_tokens_seen": 18866368, "step": 8735 }, { "epoch": 1.4257748776508972, "grad_norm": 1.1104543209075928, "learning_rate": 3.5640293637846654e-05, "loss": 0.2903, "num_input_tokens_seen": 18878464, "step": 8740 }, { "epoch": 1.4265905383360522, "grad_norm": 0.22824575006961823, "learning_rate": 3.5660685154975535e-05, "loss": 0.2032, "num_input_tokens_seen": 18889856, "step": 8745 }, { "epoch": 1.4274061990212072, "grad_norm": 0.21102416515350342, "learning_rate": 3.568107667210441e-05, "loss": 0.1618, "num_input_tokens_seen": 18900352, "step": 8750 }, { "epoch": 1.4282218597063623, "grad_norm": 0.2107299566268921, "learning_rate": 3.5701468189233275e-05, "loss": 0.1036, "num_input_tokens_seen": 18911936, "step": 8755 }, { "epoch": 1.429037520391517, "grad_norm": 0.3393383026123047, "learning_rate": 3.5721859706362156e-05, "loss": 0.1284, "num_input_tokens_seen": 18922048, "step": 8760 }, { "epoch": 1.4298531810766721, "grad_norm": 1.4784893989562988, "learning_rate": 3.574225122349103e-05, "loss": 0.1418, "num_input_tokens_seen": 18932288, "step": 8765 }, { "epoch": 1.4306688417618272, "grad_norm": 0.9119970202445984, "learning_rate": 3.5762642740619904e-05, "loss": 0.1664, "num_input_tokens_seen": 18943008, "step": 8770 }, { "epoch": 1.431484502446982, "grad_norm": 1.332330346107483, "learning_rate": 3.578303425774878e-05, "loss": 0.1292, "num_input_tokens_seen": 18953376, "step": 8775 }, { "epoch": 1.432300163132137, "grad_norm": 0.3202761113643646, "learning_rate": 3.580342577487765e-05, "loss": 0.0443, "num_input_tokens_seen": 18965600, "step": 8780 }, { "epoch": 1.433115823817292, "grad_norm": 0.4961252808570862, "learning_rate": 3.5823817292006525e-05, "loss": 0.1161, "num_input_tokens_seen": 18975936, "step": 8785 }, { "epoch": 1.433931484502447, "grad_norm": 0.12128481268882751, "learning_rate": 3.58442088091354e-05, "loss": 0.0585, "num_input_tokens_seen": 18986048, "step": 8790 }, { "epoch": 1.434747145187602, "grad_norm": 0.5476045608520508, "learning_rate": 3.586460032626427e-05, "loss": 0.1358, "num_input_tokens_seen": 18997056, "step": 8795 }, { "epoch": 1.435562805872757, "grad_norm": 0.11753060668706894, "learning_rate": 3.588499184339315e-05, "loss": 0.0354, "num_input_tokens_seen": 19007744, "step": 8800 }, { "epoch": 1.4363784665579118, "grad_norm": 0.7172370553016663, "learning_rate": 3.590538336052203e-05, "loss": 0.1858, "num_input_tokens_seen": 19018784, "step": 8805 }, { "epoch": 1.4371941272430668, "grad_norm": 0.7970319986343384, "learning_rate": 3.5925774877650894e-05, "loss": 0.1598, "num_input_tokens_seen": 19030176, "step": 8810 }, { "epoch": 1.4380097879282219, "grad_norm": 0.13491101562976837, "learning_rate": 3.5946166394779775e-05, "loss": 0.0943, "num_input_tokens_seen": 19041088, "step": 8815 }, { "epoch": 1.438825448613377, "grad_norm": 0.6166205406188965, "learning_rate": 3.596655791190865e-05, "loss": 0.0368, "num_input_tokens_seen": 19052352, "step": 8820 }, { "epoch": 1.4396411092985317, "grad_norm": 0.7920663952827454, "learning_rate": 3.598694942903752e-05, "loss": 0.0894, "num_input_tokens_seen": 19062688, "step": 8825 }, { "epoch": 1.4404567699836868, "grad_norm": 1.645377516746521, "learning_rate": 3.6007340946166396e-05, "loss": 0.1416, "num_input_tokens_seen": 19073504, "step": 8830 }, { "epoch": 1.4412724306688418, "grad_norm": 1.0022027492523193, "learning_rate": 3.602773246329527e-05, "loss": 0.2383, "num_input_tokens_seen": 19085792, "step": 8835 }, { "epoch": 1.4420880913539968, "grad_norm": 0.3937346935272217, "learning_rate": 3.604812398042415e-05, "loss": 0.1759, "num_input_tokens_seen": 19096512, "step": 8840 }, { "epoch": 1.4429037520391517, "grad_norm": 1.490692377090454, "learning_rate": 3.606851549755302e-05, "loss": 0.1722, "num_input_tokens_seen": 19106336, "step": 8845 }, { "epoch": 1.4437194127243067, "grad_norm": 1.3247740268707275, "learning_rate": 3.608890701468189e-05, "loss": 0.1361, "num_input_tokens_seen": 19116800, "step": 8850 }, { "epoch": 1.4445350734094617, "grad_norm": 1.408993124961853, "learning_rate": 3.610929853181077e-05, "loss": 0.1938, "num_input_tokens_seen": 19127488, "step": 8855 }, { "epoch": 1.4453507340946166, "grad_norm": 0.09042391180992126, "learning_rate": 3.612969004893964e-05, "loss": 0.2369, "num_input_tokens_seen": 19135264, "step": 8860 }, { "epoch": 1.4461663947797716, "grad_norm": 0.7029399275779724, "learning_rate": 3.615008156606852e-05, "loss": 0.1574, "num_input_tokens_seen": 19146656, "step": 8865 }, { "epoch": 1.4469820554649266, "grad_norm": 0.07612548768520355, "learning_rate": 3.6170473083197394e-05, "loss": 0.035, "num_input_tokens_seen": 19158880, "step": 8870 }, { "epoch": 1.4477977161500815, "grad_norm": 0.9720413088798523, "learning_rate": 3.619086460032627e-05, "loss": 0.2306, "num_input_tokens_seen": 19170240, "step": 8875 }, { "epoch": 1.4486133768352365, "grad_norm": 0.29054489731788635, "learning_rate": 3.621125611745514e-05, "loss": 0.0707, "num_input_tokens_seen": 19180960, "step": 8880 }, { "epoch": 1.4494290375203915, "grad_norm": 1.7739505767822266, "learning_rate": 3.6231647634584015e-05, "loss": 0.424, "num_input_tokens_seen": 19192576, "step": 8885 }, { "epoch": 1.4502446982055464, "grad_norm": 0.6429181098937988, "learning_rate": 3.625203915171289e-05, "loss": 0.1063, "num_input_tokens_seen": 19202752, "step": 8890 }, { "epoch": 1.4510603588907014, "grad_norm": 0.422404944896698, "learning_rate": 3.627243066884176e-05, "loss": 0.0826, "num_input_tokens_seen": 19214048, "step": 8895 }, { "epoch": 1.4518760195758564, "grad_norm": 2.2074790000915527, "learning_rate": 3.6292822185970637e-05, "loss": 0.3078, "num_input_tokens_seen": 19225056, "step": 8900 }, { "epoch": 1.4526916802610115, "grad_norm": 3.9438438415527344, "learning_rate": 3.631321370309951e-05, "loss": 0.2268, "num_input_tokens_seen": 19236480, "step": 8905 }, { "epoch": 1.4535073409461665, "grad_norm": 1.7539645433425903, "learning_rate": 3.633360522022839e-05, "loss": 0.2129, "num_input_tokens_seen": 19248192, "step": 8910 }, { "epoch": 1.4543230016313213, "grad_norm": 0.47606974840164185, "learning_rate": 3.635399673735726e-05, "loss": 0.1758, "num_input_tokens_seen": 19258368, "step": 8915 }, { "epoch": 1.4551386623164764, "grad_norm": 0.14671754837036133, "learning_rate": 3.637438825448614e-05, "loss": 0.0853, "num_input_tokens_seen": 19269088, "step": 8920 }, { "epoch": 1.4559543230016314, "grad_norm": 1.5150141716003418, "learning_rate": 3.639477977161501e-05, "loss": 0.1439, "num_input_tokens_seen": 19280384, "step": 8925 }, { "epoch": 1.4567699836867862, "grad_norm": 0.23910333216190338, "learning_rate": 3.641517128874388e-05, "loss": 0.2346, "num_input_tokens_seen": 19292192, "step": 8930 }, { "epoch": 1.4575856443719413, "grad_norm": 1.4446731805801392, "learning_rate": 3.643556280587276e-05, "loss": 0.1563, "num_input_tokens_seen": 19303904, "step": 8935 }, { "epoch": 1.4584013050570963, "grad_norm": 0.43221980333328247, "learning_rate": 3.6455954323001634e-05, "loss": 0.1214, "num_input_tokens_seen": 19315712, "step": 8940 }, { "epoch": 1.4592169657422511, "grad_norm": 0.5678945183753967, "learning_rate": 3.647634584013051e-05, "loss": 0.2223, "num_input_tokens_seen": 19326848, "step": 8945 }, { "epoch": 1.4600326264274062, "grad_norm": 3.6920571327209473, "learning_rate": 3.649673735725938e-05, "loss": 0.3462, "num_input_tokens_seen": 19338496, "step": 8950 }, { "epoch": 1.4608482871125612, "grad_norm": 1.1374636888504028, "learning_rate": 3.6517128874388255e-05, "loss": 0.2836, "num_input_tokens_seen": 19348992, "step": 8955 }, { "epoch": 1.461663947797716, "grad_norm": 0.6384190320968628, "learning_rate": 3.6537520391517136e-05, "loss": 0.1974, "num_input_tokens_seen": 19359680, "step": 8960 }, { "epoch": 1.462479608482871, "grad_norm": 1.8810054063796997, "learning_rate": 3.6557911908646e-05, "loss": 0.1176, "num_input_tokens_seen": 19370848, "step": 8965 }, { "epoch": 1.463295269168026, "grad_norm": 0.22248736023902893, "learning_rate": 3.657830342577488e-05, "loss": 0.1405, "num_input_tokens_seen": 19381888, "step": 8970 }, { "epoch": 1.4641109298531811, "grad_norm": 0.08306456357240677, "learning_rate": 3.659869494290376e-05, "loss": 0.0909, "num_input_tokens_seen": 19392768, "step": 8975 }, { "epoch": 1.4649265905383362, "grad_norm": 0.23028871417045593, "learning_rate": 3.6619086460032625e-05, "loss": 0.0659, "num_input_tokens_seen": 19403488, "step": 8980 }, { "epoch": 1.465742251223491, "grad_norm": 0.6212596297264099, "learning_rate": 3.66394779771615e-05, "loss": 0.2472, "num_input_tokens_seen": 19414656, "step": 8985 }, { "epoch": 1.466557911908646, "grad_norm": 0.3281605541706085, "learning_rate": 3.665986949429038e-05, "loss": 0.2576, "num_input_tokens_seen": 19424960, "step": 8990 }, { "epoch": 1.467373572593801, "grad_norm": 2.4531588554382324, "learning_rate": 3.668026101141925e-05, "loss": 0.1737, "num_input_tokens_seen": 19436512, "step": 8995 }, { "epoch": 1.468189233278956, "grad_norm": 2.113834857940674, "learning_rate": 3.6700652528548127e-05, "loss": 0.1424, "num_input_tokens_seen": 19447808, "step": 9000 }, { "epoch": 1.469004893964111, "grad_norm": 0.5178369879722595, "learning_rate": 3.6721044045677e-05, "loss": 0.0753, "num_input_tokens_seen": 19457984, "step": 9005 }, { "epoch": 1.469820554649266, "grad_norm": 0.3631610572338104, "learning_rate": 3.6741435562805874e-05, "loss": 0.0833, "num_input_tokens_seen": 19467680, "step": 9010 }, { "epoch": 1.4706362153344208, "grad_norm": 0.6922982931137085, "learning_rate": 3.676182707993475e-05, "loss": 0.1678, "num_input_tokens_seen": 19475968, "step": 9015 }, { "epoch": 1.4714518760195758, "grad_norm": 0.8108764886856079, "learning_rate": 3.678221859706362e-05, "loss": 0.2595, "num_input_tokens_seen": 19487328, "step": 9020 }, { "epoch": 1.4722675367047309, "grad_norm": 0.10152062773704529, "learning_rate": 3.6802610114192496e-05, "loss": 0.0384, "num_input_tokens_seen": 19497280, "step": 9025 }, { "epoch": 1.4730831973898857, "grad_norm": 0.2581130862236023, "learning_rate": 3.6823001631321376e-05, "loss": 0.1606, "num_input_tokens_seen": 19507264, "step": 9030 }, { "epoch": 1.4738988580750407, "grad_norm": 0.6808251738548279, "learning_rate": 3.684339314845024e-05, "loss": 0.1158, "num_input_tokens_seen": 19518080, "step": 9035 }, { "epoch": 1.4747145187601958, "grad_norm": 0.8009209632873535, "learning_rate": 3.6863784665579124e-05, "loss": 0.1229, "num_input_tokens_seen": 19528288, "step": 9040 }, { "epoch": 1.4755301794453508, "grad_norm": 1.0834261178970337, "learning_rate": 3.6884176182708e-05, "loss": 0.1369, "num_input_tokens_seen": 19539072, "step": 9045 }, { "epoch": 1.4763458401305056, "grad_norm": 1.5355477333068848, "learning_rate": 3.6904567699836865e-05, "loss": 0.241, "num_input_tokens_seen": 19550496, "step": 9050 }, { "epoch": 1.4771615008156607, "grad_norm": 0.13740812242031097, "learning_rate": 3.6924959216965745e-05, "loss": 0.0674, "num_input_tokens_seen": 19562752, "step": 9055 }, { "epoch": 1.4779771615008157, "grad_norm": 0.27237311005592346, "learning_rate": 3.694535073409462e-05, "loss": 0.25, "num_input_tokens_seen": 19572512, "step": 9060 }, { "epoch": 1.4787928221859707, "grad_norm": 0.13851670920848846, "learning_rate": 3.696574225122349e-05, "loss": 0.1832, "num_input_tokens_seen": 19582752, "step": 9065 }, { "epoch": 1.4796084828711256, "grad_norm": 2.338378429412842, "learning_rate": 3.698613376835237e-05, "loss": 0.2086, "num_input_tokens_seen": 19593184, "step": 9070 }, { "epoch": 1.4804241435562806, "grad_norm": 0.34699177742004395, "learning_rate": 3.700652528548124e-05, "loss": 0.0492, "num_input_tokens_seen": 19604768, "step": 9075 }, { "epoch": 1.4812398042414356, "grad_norm": 0.28478381037712097, "learning_rate": 3.7026916802610115e-05, "loss": 0.1214, "num_input_tokens_seen": 19616576, "step": 9080 }, { "epoch": 1.4820554649265905, "grad_norm": 0.16984719038009644, "learning_rate": 3.704730831973899e-05, "loss": 0.0375, "num_input_tokens_seen": 19628384, "step": 9085 }, { "epoch": 1.4828711256117455, "grad_norm": 0.857304036617279, "learning_rate": 3.706769983686786e-05, "loss": 0.1082, "num_input_tokens_seen": 19639136, "step": 9090 }, { "epoch": 1.4836867862969005, "grad_norm": 1.5272735357284546, "learning_rate": 3.708809135399674e-05, "loss": 0.1158, "num_input_tokens_seen": 19649856, "step": 9095 }, { "epoch": 1.4845024469820554, "grad_norm": 0.6329488158226013, "learning_rate": 3.7108482871125617e-05, "loss": 0.1367, "num_input_tokens_seen": 19661120, "step": 9100 }, { "epoch": 1.4853181076672104, "grad_norm": 1.6676065921783447, "learning_rate": 3.7128874388254484e-05, "loss": 0.1453, "num_input_tokens_seen": 19671648, "step": 9105 }, { "epoch": 1.4861337683523654, "grad_norm": 0.2707604765892029, "learning_rate": 3.7149265905383364e-05, "loss": 0.097, "num_input_tokens_seen": 19683584, "step": 9110 }, { "epoch": 1.4869494290375203, "grad_norm": 1.1278218030929565, "learning_rate": 3.716965742251224e-05, "loss": 0.3043, "num_input_tokens_seen": 19694080, "step": 9115 }, { "epoch": 1.4877650897226753, "grad_norm": 1.8807241916656494, "learning_rate": 3.719004893964111e-05, "loss": 0.3482, "num_input_tokens_seen": 19705824, "step": 9120 }, { "epoch": 1.4885807504078303, "grad_norm": 1.7480480670928955, "learning_rate": 3.7210440456769986e-05, "loss": 0.4319, "num_input_tokens_seen": 19717056, "step": 9125 }, { "epoch": 1.4893964110929854, "grad_norm": 0.7434868216514587, "learning_rate": 3.723083197389886e-05, "loss": 0.095, "num_input_tokens_seen": 19728512, "step": 9130 }, { "epoch": 1.4902120717781404, "grad_norm": 1.2574936151504517, "learning_rate": 3.725122349102774e-05, "loss": 0.0945, "num_input_tokens_seen": 19740768, "step": 9135 }, { "epoch": 1.4910277324632952, "grad_norm": 1.8625929355621338, "learning_rate": 3.727161500815661e-05, "loss": 0.2287, "num_input_tokens_seen": 19752576, "step": 9140 }, { "epoch": 1.4918433931484503, "grad_norm": 0.07480208575725555, "learning_rate": 3.729200652528548e-05, "loss": 0.052, "num_input_tokens_seen": 19764064, "step": 9145 }, { "epoch": 1.4926590538336053, "grad_norm": 0.18419574201107025, "learning_rate": 3.731239804241436e-05, "loss": 0.0859, "num_input_tokens_seen": 19776608, "step": 9150 }, { "epoch": 1.4934747145187601, "grad_norm": 0.16575151681900024, "learning_rate": 3.733278955954323e-05, "loss": 0.1957, "num_input_tokens_seen": 19787488, "step": 9155 }, { "epoch": 1.4942903752039152, "grad_norm": 0.6049216985702515, "learning_rate": 3.73531810766721e-05, "loss": 0.2886, "num_input_tokens_seen": 19798240, "step": 9160 }, { "epoch": 1.4951060358890702, "grad_norm": 0.11123808473348618, "learning_rate": 3.737357259380098e-05, "loss": 0.2809, "num_input_tokens_seen": 19808384, "step": 9165 }, { "epoch": 1.495921696574225, "grad_norm": 0.646359920501709, "learning_rate": 3.739396411092986e-05, "loss": 0.1749, "num_input_tokens_seen": 19817696, "step": 9170 }, { "epoch": 1.49673735725938, "grad_norm": 0.6252368688583374, "learning_rate": 3.741435562805873e-05, "loss": 0.1648, "num_input_tokens_seen": 19828384, "step": 9175 }, { "epoch": 1.497553017944535, "grad_norm": 0.2978098392486572, "learning_rate": 3.7434747145187605e-05, "loss": 0.1234, "num_input_tokens_seen": 19840704, "step": 9180 }, { "epoch": 1.49836867862969, "grad_norm": 0.6464101672172546, "learning_rate": 3.745513866231648e-05, "loss": 0.1561, "num_input_tokens_seen": 19851648, "step": 9185 }, { "epoch": 1.499184339314845, "grad_norm": 2.3124911785125732, "learning_rate": 3.747553017944535e-05, "loss": 0.1541, "num_input_tokens_seen": 19861568, "step": 9190 }, { "epoch": 1.5, "grad_norm": 0.1085851788520813, "learning_rate": 3.7495921696574226e-05, "loss": 0.0997, "num_input_tokens_seen": 19871232, "step": 9195 }, { "epoch": 1.5008156606851548, "grad_norm": 1.599067211151123, "learning_rate": 3.75163132137031e-05, "loss": 0.066, "num_input_tokens_seen": 19882400, "step": 9200 }, { "epoch": 1.50163132137031, "grad_norm": 0.5081494450569153, "learning_rate": 3.7536704730831974e-05, "loss": 0.1624, "num_input_tokens_seen": 19893824, "step": 9205 }, { "epoch": 1.502446982055465, "grad_norm": 0.17357273399829865, "learning_rate": 3.755709624796085e-05, "loss": 0.1098, "num_input_tokens_seen": 19905280, "step": 9210 }, { "epoch": 1.50326264274062, "grad_norm": 0.5352590680122375, "learning_rate": 3.757748776508973e-05, "loss": 0.0637, "num_input_tokens_seen": 19916032, "step": 9215 }, { "epoch": 1.504078303425775, "grad_norm": 0.4387705624103546, "learning_rate": 3.75978792822186e-05, "loss": 0.1027, "num_input_tokens_seen": 19927232, "step": 9220 }, { "epoch": 1.5048939641109298, "grad_norm": 0.8720506429672241, "learning_rate": 3.761827079934747e-05, "loss": 0.0458, "num_input_tokens_seen": 19936992, "step": 9225 }, { "epoch": 1.5057096247960848, "grad_norm": 0.6533507108688354, "learning_rate": 3.763866231647635e-05, "loss": 0.2013, "num_input_tokens_seen": 19948256, "step": 9230 }, { "epoch": 1.5065252854812399, "grad_norm": 0.8121970891952515, "learning_rate": 3.765905383360522e-05, "loss": 0.0971, "num_input_tokens_seen": 19960416, "step": 9235 }, { "epoch": 1.5073409461663947, "grad_norm": 1.0240153074264526, "learning_rate": 3.76794453507341e-05, "loss": 0.1191, "num_input_tokens_seen": 19970656, "step": 9240 }, { "epoch": 1.5081566068515497, "grad_norm": 0.06773070991039276, "learning_rate": 3.769983686786297e-05, "loss": 0.1137, "num_input_tokens_seen": 19982176, "step": 9245 }, { "epoch": 1.5089722675367048, "grad_norm": 1.9186370372772217, "learning_rate": 3.7720228384991845e-05, "loss": 0.2707, "num_input_tokens_seen": 19991968, "step": 9250 }, { "epoch": 1.5097879282218596, "grad_norm": 1.5791863203048706, "learning_rate": 3.774061990212072e-05, "loss": 0.2293, "num_input_tokens_seen": 20003968, "step": 9255 }, { "epoch": 1.5106035889070146, "grad_norm": 1.3195111751556396, "learning_rate": 3.776101141924959e-05, "loss": 0.1154, "num_input_tokens_seen": 20014784, "step": 9260 }, { "epoch": 1.5114192495921697, "grad_norm": 0.5152102708816528, "learning_rate": 3.7781402936378466e-05, "loss": 0.2743, "num_input_tokens_seen": 20024192, "step": 9265 }, { "epoch": 1.5122349102773245, "grad_norm": 0.29359573125839233, "learning_rate": 3.780179445350735e-05, "loss": 0.0735, "num_input_tokens_seen": 20034368, "step": 9270 }, { "epoch": 1.5130505709624797, "grad_norm": 3.320225477218628, "learning_rate": 3.7822185970636214e-05, "loss": 0.1902, "num_input_tokens_seen": 20045216, "step": 9275 }, { "epoch": 1.5138662316476346, "grad_norm": 0.3211157023906708, "learning_rate": 3.784257748776509e-05, "loss": 0.1243, "num_input_tokens_seen": 20056928, "step": 9280 }, { "epoch": 1.5146818923327896, "grad_norm": 2.0455915927886963, "learning_rate": 3.786296900489397e-05, "loss": 0.1927, "num_input_tokens_seen": 20067520, "step": 9285 }, { "epoch": 1.5154975530179446, "grad_norm": 0.14136485755443573, "learning_rate": 3.788336052202284e-05, "loss": 0.0503, "num_input_tokens_seen": 20077568, "step": 9290 }, { "epoch": 1.5163132137030995, "grad_norm": 0.6677345037460327, "learning_rate": 3.7903752039151716e-05, "loss": 0.3044, "num_input_tokens_seen": 20087584, "step": 9295 }, { "epoch": 1.5171288743882545, "grad_norm": 0.5407724976539612, "learning_rate": 3.792414355628059e-05, "loss": 0.1389, "num_input_tokens_seen": 20098880, "step": 9300 }, { "epoch": 1.5179445350734095, "grad_norm": 2.961512327194214, "learning_rate": 3.7944535073409464e-05, "loss": 0.2312, "num_input_tokens_seen": 20109568, "step": 9305 }, { "epoch": 1.5187601957585644, "grad_norm": 0.38681188225746155, "learning_rate": 3.796492659053834e-05, "loss": 0.0739, "num_input_tokens_seen": 20121184, "step": 9310 }, { "epoch": 1.5195758564437194, "grad_norm": 0.36204129457473755, "learning_rate": 3.798531810766721e-05, "loss": 0.0952, "num_input_tokens_seen": 20132608, "step": 9315 }, { "epoch": 1.5203915171288744, "grad_norm": 1.1493598222732544, "learning_rate": 3.8005709624796085e-05, "loss": 0.0573, "num_input_tokens_seen": 20143200, "step": 9320 }, { "epoch": 1.5212071778140293, "grad_norm": 0.04527086019515991, "learning_rate": 3.8026101141924966e-05, "loss": 0.1181, "num_input_tokens_seen": 20153344, "step": 9325 }, { "epoch": 1.5220228384991843, "grad_norm": 0.13256825506687164, "learning_rate": 3.804649265905383e-05, "loss": 0.1642, "num_input_tokens_seen": 20164288, "step": 9330 }, { "epoch": 1.5228384991843393, "grad_norm": 0.3433194160461426, "learning_rate": 3.8066884176182707e-05, "loss": 0.1283, "num_input_tokens_seen": 20174880, "step": 9335 }, { "epoch": 1.5236541598694942, "grad_norm": 0.23138941824436188, "learning_rate": 3.808727569331159e-05, "loss": 0.1233, "num_input_tokens_seen": 20184960, "step": 9340 }, { "epoch": 1.5244698205546494, "grad_norm": 0.3837876319885254, "learning_rate": 3.8107667210440454e-05, "loss": 0.0875, "num_input_tokens_seen": 20194464, "step": 9345 }, { "epoch": 1.5252854812398042, "grad_norm": 0.31076908111572266, "learning_rate": 3.8128058727569335e-05, "loss": 0.3648, "num_input_tokens_seen": 20204352, "step": 9350 }, { "epoch": 1.5261011419249593, "grad_norm": 0.3746667802333832, "learning_rate": 3.814845024469821e-05, "loss": 0.0887, "num_input_tokens_seen": 20214816, "step": 9355 }, { "epoch": 1.5269168026101143, "grad_norm": 2.557605504989624, "learning_rate": 3.816884176182708e-05, "loss": 0.2532, "num_input_tokens_seen": 20225664, "step": 9360 }, { "epoch": 1.5277324632952691, "grad_norm": 0.9144682288169861, "learning_rate": 3.8189233278955956e-05, "loss": 0.077, "num_input_tokens_seen": 20236672, "step": 9365 }, { "epoch": 1.5285481239804242, "grad_norm": 1.6879842281341553, "learning_rate": 3.820962479608483e-05, "loss": 0.2287, "num_input_tokens_seen": 20246784, "step": 9370 }, { "epoch": 1.5293637846655792, "grad_norm": 0.2349882572889328, "learning_rate": 3.8230016313213704e-05, "loss": 0.0945, "num_input_tokens_seen": 20257088, "step": 9375 }, { "epoch": 1.530179445350734, "grad_norm": 0.1049012541770935, "learning_rate": 3.825040783034258e-05, "loss": 0.0428, "num_input_tokens_seen": 20267520, "step": 9380 }, { "epoch": 1.530995106035889, "grad_norm": 0.23903314769268036, "learning_rate": 3.827079934747145e-05, "loss": 0.0586, "num_input_tokens_seen": 20277568, "step": 9385 }, { "epoch": 1.531810766721044, "grad_norm": 0.8165331482887268, "learning_rate": 3.829119086460033e-05, "loss": 0.0927, "num_input_tokens_seen": 20288768, "step": 9390 }, { "epoch": 1.532626427406199, "grad_norm": 0.6797024011611938, "learning_rate": 3.8311582381729206e-05, "loss": 0.1433, "num_input_tokens_seen": 20299968, "step": 9395 }, { "epoch": 1.533442088091354, "grad_norm": 1.4883137941360474, "learning_rate": 3.833197389885807e-05, "loss": 0.3119, "num_input_tokens_seen": 20310592, "step": 9400 }, { "epoch": 1.534257748776509, "grad_norm": 1.7952152490615845, "learning_rate": 3.8352365415986954e-05, "loss": 0.377, "num_input_tokens_seen": 20321088, "step": 9405 }, { "epoch": 1.5350734094616638, "grad_norm": 0.1689329296350479, "learning_rate": 3.837275693311583e-05, "loss": 0.1631, "num_input_tokens_seen": 20331136, "step": 9410 }, { "epoch": 1.535889070146819, "grad_norm": 1.3064274787902832, "learning_rate": 3.83931484502447e-05, "loss": 0.1905, "num_input_tokens_seen": 20343136, "step": 9415 }, { "epoch": 1.536704730831974, "grad_norm": 0.3735467195510864, "learning_rate": 3.8413539967373575e-05, "loss": 0.1765, "num_input_tokens_seen": 20354656, "step": 9420 }, { "epoch": 1.5375203915171287, "grad_norm": 0.20951536297798157, "learning_rate": 3.843393148450245e-05, "loss": 0.033, "num_input_tokens_seen": 20365856, "step": 9425 }, { "epoch": 1.538336052202284, "grad_norm": 1.783406376838684, "learning_rate": 3.845432300163132e-05, "loss": 0.3068, "num_input_tokens_seen": 20376064, "step": 9430 }, { "epoch": 1.5391517128874388, "grad_norm": 0.8534454107284546, "learning_rate": 3.8474714518760197e-05, "loss": 0.0973, "num_input_tokens_seen": 20387104, "step": 9435 }, { "epoch": 1.5399673735725938, "grad_norm": 1.6619659662246704, "learning_rate": 3.849510603588907e-05, "loss": 0.1271, "num_input_tokens_seen": 20398112, "step": 9440 }, { "epoch": 1.5407830342577489, "grad_norm": 0.5054094791412354, "learning_rate": 3.851549755301795e-05, "loss": 0.0701, "num_input_tokens_seen": 20409152, "step": 9445 }, { "epoch": 1.5415986949429037, "grad_norm": 0.4495157301425934, "learning_rate": 3.853588907014682e-05, "loss": 0.108, "num_input_tokens_seen": 20420704, "step": 9450 }, { "epoch": 1.5424143556280587, "grad_norm": 0.6414544582366943, "learning_rate": 3.855628058727569e-05, "loss": 0.1217, "num_input_tokens_seen": 20431456, "step": 9455 }, { "epoch": 1.5432300163132138, "grad_norm": 0.8792763352394104, "learning_rate": 3.857667210440457e-05, "loss": 0.1772, "num_input_tokens_seen": 20442880, "step": 9460 }, { "epoch": 1.5440456769983686, "grad_norm": 0.9698439836502075, "learning_rate": 3.859706362153344e-05, "loss": 0.0551, "num_input_tokens_seen": 20454624, "step": 9465 }, { "epoch": 1.5448613376835236, "grad_norm": 0.8407248854637146, "learning_rate": 3.861745513866232e-05, "loss": 0.1573, "num_input_tokens_seen": 20465824, "step": 9470 }, { "epoch": 1.5456769983686787, "grad_norm": 0.2501432001590729, "learning_rate": 3.8637846655791194e-05, "loss": 0.161, "num_input_tokens_seen": 20477088, "step": 9475 }, { "epoch": 1.5464926590538335, "grad_norm": 0.03376385569572449, "learning_rate": 3.865823817292007e-05, "loss": 0.0447, "num_input_tokens_seen": 20487296, "step": 9480 }, { "epoch": 1.5473083197389887, "grad_norm": 0.16299869120121002, "learning_rate": 3.867862969004894e-05, "loss": 0.183, "num_input_tokens_seen": 20498080, "step": 9485 }, { "epoch": 1.5481239804241436, "grad_norm": 1.555908441543579, "learning_rate": 3.8699021207177815e-05, "loss": 0.2425, "num_input_tokens_seen": 20509824, "step": 9490 }, { "epoch": 1.5489396411092984, "grad_norm": 0.6382370591163635, "learning_rate": 3.871941272430669e-05, "loss": 0.3087, "num_input_tokens_seen": 20520928, "step": 9495 }, { "epoch": 1.5497553017944536, "grad_norm": 1.0381131172180176, "learning_rate": 3.873980424143556e-05, "loss": 0.2528, "num_input_tokens_seen": 20532288, "step": 9500 }, { "epoch": 1.5505709624796085, "grad_norm": 0.5706748962402344, "learning_rate": 3.876019575856444e-05, "loss": 0.2697, "num_input_tokens_seen": 20543456, "step": 9505 }, { "epoch": 1.5513866231647635, "grad_norm": 0.8308067917823792, "learning_rate": 3.878058727569332e-05, "loss": 0.0949, "num_input_tokens_seen": 20554592, "step": 9510 }, { "epoch": 1.5522022838499185, "grad_norm": 0.9888598322868347, "learning_rate": 3.880097879282219e-05, "loss": 0.125, "num_input_tokens_seen": 20565504, "step": 9515 }, { "epoch": 1.5530179445350734, "grad_norm": 1.493121862411499, "learning_rate": 3.882137030995106e-05, "loss": 0.3161, "num_input_tokens_seen": 20576896, "step": 9520 }, { "epoch": 1.5538336052202284, "grad_norm": 0.936271607875824, "learning_rate": 3.884176182707994e-05, "loss": 0.1203, "num_input_tokens_seen": 20586720, "step": 9525 }, { "epoch": 1.5546492659053834, "grad_norm": 2.0225460529327393, "learning_rate": 3.886215334420881e-05, "loss": 0.1557, "num_input_tokens_seen": 20597376, "step": 9530 }, { "epoch": 1.5554649265905383, "grad_norm": 1.153343915939331, "learning_rate": 3.888254486133768e-05, "loss": 0.0851, "num_input_tokens_seen": 20608608, "step": 9535 }, { "epoch": 1.5562805872756933, "grad_norm": 0.6486978530883789, "learning_rate": 3.890293637846656e-05, "loss": 0.1125, "num_input_tokens_seen": 20620128, "step": 9540 }, { "epoch": 1.5570962479608483, "grad_norm": 0.12363860011100769, "learning_rate": 3.8923327895595434e-05, "loss": 0.0732, "num_input_tokens_seen": 20631200, "step": 9545 }, { "epoch": 1.5579119086460032, "grad_norm": 0.08781436085700989, "learning_rate": 3.894371941272431e-05, "loss": 0.0626, "num_input_tokens_seen": 20640352, "step": 9550 }, { "epoch": 1.5587275693311582, "grad_norm": 1.4872766733169556, "learning_rate": 3.896411092985318e-05, "loss": 0.2932, "num_input_tokens_seen": 20652832, "step": 9555 }, { "epoch": 1.5595432300163132, "grad_norm": 0.5830636024475098, "learning_rate": 3.8984502446982056e-05, "loss": 0.0919, "num_input_tokens_seen": 20663488, "step": 9560 }, { "epoch": 1.560358890701468, "grad_norm": 0.2692667245864868, "learning_rate": 3.9004893964110936e-05, "loss": 0.0674, "num_input_tokens_seen": 20674464, "step": 9565 }, { "epoch": 1.5611745513866233, "grad_norm": 0.46099594235420227, "learning_rate": 3.90252854812398e-05, "loss": 0.1431, "num_input_tokens_seen": 20685760, "step": 9570 }, { "epoch": 1.5619902120717781, "grad_norm": 3.147345542907715, "learning_rate": 3.904567699836868e-05, "loss": 0.2702, "num_input_tokens_seen": 20697280, "step": 9575 }, { "epoch": 1.5628058727569332, "grad_norm": 0.05783794820308685, "learning_rate": 3.906606851549756e-05, "loss": 0.1327, "num_input_tokens_seen": 20708000, "step": 9580 }, { "epoch": 1.5636215334420882, "grad_norm": 0.3613324463367462, "learning_rate": 3.908646003262643e-05, "loss": 0.1236, "num_input_tokens_seen": 20719424, "step": 9585 }, { "epoch": 1.564437194127243, "grad_norm": 1.694031000137329, "learning_rate": 3.9106851549755305e-05, "loss": 0.1976, "num_input_tokens_seen": 20728096, "step": 9590 }, { "epoch": 1.565252854812398, "grad_norm": 0.9047970771789551, "learning_rate": 3.912724306688418e-05, "loss": 0.2308, "num_input_tokens_seen": 20738816, "step": 9595 }, { "epoch": 1.566068515497553, "grad_norm": 0.18730683624744415, "learning_rate": 3.914763458401305e-05, "loss": 0.1514, "num_input_tokens_seen": 20749408, "step": 9600 }, { "epoch": 1.566884176182708, "grad_norm": 0.5730803608894348, "learning_rate": 3.916802610114193e-05, "loss": 0.0533, "num_input_tokens_seen": 20759200, "step": 9605 }, { "epoch": 1.567699836867863, "grad_norm": 0.5699712038040161, "learning_rate": 3.91884176182708e-05, "loss": 0.0987, "num_input_tokens_seen": 20770560, "step": 9610 }, { "epoch": 1.568515497553018, "grad_norm": 0.7310659289360046, "learning_rate": 3.9208809135399675e-05, "loss": 0.1707, "num_input_tokens_seen": 20781056, "step": 9615 }, { "epoch": 1.5693311582381728, "grad_norm": 1.354735016822815, "learning_rate": 3.9229200652528555e-05, "loss": 0.0905, "num_input_tokens_seen": 20791648, "step": 9620 }, { "epoch": 1.5701468189233279, "grad_norm": 0.9189955592155457, "learning_rate": 3.924959216965742e-05, "loss": 0.183, "num_input_tokens_seen": 20804128, "step": 9625 }, { "epoch": 1.570962479608483, "grad_norm": 1.6348750591278076, "learning_rate": 3.9269983686786296e-05, "loss": 0.1323, "num_input_tokens_seen": 20814752, "step": 9630 }, { "epoch": 1.5717781402936377, "grad_norm": 0.7113264203071594, "learning_rate": 3.929037520391518e-05, "loss": 0.1698, "num_input_tokens_seen": 20826720, "step": 9635 }, { "epoch": 1.572593800978793, "grad_norm": 0.5695316195487976, "learning_rate": 3.9310766721044044e-05, "loss": 0.073, "num_input_tokens_seen": 20837536, "step": 9640 }, { "epoch": 1.5734094616639478, "grad_norm": 2.2033629417419434, "learning_rate": 3.9331158238172924e-05, "loss": 0.1632, "num_input_tokens_seen": 20847744, "step": 9645 }, { "epoch": 1.5742251223491026, "grad_norm": 1.5300705432891846, "learning_rate": 3.93515497553018e-05, "loss": 0.3441, "num_input_tokens_seen": 20859136, "step": 9650 }, { "epoch": 1.5750407830342579, "grad_norm": 2.39156174659729, "learning_rate": 3.9371941272430665e-05, "loss": 0.2051, "num_input_tokens_seen": 20868832, "step": 9655 }, { "epoch": 1.5758564437194127, "grad_norm": 0.6496899724006653, "learning_rate": 3.9392332789559546e-05, "loss": 0.1089, "num_input_tokens_seen": 20878080, "step": 9660 }, { "epoch": 1.5766721044045677, "grad_norm": 0.16704070568084717, "learning_rate": 3.941272430668842e-05, "loss": 0.1534, "num_input_tokens_seen": 20889600, "step": 9665 }, { "epoch": 1.5774877650897228, "grad_norm": 1.2648061513900757, "learning_rate": 3.943311582381729e-05, "loss": 0.2161, "num_input_tokens_seen": 20899392, "step": 9670 }, { "epoch": 1.5783034257748776, "grad_norm": 0.5666310787200928, "learning_rate": 3.945350734094617e-05, "loss": 0.2034, "num_input_tokens_seen": 20909888, "step": 9675 }, { "epoch": 1.5791190864600326, "grad_norm": 1.4375687837600708, "learning_rate": 3.947389885807504e-05, "loss": 0.069, "num_input_tokens_seen": 20920032, "step": 9680 }, { "epoch": 1.5799347471451877, "grad_norm": 0.07810524106025696, "learning_rate": 3.949429037520392e-05, "loss": 0.2127, "num_input_tokens_seen": 20931136, "step": 9685 }, { "epoch": 1.5807504078303425, "grad_norm": 0.0691853016614914, "learning_rate": 3.951468189233279e-05, "loss": 0.0865, "num_input_tokens_seen": 20941792, "step": 9690 }, { "epoch": 1.5815660685154975, "grad_norm": 0.9178061485290527, "learning_rate": 3.953507340946166e-05, "loss": 0.1366, "num_input_tokens_seen": 20952672, "step": 9695 }, { "epoch": 1.5823817292006526, "grad_norm": 0.37220901250839233, "learning_rate": 3.955546492659054e-05, "loss": 0.0569, "num_input_tokens_seen": 20964480, "step": 9700 }, { "epoch": 1.5831973898858074, "grad_norm": 0.4019668400287628, "learning_rate": 3.957585644371942e-05, "loss": 0.0638, "num_input_tokens_seen": 20975392, "step": 9705 }, { "epoch": 1.5840130505709626, "grad_norm": 0.20445969700813293, "learning_rate": 3.9596247960848284e-05, "loss": 0.1619, "num_input_tokens_seen": 20985344, "step": 9710 }, { "epoch": 1.5848287112561175, "grad_norm": 0.9215459227561951, "learning_rate": 3.9616639477977165e-05, "loss": 0.2119, "num_input_tokens_seen": 20995456, "step": 9715 }, { "epoch": 1.5856443719412723, "grad_norm": 0.2618297338485718, "learning_rate": 3.963703099510604e-05, "loss": 0.0604, "num_input_tokens_seen": 21006816, "step": 9720 }, { "epoch": 1.5864600326264275, "grad_norm": 1.6102073192596436, "learning_rate": 3.965742251223491e-05, "loss": 0.1973, "num_input_tokens_seen": 21017344, "step": 9725 }, { "epoch": 1.5872756933115824, "grad_norm": 0.527043342590332, "learning_rate": 3.9677814029363786e-05, "loss": 0.0998, "num_input_tokens_seen": 21027040, "step": 9730 }, { "epoch": 1.5880913539967374, "grad_norm": 0.7275537848472595, "learning_rate": 3.969820554649266e-05, "loss": 0.078, "num_input_tokens_seen": 21037472, "step": 9735 }, { "epoch": 1.5889070146818924, "grad_norm": 2.6754696369171143, "learning_rate": 3.971859706362154e-05, "loss": 0.2823, "num_input_tokens_seen": 21048800, "step": 9740 }, { "epoch": 1.5897226753670473, "grad_norm": 1.3964941501617432, "learning_rate": 3.973898858075041e-05, "loss": 0.2832, "num_input_tokens_seen": 21060672, "step": 9745 }, { "epoch": 1.5905383360522023, "grad_norm": 0.16193002462387085, "learning_rate": 3.975938009787928e-05, "loss": 0.2739, "num_input_tokens_seen": 21070880, "step": 9750 }, { "epoch": 1.5913539967373573, "grad_norm": 1.1681309938430786, "learning_rate": 3.977977161500816e-05, "loss": 0.3042, "num_input_tokens_seen": 21081792, "step": 9755 }, { "epoch": 1.5921696574225122, "grad_norm": 0.1931799054145813, "learning_rate": 3.980016313213703e-05, "loss": 0.0901, "num_input_tokens_seen": 21093280, "step": 9760 }, { "epoch": 1.5929853181076672, "grad_norm": 0.8570016622543335, "learning_rate": 3.982055464926591e-05, "loss": 0.0978, "num_input_tokens_seen": 21103264, "step": 9765 }, { "epoch": 1.5938009787928222, "grad_norm": 0.2233196347951889, "learning_rate": 3.9840946166394783e-05, "loss": 0.1632, "num_input_tokens_seen": 21113632, "step": 9770 }, { "epoch": 1.594616639477977, "grad_norm": 1.4208407402038574, "learning_rate": 3.986133768352366e-05, "loss": 0.2016, "num_input_tokens_seen": 21124448, "step": 9775 }, { "epoch": 1.595432300163132, "grad_norm": 0.8554823398590088, "learning_rate": 3.988172920065253e-05, "loss": 0.1153, "num_input_tokens_seen": 21135776, "step": 9780 }, { "epoch": 1.5962479608482871, "grad_norm": 0.2620565593242645, "learning_rate": 3.9902120717781405e-05, "loss": 0.2278, "num_input_tokens_seen": 21147072, "step": 9785 }, { "epoch": 1.597063621533442, "grad_norm": 0.2595762610435486, "learning_rate": 3.992251223491028e-05, "loss": 0.13, "num_input_tokens_seen": 21158848, "step": 9790 }, { "epoch": 1.5978792822185972, "grad_norm": 0.8207564949989319, "learning_rate": 3.994290375203915e-05, "loss": 0.1124, "num_input_tokens_seen": 21168448, "step": 9795 }, { "epoch": 1.598694942903752, "grad_norm": 2.127171039581299, "learning_rate": 3.9963295269168026e-05, "loss": 0.1961, "num_input_tokens_seen": 21179744, "step": 9800 }, { "epoch": 1.599510603588907, "grad_norm": 1.5096732378005981, "learning_rate": 3.99836867862969e-05, "loss": 0.1815, "num_input_tokens_seen": 21191904, "step": 9805 }, { "epoch": 1.600326264274062, "grad_norm": 0.21833635866641998, "learning_rate": 4.000407830342578e-05, "loss": 0.1191, "num_input_tokens_seen": 21202848, "step": 9810 }, { "epoch": 1.601141924959217, "grad_norm": 0.20092487335205078, "learning_rate": 4.002446982055465e-05, "loss": 0.087, "num_input_tokens_seen": 21214304, "step": 9815 }, { "epoch": 1.601957585644372, "grad_norm": 0.4256657361984253, "learning_rate": 4.004486133768353e-05, "loss": 0.1194, "num_input_tokens_seen": 21224448, "step": 9820 }, { "epoch": 1.602773246329527, "grad_norm": 2.298814296722412, "learning_rate": 4.00652528548124e-05, "loss": 0.2527, "num_input_tokens_seen": 21235392, "step": 9825 }, { "epoch": 1.6035889070146818, "grad_norm": 0.9622877836227417, "learning_rate": 4.008564437194127e-05, "loss": 0.1847, "num_input_tokens_seen": 21246656, "step": 9830 }, { "epoch": 1.6044045676998369, "grad_norm": 0.7709288597106934, "learning_rate": 4.010603588907015e-05, "loss": 0.0889, "num_input_tokens_seen": 21257376, "step": 9835 }, { "epoch": 1.605220228384992, "grad_norm": 0.7792813777923584, "learning_rate": 4.0126427406199024e-05, "loss": 0.1306, "num_input_tokens_seen": 21267456, "step": 9840 }, { "epoch": 1.6060358890701467, "grad_norm": 0.9451857805252075, "learning_rate": 4.01468189233279e-05, "loss": 0.1485, "num_input_tokens_seen": 21279104, "step": 9845 }, { "epoch": 1.6068515497553018, "grad_norm": 0.17714758217334747, "learning_rate": 4.016721044045677e-05, "loss": 0.2217, "num_input_tokens_seen": 21291328, "step": 9850 }, { "epoch": 1.6076672104404568, "grad_norm": 0.5818141102790833, "learning_rate": 4.0187601957585645e-05, "loss": 0.0993, "num_input_tokens_seen": 21302016, "step": 9855 }, { "epoch": 1.6084828711256116, "grad_norm": 0.35444435477256775, "learning_rate": 4.0207993474714526e-05, "loss": 0.1084, "num_input_tokens_seen": 21312448, "step": 9860 }, { "epoch": 1.6092985318107669, "grad_norm": 0.96122145652771, "learning_rate": 4.022838499184339e-05, "loss": 0.1054, "num_input_tokens_seen": 21323136, "step": 9865 }, { "epoch": 1.6101141924959217, "grad_norm": 0.7203935980796814, "learning_rate": 4.024877650897227e-05, "loss": 0.088, "num_input_tokens_seen": 21334368, "step": 9870 }, { "epoch": 1.6109298531810765, "grad_norm": 1.177601933479309, "learning_rate": 4.026916802610115e-05, "loss": 0.1754, "num_input_tokens_seen": 21344992, "step": 9875 }, { "epoch": 1.6117455138662318, "grad_norm": 0.29670244455337524, "learning_rate": 4.0289559543230014e-05, "loss": 0.1205, "num_input_tokens_seen": 21354688, "step": 9880 }, { "epoch": 1.6125611745513866, "grad_norm": 2.259653329849243, "learning_rate": 4.030995106035889e-05, "loss": 0.2959, "num_input_tokens_seen": 21365696, "step": 9885 }, { "epoch": 1.6133768352365416, "grad_norm": 1.8908021450042725, "learning_rate": 4.033034257748777e-05, "loss": 0.2806, "num_input_tokens_seen": 21378240, "step": 9890 }, { "epoch": 1.6141924959216967, "grad_norm": 2.232450246810913, "learning_rate": 4.035073409461664e-05, "loss": 0.4726, "num_input_tokens_seen": 21388992, "step": 9895 }, { "epoch": 1.6150081566068515, "grad_norm": 0.5141802430152893, "learning_rate": 4.0371125611745516e-05, "loss": 0.1748, "num_input_tokens_seen": 21399808, "step": 9900 }, { "epoch": 1.6158238172920065, "grad_norm": 1.1142812967300415, "learning_rate": 4.039151712887439e-05, "loss": 0.0775, "num_input_tokens_seen": 21409696, "step": 9905 }, { "epoch": 1.6166394779771616, "grad_norm": 0.11027383804321289, "learning_rate": 4.0411908646003264e-05, "loss": 0.1137, "num_input_tokens_seen": 21420416, "step": 9910 }, { "epoch": 1.6174551386623164, "grad_norm": 2.051628828048706, "learning_rate": 4.043230016313214e-05, "loss": 0.4027, "num_input_tokens_seen": 21431648, "step": 9915 }, { "epoch": 1.6182707993474714, "grad_norm": 0.7029932737350464, "learning_rate": 4.045269168026101e-05, "loss": 0.2534, "num_input_tokens_seen": 21441856, "step": 9920 }, { "epoch": 1.6190864600326265, "grad_norm": 0.9705846905708313, "learning_rate": 4.0473083197389885e-05, "loss": 0.2018, "num_input_tokens_seen": 21452736, "step": 9925 }, { "epoch": 1.6199021207177813, "grad_norm": 0.1701372116804123, "learning_rate": 4.0493474714518766e-05, "loss": 0.1119, "num_input_tokens_seen": 21463552, "step": 9930 }, { "epoch": 1.6207177814029365, "grad_norm": 1.4758479595184326, "learning_rate": 4.051386623164763e-05, "loss": 0.209, "num_input_tokens_seen": 21475136, "step": 9935 }, { "epoch": 1.6215334420880914, "grad_norm": 0.04265560954809189, "learning_rate": 4.0534257748776514e-05, "loss": 0.1882, "num_input_tokens_seen": 21485536, "step": 9940 }, { "epoch": 1.6223491027732462, "grad_norm": 0.13012133538722992, "learning_rate": 4.055464926590539e-05, "loss": 0.0954, "num_input_tokens_seen": 21496320, "step": 9945 }, { "epoch": 1.6231647634584014, "grad_norm": 0.8822688460350037, "learning_rate": 4.0575040783034255e-05, "loss": 0.112, "num_input_tokens_seen": 21506688, "step": 9950 }, { "epoch": 1.6239804241435563, "grad_norm": 1.7504024505615234, "learning_rate": 4.0595432300163135e-05, "loss": 0.3269, "num_input_tokens_seen": 21517184, "step": 9955 }, { "epoch": 1.6247960848287113, "grad_norm": 0.3862159252166748, "learning_rate": 4.061582381729201e-05, "loss": 0.0922, "num_input_tokens_seen": 21527104, "step": 9960 }, { "epoch": 1.6256117455138663, "grad_norm": 2.2891268730163574, "learning_rate": 4.063621533442088e-05, "loss": 0.2985, "num_input_tokens_seen": 21537024, "step": 9965 }, { "epoch": 1.6264274061990212, "grad_norm": 2.1083755493164062, "learning_rate": 4.065660685154976e-05, "loss": 0.1223, "num_input_tokens_seen": 21547648, "step": 9970 }, { "epoch": 1.6272430668841762, "grad_norm": 1.702569603919983, "learning_rate": 4.067699836867863e-05, "loss": 0.1797, "num_input_tokens_seen": 21558944, "step": 9975 }, { "epoch": 1.6280587275693312, "grad_norm": 2.6267049312591553, "learning_rate": 4.0697389885807504e-05, "loss": 0.3469, "num_input_tokens_seen": 21569408, "step": 9980 }, { "epoch": 1.628874388254486, "grad_norm": 1.0059809684753418, "learning_rate": 4.071778140293638e-05, "loss": 0.1511, "num_input_tokens_seen": 21580384, "step": 9985 }, { "epoch": 1.629690048939641, "grad_norm": 0.8725573420524597, "learning_rate": 4.073817292006525e-05, "loss": 0.1061, "num_input_tokens_seen": 21591328, "step": 9990 }, { "epoch": 1.6305057096247961, "grad_norm": 0.7223463654518127, "learning_rate": 4.075856443719413e-05, "loss": 0.0906, "num_input_tokens_seen": 21603456, "step": 9995 }, { "epoch": 1.631321370309951, "grad_norm": 0.4944842755794525, "learning_rate": 4.0778955954323006e-05, "loss": 0.2613, "num_input_tokens_seen": 21613248, "step": 10000 }, { "epoch": 1.632137030995106, "grad_norm": 0.4315822422504425, "learning_rate": 4.079934747145187e-05, "loss": 0.076, "num_input_tokens_seen": 21624256, "step": 10005 }, { "epoch": 1.632952691680261, "grad_norm": 0.486806184053421, "learning_rate": 4.0819738988580754e-05, "loss": 0.0982, "num_input_tokens_seen": 21636032, "step": 10010 }, { "epoch": 1.6337683523654158, "grad_norm": 0.9303222298622131, "learning_rate": 4.084013050570963e-05, "loss": 0.1157, "num_input_tokens_seen": 21647264, "step": 10015 }, { "epoch": 1.634584013050571, "grad_norm": 1.7905093431472778, "learning_rate": 4.08605220228385e-05, "loss": 0.1268, "num_input_tokens_seen": 21658496, "step": 10020 }, { "epoch": 1.635399673735726, "grad_norm": 0.72425377368927, "learning_rate": 4.0880913539967375e-05, "loss": 0.178, "num_input_tokens_seen": 21669248, "step": 10025 }, { "epoch": 1.636215334420881, "grad_norm": 0.946662962436676, "learning_rate": 4.090130505709625e-05, "loss": 0.1581, "num_input_tokens_seen": 21679840, "step": 10030 }, { "epoch": 1.637030995106036, "grad_norm": 0.2080681025981903, "learning_rate": 4.092169657422513e-05, "loss": 0.1661, "num_input_tokens_seen": 21691904, "step": 10035 }, { "epoch": 1.6378466557911908, "grad_norm": 0.39395368099212646, "learning_rate": 4.0942088091354e-05, "loss": 0.0601, "num_input_tokens_seen": 21701408, "step": 10040 }, { "epoch": 1.6386623164763459, "grad_norm": 0.40419185161590576, "learning_rate": 4.096247960848287e-05, "loss": 0.0576, "num_input_tokens_seen": 21712256, "step": 10045 }, { "epoch": 1.639477977161501, "grad_norm": 0.47006332874298096, "learning_rate": 4.098287112561175e-05, "loss": 0.1042, "num_input_tokens_seen": 21722304, "step": 10050 }, { "epoch": 1.6402936378466557, "grad_norm": 0.4128117561340332, "learning_rate": 4.100326264274062e-05, "loss": 0.0928, "num_input_tokens_seen": 21733152, "step": 10055 }, { "epoch": 1.6411092985318108, "grad_norm": 0.6930370926856995, "learning_rate": 4.10236541598695e-05, "loss": 0.151, "num_input_tokens_seen": 21744160, "step": 10060 }, { "epoch": 1.6419249592169658, "grad_norm": 0.8039340376853943, "learning_rate": 4.104404567699837e-05, "loss": 0.0743, "num_input_tokens_seen": 21754432, "step": 10065 }, { "epoch": 1.6427406199021206, "grad_norm": 0.14282670617103577, "learning_rate": 4.106443719412725e-05, "loss": 0.0348, "num_input_tokens_seen": 21764928, "step": 10070 }, { "epoch": 1.6435562805872757, "grad_norm": 1.6168749332427979, "learning_rate": 4.108482871125612e-05, "loss": 0.2703, "num_input_tokens_seen": 21775008, "step": 10075 }, { "epoch": 1.6443719412724307, "grad_norm": 2.1672070026397705, "learning_rate": 4.1105220228384994e-05, "loss": 0.1582, "num_input_tokens_seen": 21785792, "step": 10080 }, { "epoch": 1.6451876019575855, "grad_norm": 0.21262481808662415, "learning_rate": 4.112561174551387e-05, "loss": 0.1428, "num_input_tokens_seen": 21796704, "step": 10085 }, { "epoch": 1.6460032626427408, "grad_norm": 0.058420270681381226, "learning_rate": 4.114600326264274e-05, "loss": 0.301, "num_input_tokens_seen": 21807840, "step": 10090 }, { "epoch": 1.6468189233278956, "grad_norm": 1.1617473363876343, "learning_rate": 4.1166394779771616e-05, "loss": 0.1774, "num_input_tokens_seen": 21818528, "step": 10095 }, { "epoch": 1.6476345840130504, "grad_norm": 0.031602825969457626, "learning_rate": 4.118678629690049e-05, "loss": 0.0788, "num_input_tokens_seen": 21829472, "step": 10100 }, { "epoch": 1.6484502446982057, "grad_norm": 1.4425946474075317, "learning_rate": 4.120717781402937e-05, "loss": 0.433, "num_input_tokens_seen": 21841024, "step": 10105 }, { "epoch": 1.6492659053833605, "grad_norm": 0.88074791431427, "learning_rate": 4.122756933115824e-05, "loss": 0.3895, "num_input_tokens_seen": 21852096, "step": 10110 }, { "epoch": 1.6500815660685155, "grad_norm": 1.642869472503662, "learning_rate": 4.124796084828712e-05, "loss": 0.2483, "num_input_tokens_seen": 21863904, "step": 10115 }, { "epoch": 1.6508972267536706, "grad_norm": 0.712256133556366, "learning_rate": 4.126835236541599e-05, "loss": 0.117, "num_input_tokens_seen": 21874048, "step": 10120 }, { "epoch": 1.6517128874388254, "grad_norm": 2.635544538497925, "learning_rate": 4.128874388254486e-05, "loss": 0.1836, "num_input_tokens_seen": 21885632, "step": 10125 }, { "epoch": 1.6525285481239804, "grad_norm": 0.2368198186159134, "learning_rate": 4.130913539967374e-05, "loss": 0.2578, "num_input_tokens_seen": 21896864, "step": 10130 }, { "epoch": 1.6533442088091355, "grad_norm": 1.1837373971939087, "learning_rate": 4.132952691680261e-05, "loss": 0.1412, "num_input_tokens_seen": 21907712, "step": 10135 }, { "epoch": 1.6541598694942903, "grad_norm": 0.7420065402984619, "learning_rate": 4.134991843393149e-05, "loss": 0.2049, "num_input_tokens_seen": 21918464, "step": 10140 }, { "epoch": 1.6549755301794453, "grad_norm": 0.1718529909849167, "learning_rate": 4.137030995106036e-05, "loss": 0.226, "num_input_tokens_seen": 21929664, "step": 10145 }, { "epoch": 1.6557911908646004, "grad_norm": 2.075949192047119, "learning_rate": 4.1390701468189235e-05, "loss": 0.1589, "num_input_tokens_seen": 21940288, "step": 10150 }, { "epoch": 1.6566068515497552, "grad_norm": 0.9892008900642395, "learning_rate": 4.141109298531811e-05, "loss": 0.0615, "num_input_tokens_seen": 21950464, "step": 10155 }, { "epoch": 1.6574225122349104, "grad_norm": 1.2766722440719604, "learning_rate": 4.143148450244698e-05, "loss": 0.1318, "num_input_tokens_seen": 21961696, "step": 10160 }, { "epoch": 1.6582381729200653, "grad_norm": 0.34528034925460815, "learning_rate": 4.1451876019575856e-05, "loss": 0.2352, "num_input_tokens_seen": 21972192, "step": 10165 }, { "epoch": 1.65905383360522, "grad_norm": 0.1713259518146515, "learning_rate": 4.147226753670474e-05, "loss": 0.2371, "num_input_tokens_seen": 21982112, "step": 10170 }, { "epoch": 1.6598694942903753, "grad_norm": 1.6870263814926147, "learning_rate": 4.1492659053833604e-05, "loss": 0.1966, "num_input_tokens_seen": 21993568, "step": 10175 }, { "epoch": 1.6606851549755302, "grad_norm": 0.6525558233261108, "learning_rate": 4.151305057096248e-05, "loss": 0.1071, "num_input_tokens_seen": 22003296, "step": 10180 }, { "epoch": 1.6615008156606852, "grad_norm": 0.06826697289943695, "learning_rate": 4.153344208809136e-05, "loss": 0.0908, "num_input_tokens_seen": 22013344, "step": 10185 }, { "epoch": 1.6623164763458402, "grad_norm": 0.34700116515159607, "learning_rate": 4.155383360522023e-05, "loss": 0.2323, "num_input_tokens_seen": 22022112, "step": 10190 }, { "epoch": 1.663132137030995, "grad_norm": 0.2069404274225235, "learning_rate": 4.1574225122349106e-05, "loss": 0.1262, "num_input_tokens_seen": 22032704, "step": 10195 }, { "epoch": 1.66394779771615, "grad_norm": 0.2624964118003845, "learning_rate": 4.159461663947798e-05, "loss": 0.127, "num_input_tokens_seen": 22044320, "step": 10200 }, { "epoch": 1.6647634584013051, "grad_norm": 0.09356212615966797, "learning_rate": 4.1615008156606853e-05, "loss": 0.0153, "num_input_tokens_seen": 22055008, "step": 10205 }, { "epoch": 1.66557911908646, "grad_norm": 0.3544616401195526, "learning_rate": 4.163539967373573e-05, "loss": 0.1987, "num_input_tokens_seen": 22065920, "step": 10210 }, { "epoch": 1.666394779771615, "grad_norm": 0.12332234531641006, "learning_rate": 4.16557911908646e-05, "loss": 0.0726, "num_input_tokens_seen": 22077024, "step": 10215 }, { "epoch": 1.66721044045677, "grad_norm": 1.4642345905303955, "learning_rate": 4.1676182707993475e-05, "loss": 0.3489, "num_input_tokens_seen": 22088320, "step": 10220 }, { "epoch": 1.6680261011419248, "grad_norm": 0.3583391308784485, "learning_rate": 4.1696574225122355e-05, "loss": 0.0306, "num_input_tokens_seen": 22098528, "step": 10225 }, { "epoch": 1.6688417618270799, "grad_norm": 2.0248467922210693, "learning_rate": 4.171696574225122e-05, "loss": 0.1882, "num_input_tokens_seen": 22109792, "step": 10230 }, { "epoch": 1.669657422512235, "grad_norm": 0.26104873418807983, "learning_rate": 4.17373572593801e-05, "loss": 0.3242, "num_input_tokens_seen": 22120064, "step": 10235 }, { "epoch": 1.6704730831973897, "grad_norm": 0.10722088068723679, "learning_rate": 4.175774877650898e-05, "loss": 0.0933, "num_input_tokens_seen": 22131200, "step": 10240 }, { "epoch": 1.671288743882545, "grad_norm": 0.233281672000885, "learning_rate": 4.1778140293637844e-05, "loss": 0.1498, "num_input_tokens_seen": 22142880, "step": 10245 }, { "epoch": 1.6721044045676998, "grad_norm": 0.08957032114267349, "learning_rate": 4.1798531810766725e-05, "loss": 0.2517, "num_input_tokens_seen": 22151744, "step": 10250 }, { "epoch": 1.6729200652528549, "grad_norm": 1.5368355512619019, "learning_rate": 4.18189233278956e-05, "loss": 0.1609, "num_input_tokens_seen": 22162432, "step": 10255 }, { "epoch": 1.67373572593801, "grad_norm": 0.5890401005744934, "learning_rate": 4.183931484502447e-05, "loss": 0.0742, "num_input_tokens_seen": 22174432, "step": 10260 }, { "epoch": 1.6745513866231647, "grad_norm": 1.7598167657852173, "learning_rate": 4.1859706362153346e-05, "loss": 0.2289, "num_input_tokens_seen": 22185184, "step": 10265 }, { "epoch": 1.6753670473083198, "grad_norm": 0.45390769839286804, "learning_rate": 4.188009787928222e-05, "loss": 0.1181, "num_input_tokens_seen": 22196480, "step": 10270 }, { "epoch": 1.6761827079934748, "grad_norm": 0.20149928331375122, "learning_rate": 4.1900489396411094e-05, "loss": 0.1461, "num_input_tokens_seen": 22207392, "step": 10275 }, { "epoch": 1.6769983686786296, "grad_norm": 0.9155957102775574, "learning_rate": 4.192088091353997e-05, "loss": 0.1533, "num_input_tokens_seen": 22218880, "step": 10280 }, { "epoch": 1.6778140293637847, "grad_norm": 0.0933481752872467, "learning_rate": 4.194127243066884e-05, "loss": 0.2309, "num_input_tokens_seen": 22229184, "step": 10285 }, { "epoch": 1.6786296900489397, "grad_norm": 2.020418643951416, "learning_rate": 4.196166394779772e-05, "loss": 0.1686, "num_input_tokens_seen": 22240096, "step": 10290 }, { "epoch": 1.6794453507340945, "grad_norm": 0.12992827594280243, "learning_rate": 4.1982055464926596e-05, "loss": 0.2962, "num_input_tokens_seen": 22251616, "step": 10295 }, { "epoch": 1.6802610114192496, "grad_norm": 0.43139976263046265, "learning_rate": 4.200244698205546e-05, "loss": 0.1427, "num_input_tokens_seen": 22262464, "step": 10300 }, { "epoch": 1.6810766721044046, "grad_norm": 0.7508013844490051, "learning_rate": 4.2022838499184343e-05, "loss": 0.1587, "num_input_tokens_seen": 22272640, "step": 10305 }, { "epoch": 1.6818923327895594, "grad_norm": 1.6990227699279785, "learning_rate": 4.204323001631322e-05, "loss": 0.1507, "num_input_tokens_seen": 22283264, "step": 10310 }, { "epoch": 1.6827079934747147, "grad_norm": 0.1775861233472824, "learning_rate": 4.206362153344209e-05, "loss": 0.0621, "num_input_tokens_seen": 22294112, "step": 10315 }, { "epoch": 1.6835236541598695, "grad_norm": 0.15752291679382324, "learning_rate": 4.2084013050570965e-05, "loss": 0.2569, "num_input_tokens_seen": 22305184, "step": 10320 }, { "epoch": 1.6843393148450243, "grad_norm": 0.6950480341911316, "learning_rate": 4.210440456769984e-05, "loss": 0.15, "num_input_tokens_seen": 22315840, "step": 10325 }, { "epoch": 1.6851549755301796, "grad_norm": 1.3888614177703857, "learning_rate": 4.212479608482872e-05, "loss": 0.1663, "num_input_tokens_seen": 22325728, "step": 10330 }, { "epoch": 1.6859706362153344, "grad_norm": 0.9319537878036499, "learning_rate": 4.2145187601957586e-05, "loss": 0.1971, "num_input_tokens_seen": 22335904, "step": 10335 }, { "epoch": 1.6867862969004894, "grad_norm": 0.3226921856403351, "learning_rate": 4.216557911908646e-05, "loss": 0.2043, "num_input_tokens_seen": 22345184, "step": 10340 }, { "epoch": 1.6876019575856445, "grad_norm": 0.9900749325752258, "learning_rate": 4.218597063621534e-05, "loss": 0.1579, "num_input_tokens_seen": 22356736, "step": 10345 }, { "epoch": 1.6884176182707993, "grad_norm": 0.953535258769989, "learning_rate": 4.220636215334421e-05, "loss": 0.1651, "num_input_tokens_seen": 22368704, "step": 10350 }, { "epoch": 1.6892332789559543, "grad_norm": 0.4481549561023712, "learning_rate": 4.222675367047308e-05, "loss": 0.224, "num_input_tokens_seen": 22379904, "step": 10355 }, { "epoch": 1.6900489396411094, "grad_norm": 1.7745531797409058, "learning_rate": 4.224714518760196e-05, "loss": 0.2191, "num_input_tokens_seen": 22391776, "step": 10360 }, { "epoch": 1.6908646003262642, "grad_norm": 1.194338083267212, "learning_rate": 4.226753670473083e-05, "loss": 0.1362, "num_input_tokens_seen": 22401792, "step": 10365 }, { "epoch": 1.6916802610114192, "grad_norm": 0.6248285174369812, "learning_rate": 4.228792822185971e-05, "loss": 0.0828, "num_input_tokens_seen": 22411968, "step": 10370 }, { "epoch": 1.6924959216965743, "grad_norm": 0.48842117190361023, "learning_rate": 4.2308319738988584e-05, "loss": 0.1524, "num_input_tokens_seen": 22424000, "step": 10375 }, { "epoch": 1.693311582381729, "grad_norm": 0.8249492645263672, "learning_rate": 4.232871125611746e-05, "loss": 0.1137, "num_input_tokens_seen": 22434048, "step": 10380 }, { "epoch": 1.6941272430668843, "grad_norm": 0.12207357585430145, "learning_rate": 4.234910277324633e-05, "loss": 0.0862, "num_input_tokens_seen": 22446176, "step": 10385 }, { "epoch": 1.6949429037520392, "grad_norm": 0.37424930930137634, "learning_rate": 4.2369494290375205e-05, "loss": 0.0644, "num_input_tokens_seen": 22457600, "step": 10390 }, { "epoch": 1.695758564437194, "grad_norm": 1.0335602760314941, "learning_rate": 4.238988580750408e-05, "loss": 0.1343, "num_input_tokens_seen": 22468768, "step": 10395 }, { "epoch": 1.6965742251223492, "grad_norm": 1.2848255634307861, "learning_rate": 4.241027732463295e-05, "loss": 0.2343, "num_input_tokens_seen": 22480448, "step": 10400 }, { "epoch": 1.697389885807504, "grad_norm": 1.0322879552841187, "learning_rate": 4.243066884176183e-05, "loss": 0.12, "num_input_tokens_seen": 22490880, "step": 10405 }, { "epoch": 1.698205546492659, "grad_norm": 0.11606274545192719, "learning_rate": 4.245106035889071e-05, "loss": 0.3535, "num_input_tokens_seen": 22500704, "step": 10410 }, { "epoch": 1.6990212071778141, "grad_norm": 0.2246686965227127, "learning_rate": 4.247145187601958e-05, "loss": 0.0671, "num_input_tokens_seen": 22511040, "step": 10415 }, { "epoch": 1.699836867862969, "grad_norm": 0.22689926624298096, "learning_rate": 4.249184339314845e-05, "loss": 0.1229, "num_input_tokens_seen": 22521664, "step": 10420 }, { "epoch": 1.700652528548124, "grad_norm": 0.41013139486312866, "learning_rate": 4.251223491027733e-05, "loss": 0.1736, "num_input_tokens_seen": 22532192, "step": 10425 }, { "epoch": 1.701468189233279, "grad_norm": 0.41014641523361206, "learning_rate": 4.25326264274062e-05, "loss": 0.1927, "num_input_tokens_seen": 22543712, "step": 10430 }, { "epoch": 1.7022838499184338, "grad_norm": 0.4594472348690033, "learning_rate": 4.255301794453507e-05, "loss": 0.1707, "num_input_tokens_seen": 22555232, "step": 10435 }, { "epoch": 1.7030995106035889, "grad_norm": 0.7270563840866089, "learning_rate": 4.257340946166395e-05, "loss": 0.0617, "num_input_tokens_seen": 22566720, "step": 10440 }, { "epoch": 1.703915171288744, "grad_norm": 0.6709803938865662, "learning_rate": 4.2593800978792824e-05, "loss": 0.0906, "num_input_tokens_seen": 22577376, "step": 10445 }, { "epoch": 1.7047308319738987, "grad_norm": 2.453733205795288, "learning_rate": 4.26141924959217e-05, "loss": 0.2707, "num_input_tokens_seen": 22589568, "step": 10450 }, { "epoch": 1.7055464926590538, "grad_norm": 0.25135162472724915, "learning_rate": 4.263458401305057e-05, "loss": 0.1437, "num_input_tokens_seen": 22600672, "step": 10455 }, { "epoch": 1.7063621533442088, "grad_norm": 0.43739402294158936, "learning_rate": 4.2654975530179445e-05, "loss": 0.2809, "num_input_tokens_seen": 22611200, "step": 10460 }, { "epoch": 1.7071778140293636, "grad_norm": 0.33667588233947754, "learning_rate": 4.2675367047308326e-05, "loss": 0.1165, "num_input_tokens_seen": 22620608, "step": 10465 }, { "epoch": 1.707993474714519, "grad_norm": 0.06033709645271301, "learning_rate": 4.269575856443719e-05, "loss": 0.2311, "num_input_tokens_seen": 22631168, "step": 10470 }, { "epoch": 1.7088091353996737, "grad_norm": 0.3078092336654663, "learning_rate": 4.271615008156607e-05, "loss": 0.2156, "num_input_tokens_seen": 22640640, "step": 10475 }, { "epoch": 1.7096247960848288, "grad_norm": 0.7061067819595337, "learning_rate": 4.273654159869495e-05, "loss": 0.1754, "num_input_tokens_seen": 22650976, "step": 10480 }, { "epoch": 1.7104404567699838, "grad_norm": 1.134365439414978, "learning_rate": 4.275693311582382e-05, "loss": 0.348, "num_input_tokens_seen": 22661472, "step": 10485 }, { "epoch": 1.7112561174551386, "grad_norm": 0.7520764470100403, "learning_rate": 4.2777324632952695e-05, "loss": 0.1741, "num_input_tokens_seen": 22673024, "step": 10490 }, { "epoch": 1.7120717781402937, "grad_norm": 0.12516237795352936, "learning_rate": 4.279771615008157e-05, "loss": 0.1521, "num_input_tokens_seen": 22684416, "step": 10495 }, { "epoch": 1.7128874388254487, "grad_norm": 0.230015367269516, "learning_rate": 4.281810766721044e-05, "loss": 0.0839, "num_input_tokens_seen": 22695136, "step": 10500 }, { "epoch": 1.7137030995106035, "grad_norm": 0.3124724328517914, "learning_rate": 4.283849918433932e-05, "loss": 0.1747, "num_input_tokens_seen": 22705408, "step": 10505 }, { "epoch": 1.7145187601957586, "grad_norm": 1.0958698987960815, "learning_rate": 4.285889070146819e-05, "loss": 0.1107, "num_input_tokens_seen": 22716000, "step": 10510 }, { "epoch": 1.7153344208809136, "grad_norm": 1.6009018421173096, "learning_rate": 4.2879282218597064e-05, "loss": 0.2363, "num_input_tokens_seen": 22727936, "step": 10515 }, { "epoch": 1.7161500815660684, "grad_norm": 0.9035595655441284, "learning_rate": 4.2899673735725945e-05, "loss": 0.295, "num_input_tokens_seen": 22737632, "step": 10520 }, { "epoch": 1.7169657422512234, "grad_norm": 2.2815330028533936, "learning_rate": 4.292006525285481e-05, "loss": 0.214, "num_input_tokens_seen": 22748288, "step": 10525 }, { "epoch": 1.7177814029363785, "grad_norm": 1.2458442449569702, "learning_rate": 4.2940456769983686e-05, "loss": 0.2304, "num_input_tokens_seen": 22760288, "step": 10530 }, { "epoch": 1.7185970636215333, "grad_norm": 0.443835586309433, "learning_rate": 4.2960848287112566e-05, "loss": 0.1541, "num_input_tokens_seen": 22772032, "step": 10535 }, { "epoch": 1.7194127243066886, "grad_norm": 0.4853062927722931, "learning_rate": 4.2981239804241433e-05, "loss": 0.1629, "num_input_tokens_seen": 22783392, "step": 10540 }, { "epoch": 1.7202283849918434, "grad_norm": 0.09460580348968506, "learning_rate": 4.3001631321370314e-05, "loss": 0.1433, "num_input_tokens_seen": 22795808, "step": 10545 }, { "epoch": 1.7210440456769984, "grad_norm": 0.3715404272079468, "learning_rate": 4.302202283849919e-05, "loss": 0.2733, "num_input_tokens_seen": 22806400, "step": 10550 }, { "epoch": 1.7218597063621535, "grad_norm": 0.9994909167289734, "learning_rate": 4.304241435562806e-05, "loss": 0.1606, "num_input_tokens_seen": 22816256, "step": 10555 }, { "epoch": 1.7226753670473083, "grad_norm": 0.5944874286651611, "learning_rate": 4.3062805872756935e-05, "loss": 0.0557, "num_input_tokens_seen": 22827200, "step": 10560 }, { "epoch": 1.7234910277324633, "grad_norm": 0.5090692639350891, "learning_rate": 4.308319738988581e-05, "loss": 0.1242, "num_input_tokens_seen": 22837248, "step": 10565 }, { "epoch": 1.7243066884176184, "grad_norm": 0.6466764807701111, "learning_rate": 4.310358890701468e-05, "loss": 0.0882, "num_input_tokens_seen": 22846048, "step": 10570 }, { "epoch": 1.7251223491027732, "grad_norm": 1.164840817451477, "learning_rate": 4.312398042414356e-05, "loss": 0.1611, "num_input_tokens_seen": 22856672, "step": 10575 }, { "epoch": 1.7259380097879282, "grad_norm": 1.4596916437149048, "learning_rate": 4.314437194127243e-05, "loss": 0.1526, "num_input_tokens_seen": 22868224, "step": 10580 }, { "epoch": 1.7267536704730833, "grad_norm": 1.0535959005355835, "learning_rate": 4.316476345840131e-05, "loss": 0.3416, "num_input_tokens_seen": 22879584, "step": 10585 }, { "epoch": 1.727569331158238, "grad_norm": 0.10279254615306854, "learning_rate": 4.318515497553018e-05, "loss": 0.0517, "num_input_tokens_seen": 22889696, "step": 10590 }, { "epoch": 1.7283849918433931, "grad_norm": 0.9044974446296692, "learning_rate": 4.320554649265905e-05, "loss": 0.1082, "num_input_tokens_seen": 22900224, "step": 10595 }, { "epoch": 1.7292006525285482, "grad_norm": 1.2322388887405396, "learning_rate": 4.322593800978793e-05, "loss": 0.2756, "num_input_tokens_seen": 22911136, "step": 10600 }, { "epoch": 1.730016313213703, "grad_norm": 0.3500719666481018, "learning_rate": 4.324632952691681e-05, "loss": 0.2127, "num_input_tokens_seen": 22922400, "step": 10605 }, { "epoch": 1.7308319738988582, "grad_norm": 0.56548011302948, "learning_rate": 4.326672104404568e-05, "loss": 0.0951, "num_input_tokens_seen": 22933664, "step": 10610 }, { "epoch": 1.731647634584013, "grad_norm": 1.0062696933746338, "learning_rate": 4.3287112561174554e-05, "loss": 0.2087, "num_input_tokens_seen": 22943104, "step": 10615 }, { "epoch": 1.7324632952691679, "grad_norm": 1.2800219058990479, "learning_rate": 4.330750407830343e-05, "loss": 0.1781, "num_input_tokens_seen": 22953344, "step": 10620 }, { "epoch": 1.7332789559543231, "grad_norm": 0.5786211490631104, "learning_rate": 4.33278955954323e-05, "loss": 0.1518, "num_input_tokens_seen": 22964928, "step": 10625 }, { "epoch": 1.734094616639478, "grad_norm": 0.50901198387146, "learning_rate": 4.3348287112561176e-05, "loss": 0.2227, "num_input_tokens_seen": 22977152, "step": 10630 }, { "epoch": 1.734910277324633, "grad_norm": 1.4840363264083862, "learning_rate": 4.336867862969005e-05, "loss": 0.1617, "num_input_tokens_seen": 22988448, "step": 10635 }, { "epoch": 1.735725938009788, "grad_norm": 0.1121746376156807, "learning_rate": 4.338907014681893e-05, "loss": 0.1138, "num_input_tokens_seen": 22998592, "step": 10640 }, { "epoch": 1.7365415986949428, "grad_norm": 0.5080600380897522, "learning_rate": 4.34094616639478e-05, "loss": 0.119, "num_input_tokens_seen": 23009344, "step": 10645 }, { "epoch": 1.7373572593800979, "grad_norm": 0.729982316493988, "learning_rate": 4.342985318107667e-05, "loss": 0.2559, "num_input_tokens_seen": 23019584, "step": 10650 }, { "epoch": 1.738172920065253, "grad_norm": 1.5101280212402344, "learning_rate": 4.345024469820555e-05, "loss": 0.2252, "num_input_tokens_seen": 23030336, "step": 10655 }, { "epoch": 1.7389885807504077, "grad_norm": 0.391831636428833, "learning_rate": 4.347063621533442e-05, "loss": 0.2059, "num_input_tokens_seen": 23041760, "step": 10660 }, { "epoch": 1.7398042414355628, "grad_norm": 2.0682175159454346, "learning_rate": 4.34910277324633e-05, "loss": 0.2214, "num_input_tokens_seen": 23051840, "step": 10665 }, { "epoch": 1.7406199021207178, "grad_norm": 1.3553954362869263, "learning_rate": 4.351141924959217e-05, "loss": 0.0773, "num_input_tokens_seen": 23061632, "step": 10670 }, { "epoch": 1.7414355628058726, "grad_norm": 0.9820231795310974, "learning_rate": 4.353181076672105e-05, "loss": 0.1999, "num_input_tokens_seen": 23071520, "step": 10675 }, { "epoch": 1.7422512234910277, "grad_norm": 0.3235790431499481, "learning_rate": 4.355220228384992e-05, "loss": 0.2764, "num_input_tokens_seen": 23082304, "step": 10680 }, { "epoch": 1.7430668841761827, "grad_norm": 1.977765679359436, "learning_rate": 4.3572593800978795e-05, "loss": 0.2442, "num_input_tokens_seen": 23092416, "step": 10685 }, { "epoch": 1.7438825448613375, "grad_norm": 2.4281067848205566, "learning_rate": 4.359298531810767e-05, "loss": 0.2193, "num_input_tokens_seen": 23102720, "step": 10690 }, { "epoch": 1.7446982055464928, "grad_norm": 0.5474498867988586, "learning_rate": 4.361337683523654e-05, "loss": 0.0916, "num_input_tokens_seen": 23114016, "step": 10695 }, { "epoch": 1.7455138662316476, "grad_norm": 0.30714893341064453, "learning_rate": 4.3633768352365416e-05, "loss": 0.0758, "num_input_tokens_seen": 23125888, "step": 10700 }, { "epoch": 1.7463295269168027, "grad_norm": 0.9000146985054016, "learning_rate": 4.365415986949429e-05, "loss": 0.0859, "num_input_tokens_seen": 23137248, "step": 10705 }, { "epoch": 1.7471451876019577, "grad_norm": 0.37244918942451477, "learning_rate": 4.367455138662317e-05, "loss": 0.1921, "num_input_tokens_seen": 23148064, "step": 10710 }, { "epoch": 1.7479608482871125, "grad_norm": 0.8954128623008728, "learning_rate": 4.369494290375204e-05, "loss": 0.0284, "num_input_tokens_seen": 23158208, "step": 10715 }, { "epoch": 1.7487765089722676, "grad_norm": 1.3779873847961426, "learning_rate": 4.371533442088092e-05, "loss": 0.1711, "num_input_tokens_seen": 23170144, "step": 10720 }, { "epoch": 1.7495921696574226, "grad_norm": 0.2604095935821533, "learning_rate": 4.373572593800979e-05, "loss": 0.2182, "num_input_tokens_seen": 23180768, "step": 10725 }, { "epoch": 1.7504078303425774, "grad_norm": 1.664499282836914, "learning_rate": 4.375611745513866e-05, "loss": 0.1539, "num_input_tokens_seen": 23191840, "step": 10730 }, { "epoch": 1.7512234910277324, "grad_norm": 0.41473132371902466, "learning_rate": 4.377650897226754e-05, "loss": 0.1558, "num_input_tokens_seen": 23201760, "step": 10735 }, { "epoch": 1.7520391517128875, "grad_norm": 0.17452983558177948, "learning_rate": 4.3796900489396413e-05, "loss": 0.1394, "num_input_tokens_seen": 23213024, "step": 10740 }, { "epoch": 1.7528548123980423, "grad_norm": 0.37750595808029175, "learning_rate": 4.381729200652529e-05, "loss": 0.1361, "num_input_tokens_seen": 23224512, "step": 10745 }, { "epoch": 1.7536704730831973, "grad_norm": 0.3361871838569641, "learning_rate": 4.383768352365416e-05, "loss": 0.1036, "num_input_tokens_seen": 23235040, "step": 10750 }, { "epoch": 1.7544861337683524, "grad_norm": 0.23989228904247284, "learning_rate": 4.3858075040783035e-05, "loss": 0.1483, "num_input_tokens_seen": 23245056, "step": 10755 }, { "epoch": 1.7553017944535072, "grad_norm": 0.6341948509216309, "learning_rate": 4.3878466557911915e-05, "loss": 0.158, "num_input_tokens_seen": 23257824, "step": 10760 }, { "epoch": 1.7561174551386625, "grad_norm": 0.778577446937561, "learning_rate": 4.389885807504078e-05, "loss": 0.1549, "num_input_tokens_seen": 23269376, "step": 10765 }, { "epoch": 1.7569331158238173, "grad_norm": 1.9633933305740356, "learning_rate": 4.3919249592169656e-05, "loss": 0.1175, "num_input_tokens_seen": 23282560, "step": 10770 }, { "epoch": 1.7577487765089723, "grad_norm": 0.6166591644287109, "learning_rate": 4.393964110929854e-05, "loss": 0.1036, "num_input_tokens_seen": 23292896, "step": 10775 }, { "epoch": 1.7585644371941274, "grad_norm": 0.40098854899406433, "learning_rate": 4.396003262642741e-05, "loss": 0.0335, "num_input_tokens_seen": 23303168, "step": 10780 }, { "epoch": 1.7593800978792822, "grad_norm": 0.2840223014354706, "learning_rate": 4.3980424143556285e-05, "loss": 0.1947, "num_input_tokens_seen": 23314784, "step": 10785 }, { "epoch": 1.7601957585644372, "grad_norm": 1.0641961097717285, "learning_rate": 4.400081566068516e-05, "loss": 0.1533, "num_input_tokens_seen": 23325696, "step": 10790 }, { "epoch": 1.7610114192495923, "grad_norm": 1.7165175676345825, "learning_rate": 4.402120717781403e-05, "loss": 0.2433, "num_input_tokens_seen": 23337120, "step": 10795 }, { "epoch": 1.761827079934747, "grad_norm": 1.8060712814331055, "learning_rate": 4.4041598694942906e-05, "loss": 0.1987, "num_input_tokens_seen": 23347584, "step": 10800 }, { "epoch": 1.7626427406199021, "grad_norm": 1.3011949062347412, "learning_rate": 4.406199021207178e-05, "loss": 0.2753, "num_input_tokens_seen": 23357504, "step": 10805 }, { "epoch": 1.7634584013050572, "grad_norm": 0.27326029539108276, "learning_rate": 4.4082381729200654e-05, "loss": 0.1728, "num_input_tokens_seen": 23367456, "step": 10810 }, { "epoch": 1.764274061990212, "grad_norm": 0.20633482933044434, "learning_rate": 4.4102773246329534e-05, "loss": 0.1333, "num_input_tokens_seen": 23377984, "step": 10815 }, { "epoch": 1.765089722675367, "grad_norm": 0.4169932007789612, "learning_rate": 4.41231647634584e-05, "loss": 0.2379, "num_input_tokens_seen": 23390272, "step": 10820 }, { "epoch": 1.765905383360522, "grad_norm": 0.1745116412639618, "learning_rate": 4.4143556280587275e-05, "loss": 0.0333, "num_input_tokens_seen": 23400416, "step": 10825 }, { "epoch": 1.7667210440456769, "grad_norm": 1.126070261001587, "learning_rate": 4.4163947797716156e-05, "loss": 0.1757, "num_input_tokens_seen": 23412704, "step": 10830 }, { "epoch": 1.7675367047308321, "grad_norm": 0.20589584112167358, "learning_rate": 4.418433931484502e-05, "loss": 0.1669, "num_input_tokens_seen": 23421952, "step": 10835 }, { "epoch": 1.768352365415987, "grad_norm": 0.16017989814281464, "learning_rate": 4.4204730831973903e-05, "loss": 0.1256, "num_input_tokens_seen": 23431136, "step": 10840 }, { "epoch": 1.7691680261011418, "grad_norm": 0.23804426193237305, "learning_rate": 4.422512234910278e-05, "loss": 0.1379, "num_input_tokens_seen": 23442880, "step": 10845 }, { "epoch": 1.769983686786297, "grad_norm": 0.8429321050643921, "learning_rate": 4.4245513866231644e-05, "loss": 0.184, "num_input_tokens_seen": 23453120, "step": 10850 }, { "epoch": 1.7707993474714518, "grad_norm": 0.5744715332984924, "learning_rate": 4.4265905383360525e-05, "loss": 0.2448, "num_input_tokens_seen": 23464192, "step": 10855 }, { "epoch": 1.7716150081566069, "grad_norm": 0.78560471534729, "learning_rate": 4.42862969004894e-05, "loss": 0.224, "num_input_tokens_seen": 23475264, "step": 10860 }, { "epoch": 1.772430668841762, "grad_norm": 0.7334033250808716, "learning_rate": 4.430668841761827e-05, "loss": 0.1496, "num_input_tokens_seen": 23485440, "step": 10865 }, { "epoch": 1.7732463295269167, "grad_norm": 0.5863547921180725, "learning_rate": 4.4327079934747146e-05, "loss": 0.1157, "num_input_tokens_seen": 23496768, "step": 10870 }, { "epoch": 1.7740619902120718, "grad_norm": 0.03898969665169716, "learning_rate": 4.434747145187602e-05, "loss": 0.1719, "num_input_tokens_seen": 23507328, "step": 10875 }, { "epoch": 1.7748776508972268, "grad_norm": 0.550349235534668, "learning_rate": 4.43678629690049e-05, "loss": 0.1087, "num_input_tokens_seen": 23518976, "step": 10880 }, { "epoch": 1.7756933115823816, "grad_norm": 0.24229158461093903, "learning_rate": 4.438825448613377e-05, "loss": 0.1703, "num_input_tokens_seen": 23530720, "step": 10885 }, { "epoch": 1.7765089722675367, "grad_norm": 0.5782918930053711, "learning_rate": 4.440864600326264e-05, "loss": 0.1223, "num_input_tokens_seen": 23541312, "step": 10890 }, { "epoch": 1.7773246329526917, "grad_norm": 0.5132896900177002, "learning_rate": 4.442903752039152e-05, "loss": 0.1591, "num_input_tokens_seen": 23552064, "step": 10895 }, { "epoch": 1.7781402936378465, "grad_norm": 0.35689839720726013, "learning_rate": 4.4449429037520396e-05, "loss": 0.1544, "num_input_tokens_seen": 23563104, "step": 10900 }, { "epoch": 1.7789559543230016, "grad_norm": 0.7416607141494751, "learning_rate": 4.446982055464926e-05, "loss": 0.1109, "num_input_tokens_seen": 23572992, "step": 10905 }, { "epoch": 1.7797716150081566, "grad_norm": 1.584720492362976, "learning_rate": 4.4490212071778144e-05, "loss": 0.2264, "num_input_tokens_seen": 23583328, "step": 10910 }, { "epoch": 1.7805872756933114, "grad_norm": 0.32088467478752136, "learning_rate": 4.451060358890702e-05, "loss": 0.1112, "num_input_tokens_seen": 23593888, "step": 10915 }, { "epoch": 1.7814029363784667, "grad_norm": 1.4843329191207886, "learning_rate": 4.453099510603589e-05, "loss": 0.1678, "num_input_tokens_seen": 23604544, "step": 10920 }, { "epoch": 1.7822185970636215, "grad_norm": 0.3826216161251068, "learning_rate": 4.4551386623164765e-05, "loss": 0.2207, "num_input_tokens_seen": 23615584, "step": 10925 }, { "epoch": 1.7830342577487766, "grad_norm": 0.07535284012556076, "learning_rate": 4.457177814029364e-05, "loss": 0.0989, "num_input_tokens_seen": 23625568, "step": 10930 }, { "epoch": 1.7838499184339316, "grad_norm": 0.09259688854217529, "learning_rate": 4.459216965742252e-05, "loss": 0.0508, "num_input_tokens_seen": 23636320, "step": 10935 }, { "epoch": 1.7846655791190864, "grad_norm": 0.5608853697776794, "learning_rate": 4.461256117455139e-05, "loss": 0.2309, "num_input_tokens_seen": 23646912, "step": 10940 }, { "epoch": 1.7854812398042414, "grad_norm": 0.1512049436569214, "learning_rate": 4.463295269168026e-05, "loss": 0.1256, "num_input_tokens_seen": 23657120, "step": 10945 }, { "epoch": 1.7862969004893965, "grad_norm": 0.9360044598579407, "learning_rate": 4.465334420880914e-05, "loss": 0.2276, "num_input_tokens_seen": 23667712, "step": 10950 }, { "epoch": 1.7871125611745513, "grad_norm": 0.236677348613739, "learning_rate": 4.467373572593801e-05, "loss": 0.1207, "num_input_tokens_seen": 23678720, "step": 10955 }, { "epoch": 1.7879282218597063, "grad_norm": 0.5926308631896973, "learning_rate": 4.469412724306689e-05, "loss": 0.0656, "num_input_tokens_seen": 23688160, "step": 10960 }, { "epoch": 1.7887438825448614, "grad_norm": 1.482252836227417, "learning_rate": 4.471451876019576e-05, "loss": 0.2197, "num_input_tokens_seen": 23699680, "step": 10965 }, { "epoch": 1.7895595432300162, "grad_norm": 0.9526251554489136, "learning_rate": 4.4734910277324636e-05, "loss": 0.1323, "num_input_tokens_seen": 23710880, "step": 10970 }, { "epoch": 1.7903752039151712, "grad_norm": 0.4963742792606354, "learning_rate": 4.475530179445351e-05, "loss": 0.1129, "num_input_tokens_seen": 23722304, "step": 10975 }, { "epoch": 1.7911908646003263, "grad_norm": 1.6306393146514893, "learning_rate": 4.4775693311582384e-05, "loss": 0.1972, "num_input_tokens_seen": 23734048, "step": 10980 }, { "epoch": 1.792006525285481, "grad_norm": 0.06123047694563866, "learning_rate": 4.479608482871126e-05, "loss": 0.1085, "num_input_tokens_seen": 23745600, "step": 10985 }, { "epoch": 1.7928221859706364, "grad_norm": 0.3482891321182251, "learning_rate": 4.481647634584013e-05, "loss": 0.0628, "num_input_tokens_seen": 23756800, "step": 10990 }, { "epoch": 1.7936378466557912, "grad_norm": 0.7512561678886414, "learning_rate": 4.4836867862969005e-05, "loss": 0.219, "num_input_tokens_seen": 23766880, "step": 10995 }, { "epoch": 1.7944535073409462, "grad_norm": 0.0899544432759285, "learning_rate": 4.485725938009788e-05, "loss": 0.1569, "num_input_tokens_seen": 23777472, "step": 11000 }, { "epoch": 1.7952691680261013, "grad_norm": 0.815967321395874, "learning_rate": 4.487765089722676e-05, "loss": 0.0905, "num_input_tokens_seen": 23786336, "step": 11005 }, { "epoch": 1.796084828711256, "grad_norm": 0.6469904780387878, "learning_rate": 4.489804241435563e-05, "loss": 0.1434, "num_input_tokens_seen": 23797184, "step": 11010 }, { "epoch": 1.7969004893964111, "grad_norm": 0.1766345351934433, "learning_rate": 4.491843393148451e-05, "loss": 0.1159, "num_input_tokens_seen": 23808288, "step": 11015 }, { "epoch": 1.7977161500815662, "grad_norm": 1.3614035844802856, "learning_rate": 4.493882544861338e-05, "loss": 0.2402, "num_input_tokens_seen": 23818176, "step": 11020 }, { "epoch": 1.798531810766721, "grad_norm": 1.5533246994018555, "learning_rate": 4.495921696574225e-05, "loss": 0.1613, "num_input_tokens_seen": 23829344, "step": 11025 }, { "epoch": 1.799347471451876, "grad_norm": 1.4667561054229736, "learning_rate": 4.497960848287113e-05, "loss": 0.1562, "num_input_tokens_seen": 23839936, "step": 11030 }, { "epoch": 1.800163132137031, "grad_norm": 1.2913507223129272, "learning_rate": 4.5e-05, "loss": 0.1046, "num_input_tokens_seen": 23852736, "step": 11035 }, { "epoch": 1.8009787928221859, "grad_norm": 0.23449265956878662, "learning_rate": 4.502039151712888e-05, "loss": 0.1684, "num_input_tokens_seen": 23864736, "step": 11040 }, { "epoch": 1.801794453507341, "grad_norm": 1.5487613677978516, "learning_rate": 4.504078303425775e-05, "loss": 0.2037, "num_input_tokens_seen": 23874432, "step": 11045 }, { "epoch": 1.802610114192496, "grad_norm": 0.4134248197078705, "learning_rate": 4.5061174551386624e-05, "loss": 0.0553, "num_input_tokens_seen": 23885056, "step": 11050 }, { "epoch": 1.8034257748776508, "grad_norm": 0.6773508191108704, "learning_rate": 4.5081566068515505e-05, "loss": 0.1482, "num_input_tokens_seen": 23896096, "step": 11055 }, { "epoch": 1.804241435562806, "grad_norm": 0.3346019983291626, "learning_rate": 4.510195758564437e-05, "loss": 0.1315, "num_input_tokens_seen": 23907200, "step": 11060 }, { "epoch": 1.8050570962479608, "grad_norm": 0.4182210862636566, "learning_rate": 4.5122349102773246e-05, "loss": 0.1039, "num_input_tokens_seen": 23916576, "step": 11065 }, { "epoch": 1.8058727569331157, "grad_norm": 0.11701406538486481, "learning_rate": 4.5142740619902126e-05, "loss": 0.106, "num_input_tokens_seen": 23926784, "step": 11070 }, { "epoch": 1.806688417618271, "grad_norm": 1.1485896110534668, "learning_rate": 4.5163132137030993e-05, "loss": 0.2193, "num_input_tokens_seen": 23937632, "step": 11075 }, { "epoch": 1.8075040783034257, "grad_norm": 1.157413363456726, "learning_rate": 4.518352365415987e-05, "loss": 0.0466, "num_input_tokens_seen": 23949024, "step": 11080 }, { "epoch": 1.8083197389885808, "grad_norm": 0.8326678276062012, "learning_rate": 4.520391517128875e-05, "loss": 0.2019, "num_input_tokens_seen": 23960192, "step": 11085 }, { "epoch": 1.8091353996737358, "grad_norm": 0.7125982046127319, "learning_rate": 4.522430668841762e-05, "loss": 0.1945, "num_input_tokens_seen": 23970304, "step": 11090 }, { "epoch": 1.8099510603588906, "grad_norm": 0.2559697926044464, "learning_rate": 4.5244698205546495e-05, "loss": 0.104, "num_input_tokens_seen": 23980544, "step": 11095 }, { "epoch": 1.8107667210440457, "grad_norm": 1.514962077140808, "learning_rate": 4.526508972267537e-05, "loss": 0.1087, "num_input_tokens_seen": 23991680, "step": 11100 }, { "epoch": 1.8115823817292007, "grad_norm": 0.3168379068374634, "learning_rate": 4.528548123980424e-05, "loss": 0.2674, "num_input_tokens_seen": 24000672, "step": 11105 }, { "epoch": 1.8123980424143555, "grad_norm": 0.41445502638816833, "learning_rate": 4.530587275693312e-05, "loss": 0.1954, "num_input_tokens_seen": 24010496, "step": 11110 }, { "epoch": 1.8132137030995106, "grad_norm": 0.25684934854507446, "learning_rate": 4.532626427406199e-05, "loss": 0.1158, "num_input_tokens_seen": 24021056, "step": 11115 }, { "epoch": 1.8140293637846656, "grad_norm": 0.17882606387138367, "learning_rate": 4.5346655791190865e-05, "loss": 0.1473, "num_input_tokens_seen": 24031136, "step": 11120 }, { "epoch": 1.8148450244698204, "grad_norm": 0.2098769098520279, "learning_rate": 4.5367047308319745e-05, "loss": 0.0266, "num_input_tokens_seen": 24040864, "step": 11125 }, { "epoch": 1.8156606851549757, "grad_norm": 0.5063793659210205, "learning_rate": 4.538743882544861e-05, "loss": 0.2495, "num_input_tokens_seen": 24051168, "step": 11130 }, { "epoch": 1.8164763458401305, "grad_norm": 1.2180007696151733, "learning_rate": 4.540783034257749e-05, "loss": 0.1646, "num_input_tokens_seen": 24060448, "step": 11135 }, { "epoch": 1.8172920065252853, "grad_norm": 0.38184863328933716, "learning_rate": 4.542822185970637e-05, "loss": 0.3742, "num_input_tokens_seen": 24070208, "step": 11140 }, { "epoch": 1.8181076672104406, "grad_norm": 0.214678555727005, "learning_rate": 4.5448613376835234e-05, "loss": 0.0799, "num_input_tokens_seen": 24080000, "step": 11145 }, { "epoch": 1.8189233278955954, "grad_norm": 1.119219183921814, "learning_rate": 4.5469004893964114e-05, "loss": 0.1285, "num_input_tokens_seen": 24089120, "step": 11150 }, { "epoch": 1.8197389885807504, "grad_norm": 0.1600622683763504, "learning_rate": 4.548939641109299e-05, "loss": 0.0865, "num_input_tokens_seen": 24099968, "step": 11155 }, { "epoch": 1.8205546492659055, "grad_norm": 0.5912790298461914, "learning_rate": 4.550978792822186e-05, "loss": 0.0685, "num_input_tokens_seen": 24110560, "step": 11160 }, { "epoch": 1.8213703099510603, "grad_norm": 0.10067907720804214, "learning_rate": 4.5530179445350736e-05, "loss": 0.1311, "num_input_tokens_seen": 24121984, "step": 11165 }, { "epoch": 1.8221859706362153, "grad_norm": 1.303975224494934, "learning_rate": 4.555057096247961e-05, "loss": 0.2686, "num_input_tokens_seen": 24132544, "step": 11170 }, { "epoch": 1.8230016313213704, "grad_norm": 1.1920920610427856, "learning_rate": 4.5570962479608483e-05, "loss": 0.1788, "num_input_tokens_seen": 24143808, "step": 11175 }, { "epoch": 1.8238172920065252, "grad_norm": 1.0519331693649292, "learning_rate": 4.559135399673736e-05, "loss": 0.2708, "num_input_tokens_seen": 24154496, "step": 11180 }, { "epoch": 1.8246329526916802, "grad_norm": 1.123536229133606, "learning_rate": 4.561174551386623e-05, "loss": 0.1708, "num_input_tokens_seen": 24165472, "step": 11185 }, { "epoch": 1.8254486133768353, "grad_norm": 0.21667104959487915, "learning_rate": 4.563213703099511e-05, "loss": 0.0679, "num_input_tokens_seen": 24177088, "step": 11190 }, { "epoch": 1.82626427406199, "grad_norm": 0.2371433824300766, "learning_rate": 4.5652528548123985e-05, "loss": 0.1102, "num_input_tokens_seen": 24188768, "step": 11195 }, { "epoch": 1.8270799347471451, "grad_norm": 0.3522236943244934, "learning_rate": 4.567292006525285e-05, "loss": 0.1762, "num_input_tokens_seen": 24199712, "step": 11200 }, { "epoch": 1.8278955954323002, "grad_norm": 0.460398405790329, "learning_rate": 4.569331158238173e-05, "loss": 0.1626, "num_input_tokens_seen": 24210912, "step": 11205 }, { "epoch": 1.828711256117455, "grad_norm": 1.2213177680969238, "learning_rate": 4.571370309951061e-05, "loss": 0.135, "num_input_tokens_seen": 24221824, "step": 11210 }, { "epoch": 1.8295269168026103, "grad_norm": 0.15327540040016174, "learning_rate": 4.573409461663948e-05, "loss": 0.0851, "num_input_tokens_seen": 24231168, "step": 11215 }, { "epoch": 1.830342577487765, "grad_norm": 0.07392587512731552, "learning_rate": 4.5754486133768355e-05, "loss": 0.0263, "num_input_tokens_seen": 24241408, "step": 11220 }, { "epoch": 1.8311582381729201, "grad_norm": 0.6502697467803955, "learning_rate": 4.577487765089723e-05, "loss": 0.197, "num_input_tokens_seen": 24251520, "step": 11225 }, { "epoch": 1.8319738988580752, "grad_norm": 0.1206514835357666, "learning_rate": 4.579526916802611e-05, "loss": 0.2703, "num_input_tokens_seen": 24262144, "step": 11230 }, { "epoch": 1.83278955954323, "grad_norm": 1.2672293186187744, "learning_rate": 4.5815660685154976e-05, "loss": 0.1521, "num_input_tokens_seen": 24273440, "step": 11235 }, { "epoch": 1.833605220228385, "grad_norm": 0.7890950441360474, "learning_rate": 4.583605220228385e-05, "loss": 0.0507, "num_input_tokens_seen": 24284832, "step": 11240 }, { "epoch": 1.83442088091354, "grad_norm": 0.4506867825984955, "learning_rate": 4.585644371941273e-05, "loss": 0.1752, "num_input_tokens_seen": 24295488, "step": 11245 }, { "epoch": 1.8352365415986949, "grad_norm": 0.523175060749054, "learning_rate": 4.58768352365416e-05, "loss": 0.1664, "num_input_tokens_seen": 24306624, "step": 11250 }, { "epoch": 1.83605220228385, "grad_norm": 0.19284872710704803, "learning_rate": 4.589722675367048e-05, "loss": 0.0891, "num_input_tokens_seen": 24318176, "step": 11255 }, { "epoch": 1.836867862969005, "grad_norm": 0.543318510055542, "learning_rate": 4.591761827079935e-05, "loss": 0.173, "num_input_tokens_seen": 24329024, "step": 11260 }, { "epoch": 1.8376835236541598, "grad_norm": 0.04997562617063522, "learning_rate": 4.5938009787928226e-05, "loss": 0.0813, "num_input_tokens_seen": 24339680, "step": 11265 }, { "epoch": 1.8384991843393148, "grad_norm": 0.05849263817071915, "learning_rate": 4.59584013050571e-05, "loss": 0.2244, "num_input_tokens_seen": 24350368, "step": 11270 }, { "epoch": 1.8393148450244698, "grad_norm": 0.3025776147842407, "learning_rate": 4.5978792822185973e-05, "loss": 0.0956, "num_input_tokens_seen": 24361536, "step": 11275 }, { "epoch": 1.8401305057096247, "grad_norm": 0.07447220385074615, "learning_rate": 4.599918433931485e-05, "loss": 0.2124, "num_input_tokens_seen": 24372768, "step": 11280 }, { "epoch": 1.84094616639478, "grad_norm": 1.7499589920043945, "learning_rate": 4.601957585644372e-05, "loss": 0.1588, "num_input_tokens_seen": 24382592, "step": 11285 }, { "epoch": 1.8417618270799347, "grad_norm": 0.46698132157325745, "learning_rate": 4.6039967373572595e-05, "loss": 0.1073, "num_input_tokens_seen": 24393248, "step": 11290 }, { "epoch": 1.8425774877650896, "grad_norm": 0.6428796648979187, "learning_rate": 4.606035889070147e-05, "loss": 0.142, "num_input_tokens_seen": 24403232, "step": 11295 }, { "epoch": 1.8433931484502448, "grad_norm": 0.4536406993865967, "learning_rate": 4.608075040783034e-05, "loss": 0.0983, "num_input_tokens_seen": 24413792, "step": 11300 }, { "epoch": 1.8442088091353996, "grad_norm": 0.3025458753108978, "learning_rate": 4.6101141924959216e-05, "loss": 0.0879, "num_input_tokens_seen": 24423328, "step": 11305 }, { "epoch": 1.8450244698205547, "grad_norm": 0.6967600584030151, "learning_rate": 4.61215334420881e-05, "loss": 0.2698, "num_input_tokens_seen": 24434464, "step": 11310 }, { "epoch": 1.8458401305057097, "grad_norm": 0.09630754590034485, "learning_rate": 4.614192495921697e-05, "loss": 0.1256, "num_input_tokens_seen": 24444224, "step": 11315 }, { "epoch": 1.8466557911908645, "grad_norm": 0.17252019047737122, "learning_rate": 4.616231647634584e-05, "loss": 0.1253, "num_input_tokens_seen": 24454912, "step": 11320 }, { "epoch": 1.8474714518760196, "grad_norm": 0.5796010494232178, "learning_rate": 4.618270799347472e-05, "loss": 0.055, "num_input_tokens_seen": 24466304, "step": 11325 }, { "epoch": 1.8482871125611746, "grad_norm": 0.577325165271759, "learning_rate": 4.620309951060359e-05, "loss": 0.166, "num_input_tokens_seen": 24476960, "step": 11330 }, { "epoch": 1.8491027732463294, "grad_norm": 0.6932263970375061, "learning_rate": 4.6223491027732466e-05, "loss": 0.2043, "num_input_tokens_seen": 24486784, "step": 11335 }, { "epoch": 1.8499184339314845, "grad_norm": 0.19467449188232422, "learning_rate": 4.624388254486134e-05, "loss": 0.0636, "num_input_tokens_seen": 24497120, "step": 11340 }, { "epoch": 1.8507340946166395, "grad_norm": 0.27132606506347656, "learning_rate": 4.6264274061990214e-05, "loss": 0.1606, "num_input_tokens_seen": 24507200, "step": 11345 }, { "epoch": 1.8515497553017943, "grad_norm": 0.9851618409156799, "learning_rate": 4.628466557911909e-05, "loss": 0.2784, "num_input_tokens_seen": 24517920, "step": 11350 }, { "epoch": 1.8523654159869496, "grad_norm": 0.32506176829338074, "learning_rate": 4.630505709624796e-05, "loss": 0.1261, "num_input_tokens_seen": 24528640, "step": 11355 }, { "epoch": 1.8531810766721044, "grad_norm": 1.4405975341796875, "learning_rate": 4.6325448613376835e-05, "loss": 0.2248, "num_input_tokens_seen": 24540352, "step": 11360 }, { "epoch": 1.8539967373572592, "grad_norm": 0.9003655314445496, "learning_rate": 4.6345840130505716e-05, "loss": 0.0931, "num_input_tokens_seen": 24551584, "step": 11365 }, { "epoch": 1.8548123980424145, "grad_norm": 0.4290538728237152, "learning_rate": 4.636623164763458e-05, "loss": 0.1214, "num_input_tokens_seen": 24562400, "step": 11370 }, { "epoch": 1.8556280587275693, "grad_norm": 0.20982399582862854, "learning_rate": 4.638662316476346e-05, "loss": 0.1386, "num_input_tokens_seen": 24572736, "step": 11375 }, { "epoch": 1.8564437194127243, "grad_norm": 1.5868638753890991, "learning_rate": 4.640701468189234e-05, "loss": 0.2264, "num_input_tokens_seen": 24584416, "step": 11380 }, { "epoch": 1.8572593800978794, "grad_norm": 2.0748746395111084, "learning_rate": 4.642740619902121e-05, "loss": 0.2626, "num_input_tokens_seen": 24593472, "step": 11385 }, { "epoch": 1.8580750407830342, "grad_norm": 0.11523067951202393, "learning_rate": 4.6447797716150085e-05, "loss": 0.3949, "num_input_tokens_seen": 24604544, "step": 11390 }, { "epoch": 1.8588907014681892, "grad_norm": 0.23048162460327148, "learning_rate": 4.646818923327896e-05, "loss": 0.1249, "num_input_tokens_seen": 24614784, "step": 11395 }, { "epoch": 1.8597063621533443, "grad_norm": 0.9789328575134277, "learning_rate": 4.648858075040783e-05, "loss": 0.1463, "num_input_tokens_seen": 24625504, "step": 11400 }, { "epoch": 1.860522022838499, "grad_norm": 0.8910218477249146, "learning_rate": 4.6508972267536706e-05, "loss": 0.1194, "num_input_tokens_seen": 24636192, "step": 11405 }, { "epoch": 1.8613376835236541, "grad_norm": 0.7049740552902222, "learning_rate": 4.652936378466558e-05, "loss": 0.1382, "num_input_tokens_seen": 24647200, "step": 11410 }, { "epoch": 1.8621533442088092, "grad_norm": 0.12269469350576401, "learning_rate": 4.6549755301794454e-05, "loss": 0.0703, "num_input_tokens_seen": 24656640, "step": 11415 }, { "epoch": 1.862969004893964, "grad_norm": 1.5222628116607666, "learning_rate": 4.6570146818923335e-05, "loss": 0.0899, "num_input_tokens_seen": 24665664, "step": 11420 }, { "epoch": 1.863784665579119, "grad_norm": 0.9139931797981262, "learning_rate": 4.65905383360522e-05, "loss": 0.1676, "num_input_tokens_seen": 24676896, "step": 11425 }, { "epoch": 1.864600326264274, "grad_norm": 0.6180709600448608, "learning_rate": 4.661092985318108e-05, "loss": 0.1084, "num_input_tokens_seen": 24688960, "step": 11430 }, { "epoch": 1.865415986949429, "grad_norm": 0.539198637008667, "learning_rate": 4.6631321370309956e-05, "loss": 0.0888, "num_input_tokens_seen": 24699200, "step": 11435 }, { "epoch": 1.8662316476345842, "grad_norm": 0.7995553016662598, "learning_rate": 4.665171288743882e-05, "loss": 0.1058, "num_input_tokens_seen": 24709600, "step": 11440 }, { "epoch": 1.867047308319739, "grad_norm": 1.7551579475402832, "learning_rate": 4.6672104404567704e-05, "loss": 0.2804, "num_input_tokens_seen": 24719968, "step": 11445 }, { "epoch": 1.867862969004894, "grad_norm": 0.11601201444864273, "learning_rate": 4.669249592169658e-05, "loss": 0.0425, "num_input_tokens_seen": 24731520, "step": 11450 }, { "epoch": 1.868678629690049, "grad_norm": 0.6100310683250427, "learning_rate": 4.671288743882545e-05, "loss": 0.0876, "num_input_tokens_seen": 24742880, "step": 11455 }, { "epoch": 1.8694942903752039, "grad_norm": 0.03727011755108833, "learning_rate": 4.6733278955954325e-05, "loss": 0.2017, "num_input_tokens_seen": 24753600, "step": 11460 }, { "epoch": 1.870309951060359, "grad_norm": 0.27568426728248596, "learning_rate": 4.67536704730832e-05, "loss": 0.1955, "num_input_tokens_seen": 24764608, "step": 11465 }, { "epoch": 1.871125611745514, "grad_norm": 1.783902645111084, "learning_rate": 4.677406199021207e-05, "loss": 0.1756, "num_input_tokens_seen": 24775872, "step": 11470 }, { "epoch": 1.8719412724306688, "grad_norm": 1.0608326196670532, "learning_rate": 4.679445350734095e-05, "loss": 0.1444, "num_input_tokens_seen": 24786208, "step": 11475 }, { "epoch": 1.8727569331158238, "grad_norm": 1.7573257684707642, "learning_rate": 4.681484502446982e-05, "loss": 0.2925, "num_input_tokens_seen": 24795296, "step": 11480 }, { "epoch": 1.8735725938009788, "grad_norm": 0.19335485994815826, "learning_rate": 4.68352365415987e-05, "loss": 0.121, "num_input_tokens_seen": 24805632, "step": 11485 }, { "epoch": 1.8743882544861337, "grad_norm": 0.7406347990036011, "learning_rate": 4.6855628058727575e-05, "loss": 0.0961, "num_input_tokens_seen": 24815552, "step": 11490 }, { "epoch": 1.8752039151712887, "grad_norm": 0.09130092710256577, "learning_rate": 4.687601957585644e-05, "loss": 0.2184, "num_input_tokens_seen": 24827264, "step": 11495 }, { "epoch": 1.8760195758564437, "grad_norm": 0.0933997854590416, "learning_rate": 4.689641109298532e-05, "loss": 0.1069, "num_input_tokens_seen": 24839168, "step": 11500 }, { "epoch": 1.8768352365415986, "grad_norm": 0.8041461706161499, "learning_rate": 4.6916802610114196e-05, "loss": 0.1718, "num_input_tokens_seen": 24850336, "step": 11505 }, { "epoch": 1.8776508972267538, "grad_norm": 0.21399062871932983, "learning_rate": 4.693719412724307e-05, "loss": 0.0351, "num_input_tokens_seen": 24861984, "step": 11510 }, { "epoch": 1.8784665579119086, "grad_norm": 1.1564180850982666, "learning_rate": 4.6957585644371944e-05, "loss": 0.1363, "num_input_tokens_seen": 24872160, "step": 11515 }, { "epoch": 1.8792822185970635, "grad_norm": 1.0503208637237549, "learning_rate": 4.697797716150082e-05, "loss": 0.2164, "num_input_tokens_seen": 24882272, "step": 11520 }, { "epoch": 1.8800978792822187, "grad_norm": 1.3256345987319946, "learning_rate": 4.69983686786297e-05, "loss": 0.2591, "num_input_tokens_seen": 24892448, "step": 11525 }, { "epoch": 1.8809135399673735, "grad_norm": 0.4416585862636566, "learning_rate": 4.7018760195758565e-05, "loss": 0.1029, "num_input_tokens_seen": 24902016, "step": 11530 }, { "epoch": 1.8817292006525286, "grad_norm": 2.029855728149414, "learning_rate": 4.703915171288744e-05, "loss": 0.2404, "num_input_tokens_seen": 24913344, "step": 11535 }, { "epoch": 1.8825448613376836, "grad_norm": 0.9937605857849121, "learning_rate": 4.705954323001632e-05, "loss": 0.2224, "num_input_tokens_seen": 24923840, "step": 11540 }, { "epoch": 1.8833605220228384, "grad_norm": 0.1798640638589859, "learning_rate": 4.707993474714519e-05, "loss": 0.2112, "num_input_tokens_seen": 24933568, "step": 11545 }, { "epoch": 1.8841761827079935, "grad_norm": 0.7377181053161621, "learning_rate": 4.710032626427406e-05, "loss": 0.0957, "num_input_tokens_seen": 24943520, "step": 11550 }, { "epoch": 1.8849918433931485, "grad_norm": 2.409095048904419, "learning_rate": 4.712071778140294e-05, "loss": 0.2188, "num_input_tokens_seen": 24954976, "step": 11555 }, { "epoch": 1.8858075040783033, "grad_norm": 0.329158216714859, "learning_rate": 4.714110929853181e-05, "loss": 0.0505, "num_input_tokens_seen": 24964384, "step": 11560 }, { "epoch": 1.8866231647634584, "grad_norm": 0.5105241537094116, "learning_rate": 4.716150081566069e-05, "loss": 0.0645, "num_input_tokens_seen": 24975584, "step": 11565 }, { "epoch": 1.8874388254486134, "grad_norm": 1.7043907642364502, "learning_rate": 4.718189233278956e-05, "loss": 0.1006, "num_input_tokens_seen": 24987520, "step": 11570 }, { "epoch": 1.8882544861337682, "grad_norm": 0.2540159821510315, "learning_rate": 4.720228384991844e-05, "loss": 0.1379, "num_input_tokens_seen": 24999424, "step": 11575 }, { "epoch": 1.8890701468189235, "grad_norm": 1.9492231607437134, "learning_rate": 4.722267536704731e-05, "loss": 0.2494, "num_input_tokens_seen": 25010624, "step": 11580 }, { "epoch": 1.8898858075040783, "grad_norm": 2.8998844623565674, "learning_rate": 4.7243066884176184e-05, "loss": 0.2562, "num_input_tokens_seen": 25021696, "step": 11585 }, { "epoch": 1.8907014681892331, "grad_norm": 0.9315274357795715, "learning_rate": 4.726345840130506e-05, "loss": 0.1458, "num_input_tokens_seen": 25032640, "step": 11590 }, { "epoch": 1.8915171288743884, "grad_norm": 0.05771252140402794, "learning_rate": 4.728384991843393e-05, "loss": 0.2462, "num_input_tokens_seen": 25043744, "step": 11595 }, { "epoch": 1.8923327895595432, "grad_norm": 1.3344706296920776, "learning_rate": 4.7304241435562806e-05, "loss": 0.2883, "num_input_tokens_seen": 25055136, "step": 11600 }, { "epoch": 1.8931484502446982, "grad_norm": 0.08289391547441483, "learning_rate": 4.7324632952691686e-05, "loss": 0.1484, "num_input_tokens_seen": 25066528, "step": 11605 }, { "epoch": 1.8939641109298533, "grad_norm": 0.551907479763031, "learning_rate": 4.734502446982056e-05, "loss": 0.0519, "num_input_tokens_seen": 25077280, "step": 11610 }, { "epoch": 1.894779771615008, "grad_norm": 1.6282951831817627, "learning_rate": 4.736541598694943e-05, "loss": 0.2005, "num_input_tokens_seen": 25088544, "step": 11615 }, { "epoch": 1.8955954323001631, "grad_norm": 0.21690109372138977, "learning_rate": 4.738580750407831e-05, "loss": 0.0562, "num_input_tokens_seen": 25097344, "step": 11620 }, { "epoch": 1.8964110929853182, "grad_norm": 0.971717119216919, "learning_rate": 4.740619902120718e-05, "loss": 0.2242, "num_input_tokens_seen": 25108736, "step": 11625 }, { "epoch": 1.897226753670473, "grad_norm": 0.8487144112586975, "learning_rate": 4.742659053833605e-05, "loss": 0.1767, "num_input_tokens_seen": 25119648, "step": 11630 }, { "epoch": 1.898042414355628, "grad_norm": 0.30013641715049744, "learning_rate": 4.744698205546493e-05, "loss": 0.0542, "num_input_tokens_seen": 25130048, "step": 11635 }, { "epoch": 1.898858075040783, "grad_norm": 1.4281977415084839, "learning_rate": 4.74673735725938e-05, "loss": 0.137, "num_input_tokens_seen": 25140032, "step": 11640 }, { "epoch": 1.899673735725938, "grad_norm": 0.24099242687225342, "learning_rate": 4.748776508972268e-05, "loss": 0.182, "num_input_tokens_seen": 25151136, "step": 11645 }, { "epoch": 1.900489396411093, "grad_norm": 0.2842320203781128, "learning_rate": 4.750815660685155e-05, "loss": 0.1273, "num_input_tokens_seen": 25161408, "step": 11650 }, { "epoch": 1.901305057096248, "grad_norm": 1.7655251026153564, "learning_rate": 4.7528548123980425e-05, "loss": 0.1764, "num_input_tokens_seen": 25172224, "step": 11655 }, { "epoch": 1.9021207177814028, "grad_norm": 0.7428550720214844, "learning_rate": 4.7548939641109305e-05, "loss": 0.3363, "num_input_tokens_seen": 25183424, "step": 11660 }, { "epoch": 1.902936378466558, "grad_norm": 1.023829698562622, "learning_rate": 4.756933115823817e-05, "loss": 0.1305, "num_input_tokens_seen": 25194944, "step": 11665 }, { "epoch": 1.9037520391517129, "grad_norm": 0.8751794695854187, "learning_rate": 4.7589722675367046e-05, "loss": 0.1273, "num_input_tokens_seen": 25205728, "step": 11670 }, { "epoch": 1.904567699836868, "grad_norm": 0.18843181431293488, "learning_rate": 4.761011419249593e-05, "loss": 0.2004, "num_input_tokens_seen": 25216576, "step": 11675 }, { "epoch": 1.905383360522023, "grad_norm": 0.04006683826446533, "learning_rate": 4.76305057096248e-05, "loss": 0.022, "num_input_tokens_seen": 25227616, "step": 11680 }, { "epoch": 1.9061990212071778, "grad_norm": 0.36442387104034424, "learning_rate": 4.7650897226753674e-05, "loss": 0.1279, "num_input_tokens_seen": 25237856, "step": 11685 }, { "epoch": 1.9070146818923328, "grad_norm": 1.8749706745147705, "learning_rate": 4.767128874388255e-05, "loss": 0.1758, "num_input_tokens_seen": 25248256, "step": 11690 }, { "epoch": 1.9078303425774878, "grad_norm": 0.20050457119941711, "learning_rate": 4.769168026101142e-05, "loss": 0.1159, "num_input_tokens_seen": 25258944, "step": 11695 }, { "epoch": 1.9086460032626427, "grad_norm": 0.28514301776885986, "learning_rate": 4.7712071778140296e-05, "loss": 0.1292, "num_input_tokens_seen": 25269120, "step": 11700 }, { "epoch": 1.9094616639477977, "grad_norm": 1.2212804555892944, "learning_rate": 4.773246329526917e-05, "loss": 0.1402, "num_input_tokens_seen": 25280160, "step": 11705 }, { "epoch": 1.9102773246329527, "grad_norm": 1.144649624824524, "learning_rate": 4.7752854812398043e-05, "loss": 0.1889, "num_input_tokens_seen": 25290112, "step": 11710 }, { "epoch": 1.9110929853181076, "grad_norm": 0.4431595504283905, "learning_rate": 4.7773246329526924e-05, "loss": 0.1671, "num_input_tokens_seen": 25301664, "step": 11715 }, { "epoch": 1.9119086460032626, "grad_norm": 1.0054125785827637, "learning_rate": 4.779363784665579e-05, "loss": 0.2452, "num_input_tokens_seen": 25311264, "step": 11720 }, { "epoch": 1.9127243066884176, "grad_norm": 0.653904378414154, "learning_rate": 4.7814029363784665e-05, "loss": 0.0461, "num_input_tokens_seen": 25321984, "step": 11725 }, { "epoch": 1.9135399673735725, "grad_norm": 0.45711806416511536, "learning_rate": 4.7834420880913546e-05, "loss": 0.0902, "num_input_tokens_seen": 25333664, "step": 11730 }, { "epoch": 1.9143556280587277, "grad_norm": 0.8494669198989868, "learning_rate": 4.785481239804241e-05, "loss": 0.1307, "num_input_tokens_seen": 25342944, "step": 11735 }, { "epoch": 1.9151712887438825, "grad_norm": 0.6909138560295105, "learning_rate": 4.787520391517129e-05, "loss": 0.1505, "num_input_tokens_seen": 25353568, "step": 11740 }, { "epoch": 1.9159869494290374, "grad_norm": 0.4805160462856293, "learning_rate": 4.789559543230017e-05, "loss": 0.1307, "num_input_tokens_seen": 25364736, "step": 11745 }, { "epoch": 1.9168026101141926, "grad_norm": 0.04751911759376526, "learning_rate": 4.791598694942904e-05, "loss": 0.1414, "num_input_tokens_seen": 25374528, "step": 11750 }, { "epoch": 1.9176182707993474, "grad_norm": 0.2531706988811493, "learning_rate": 4.7936378466557915e-05, "loss": 0.1392, "num_input_tokens_seen": 25385440, "step": 11755 }, { "epoch": 1.9184339314845025, "grad_norm": 1.5347001552581787, "learning_rate": 4.795676998368679e-05, "loss": 0.2523, "num_input_tokens_seen": 25397344, "step": 11760 }, { "epoch": 1.9192495921696575, "grad_norm": 0.14662384986877441, "learning_rate": 4.797716150081566e-05, "loss": 0.0663, "num_input_tokens_seen": 25408448, "step": 11765 }, { "epoch": 1.9200652528548123, "grad_norm": 0.4576381742954254, "learning_rate": 4.7997553017944536e-05, "loss": 0.1819, "num_input_tokens_seen": 25418336, "step": 11770 }, { "epoch": 1.9208809135399674, "grad_norm": 0.8053319454193115, "learning_rate": 4.801794453507341e-05, "loss": 0.1884, "num_input_tokens_seen": 25429824, "step": 11775 }, { "epoch": 1.9216965742251224, "grad_norm": 0.28487515449523926, "learning_rate": 4.803833605220229e-05, "loss": 0.2113, "num_input_tokens_seen": 25439584, "step": 11780 }, { "epoch": 1.9225122349102772, "grad_norm": 0.8440260887145996, "learning_rate": 4.805872756933116e-05, "loss": 0.1688, "num_input_tokens_seen": 25451744, "step": 11785 }, { "epoch": 1.9233278955954323, "grad_norm": 0.19171148538589478, "learning_rate": 4.807911908646003e-05, "loss": 0.0382, "num_input_tokens_seen": 25461856, "step": 11790 }, { "epoch": 1.9241435562805873, "grad_norm": 2.601224422454834, "learning_rate": 4.809951060358891e-05, "loss": 0.2893, "num_input_tokens_seen": 25473120, "step": 11795 }, { "epoch": 1.9249592169657421, "grad_norm": 0.9384394884109497, "learning_rate": 4.8119902120717786e-05, "loss": 0.058, "num_input_tokens_seen": 25485632, "step": 11800 }, { "epoch": 1.9257748776508974, "grad_norm": 0.6536388993263245, "learning_rate": 4.814029363784666e-05, "loss": 0.0708, "num_input_tokens_seen": 25496160, "step": 11805 }, { "epoch": 1.9265905383360522, "grad_norm": 1.2873486280441284, "learning_rate": 4.8160685154975533e-05, "loss": 0.1707, "num_input_tokens_seen": 25507712, "step": 11810 }, { "epoch": 1.927406199021207, "grad_norm": 2.269892692565918, "learning_rate": 4.818107667210441e-05, "loss": 0.22, "num_input_tokens_seen": 25519328, "step": 11815 }, { "epoch": 1.9282218597063623, "grad_norm": 0.7041095495223999, "learning_rate": 4.820146818923328e-05, "loss": 0.0654, "num_input_tokens_seen": 25528992, "step": 11820 }, { "epoch": 1.929037520391517, "grad_norm": 0.38352537155151367, "learning_rate": 4.8221859706362155e-05, "loss": 0.1834, "num_input_tokens_seen": 25540032, "step": 11825 }, { "epoch": 1.9298531810766721, "grad_norm": 3.602619171142578, "learning_rate": 4.824225122349103e-05, "loss": 0.1455, "num_input_tokens_seen": 25551392, "step": 11830 }, { "epoch": 1.9306688417618272, "grad_norm": 1.5224151611328125, "learning_rate": 4.826264274061991e-05, "loss": 0.129, "num_input_tokens_seen": 25561856, "step": 11835 }, { "epoch": 1.931484502446982, "grad_norm": 0.6132806539535522, "learning_rate": 4.8283034257748776e-05, "loss": 0.1636, "num_input_tokens_seen": 25571712, "step": 11840 }, { "epoch": 1.932300163132137, "grad_norm": 0.1931745857000351, "learning_rate": 4.830342577487765e-05, "loss": 0.0843, "num_input_tokens_seen": 25581088, "step": 11845 }, { "epoch": 1.933115823817292, "grad_norm": 0.07220447063446045, "learning_rate": 4.832381729200653e-05, "loss": 0.0296, "num_input_tokens_seen": 25593376, "step": 11850 }, { "epoch": 1.933931484502447, "grad_norm": 1.225407600402832, "learning_rate": 4.83442088091354e-05, "loss": 0.2115, "num_input_tokens_seen": 25603488, "step": 11855 }, { "epoch": 1.934747145187602, "grad_norm": 0.9908203482627869, "learning_rate": 4.836460032626428e-05, "loss": 0.2995, "num_input_tokens_seen": 25613216, "step": 11860 }, { "epoch": 1.935562805872757, "grad_norm": 0.8625867962837219, "learning_rate": 4.838499184339315e-05, "loss": 0.0772, "num_input_tokens_seen": 25624096, "step": 11865 }, { "epoch": 1.9363784665579118, "grad_norm": 0.21067066490650177, "learning_rate": 4.8405383360522026e-05, "loss": 0.1797, "num_input_tokens_seen": 25633760, "step": 11870 }, { "epoch": 1.9371941272430668, "grad_norm": 0.42124685645103455, "learning_rate": 4.84257748776509e-05, "loss": 0.1337, "num_input_tokens_seen": 25645216, "step": 11875 }, { "epoch": 1.9380097879282219, "grad_norm": 0.40340545773506165, "learning_rate": 4.8446166394779774e-05, "loss": 0.256, "num_input_tokens_seen": 25656288, "step": 11880 }, { "epoch": 1.9388254486133767, "grad_norm": 0.29645222425460815, "learning_rate": 4.846655791190865e-05, "loss": 0.188, "num_input_tokens_seen": 25667488, "step": 11885 }, { "epoch": 1.939641109298532, "grad_norm": 2.2555112838745117, "learning_rate": 4.848694942903752e-05, "loss": 0.1289, "num_input_tokens_seen": 25678848, "step": 11890 }, { "epoch": 1.9404567699836868, "grad_norm": 0.5733392238616943, "learning_rate": 4.8507340946166395e-05, "loss": 0.153, "num_input_tokens_seen": 25689280, "step": 11895 }, { "epoch": 1.9412724306688418, "grad_norm": 1.3019911050796509, "learning_rate": 4.852773246329527e-05, "loss": 0.1132, "num_input_tokens_seen": 25699552, "step": 11900 }, { "epoch": 1.9420880913539968, "grad_norm": 0.07448702305555344, "learning_rate": 4.854812398042415e-05, "loss": 0.1822, "num_input_tokens_seen": 25709920, "step": 11905 }, { "epoch": 1.9429037520391517, "grad_norm": 2.143873691558838, "learning_rate": 4.856851549755302e-05, "loss": 0.2664, "num_input_tokens_seen": 25722592, "step": 11910 }, { "epoch": 1.9437194127243067, "grad_norm": 0.252199649810791, "learning_rate": 4.85889070146819e-05, "loss": 0.0466, "num_input_tokens_seen": 25733696, "step": 11915 }, { "epoch": 1.9445350734094617, "grad_norm": 0.6101998686790466, "learning_rate": 4.860929853181077e-05, "loss": 0.225, "num_input_tokens_seen": 25743712, "step": 11920 }, { "epoch": 1.9453507340946166, "grad_norm": 0.17419524490833282, "learning_rate": 4.862969004893964e-05, "loss": 0.2188, "num_input_tokens_seen": 25753952, "step": 11925 }, { "epoch": 1.9461663947797716, "grad_norm": 0.43358880281448364, "learning_rate": 4.865008156606852e-05, "loss": 0.1765, "num_input_tokens_seen": 25764896, "step": 11930 }, { "epoch": 1.9469820554649266, "grad_norm": 0.9988438487052917, "learning_rate": 4.867047308319739e-05, "loss": 0.1477, "num_input_tokens_seen": 25774944, "step": 11935 }, { "epoch": 1.9477977161500815, "grad_norm": 0.10932786762714386, "learning_rate": 4.8690864600326266e-05, "loss": 0.1058, "num_input_tokens_seen": 25785472, "step": 11940 }, { "epoch": 1.9486133768352365, "grad_norm": 0.6316084265708923, "learning_rate": 4.871125611745514e-05, "loss": 0.2491, "num_input_tokens_seen": 25796448, "step": 11945 }, { "epoch": 1.9494290375203915, "grad_norm": 0.1388283222913742, "learning_rate": 4.8731647634584014e-05, "loss": 0.111, "num_input_tokens_seen": 25807168, "step": 11950 }, { "epoch": 1.9502446982055464, "grad_norm": 1.6179802417755127, "learning_rate": 4.8752039151712895e-05, "loss": 0.1306, "num_input_tokens_seen": 25817792, "step": 11955 }, { "epoch": 1.9510603588907016, "grad_norm": 0.3803667724132538, "learning_rate": 4.877243066884176e-05, "loss": 0.2105, "num_input_tokens_seen": 25827392, "step": 11960 }, { "epoch": 1.9518760195758564, "grad_norm": 0.193934366106987, "learning_rate": 4.8792822185970636e-05, "loss": 0.0492, "num_input_tokens_seen": 25838720, "step": 11965 }, { "epoch": 1.9526916802610113, "grad_norm": 1.696301817893982, "learning_rate": 4.8813213703099516e-05, "loss": 0.0885, "num_input_tokens_seen": 25850496, "step": 11970 }, { "epoch": 1.9535073409461665, "grad_norm": 0.25832194089889526, "learning_rate": 4.883360522022839e-05, "loss": 0.1424, "num_input_tokens_seen": 25862240, "step": 11975 }, { "epoch": 1.9543230016313213, "grad_norm": 0.5201337337493896, "learning_rate": 4.8853996737357264e-05, "loss": 0.2721, "num_input_tokens_seen": 25873280, "step": 11980 }, { "epoch": 1.9551386623164764, "grad_norm": 0.12405991554260254, "learning_rate": 4.887438825448614e-05, "loss": 0.0677, "num_input_tokens_seen": 25883872, "step": 11985 }, { "epoch": 1.9559543230016314, "grad_norm": 0.2146562933921814, "learning_rate": 4.889477977161501e-05, "loss": 0.0338, "num_input_tokens_seen": 25895424, "step": 11990 }, { "epoch": 1.9567699836867862, "grad_norm": 2.082291603088379, "learning_rate": 4.8915171288743885e-05, "loss": 0.2499, "num_input_tokens_seen": 25906944, "step": 11995 }, { "epoch": 1.9575856443719413, "grad_norm": 0.9380645751953125, "learning_rate": 4.893556280587276e-05, "loss": 0.1443, "num_input_tokens_seen": 25918720, "step": 12000 }, { "epoch": 1.9584013050570963, "grad_norm": 0.7157341241836548, "learning_rate": 4.895595432300163e-05, "loss": 0.1085, "num_input_tokens_seen": 25929248, "step": 12005 }, { "epoch": 1.9592169657422511, "grad_norm": 1.3300206661224365, "learning_rate": 4.897634584013051e-05, "loss": 0.25, "num_input_tokens_seen": 25939040, "step": 12010 }, { "epoch": 1.9600326264274062, "grad_norm": 0.16411761939525604, "learning_rate": 4.899673735725938e-05, "loss": 0.1591, "num_input_tokens_seen": 25949024, "step": 12015 }, { "epoch": 1.9608482871125612, "grad_norm": 0.1111067533493042, "learning_rate": 4.9017128874388254e-05, "loss": 0.2755, "num_input_tokens_seen": 25959264, "step": 12020 }, { "epoch": 1.961663947797716, "grad_norm": 0.8294567465782166, "learning_rate": 4.9037520391517135e-05, "loss": 0.1047, "num_input_tokens_seen": 25970432, "step": 12025 }, { "epoch": 1.9624796084828713, "grad_norm": 2.276531457901001, "learning_rate": 4.9057911908646e-05, "loss": 0.2231, "num_input_tokens_seen": 25981856, "step": 12030 }, { "epoch": 1.963295269168026, "grad_norm": 0.6030580401420593, "learning_rate": 4.907830342577488e-05, "loss": 0.3633, "num_input_tokens_seen": 25993696, "step": 12035 }, { "epoch": 1.964110929853181, "grad_norm": 1.1170111894607544, "learning_rate": 4.9098694942903756e-05, "loss": 0.1952, "num_input_tokens_seen": 26004736, "step": 12040 }, { "epoch": 1.9649265905383362, "grad_norm": 0.3131396472454071, "learning_rate": 4.9119086460032623e-05, "loss": 0.1113, "num_input_tokens_seen": 26016480, "step": 12045 }, { "epoch": 1.965742251223491, "grad_norm": 0.8320696353912354, "learning_rate": 4.9139477977161504e-05, "loss": 0.05, "num_input_tokens_seen": 26027072, "step": 12050 }, { "epoch": 1.966557911908646, "grad_norm": 0.6892486810684204, "learning_rate": 4.915986949429038e-05, "loss": 0.1144, "num_input_tokens_seen": 26036096, "step": 12055 }, { "epoch": 1.967373572593801, "grad_norm": 0.25712358951568604, "learning_rate": 4.918026101141925e-05, "loss": 0.1242, "num_input_tokens_seen": 26047264, "step": 12060 }, { "epoch": 1.968189233278956, "grad_norm": 0.29494139552116394, "learning_rate": 4.9200652528548126e-05, "loss": 0.0978, "num_input_tokens_seen": 26058176, "step": 12065 }, { "epoch": 1.969004893964111, "grad_norm": 0.523445725440979, "learning_rate": 4.9221044045677e-05, "loss": 0.082, "num_input_tokens_seen": 26069664, "step": 12070 }, { "epoch": 1.969820554649266, "grad_norm": 0.12973642349243164, "learning_rate": 4.924143556280588e-05, "loss": 0.1171, "num_input_tokens_seen": 26080640, "step": 12075 }, { "epoch": 1.9706362153344208, "grad_norm": 1.0853031873703003, "learning_rate": 4.926182707993475e-05, "loss": 0.0773, "num_input_tokens_seen": 26091360, "step": 12080 }, { "epoch": 1.9714518760195758, "grad_norm": 0.3788798153400421, "learning_rate": 4.928221859706362e-05, "loss": 0.0668, "num_input_tokens_seen": 26102080, "step": 12085 }, { "epoch": 1.9722675367047309, "grad_norm": 2.5526182651519775, "learning_rate": 4.93026101141925e-05, "loss": 0.2484, "num_input_tokens_seen": 26112192, "step": 12090 }, { "epoch": 1.9730831973898857, "grad_norm": 2.472505569458008, "learning_rate": 4.9323001631321375e-05, "loss": 0.2123, "num_input_tokens_seen": 26122496, "step": 12095 }, { "epoch": 1.9738988580750407, "grad_norm": 1.5319981575012207, "learning_rate": 4.934339314845024e-05, "loss": 0.1097, "num_input_tokens_seen": 26134048, "step": 12100 }, { "epoch": 1.9747145187601958, "grad_norm": 0.2454528510570526, "learning_rate": 4.936378466557912e-05, "loss": 0.21, "num_input_tokens_seen": 26144192, "step": 12105 }, { "epoch": 1.9755301794453506, "grad_norm": 0.35897478461265564, "learning_rate": 4.9384176182708e-05, "loss": 0.1166, "num_input_tokens_seen": 26155328, "step": 12110 }, { "epoch": 1.9763458401305058, "grad_norm": 0.1965816766023636, "learning_rate": 4.940456769983687e-05, "loss": 0.2005, "num_input_tokens_seen": 26166272, "step": 12115 }, { "epoch": 1.9771615008156607, "grad_norm": 0.9297334551811218, "learning_rate": 4.9424959216965744e-05, "loss": 0.1899, "num_input_tokens_seen": 26175520, "step": 12120 }, { "epoch": 1.9779771615008157, "grad_norm": 1.4750139713287354, "learning_rate": 4.944535073409462e-05, "loss": 0.3454, "num_input_tokens_seen": 26187008, "step": 12125 }, { "epoch": 1.9787928221859707, "grad_norm": 2.686476469039917, "learning_rate": 4.94657422512235e-05, "loss": 0.1631, "num_input_tokens_seen": 26198016, "step": 12130 }, { "epoch": 1.9796084828711256, "grad_norm": 0.06574622541666031, "learning_rate": 4.9486133768352366e-05, "loss": 0.2184, "num_input_tokens_seen": 26209440, "step": 12135 }, { "epoch": 1.9804241435562806, "grad_norm": 0.3465072512626648, "learning_rate": 4.950652528548124e-05, "loss": 0.0828, "num_input_tokens_seen": 26220736, "step": 12140 }, { "epoch": 1.9812398042414356, "grad_norm": 1.8836584091186523, "learning_rate": 4.952691680261012e-05, "loss": 0.1023, "num_input_tokens_seen": 26231456, "step": 12145 }, { "epoch": 1.9820554649265905, "grad_norm": 0.37153756618499756, "learning_rate": 4.954730831973899e-05, "loss": 0.0698, "num_input_tokens_seen": 26241568, "step": 12150 }, { "epoch": 1.9828711256117455, "grad_norm": 2.0890307426452637, "learning_rate": 4.956769983686787e-05, "loss": 0.3131, "num_input_tokens_seen": 26252480, "step": 12155 }, { "epoch": 1.9836867862969005, "grad_norm": 0.307858943939209, "learning_rate": 4.958809135399674e-05, "loss": 0.0926, "num_input_tokens_seen": 26263264, "step": 12160 }, { "epoch": 1.9845024469820554, "grad_norm": 1.9282387495040894, "learning_rate": 4.9608482871125616e-05, "loss": 0.1394, "num_input_tokens_seen": 26274496, "step": 12165 }, { "epoch": 1.9853181076672104, "grad_norm": 0.5049535036087036, "learning_rate": 4.962887438825449e-05, "loss": 0.1466, "num_input_tokens_seen": 26285152, "step": 12170 }, { "epoch": 1.9861337683523654, "grad_norm": 1.044494867324829, "learning_rate": 4.964926590538336e-05, "loss": 0.2252, "num_input_tokens_seen": 26296064, "step": 12175 }, { "epoch": 1.9869494290375203, "grad_norm": 1.0594651699066162, "learning_rate": 4.966965742251224e-05, "loss": 0.1369, "num_input_tokens_seen": 26305984, "step": 12180 }, { "epoch": 1.9877650897226755, "grad_norm": 0.6264002323150635, "learning_rate": 4.969004893964111e-05, "loss": 0.1074, "num_input_tokens_seen": 26316896, "step": 12185 }, { "epoch": 1.9885807504078303, "grad_norm": 0.36705246567726135, "learning_rate": 4.9710440456769985e-05, "loss": 0.1038, "num_input_tokens_seen": 26326976, "step": 12190 }, { "epoch": 1.9893964110929854, "grad_norm": 0.6576601266860962, "learning_rate": 4.973083197389886e-05, "loss": 0.2255, "num_input_tokens_seen": 26337504, "step": 12195 }, { "epoch": 1.9902120717781404, "grad_norm": 1.4088600873947144, "learning_rate": 4.975122349102774e-05, "loss": 0.171, "num_input_tokens_seen": 26348608, "step": 12200 }, { "epoch": 1.9910277324632952, "grad_norm": 3.0189504623413086, "learning_rate": 4.9771615008156606e-05, "loss": 0.2622, "num_input_tokens_seen": 26358240, "step": 12205 }, { "epoch": 1.9918433931484503, "grad_norm": 0.06645012646913528, "learning_rate": 4.979200652528549e-05, "loss": 0.0985, "num_input_tokens_seen": 26369568, "step": 12210 }, { "epoch": 1.9926590538336053, "grad_norm": 0.9278361797332764, "learning_rate": 4.981239804241436e-05, "loss": 0.1725, "num_input_tokens_seen": 26379936, "step": 12215 }, { "epoch": 1.9934747145187601, "grad_norm": 0.919269859790802, "learning_rate": 4.983278955954323e-05, "loss": 0.1633, "num_input_tokens_seen": 26390624, "step": 12220 }, { "epoch": 1.9942903752039152, "grad_norm": 0.31123337149620056, "learning_rate": 4.985318107667211e-05, "loss": 0.1151, "num_input_tokens_seen": 26399616, "step": 12225 }, { "epoch": 1.9951060358890702, "grad_norm": 0.4787385165691376, "learning_rate": 4.987357259380098e-05, "loss": 0.1245, "num_input_tokens_seen": 26410208, "step": 12230 }, { "epoch": 1.995921696574225, "grad_norm": 0.502166748046875, "learning_rate": 4.9893964110929856e-05, "loss": 0.0903, "num_input_tokens_seen": 26420160, "step": 12235 }, { "epoch": 1.99673735725938, "grad_norm": 0.6089902520179749, "learning_rate": 4.991435562805873e-05, "loss": 0.1087, "num_input_tokens_seen": 26430432, "step": 12240 }, { "epoch": 1.997553017944535, "grad_norm": 0.4120921492576599, "learning_rate": 4.9934747145187603e-05, "loss": 0.095, "num_input_tokens_seen": 26440128, "step": 12245 }, { "epoch": 1.99836867862969, "grad_norm": 1.077731966972351, "learning_rate": 4.9955138662316484e-05, "loss": 0.0832, "num_input_tokens_seen": 26450496, "step": 12250 }, { "epoch": 1.9991843393148452, "grad_norm": 1.3841098546981812, "learning_rate": 4.997553017944535e-05, "loss": 0.1214, "num_input_tokens_seen": 26461824, "step": 12255 }, { "epoch": 2.0, "grad_norm": 1.332831621170044, "learning_rate": 4.9995921696574225e-05, "loss": 0.1781, "num_input_tokens_seen": 26471216, "step": 12260 }, { "epoch": 2.0, "eval_loss": 0.1503559798002243, "eval_runtime": 90.5738, "eval_samples_per_second": 30.086, "eval_steps_per_second": 7.53, "num_input_tokens_seen": 26471216, "step": 12260 }, { "epoch": 2.000815660685155, "grad_norm": 1.3718738555908203, "learning_rate": 4.999999983786985e-05, "loss": 0.1557, "num_input_tokens_seen": 26481904, "step": 12265 }, { "epoch": 2.00163132137031, "grad_norm": 2.0179665088653564, "learning_rate": 4.9999999179216125e-05, "loss": 0.2443, "num_input_tokens_seen": 26491344, "step": 12270 }, { "epoch": 2.002446982055465, "grad_norm": 0.2334105223417282, "learning_rate": 4.99999980139057e-05, "loss": 0.0782, "num_input_tokens_seen": 26502160, "step": 12275 }, { "epoch": 2.0032626427406197, "grad_norm": 1.334969401359558, "learning_rate": 4.9999996341938607e-05, "loss": 0.1123, "num_input_tokens_seen": 26512816, "step": 12280 }, { "epoch": 2.004078303425775, "grad_norm": 0.07841327041387558, "learning_rate": 4.999999416331487e-05, "loss": 0.0364, "num_input_tokens_seen": 26522640, "step": 12285 }, { "epoch": 2.00489396411093, "grad_norm": 0.09741098433732986, "learning_rate": 4.999999147803453e-05, "loss": 0.2189, "num_input_tokens_seen": 26533072, "step": 12290 }, { "epoch": 2.0057096247960846, "grad_norm": 0.09951270371675491, "learning_rate": 4.999998828609765e-05, "loss": 0.1287, "num_input_tokens_seen": 26543408, "step": 12295 }, { "epoch": 2.00652528548124, "grad_norm": 0.18058034777641296, "learning_rate": 4.99999845875043e-05, "loss": 0.1728, "num_input_tokens_seen": 26554320, "step": 12300 }, { "epoch": 2.0073409461663947, "grad_norm": 0.1544305384159088, "learning_rate": 4.999998038225454e-05, "loss": 0.119, "num_input_tokens_seen": 26565008, "step": 12305 }, { "epoch": 2.00815660685155, "grad_norm": 0.057755205780267715, "learning_rate": 4.999997567034846e-05, "loss": 0.0603, "num_input_tokens_seen": 26576912, "step": 12310 }, { "epoch": 2.0089722675367048, "grad_norm": 0.9712201356887817, "learning_rate": 4.999997045178617e-05, "loss": 0.4535, "num_input_tokens_seen": 26586800, "step": 12315 }, { "epoch": 2.0097879282218596, "grad_norm": 0.485420286655426, "learning_rate": 4.999996472656776e-05, "loss": 0.1063, "num_input_tokens_seen": 26598160, "step": 12320 }, { "epoch": 2.010603588907015, "grad_norm": 1.2764307260513306, "learning_rate": 4.9999958494693344e-05, "loss": 0.2034, "num_input_tokens_seen": 26609136, "step": 12325 }, { "epoch": 2.0114192495921697, "grad_norm": 0.33138757944107056, "learning_rate": 4.999995175616306e-05, "loss": 0.1162, "num_input_tokens_seen": 26620752, "step": 12330 }, { "epoch": 2.0122349102773245, "grad_norm": 1.9361389875411987, "learning_rate": 4.999994451097704e-05, "loss": 0.2495, "num_input_tokens_seen": 26630256, "step": 12335 }, { "epoch": 2.0130505709624797, "grad_norm": 1.5119401216506958, "learning_rate": 4.9999936759135424e-05, "loss": 0.3056, "num_input_tokens_seen": 26641136, "step": 12340 }, { "epoch": 2.0138662316476346, "grad_norm": 0.6831423044204712, "learning_rate": 4.9999928500638375e-05, "loss": 0.088, "num_input_tokens_seen": 26651824, "step": 12345 }, { "epoch": 2.0146818923327894, "grad_norm": 1.3660575151443481, "learning_rate": 4.999991973548607e-05, "loss": 0.2902, "num_input_tokens_seen": 26662064, "step": 12350 }, { "epoch": 2.0154975530179446, "grad_norm": 0.5035452842712402, "learning_rate": 4.9999910463678666e-05, "loss": 0.1435, "num_input_tokens_seen": 26673104, "step": 12355 }, { "epoch": 2.0163132137030995, "grad_norm": 1.3726166486740112, "learning_rate": 4.9999900685216365e-05, "loss": 0.2827, "num_input_tokens_seen": 26682736, "step": 12360 }, { "epoch": 2.0171288743882543, "grad_norm": 0.23580320179462433, "learning_rate": 4.999989040009936e-05, "loss": 0.1653, "num_input_tokens_seen": 26694224, "step": 12365 }, { "epoch": 2.0179445350734095, "grad_norm": 0.34302422404289246, "learning_rate": 4.999987960832787e-05, "loss": 0.0957, "num_input_tokens_seen": 26704880, "step": 12370 }, { "epoch": 2.0187601957585644, "grad_norm": 0.9015535712242126, "learning_rate": 4.9999868309902096e-05, "loss": 0.2407, "num_input_tokens_seen": 26713648, "step": 12375 }, { "epoch": 2.0195758564437196, "grad_norm": 0.04899528995156288, "learning_rate": 4.999985650482229e-05, "loss": 0.0825, "num_input_tokens_seen": 26724560, "step": 12380 }, { "epoch": 2.0203915171288744, "grad_norm": 1.0355181694030762, "learning_rate": 4.999984419308866e-05, "loss": 0.156, "num_input_tokens_seen": 26735344, "step": 12385 }, { "epoch": 2.0212071778140293, "grad_norm": 1.828096866607666, "learning_rate": 4.999983137470148e-05, "loss": 0.1512, "num_input_tokens_seen": 26746288, "step": 12390 }, { "epoch": 2.0220228384991845, "grad_norm": 1.6852869987487793, "learning_rate": 4.9999818049661004e-05, "loss": 0.2055, "num_input_tokens_seen": 26757936, "step": 12395 }, { "epoch": 2.0228384991843393, "grad_norm": 0.5734047293663025, "learning_rate": 4.9999804217967496e-05, "loss": 0.1474, "num_input_tokens_seen": 26770256, "step": 12400 }, { "epoch": 2.023654159869494, "grad_norm": 1.375340461730957, "learning_rate": 4.9999789879621244e-05, "loss": 0.0865, "num_input_tokens_seen": 26780240, "step": 12405 }, { "epoch": 2.0244698205546494, "grad_norm": 0.4310632050037384, "learning_rate": 4.999977503462254e-05, "loss": 0.139, "num_input_tokens_seen": 26791824, "step": 12410 }, { "epoch": 2.0252854812398042, "grad_norm": 0.14454692602157593, "learning_rate": 4.999975968297167e-05, "loss": 0.1316, "num_input_tokens_seen": 26802576, "step": 12415 }, { "epoch": 2.026101141924959, "grad_norm": 0.28228336572647095, "learning_rate": 4.9999743824668966e-05, "loss": 0.163, "num_input_tokens_seen": 26814256, "step": 12420 }, { "epoch": 2.0269168026101143, "grad_norm": 1.4444353580474854, "learning_rate": 4.999972745971473e-05, "loss": 0.3367, "num_input_tokens_seen": 26824528, "step": 12425 }, { "epoch": 2.027732463295269, "grad_norm": 0.8170751929283142, "learning_rate": 4.999971058810931e-05, "loss": 0.0818, "num_input_tokens_seen": 26836816, "step": 12430 }, { "epoch": 2.028548123980424, "grad_norm": 0.5945093631744385, "learning_rate": 4.999969320985304e-05, "loss": 0.1862, "num_input_tokens_seen": 26846800, "step": 12435 }, { "epoch": 2.029363784665579, "grad_norm": 0.6102951169013977, "learning_rate": 4.9999675324946274e-05, "loss": 0.111, "num_input_tokens_seen": 26858288, "step": 12440 }, { "epoch": 2.030179445350734, "grad_norm": 0.6812986135482788, "learning_rate": 4.999965693338937e-05, "loss": 0.1083, "num_input_tokens_seen": 26868624, "step": 12445 }, { "epoch": 2.0309951060358893, "grad_norm": 0.37715068459510803, "learning_rate": 4.99996380351827e-05, "loss": 0.2035, "num_input_tokens_seen": 26879536, "step": 12450 }, { "epoch": 2.031810766721044, "grad_norm": 0.9757757186889648, "learning_rate": 4.999961863032665e-05, "loss": 0.4217, "num_input_tokens_seen": 26890320, "step": 12455 }, { "epoch": 2.032626427406199, "grad_norm": 0.2776673138141632, "learning_rate": 4.9999598718821625e-05, "loss": 0.1528, "num_input_tokens_seen": 26900784, "step": 12460 }, { "epoch": 2.033442088091354, "grad_norm": 0.2966485917568207, "learning_rate": 4.999957830066801e-05, "loss": 0.0972, "num_input_tokens_seen": 26912944, "step": 12465 }, { "epoch": 2.034257748776509, "grad_norm": 0.07258812338113785, "learning_rate": 4.9999557375866226e-05, "loss": 0.1723, "num_input_tokens_seen": 26923792, "step": 12470 }, { "epoch": 2.035073409461664, "grad_norm": 0.34596991539001465, "learning_rate": 4.9999535944416695e-05, "loss": 0.0854, "num_input_tokens_seen": 26934736, "step": 12475 }, { "epoch": 2.035889070146819, "grad_norm": 0.1928817331790924, "learning_rate": 4.999951400631987e-05, "loss": 0.1276, "num_input_tokens_seen": 26945840, "step": 12480 }, { "epoch": 2.036704730831974, "grad_norm": 0.1019616350531578, "learning_rate": 4.999949156157616e-05, "loss": 0.1116, "num_input_tokens_seen": 26956304, "step": 12485 }, { "epoch": 2.0375203915171287, "grad_norm": 0.762148380279541, "learning_rate": 4.999946861018605e-05, "loss": 0.0935, "num_input_tokens_seen": 26967632, "step": 12490 }, { "epoch": 2.038336052202284, "grad_norm": 0.41267532110214233, "learning_rate": 4.999944515214999e-05, "loss": 0.1392, "num_input_tokens_seen": 26978160, "step": 12495 }, { "epoch": 2.039151712887439, "grad_norm": 0.6989384293556213, "learning_rate": 4.999942118746847e-05, "loss": 0.1941, "num_input_tokens_seen": 26987568, "step": 12500 }, { "epoch": 2.0399673735725936, "grad_norm": 0.24405083060264587, "learning_rate": 4.999939671614195e-05, "loss": 0.1103, "num_input_tokens_seen": 26999760, "step": 12505 }, { "epoch": 2.040783034257749, "grad_norm": 1.1127533912658691, "learning_rate": 4.999937173817095e-05, "loss": 0.0791, "num_input_tokens_seen": 27011696, "step": 12510 }, { "epoch": 2.0415986949429037, "grad_norm": 0.4441587030887604, "learning_rate": 4.9999346253555965e-05, "loss": 0.1255, "num_input_tokens_seen": 27022608, "step": 12515 }, { "epoch": 2.0424143556280585, "grad_norm": 0.5806522369384766, "learning_rate": 4.9999320262297524e-05, "loss": 0.1592, "num_input_tokens_seen": 27033680, "step": 12520 }, { "epoch": 2.0432300163132138, "grad_norm": 0.9869837760925293, "learning_rate": 4.999929376439614e-05, "loss": 0.0627, "num_input_tokens_seen": 27045136, "step": 12525 }, { "epoch": 2.0440456769983686, "grad_norm": 0.5236803889274597, "learning_rate": 4.999926675985235e-05, "loss": 0.2009, "num_input_tokens_seen": 27055760, "step": 12530 }, { "epoch": 2.044861337683524, "grad_norm": 0.2321624904870987, "learning_rate": 4.99992392486667e-05, "loss": 0.1045, "num_input_tokens_seen": 27066352, "step": 12535 }, { "epoch": 2.0456769983686787, "grad_norm": 1.0329424142837524, "learning_rate": 4.999921123083976e-05, "loss": 0.1467, "num_input_tokens_seen": 27077872, "step": 12540 }, { "epoch": 2.0464926590538335, "grad_norm": 0.7467964291572571, "learning_rate": 4.9999182706372086e-05, "loss": 0.1096, "num_input_tokens_seen": 27088912, "step": 12545 }, { "epoch": 2.0473083197389887, "grad_norm": 0.06189372017979622, "learning_rate": 4.9999153675264266e-05, "loss": 0.0363, "num_input_tokens_seen": 27099216, "step": 12550 }, { "epoch": 2.0481239804241436, "grad_norm": 0.3193133473396301, "learning_rate": 4.999912413751688e-05, "loss": 0.1758, "num_input_tokens_seen": 27109712, "step": 12555 }, { "epoch": 2.0489396411092984, "grad_norm": 0.6345974206924438, "learning_rate": 4.999909409313053e-05, "loss": 0.0625, "num_input_tokens_seen": 27120304, "step": 12560 }, { "epoch": 2.0497553017944536, "grad_norm": 0.24144768714904785, "learning_rate": 4.999906354210583e-05, "loss": 0.1318, "num_input_tokens_seen": 27130992, "step": 12565 }, { "epoch": 2.0505709624796085, "grad_norm": 1.3078820705413818, "learning_rate": 4.9999032484443385e-05, "loss": 0.1234, "num_input_tokens_seen": 27141136, "step": 12570 }, { "epoch": 2.0513866231647633, "grad_norm": 0.4913622736930847, "learning_rate": 4.999900092014384e-05, "loss": 0.1144, "num_input_tokens_seen": 27152208, "step": 12575 }, { "epoch": 2.0522022838499185, "grad_norm": 0.38893669843673706, "learning_rate": 4.999896884920782e-05, "loss": 0.194, "num_input_tokens_seen": 27164016, "step": 12580 }, { "epoch": 2.0530179445350734, "grad_norm": 1.1179605722427368, "learning_rate": 4.999893627163599e-05, "loss": 0.1384, "num_input_tokens_seen": 27174800, "step": 12585 }, { "epoch": 2.053833605220228, "grad_norm": 1.1225775480270386, "learning_rate": 4.9998903187429006e-05, "loss": 0.168, "num_input_tokens_seen": 27184784, "step": 12590 }, { "epoch": 2.0546492659053834, "grad_norm": 0.48117563128471375, "learning_rate": 4.9998869596587526e-05, "loss": 0.072, "num_input_tokens_seen": 27195600, "step": 12595 }, { "epoch": 2.0554649265905383, "grad_norm": 0.3425010144710541, "learning_rate": 4.999883549911225e-05, "loss": 0.1277, "num_input_tokens_seen": 27206416, "step": 12600 }, { "epoch": 2.0562805872756935, "grad_norm": 0.772413969039917, "learning_rate": 4.999880089500385e-05, "loss": 0.1573, "num_input_tokens_seen": 27217360, "step": 12605 }, { "epoch": 2.0570962479608483, "grad_norm": 0.5228245258331299, "learning_rate": 4.999876578426304e-05, "loss": 0.211, "num_input_tokens_seen": 27228304, "step": 12610 }, { "epoch": 2.057911908646003, "grad_norm": 0.36365464329719543, "learning_rate": 4.999873016689053e-05, "loss": 0.1691, "num_input_tokens_seen": 27240208, "step": 12615 }, { "epoch": 2.0587275693311584, "grad_norm": 0.22391293942928314, "learning_rate": 4.999869404288704e-05, "loss": 0.0686, "num_input_tokens_seen": 27250256, "step": 12620 }, { "epoch": 2.0595432300163132, "grad_norm": 0.862288773059845, "learning_rate": 4.99986574122533e-05, "loss": 0.1223, "num_input_tokens_seen": 27260080, "step": 12625 }, { "epoch": 2.060358890701468, "grad_norm": 0.525518536567688, "learning_rate": 4.999862027499006e-05, "loss": 0.0602, "num_input_tokens_seen": 27272112, "step": 12630 }, { "epoch": 2.0611745513866233, "grad_norm": 0.33869409561157227, "learning_rate": 4.9998582631098055e-05, "loss": 0.1519, "num_input_tokens_seen": 27282672, "step": 12635 }, { "epoch": 2.061990212071778, "grad_norm": 0.5310342907905579, "learning_rate": 4.999854448057807e-05, "loss": 0.0594, "num_input_tokens_seen": 27294704, "step": 12640 }, { "epoch": 2.062805872756933, "grad_norm": 0.29946836829185486, "learning_rate": 4.999850582343087e-05, "loss": 0.0837, "num_input_tokens_seen": 27306320, "step": 12645 }, { "epoch": 2.063621533442088, "grad_norm": 0.3704225718975067, "learning_rate": 4.999846665965723e-05, "loss": 0.1373, "num_input_tokens_seen": 27316720, "step": 12650 }, { "epoch": 2.064437194127243, "grad_norm": 0.2895393967628479, "learning_rate": 4.9998426989257955e-05, "loss": 0.0288, "num_input_tokens_seen": 27327408, "step": 12655 }, { "epoch": 2.065252854812398, "grad_norm": 1.1409552097320557, "learning_rate": 4.9998386812233846e-05, "loss": 0.1496, "num_input_tokens_seen": 27338672, "step": 12660 }, { "epoch": 2.066068515497553, "grad_norm": 1.2979241609573364, "learning_rate": 4.9998346128585705e-05, "loss": 0.2875, "num_input_tokens_seen": 27350352, "step": 12665 }, { "epoch": 2.066884176182708, "grad_norm": 0.15666648745536804, "learning_rate": 4.999830493831438e-05, "loss": 0.128, "num_input_tokens_seen": 27360432, "step": 12670 }, { "epoch": 2.067699836867863, "grad_norm": 0.8380452394485474, "learning_rate": 4.9998263241420684e-05, "loss": 0.2261, "num_input_tokens_seen": 27369840, "step": 12675 }, { "epoch": 2.068515497553018, "grad_norm": 0.3883110582828522, "learning_rate": 4.9998221037905466e-05, "loss": 0.0528, "num_input_tokens_seen": 27380016, "step": 12680 }, { "epoch": 2.069331158238173, "grad_norm": 0.7116630673408508, "learning_rate": 4.9998178327769595e-05, "loss": 0.1, "num_input_tokens_seen": 27391504, "step": 12685 }, { "epoch": 2.070146818923328, "grad_norm": 0.20262379944324493, "learning_rate": 4.9998135111013934e-05, "loss": 0.0455, "num_input_tokens_seen": 27402064, "step": 12690 }, { "epoch": 2.070962479608483, "grad_norm": 0.09938604384660721, "learning_rate": 4.999809138763934e-05, "loss": 0.0829, "num_input_tokens_seen": 27413808, "step": 12695 }, { "epoch": 2.0717781402936377, "grad_norm": 0.5810298919677734, "learning_rate": 4.999804715764672e-05, "loss": 0.2452, "num_input_tokens_seen": 27425520, "step": 12700 }, { "epoch": 2.072593800978793, "grad_norm": 2.0578181743621826, "learning_rate": 4.999800242103696e-05, "loss": 0.2305, "num_input_tokens_seen": 27436624, "step": 12705 }, { "epoch": 2.073409461663948, "grad_norm": 0.3907141387462616, "learning_rate": 4.9997957177810966e-05, "loss": 0.1683, "num_input_tokens_seen": 27448176, "step": 12710 }, { "epoch": 2.0742251223491026, "grad_norm": 0.617568850517273, "learning_rate": 4.999791142796966e-05, "loss": 0.1312, "num_input_tokens_seen": 27459568, "step": 12715 }, { "epoch": 2.075040783034258, "grad_norm": 2.253075122833252, "learning_rate": 4.999786517151397e-05, "loss": 0.2008, "num_input_tokens_seen": 27470800, "step": 12720 }, { "epoch": 2.0758564437194127, "grad_norm": 0.9800208806991577, "learning_rate": 4.999781840844482e-05, "loss": 0.099, "num_input_tokens_seen": 27481168, "step": 12725 }, { "epoch": 2.0766721044045675, "grad_norm": 0.6358720660209656, "learning_rate": 4.999777113876317e-05, "loss": 0.1613, "num_input_tokens_seen": 27492528, "step": 12730 }, { "epoch": 2.0774877650897228, "grad_norm": 0.20857751369476318, "learning_rate": 4.999772336246998e-05, "loss": 0.1536, "num_input_tokens_seen": 27503152, "step": 12735 }, { "epoch": 2.0783034257748776, "grad_norm": 0.806732177734375, "learning_rate": 4.999767507956622e-05, "loss": 0.1569, "num_input_tokens_seen": 27513904, "step": 12740 }, { "epoch": 2.0791190864600324, "grad_norm": 1.2515902519226074, "learning_rate": 4.9997626290052855e-05, "loss": 0.2577, "num_input_tokens_seen": 27525648, "step": 12745 }, { "epoch": 2.0799347471451877, "grad_norm": 1.3259775638580322, "learning_rate": 4.999757699393088e-05, "loss": 0.1563, "num_input_tokens_seen": 27537264, "step": 12750 }, { "epoch": 2.0807504078303425, "grad_norm": 0.6747623085975647, "learning_rate": 4.999752719120131e-05, "loss": 0.0947, "num_input_tokens_seen": 27548144, "step": 12755 }, { "epoch": 2.0815660685154977, "grad_norm": 0.23269104957580566, "learning_rate": 4.999747688186512e-05, "loss": 0.0446, "num_input_tokens_seen": 27559568, "step": 12760 }, { "epoch": 2.0823817292006526, "grad_norm": 1.9610241651535034, "learning_rate": 4.999742606592336e-05, "loss": 0.2267, "num_input_tokens_seen": 27570544, "step": 12765 }, { "epoch": 2.0831973898858074, "grad_norm": 1.193060278892517, "learning_rate": 4.999737474337705e-05, "loss": 0.2018, "num_input_tokens_seen": 27581616, "step": 12770 }, { "epoch": 2.0840130505709626, "grad_norm": 0.08876442164182663, "learning_rate": 4.999732291422723e-05, "loss": 0.1984, "num_input_tokens_seen": 27592112, "step": 12775 }, { "epoch": 2.0848287112561175, "grad_norm": 1.2885771989822388, "learning_rate": 4.999727057847494e-05, "loss": 0.2015, "num_input_tokens_seen": 27603056, "step": 12780 }, { "epoch": 2.0856443719412723, "grad_norm": 0.9373740553855896, "learning_rate": 4.999721773612126e-05, "loss": 0.2382, "num_input_tokens_seen": 27613616, "step": 12785 }, { "epoch": 2.0864600326264275, "grad_norm": 0.7504169344902039, "learning_rate": 4.999716438716725e-05, "loss": 0.1964, "num_input_tokens_seen": 27624784, "step": 12790 }, { "epoch": 2.0872756933115824, "grad_norm": 0.7464221715927124, "learning_rate": 4.999711053161399e-05, "loss": 0.1048, "num_input_tokens_seen": 27636048, "step": 12795 }, { "epoch": 2.088091353996737, "grad_norm": 0.8516896963119507, "learning_rate": 4.999705616946258e-05, "loss": 0.147, "num_input_tokens_seen": 27645328, "step": 12800 }, { "epoch": 2.0889070146818924, "grad_norm": 0.11059200018644333, "learning_rate": 4.999700130071411e-05, "loss": 0.1283, "num_input_tokens_seen": 27655024, "step": 12805 }, { "epoch": 2.0897226753670473, "grad_norm": 2.008056402206421, "learning_rate": 4.99969459253697e-05, "loss": 0.2177, "num_input_tokens_seen": 27666224, "step": 12810 }, { "epoch": 2.090538336052202, "grad_norm": 0.08924542367458344, "learning_rate": 4.9996890043430464e-05, "loss": 0.0371, "num_input_tokens_seen": 27677904, "step": 12815 }, { "epoch": 2.0913539967373573, "grad_norm": 1.186776876449585, "learning_rate": 4.999683365489755e-05, "loss": 0.1619, "num_input_tokens_seen": 27687088, "step": 12820 }, { "epoch": 2.092169657422512, "grad_norm": 1.5689256191253662, "learning_rate": 4.9996776759772084e-05, "loss": 0.1864, "num_input_tokens_seen": 27697008, "step": 12825 }, { "epoch": 2.0929853181076674, "grad_norm": 1.484284520149231, "learning_rate": 4.999671935805523e-05, "loss": 0.0855, "num_input_tokens_seen": 27708208, "step": 12830 }, { "epoch": 2.0938009787928222, "grad_norm": 0.25729823112487793, "learning_rate": 4.9996661449748144e-05, "loss": 0.2907, "num_input_tokens_seen": 27718704, "step": 12835 }, { "epoch": 2.094616639477977, "grad_norm": 0.22082944214344025, "learning_rate": 4.999660303485201e-05, "loss": 0.1547, "num_input_tokens_seen": 27728816, "step": 12840 }, { "epoch": 2.0954323001631323, "grad_norm": 0.5054596066474915, "learning_rate": 4.9996544113367996e-05, "loss": 0.1094, "num_input_tokens_seen": 27738992, "step": 12845 }, { "epoch": 2.096247960848287, "grad_norm": 0.43957796692848206, "learning_rate": 4.999648468529731e-05, "loss": 0.1528, "num_input_tokens_seen": 27750384, "step": 12850 }, { "epoch": 2.097063621533442, "grad_norm": 1.0053726434707642, "learning_rate": 4.9996424750641154e-05, "loss": 0.1361, "num_input_tokens_seen": 27760784, "step": 12855 }, { "epoch": 2.097879282218597, "grad_norm": 0.22081685066223145, "learning_rate": 4.9996364309400735e-05, "loss": 0.0743, "num_input_tokens_seen": 27771952, "step": 12860 }, { "epoch": 2.098694942903752, "grad_norm": 0.39126503467559814, "learning_rate": 4.999630336157729e-05, "loss": 0.1718, "num_input_tokens_seen": 27781744, "step": 12865 }, { "epoch": 2.099510603588907, "grad_norm": 0.13682834804058075, "learning_rate": 4.9996241907172035e-05, "loss": 0.0669, "num_input_tokens_seen": 27792944, "step": 12870 }, { "epoch": 2.100326264274062, "grad_norm": 0.5604085922241211, "learning_rate": 4.999617994618624e-05, "loss": 0.1234, "num_input_tokens_seen": 27804048, "step": 12875 }, { "epoch": 2.101141924959217, "grad_norm": 0.7456160187721252, "learning_rate": 4.999611747862114e-05, "loss": 0.0844, "num_input_tokens_seen": 27814960, "step": 12880 }, { "epoch": 2.1019575856443717, "grad_norm": 0.2236858308315277, "learning_rate": 4.999605450447802e-05, "loss": 0.1058, "num_input_tokens_seen": 27827280, "step": 12885 }, { "epoch": 2.102773246329527, "grad_norm": 0.4646965265274048, "learning_rate": 4.999599102375814e-05, "loss": 0.0848, "num_input_tokens_seen": 27837520, "step": 12890 }, { "epoch": 2.103588907014682, "grad_norm": 0.9826850891113281, "learning_rate": 4.9995927036462784e-05, "loss": 0.0738, "num_input_tokens_seen": 27848912, "step": 12895 }, { "epoch": 2.104404567699837, "grad_norm": 0.30660122632980347, "learning_rate": 4.999586254259327e-05, "loss": 0.1065, "num_input_tokens_seen": 27859504, "step": 12900 }, { "epoch": 2.105220228384992, "grad_norm": 0.5545145869255066, "learning_rate": 4.9995797542150883e-05, "loss": 0.0847, "num_input_tokens_seen": 27870736, "step": 12905 }, { "epoch": 2.1060358890701467, "grad_norm": 1.2093425989151, "learning_rate": 4.999573203513695e-05, "loss": 0.194, "num_input_tokens_seen": 27880720, "step": 12910 }, { "epoch": 2.106851549755302, "grad_norm": 1.4892857074737549, "learning_rate": 4.999566602155281e-05, "loss": 0.2689, "num_input_tokens_seen": 27892048, "step": 12915 }, { "epoch": 2.107667210440457, "grad_norm": 0.4701876938343048, "learning_rate": 4.9995599501399774e-05, "loss": 0.0636, "num_input_tokens_seen": 27903760, "step": 12920 }, { "epoch": 2.1084828711256116, "grad_norm": 0.10089034587144852, "learning_rate": 4.9995532474679215e-05, "loss": 0.1175, "num_input_tokens_seen": 27914192, "step": 12925 }, { "epoch": 2.109298531810767, "grad_norm": 0.2441745102405548, "learning_rate": 4.999546494139248e-05, "loss": 0.0235, "num_input_tokens_seen": 27925936, "step": 12930 }, { "epoch": 2.1101141924959217, "grad_norm": 0.4986913800239563, "learning_rate": 4.999539690154093e-05, "loss": 0.3406, "num_input_tokens_seen": 27937392, "step": 12935 }, { "epoch": 2.1109298531810765, "grad_norm": 0.41561412811279297, "learning_rate": 4.999532835512596e-05, "loss": 0.1054, "num_input_tokens_seen": 27948304, "step": 12940 }, { "epoch": 2.1117455138662318, "grad_norm": 1.3019353151321411, "learning_rate": 4.999525930214896e-05, "loss": 0.0891, "num_input_tokens_seen": 27959344, "step": 12945 }, { "epoch": 2.1125611745513866, "grad_norm": 1.5888316631317139, "learning_rate": 4.9995189742611303e-05, "loss": 0.3695, "num_input_tokens_seen": 27970032, "step": 12950 }, { "epoch": 2.1133768352365414, "grad_norm": 0.39302584528923035, "learning_rate": 4.999511967651443e-05, "loss": 0.0855, "num_input_tokens_seen": 27979984, "step": 12955 }, { "epoch": 2.1141924959216967, "grad_norm": 0.08938200771808624, "learning_rate": 4.999504910385974e-05, "loss": 0.1398, "num_input_tokens_seen": 27989200, "step": 12960 }, { "epoch": 2.1150081566068515, "grad_norm": 0.3351214528083801, "learning_rate": 4.9994978024648684e-05, "loss": 0.1377, "num_input_tokens_seen": 28000432, "step": 12965 }, { "epoch": 2.1158238172920063, "grad_norm": 0.3868334889411926, "learning_rate": 4.9994906438882676e-05, "loss": 0.0776, "num_input_tokens_seen": 28010064, "step": 12970 }, { "epoch": 2.1166394779771616, "grad_norm": 1.54409921169281, "learning_rate": 4.999483434656319e-05, "loss": 0.2029, "num_input_tokens_seen": 28021776, "step": 12975 }, { "epoch": 2.1174551386623164, "grad_norm": 0.37215688824653625, "learning_rate": 4.9994761747691674e-05, "loss": 0.2192, "num_input_tokens_seen": 28032656, "step": 12980 }, { "epoch": 2.1182707993474716, "grad_norm": 0.0481642447412014, "learning_rate": 4.99946886422696e-05, "loss": 0.0448, "num_input_tokens_seen": 28043728, "step": 12985 }, { "epoch": 2.1190864600326265, "grad_norm": 0.2615896463394165, "learning_rate": 4.999461503029846e-05, "loss": 0.2111, "num_input_tokens_seen": 28055120, "step": 12990 }, { "epoch": 2.1199021207177813, "grad_norm": 1.889480710029602, "learning_rate": 4.999454091177974e-05, "loss": 0.1955, "num_input_tokens_seen": 28065936, "step": 12995 }, { "epoch": 2.1207177814029365, "grad_norm": 2.109635353088379, "learning_rate": 4.999446628671493e-05, "loss": 0.1537, "num_input_tokens_seen": 28077040, "step": 13000 }, { "epoch": 2.1215334420880914, "grad_norm": 1.026237964630127, "learning_rate": 4.9994391155105555e-05, "loss": 0.1168, "num_input_tokens_seen": 28088624, "step": 13005 }, { "epoch": 2.122349102773246, "grad_norm": 1.3851678371429443, "learning_rate": 4.999431551695314e-05, "loss": 0.1967, "num_input_tokens_seen": 28097520, "step": 13010 }, { "epoch": 2.1231647634584014, "grad_norm": 0.4387195408344269, "learning_rate": 4.999423937225921e-05, "loss": 0.3129, "num_input_tokens_seen": 28107504, "step": 13015 }, { "epoch": 2.1239804241435563, "grad_norm": 0.6851591467857361, "learning_rate": 4.999416272102532e-05, "loss": 0.1781, "num_input_tokens_seen": 28118352, "step": 13020 }, { "epoch": 2.124796084828711, "grad_norm": 1.3601551055908203, "learning_rate": 4.999408556325301e-05, "loss": 0.1486, "num_input_tokens_seen": 28129936, "step": 13025 }, { "epoch": 2.1256117455138663, "grad_norm": 1.1937931776046753, "learning_rate": 4.999400789894385e-05, "loss": 0.1479, "num_input_tokens_seen": 28140496, "step": 13030 }, { "epoch": 2.126427406199021, "grad_norm": 0.689397931098938, "learning_rate": 4.9993929728099406e-05, "loss": 0.0872, "num_input_tokens_seen": 28151440, "step": 13035 }, { "epoch": 2.1272430668841764, "grad_norm": 0.6503987312316895, "learning_rate": 4.999385105072128e-05, "loss": 0.1418, "num_input_tokens_seen": 28161200, "step": 13040 }, { "epoch": 2.1280587275693312, "grad_norm": 1.0884451866149902, "learning_rate": 4.999377186681105e-05, "loss": 0.0767, "num_input_tokens_seen": 28171568, "step": 13045 }, { "epoch": 2.128874388254486, "grad_norm": 0.5314123034477234, "learning_rate": 4.999369217637032e-05, "loss": 0.1382, "num_input_tokens_seen": 28183344, "step": 13050 }, { "epoch": 2.1296900489396413, "grad_norm": 0.24291189014911652, "learning_rate": 4.999361197940072e-05, "loss": 0.0584, "num_input_tokens_seen": 28193552, "step": 13055 }, { "epoch": 2.130505709624796, "grad_norm": 0.2638334631919861, "learning_rate": 4.9993531275903866e-05, "loss": 0.1868, "num_input_tokens_seen": 28203600, "step": 13060 }, { "epoch": 2.131321370309951, "grad_norm": 0.49718406796455383, "learning_rate": 4.9993450065881394e-05, "loss": 0.355, "num_input_tokens_seen": 28213584, "step": 13065 }, { "epoch": 2.132137030995106, "grad_norm": 0.18802642822265625, "learning_rate": 4.999336834933495e-05, "loss": 0.0906, "num_input_tokens_seen": 28224016, "step": 13070 }, { "epoch": 2.132952691680261, "grad_norm": 0.37822243571281433, "learning_rate": 4.999328612626618e-05, "loss": 0.208, "num_input_tokens_seen": 28235760, "step": 13075 }, { "epoch": 2.133768352365416, "grad_norm": 1.232106328010559, "learning_rate": 4.999320339667677e-05, "loss": 0.1664, "num_input_tokens_seen": 28244656, "step": 13080 }, { "epoch": 2.134584013050571, "grad_norm": 0.33600324392318726, "learning_rate": 4.9993120160568386e-05, "loss": 0.0544, "num_input_tokens_seen": 28254096, "step": 13085 }, { "epoch": 2.135399673735726, "grad_norm": 0.710578978061676, "learning_rate": 4.999303641794272e-05, "loss": 0.1123, "num_input_tokens_seen": 28264880, "step": 13090 }, { "epoch": 2.1362153344208807, "grad_norm": 0.8375298380851746, "learning_rate": 4.999295216880145e-05, "loss": 0.1728, "num_input_tokens_seen": 28274448, "step": 13095 }, { "epoch": 2.137030995106036, "grad_norm": 0.5768053531646729, "learning_rate": 4.9992867413146314e-05, "loss": 0.0732, "num_input_tokens_seen": 28285776, "step": 13100 }, { "epoch": 2.137846655791191, "grad_norm": 1.3248705863952637, "learning_rate": 4.999278215097901e-05, "loss": 0.0858, "num_input_tokens_seen": 28296016, "step": 13105 }, { "epoch": 2.1386623164763456, "grad_norm": 1.6079235076904297, "learning_rate": 4.9992696382301276e-05, "loss": 0.2399, "num_input_tokens_seen": 28307952, "step": 13110 }, { "epoch": 2.139477977161501, "grad_norm": 1.5810678005218506, "learning_rate": 4.999261010711483e-05, "loss": 0.194, "num_input_tokens_seen": 28318960, "step": 13115 }, { "epoch": 2.1402936378466557, "grad_norm": 0.24830114841461182, "learning_rate": 4.999252332542145e-05, "loss": 0.0339, "num_input_tokens_seen": 28329136, "step": 13120 }, { "epoch": 2.141109298531811, "grad_norm": 0.13644461333751678, "learning_rate": 4.999243603722287e-05, "loss": 0.0529, "num_input_tokens_seen": 28339696, "step": 13125 }, { "epoch": 2.141924959216966, "grad_norm": 0.636090099811554, "learning_rate": 4.999234824252087e-05, "loss": 0.1787, "num_input_tokens_seen": 28351440, "step": 13130 }, { "epoch": 2.1427406199021206, "grad_norm": 0.32650619745254517, "learning_rate": 4.999225994131722e-05, "loss": 0.1913, "num_input_tokens_seen": 28361232, "step": 13135 }, { "epoch": 2.143556280587276, "grad_norm": 0.1870148926973343, "learning_rate": 4.999217113361373e-05, "loss": 0.3123, "num_input_tokens_seen": 28370864, "step": 13140 }, { "epoch": 2.1443719412724307, "grad_norm": 0.7770249843597412, "learning_rate": 4.9992081819412185e-05, "loss": 0.0671, "num_input_tokens_seen": 28381456, "step": 13145 }, { "epoch": 2.1451876019575855, "grad_norm": 0.1981910914182663, "learning_rate": 4.99919919987144e-05, "loss": 0.0739, "num_input_tokens_seen": 28391600, "step": 13150 }, { "epoch": 2.1460032626427408, "grad_norm": 1.0516338348388672, "learning_rate": 4.999190167152218e-05, "loss": 0.2017, "num_input_tokens_seen": 28402640, "step": 13155 }, { "epoch": 2.1468189233278956, "grad_norm": 0.4098997712135315, "learning_rate": 4.999181083783738e-05, "loss": 0.2621, "num_input_tokens_seen": 28412272, "step": 13160 }, { "epoch": 2.1476345840130504, "grad_norm": 0.6463992595672607, "learning_rate": 4.999171949766182e-05, "loss": 0.1865, "num_input_tokens_seen": 28423312, "step": 13165 }, { "epoch": 2.1484502446982057, "grad_norm": 0.9996065497398376, "learning_rate": 4.999162765099736e-05, "loss": 0.2577, "num_input_tokens_seen": 28433424, "step": 13170 }, { "epoch": 2.1492659053833605, "grad_norm": 1.870043396949768, "learning_rate": 4.999153529784587e-05, "loss": 0.2286, "num_input_tokens_seen": 28443984, "step": 13175 }, { "epoch": 2.1500815660685153, "grad_norm": 0.6591420769691467, "learning_rate": 4.9991442438209214e-05, "loss": 0.1663, "num_input_tokens_seen": 28454832, "step": 13180 }, { "epoch": 2.1508972267536706, "grad_norm": 0.11164797842502594, "learning_rate": 4.9991349072089264e-05, "loss": 0.1601, "num_input_tokens_seen": 28467280, "step": 13185 }, { "epoch": 2.1517128874388254, "grad_norm": 0.6894841194152832, "learning_rate": 4.999125519948793e-05, "loss": 0.2349, "num_input_tokens_seen": 28478128, "step": 13190 }, { "epoch": 2.15252854812398, "grad_norm": 1.0132077932357788, "learning_rate": 4.9991160820407104e-05, "loss": 0.1042, "num_input_tokens_seen": 28489200, "step": 13195 }, { "epoch": 2.1533442088091355, "grad_norm": 0.2590457797050476, "learning_rate": 4.999106593484869e-05, "loss": 0.042, "num_input_tokens_seen": 28500240, "step": 13200 }, { "epoch": 2.1541598694942903, "grad_norm": 1.2111996412277222, "learning_rate": 4.999097054281463e-05, "loss": 0.1387, "num_input_tokens_seen": 28511824, "step": 13205 }, { "epoch": 2.1549755301794455, "grad_norm": 0.14306527376174927, "learning_rate": 4.999087464430685e-05, "loss": 0.1483, "num_input_tokens_seen": 28521456, "step": 13210 }, { "epoch": 2.1557911908646004, "grad_norm": 0.6315906047821045, "learning_rate": 4.99907782393273e-05, "loss": 0.125, "num_input_tokens_seen": 28531888, "step": 13215 }, { "epoch": 2.156606851549755, "grad_norm": 0.09550295770168304, "learning_rate": 4.9990681327877906e-05, "loss": 0.0829, "num_input_tokens_seen": 28541616, "step": 13220 }, { "epoch": 2.1574225122349104, "grad_norm": 0.9331651329994202, "learning_rate": 4.999058390996067e-05, "loss": 0.1755, "num_input_tokens_seen": 28553776, "step": 13225 }, { "epoch": 2.1582381729200653, "grad_norm": 1.7436535358428955, "learning_rate": 4.999048598557753e-05, "loss": 0.1357, "num_input_tokens_seen": 28565680, "step": 13230 }, { "epoch": 2.15905383360522, "grad_norm": 1.4758650064468384, "learning_rate": 4.99903875547305e-05, "loss": 0.2233, "num_input_tokens_seen": 28576560, "step": 13235 }, { "epoch": 2.1598694942903753, "grad_norm": 0.5474169850349426, "learning_rate": 4.999028861742157e-05, "loss": 0.3369, "num_input_tokens_seen": 28586352, "step": 13240 }, { "epoch": 2.16068515497553, "grad_norm": 0.819640576839447, "learning_rate": 4.999018917365273e-05, "loss": 0.2451, "num_input_tokens_seen": 28598416, "step": 13245 }, { "epoch": 2.161500815660685, "grad_norm": 0.15401630103588104, "learning_rate": 4.9990089223426005e-05, "loss": 0.1329, "num_input_tokens_seen": 28609424, "step": 13250 }, { "epoch": 2.1623164763458402, "grad_norm": 0.7526512742042542, "learning_rate": 4.998998876674342e-05, "loss": 0.0879, "num_input_tokens_seen": 28619184, "step": 13255 }, { "epoch": 2.163132137030995, "grad_norm": 0.5536556243896484, "learning_rate": 4.998988780360701e-05, "loss": 0.1824, "num_input_tokens_seen": 28629232, "step": 13260 }, { "epoch": 2.1639477977161503, "grad_norm": 0.6287527084350586, "learning_rate": 4.998978633401883e-05, "loss": 0.0679, "num_input_tokens_seen": 28639408, "step": 13265 }, { "epoch": 2.164763458401305, "grad_norm": 0.2814422845840454, "learning_rate": 4.9989684357980914e-05, "loss": 0.1406, "num_input_tokens_seen": 28650736, "step": 13270 }, { "epoch": 2.16557911908646, "grad_norm": 1.7226645946502686, "learning_rate": 4.998958187549535e-05, "loss": 0.1487, "num_input_tokens_seen": 28660784, "step": 13275 }, { "epoch": 2.166394779771615, "grad_norm": 1.1694226264953613, "learning_rate": 4.998947888656421e-05, "loss": 0.1104, "num_input_tokens_seen": 28671248, "step": 13280 }, { "epoch": 2.16721044045677, "grad_norm": 0.6701900959014893, "learning_rate": 4.9989375391189574e-05, "loss": 0.1393, "num_input_tokens_seen": 28682000, "step": 13285 }, { "epoch": 2.168026101141925, "grad_norm": 0.45711761713027954, "learning_rate": 4.998927138937355e-05, "loss": 0.2322, "num_input_tokens_seen": 28692912, "step": 13290 }, { "epoch": 2.16884176182708, "grad_norm": 2.4102394580841064, "learning_rate": 4.998916688111823e-05, "loss": 0.2388, "num_input_tokens_seen": 28703504, "step": 13295 }, { "epoch": 2.169657422512235, "grad_norm": 0.18731364607810974, "learning_rate": 4.998906186642576e-05, "loss": 0.0607, "num_input_tokens_seen": 28714416, "step": 13300 }, { "epoch": 2.1704730831973897, "grad_norm": 1.7527638673782349, "learning_rate": 4.998895634529823e-05, "loss": 0.0966, "num_input_tokens_seen": 28725200, "step": 13305 }, { "epoch": 2.171288743882545, "grad_norm": 0.6683935523033142, "learning_rate": 4.9988850317737815e-05, "loss": 0.1261, "num_input_tokens_seen": 28736176, "step": 13310 }, { "epoch": 2.1721044045677, "grad_norm": 0.2626304626464844, "learning_rate": 4.998874378374664e-05, "loss": 0.0998, "num_input_tokens_seen": 28746928, "step": 13315 }, { "epoch": 2.1729200652528546, "grad_norm": 1.1662184000015259, "learning_rate": 4.998863674332687e-05, "loss": 0.1985, "num_input_tokens_seen": 28757584, "step": 13320 }, { "epoch": 2.17373572593801, "grad_norm": 0.5399181842803955, "learning_rate": 4.998852919648068e-05, "loss": 0.1021, "num_input_tokens_seen": 28766224, "step": 13325 }, { "epoch": 2.1745513866231647, "grad_norm": 0.2078550010919571, "learning_rate": 4.998842114321025e-05, "loss": 0.0868, "num_input_tokens_seen": 28775984, "step": 13330 }, { "epoch": 2.1753670473083195, "grad_norm": 0.0515054352581501, "learning_rate": 4.998831258351776e-05, "loss": 0.0796, "num_input_tokens_seen": 28787056, "step": 13335 }, { "epoch": 2.176182707993475, "grad_norm": 0.3997419476509094, "learning_rate": 4.998820351740541e-05, "loss": 0.1024, "num_input_tokens_seen": 28797648, "step": 13340 }, { "epoch": 2.1769983686786296, "grad_norm": 0.14759215712547302, "learning_rate": 4.998809394487543e-05, "loss": 0.1105, "num_input_tokens_seen": 28808656, "step": 13345 }, { "epoch": 2.177814029363785, "grad_norm": 1.5433107614517212, "learning_rate": 4.998798386593001e-05, "loss": 0.0769, "num_input_tokens_seen": 28818448, "step": 13350 }, { "epoch": 2.1786296900489397, "grad_norm": 1.8244967460632324, "learning_rate": 4.9987873280571406e-05, "loss": 0.1584, "num_input_tokens_seen": 28828752, "step": 13355 }, { "epoch": 2.1794453507340945, "grad_norm": 2.0944433212280273, "learning_rate": 4.9987762188801854e-05, "loss": 0.1955, "num_input_tokens_seen": 28838896, "step": 13360 }, { "epoch": 2.1802610114192498, "grad_norm": 0.4561474323272705, "learning_rate": 4.99876505906236e-05, "loss": 0.1878, "num_input_tokens_seen": 28848880, "step": 13365 }, { "epoch": 2.1810766721044046, "grad_norm": 0.5387064814567566, "learning_rate": 4.9987538486038895e-05, "loss": 0.3516, "num_input_tokens_seen": 28860592, "step": 13370 }, { "epoch": 2.1818923327895594, "grad_norm": 0.2754017412662506, "learning_rate": 4.998742587505004e-05, "loss": 0.1709, "num_input_tokens_seen": 28872432, "step": 13375 }, { "epoch": 2.1827079934747147, "grad_norm": 0.5929624438285828, "learning_rate": 4.99873127576593e-05, "loss": 0.0509, "num_input_tokens_seen": 28883856, "step": 13380 }, { "epoch": 2.1835236541598695, "grad_norm": 0.9277820587158203, "learning_rate": 4.998719913386896e-05, "loss": 0.2214, "num_input_tokens_seen": 28894384, "step": 13385 }, { "epoch": 2.1843393148450243, "grad_norm": 1.8495320081710815, "learning_rate": 4.9987085003681334e-05, "loss": 0.1667, "num_input_tokens_seen": 28902288, "step": 13390 }, { "epoch": 2.1851549755301796, "grad_norm": 1.7208924293518066, "learning_rate": 4.998697036709873e-05, "loss": 0.1784, "num_input_tokens_seen": 28913296, "step": 13395 }, { "epoch": 2.1859706362153344, "grad_norm": 0.07445711642503738, "learning_rate": 4.998685522412348e-05, "loss": 0.1361, "num_input_tokens_seen": 28923888, "step": 13400 }, { "epoch": 2.186786296900489, "grad_norm": 0.3873595595359802, "learning_rate": 4.9986739574757907e-05, "loss": 0.2323, "num_input_tokens_seen": 28934640, "step": 13405 }, { "epoch": 2.1876019575856445, "grad_norm": 0.14859026670455933, "learning_rate": 4.998662341900436e-05, "loss": 0.1578, "num_input_tokens_seen": 28945648, "step": 13410 }, { "epoch": 2.1884176182707993, "grad_norm": 0.7687433362007141, "learning_rate": 4.9986506756865195e-05, "loss": 0.0633, "num_input_tokens_seen": 28957392, "step": 13415 }, { "epoch": 2.189233278955954, "grad_norm": 0.5621496438980103, "learning_rate": 4.9986389588342766e-05, "loss": 0.0723, "num_input_tokens_seen": 28967056, "step": 13420 }, { "epoch": 2.1900489396411094, "grad_norm": 0.30135637521743774, "learning_rate": 4.9986271913439456e-05, "loss": 0.2503, "num_input_tokens_seen": 28977808, "step": 13425 }, { "epoch": 2.190864600326264, "grad_norm": 0.36119723320007324, "learning_rate": 4.9986153732157645e-05, "loss": 0.0841, "num_input_tokens_seen": 28987440, "step": 13430 }, { "epoch": 2.1916802610114194, "grad_norm": 0.9107398986816406, "learning_rate": 4.998603504449974e-05, "loss": 0.1215, "num_input_tokens_seen": 28997392, "step": 13435 }, { "epoch": 2.1924959216965743, "grad_norm": 0.6235585808753967, "learning_rate": 4.9985915850468126e-05, "loss": 0.2231, "num_input_tokens_seen": 29007824, "step": 13440 }, { "epoch": 2.193311582381729, "grad_norm": 0.2665006220340729, "learning_rate": 4.9985796150065243e-05, "loss": 0.1273, "num_input_tokens_seen": 29019024, "step": 13445 }, { "epoch": 2.1941272430668843, "grad_norm": 1.3035330772399902, "learning_rate": 4.9985675943293495e-05, "loss": 0.1747, "num_input_tokens_seen": 29031600, "step": 13450 }, { "epoch": 2.194942903752039, "grad_norm": 1.1215633153915405, "learning_rate": 4.998555523015533e-05, "loss": 0.1668, "num_input_tokens_seen": 29043056, "step": 13455 }, { "epoch": 2.195758564437194, "grad_norm": 0.5518949031829834, "learning_rate": 4.99854340106532e-05, "loss": 0.1161, "num_input_tokens_seen": 29053648, "step": 13460 }, { "epoch": 2.1965742251223492, "grad_norm": 0.3728121519088745, "learning_rate": 4.998531228478954e-05, "loss": 0.0738, "num_input_tokens_seen": 29064112, "step": 13465 }, { "epoch": 2.197389885807504, "grad_norm": 0.21311938762664795, "learning_rate": 4.998519005256684e-05, "loss": 0.0507, "num_input_tokens_seen": 29075216, "step": 13470 }, { "epoch": 2.198205546492659, "grad_norm": 1.6535823345184326, "learning_rate": 4.9985067313987565e-05, "loss": 0.1353, "num_input_tokens_seen": 29084368, "step": 13475 }, { "epoch": 2.199021207177814, "grad_norm": 0.13241784274578094, "learning_rate": 4.998494406905421e-05, "loss": 0.0577, "num_input_tokens_seen": 29096016, "step": 13480 }, { "epoch": 2.199836867862969, "grad_norm": 0.5509206652641296, "learning_rate": 4.998482031776925e-05, "loss": 0.0931, "num_input_tokens_seen": 29106768, "step": 13485 }, { "epoch": 2.200652528548124, "grad_norm": 2.182535171508789, "learning_rate": 4.9984696060135224e-05, "loss": 0.2036, "num_input_tokens_seen": 29116560, "step": 13490 }, { "epoch": 2.201468189233279, "grad_norm": 0.6126439571380615, "learning_rate": 4.9984571296154645e-05, "loss": 0.1933, "num_input_tokens_seen": 29127376, "step": 13495 }, { "epoch": 2.202283849918434, "grad_norm": 0.5907752513885498, "learning_rate": 4.998444602583002e-05, "loss": 0.1695, "num_input_tokens_seen": 29139088, "step": 13500 }, { "epoch": 2.203099510603589, "grad_norm": 0.3200514018535614, "learning_rate": 4.9984320249163906e-05, "loss": 0.153, "num_input_tokens_seen": 29149488, "step": 13505 }, { "epoch": 2.203915171288744, "grad_norm": 0.19444213807582855, "learning_rate": 4.998419396615886e-05, "loss": 0.0766, "num_input_tokens_seen": 29159728, "step": 13510 }, { "epoch": 2.2047308319738987, "grad_norm": 0.6033416986465454, "learning_rate": 4.998406717681741e-05, "loss": 0.1028, "num_input_tokens_seen": 29170928, "step": 13515 }, { "epoch": 2.205546492659054, "grad_norm": 1.996845006942749, "learning_rate": 4.998393988114215e-05, "loss": 0.2756, "num_input_tokens_seen": 29180912, "step": 13520 }, { "epoch": 2.206362153344209, "grad_norm": 0.19314655661582947, "learning_rate": 4.998381207913565e-05, "loss": 0.152, "num_input_tokens_seen": 29191216, "step": 13525 }, { "epoch": 2.2071778140293636, "grad_norm": 0.17089173197746277, "learning_rate": 4.998368377080051e-05, "loss": 0.0521, "num_input_tokens_seen": 29201424, "step": 13530 }, { "epoch": 2.207993474714519, "grad_norm": 0.15853461623191833, "learning_rate": 4.998355495613932e-05, "loss": 0.2141, "num_input_tokens_seen": 29211152, "step": 13535 }, { "epoch": 2.2088091353996737, "grad_norm": 2.0609076023101807, "learning_rate": 4.9983425635154704e-05, "loss": 0.1844, "num_input_tokens_seen": 29221744, "step": 13540 }, { "epoch": 2.2096247960848285, "grad_norm": 1.0055819749832153, "learning_rate": 4.9983295807849263e-05, "loss": 0.2307, "num_input_tokens_seen": 29231568, "step": 13545 }, { "epoch": 2.210440456769984, "grad_norm": 2.5056300163269043, "learning_rate": 4.998316547422564e-05, "loss": 0.3924, "num_input_tokens_seen": 29242128, "step": 13550 }, { "epoch": 2.2112561174551386, "grad_norm": 0.5422676205635071, "learning_rate": 4.998303463428648e-05, "loss": 0.1523, "num_input_tokens_seen": 29250992, "step": 13555 }, { "epoch": 2.2120717781402934, "grad_norm": 0.513318657875061, "learning_rate": 4.998290328803443e-05, "loss": 0.0738, "num_input_tokens_seen": 29261072, "step": 13560 }, { "epoch": 2.2128874388254487, "grad_norm": 0.4772043228149414, "learning_rate": 4.9982771435472146e-05, "loss": 0.1048, "num_input_tokens_seen": 29271760, "step": 13565 }, { "epoch": 2.2137030995106035, "grad_norm": 0.24260753393173218, "learning_rate": 4.998263907660231e-05, "loss": 0.2312, "num_input_tokens_seen": 29282480, "step": 13570 }, { "epoch": 2.2145187601957588, "grad_norm": 1.0674314498901367, "learning_rate": 4.9982506211427604e-05, "loss": 0.1454, "num_input_tokens_seen": 29292912, "step": 13575 }, { "epoch": 2.2153344208809136, "grad_norm": 0.255156546831131, "learning_rate": 4.998237283995071e-05, "loss": 0.1449, "num_input_tokens_seen": 29302800, "step": 13580 }, { "epoch": 2.2161500815660684, "grad_norm": 0.42985936999320984, "learning_rate": 4.9982238962174345e-05, "loss": 0.1051, "num_input_tokens_seen": 29313424, "step": 13585 }, { "epoch": 2.2169657422512237, "grad_norm": 0.7667890787124634, "learning_rate": 4.998210457810121e-05, "loss": 0.1223, "num_input_tokens_seen": 29323856, "step": 13590 }, { "epoch": 2.2177814029363785, "grad_norm": 0.10374458134174347, "learning_rate": 4.9981969687734035e-05, "loss": 0.0717, "num_input_tokens_seen": 29334832, "step": 13595 }, { "epoch": 2.2185970636215333, "grad_norm": 0.08557843416929245, "learning_rate": 4.998183429107555e-05, "loss": 0.0992, "num_input_tokens_seen": 29345936, "step": 13600 }, { "epoch": 2.2194127243066886, "grad_norm": 3.240870475769043, "learning_rate": 4.9981698388128505e-05, "loss": 0.201, "num_input_tokens_seen": 29356560, "step": 13605 }, { "epoch": 2.2202283849918434, "grad_norm": 0.22043253481388092, "learning_rate": 4.998156197889565e-05, "loss": 0.0711, "num_input_tokens_seen": 29367024, "step": 13610 }, { "epoch": 2.221044045676998, "grad_norm": 0.5654991269111633, "learning_rate": 4.998142506337975e-05, "loss": 0.0707, "num_input_tokens_seen": 29376944, "step": 13615 }, { "epoch": 2.2218597063621535, "grad_norm": 2.1481778621673584, "learning_rate": 4.998128764158358e-05, "loss": 0.1794, "num_input_tokens_seen": 29389552, "step": 13620 }, { "epoch": 2.2226753670473083, "grad_norm": 0.5916892886161804, "learning_rate": 4.998114971350992e-05, "loss": 0.0552, "num_input_tokens_seen": 29399600, "step": 13625 }, { "epoch": 2.223491027732463, "grad_norm": 1.196581244468689, "learning_rate": 4.998101127916158e-05, "loss": 0.0829, "num_input_tokens_seen": 29410896, "step": 13630 }, { "epoch": 2.2243066884176184, "grad_norm": 1.3406492471694946, "learning_rate": 4.9980872338541354e-05, "loss": 0.1583, "num_input_tokens_seen": 29421136, "step": 13635 }, { "epoch": 2.225122349102773, "grad_norm": 1.3405640125274658, "learning_rate": 4.998073289165205e-05, "loss": 0.121, "num_input_tokens_seen": 29431536, "step": 13640 }, { "epoch": 2.225938009787928, "grad_norm": 0.10917265713214874, "learning_rate": 4.998059293849651e-05, "loss": 0.1394, "num_input_tokens_seen": 29442416, "step": 13645 }, { "epoch": 2.2267536704730833, "grad_norm": 0.8347209692001343, "learning_rate": 4.998045247907757e-05, "loss": 0.0887, "num_input_tokens_seen": 29453232, "step": 13650 }, { "epoch": 2.227569331158238, "grad_norm": 0.6217708587646484, "learning_rate": 4.998031151339806e-05, "loss": 0.1004, "num_input_tokens_seen": 29464976, "step": 13655 }, { "epoch": 2.2283849918433933, "grad_norm": 1.4285110235214233, "learning_rate": 4.998017004146085e-05, "loss": 0.1957, "num_input_tokens_seen": 29475536, "step": 13660 }, { "epoch": 2.229200652528548, "grad_norm": 0.11192301660776138, "learning_rate": 4.998002806326881e-05, "loss": 0.03, "num_input_tokens_seen": 29486256, "step": 13665 }, { "epoch": 2.230016313213703, "grad_norm": 1.2904534339904785, "learning_rate": 4.99798855788248e-05, "loss": 0.2448, "num_input_tokens_seen": 29498512, "step": 13670 }, { "epoch": 2.2308319738988582, "grad_norm": 0.9199331998825073, "learning_rate": 4.997974258813174e-05, "loss": 0.1771, "num_input_tokens_seen": 29509360, "step": 13675 }, { "epoch": 2.231647634584013, "grad_norm": 0.9117249846458435, "learning_rate": 4.9979599091192484e-05, "loss": 0.0397, "num_input_tokens_seen": 29520400, "step": 13680 }, { "epoch": 2.232463295269168, "grad_norm": 0.6374601125717163, "learning_rate": 4.997945508800998e-05, "loss": 0.2015, "num_input_tokens_seen": 29532016, "step": 13685 }, { "epoch": 2.233278955954323, "grad_norm": 0.31951209902763367, "learning_rate": 4.9979310578587124e-05, "loss": 0.0875, "num_input_tokens_seen": 29543728, "step": 13690 }, { "epoch": 2.234094616639478, "grad_norm": 0.9902100563049316, "learning_rate": 4.997916556292686e-05, "loss": 0.5606, "num_input_tokens_seen": 29554352, "step": 13695 }, { "epoch": 2.2349102773246328, "grad_norm": 0.8067732453346252, "learning_rate": 4.99790200410321e-05, "loss": 0.1342, "num_input_tokens_seen": 29565168, "step": 13700 }, { "epoch": 2.235725938009788, "grad_norm": 1.150559902191162, "learning_rate": 4.997887401290582e-05, "loss": 0.2798, "num_input_tokens_seen": 29575792, "step": 13705 }, { "epoch": 2.236541598694943, "grad_norm": 0.2764955461025238, "learning_rate": 4.9978727478550966e-05, "loss": 0.1641, "num_input_tokens_seen": 29586384, "step": 13710 }, { "epoch": 2.237357259380098, "grad_norm": 0.09672193229198456, "learning_rate": 4.997858043797052e-05, "loss": 0.0696, "num_input_tokens_seen": 29597232, "step": 13715 }, { "epoch": 2.238172920065253, "grad_norm": 2.2297275066375732, "learning_rate": 4.9978432891167446e-05, "loss": 0.1558, "num_input_tokens_seen": 29606896, "step": 13720 }, { "epoch": 2.2389885807504077, "grad_norm": 0.0833422988653183, "learning_rate": 4.997828483814475e-05, "loss": 0.0697, "num_input_tokens_seen": 29617648, "step": 13725 }, { "epoch": 2.239804241435563, "grad_norm": 1.2998358011245728, "learning_rate": 4.997813627890542e-05, "loss": 0.2722, "num_input_tokens_seen": 29627664, "step": 13730 }, { "epoch": 2.240619902120718, "grad_norm": 0.24643617868423462, "learning_rate": 4.997798721345247e-05, "loss": 0.0633, "num_input_tokens_seen": 29639088, "step": 13735 }, { "epoch": 2.2414355628058726, "grad_norm": 0.9176464676856995, "learning_rate": 4.9977837641788925e-05, "loss": 0.1015, "num_input_tokens_seen": 29649648, "step": 13740 }, { "epoch": 2.242251223491028, "grad_norm": 0.44931694865226746, "learning_rate": 4.997768756391781e-05, "loss": 0.1899, "num_input_tokens_seen": 29661360, "step": 13745 }, { "epoch": 2.2430668841761827, "grad_norm": 0.5603500008583069, "learning_rate": 4.9977536979842176e-05, "loss": 0.1136, "num_input_tokens_seen": 29672336, "step": 13750 }, { "epoch": 2.2438825448613375, "grad_norm": 2.7764949798583984, "learning_rate": 4.9977385889565066e-05, "loss": 0.3504, "num_input_tokens_seen": 29683472, "step": 13755 }, { "epoch": 2.244698205546493, "grad_norm": 0.41940930485725403, "learning_rate": 4.9977234293089534e-05, "loss": 0.0247, "num_input_tokens_seen": 29695472, "step": 13760 }, { "epoch": 2.2455138662316476, "grad_norm": 0.8970036506652832, "learning_rate": 4.9977082190418675e-05, "loss": 0.2089, "num_input_tokens_seen": 29705520, "step": 13765 }, { "epoch": 2.2463295269168024, "grad_norm": 0.5370916128158569, "learning_rate": 4.997692958155557e-05, "loss": 0.0663, "num_input_tokens_seen": 29715824, "step": 13770 }, { "epoch": 2.2471451876019577, "grad_norm": 0.5011693239212036, "learning_rate": 4.997677646650328e-05, "loss": 0.0495, "num_input_tokens_seen": 29726864, "step": 13775 }, { "epoch": 2.2479608482871125, "grad_norm": 0.17822083830833435, "learning_rate": 4.9976622845264944e-05, "loss": 0.0366, "num_input_tokens_seen": 29738128, "step": 13780 }, { "epoch": 2.2487765089722673, "grad_norm": 0.20828501880168915, "learning_rate": 4.997646871784365e-05, "loss": 0.2213, "num_input_tokens_seen": 29747152, "step": 13785 }, { "epoch": 2.2495921696574226, "grad_norm": 1.5994833707809448, "learning_rate": 4.9976314084242546e-05, "loss": 0.1962, "num_input_tokens_seen": 29756912, "step": 13790 }, { "epoch": 2.2504078303425774, "grad_norm": 1.568878412246704, "learning_rate": 4.997615894446474e-05, "loss": 0.201, "num_input_tokens_seen": 29766768, "step": 13795 }, { "epoch": 2.2512234910277327, "grad_norm": 0.9979236125946045, "learning_rate": 4.9976003298513396e-05, "loss": 0.1314, "num_input_tokens_seen": 29778032, "step": 13800 }, { "epoch": 2.2520391517128875, "grad_norm": 1.1000559329986572, "learning_rate": 4.997584714639165e-05, "loss": 0.2107, "num_input_tokens_seen": 29787408, "step": 13805 }, { "epoch": 2.2528548123980423, "grad_norm": 0.317669153213501, "learning_rate": 4.997569048810269e-05, "loss": 0.0749, "num_input_tokens_seen": 29797232, "step": 13810 }, { "epoch": 2.2536704730831976, "grad_norm": 0.9073019623756409, "learning_rate": 4.9975533323649676e-05, "loss": 0.2377, "num_input_tokens_seen": 29808016, "step": 13815 }, { "epoch": 2.2544861337683524, "grad_norm": 0.38994306325912476, "learning_rate": 4.997537565303579e-05, "loss": 0.0651, "num_input_tokens_seen": 29818640, "step": 13820 }, { "epoch": 2.255301794453507, "grad_norm": 0.16042886674404144, "learning_rate": 4.997521747626424e-05, "loss": 0.1594, "num_input_tokens_seen": 29829776, "step": 13825 }, { "epoch": 2.2561174551386625, "grad_norm": 0.15405841171741486, "learning_rate": 4.997505879333822e-05, "loss": 0.0914, "num_input_tokens_seen": 29840304, "step": 13830 }, { "epoch": 2.2569331158238173, "grad_norm": 0.34714680910110474, "learning_rate": 4.997489960426095e-05, "loss": 0.2488, "num_input_tokens_seen": 29852048, "step": 13835 }, { "epoch": 2.257748776508972, "grad_norm": 1.275622010231018, "learning_rate": 4.997473990903566e-05, "loss": 0.108, "num_input_tokens_seen": 29861264, "step": 13840 }, { "epoch": 2.2585644371941274, "grad_norm": 0.06816834956407547, "learning_rate": 4.997457970766558e-05, "loss": 0.2304, "num_input_tokens_seen": 29871984, "step": 13845 }, { "epoch": 2.259380097879282, "grad_norm": 0.8294312357902527, "learning_rate": 4.997441900015396e-05, "loss": 0.2435, "num_input_tokens_seen": 29882256, "step": 13850 }, { "epoch": 2.2601957585644374, "grad_norm": 1.029234766960144, "learning_rate": 4.997425778650406e-05, "loss": 0.121, "num_input_tokens_seen": 29891696, "step": 13855 }, { "epoch": 2.2610114192495923, "grad_norm": 0.42017844319343567, "learning_rate": 4.997409606671915e-05, "loss": 0.0916, "num_input_tokens_seen": 29903856, "step": 13860 }, { "epoch": 2.261827079934747, "grad_norm": 0.2410479187965393, "learning_rate": 4.997393384080248e-05, "loss": 0.0865, "num_input_tokens_seen": 29913520, "step": 13865 }, { "epoch": 2.262642740619902, "grad_norm": 0.8487958908081055, "learning_rate": 4.997377110875737e-05, "loss": 0.0489, "num_input_tokens_seen": 29924016, "step": 13870 }, { "epoch": 2.263458401305057, "grad_norm": 1.75754976272583, "learning_rate": 4.9973607870587115e-05, "loss": 0.2545, "num_input_tokens_seen": 29935120, "step": 13875 }, { "epoch": 2.264274061990212, "grad_norm": 1.3466365337371826, "learning_rate": 4.997344412629501e-05, "loss": 0.1913, "num_input_tokens_seen": 29946288, "step": 13880 }, { "epoch": 2.2650897226753672, "grad_norm": 1.4591937065124512, "learning_rate": 4.9973279875884374e-05, "loss": 0.1666, "num_input_tokens_seen": 29957008, "step": 13885 }, { "epoch": 2.265905383360522, "grad_norm": 0.12075674533843994, "learning_rate": 4.997311511935855e-05, "loss": 0.0913, "num_input_tokens_seen": 29966896, "step": 13890 }, { "epoch": 2.266721044045677, "grad_norm": 2.3191864490509033, "learning_rate": 4.997294985672086e-05, "loss": 0.2251, "num_input_tokens_seen": 29976528, "step": 13895 }, { "epoch": 2.267536704730832, "grad_norm": 0.3291946351528168, "learning_rate": 4.997278408797466e-05, "loss": 0.0886, "num_input_tokens_seen": 29987120, "step": 13900 }, { "epoch": 2.268352365415987, "grad_norm": 0.48086410760879517, "learning_rate": 4.9972617813123314e-05, "loss": 0.0639, "num_input_tokens_seen": 29997680, "step": 13905 }, { "epoch": 2.2691680261011418, "grad_norm": 0.18980196118354797, "learning_rate": 4.9972451032170185e-05, "loss": 0.2306, "num_input_tokens_seen": 30008784, "step": 13910 }, { "epoch": 2.269983686786297, "grad_norm": 0.10841764509677887, "learning_rate": 4.997228374511865e-05, "loss": 0.3773, "num_input_tokens_seen": 30018416, "step": 13915 }, { "epoch": 2.270799347471452, "grad_norm": 1.272627830505371, "learning_rate": 4.997211595197212e-05, "loss": 0.1191, "num_input_tokens_seen": 30030352, "step": 13920 }, { "epoch": 2.2716150081566067, "grad_norm": 0.3583517372608185, "learning_rate": 4.9971947652733974e-05, "loss": 0.1161, "num_input_tokens_seen": 30041392, "step": 13925 }, { "epoch": 2.272430668841762, "grad_norm": 0.4301094114780426, "learning_rate": 4.997177884740762e-05, "loss": 0.0304, "num_input_tokens_seen": 30052240, "step": 13930 }, { "epoch": 2.2732463295269167, "grad_norm": 0.517392635345459, "learning_rate": 4.9971609535996505e-05, "loss": 0.2869, "num_input_tokens_seen": 30062192, "step": 13935 }, { "epoch": 2.274061990212072, "grad_norm": 0.5652008652687073, "learning_rate": 4.997143971850404e-05, "loss": 0.1353, "num_input_tokens_seen": 30073808, "step": 13940 }, { "epoch": 2.274877650897227, "grad_norm": 0.5646787881851196, "learning_rate": 4.997126939493367e-05, "loss": 0.1784, "num_input_tokens_seen": 30084016, "step": 13945 }, { "epoch": 2.2756933115823816, "grad_norm": 0.9416301250457764, "learning_rate": 4.9971098565288845e-05, "loss": 0.0875, "num_input_tokens_seen": 30095600, "step": 13950 }, { "epoch": 2.2765089722675365, "grad_norm": 0.9017264246940613, "learning_rate": 4.997092722957303e-05, "loss": 0.0907, "num_input_tokens_seen": 30106800, "step": 13955 }, { "epoch": 2.2773246329526917, "grad_norm": 0.4105575680732727, "learning_rate": 4.997075538778969e-05, "loss": 0.0378, "num_input_tokens_seen": 30118288, "step": 13960 }, { "epoch": 2.2781402936378465, "grad_norm": 0.8017412424087524, "learning_rate": 4.997058303994232e-05, "loss": 0.096, "num_input_tokens_seen": 30129552, "step": 13965 }, { "epoch": 2.278955954323002, "grad_norm": 0.40061020851135254, "learning_rate": 4.997041018603441e-05, "loss": 0.1261, "num_input_tokens_seen": 30140720, "step": 13970 }, { "epoch": 2.2797716150081566, "grad_norm": 1.1101737022399902, "learning_rate": 4.997023682606946e-05, "loss": 0.1013, "num_input_tokens_seen": 30150640, "step": 13975 }, { "epoch": 2.2805872756933114, "grad_norm": 1.1374011039733887, "learning_rate": 4.9970062960050975e-05, "loss": 0.1384, "num_input_tokens_seen": 30160976, "step": 13980 }, { "epoch": 2.2814029363784667, "grad_norm": 1.9580893516540527, "learning_rate": 4.99698885879825e-05, "loss": 0.2494, "num_input_tokens_seen": 30170576, "step": 13985 }, { "epoch": 2.2822185970636215, "grad_norm": 1.4612903594970703, "learning_rate": 4.996971370986755e-05, "loss": 0.1419, "num_input_tokens_seen": 30181904, "step": 13990 }, { "epoch": 2.2830342577487763, "grad_norm": 0.3036677837371826, "learning_rate": 4.9969538325709675e-05, "loss": 0.0585, "num_input_tokens_seen": 30193392, "step": 13995 }, { "epoch": 2.2838499184339316, "grad_norm": 2.2025678157806396, "learning_rate": 4.996936243551244e-05, "loss": 0.0959, "num_input_tokens_seen": 30204528, "step": 14000 }, { "epoch": 2.2846655791190864, "grad_norm": 1.505364179611206, "learning_rate": 4.996918603927938e-05, "loss": 0.1243, "num_input_tokens_seen": 30215760, "step": 14005 }, { "epoch": 2.2854812398042412, "grad_norm": 0.386129230260849, "learning_rate": 4.99690091370141e-05, "loss": 0.0439, "num_input_tokens_seen": 30226096, "step": 14010 }, { "epoch": 2.2862969004893965, "grad_norm": 0.19467812776565552, "learning_rate": 4.996883172872018e-05, "loss": 0.0992, "num_input_tokens_seen": 30236784, "step": 14015 }, { "epoch": 2.2871125611745513, "grad_norm": 0.3519984185695648, "learning_rate": 4.99686538144012e-05, "loss": 0.1293, "num_input_tokens_seen": 30248432, "step": 14020 }, { "epoch": 2.2879282218597066, "grad_norm": 0.40258342027664185, "learning_rate": 4.9968475394060773e-05, "loss": 0.1673, "num_input_tokens_seen": 30258448, "step": 14025 }, { "epoch": 2.2887438825448614, "grad_norm": 0.6871201395988464, "learning_rate": 4.996829646770253e-05, "loss": 0.2248, "num_input_tokens_seen": 30269424, "step": 14030 }, { "epoch": 2.289559543230016, "grad_norm": 0.3930063843727112, "learning_rate": 4.9968117035330076e-05, "loss": 0.0724, "num_input_tokens_seen": 30280144, "step": 14035 }, { "epoch": 2.2903752039151715, "grad_norm": 0.4710918068885803, "learning_rate": 4.996793709694706e-05, "loss": 0.0742, "num_input_tokens_seen": 30290064, "step": 14040 }, { "epoch": 2.2911908646003263, "grad_norm": 0.23779790103435516, "learning_rate": 4.9967756652557116e-05, "loss": 0.1728, "num_input_tokens_seen": 30299984, "step": 14045 }, { "epoch": 2.292006525285481, "grad_norm": 0.46782487630844116, "learning_rate": 4.9967575702163924e-05, "loss": 0.0511, "num_input_tokens_seen": 30311248, "step": 14050 }, { "epoch": 2.2928221859706364, "grad_norm": 0.2199455201625824, "learning_rate": 4.9967394245771125e-05, "loss": 0.1463, "num_input_tokens_seen": 30322160, "step": 14055 }, { "epoch": 2.293637846655791, "grad_norm": 1.7432650327682495, "learning_rate": 4.996721228338241e-05, "loss": 0.1604, "num_input_tokens_seen": 30333616, "step": 14060 }, { "epoch": 2.294453507340946, "grad_norm": 2.2460787296295166, "learning_rate": 4.996702981500147e-05, "loss": 0.0994, "num_input_tokens_seen": 30344112, "step": 14065 }, { "epoch": 2.2952691680261013, "grad_norm": 0.40191251039505005, "learning_rate": 4.9966846840632e-05, "loss": 0.0545, "num_input_tokens_seen": 30354192, "step": 14070 }, { "epoch": 2.296084828711256, "grad_norm": 1.5191866159439087, "learning_rate": 4.99666633602777e-05, "loss": 0.1269, "num_input_tokens_seen": 30365168, "step": 14075 }, { "epoch": 2.2969004893964113, "grad_norm": 1.1534688472747803, "learning_rate": 4.996647937394229e-05, "loss": 0.1391, "num_input_tokens_seen": 30376464, "step": 14080 }, { "epoch": 2.297716150081566, "grad_norm": 0.5267350673675537, "learning_rate": 4.996629488162951e-05, "loss": 0.105, "num_input_tokens_seen": 30387472, "step": 14085 }, { "epoch": 2.298531810766721, "grad_norm": 1.4806708097457886, "learning_rate": 4.9966109883343095e-05, "loss": 0.2651, "num_input_tokens_seen": 30398224, "step": 14090 }, { "epoch": 2.299347471451876, "grad_norm": 1.6815396547317505, "learning_rate": 4.9965924379086785e-05, "loss": 0.3, "num_input_tokens_seen": 30409872, "step": 14095 }, { "epoch": 2.300163132137031, "grad_norm": 1.053823709487915, "learning_rate": 4.996573836886435e-05, "loss": 0.1704, "num_input_tokens_seen": 30420144, "step": 14100 }, { "epoch": 2.300978792822186, "grad_norm": 0.8334737420082092, "learning_rate": 4.9965551852679554e-05, "loss": 0.1093, "num_input_tokens_seen": 30430704, "step": 14105 }, { "epoch": 2.301794453507341, "grad_norm": 0.3118553161621094, "learning_rate": 4.996536483053618e-05, "loss": 0.1023, "num_input_tokens_seen": 30441328, "step": 14110 }, { "epoch": 2.302610114192496, "grad_norm": 0.48884978890419006, "learning_rate": 4.996517730243802e-05, "loss": 0.123, "num_input_tokens_seen": 30452208, "step": 14115 }, { "epoch": 2.3034257748776508, "grad_norm": 0.5146333575248718, "learning_rate": 4.9964989268388864e-05, "loss": 0.098, "num_input_tokens_seen": 30462960, "step": 14120 }, { "epoch": 2.304241435562806, "grad_norm": 0.9709353446960449, "learning_rate": 4.996480072839253e-05, "loss": 0.1624, "num_input_tokens_seen": 30473168, "step": 14125 }, { "epoch": 2.305057096247961, "grad_norm": 1.6372745037078857, "learning_rate": 4.996461168245284e-05, "loss": 0.118, "num_input_tokens_seen": 30483824, "step": 14130 }, { "epoch": 2.3058727569331157, "grad_norm": 0.7061851024627686, "learning_rate": 4.996442213057363e-05, "loss": 0.1755, "num_input_tokens_seen": 30495632, "step": 14135 }, { "epoch": 2.306688417618271, "grad_norm": 0.7466757297515869, "learning_rate": 4.9964232072758734e-05, "loss": 0.1135, "num_input_tokens_seen": 30505808, "step": 14140 }, { "epoch": 2.3075040783034257, "grad_norm": 0.613222599029541, "learning_rate": 4.9964041509012005e-05, "loss": 0.0875, "num_input_tokens_seen": 30516368, "step": 14145 }, { "epoch": 2.3083197389885806, "grad_norm": 0.14843852818012238, "learning_rate": 4.99638504393373e-05, "loss": 0.0768, "num_input_tokens_seen": 30527856, "step": 14150 }, { "epoch": 2.309135399673736, "grad_norm": 1.492071509361267, "learning_rate": 4.9963658863738506e-05, "loss": 0.3338, "num_input_tokens_seen": 30537936, "step": 14155 }, { "epoch": 2.3099510603588906, "grad_norm": 0.8296903967857361, "learning_rate": 4.9963466782219494e-05, "loss": 0.1354, "num_input_tokens_seen": 30548304, "step": 14160 }, { "epoch": 2.310766721044046, "grad_norm": 1.167007565498352, "learning_rate": 4.9963274194784156e-05, "loss": 0.0542, "num_input_tokens_seen": 30559728, "step": 14165 }, { "epoch": 2.3115823817292007, "grad_norm": 0.39983102679252625, "learning_rate": 4.99630811014364e-05, "loss": 0.0818, "num_input_tokens_seen": 30569840, "step": 14170 }, { "epoch": 2.3123980424143555, "grad_norm": 0.40703994035720825, "learning_rate": 4.996288750218013e-05, "loss": 0.1222, "num_input_tokens_seen": 30580400, "step": 14175 }, { "epoch": 2.3132137030995104, "grad_norm": 1.8319507837295532, "learning_rate": 4.996269339701929e-05, "loss": 0.2586, "num_input_tokens_seen": 30594544, "step": 14180 }, { "epoch": 2.3140293637846656, "grad_norm": 0.2999475598335266, "learning_rate": 4.9962498785957795e-05, "loss": 0.1112, "num_input_tokens_seen": 30604912, "step": 14185 }, { "epoch": 2.3148450244698204, "grad_norm": 0.4981164336204529, "learning_rate": 4.99623036689996e-05, "loss": 0.0909, "num_input_tokens_seen": 30616400, "step": 14190 }, { "epoch": 2.3156606851549757, "grad_norm": 0.7361012101173401, "learning_rate": 4.996210804614865e-05, "loss": 0.0839, "num_input_tokens_seen": 30627824, "step": 14195 }, { "epoch": 2.3164763458401305, "grad_norm": 1.0200151205062866, "learning_rate": 4.996191191740891e-05, "loss": 0.1022, "num_input_tokens_seen": 30638320, "step": 14200 }, { "epoch": 2.3172920065252853, "grad_norm": 0.18872486054897308, "learning_rate": 4.996171528278436e-05, "loss": 0.2213, "num_input_tokens_seen": 30649744, "step": 14205 }, { "epoch": 2.3181076672104406, "grad_norm": 1.259871006011963, "learning_rate": 4.9961518142278986e-05, "loss": 0.2728, "num_input_tokens_seen": 30661136, "step": 14210 }, { "epoch": 2.3189233278955954, "grad_norm": 0.47414663434028625, "learning_rate": 4.996132049589678e-05, "loss": 0.1003, "num_input_tokens_seen": 30672368, "step": 14215 }, { "epoch": 2.3197389885807502, "grad_norm": 1.3450078964233398, "learning_rate": 4.996112234364175e-05, "loss": 0.1754, "num_input_tokens_seen": 30684784, "step": 14220 }, { "epoch": 2.3205546492659055, "grad_norm": 0.823032021522522, "learning_rate": 4.9960923685517914e-05, "loss": 0.0831, "num_input_tokens_seen": 30696432, "step": 14225 }, { "epoch": 2.3213703099510603, "grad_norm": 0.6261426210403442, "learning_rate": 4.996072452152929e-05, "loss": 0.2373, "num_input_tokens_seen": 30706256, "step": 14230 }, { "epoch": 2.322185970636215, "grad_norm": 1.8959767818450928, "learning_rate": 4.9960524851679915e-05, "loss": 0.0942, "num_input_tokens_seen": 30716976, "step": 14235 }, { "epoch": 2.3230016313213704, "grad_norm": 1.7482415437698364, "learning_rate": 4.996032467597384e-05, "loss": 0.2743, "num_input_tokens_seen": 30726768, "step": 14240 }, { "epoch": 2.323817292006525, "grad_norm": 1.9620435237884521, "learning_rate": 4.996012399441513e-05, "loss": 0.136, "num_input_tokens_seen": 30737072, "step": 14245 }, { "epoch": 2.3246329526916805, "grad_norm": 0.7041464447975159, "learning_rate": 4.995992280700783e-05, "loss": 0.1407, "num_input_tokens_seen": 30748400, "step": 14250 }, { "epoch": 2.3254486133768353, "grad_norm": 1.6028379201889038, "learning_rate": 4.995972111375604e-05, "loss": 0.1499, "num_input_tokens_seen": 30760240, "step": 14255 }, { "epoch": 2.32626427406199, "grad_norm": 0.1216692179441452, "learning_rate": 4.9959518914663836e-05, "loss": 0.1077, "num_input_tokens_seen": 30770928, "step": 14260 }, { "epoch": 2.3270799347471454, "grad_norm": 1.3172913789749146, "learning_rate": 4.995931620973532e-05, "loss": 0.0845, "num_input_tokens_seen": 30780240, "step": 14265 }, { "epoch": 2.3278955954323, "grad_norm": 0.10251553356647491, "learning_rate": 4.995911299897459e-05, "loss": 0.0734, "num_input_tokens_seen": 30792144, "step": 14270 }, { "epoch": 2.328711256117455, "grad_norm": 0.9616447687149048, "learning_rate": 4.995890928238578e-05, "loss": 0.146, "num_input_tokens_seen": 30802416, "step": 14275 }, { "epoch": 2.3295269168026103, "grad_norm": 0.7350335121154785, "learning_rate": 4.9958705059973e-05, "loss": 0.135, "num_input_tokens_seen": 30813872, "step": 14280 }, { "epoch": 2.330342577487765, "grad_norm": 0.5562069416046143, "learning_rate": 4.995850033174041e-05, "loss": 0.0654, "num_input_tokens_seen": 30825584, "step": 14285 }, { "epoch": 2.33115823817292, "grad_norm": 0.8135204315185547, "learning_rate": 4.995829509769215e-05, "loss": 0.0615, "num_input_tokens_seen": 30836688, "step": 14290 }, { "epoch": 2.331973898858075, "grad_norm": 0.30318477749824524, "learning_rate": 4.995808935783237e-05, "loss": 0.1118, "num_input_tokens_seen": 30848816, "step": 14295 }, { "epoch": 2.33278955954323, "grad_norm": 1.2048496007919312, "learning_rate": 4.9957883112165246e-05, "loss": 0.1563, "num_input_tokens_seen": 30858608, "step": 14300 }, { "epoch": 2.3336052202283852, "grad_norm": 0.1767560988664627, "learning_rate": 4.995767636069497e-05, "loss": 0.1012, "num_input_tokens_seen": 30868560, "step": 14305 }, { "epoch": 2.33442088091354, "grad_norm": 0.14414174854755402, "learning_rate": 4.995746910342571e-05, "loss": 0.0266, "num_input_tokens_seen": 30878192, "step": 14310 }, { "epoch": 2.335236541598695, "grad_norm": 0.15969465672969818, "learning_rate": 4.9957261340361694e-05, "loss": 0.0611, "num_input_tokens_seen": 30888432, "step": 14315 }, { "epoch": 2.3360522022838497, "grad_norm": 0.12849991023540497, "learning_rate": 4.9957053071507105e-05, "loss": 0.0961, "num_input_tokens_seen": 30898864, "step": 14320 }, { "epoch": 2.336867862969005, "grad_norm": 0.8868157267570496, "learning_rate": 4.995684429686618e-05, "loss": 0.1884, "num_input_tokens_seen": 30909424, "step": 14325 }, { "epoch": 2.3376835236541598, "grad_norm": 0.4184124171733856, "learning_rate": 4.995663501644314e-05, "loss": 0.0894, "num_input_tokens_seen": 30919600, "step": 14330 }, { "epoch": 2.338499184339315, "grad_norm": 0.040639109909534454, "learning_rate": 4.9956425230242234e-05, "loss": 0.0511, "num_input_tokens_seen": 30930416, "step": 14335 }, { "epoch": 2.33931484502447, "grad_norm": 0.17510588467121124, "learning_rate": 4.995621493826771e-05, "loss": 0.0627, "num_input_tokens_seen": 30940560, "step": 14340 }, { "epoch": 2.3401305057096247, "grad_norm": 1.0120326280593872, "learning_rate": 4.9956004140523835e-05, "loss": 0.2415, "num_input_tokens_seen": 30951440, "step": 14345 }, { "epoch": 2.34094616639478, "grad_norm": 0.08555498719215393, "learning_rate": 4.995579283701487e-05, "loss": 0.1636, "num_input_tokens_seen": 30961936, "step": 14350 }, { "epoch": 2.3417618270799347, "grad_norm": 0.8076527118682861, "learning_rate": 4.9955581027745114e-05, "loss": 0.1559, "num_input_tokens_seen": 30972112, "step": 14355 }, { "epoch": 2.3425774877650896, "grad_norm": 1.8340954780578613, "learning_rate": 4.995536871271884e-05, "loss": 0.1823, "num_input_tokens_seen": 30984272, "step": 14360 }, { "epoch": 2.343393148450245, "grad_norm": 0.2473660260438919, "learning_rate": 4.995515589194037e-05, "loss": 0.2349, "num_input_tokens_seen": 30996368, "step": 14365 }, { "epoch": 2.3442088091353996, "grad_norm": 1.5629115104675293, "learning_rate": 4.9954942565414e-05, "loss": 0.2269, "num_input_tokens_seen": 31007216, "step": 14370 }, { "epoch": 2.3450244698205545, "grad_norm": 0.29729345440864563, "learning_rate": 4.995472873314406e-05, "loss": 0.0428, "num_input_tokens_seen": 31018416, "step": 14375 }, { "epoch": 2.3458401305057097, "grad_norm": 0.13774876296520233, "learning_rate": 4.9954514395134896e-05, "loss": 0.0497, "num_input_tokens_seen": 31030416, "step": 14380 }, { "epoch": 2.3466557911908645, "grad_norm": 0.8848845362663269, "learning_rate": 4.995429955139083e-05, "loss": 0.1016, "num_input_tokens_seen": 31040752, "step": 14385 }, { "epoch": 2.34747145187602, "grad_norm": 0.22326458990573883, "learning_rate": 4.9954084201916235e-05, "loss": 0.1076, "num_input_tokens_seen": 31052048, "step": 14390 }, { "epoch": 2.3482871125611746, "grad_norm": 2.5469260215759277, "learning_rate": 4.995386834671546e-05, "loss": 0.2054, "num_input_tokens_seen": 31062320, "step": 14395 }, { "epoch": 2.3491027732463294, "grad_norm": 1.8705164194107056, "learning_rate": 4.995365198579289e-05, "loss": 0.1555, "num_input_tokens_seen": 31071856, "step": 14400 }, { "epoch": 2.3499184339314847, "grad_norm": 0.01961262710392475, "learning_rate": 4.9953435119152915e-05, "loss": 0.0274, "num_input_tokens_seen": 31082096, "step": 14405 }, { "epoch": 2.3507340946166395, "grad_norm": 0.8534370064735413, "learning_rate": 4.995321774679991e-05, "loss": 0.2325, "num_input_tokens_seen": 31092400, "step": 14410 }, { "epoch": 2.3515497553017943, "grad_norm": 0.17109975218772888, "learning_rate": 4.9952999868738293e-05, "loss": 0.1898, "num_input_tokens_seen": 31102768, "step": 14415 }, { "epoch": 2.3523654159869496, "grad_norm": 1.0582104921340942, "learning_rate": 4.9952781484972483e-05, "loss": 0.065, "num_input_tokens_seen": 31113936, "step": 14420 }, { "epoch": 2.3531810766721044, "grad_norm": 0.19937384128570557, "learning_rate": 4.99525625955069e-05, "loss": 0.0755, "num_input_tokens_seen": 31123920, "step": 14425 }, { "epoch": 2.3539967373572592, "grad_norm": 0.8423159122467041, "learning_rate": 4.995234320034598e-05, "loss": 0.2883, "num_input_tokens_seen": 31135472, "step": 14430 }, { "epoch": 2.3548123980424145, "grad_norm": 0.05629667267203331, "learning_rate": 4.995212329949417e-05, "loss": 0.1521, "num_input_tokens_seen": 31145552, "step": 14435 }, { "epoch": 2.3556280587275693, "grad_norm": 1.8618853092193604, "learning_rate": 4.995190289295593e-05, "loss": 0.1325, "num_input_tokens_seen": 31157680, "step": 14440 }, { "epoch": 2.356443719412724, "grad_norm": 0.38815391063690186, "learning_rate": 4.995168198073572e-05, "loss": 0.0816, "num_input_tokens_seen": 31168720, "step": 14445 }, { "epoch": 2.3572593800978794, "grad_norm": 2.008284091949463, "learning_rate": 4.9951460562838027e-05, "loss": 0.3288, "num_input_tokens_seen": 31178832, "step": 14450 }, { "epoch": 2.358075040783034, "grad_norm": 1.3413231372833252, "learning_rate": 4.995123863926733e-05, "loss": 0.1369, "num_input_tokens_seen": 31190288, "step": 14455 }, { "epoch": 2.358890701468189, "grad_norm": 0.2790631353855133, "learning_rate": 4.995101621002813e-05, "loss": 0.142, "num_input_tokens_seen": 31202128, "step": 14460 }, { "epoch": 2.3597063621533443, "grad_norm": 0.21699996292591095, "learning_rate": 4.995079327512493e-05, "loss": 0.0763, "num_input_tokens_seen": 31213232, "step": 14465 }, { "epoch": 2.360522022838499, "grad_norm": 1.5494275093078613, "learning_rate": 4.9950569834562255e-05, "loss": 0.2149, "num_input_tokens_seen": 31224272, "step": 14470 }, { "epoch": 2.3613376835236544, "grad_norm": 0.35930734872817993, "learning_rate": 4.995034588834463e-05, "loss": 0.1473, "num_input_tokens_seen": 31234064, "step": 14475 }, { "epoch": 2.362153344208809, "grad_norm": 0.10245024412870407, "learning_rate": 4.995012143647659e-05, "loss": 0.3256, "num_input_tokens_seen": 31245776, "step": 14480 }, { "epoch": 2.362969004893964, "grad_norm": 0.08120241016149521, "learning_rate": 4.99498964789627e-05, "loss": 0.1422, "num_input_tokens_seen": 31257392, "step": 14485 }, { "epoch": 2.3637846655791193, "grad_norm": 0.49443021416664124, "learning_rate": 4.99496710158075e-05, "loss": 0.0713, "num_input_tokens_seen": 31267408, "step": 14490 }, { "epoch": 2.364600326264274, "grad_norm": 0.5970107316970825, "learning_rate": 4.9949445047015556e-05, "loss": 0.1068, "num_input_tokens_seen": 31278960, "step": 14495 }, { "epoch": 2.365415986949429, "grad_norm": 1.4382009506225586, "learning_rate": 4.994921857259147e-05, "loss": 0.2966, "num_input_tokens_seen": 31289264, "step": 14500 }, { "epoch": 2.366231647634584, "grad_norm": 1.8343677520751953, "learning_rate": 4.994899159253981e-05, "loss": 0.125, "num_input_tokens_seen": 31299152, "step": 14505 }, { "epoch": 2.367047308319739, "grad_norm": 0.7329450249671936, "learning_rate": 4.9948764106865194e-05, "loss": 0.1253, "num_input_tokens_seen": 31308944, "step": 14510 }, { "epoch": 2.367862969004894, "grad_norm": 0.020255832001566887, "learning_rate": 4.994853611557222e-05, "loss": 0.092, "num_input_tokens_seen": 31319824, "step": 14515 }, { "epoch": 2.368678629690049, "grad_norm": 0.9065136909484863, "learning_rate": 4.9948307618665516e-05, "loss": 0.1802, "num_input_tokens_seen": 31332016, "step": 14520 }, { "epoch": 2.369494290375204, "grad_norm": 1.7484170198440552, "learning_rate": 4.99480786161497e-05, "loss": 0.2934, "num_input_tokens_seen": 31342512, "step": 14525 }, { "epoch": 2.370309951060359, "grad_norm": 0.20684857666492462, "learning_rate": 4.994784910802943e-05, "loss": 0.0571, "num_input_tokens_seen": 31352528, "step": 14530 }, { "epoch": 2.371125611745514, "grad_norm": 0.4585258662700653, "learning_rate": 4.994761909430934e-05, "loss": 0.0528, "num_input_tokens_seen": 31361520, "step": 14535 }, { "epoch": 2.3719412724306688, "grad_norm": 1.4120346307754517, "learning_rate": 4.994738857499411e-05, "loss": 0.1764, "num_input_tokens_seen": 31372304, "step": 14540 }, { "epoch": 2.3727569331158236, "grad_norm": 0.6946755051612854, "learning_rate": 4.9947157550088394e-05, "loss": 0.2039, "num_input_tokens_seen": 31382960, "step": 14545 }, { "epoch": 2.373572593800979, "grad_norm": 1.4880932569503784, "learning_rate": 4.99469260195969e-05, "loss": 0.2108, "num_input_tokens_seen": 31394128, "step": 14550 }, { "epoch": 2.3743882544861337, "grad_norm": 0.5965209603309631, "learning_rate": 4.9946693983524285e-05, "loss": 0.0608, "num_input_tokens_seen": 31405072, "step": 14555 }, { "epoch": 2.375203915171289, "grad_norm": 0.5714390873908997, "learning_rate": 4.994646144187527e-05, "loss": 0.0619, "num_input_tokens_seen": 31415280, "step": 14560 }, { "epoch": 2.3760195758564437, "grad_norm": 0.5995455980300903, "learning_rate": 4.994622839465457e-05, "loss": 0.0586, "num_input_tokens_seen": 31426384, "step": 14565 }, { "epoch": 2.3768352365415986, "grad_norm": 0.5444142818450928, "learning_rate": 4.994599484186691e-05, "loss": 0.0969, "num_input_tokens_seen": 31436304, "step": 14570 }, { "epoch": 2.377650897226754, "grad_norm": 0.12842991948127747, "learning_rate": 4.994576078351701e-05, "loss": 0.1254, "num_input_tokens_seen": 31448240, "step": 14575 }, { "epoch": 2.3784665579119086, "grad_norm": 0.04615698754787445, "learning_rate": 4.994552621960963e-05, "loss": 0.2369, "num_input_tokens_seen": 31458928, "step": 14580 }, { "epoch": 2.3792822185970635, "grad_norm": 0.3577094078063965, "learning_rate": 4.994529115014952e-05, "loss": 0.1492, "num_input_tokens_seen": 31470480, "step": 14585 }, { "epoch": 2.3800978792822187, "grad_norm": 0.20597073435783386, "learning_rate": 4.994505557514143e-05, "loss": 0.0703, "num_input_tokens_seen": 31481968, "step": 14590 }, { "epoch": 2.3809135399673735, "grad_norm": 0.6222265362739563, "learning_rate": 4.994481949459014e-05, "loss": 0.1375, "num_input_tokens_seen": 31493360, "step": 14595 }, { "epoch": 2.3817292006525284, "grad_norm": 1.7763041257858276, "learning_rate": 4.994458290850045e-05, "loss": 0.1622, "num_input_tokens_seen": 31505552, "step": 14600 }, { "epoch": 2.3825448613376836, "grad_norm": 0.052474088966846466, "learning_rate": 4.9944345816877136e-05, "loss": 0.0708, "num_input_tokens_seen": 31516336, "step": 14605 }, { "epoch": 2.3833605220228384, "grad_norm": 0.06288166344165802, "learning_rate": 4.994410821972501e-05, "loss": 0.0592, "num_input_tokens_seen": 31527152, "step": 14610 }, { "epoch": 2.3841761827079937, "grad_norm": 0.0696997344493866, "learning_rate": 4.994387011704889e-05, "loss": 0.0314, "num_input_tokens_seen": 31536400, "step": 14615 }, { "epoch": 2.3849918433931485, "grad_norm": 1.1214076280593872, "learning_rate": 4.994363150885359e-05, "loss": 0.0678, "num_input_tokens_seen": 31547056, "step": 14620 }, { "epoch": 2.3858075040783033, "grad_norm": 0.24260136485099792, "learning_rate": 4.9943392395143965e-05, "loss": 0.1251, "num_input_tokens_seen": 31557840, "step": 14625 }, { "epoch": 2.3866231647634586, "grad_norm": 0.045223917812108994, "learning_rate": 4.9943152775924845e-05, "loss": 0.0301, "num_input_tokens_seen": 31569424, "step": 14630 }, { "epoch": 2.3874388254486134, "grad_norm": 0.7279911041259766, "learning_rate": 4.994291265120109e-05, "loss": 0.354, "num_input_tokens_seen": 31579504, "step": 14635 }, { "epoch": 2.3882544861337682, "grad_norm": 0.6131106615066528, "learning_rate": 4.994267202097758e-05, "loss": 0.2365, "num_input_tokens_seen": 31591120, "step": 14640 }, { "epoch": 2.3890701468189235, "grad_norm": 0.554283857345581, "learning_rate": 4.9942430885259165e-05, "loss": 0.0401, "num_input_tokens_seen": 31601968, "step": 14645 }, { "epoch": 2.3898858075040783, "grad_norm": 0.2474360316991806, "learning_rate": 4.994218924405075e-05, "loss": 0.2123, "num_input_tokens_seen": 31612944, "step": 14650 }, { "epoch": 2.390701468189233, "grad_norm": 0.8412249684333801, "learning_rate": 4.994194709735723e-05, "loss": 0.0267, "num_input_tokens_seen": 31623472, "step": 14655 }, { "epoch": 2.3915171288743884, "grad_norm": 0.43420472741127014, "learning_rate": 4.994170444518351e-05, "loss": 0.1384, "num_input_tokens_seen": 31635248, "step": 14660 }, { "epoch": 2.392332789559543, "grad_norm": 1.1599472761154175, "learning_rate": 4.9941461287534505e-05, "loss": 0.2425, "num_input_tokens_seen": 31646320, "step": 14665 }, { "epoch": 2.393148450244698, "grad_norm": 0.11162879317998886, "learning_rate": 4.994121762441515e-05, "loss": 0.1114, "num_input_tokens_seen": 31658608, "step": 14670 }, { "epoch": 2.3939641109298533, "grad_norm": 1.238590121269226, "learning_rate": 4.994097345583039e-05, "loss": 0.1384, "num_input_tokens_seen": 31669680, "step": 14675 }, { "epoch": 2.394779771615008, "grad_norm": 0.6302983164787292, "learning_rate": 4.994072878178514e-05, "loss": 0.2365, "num_input_tokens_seen": 31679696, "step": 14680 }, { "epoch": 2.395595432300163, "grad_norm": 1.8878062963485718, "learning_rate": 4.9940483602284404e-05, "loss": 0.1955, "num_input_tokens_seen": 31691472, "step": 14685 }, { "epoch": 2.396411092985318, "grad_norm": 0.8568604588508606, "learning_rate": 4.994023791733312e-05, "loss": 0.1849, "num_input_tokens_seen": 31701968, "step": 14690 }, { "epoch": 2.397226753670473, "grad_norm": 0.14190800487995148, "learning_rate": 4.9939991726936274e-05, "loss": 0.2692, "num_input_tokens_seen": 31713104, "step": 14695 }, { "epoch": 2.3980424143556283, "grad_norm": 0.15068891644477844, "learning_rate": 4.993974503109886e-05, "loss": 0.1088, "num_input_tokens_seen": 31724560, "step": 14700 }, { "epoch": 2.398858075040783, "grad_norm": 0.1039973720908165, "learning_rate": 4.993949782982586e-05, "loss": 0.087, "num_input_tokens_seen": 31733776, "step": 14705 }, { "epoch": 2.399673735725938, "grad_norm": 0.41949498653411865, "learning_rate": 4.993925012312232e-05, "loss": 0.1266, "num_input_tokens_seen": 31745680, "step": 14710 }, { "epoch": 2.400489396411093, "grad_norm": 1.113813042640686, "learning_rate": 4.993900191099323e-05, "loss": 0.1154, "num_input_tokens_seen": 31756240, "step": 14715 }, { "epoch": 2.401305057096248, "grad_norm": 0.33624565601348877, "learning_rate": 4.9938753193443624e-05, "loss": 0.0487, "num_input_tokens_seen": 31767056, "step": 14720 }, { "epoch": 2.402120717781403, "grad_norm": 0.5839208364486694, "learning_rate": 4.993850397047856e-05, "loss": 0.0814, "num_input_tokens_seen": 31777968, "step": 14725 }, { "epoch": 2.402936378466558, "grad_norm": 0.263527512550354, "learning_rate": 4.993825424210307e-05, "loss": 0.0805, "num_input_tokens_seen": 31788976, "step": 14730 }, { "epoch": 2.403752039151713, "grad_norm": 0.3922027349472046, "learning_rate": 4.9938004008322213e-05, "loss": 0.0824, "num_input_tokens_seen": 31799280, "step": 14735 }, { "epoch": 2.4045676998368677, "grad_norm": 0.259439617395401, "learning_rate": 4.993775326914107e-05, "loss": 0.1337, "num_input_tokens_seen": 31810288, "step": 14740 }, { "epoch": 2.405383360522023, "grad_norm": 0.4288676977157593, "learning_rate": 4.993750202456473e-05, "loss": 0.1294, "num_input_tokens_seen": 31820880, "step": 14745 }, { "epoch": 2.4061990212071778, "grad_norm": 0.23828239738941193, "learning_rate": 4.993725027459828e-05, "loss": 0.0624, "num_input_tokens_seen": 31830704, "step": 14750 }, { "epoch": 2.407014681892333, "grad_norm": 0.5213771462440491, "learning_rate": 4.9936998019246805e-05, "loss": 0.1325, "num_input_tokens_seen": 31842608, "step": 14755 }, { "epoch": 2.407830342577488, "grad_norm": 0.13014283776283264, "learning_rate": 4.993674525851544e-05, "loss": 0.1443, "num_input_tokens_seen": 31853456, "step": 14760 }, { "epoch": 2.4086460032626427, "grad_norm": 0.35695379972457886, "learning_rate": 4.9936491992409294e-05, "loss": 0.2612, "num_input_tokens_seen": 31864240, "step": 14765 }, { "epoch": 2.4094616639477975, "grad_norm": 0.3495272397994995, "learning_rate": 4.99362382209335e-05, "loss": 0.189, "num_input_tokens_seen": 31874896, "step": 14770 }, { "epoch": 2.4102773246329527, "grad_norm": 0.7859742641448975, "learning_rate": 4.993598394409321e-05, "loss": 0.0998, "num_input_tokens_seen": 31887120, "step": 14775 }, { "epoch": 2.4110929853181076, "grad_norm": 2.105457305908203, "learning_rate": 4.9935729161893566e-05, "loss": 0.203, "num_input_tokens_seen": 31899216, "step": 14780 }, { "epoch": 2.411908646003263, "grad_norm": 2.090519428253174, "learning_rate": 4.993547387433975e-05, "loss": 0.1117, "num_input_tokens_seen": 31911248, "step": 14785 }, { "epoch": 2.4127243066884176, "grad_norm": 0.5037726759910583, "learning_rate": 4.993521808143691e-05, "loss": 0.0861, "num_input_tokens_seen": 31921264, "step": 14790 }, { "epoch": 2.4135399673735725, "grad_norm": 0.539825975894928, "learning_rate": 4.9934961783190247e-05, "loss": 0.0562, "num_input_tokens_seen": 31932208, "step": 14795 }, { "epoch": 2.4143556280587277, "grad_norm": 2.1187233924865723, "learning_rate": 4.993470497960495e-05, "loss": 0.2223, "num_input_tokens_seen": 31943792, "step": 14800 }, { "epoch": 2.4151712887438825, "grad_norm": 0.04933340474963188, "learning_rate": 4.993444767068623e-05, "loss": 0.2772, "num_input_tokens_seen": 31954128, "step": 14805 }, { "epoch": 2.4159869494290374, "grad_norm": 0.9354478120803833, "learning_rate": 4.993418985643929e-05, "loss": 0.1444, "num_input_tokens_seen": 31964176, "step": 14810 }, { "epoch": 2.4168026101141926, "grad_norm": 0.4278511106967926, "learning_rate": 4.993393153686936e-05, "loss": 0.0933, "num_input_tokens_seen": 31975312, "step": 14815 }, { "epoch": 2.4176182707993474, "grad_norm": 0.5592229962348938, "learning_rate": 4.9933672711981685e-05, "loss": 0.129, "num_input_tokens_seen": 31985520, "step": 14820 }, { "epoch": 2.4184339314845023, "grad_norm": 0.05800825357437134, "learning_rate": 4.9933413381781494e-05, "loss": 0.0275, "num_input_tokens_seen": 31996656, "step": 14825 }, { "epoch": 2.4192495921696575, "grad_norm": 0.7536208033561707, "learning_rate": 4.993315354627406e-05, "loss": 0.0526, "num_input_tokens_seen": 32005680, "step": 14830 }, { "epoch": 2.4200652528548123, "grad_norm": 0.06820283085107803, "learning_rate": 4.9932893205464625e-05, "loss": 0.0959, "num_input_tokens_seen": 32017232, "step": 14835 }, { "epoch": 2.4208809135399676, "grad_norm": 0.24556943774223328, "learning_rate": 4.993263235935849e-05, "loss": 0.113, "num_input_tokens_seen": 32028112, "step": 14840 }, { "epoch": 2.4216965742251224, "grad_norm": 2.3409359455108643, "learning_rate": 4.9932371007960935e-05, "loss": 0.2476, "num_input_tokens_seen": 32038608, "step": 14845 }, { "epoch": 2.4225122349102772, "grad_norm": 0.4000239074230194, "learning_rate": 4.993210915127724e-05, "loss": 0.1847, "num_input_tokens_seen": 32048880, "step": 14850 }, { "epoch": 2.4233278955954325, "grad_norm": 1.1075351238250732, "learning_rate": 4.993184678931273e-05, "loss": 0.1958, "num_input_tokens_seen": 32058864, "step": 14855 }, { "epoch": 2.4241435562805873, "grad_norm": 0.12574052810668945, "learning_rate": 4.993158392207272e-05, "loss": 0.069, "num_input_tokens_seen": 32068816, "step": 14860 }, { "epoch": 2.424959216965742, "grad_norm": 1.5718268156051636, "learning_rate": 4.9931320549562534e-05, "loss": 0.2648, "num_input_tokens_seen": 32080848, "step": 14865 }, { "epoch": 2.4257748776508974, "grad_norm": 1.0698295831680298, "learning_rate": 4.99310566717875e-05, "loss": 0.0977, "num_input_tokens_seen": 32091824, "step": 14870 }, { "epoch": 2.426590538336052, "grad_norm": 0.9533593654632568, "learning_rate": 4.993079228875298e-05, "loss": 0.1281, "num_input_tokens_seen": 32102448, "step": 14875 }, { "epoch": 2.427406199021207, "grad_norm": 1.0604594945907593, "learning_rate": 4.9930527400464336e-05, "loss": 0.0661, "num_input_tokens_seen": 32114000, "step": 14880 }, { "epoch": 2.4282218597063623, "grad_norm": 0.23769007623195648, "learning_rate": 4.993026200692692e-05, "loss": 0.109, "num_input_tokens_seen": 32126192, "step": 14885 }, { "epoch": 2.429037520391517, "grad_norm": 0.9183831214904785, "learning_rate": 4.992999610814612e-05, "loss": 0.0325, "num_input_tokens_seen": 32136880, "step": 14890 }, { "epoch": 2.429853181076672, "grad_norm": 1.340388298034668, "learning_rate": 4.992972970412732e-05, "loss": 0.2003, "num_input_tokens_seen": 32147408, "step": 14895 }, { "epoch": 2.430668841761827, "grad_norm": 0.7028934955596924, "learning_rate": 4.9929462794875924e-05, "loss": 0.2537, "num_input_tokens_seen": 32158416, "step": 14900 }, { "epoch": 2.431484502446982, "grad_norm": 0.5153692960739136, "learning_rate": 4.992919538039734e-05, "loss": 0.2883, "num_input_tokens_seen": 32169264, "step": 14905 }, { "epoch": 2.432300163132137, "grad_norm": 0.12689191102981567, "learning_rate": 4.992892746069698e-05, "loss": 0.2044, "num_input_tokens_seen": 32181872, "step": 14910 }, { "epoch": 2.433115823817292, "grad_norm": 0.3783186674118042, "learning_rate": 4.992865903578029e-05, "loss": 0.2813, "num_input_tokens_seen": 32192816, "step": 14915 }, { "epoch": 2.433931484502447, "grad_norm": 0.5921608805656433, "learning_rate": 4.99283901056527e-05, "loss": 0.0851, "num_input_tokens_seen": 32203952, "step": 14920 }, { "epoch": 2.434747145187602, "grad_norm": 0.1908661127090454, "learning_rate": 4.992812067031966e-05, "loss": 0.154, "num_input_tokens_seen": 32215088, "step": 14925 }, { "epoch": 2.435562805872757, "grad_norm": 0.11208155751228333, "learning_rate": 4.9927850729786616e-05, "loss": 0.0678, "num_input_tokens_seen": 32226256, "step": 14930 }, { "epoch": 2.436378466557912, "grad_norm": 0.732160747051239, "learning_rate": 4.992758028405907e-05, "loss": 0.1907, "num_input_tokens_seen": 32238096, "step": 14935 }, { "epoch": 2.437194127243067, "grad_norm": 0.30704376101493835, "learning_rate": 4.992730933314248e-05, "loss": 0.1246, "num_input_tokens_seen": 32247440, "step": 14940 }, { "epoch": 2.438009787928222, "grad_norm": 0.20130206644535065, "learning_rate": 4.992703787704235e-05, "loss": 0.0307, "num_input_tokens_seen": 32257488, "step": 14945 }, { "epoch": 2.4388254486133767, "grad_norm": 0.2526203989982605, "learning_rate": 4.992676591576417e-05, "loss": 0.0757, "num_input_tokens_seen": 32268816, "step": 14950 }, { "epoch": 2.439641109298532, "grad_norm": 0.4729174077510834, "learning_rate": 4.992649344931346e-05, "loss": 0.1525, "num_input_tokens_seen": 32280048, "step": 14955 }, { "epoch": 2.4404567699836868, "grad_norm": 0.8170763254165649, "learning_rate": 4.992622047769574e-05, "loss": 0.1113, "num_input_tokens_seen": 32289680, "step": 14960 }, { "epoch": 2.4412724306688416, "grad_norm": 0.33611688017845154, "learning_rate": 4.9925947000916535e-05, "loss": 0.06, "num_input_tokens_seen": 32300656, "step": 14965 }, { "epoch": 2.442088091353997, "grad_norm": 1.5261704921722412, "learning_rate": 4.992567301898139e-05, "loss": 0.1245, "num_input_tokens_seen": 32311888, "step": 14970 }, { "epoch": 2.4429037520391517, "grad_norm": 0.3591882884502411, "learning_rate": 4.992539853189587e-05, "loss": 0.1559, "num_input_tokens_seen": 32323248, "step": 14975 }, { "epoch": 2.443719412724307, "grad_norm": 0.22264203429222107, "learning_rate": 4.992512353966553e-05, "loss": 0.0651, "num_input_tokens_seen": 32333808, "step": 14980 }, { "epoch": 2.4445350734094617, "grad_norm": 2.4685587882995605, "learning_rate": 4.992484804229594e-05, "loss": 0.295, "num_input_tokens_seen": 32345648, "step": 14985 }, { "epoch": 2.4453507340946166, "grad_norm": 0.5319798588752747, "learning_rate": 4.9924572039792676e-05, "loss": 0.0809, "num_input_tokens_seen": 32356272, "step": 14990 }, { "epoch": 2.4461663947797714, "grad_norm": 0.2800239622592926, "learning_rate": 4.992429553216135e-05, "loss": 0.2462, "num_input_tokens_seen": 32366800, "step": 14995 }, { "epoch": 2.4469820554649266, "grad_norm": 0.08019404113292694, "learning_rate": 4.992401851940756e-05, "loss": 0.2747, "num_input_tokens_seen": 32377072, "step": 15000 }, { "epoch": 2.4477977161500815, "grad_norm": 1.2246243953704834, "learning_rate": 4.992374100153691e-05, "loss": 0.2281, "num_input_tokens_seen": 32386640, "step": 15005 }, { "epoch": 2.4486133768352367, "grad_norm": 1.135075330734253, "learning_rate": 4.9923462978555026e-05, "loss": 0.1203, "num_input_tokens_seen": 32396496, "step": 15010 }, { "epoch": 2.4494290375203915, "grad_norm": 0.5711943507194519, "learning_rate": 4.992318445046755e-05, "loss": 0.1419, "num_input_tokens_seen": 32407824, "step": 15015 }, { "epoch": 2.4502446982055464, "grad_norm": 0.15759418904781342, "learning_rate": 4.992290541728012e-05, "loss": 0.1279, "num_input_tokens_seen": 32417968, "step": 15020 }, { "epoch": 2.4510603588907016, "grad_norm": 0.15286098420619965, "learning_rate": 4.99226258789984e-05, "loss": 0.2349, "num_input_tokens_seen": 32428240, "step": 15025 }, { "epoch": 2.4518760195758564, "grad_norm": 0.5641433596611023, "learning_rate": 4.9922345835628046e-05, "loss": 0.0619, "num_input_tokens_seen": 32438704, "step": 15030 }, { "epoch": 2.4526916802610113, "grad_norm": 0.7204052209854126, "learning_rate": 4.992206528717475e-05, "loss": 0.1537, "num_input_tokens_seen": 32449776, "step": 15035 }, { "epoch": 2.4535073409461665, "grad_norm": 0.2841648459434509, "learning_rate": 4.9921784233644176e-05, "loss": 0.0333, "num_input_tokens_seen": 32460240, "step": 15040 }, { "epoch": 2.4543230016313213, "grad_norm": 0.08456464111804962, "learning_rate": 4.992150267504203e-05, "loss": 0.0912, "num_input_tokens_seen": 32469808, "step": 15045 }, { "epoch": 2.455138662316476, "grad_norm": 0.7529263496398926, "learning_rate": 4.9921220611374016e-05, "loss": 0.0751, "num_input_tokens_seen": 32481232, "step": 15050 }, { "epoch": 2.4559543230016314, "grad_norm": 0.8262142539024353, "learning_rate": 4.992093804264585e-05, "loss": 0.2894, "num_input_tokens_seen": 32492272, "step": 15055 }, { "epoch": 2.4567699836867862, "grad_norm": 0.26241207122802734, "learning_rate": 4.992065496886326e-05, "loss": 0.1456, "num_input_tokens_seen": 32503312, "step": 15060 }, { "epoch": 2.4575856443719415, "grad_norm": 2.0461783409118652, "learning_rate": 4.9920371390031985e-05, "loss": 0.1744, "num_input_tokens_seen": 32513424, "step": 15065 }, { "epoch": 2.4584013050570963, "grad_norm": 1.5780824422836304, "learning_rate": 4.992008730615777e-05, "loss": 0.1195, "num_input_tokens_seen": 32525264, "step": 15070 }, { "epoch": 2.459216965742251, "grad_norm": 0.7205225825309753, "learning_rate": 4.991980271724638e-05, "loss": 0.1962, "num_input_tokens_seen": 32536592, "step": 15075 }, { "epoch": 2.4600326264274064, "grad_norm": 0.21107235550880432, "learning_rate": 4.9919517623303555e-05, "loss": 0.0626, "num_input_tokens_seen": 32547344, "step": 15080 }, { "epoch": 2.460848287112561, "grad_norm": 0.4406944513320923, "learning_rate": 4.991923202433511e-05, "loss": 0.0572, "num_input_tokens_seen": 32558800, "step": 15085 }, { "epoch": 2.461663947797716, "grad_norm": 1.6760591268539429, "learning_rate": 4.991894592034682e-05, "loss": 0.0948, "num_input_tokens_seen": 32569392, "step": 15090 }, { "epoch": 2.4624796084828713, "grad_norm": 0.05471138656139374, "learning_rate": 4.9918659311344464e-05, "loss": 0.112, "num_input_tokens_seen": 32579632, "step": 15095 }, { "epoch": 2.463295269168026, "grad_norm": 0.31619369983673096, "learning_rate": 4.9918372197333865e-05, "loss": 0.0784, "num_input_tokens_seen": 32590128, "step": 15100 }, { "epoch": 2.464110929853181, "grad_norm": 0.1820586919784546, "learning_rate": 4.9918084578320854e-05, "loss": 0.0947, "num_input_tokens_seen": 32600656, "step": 15105 }, { "epoch": 2.464926590538336, "grad_norm": 0.2477773278951645, "learning_rate": 4.991779645431124e-05, "loss": 0.1107, "num_input_tokens_seen": 32610928, "step": 15110 }, { "epoch": 2.465742251223491, "grad_norm": 0.23040664196014404, "learning_rate": 4.991750782531087e-05, "loss": 0.0898, "num_input_tokens_seen": 32621488, "step": 15115 }, { "epoch": 2.466557911908646, "grad_norm": 0.5265166163444519, "learning_rate": 4.9917218691325604e-05, "loss": 0.1701, "num_input_tokens_seen": 32633712, "step": 15120 }, { "epoch": 2.467373572593801, "grad_norm": 0.08390630036592484, "learning_rate": 4.991692905236128e-05, "loss": 0.1995, "num_input_tokens_seen": 32644560, "step": 15125 }, { "epoch": 2.468189233278956, "grad_norm": 0.08192218840122223, "learning_rate": 4.991663890842379e-05, "loss": 0.0795, "num_input_tokens_seen": 32655888, "step": 15130 }, { "epoch": 2.4690048939641107, "grad_norm": 0.593856155872345, "learning_rate": 4.991634825951899e-05, "loss": 0.1163, "num_input_tokens_seen": 32666832, "step": 15135 }, { "epoch": 2.469820554649266, "grad_norm": 0.8821349143981934, "learning_rate": 4.991605710565279e-05, "loss": 0.0759, "num_input_tokens_seen": 32677616, "step": 15140 }, { "epoch": 2.470636215334421, "grad_norm": 1.5725873708724976, "learning_rate": 4.991576544683109e-05, "loss": 0.1151, "num_input_tokens_seen": 32687696, "step": 15145 }, { "epoch": 2.471451876019576, "grad_norm": 2.1366019248962402, "learning_rate": 4.991547328305979e-05, "loss": 0.2246, "num_input_tokens_seen": 32698096, "step": 15150 }, { "epoch": 2.472267536704731, "grad_norm": 0.19428211450576782, "learning_rate": 4.9915180614344816e-05, "loss": 0.0993, "num_input_tokens_seen": 32707920, "step": 15155 }, { "epoch": 2.4730831973898857, "grad_norm": 1.2120898962020874, "learning_rate": 4.99148874406921e-05, "loss": 0.2065, "num_input_tokens_seen": 32719472, "step": 15160 }, { "epoch": 2.473898858075041, "grad_norm": 2.09729266166687, "learning_rate": 4.9914593762107587e-05, "loss": 0.1474, "num_input_tokens_seen": 32730064, "step": 15165 }, { "epoch": 2.4747145187601958, "grad_norm": 0.19129645824432373, "learning_rate": 4.991429957859722e-05, "loss": 0.1743, "num_input_tokens_seen": 32740944, "step": 15170 }, { "epoch": 2.4755301794453506, "grad_norm": 0.15176153182983398, "learning_rate": 4.991400489016697e-05, "loss": 0.1157, "num_input_tokens_seen": 32751856, "step": 15175 }, { "epoch": 2.476345840130506, "grad_norm": 0.4780547320842743, "learning_rate": 4.99137096968228e-05, "loss": 0.1609, "num_input_tokens_seen": 32762928, "step": 15180 }, { "epoch": 2.4771615008156607, "grad_norm": 0.8185482025146484, "learning_rate": 4.9913413998570704e-05, "loss": 0.065, "num_input_tokens_seen": 32773840, "step": 15185 }, { "epoch": 2.4779771615008155, "grad_norm": 0.4092806875705719, "learning_rate": 4.9913117795416665e-05, "loss": 0.2165, "num_input_tokens_seen": 32786064, "step": 15190 }, { "epoch": 2.4787928221859707, "grad_norm": 1.7942112684249878, "learning_rate": 4.991282108736669e-05, "loss": 0.1736, "num_input_tokens_seen": 32796816, "step": 15195 }, { "epoch": 2.4796084828711256, "grad_norm": 0.7099953293800354, "learning_rate": 4.991252387442679e-05, "loss": 0.2466, "num_input_tokens_seen": 32807088, "step": 15200 }, { "epoch": 2.480424143556281, "grad_norm": 2.9929463863372803, "learning_rate": 4.991222615660299e-05, "loss": 0.2745, "num_input_tokens_seen": 32817424, "step": 15205 }, { "epoch": 2.4812398042414356, "grad_norm": 1.1377508640289307, "learning_rate": 4.9911927933901325e-05, "loss": 0.1625, "num_input_tokens_seen": 32827984, "step": 15210 }, { "epoch": 2.4820554649265905, "grad_norm": 0.7876735925674438, "learning_rate": 4.9911629206327845e-05, "loss": 0.16, "num_input_tokens_seen": 32839664, "step": 15215 }, { "epoch": 2.4828711256117453, "grad_norm": 1.4821795225143433, "learning_rate": 4.991132997388859e-05, "loss": 0.2425, "num_input_tokens_seen": 32851280, "step": 15220 }, { "epoch": 2.4836867862969005, "grad_norm": 0.12085286527872086, "learning_rate": 4.991103023658963e-05, "loss": 0.1417, "num_input_tokens_seen": 32862224, "step": 15225 }, { "epoch": 2.4845024469820554, "grad_norm": 0.08465952426195145, "learning_rate": 4.991072999443703e-05, "loss": 0.0333, "num_input_tokens_seen": 32872816, "step": 15230 }, { "epoch": 2.4853181076672106, "grad_norm": 0.9628843069076538, "learning_rate": 4.99104292474369e-05, "loss": 0.2102, "num_input_tokens_seen": 32882896, "step": 15235 }, { "epoch": 2.4861337683523654, "grad_norm": 0.650714099407196, "learning_rate": 4.991012799559531e-05, "loss": 0.101, "num_input_tokens_seen": 32894256, "step": 15240 }, { "epoch": 2.4869494290375203, "grad_norm": 0.05282462388277054, "learning_rate": 4.990982623891839e-05, "loss": 0.1014, "num_input_tokens_seen": 32905392, "step": 15245 }, { "epoch": 2.4877650897226755, "grad_norm": 0.14921969175338745, "learning_rate": 4.990952397741223e-05, "loss": 0.1556, "num_input_tokens_seen": 32916592, "step": 15250 }, { "epoch": 2.4885807504078303, "grad_norm": 0.16534392535686493, "learning_rate": 4.9909221211082966e-05, "loss": 0.0389, "num_input_tokens_seen": 32927216, "step": 15255 }, { "epoch": 2.489396411092985, "grad_norm": 0.6120795011520386, "learning_rate": 4.990891793993674e-05, "loss": 0.0458, "num_input_tokens_seen": 32937648, "step": 15260 }, { "epoch": 2.4902120717781404, "grad_norm": 0.16161774098873138, "learning_rate": 4.990861416397968e-05, "loss": 0.1934, "num_input_tokens_seen": 32948496, "step": 15265 }, { "epoch": 2.4910277324632952, "grad_norm": 0.7223570346832275, "learning_rate": 4.9908309883217974e-05, "loss": 0.1595, "num_input_tokens_seen": 32958832, "step": 15270 }, { "epoch": 2.49184339314845, "grad_norm": 1.1402853727340698, "learning_rate": 4.990800509765776e-05, "loss": 0.2028, "num_input_tokens_seen": 32969456, "step": 15275 }, { "epoch": 2.4926590538336053, "grad_norm": 0.04755263030529022, "learning_rate": 4.9907699807305226e-05, "loss": 0.1328, "num_input_tokens_seen": 32980496, "step": 15280 }, { "epoch": 2.49347471451876, "grad_norm": 0.5585101842880249, "learning_rate": 4.990739401216655e-05, "loss": 0.1706, "num_input_tokens_seen": 32991888, "step": 15285 }, { "epoch": 2.4942903752039154, "grad_norm": 1.5095542669296265, "learning_rate": 4.990708771224795e-05, "loss": 0.1031, "num_input_tokens_seen": 33002672, "step": 15290 }, { "epoch": 2.49510603588907, "grad_norm": 1.718408465385437, "learning_rate": 4.990678090755562e-05, "loss": 0.1283, "num_input_tokens_seen": 33012720, "step": 15295 }, { "epoch": 2.495921696574225, "grad_norm": 0.3266724944114685, "learning_rate": 4.9906473598095765e-05, "loss": 0.1562, "num_input_tokens_seen": 33023760, "step": 15300 }, { "epoch": 2.4967373572593803, "grad_norm": 0.8692115545272827, "learning_rate": 4.990616578387464e-05, "loss": 0.1626, "num_input_tokens_seen": 33034192, "step": 15305 }, { "epoch": 2.497553017944535, "grad_norm": 2.418647527694702, "learning_rate": 4.990585746489846e-05, "loss": 0.2423, "num_input_tokens_seen": 33044080, "step": 15310 }, { "epoch": 2.49836867862969, "grad_norm": 0.6202027797698975, "learning_rate": 4.990554864117349e-05, "loss": 0.0841, "num_input_tokens_seen": 33055184, "step": 15315 }, { "epoch": 2.499184339314845, "grad_norm": 1.386308193206787, "learning_rate": 4.9905239312705975e-05, "loss": 0.1327, "num_input_tokens_seen": 33065712, "step": 15320 }, { "epoch": 2.5, "grad_norm": 0.822672426700592, "learning_rate": 4.99049294795022e-05, "loss": 0.0989, "num_input_tokens_seen": 33075856, "step": 15325 }, { "epoch": 2.500815660685155, "grad_norm": 0.09194335341453552, "learning_rate": 4.990461914156843e-05, "loss": 0.0338, "num_input_tokens_seen": 33087312, "step": 15330 }, { "epoch": 2.50163132137031, "grad_norm": 0.07523810118436813, "learning_rate": 4.990430829891096e-05, "loss": 0.1515, "num_input_tokens_seen": 33098960, "step": 15335 }, { "epoch": 2.502446982055465, "grad_norm": 0.23098105192184448, "learning_rate": 4.990399695153608e-05, "loss": 0.1628, "num_input_tokens_seen": 33109712, "step": 15340 }, { "epoch": 2.50326264274062, "grad_norm": 1.4671471118927002, "learning_rate": 4.990368509945012e-05, "loss": 0.1251, "num_input_tokens_seen": 33121008, "step": 15345 }, { "epoch": 2.504078303425775, "grad_norm": 0.2797757089138031, "learning_rate": 4.990337274265939e-05, "loss": 0.1548, "num_input_tokens_seen": 33133360, "step": 15350 }, { "epoch": 2.50489396411093, "grad_norm": 0.38817867636680603, "learning_rate": 4.990305988117021e-05, "loss": 0.0758, "num_input_tokens_seen": 33144688, "step": 15355 }, { "epoch": 2.5057096247960846, "grad_norm": 0.08957715332508087, "learning_rate": 4.990274651498894e-05, "loss": 0.1333, "num_input_tokens_seen": 33155024, "step": 15360 }, { "epoch": 2.50652528548124, "grad_norm": 0.11901596933603287, "learning_rate": 4.990243264412191e-05, "loss": 0.3417, "num_input_tokens_seen": 33166576, "step": 15365 }, { "epoch": 2.5073409461663947, "grad_norm": 0.5820465683937073, "learning_rate": 4.99021182685755e-05, "loss": 0.1089, "num_input_tokens_seen": 33176720, "step": 15370 }, { "epoch": 2.50815660685155, "grad_norm": 1.8964362144470215, "learning_rate": 4.9901803388356074e-05, "loss": 0.1308, "num_input_tokens_seen": 33187824, "step": 15375 }, { "epoch": 2.5089722675367048, "grad_norm": 0.4293143153190613, "learning_rate": 4.9901488003470006e-05, "loss": 0.1482, "num_input_tokens_seen": 33198800, "step": 15380 }, { "epoch": 2.5097879282218596, "grad_norm": 1.8166656494140625, "learning_rate": 4.990117211392369e-05, "loss": 0.1419, "num_input_tokens_seen": 33209232, "step": 15385 }, { "epoch": 2.5106035889070144, "grad_norm": 1.4177395105361938, "learning_rate": 4.9900855719723535e-05, "loss": 0.3231, "num_input_tokens_seen": 33219760, "step": 15390 }, { "epoch": 2.5114192495921697, "grad_norm": 0.20844493806362152, "learning_rate": 4.990053882087595e-05, "loss": 0.1042, "num_input_tokens_seen": 33229040, "step": 15395 }, { "epoch": 2.5122349102773245, "grad_norm": 0.24310952425003052, "learning_rate": 4.990022141738737e-05, "loss": 0.1048, "num_input_tokens_seen": 33238960, "step": 15400 }, { "epoch": 2.5130505709624797, "grad_norm": 1.6914911270141602, "learning_rate": 4.98999035092642e-05, "loss": 0.2401, "num_input_tokens_seen": 33247920, "step": 15405 }, { "epoch": 2.5138662316476346, "grad_norm": 0.2265552580356598, "learning_rate": 4.9899585096512904e-05, "loss": 0.1361, "num_input_tokens_seen": 33259504, "step": 15410 }, { "epoch": 2.5146818923327894, "grad_norm": 0.06897866725921631, "learning_rate": 4.9899266179139925e-05, "loss": 0.0825, "num_input_tokens_seen": 33269872, "step": 15415 }, { "epoch": 2.5154975530179446, "grad_norm": 0.4337652623653412, "learning_rate": 4.989894675715173e-05, "loss": 0.0484, "num_input_tokens_seen": 33282192, "step": 15420 }, { "epoch": 2.5163132137030995, "grad_norm": 0.8069992065429688, "learning_rate": 4.98986268305548e-05, "loss": 0.1098, "num_input_tokens_seen": 33293648, "step": 15425 }, { "epoch": 2.5171288743882547, "grad_norm": 0.7308389544487, "learning_rate": 4.98983063993556e-05, "loss": 0.2784, "num_input_tokens_seen": 33304752, "step": 15430 }, { "epoch": 2.5179445350734095, "grad_norm": 1.2270172834396362, "learning_rate": 4.9897985463560635e-05, "loss": 0.1259, "num_input_tokens_seen": 33315216, "step": 15435 }, { "epoch": 2.5187601957585644, "grad_norm": 2.858534812927246, "learning_rate": 4.989766402317642e-05, "loss": 0.24, "num_input_tokens_seen": 33325392, "step": 15440 }, { "epoch": 2.519575856443719, "grad_norm": 0.26107943058013916, "learning_rate": 4.9897342078209455e-05, "loss": 0.1213, "num_input_tokens_seen": 33336656, "step": 15445 }, { "epoch": 2.5203915171288744, "grad_norm": 0.3901550769805908, "learning_rate": 4.9897019628666267e-05, "loss": 0.1089, "num_input_tokens_seen": 33347856, "step": 15450 }, { "epoch": 2.5212071778140293, "grad_norm": 0.5616678595542908, "learning_rate": 4.989669667455339e-05, "loss": 0.1788, "num_input_tokens_seen": 33358640, "step": 15455 }, { "epoch": 2.5220228384991845, "grad_norm": 0.5119327306747437, "learning_rate": 4.989637321587737e-05, "loss": 0.1637, "num_input_tokens_seen": 33369200, "step": 15460 }, { "epoch": 2.5228384991843393, "grad_norm": 0.2781391143798828, "learning_rate": 4.9896049252644767e-05, "loss": 0.1099, "num_input_tokens_seen": 33378320, "step": 15465 }, { "epoch": 2.523654159869494, "grad_norm": 0.10357356071472168, "learning_rate": 4.989572478486214e-05, "loss": 0.1336, "num_input_tokens_seen": 33388976, "step": 15470 }, { "epoch": 2.5244698205546494, "grad_norm": 0.3443771004676819, "learning_rate": 4.989539981253607e-05, "loss": 0.1684, "num_input_tokens_seen": 33399664, "step": 15475 }, { "epoch": 2.5252854812398042, "grad_norm": 1.1194802522659302, "learning_rate": 4.9895074335673145e-05, "loss": 0.1779, "num_input_tokens_seen": 33409616, "step": 15480 }, { "epoch": 2.5261011419249595, "grad_norm": 0.22122123837471008, "learning_rate": 4.989474835427995e-05, "loss": 0.061, "num_input_tokens_seen": 33420624, "step": 15485 }, { "epoch": 2.5269168026101143, "grad_norm": 1.6979984045028687, "learning_rate": 4.989442186836309e-05, "loss": 0.1427, "num_input_tokens_seen": 33430960, "step": 15490 }, { "epoch": 2.527732463295269, "grad_norm": 0.5044921040534973, "learning_rate": 4.989409487792921e-05, "loss": 0.0611, "num_input_tokens_seen": 33443344, "step": 15495 }, { "epoch": 2.528548123980424, "grad_norm": 0.4308808147907257, "learning_rate": 4.9893767382984907e-05, "loss": 0.1113, "num_input_tokens_seen": 33454032, "step": 15500 }, { "epoch": 2.529363784665579, "grad_norm": 1.1928311586380005, "learning_rate": 4.9893439383536836e-05, "loss": 0.1298, "num_input_tokens_seen": 33465360, "step": 15505 }, { "epoch": 2.530179445350734, "grad_norm": 0.23130670189857483, "learning_rate": 4.989311087959162e-05, "loss": 0.0549, "num_input_tokens_seen": 33476336, "step": 15510 }, { "epoch": 2.5309951060358893, "grad_norm": 0.053655222058296204, "learning_rate": 4.989278187115595e-05, "loss": 0.0308, "num_input_tokens_seen": 33486736, "step": 15515 }, { "epoch": 2.531810766721044, "grad_norm": 0.7511515617370605, "learning_rate": 4.9892452358236463e-05, "loss": 0.1054, "num_input_tokens_seen": 33497680, "step": 15520 }, { "epoch": 2.532626427406199, "grad_norm": 1.5935568809509277, "learning_rate": 4.9892122340839854e-05, "loss": 0.1781, "num_input_tokens_seen": 33509456, "step": 15525 }, { "epoch": 2.5334420880913537, "grad_norm": 1.9220540523529053, "learning_rate": 4.989179181897281e-05, "loss": 0.1491, "num_input_tokens_seen": 33521040, "step": 15530 }, { "epoch": 2.534257748776509, "grad_norm": 1.1844807863235474, "learning_rate": 4.989146079264203e-05, "loss": 0.1346, "num_input_tokens_seen": 33531472, "step": 15535 }, { "epoch": 2.535073409461664, "grad_norm": 0.39543119072914124, "learning_rate": 4.989112926185422e-05, "loss": 0.1153, "num_input_tokens_seen": 33541552, "step": 15540 }, { "epoch": 2.535889070146819, "grad_norm": 0.3062617778778076, "learning_rate": 4.9890797226616095e-05, "loss": 0.0599, "num_input_tokens_seen": 33551920, "step": 15545 }, { "epoch": 2.536704730831974, "grad_norm": 0.4861431121826172, "learning_rate": 4.989046468693439e-05, "loss": 0.3449, "num_input_tokens_seen": 33562960, "step": 15550 }, { "epoch": 2.5375203915171287, "grad_norm": 0.10849545150995255, "learning_rate": 4.989013164281584e-05, "loss": 0.1308, "num_input_tokens_seen": 33572848, "step": 15555 }, { "epoch": 2.538336052202284, "grad_norm": 1.1213555335998535, "learning_rate": 4.988979809426719e-05, "loss": 0.2356, "num_input_tokens_seen": 33582224, "step": 15560 }, { "epoch": 2.539151712887439, "grad_norm": 0.5159638524055481, "learning_rate": 4.988946404129522e-05, "loss": 0.0449, "num_input_tokens_seen": 33593680, "step": 15565 }, { "epoch": 2.539967373572594, "grad_norm": 0.5676611661911011, "learning_rate": 4.988912948390668e-05, "loss": 0.1834, "num_input_tokens_seen": 33605616, "step": 15570 }, { "epoch": 2.540783034257749, "grad_norm": 0.2636284828186035, "learning_rate": 4.9888794422108365e-05, "loss": 0.1428, "num_input_tokens_seen": 33616752, "step": 15575 }, { "epoch": 2.5415986949429037, "grad_norm": 0.11787482351064682, "learning_rate": 4.988845885590705e-05, "loss": 0.1448, "num_input_tokens_seen": 33628048, "step": 15580 }, { "epoch": 2.5424143556280585, "grad_norm": 0.6330002546310425, "learning_rate": 4.988812278530954e-05, "loss": 0.0372, "num_input_tokens_seen": 33639280, "step": 15585 }, { "epoch": 2.5432300163132138, "grad_norm": 0.7740138173103333, "learning_rate": 4.9887786210322654e-05, "loss": 0.0797, "num_input_tokens_seen": 33649808, "step": 15590 }, { "epoch": 2.5440456769983686, "grad_norm": 0.12024284899234772, "learning_rate": 4.988744913095321e-05, "loss": 0.2212, "num_input_tokens_seen": 33658992, "step": 15595 }, { "epoch": 2.544861337683524, "grad_norm": 0.184601292014122, "learning_rate": 4.988711154720803e-05, "loss": 0.0532, "num_input_tokens_seen": 33669904, "step": 15600 }, { "epoch": 2.5456769983686787, "grad_norm": 0.062159378081560135, "learning_rate": 4.988677345909397e-05, "loss": 0.151, "num_input_tokens_seen": 33680080, "step": 15605 }, { "epoch": 2.5464926590538335, "grad_norm": 0.12427522242069244, "learning_rate": 4.988643486661787e-05, "loss": 0.1262, "num_input_tokens_seen": 33692208, "step": 15610 }, { "epoch": 2.5473083197389887, "grad_norm": 0.38367465138435364, "learning_rate": 4.98860957697866e-05, "loss": 0.2163, "num_input_tokens_seen": 33703088, "step": 15615 }, { "epoch": 2.5481239804241436, "grad_norm": 0.05817263573408127, "learning_rate": 4.9885756168607027e-05, "loss": 0.1959, "num_input_tokens_seen": 33712944, "step": 15620 }, { "epoch": 2.5489396411092984, "grad_norm": 0.08336222916841507, "learning_rate": 4.988541606308604e-05, "loss": 0.1195, "num_input_tokens_seen": 33723408, "step": 15625 }, { "epoch": 2.5497553017944536, "grad_norm": 0.7961939573287964, "learning_rate": 4.9885075453230524e-05, "loss": 0.123, "num_input_tokens_seen": 33733488, "step": 15630 }, { "epoch": 2.5505709624796085, "grad_norm": 0.06706136465072632, "learning_rate": 4.988473433904738e-05, "loss": 0.1141, "num_input_tokens_seen": 33742640, "step": 15635 }, { "epoch": 2.5513866231647633, "grad_norm": 0.2814996540546417, "learning_rate": 4.988439272054353e-05, "loss": 0.2715, "num_input_tokens_seen": 33753584, "step": 15640 }, { "epoch": 2.5522022838499185, "grad_norm": 0.27200379967689514, "learning_rate": 4.98840505977259e-05, "loss": 0.0265, "num_input_tokens_seen": 33764656, "step": 15645 }, { "epoch": 2.5530179445350734, "grad_norm": 0.14781855046749115, "learning_rate": 4.9883707970601404e-05, "loss": 0.1125, "num_input_tokens_seen": 33775632, "step": 15650 }, { "epoch": 2.5538336052202286, "grad_norm": 0.04530163109302521, "learning_rate": 4.988336483917701e-05, "loss": 0.1043, "num_input_tokens_seen": 33787536, "step": 15655 }, { "epoch": 2.5546492659053834, "grad_norm": 1.3326996564865112, "learning_rate": 4.988302120345966e-05, "loss": 0.1617, "num_input_tokens_seen": 33798736, "step": 15660 }, { "epoch": 2.5554649265905383, "grad_norm": 0.38154175877571106, "learning_rate": 4.9882677063456306e-05, "loss": 0.0543, "num_input_tokens_seen": 33810256, "step": 15665 }, { "epoch": 2.556280587275693, "grad_norm": 0.4747551679611206, "learning_rate": 4.988233241917395e-05, "loss": 0.0707, "num_input_tokens_seen": 33820272, "step": 15670 }, { "epoch": 2.5570962479608483, "grad_norm": 0.48297810554504395, "learning_rate": 4.988198727061956e-05, "loss": 0.1869, "num_input_tokens_seen": 33830448, "step": 15675 }, { "epoch": 2.557911908646003, "grad_norm": 0.5839475989341736, "learning_rate": 4.988164161780012e-05, "loss": 0.3608, "num_input_tokens_seen": 33841808, "step": 15680 }, { "epoch": 2.5587275693311584, "grad_norm": 0.16262446343898773, "learning_rate": 4.9881295460722666e-05, "loss": 0.1212, "num_input_tokens_seen": 33851088, "step": 15685 }, { "epoch": 2.5595432300163132, "grad_norm": 0.517580509185791, "learning_rate": 4.9880948799394185e-05, "loss": 0.0535, "num_input_tokens_seen": 33863600, "step": 15690 }, { "epoch": 2.560358890701468, "grad_norm": 0.31779271364212036, "learning_rate": 4.988060163382171e-05, "loss": 0.1219, "num_input_tokens_seen": 33873712, "step": 15695 }, { "epoch": 2.5611745513866233, "grad_norm": 0.6039626598358154, "learning_rate": 4.988025396401229e-05, "loss": 0.1518, "num_input_tokens_seen": 33884496, "step": 15700 }, { "epoch": 2.561990212071778, "grad_norm": 2.0012621879577637, "learning_rate": 4.9879905789972956e-05, "loss": 0.2391, "num_input_tokens_seen": 33896176, "step": 15705 }, { "epoch": 2.5628058727569334, "grad_norm": 0.1984107941389084, "learning_rate": 4.987955711171076e-05, "loss": 0.0954, "num_input_tokens_seen": 33907760, "step": 15710 }, { "epoch": 2.563621533442088, "grad_norm": 0.25474628806114197, "learning_rate": 4.987920792923279e-05, "loss": 0.1757, "num_input_tokens_seen": 33918864, "step": 15715 }, { "epoch": 2.564437194127243, "grad_norm": 0.8070859313011169, "learning_rate": 4.98788582425461e-05, "loss": 0.1003, "num_input_tokens_seen": 33929072, "step": 15720 }, { "epoch": 2.565252854812398, "grad_norm": 0.12784242630004883, "learning_rate": 4.98785080516578e-05, "loss": 0.1049, "num_input_tokens_seen": 33939440, "step": 15725 }, { "epoch": 2.566068515497553, "grad_norm": 0.7344244122505188, "learning_rate": 4.987815735657496e-05, "loss": 0.0719, "num_input_tokens_seen": 33949168, "step": 15730 }, { "epoch": 2.566884176182708, "grad_norm": 0.09769424051046371, "learning_rate": 4.9877806157304706e-05, "loss": 0.0362, "num_input_tokens_seen": 33958800, "step": 15735 }, { "epoch": 2.567699836867863, "grad_norm": 0.22650246322155, "learning_rate": 4.987745445385416e-05, "loss": 0.1298, "num_input_tokens_seen": 33968144, "step": 15740 }, { "epoch": 2.568515497553018, "grad_norm": 0.5810356736183167, "learning_rate": 4.987710224623042e-05, "loss": 0.125, "num_input_tokens_seen": 33979344, "step": 15745 }, { "epoch": 2.569331158238173, "grad_norm": 1.0410268306732178, "learning_rate": 4.987674953444066e-05, "loss": 0.1776, "num_input_tokens_seen": 33990160, "step": 15750 }, { "epoch": 2.5701468189233276, "grad_norm": 1.6288090944290161, "learning_rate": 4.9876396318492e-05, "loss": 0.1068, "num_input_tokens_seen": 34000336, "step": 15755 }, { "epoch": 2.570962479608483, "grad_norm": 0.5057858228683472, "learning_rate": 4.9876042598391626e-05, "loss": 0.2503, "num_input_tokens_seen": 34011696, "step": 15760 }, { "epoch": 2.5717781402936377, "grad_norm": 0.7773817181587219, "learning_rate": 4.987568837414668e-05, "loss": 0.144, "num_input_tokens_seen": 34022256, "step": 15765 }, { "epoch": 2.572593800978793, "grad_norm": 0.9984732270240784, "learning_rate": 4.9875333645764357e-05, "loss": 0.2425, "num_input_tokens_seen": 34032848, "step": 15770 }, { "epoch": 2.573409461663948, "grad_norm": 0.7820417881011963, "learning_rate": 4.9874978413251836e-05, "loss": 0.1844, "num_input_tokens_seen": 34043888, "step": 15775 }, { "epoch": 2.5742251223491026, "grad_norm": 1.5711030960083008, "learning_rate": 4.987462267661632e-05, "loss": 0.3178, "num_input_tokens_seen": 34055760, "step": 15780 }, { "epoch": 2.575040783034258, "grad_norm": 0.7699312567710876, "learning_rate": 4.987426643586503e-05, "loss": 0.1244, "num_input_tokens_seen": 34067536, "step": 15785 }, { "epoch": 2.5758564437194127, "grad_norm": 0.3233388364315033, "learning_rate": 4.987390969100517e-05, "loss": 0.0519, "num_input_tokens_seen": 34078896, "step": 15790 }, { "epoch": 2.576672104404568, "grad_norm": 0.07663831859827042, "learning_rate": 4.9873552442043976e-05, "loss": 0.0153, "num_input_tokens_seen": 34089648, "step": 15795 }, { "epoch": 2.5774877650897228, "grad_norm": 0.18981029093265533, "learning_rate": 4.987319468898868e-05, "loss": 0.0872, "num_input_tokens_seen": 34101392, "step": 15800 }, { "epoch": 2.5783034257748776, "grad_norm": 0.5728244185447693, "learning_rate": 4.987283643184655e-05, "loss": 0.0977, "num_input_tokens_seen": 34111184, "step": 15805 }, { "epoch": 2.5791190864600324, "grad_norm": 0.3004511892795563, "learning_rate": 4.9872477670624835e-05, "loss": 0.2102, "num_input_tokens_seen": 34121872, "step": 15810 }, { "epoch": 2.5799347471451877, "grad_norm": 0.043973855674266815, "learning_rate": 4.98721184053308e-05, "loss": 0.0731, "num_input_tokens_seen": 34132912, "step": 15815 }, { "epoch": 2.5807504078303425, "grad_norm": 0.1039838045835495, "learning_rate": 4.987175863597174e-05, "loss": 0.0937, "num_input_tokens_seen": 34143632, "step": 15820 }, { "epoch": 2.5815660685154977, "grad_norm": 0.15126191079616547, "learning_rate": 4.9871398362554936e-05, "loss": 0.0335, "num_input_tokens_seen": 34154928, "step": 15825 }, { "epoch": 2.5823817292006526, "grad_norm": 0.42301326990127563, "learning_rate": 4.9871037585087695e-05, "loss": 0.1328, "num_input_tokens_seen": 34165168, "step": 15830 }, { "epoch": 2.5831973898858074, "grad_norm": 0.9256930351257324, "learning_rate": 4.9870676303577324e-05, "loss": 0.1656, "num_input_tokens_seen": 34176304, "step": 15835 }, { "epoch": 2.5840130505709626, "grad_norm": 0.24091875553131104, "learning_rate": 4.987031451803115e-05, "loss": 0.065, "num_input_tokens_seen": 34186928, "step": 15840 }, { "epoch": 2.5848287112561175, "grad_norm": 1.3291571140289307, "learning_rate": 4.986995222845649e-05, "loss": 0.1305, "num_input_tokens_seen": 34197712, "step": 15845 }, { "epoch": 2.5856443719412723, "grad_norm": 0.45104464888572693, "learning_rate": 4.9869589434860716e-05, "loss": 0.0629, "num_input_tokens_seen": 34208144, "step": 15850 }, { "epoch": 2.5864600326264275, "grad_norm": 0.753840446472168, "learning_rate": 4.986922613725116e-05, "loss": 0.2034, "num_input_tokens_seen": 34219504, "step": 15855 }, { "epoch": 2.5872756933115824, "grad_norm": 0.05904717370867729, "learning_rate": 4.9868862335635175e-05, "loss": 0.2227, "num_input_tokens_seen": 34230032, "step": 15860 }, { "epoch": 2.588091353996737, "grad_norm": 0.7280972003936768, "learning_rate": 4.986849803002015e-05, "loss": 0.0972, "num_input_tokens_seen": 34240400, "step": 15865 }, { "epoch": 2.5889070146818924, "grad_norm": 0.46716341376304626, "learning_rate": 4.986813322041347e-05, "loss": 0.115, "num_input_tokens_seen": 34252048, "step": 15870 }, { "epoch": 2.5897226753670473, "grad_norm": 0.6653571724891663, "learning_rate": 4.986776790682252e-05, "loss": 0.0913, "num_input_tokens_seen": 34262832, "step": 15875 }, { "epoch": 2.5905383360522025, "grad_norm": 0.5522048473358154, "learning_rate": 4.986740208925471e-05, "loss": 0.0307, "num_input_tokens_seen": 34274512, "step": 15880 }, { "epoch": 2.5913539967373573, "grad_norm": 1.714464545249939, "learning_rate": 4.9867035767717444e-05, "loss": 0.148, "num_input_tokens_seen": 34284240, "step": 15885 }, { "epoch": 2.592169657422512, "grad_norm": 0.0857260674238205, "learning_rate": 4.986666894221816e-05, "loss": 0.0916, "num_input_tokens_seen": 34293744, "step": 15890 }, { "epoch": 2.592985318107667, "grad_norm": 0.3992098271846771, "learning_rate": 4.986630161276428e-05, "loss": 0.0777, "num_input_tokens_seen": 34304144, "step": 15895 }, { "epoch": 2.5938009787928222, "grad_norm": 0.009092725813388824, "learning_rate": 4.986593377936325e-05, "loss": 0.195, "num_input_tokens_seen": 34314992, "step": 15900 }, { "epoch": 2.594616639477977, "grad_norm": 0.9705672860145569, "learning_rate": 4.9865565442022534e-05, "loss": 0.1973, "num_input_tokens_seen": 34326768, "step": 15905 }, { "epoch": 2.5954323001631323, "grad_norm": 0.5640473961830139, "learning_rate": 4.986519660074958e-05, "loss": 0.1365, "num_input_tokens_seen": 34336080, "step": 15910 }, { "epoch": 2.596247960848287, "grad_norm": 0.2672508656978607, "learning_rate": 4.9864827255551884e-05, "loss": 0.109, "num_input_tokens_seen": 34347568, "step": 15915 }, { "epoch": 2.597063621533442, "grad_norm": 2.240293502807617, "learning_rate": 4.9864457406436914e-05, "loss": 0.2544, "num_input_tokens_seen": 34358864, "step": 15920 }, { "epoch": 2.597879282218597, "grad_norm": 0.5768799185752869, "learning_rate": 4.986408705341217e-05, "loss": 0.0999, "num_input_tokens_seen": 34371024, "step": 15925 }, { "epoch": 2.598694942903752, "grad_norm": 0.15237300097942352, "learning_rate": 4.986371619648517e-05, "loss": 0.2185, "num_input_tokens_seen": 34381520, "step": 15930 }, { "epoch": 2.5995106035889073, "grad_norm": 0.6514886617660522, "learning_rate": 4.986334483566341e-05, "loss": 0.0235, "num_input_tokens_seen": 34392880, "step": 15935 }, { "epoch": 2.600326264274062, "grad_norm": 1.6164084672927856, "learning_rate": 4.986297297095443e-05, "loss": 0.1898, "num_input_tokens_seen": 34404816, "step": 15940 }, { "epoch": 2.601141924959217, "grad_norm": 0.799809455871582, "learning_rate": 4.986260060236575e-05, "loss": 0.3387, "num_input_tokens_seen": 34414896, "step": 15945 }, { "epoch": 2.6019575856443717, "grad_norm": 1.9573330879211426, "learning_rate": 4.9862227729904946e-05, "loss": 0.1587, "num_input_tokens_seen": 34426480, "step": 15950 }, { "epoch": 2.602773246329527, "grad_norm": 1.015711784362793, "learning_rate": 4.9861854353579544e-05, "loss": 0.0799, "num_input_tokens_seen": 34437328, "step": 15955 }, { "epoch": 2.603588907014682, "grad_norm": 0.4847928583621979, "learning_rate": 4.986148047339713e-05, "loss": 0.1377, "num_input_tokens_seen": 34449584, "step": 15960 }, { "epoch": 2.604404567699837, "grad_norm": 0.47503694891929626, "learning_rate": 4.9861106089365275e-05, "loss": 0.1338, "num_input_tokens_seen": 34460592, "step": 15965 }, { "epoch": 2.605220228384992, "grad_norm": 0.7350018620491028, "learning_rate": 4.986073120149156e-05, "loss": 0.0741, "num_input_tokens_seen": 34472944, "step": 15970 }, { "epoch": 2.6060358890701467, "grad_norm": 0.0664776936173439, "learning_rate": 4.98603558097836e-05, "loss": 0.0217, "num_input_tokens_seen": 34483536, "step": 15975 }, { "epoch": 2.6068515497553015, "grad_norm": 0.18931464850902557, "learning_rate": 4.9859979914248985e-05, "loss": 0.1654, "num_input_tokens_seen": 34495248, "step": 15980 }, { "epoch": 2.607667210440457, "grad_norm": 1.2733229398727417, "learning_rate": 4.985960351489534e-05, "loss": 0.1174, "num_input_tokens_seen": 34506832, "step": 15985 }, { "epoch": 2.6084828711256116, "grad_norm": 0.07745286822319031, "learning_rate": 4.9859226611730294e-05, "loss": 0.0877, "num_input_tokens_seen": 34517168, "step": 15990 }, { "epoch": 2.609298531810767, "grad_norm": 0.28520044684410095, "learning_rate": 4.985884920476148e-05, "loss": 0.0882, "num_input_tokens_seen": 34528272, "step": 15995 }, { "epoch": 2.6101141924959217, "grad_norm": 0.13950882852077484, "learning_rate": 4.985847129399656e-05, "loss": 0.02, "num_input_tokens_seen": 34537904, "step": 16000 }, { "epoch": 2.6109298531810765, "grad_norm": 0.41185498237609863, "learning_rate": 4.985809287944319e-05, "loss": 0.1941, "num_input_tokens_seen": 34548368, "step": 16005 }, { "epoch": 2.6117455138662318, "grad_norm": 0.3972325026988983, "learning_rate": 4.985771396110902e-05, "loss": 0.1313, "num_input_tokens_seen": 34559024, "step": 16010 }, { "epoch": 2.6125611745513866, "grad_norm": 0.5196385979652405, "learning_rate": 4.9857334539001746e-05, "loss": 0.1595, "num_input_tokens_seen": 34570224, "step": 16015 }, { "epoch": 2.613376835236542, "grad_norm": 2.035104990005493, "learning_rate": 4.985695461312905e-05, "loss": 0.063, "num_input_tokens_seen": 34581968, "step": 16020 }, { "epoch": 2.6141924959216967, "grad_norm": 0.23202696442604065, "learning_rate": 4.985657418349864e-05, "loss": 0.1426, "num_input_tokens_seen": 34593392, "step": 16025 }, { "epoch": 2.6150081566068515, "grad_norm": 0.7216973900794983, "learning_rate": 4.9856193250118224e-05, "loss": 0.0448, "num_input_tokens_seen": 34604176, "step": 16030 }, { "epoch": 2.6158238172920063, "grad_norm": 0.6705654859542847, "learning_rate": 4.985581181299551e-05, "loss": 0.0582, "num_input_tokens_seen": 34615024, "step": 16035 }, { "epoch": 2.6166394779771616, "grad_norm": 2.054126262664795, "learning_rate": 4.985542987213825e-05, "loss": 0.2024, "num_input_tokens_seen": 34626160, "step": 16040 }, { "epoch": 2.6174551386623164, "grad_norm": 1.04854416847229, "learning_rate": 4.9855047427554156e-05, "loss": 0.0672, "num_input_tokens_seen": 34635632, "step": 16045 }, { "epoch": 2.6182707993474716, "grad_norm": 0.534996509552002, "learning_rate": 4.9854664479251015e-05, "loss": 0.0599, "num_input_tokens_seen": 34646288, "step": 16050 }, { "epoch": 2.6190864600326265, "grad_norm": 0.033440519124269485, "learning_rate": 4.985428102723655e-05, "loss": 0.3072, "num_input_tokens_seen": 34657232, "step": 16055 }, { "epoch": 2.6199021207177813, "grad_norm": 1.8743160963058472, "learning_rate": 4.985389707151856e-05, "loss": 0.2638, "num_input_tokens_seen": 34667408, "step": 16060 }, { "epoch": 2.6207177814029365, "grad_norm": 1.6065715551376343, "learning_rate": 4.9853512612104814e-05, "loss": 0.1632, "num_input_tokens_seen": 34677616, "step": 16065 }, { "epoch": 2.6215334420880914, "grad_norm": 0.11111804097890854, "learning_rate": 4.985312764900311e-05, "loss": 0.0441, "num_input_tokens_seen": 34688688, "step": 16070 }, { "epoch": 2.622349102773246, "grad_norm": 0.19731149077415466, "learning_rate": 4.985274218222124e-05, "loss": 0.119, "num_input_tokens_seen": 34700272, "step": 16075 }, { "epoch": 2.6231647634584014, "grad_norm": 0.5220547318458557, "learning_rate": 4.985235621176703e-05, "loss": 0.2762, "num_input_tokens_seen": 34710832, "step": 16080 }, { "epoch": 2.6239804241435563, "grad_norm": 1.7153891324996948, "learning_rate": 4.985196973764828e-05, "loss": 0.2645, "num_input_tokens_seen": 34722224, "step": 16085 }, { "epoch": 2.624796084828711, "grad_norm": 0.04695311188697815, "learning_rate": 4.985158275987284e-05, "loss": 0.0752, "num_input_tokens_seen": 34733136, "step": 16090 }, { "epoch": 2.6256117455138663, "grad_norm": 0.15585662424564362, "learning_rate": 4.985119527844856e-05, "loss": 0.0985, "num_input_tokens_seen": 34743408, "step": 16095 }, { "epoch": 2.626427406199021, "grad_norm": 3.4649152755737305, "learning_rate": 4.985080729338327e-05, "loss": 0.3418, "num_input_tokens_seen": 34755184, "step": 16100 }, { "epoch": 2.6272430668841764, "grad_norm": 1.1755805015563965, "learning_rate": 4.985041880468485e-05, "loss": 0.0932, "num_input_tokens_seen": 34765040, "step": 16105 }, { "epoch": 2.6280587275693312, "grad_norm": 0.29966720938682556, "learning_rate": 4.985002981236117e-05, "loss": 0.1304, "num_input_tokens_seen": 34775312, "step": 16110 }, { "epoch": 2.628874388254486, "grad_norm": 0.6741494536399841, "learning_rate": 4.984964031642011e-05, "loss": 0.1321, "num_input_tokens_seen": 34784400, "step": 16115 }, { "epoch": 2.629690048939641, "grad_norm": 0.201101154088974, "learning_rate": 4.9849250316869564e-05, "loss": 0.0898, "num_input_tokens_seen": 34796080, "step": 16120 }, { "epoch": 2.630505709624796, "grad_norm": 0.3705390989780426, "learning_rate": 4.984885981371744e-05, "loss": 0.0217, "num_input_tokens_seen": 34807280, "step": 16125 }, { "epoch": 2.631321370309951, "grad_norm": 0.3726417124271393, "learning_rate": 4.984846880697164e-05, "loss": 0.0879, "num_input_tokens_seen": 34818896, "step": 16130 }, { "epoch": 2.632137030995106, "grad_norm": 0.3693656921386719, "learning_rate": 4.9848077296640105e-05, "loss": 0.0918, "num_input_tokens_seen": 34830288, "step": 16135 }, { "epoch": 2.632952691680261, "grad_norm": 0.21003499627113342, "learning_rate": 4.984768528273076e-05, "loss": 0.1035, "num_input_tokens_seen": 34840688, "step": 16140 }, { "epoch": 2.633768352365416, "grad_norm": 1.6529537439346313, "learning_rate": 4.9847292765251555e-05, "loss": 0.1293, "num_input_tokens_seen": 34852848, "step": 16145 }, { "epoch": 2.634584013050571, "grad_norm": 2.040252923965454, "learning_rate": 4.984689974421043e-05, "loss": 0.2968, "num_input_tokens_seen": 34863728, "step": 16150 }, { "epoch": 2.635399673735726, "grad_norm": 0.24897079169750214, "learning_rate": 4.984650621961537e-05, "loss": 0.1619, "num_input_tokens_seen": 34874672, "step": 16155 }, { "epoch": 2.636215334420881, "grad_norm": 0.42508330941200256, "learning_rate": 4.9846112191474335e-05, "loss": 0.0787, "num_input_tokens_seen": 34885904, "step": 16160 }, { "epoch": 2.637030995106036, "grad_norm": 0.59100341796875, "learning_rate": 4.984571765979532e-05, "loss": 0.1988, "num_input_tokens_seen": 34896144, "step": 16165 }, { "epoch": 2.637846655791191, "grad_norm": 2.230607509613037, "learning_rate": 4.984532262458632e-05, "loss": 0.2542, "num_input_tokens_seen": 34906928, "step": 16170 }, { "epoch": 2.6386623164763456, "grad_norm": 0.5740383863449097, "learning_rate": 4.984492708585534e-05, "loss": 0.0682, "num_input_tokens_seen": 34917392, "step": 16175 }, { "epoch": 2.639477977161501, "grad_norm": 0.17671573162078857, "learning_rate": 4.9844531043610384e-05, "loss": 0.0645, "num_input_tokens_seen": 34928240, "step": 16180 }, { "epoch": 2.6402936378466557, "grad_norm": 0.5809140801429749, "learning_rate": 4.9844134497859504e-05, "loss": 0.024, "num_input_tokens_seen": 34939824, "step": 16185 }, { "epoch": 2.641109298531811, "grad_norm": 0.3993092477321625, "learning_rate": 4.984373744861071e-05, "loss": 0.1207, "num_input_tokens_seen": 34950704, "step": 16190 }, { "epoch": 2.641924959216966, "grad_norm": 0.2829912602901459, "learning_rate": 4.9843339895872065e-05, "loss": 0.1187, "num_input_tokens_seen": 34962224, "step": 16195 }, { "epoch": 2.6427406199021206, "grad_norm": 0.3879877030849457, "learning_rate": 4.984294183965163e-05, "loss": 0.1158, "num_input_tokens_seen": 34973936, "step": 16200 }, { "epoch": 2.6435562805872754, "grad_norm": 0.41783273220062256, "learning_rate": 4.984254327995744e-05, "loss": 0.0451, "num_input_tokens_seen": 34985296, "step": 16205 }, { "epoch": 2.6443719412724307, "grad_norm": 2.006863832473755, "learning_rate": 4.984214421679762e-05, "loss": 0.2418, "num_input_tokens_seen": 34996528, "step": 16210 }, { "epoch": 2.6451876019575855, "grad_norm": 1.5227162837982178, "learning_rate": 4.984174465018021e-05, "loss": 0.1584, "num_input_tokens_seen": 35007056, "step": 16215 }, { "epoch": 2.6460032626427408, "grad_norm": 1.1222425699234009, "learning_rate": 4.984134458011335e-05, "loss": 0.0996, "num_input_tokens_seen": 35017744, "step": 16220 }, { "epoch": 2.6468189233278956, "grad_norm": 0.7958118915557861, "learning_rate": 4.9840944006605115e-05, "loss": 0.3141, "num_input_tokens_seen": 35029104, "step": 16225 }, { "epoch": 2.6476345840130504, "grad_norm": 0.14012788236141205, "learning_rate": 4.984054292966365e-05, "loss": 0.1489, "num_input_tokens_seen": 35039632, "step": 16230 }, { "epoch": 2.6484502446982057, "grad_norm": 0.9930813908576965, "learning_rate": 4.984014134929705e-05, "loss": 0.1721, "num_input_tokens_seen": 35050704, "step": 16235 }, { "epoch": 2.6492659053833605, "grad_norm": 0.08531464636325836, "learning_rate": 4.983973926551349e-05, "loss": 0.1097, "num_input_tokens_seen": 35060688, "step": 16240 }, { "epoch": 2.6500815660685157, "grad_norm": 1.0158613920211792, "learning_rate": 4.98393366783211e-05, "loss": 0.092, "num_input_tokens_seen": 35071920, "step": 16245 }, { "epoch": 2.6508972267536706, "grad_norm": 0.08396174758672714, "learning_rate": 4.9838933587728046e-05, "loss": 0.0375, "num_input_tokens_seen": 35083056, "step": 16250 }, { "epoch": 2.6517128874388254, "grad_norm": 0.48655763268470764, "learning_rate": 4.9838529993742476e-05, "loss": 0.1347, "num_input_tokens_seen": 35094320, "step": 16255 }, { "epoch": 2.65252854812398, "grad_norm": 0.10583477467298508, "learning_rate": 4.98381258963726e-05, "loss": 0.2104, "num_input_tokens_seen": 35104560, "step": 16260 }, { "epoch": 2.6533442088091355, "grad_norm": 2.3497798442840576, "learning_rate": 4.983772129562659e-05, "loss": 0.2468, "num_input_tokens_seen": 35115376, "step": 16265 }, { "epoch": 2.6541598694942903, "grad_norm": 0.3059203624725342, "learning_rate": 4.9837316191512654e-05, "loss": 0.1301, "num_input_tokens_seen": 35126192, "step": 16270 }, { "epoch": 2.6549755301794455, "grad_norm": 0.6789116859436035, "learning_rate": 4.9836910584038986e-05, "loss": 0.1988, "num_input_tokens_seen": 35137936, "step": 16275 }, { "epoch": 2.6557911908646004, "grad_norm": 0.5722035765647888, "learning_rate": 4.983650447321382e-05, "loss": 0.1311, "num_input_tokens_seen": 35148560, "step": 16280 }, { "epoch": 2.656606851549755, "grad_norm": 0.43476057052612305, "learning_rate": 4.9836097859045386e-05, "loss": 0.0812, "num_input_tokens_seen": 35159632, "step": 16285 }, { "epoch": 2.6574225122349104, "grad_norm": 0.11088354140520096, "learning_rate": 4.9835690741541926e-05, "loss": 0.1608, "num_input_tokens_seen": 35170000, "step": 16290 }, { "epoch": 2.6582381729200653, "grad_norm": 0.06869199872016907, "learning_rate": 4.9835283120711676e-05, "loss": 0.0384, "num_input_tokens_seen": 35181360, "step": 16295 }, { "epoch": 2.65905383360522, "grad_norm": 0.259062260389328, "learning_rate": 4.983487499656292e-05, "loss": 0.1392, "num_input_tokens_seen": 35191600, "step": 16300 }, { "epoch": 2.6598694942903753, "grad_norm": 0.2710782587528229, "learning_rate": 4.983446636910391e-05, "loss": 0.0505, "num_input_tokens_seen": 35201136, "step": 16305 }, { "epoch": 2.66068515497553, "grad_norm": 0.8672689199447632, "learning_rate": 4.9834057238342935e-05, "loss": 0.0579, "num_input_tokens_seen": 35211600, "step": 16310 }, { "epoch": 2.661500815660685, "grad_norm": 0.15991345047950745, "learning_rate": 4.9833647604288285e-05, "loss": 0.2026, "num_input_tokens_seen": 35222416, "step": 16315 }, { "epoch": 2.6623164763458402, "grad_norm": 0.6405999660491943, "learning_rate": 4.983323746694827e-05, "loss": 0.155, "num_input_tokens_seen": 35233904, "step": 16320 }, { "epoch": 2.663132137030995, "grad_norm": 0.7352588176727295, "learning_rate": 4.9832826826331184e-05, "loss": 0.0997, "num_input_tokens_seen": 35243984, "step": 16325 }, { "epoch": 2.6639477977161503, "grad_norm": 0.2840903103351593, "learning_rate": 4.983241568244537e-05, "loss": 0.0643, "num_input_tokens_seen": 35255216, "step": 16330 }, { "epoch": 2.664763458401305, "grad_norm": 1.8564167022705078, "learning_rate": 4.983200403529914e-05, "loss": 0.2515, "num_input_tokens_seen": 35265136, "step": 16335 }, { "epoch": 2.66557911908646, "grad_norm": 0.07679503411054611, "learning_rate": 4.983159188490085e-05, "loss": 0.0578, "num_input_tokens_seen": 35275600, "step": 16340 }, { "epoch": 2.6663947797716148, "grad_norm": 0.9633280038833618, "learning_rate": 4.9831179231258854e-05, "loss": 0.1377, "num_input_tokens_seen": 35286928, "step": 16345 }, { "epoch": 2.66721044045677, "grad_norm": 1.4108539819717407, "learning_rate": 4.9830766074381505e-05, "loss": 0.1776, "num_input_tokens_seen": 35299024, "step": 16350 }, { "epoch": 2.668026101141925, "grad_norm": 0.10535408556461334, "learning_rate": 4.983035241427718e-05, "loss": 0.0478, "num_input_tokens_seen": 35310320, "step": 16355 }, { "epoch": 2.66884176182708, "grad_norm": 0.12964138388633728, "learning_rate": 4.9829938250954266e-05, "loss": 0.0862, "num_input_tokens_seen": 35321104, "step": 16360 }, { "epoch": 2.669657422512235, "grad_norm": 0.9159072637557983, "learning_rate": 4.9829523584421154e-05, "loss": 0.1855, "num_input_tokens_seen": 35332688, "step": 16365 }, { "epoch": 2.6704730831973897, "grad_norm": 0.17549830675125122, "learning_rate": 4.982910841468625e-05, "loss": 0.1615, "num_input_tokens_seen": 35342416, "step": 16370 }, { "epoch": 2.671288743882545, "grad_norm": 0.22289836406707764, "learning_rate": 4.9828692741757964e-05, "loss": 0.092, "num_input_tokens_seen": 35353616, "step": 16375 }, { "epoch": 2.6721044045677, "grad_norm": 0.7332730889320374, "learning_rate": 4.9828276565644726e-05, "loss": 0.0909, "num_input_tokens_seen": 35364304, "step": 16380 }, { "epoch": 2.672920065252855, "grad_norm": 0.4179403781890869, "learning_rate": 4.982785988635497e-05, "loss": 0.1547, "num_input_tokens_seen": 35374992, "step": 16385 }, { "epoch": 2.67373572593801, "grad_norm": 1.9485212564468384, "learning_rate": 4.9827442703897124e-05, "loss": 0.1901, "num_input_tokens_seen": 35386544, "step": 16390 }, { "epoch": 2.6745513866231647, "grad_norm": 0.07865298539400101, "learning_rate": 4.9827025018279666e-05, "loss": 0.1947, "num_input_tokens_seen": 35397616, "step": 16395 }, { "epoch": 2.6753670473083195, "grad_norm": 0.20878998935222626, "learning_rate": 4.9826606829511046e-05, "loss": 0.2811, "num_input_tokens_seen": 35407408, "step": 16400 }, { "epoch": 2.676182707993475, "grad_norm": 0.5461368560791016, "learning_rate": 4.982618813759975e-05, "loss": 0.0365, "num_input_tokens_seen": 35418512, "step": 16405 }, { "epoch": 2.6769983686786296, "grad_norm": 1.1264570951461792, "learning_rate": 4.9825768942554254e-05, "loss": 0.246, "num_input_tokens_seen": 35429712, "step": 16410 }, { "epoch": 2.677814029363785, "grad_norm": 0.843024492263794, "learning_rate": 4.9825349244383045e-05, "loss": 0.207, "num_input_tokens_seen": 35440560, "step": 16415 }, { "epoch": 2.6786296900489397, "grad_norm": 0.5151340961456299, "learning_rate": 4.982492904309466e-05, "loss": 0.2219, "num_input_tokens_seen": 35451504, "step": 16420 }, { "epoch": 2.6794453507340945, "grad_norm": 0.7559535503387451, "learning_rate": 4.982450833869758e-05, "loss": 0.1915, "num_input_tokens_seen": 35462640, "step": 16425 }, { "epoch": 2.6802610114192493, "grad_norm": 0.062639981508255, "learning_rate": 4.9824087131200356e-05, "loss": 0.1121, "num_input_tokens_seen": 35474160, "step": 16430 }, { "epoch": 2.6810766721044046, "grad_norm": 0.6229922771453857, "learning_rate": 4.982366542061151e-05, "loss": 0.1233, "num_input_tokens_seen": 35485200, "step": 16435 }, { "epoch": 2.6818923327895594, "grad_norm": 0.2817709743976593, "learning_rate": 4.9823243206939594e-05, "loss": 0.4588, "num_input_tokens_seen": 35496176, "step": 16440 }, { "epoch": 2.6827079934747147, "grad_norm": 0.18143059313297272, "learning_rate": 4.982282049019316e-05, "loss": 0.1337, "num_input_tokens_seen": 35506960, "step": 16445 }, { "epoch": 2.6835236541598695, "grad_norm": 1.1942639350891113, "learning_rate": 4.982239727038078e-05, "loss": 0.1857, "num_input_tokens_seen": 35517744, "step": 16450 }, { "epoch": 2.6843393148450243, "grad_norm": 1.3915417194366455, "learning_rate": 4.9821973547511036e-05, "loss": 0.2995, "num_input_tokens_seen": 35529680, "step": 16455 }, { "epoch": 2.6851549755301796, "grad_norm": 0.1669834405183792, "learning_rate": 4.982154932159251e-05, "loss": 0.1096, "num_input_tokens_seen": 35540496, "step": 16460 }, { "epoch": 2.6859706362153344, "grad_norm": 1.4277111291885376, "learning_rate": 4.982112459263379e-05, "loss": 0.1814, "num_input_tokens_seen": 35552176, "step": 16465 }, { "epoch": 2.6867862969004896, "grad_norm": 0.095155268907547, "learning_rate": 4.9820699360643494e-05, "loss": 0.0386, "num_input_tokens_seen": 35563440, "step": 16470 }, { "epoch": 2.6876019575856445, "grad_norm": 0.5698273777961731, "learning_rate": 4.9820273625630245e-05, "loss": 0.0398, "num_input_tokens_seen": 35574256, "step": 16475 }, { "epoch": 2.6884176182707993, "grad_norm": 1.0217430591583252, "learning_rate": 4.981984738760266e-05, "loss": 0.0928, "num_input_tokens_seen": 35584368, "step": 16480 }, { "epoch": 2.689233278955954, "grad_norm": 1.090950608253479, "learning_rate": 4.981942064656938e-05, "loss": 0.147, "num_input_tokens_seen": 35594896, "step": 16485 }, { "epoch": 2.6900489396411094, "grad_norm": 0.6536192297935486, "learning_rate": 4.981899340253906e-05, "loss": 0.1347, "num_input_tokens_seen": 35605488, "step": 16490 }, { "epoch": 2.690864600326264, "grad_norm": 1.4979958534240723, "learning_rate": 4.9818565655520345e-05, "loss": 0.0836, "num_input_tokens_seen": 35617616, "step": 16495 }, { "epoch": 2.6916802610114194, "grad_norm": 0.265451580286026, "learning_rate": 4.981813740552192e-05, "loss": 0.197, "num_input_tokens_seen": 35628304, "step": 16500 }, { "epoch": 2.6924959216965743, "grad_norm": 0.2596309781074524, "learning_rate": 4.981770865255245e-05, "loss": 0.1621, "num_input_tokens_seen": 35640016, "step": 16505 }, { "epoch": 2.693311582381729, "grad_norm": 1.350095272064209, "learning_rate": 4.981727939662064e-05, "loss": 0.181, "num_input_tokens_seen": 35651632, "step": 16510 }, { "epoch": 2.6941272430668843, "grad_norm": 0.9257508516311646, "learning_rate": 4.981684963773517e-05, "loss": 0.0439, "num_input_tokens_seen": 35661200, "step": 16515 }, { "epoch": 2.694942903752039, "grad_norm": 1.5375559329986572, "learning_rate": 4.9816419375904763e-05, "loss": 0.1772, "num_input_tokens_seen": 35671472, "step": 16520 }, { "epoch": 2.695758564437194, "grad_norm": 0.8917981386184692, "learning_rate": 4.981598861113814e-05, "loss": 0.1457, "num_input_tokens_seen": 35682480, "step": 16525 }, { "epoch": 2.6965742251223492, "grad_norm": 1.6039167642593384, "learning_rate": 4.9815557343444023e-05, "loss": 0.2934, "num_input_tokens_seen": 35693168, "step": 16530 }, { "epoch": 2.697389885807504, "grad_norm": 0.4799167811870575, "learning_rate": 4.9815125572831155e-05, "loss": 0.0986, "num_input_tokens_seen": 35704624, "step": 16535 }, { "epoch": 2.698205546492659, "grad_norm": 0.36215662956237793, "learning_rate": 4.981469329930829e-05, "loss": 0.0925, "num_input_tokens_seen": 35714768, "step": 16540 }, { "epoch": 2.699021207177814, "grad_norm": 0.13234129548072815, "learning_rate": 4.9814260522884184e-05, "loss": 0.1055, "num_input_tokens_seen": 35724240, "step": 16545 }, { "epoch": 2.699836867862969, "grad_norm": 0.5166927576065063, "learning_rate": 4.9813827243567605e-05, "loss": 0.0794, "num_input_tokens_seen": 35735472, "step": 16550 }, { "epoch": 2.700652528548124, "grad_norm": 0.06256795674562454, "learning_rate": 4.981339346136734e-05, "loss": 0.0891, "num_input_tokens_seen": 35747376, "step": 16555 }, { "epoch": 2.701468189233279, "grad_norm": 0.08677855879068375, "learning_rate": 4.981295917629218e-05, "loss": 0.0279, "num_input_tokens_seen": 35758960, "step": 16560 }, { "epoch": 2.702283849918434, "grad_norm": 1.894118070602417, "learning_rate": 4.9812524388350935e-05, "loss": 0.1981, "num_input_tokens_seen": 35770512, "step": 16565 }, { "epoch": 2.7030995106035887, "grad_norm": 0.133072629570961, "learning_rate": 4.981208909755239e-05, "loss": 0.1945, "num_input_tokens_seen": 35781424, "step": 16570 }, { "epoch": 2.703915171288744, "grad_norm": 1.207529067993164, "learning_rate": 4.9811653303905394e-05, "loss": 0.1619, "num_input_tokens_seen": 35793200, "step": 16575 }, { "epoch": 2.7047308319738987, "grad_norm": 0.9107369780540466, "learning_rate": 4.981121700741876e-05, "loss": 0.0518, "num_input_tokens_seen": 35804880, "step": 16580 }, { "epoch": 2.705546492659054, "grad_norm": 0.1739262342453003, "learning_rate": 4.981078020810135e-05, "loss": 0.2647, "num_input_tokens_seen": 35815376, "step": 16585 }, { "epoch": 2.706362153344209, "grad_norm": 0.7946200966835022, "learning_rate": 4.981034290596199e-05, "loss": 0.2484, "num_input_tokens_seen": 35825008, "step": 16590 }, { "epoch": 2.7071778140293636, "grad_norm": 0.8741614818572998, "learning_rate": 4.9809905101009566e-05, "loss": 0.1091, "num_input_tokens_seen": 35834384, "step": 16595 }, { "epoch": 2.707993474714519, "grad_norm": 1.8981534242630005, "learning_rate": 4.980946679325293e-05, "loss": 0.1614, "num_input_tokens_seen": 35844752, "step": 16600 }, { "epoch": 2.7088091353996737, "grad_norm": 0.06694846600294113, "learning_rate": 4.980902798270099e-05, "loss": 0.1377, "num_input_tokens_seen": 35855664, "step": 16605 }, { "epoch": 2.709624796084829, "grad_norm": 0.19896914064884186, "learning_rate": 4.980858866936262e-05, "loss": 0.0425, "num_input_tokens_seen": 35867472, "step": 16610 }, { "epoch": 2.710440456769984, "grad_norm": 0.20310185849666595, "learning_rate": 4.980814885324673e-05, "loss": 0.1296, "num_input_tokens_seen": 35878544, "step": 16615 }, { "epoch": 2.7112561174551386, "grad_norm": 0.7238997220993042, "learning_rate": 4.980770853436224e-05, "loss": 0.1229, "num_input_tokens_seen": 35890704, "step": 16620 }, { "epoch": 2.7120717781402934, "grad_norm": 0.07051218301057816, "learning_rate": 4.9807267712718055e-05, "loss": 0.0737, "num_input_tokens_seen": 35901424, "step": 16625 }, { "epoch": 2.7128874388254487, "grad_norm": 1.3241994380950928, "learning_rate": 4.980682638832312e-05, "loss": 0.2791, "num_input_tokens_seen": 35912688, "step": 16630 }, { "epoch": 2.7137030995106035, "grad_norm": 1.1991450786590576, "learning_rate": 4.980638456118638e-05, "loss": 0.153, "num_input_tokens_seen": 35924272, "step": 16635 }, { "epoch": 2.7145187601957588, "grad_norm": 0.21599523723125458, "learning_rate": 4.980594223131678e-05, "loss": 0.1657, "num_input_tokens_seen": 35935984, "step": 16640 }, { "epoch": 2.7153344208809136, "grad_norm": 0.6109762191772461, "learning_rate": 4.980549939872331e-05, "loss": 0.2623, "num_input_tokens_seen": 35945840, "step": 16645 }, { "epoch": 2.7161500815660684, "grad_norm": 0.33699965476989746, "learning_rate": 4.980505606341491e-05, "loss": 0.1476, "num_input_tokens_seen": 35956336, "step": 16650 }, { "epoch": 2.7169657422512232, "grad_norm": 0.09101502597332001, "learning_rate": 4.980461222540059e-05, "loss": 0.0607, "num_input_tokens_seen": 35967664, "step": 16655 }, { "epoch": 2.7177814029363785, "grad_norm": 0.5902004837989807, "learning_rate": 4.980416788468933e-05, "loss": 0.2727, "num_input_tokens_seen": 35978480, "step": 16660 }, { "epoch": 2.7185970636215333, "grad_norm": 0.10186019539833069, "learning_rate": 4.9803723041290154e-05, "loss": 0.1381, "num_input_tokens_seen": 35989040, "step": 16665 }, { "epoch": 2.7194127243066886, "grad_norm": 0.5257435441017151, "learning_rate": 4.980327769521206e-05, "loss": 0.1782, "num_input_tokens_seen": 35999792, "step": 16670 }, { "epoch": 2.7202283849918434, "grad_norm": 0.127411350607872, "learning_rate": 4.980283184646407e-05, "loss": 0.0326, "num_input_tokens_seen": 36009392, "step": 16675 }, { "epoch": 2.721044045676998, "grad_norm": 0.2799662947654724, "learning_rate": 4.980238549505524e-05, "loss": 0.1242, "num_input_tokens_seen": 36019952, "step": 16680 }, { "epoch": 2.7218597063621535, "grad_norm": 0.21301311254501343, "learning_rate": 4.9801938640994594e-05, "loss": 0.066, "num_input_tokens_seen": 36031152, "step": 16685 }, { "epoch": 2.7226753670473083, "grad_norm": 0.5183460712432861, "learning_rate": 4.9801491284291196e-05, "loss": 0.1332, "num_input_tokens_seen": 36041808, "step": 16690 }, { "epoch": 2.7234910277324635, "grad_norm": 1.9092563390731812, "learning_rate": 4.9801043424954116e-05, "loss": 0.1538, "num_input_tokens_seen": 36051248, "step": 16695 }, { "epoch": 2.7243066884176184, "grad_norm": 0.06607922166585922, "learning_rate": 4.9800595062992436e-05, "loss": 0.0673, "num_input_tokens_seen": 36063024, "step": 16700 }, { "epoch": 2.725122349102773, "grad_norm": 0.26198095083236694, "learning_rate": 4.980014619841523e-05, "loss": 0.2079, "num_input_tokens_seen": 36074800, "step": 16705 }, { "epoch": 2.725938009787928, "grad_norm": 0.444671094417572, "learning_rate": 4.97996968312316e-05, "loss": 0.1158, "num_input_tokens_seen": 36084784, "step": 16710 }, { "epoch": 2.7267536704730833, "grad_norm": 1.5659650564193726, "learning_rate": 4.9799246961450654e-05, "loss": 0.106, "num_input_tokens_seen": 36095248, "step": 16715 }, { "epoch": 2.727569331158238, "grad_norm": 0.2487071007490158, "learning_rate": 4.979879658908151e-05, "loss": 0.284, "num_input_tokens_seen": 36106288, "step": 16720 }, { "epoch": 2.7283849918433933, "grad_norm": 0.39962857961654663, "learning_rate": 4.979834571413329e-05, "loss": 0.085, "num_input_tokens_seen": 36116976, "step": 16725 }, { "epoch": 2.729200652528548, "grad_norm": 0.14326906204223633, "learning_rate": 4.979789433661514e-05, "loss": 0.1439, "num_input_tokens_seen": 36127504, "step": 16730 }, { "epoch": 2.730016313213703, "grad_norm": 0.050291962921619415, "learning_rate": 4.97974424565362e-05, "loss": 0.0353, "num_input_tokens_seen": 36137744, "step": 16735 }, { "epoch": 2.7308319738988582, "grad_norm": 0.35057303309440613, "learning_rate": 4.9796990073905634e-05, "loss": 0.1351, "num_input_tokens_seen": 36149328, "step": 16740 }, { "epoch": 2.731647634584013, "grad_norm": 0.1283988207578659, "learning_rate": 4.9796537188732606e-05, "loss": 0.0982, "num_input_tokens_seen": 36160688, "step": 16745 }, { "epoch": 2.732463295269168, "grad_norm": 0.2525135576725006, "learning_rate": 4.979608380102629e-05, "loss": 0.0719, "num_input_tokens_seen": 36171056, "step": 16750 }, { "epoch": 2.733278955954323, "grad_norm": 0.09045622497797012, "learning_rate": 4.979562991079588e-05, "loss": 0.1582, "num_input_tokens_seen": 36181552, "step": 16755 }, { "epoch": 2.734094616639478, "grad_norm": 0.9430601000785828, "learning_rate": 4.979517551805058e-05, "loss": 0.1345, "num_input_tokens_seen": 36191472, "step": 16760 }, { "epoch": 2.7349102773246328, "grad_norm": 1.1056501865386963, "learning_rate": 4.979472062279959e-05, "loss": 0.2058, "num_input_tokens_seen": 36204272, "step": 16765 }, { "epoch": 2.735725938009788, "grad_norm": 0.5008929371833801, "learning_rate": 4.979426522505213e-05, "loss": 0.1808, "num_input_tokens_seen": 36216144, "step": 16770 }, { "epoch": 2.736541598694943, "grad_norm": 0.9185312986373901, "learning_rate": 4.979380932481744e-05, "loss": 0.0717, "num_input_tokens_seen": 36226672, "step": 16775 }, { "epoch": 2.737357259380098, "grad_norm": 1.8734240531921387, "learning_rate": 4.9793352922104744e-05, "loss": 0.243, "num_input_tokens_seen": 36238448, "step": 16780 }, { "epoch": 2.738172920065253, "grad_norm": 0.4931980073451996, "learning_rate": 4.9792896016923296e-05, "loss": 0.2604, "num_input_tokens_seen": 36248400, "step": 16785 }, { "epoch": 2.7389885807504077, "grad_norm": 0.5625531077384949, "learning_rate": 4.9792438609282364e-05, "loss": 0.0556, "num_input_tokens_seen": 36259824, "step": 16790 }, { "epoch": 2.7398042414355626, "grad_norm": 0.7859925031661987, "learning_rate": 4.979198069919121e-05, "loss": 0.0534, "num_input_tokens_seen": 36269168, "step": 16795 }, { "epoch": 2.740619902120718, "grad_norm": 0.6286447644233704, "learning_rate": 4.979152228665912e-05, "loss": 0.0495, "num_input_tokens_seen": 36278288, "step": 16800 }, { "epoch": 2.7414355628058726, "grad_norm": 0.4246708154678345, "learning_rate": 4.9791063371695375e-05, "loss": 0.1185, "num_input_tokens_seen": 36289200, "step": 16805 }, { "epoch": 2.742251223491028, "grad_norm": 1.2518565654754639, "learning_rate": 4.9790603954309276e-05, "loss": 0.2081, "num_input_tokens_seen": 36300112, "step": 16810 }, { "epoch": 2.7430668841761827, "grad_norm": 0.9442779421806335, "learning_rate": 4.979014403451015e-05, "loss": 0.3128, "num_input_tokens_seen": 36309872, "step": 16815 }, { "epoch": 2.7438825448613375, "grad_norm": 0.07877611368894577, "learning_rate": 4.9789683612307306e-05, "loss": 0.1179, "num_input_tokens_seen": 36320080, "step": 16820 }, { "epoch": 2.744698205546493, "grad_norm": 0.1946527063846588, "learning_rate": 4.978922268771007e-05, "loss": 0.2398, "num_input_tokens_seen": 36330448, "step": 16825 }, { "epoch": 2.7455138662316476, "grad_norm": 0.6552961468696594, "learning_rate": 4.9788761260727787e-05, "loss": 0.2191, "num_input_tokens_seen": 36341008, "step": 16830 }, { "epoch": 2.746329526916803, "grad_norm": 0.25558674335479736, "learning_rate": 4.9788299331369815e-05, "loss": 0.0909, "num_input_tokens_seen": 36351728, "step": 16835 }, { "epoch": 2.7471451876019577, "grad_norm": 0.2887307405471802, "learning_rate": 4.9787836899645514e-05, "loss": 0.2108, "num_input_tokens_seen": 36362832, "step": 16840 }, { "epoch": 2.7479608482871125, "grad_norm": 1.1836745738983154, "learning_rate": 4.978737396556424e-05, "loss": 0.1361, "num_input_tokens_seen": 36373520, "step": 16845 }, { "epoch": 2.7487765089722673, "grad_norm": 1.6394250392913818, "learning_rate": 4.97869105291354e-05, "loss": 0.0787, "num_input_tokens_seen": 36384912, "step": 16850 }, { "epoch": 2.7495921696574226, "grad_norm": 0.0954187884926796, "learning_rate": 4.978644659036837e-05, "loss": 0.0212, "num_input_tokens_seen": 36395760, "step": 16855 }, { "epoch": 2.7504078303425774, "grad_norm": 0.7445425987243652, "learning_rate": 4.978598214927256e-05, "loss": 0.0779, "num_input_tokens_seen": 36406768, "step": 16860 }, { "epoch": 2.7512234910277327, "grad_norm": 0.7160307168960571, "learning_rate": 4.978551720585737e-05, "loss": 0.0879, "num_input_tokens_seen": 36415728, "step": 16865 }, { "epoch": 2.7520391517128875, "grad_norm": 0.2467574179172516, "learning_rate": 4.978505176013224e-05, "loss": 0.2662, "num_input_tokens_seen": 36427728, "step": 16870 }, { "epoch": 2.7528548123980423, "grad_norm": 0.05462495982646942, "learning_rate": 4.978458581210659e-05, "loss": 0.3036, "num_input_tokens_seen": 36438640, "step": 16875 }, { "epoch": 2.753670473083197, "grad_norm": 1.7856532335281372, "learning_rate": 4.978411936178986e-05, "loss": 0.1532, "num_input_tokens_seen": 36448976, "step": 16880 }, { "epoch": 2.7544861337683524, "grad_norm": 0.4694873094558716, "learning_rate": 4.978365240919152e-05, "loss": 0.209, "num_input_tokens_seen": 36458736, "step": 16885 }, { "epoch": 2.755301794453507, "grad_norm": 2.857898712158203, "learning_rate": 4.978318495432102e-05, "loss": 0.2727, "num_input_tokens_seen": 36468144, "step": 16890 }, { "epoch": 2.7561174551386625, "grad_norm": 0.2968877851963043, "learning_rate": 4.978271699718784e-05, "loss": 0.2063, "num_input_tokens_seen": 36478512, "step": 16895 }, { "epoch": 2.7569331158238173, "grad_norm": 1.0022410154342651, "learning_rate": 4.9782248537801456e-05, "loss": 0.1476, "num_input_tokens_seen": 36490384, "step": 16900 }, { "epoch": 2.757748776508972, "grad_norm": 1.2584549188613892, "learning_rate": 4.978177957617137e-05, "loss": 0.0845, "num_input_tokens_seen": 36500944, "step": 16905 }, { "epoch": 2.7585644371941274, "grad_norm": 0.8477892279624939, "learning_rate": 4.978131011230708e-05, "loss": 0.0946, "num_input_tokens_seen": 36511056, "step": 16910 }, { "epoch": 2.759380097879282, "grad_norm": 0.5110397934913635, "learning_rate": 4.9780840146218106e-05, "loss": 0.153, "num_input_tokens_seen": 36522448, "step": 16915 }, { "epoch": 2.7601957585644374, "grad_norm": 0.16354666650295258, "learning_rate": 4.978036967791397e-05, "loss": 0.1654, "num_input_tokens_seen": 36533456, "step": 16920 }, { "epoch": 2.7610114192495923, "grad_norm": 0.32212603092193604, "learning_rate": 4.97798987074042e-05, "loss": 0.105, "num_input_tokens_seen": 36544656, "step": 16925 }, { "epoch": 2.761827079934747, "grad_norm": 0.8156349658966064, "learning_rate": 4.9779427234698356e-05, "loss": 0.1318, "num_input_tokens_seen": 36556144, "step": 16930 }, { "epoch": 2.762642740619902, "grad_norm": 0.4823524057865143, "learning_rate": 4.977895525980598e-05, "loss": 0.1277, "num_input_tokens_seen": 36566064, "step": 16935 }, { "epoch": 2.763458401305057, "grad_norm": 0.5868009328842163, "learning_rate": 4.977848278273664e-05, "loss": 0.089, "num_input_tokens_seen": 36576528, "step": 16940 }, { "epoch": 2.764274061990212, "grad_norm": 0.989041805267334, "learning_rate": 4.977800980349992e-05, "loss": 0.1175, "num_input_tokens_seen": 36587856, "step": 16945 }, { "epoch": 2.7650897226753672, "grad_norm": 1.3809293508529663, "learning_rate": 4.977753632210539e-05, "loss": 0.298, "num_input_tokens_seen": 36597936, "step": 16950 }, { "epoch": 2.765905383360522, "grad_norm": 0.3702054023742676, "learning_rate": 4.977706233856266e-05, "loss": 0.0632, "num_input_tokens_seen": 36608688, "step": 16955 }, { "epoch": 2.766721044045677, "grad_norm": 0.7904069423675537, "learning_rate": 4.977658785288133e-05, "loss": 0.1954, "num_input_tokens_seen": 36619824, "step": 16960 }, { "epoch": 2.767536704730832, "grad_norm": 1.9772608280181885, "learning_rate": 4.9776112865071014e-05, "loss": 0.2012, "num_input_tokens_seen": 36630960, "step": 16965 }, { "epoch": 2.768352365415987, "grad_norm": 0.2651214599609375, "learning_rate": 4.977563737514133e-05, "loss": 0.1293, "num_input_tokens_seen": 36640976, "step": 16970 }, { "epoch": 2.7691680261011418, "grad_norm": 0.3397878408432007, "learning_rate": 4.9775161383101934e-05, "loss": 0.2451, "num_input_tokens_seen": 36651792, "step": 16975 }, { "epoch": 2.769983686786297, "grad_norm": 1.3309837579727173, "learning_rate": 4.9774684888962466e-05, "loss": 0.0742, "num_input_tokens_seen": 36662896, "step": 16980 }, { "epoch": 2.770799347471452, "grad_norm": 0.792198896408081, "learning_rate": 4.977420789273257e-05, "loss": 0.1796, "num_input_tokens_seen": 36673040, "step": 16985 }, { "epoch": 2.7716150081566067, "grad_norm": 1.4841688871383667, "learning_rate": 4.977373039442194e-05, "loss": 0.1872, "num_input_tokens_seen": 36684464, "step": 16990 }, { "epoch": 2.772430668841762, "grad_norm": 0.6186941266059875, "learning_rate": 4.977325239404022e-05, "loss": 0.1322, "num_input_tokens_seen": 36696368, "step": 16995 }, { "epoch": 2.7732463295269167, "grad_norm": 0.268716037273407, "learning_rate": 4.977277389159711e-05, "loss": 0.1493, "num_input_tokens_seen": 36707600, "step": 17000 }, { "epoch": 2.774061990212072, "grad_norm": 1.2727550268173218, "learning_rate": 4.977229488710232e-05, "loss": 0.2144, "num_input_tokens_seen": 36718992, "step": 17005 }, { "epoch": 2.774877650897227, "grad_norm": 0.3677535951137543, "learning_rate": 4.977181538056555e-05, "loss": 0.1109, "num_input_tokens_seen": 36729840, "step": 17010 }, { "epoch": 2.7756933115823816, "grad_norm": 0.34041261672973633, "learning_rate": 4.977133537199651e-05, "loss": 0.1092, "num_input_tokens_seen": 36738672, "step": 17015 }, { "epoch": 2.7765089722675365, "grad_norm": 0.5806684494018555, "learning_rate": 4.977085486140494e-05, "loss": 0.0763, "num_input_tokens_seen": 36749104, "step": 17020 }, { "epoch": 2.7773246329526917, "grad_norm": 0.29880449175834656, "learning_rate": 4.977037384880057e-05, "loss": 0.101, "num_input_tokens_seen": 36760560, "step": 17025 }, { "epoch": 2.7781402936378465, "grad_norm": 0.5044044852256775, "learning_rate": 4.976989233419315e-05, "loss": 0.121, "num_input_tokens_seen": 36771728, "step": 17030 }, { "epoch": 2.778955954323002, "grad_norm": 1.234563946723938, "learning_rate": 4.9769410317592434e-05, "loss": 0.2004, "num_input_tokens_seen": 36783472, "step": 17035 }, { "epoch": 2.7797716150081566, "grad_norm": 0.06436081230640411, "learning_rate": 4.97689277990082e-05, "loss": 0.2391, "num_input_tokens_seen": 36794672, "step": 17040 }, { "epoch": 2.7805872756933114, "grad_norm": 0.11315470933914185, "learning_rate": 4.9768444778450224e-05, "loss": 0.1525, "num_input_tokens_seen": 36805840, "step": 17045 }, { "epoch": 2.7814029363784667, "grad_norm": 0.39504578709602356, "learning_rate": 4.976796125592829e-05, "loss": 0.1072, "num_input_tokens_seen": 36816368, "step": 17050 }, { "epoch": 2.7822185970636215, "grad_norm": 1.2209738492965698, "learning_rate": 4.976747723145221e-05, "loss": 0.1114, "num_input_tokens_seen": 36827920, "step": 17055 }, { "epoch": 2.7830342577487768, "grad_norm": 1.994787573814392, "learning_rate": 4.976699270503177e-05, "loss": 0.3235, "num_input_tokens_seen": 36839312, "step": 17060 }, { "epoch": 2.7838499184339316, "grad_norm": 1.4235384464263916, "learning_rate": 4.976650767667681e-05, "loss": 0.1343, "num_input_tokens_seen": 36849264, "step": 17065 }, { "epoch": 2.7846655791190864, "grad_norm": 1.1186970472335815, "learning_rate": 4.9766022146397154e-05, "loss": 0.2743, "num_input_tokens_seen": 36859664, "step": 17070 }, { "epoch": 2.7854812398042412, "grad_norm": 0.2186206430196762, "learning_rate": 4.976553611420264e-05, "loss": 0.1231, "num_input_tokens_seen": 36869776, "step": 17075 }, { "epoch": 2.7862969004893965, "grad_norm": 0.4906648099422455, "learning_rate": 4.976504958010312e-05, "loss": 0.0879, "num_input_tokens_seen": 36880912, "step": 17080 }, { "epoch": 2.7871125611745513, "grad_norm": 1.1088531017303467, "learning_rate": 4.9764562544108463e-05, "loss": 0.2717, "num_input_tokens_seen": 36891056, "step": 17085 }, { "epoch": 2.7879282218597066, "grad_norm": 0.3903461992740631, "learning_rate": 4.9764075006228516e-05, "loss": 0.1388, "num_input_tokens_seen": 36901456, "step": 17090 }, { "epoch": 2.7887438825448614, "grad_norm": 0.1402750313282013, "learning_rate": 4.976358696647318e-05, "loss": 0.1729, "num_input_tokens_seen": 36912336, "step": 17095 }, { "epoch": 2.789559543230016, "grad_norm": 0.06470812112092972, "learning_rate": 4.976309842485234e-05, "loss": 0.0728, "num_input_tokens_seen": 36921456, "step": 17100 }, { "epoch": 2.790375203915171, "grad_norm": 1.0306854248046875, "learning_rate": 4.976260938137589e-05, "loss": 0.1258, "num_input_tokens_seen": 36932272, "step": 17105 }, { "epoch": 2.7911908646003263, "grad_norm": 0.24056002497673035, "learning_rate": 4.9762119836053755e-05, "loss": 0.1158, "num_input_tokens_seen": 36942640, "step": 17110 }, { "epoch": 2.792006525285481, "grad_norm": 1.0552620887756348, "learning_rate": 4.976162978889584e-05, "loss": 0.4166, "num_input_tokens_seen": 36954192, "step": 17115 }, { "epoch": 2.7928221859706364, "grad_norm": 0.9128326177597046, "learning_rate": 4.9761139239912094e-05, "loss": 0.1719, "num_input_tokens_seen": 36964880, "step": 17120 }, { "epoch": 2.793637846655791, "grad_norm": 0.6416493058204651, "learning_rate": 4.976064818911245e-05, "loss": 0.1392, "num_input_tokens_seen": 36976112, "step": 17125 }, { "epoch": 2.794453507340946, "grad_norm": 0.3678363561630249, "learning_rate": 4.976015663650685e-05, "loss": 0.0721, "num_input_tokens_seen": 36985232, "step": 17130 }, { "epoch": 2.7952691680261013, "grad_norm": 0.42257198691368103, "learning_rate": 4.975966458210527e-05, "loss": 0.075, "num_input_tokens_seen": 36995376, "step": 17135 }, { "epoch": 2.796084828711256, "grad_norm": 0.9098528027534485, "learning_rate": 4.9759172025917676e-05, "loss": 0.113, "num_input_tokens_seen": 37006576, "step": 17140 }, { "epoch": 2.7969004893964113, "grad_norm": 0.7431926131248474, "learning_rate": 4.975867896795405e-05, "loss": 0.1469, "num_input_tokens_seen": 37017264, "step": 17145 }, { "epoch": 2.797716150081566, "grad_norm": 0.26259779930114746, "learning_rate": 4.975818540822439e-05, "loss": 0.0441, "num_input_tokens_seen": 37027472, "step": 17150 }, { "epoch": 2.798531810766721, "grad_norm": 0.46619200706481934, "learning_rate": 4.97576913467387e-05, "loss": 0.1329, "num_input_tokens_seen": 37038768, "step": 17155 }, { "epoch": 2.799347471451876, "grad_norm": 1.0104883909225464, "learning_rate": 4.9757196783506975e-05, "loss": 0.1734, "num_input_tokens_seen": 37049424, "step": 17160 }, { "epoch": 2.800163132137031, "grad_norm": 2.359254837036133, "learning_rate": 4.975670171853926e-05, "loss": 0.2271, "num_input_tokens_seen": 37060912, "step": 17165 }, { "epoch": 2.800978792822186, "grad_norm": 0.8929365277290344, "learning_rate": 4.975620615184558e-05, "loss": 0.1825, "num_input_tokens_seen": 37070032, "step": 17170 }, { "epoch": 2.801794453507341, "grad_norm": 1.133201003074646, "learning_rate": 4.975571008343597e-05, "loss": 0.131, "num_input_tokens_seen": 37079792, "step": 17175 }, { "epoch": 2.802610114192496, "grad_norm": 0.39147117733955383, "learning_rate": 4.9755213513320496e-05, "loss": 0.1193, "num_input_tokens_seen": 37091344, "step": 17180 }, { "epoch": 2.8034257748776508, "grad_norm": 0.6009216904640198, "learning_rate": 4.9754716441509205e-05, "loss": 0.0773, "num_input_tokens_seen": 37102800, "step": 17185 }, { "epoch": 2.804241435562806, "grad_norm": 0.6922805309295654, "learning_rate": 4.975421886801219e-05, "loss": 0.1949, "num_input_tokens_seen": 37114512, "step": 17190 }, { "epoch": 2.805057096247961, "grad_norm": 1.7952244281768799, "learning_rate": 4.9753720792839534e-05, "loss": 0.3493, "num_input_tokens_seen": 37125232, "step": 17195 }, { "epoch": 2.8058727569331157, "grad_norm": 1.0293689966201782, "learning_rate": 4.975322221600131e-05, "loss": 0.1384, "num_input_tokens_seen": 37136656, "step": 17200 }, { "epoch": 2.806688417618271, "grad_norm": 0.4305753707885742, "learning_rate": 4.975272313750764e-05, "loss": 0.1714, "num_input_tokens_seen": 37148112, "step": 17205 }, { "epoch": 2.8075040783034257, "grad_norm": 0.24046023190021515, "learning_rate": 4.9752223557368636e-05, "loss": 0.0595, "num_input_tokens_seen": 37160208, "step": 17210 }, { "epoch": 2.8083197389885806, "grad_norm": 0.26394760608673096, "learning_rate": 4.975172347559443e-05, "loss": 0.0704, "num_input_tokens_seen": 37170288, "step": 17215 }, { "epoch": 2.809135399673736, "grad_norm": 0.36076006293296814, "learning_rate": 4.975122289219514e-05, "loss": 0.0748, "num_input_tokens_seen": 37179408, "step": 17220 }, { "epoch": 2.8099510603588906, "grad_norm": 0.19306573271751404, "learning_rate": 4.975072180718091e-05, "loss": 0.0905, "num_input_tokens_seen": 37191472, "step": 17225 }, { "epoch": 2.810766721044046, "grad_norm": 0.4343973696231842, "learning_rate": 4.9750220220561916e-05, "loss": 0.0817, "num_input_tokens_seen": 37200528, "step": 17230 }, { "epoch": 2.8115823817292007, "grad_norm": 0.36631742119789124, "learning_rate": 4.974971813234831e-05, "loss": 0.1232, "num_input_tokens_seen": 37210960, "step": 17235 }, { "epoch": 2.8123980424143555, "grad_norm": 0.12967149913311005, "learning_rate": 4.974921554255027e-05, "loss": 0.1556, "num_input_tokens_seen": 37222192, "step": 17240 }, { "epoch": 2.8132137030995104, "grad_norm": 1.573225736618042, "learning_rate": 4.974871245117797e-05, "loss": 0.1946, "num_input_tokens_seen": 37234064, "step": 17245 }, { "epoch": 2.8140293637846656, "grad_norm": 0.34152311086654663, "learning_rate": 4.974820885824163e-05, "loss": 0.253, "num_input_tokens_seen": 37245968, "step": 17250 }, { "epoch": 2.8148450244698204, "grad_norm": 1.365454912185669, "learning_rate": 4.974770476375143e-05, "loss": 0.2217, "num_input_tokens_seen": 37256432, "step": 17255 }, { "epoch": 2.8156606851549757, "grad_norm": 0.5502535700798035, "learning_rate": 4.97472001677176e-05, "loss": 0.2282, "num_input_tokens_seen": 37267312, "step": 17260 }, { "epoch": 2.8164763458401305, "grad_norm": 0.4972289800643921, "learning_rate": 4.974669507015037e-05, "loss": 0.1377, "num_input_tokens_seen": 37278256, "step": 17265 }, { "epoch": 2.8172920065252853, "grad_norm": 0.21701616048812866, "learning_rate": 4.974618947105997e-05, "loss": 0.1065, "num_input_tokens_seen": 37289968, "step": 17270 }, { "epoch": 2.8181076672104406, "grad_norm": 0.17619384825229645, "learning_rate": 4.974568337045665e-05, "loss": 0.1082, "num_input_tokens_seen": 37301072, "step": 17275 }, { "epoch": 2.8189233278955954, "grad_norm": 2.761932373046875, "learning_rate": 4.974517676835066e-05, "loss": 0.2172, "num_input_tokens_seen": 37311664, "step": 17280 }, { "epoch": 2.8197389885807507, "grad_norm": 0.6271142363548279, "learning_rate": 4.974466966475228e-05, "loss": 0.0977, "num_input_tokens_seen": 37323120, "step": 17285 }, { "epoch": 2.8205546492659055, "grad_norm": 0.2613255977630615, "learning_rate": 4.9744162059671763e-05, "loss": 0.1031, "num_input_tokens_seen": 37334672, "step": 17290 }, { "epoch": 2.8213703099510603, "grad_norm": 0.10233542323112488, "learning_rate": 4.974365395311942e-05, "loss": 0.0986, "num_input_tokens_seen": 37346384, "step": 17295 }, { "epoch": 2.822185970636215, "grad_norm": 0.3719596862792969, "learning_rate": 4.974314534510554e-05, "loss": 0.0558, "num_input_tokens_seen": 37358256, "step": 17300 }, { "epoch": 2.8230016313213704, "grad_norm": 1.5399234294891357, "learning_rate": 4.9742636235640425e-05, "loss": 0.1803, "num_input_tokens_seen": 37369968, "step": 17305 }, { "epoch": 2.823817292006525, "grad_norm": 0.5191076993942261, "learning_rate": 4.9742126624734406e-05, "loss": 0.1903, "num_input_tokens_seen": 37380432, "step": 17310 }, { "epoch": 2.8246329526916805, "grad_norm": 2.1458237171173096, "learning_rate": 4.9741616512397804e-05, "loss": 0.282, "num_input_tokens_seen": 37390480, "step": 17315 }, { "epoch": 2.8254486133768353, "grad_norm": 0.22538939118385315, "learning_rate": 4.9741105898640955e-05, "loss": 0.1278, "num_input_tokens_seen": 37402608, "step": 17320 }, { "epoch": 2.82626427406199, "grad_norm": 0.4628309905529022, "learning_rate": 4.9740594783474203e-05, "loss": 0.0671, "num_input_tokens_seen": 37413936, "step": 17325 }, { "epoch": 2.827079934747145, "grad_norm": 0.20049238204956055, "learning_rate": 4.974008316690792e-05, "loss": 0.0839, "num_input_tokens_seen": 37426480, "step": 17330 }, { "epoch": 2.8278955954323, "grad_norm": 0.7858479022979736, "learning_rate": 4.973957104895246e-05, "loss": 0.2151, "num_input_tokens_seen": 37438224, "step": 17335 }, { "epoch": 2.828711256117455, "grad_norm": 0.056003671139478683, "learning_rate": 4.973905842961821e-05, "loss": 0.1044, "num_input_tokens_seen": 37448176, "step": 17340 }, { "epoch": 2.8295269168026103, "grad_norm": 0.23438020050525665, "learning_rate": 4.973854530891556e-05, "loss": 0.1505, "num_input_tokens_seen": 37458672, "step": 17345 }, { "epoch": 2.830342577487765, "grad_norm": 0.2547047436237335, "learning_rate": 4.9738031686854906e-05, "loss": 0.216, "num_input_tokens_seen": 37469840, "step": 17350 }, { "epoch": 2.83115823817292, "grad_norm": 1.5802347660064697, "learning_rate": 4.973751756344666e-05, "loss": 0.2785, "num_input_tokens_seen": 37479376, "step": 17355 }, { "epoch": 2.831973898858075, "grad_norm": 1.5442689657211304, "learning_rate": 4.973700293870123e-05, "loss": 0.2171, "num_input_tokens_seen": 37489392, "step": 17360 }, { "epoch": 2.83278955954323, "grad_norm": 0.451958030462265, "learning_rate": 4.973648781262906e-05, "loss": 0.1276, "num_input_tokens_seen": 37499504, "step": 17365 }, { "epoch": 2.8336052202283852, "grad_norm": 0.5333662033081055, "learning_rate": 4.9735972185240586e-05, "loss": 0.1469, "num_input_tokens_seen": 37510512, "step": 17370 }, { "epoch": 2.83442088091354, "grad_norm": 0.9142799973487854, "learning_rate": 4.973545605654625e-05, "loss": 0.1896, "num_input_tokens_seen": 37521424, "step": 17375 }, { "epoch": 2.835236541598695, "grad_norm": 1.6080509424209595, "learning_rate": 4.9734939426556526e-05, "loss": 0.272, "num_input_tokens_seen": 37532976, "step": 17380 }, { "epoch": 2.8360522022838497, "grad_norm": 0.5634567141532898, "learning_rate": 4.973442229528187e-05, "loss": 0.1872, "num_input_tokens_seen": 37543504, "step": 17385 }, { "epoch": 2.836867862969005, "grad_norm": 0.12848444283008575, "learning_rate": 4.973390466273277e-05, "loss": 0.1334, "num_input_tokens_seen": 37554000, "step": 17390 }, { "epoch": 2.8376835236541598, "grad_norm": 0.35847926139831543, "learning_rate": 4.973338652891971e-05, "loss": 0.0949, "num_input_tokens_seen": 37565360, "step": 17395 }, { "epoch": 2.838499184339315, "grad_norm": 0.132211372256279, "learning_rate": 4.9732867893853204e-05, "loss": 0.2333, "num_input_tokens_seen": 37576368, "step": 17400 }, { "epoch": 2.83931484502447, "grad_norm": 0.28232496976852417, "learning_rate": 4.973234875754374e-05, "loss": 0.1419, "num_input_tokens_seen": 37587664, "step": 17405 }, { "epoch": 2.8401305057096247, "grad_norm": 0.3684941828250885, "learning_rate": 4.973182912000187e-05, "loss": 0.2259, "num_input_tokens_seen": 37598256, "step": 17410 }, { "epoch": 2.84094616639478, "grad_norm": 0.328339546918869, "learning_rate": 4.97313089812381e-05, "loss": 0.1874, "num_input_tokens_seen": 37608976, "step": 17415 }, { "epoch": 2.8417618270799347, "grad_norm": 0.6330769062042236, "learning_rate": 4.973078834126298e-05, "loss": 0.154, "num_input_tokens_seen": 37618352, "step": 17420 }, { "epoch": 2.8425774877650896, "grad_norm": 0.6656023263931274, "learning_rate": 4.973026720008707e-05, "loss": 0.1984, "num_input_tokens_seen": 37628816, "step": 17425 }, { "epoch": 2.843393148450245, "grad_norm": 0.09715881198644638, "learning_rate": 4.972974555772091e-05, "loss": 0.0594, "num_input_tokens_seen": 37639824, "step": 17430 }, { "epoch": 2.8442088091353996, "grad_norm": 0.6461514234542847, "learning_rate": 4.9729223414175084e-05, "loss": 0.1329, "num_input_tokens_seen": 37651504, "step": 17435 }, { "epoch": 2.8450244698205545, "grad_norm": 0.13993321359157562, "learning_rate": 4.972870076946018e-05, "loss": 0.1202, "num_input_tokens_seen": 37662992, "step": 17440 }, { "epoch": 2.8458401305057097, "grad_norm": 0.07411029189825058, "learning_rate": 4.9728177623586794e-05, "loss": 0.0548, "num_input_tokens_seen": 37673904, "step": 17445 }, { "epoch": 2.8466557911908645, "grad_norm": 0.6690711379051208, "learning_rate": 4.97276539765655e-05, "loss": 0.1215, "num_input_tokens_seen": 37684368, "step": 17450 }, { "epoch": 2.84747145187602, "grad_norm": 0.24933195114135742, "learning_rate": 4.9727129828406936e-05, "loss": 0.0717, "num_input_tokens_seen": 37694480, "step": 17455 }, { "epoch": 2.8482871125611746, "grad_norm": 0.10569550096988678, "learning_rate": 4.972660517912172e-05, "loss": 0.0269, "num_input_tokens_seen": 37705040, "step": 17460 }, { "epoch": 2.8491027732463294, "grad_norm": 1.2286194562911987, "learning_rate": 4.9726080028720485e-05, "loss": 0.215, "num_input_tokens_seen": 37716304, "step": 17465 }, { "epoch": 2.8499184339314843, "grad_norm": 0.28381410241127014, "learning_rate": 4.972555437721387e-05, "loss": 0.0825, "num_input_tokens_seen": 37726352, "step": 17470 }, { "epoch": 2.8507340946166395, "grad_norm": 0.4301745891571045, "learning_rate": 4.972502822461252e-05, "loss": 0.0632, "num_input_tokens_seen": 37736048, "step": 17475 }, { "epoch": 2.8515497553017943, "grad_norm": 2.0031800270080566, "learning_rate": 4.972450157092712e-05, "loss": 0.215, "num_input_tokens_seen": 37746704, "step": 17480 }, { "epoch": 2.8523654159869496, "grad_norm": 0.172691211104393, "learning_rate": 4.9723974416168316e-05, "loss": 0.0597, "num_input_tokens_seen": 37758000, "step": 17485 }, { "epoch": 2.8531810766721044, "grad_norm": 0.14804159104824066, "learning_rate": 4.9723446760346814e-05, "loss": 0.3526, "num_input_tokens_seen": 37768240, "step": 17490 }, { "epoch": 2.8539967373572592, "grad_norm": 0.49158141016960144, "learning_rate": 4.97229186034733e-05, "loss": 0.2322, "num_input_tokens_seen": 37778576, "step": 17495 }, { "epoch": 2.8548123980424145, "grad_norm": 0.24350574612617493, "learning_rate": 4.972238994555847e-05, "loss": 0.0833, "num_input_tokens_seen": 37788880, "step": 17500 }, { "epoch": 2.8556280587275693, "grad_norm": 0.09374912828207016, "learning_rate": 4.972186078661306e-05, "loss": 0.1164, "num_input_tokens_seen": 37799856, "step": 17505 }, { "epoch": 2.8564437194127246, "grad_norm": 0.2547605037689209, "learning_rate": 4.972133112664776e-05, "loss": 0.2012, "num_input_tokens_seen": 37809808, "step": 17510 }, { "epoch": 2.8572593800978794, "grad_norm": 1.7444710731506348, "learning_rate": 4.972080096567334e-05, "loss": 0.2397, "num_input_tokens_seen": 37819856, "step": 17515 }, { "epoch": 2.858075040783034, "grad_norm": 0.6035400629043579, "learning_rate": 4.972027030370052e-05, "loss": 0.176, "num_input_tokens_seen": 37831792, "step": 17520 }, { "epoch": 2.858890701468189, "grad_norm": 0.6006734371185303, "learning_rate": 4.9719739140740065e-05, "loss": 0.1197, "num_input_tokens_seen": 37843280, "step": 17525 }, { "epoch": 2.8597063621533443, "grad_norm": 0.6905244588851929, "learning_rate": 4.971920747680273e-05, "loss": 0.1214, "num_input_tokens_seen": 37854928, "step": 17530 }, { "epoch": 2.860522022838499, "grad_norm": 0.48908287286758423, "learning_rate": 4.971867531189931e-05, "loss": 0.1149, "num_input_tokens_seen": 37865424, "step": 17535 }, { "epoch": 2.8613376835236544, "grad_norm": 0.6208168864250183, "learning_rate": 4.971814264604057e-05, "loss": 0.1057, "num_input_tokens_seen": 37876624, "step": 17540 }, { "epoch": 2.862153344208809, "grad_norm": 0.7438675761222839, "learning_rate": 4.971760947923731e-05, "loss": 0.1738, "num_input_tokens_seen": 37886896, "step": 17545 }, { "epoch": 2.862969004893964, "grad_norm": 0.9491157531738281, "learning_rate": 4.971707581150033e-05, "loss": 0.2803, "num_input_tokens_seen": 37897424, "step": 17550 }, { "epoch": 2.863784665579119, "grad_norm": 0.20241615176200867, "learning_rate": 4.971654164284047e-05, "loss": 0.0819, "num_input_tokens_seen": 37908080, "step": 17555 }, { "epoch": 2.864600326264274, "grad_norm": 0.5238500833511353, "learning_rate": 4.971600697326854e-05, "loss": 0.0613, "num_input_tokens_seen": 37918768, "step": 17560 }, { "epoch": 2.865415986949429, "grad_norm": 0.541431725025177, "learning_rate": 4.971547180279535e-05, "loss": 0.2646, "num_input_tokens_seen": 37929488, "step": 17565 }, { "epoch": 2.866231647634584, "grad_norm": 0.40330231189727783, "learning_rate": 4.971493613143179e-05, "loss": 0.228, "num_input_tokens_seen": 37940560, "step": 17570 }, { "epoch": 2.867047308319739, "grad_norm": 0.5588533878326416, "learning_rate": 4.97143999591887e-05, "loss": 0.2109, "num_input_tokens_seen": 37951248, "step": 17575 }, { "epoch": 2.867862969004894, "grad_norm": 0.03585593029856682, "learning_rate": 4.9713863286076925e-05, "loss": 0.0537, "num_input_tokens_seen": 37961936, "step": 17580 }, { "epoch": 2.868678629690049, "grad_norm": 1.2905242443084717, "learning_rate": 4.9713326112107374e-05, "loss": 0.2112, "num_input_tokens_seen": 37972560, "step": 17585 }, { "epoch": 2.869494290375204, "grad_norm": 0.20599119365215302, "learning_rate": 4.9712788437290906e-05, "loss": 0.0784, "num_input_tokens_seen": 37984016, "step": 17590 }, { "epoch": 2.870309951060359, "grad_norm": 0.17814664542675018, "learning_rate": 4.971225026163844e-05, "loss": 0.0939, "num_input_tokens_seen": 37994896, "step": 17595 }, { "epoch": 2.871125611745514, "grad_norm": 0.3993019759654999, "learning_rate": 4.9711711585160864e-05, "loss": 0.1954, "num_input_tokens_seen": 38006832, "step": 17600 }, { "epoch": 2.8719412724306688, "grad_norm": 1.8993479013442993, "learning_rate": 4.971117240786911e-05, "loss": 0.1276, "num_input_tokens_seen": 38018832, "step": 17605 }, { "epoch": 2.8727569331158236, "grad_norm": 0.14130622148513794, "learning_rate": 4.9710632729774096e-05, "loss": 0.2304, "num_input_tokens_seen": 38029520, "step": 17610 }, { "epoch": 2.873572593800979, "grad_norm": 0.16661526262760162, "learning_rate": 4.971009255088676e-05, "loss": 0.1198, "num_input_tokens_seen": 38039568, "step": 17615 }, { "epoch": 2.8743882544861337, "grad_norm": 0.13063587248325348, "learning_rate": 4.970955187121806e-05, "loss": 0.124, "num_input_tokens_seen": 38051344, "step": 17620 }, { "epoch": 2.875203915171289, "grad_norm": 1.3845019340515137, "learning_rate": 4.970901069077893e-05, "loss": 0.1515, "num_input_tokens_seen": 38062448, "step": 17625 }, { "epoch": 2.8760195758564437, "grad_norm": 0.462797611951828, "learning_rate": 4.970846900958037e-05, "loss": 0.1182, "num_input_tokens_seen": 38073520, "step": 17630 }, { "epoch": 2.8768352365415986, "grad_norm": 0.733984112739563, "learning_rate": 4.9707926827633335e-05, "loss": 0.1587, "num_input_tokens_seen": 38084400, "step": 17635 }, { "epoch": 2.877650897226754, "grad_norm": 0.033170077949762344, "learning_rate": 4.970738414494881e-05, "loss": 0.2183, "num_input_tokens_seen": 38094288, "step": 17640 }, { "epoch": 2.8784665579119086, "grad_norm": 1.1569892168045044, "learning_rate": 4.970684096153781e-05, "loss": 0.0823, "num_input_tokens_seen": 38104752, "step": 17645 }, { "epoch": 2.8792822185970635, "grad_norm": 0.40013307332992554, "learning_rate": 4.9706297277411334e-05, "loss": 0.1307, "num_input_tokens_seen": 38113456, "step": 17650 }, { "epoch": 2.8800978792822187, "grad_norm": 0.34010282158851624, "learning_rate": 4.9705753092580395e-05, "loss": 0.1086, "num_input_tokens_seen": 38123888, "step": 17655 }, { "epoch": 2.8809135399673735, "grad_norm": 1.2232462167739868, "learning_rate": 4.9705208407056037e-05, "loss": 0.1231, "num_input_tokens_seen": 38134448, "step": 17660 }, { "epoch": 2.8817292006525284, "grad_norm": 2.612230062484741, "learning_rate": 4.970466322084929e-05, "loss": 0.2902, "num_input_tokens_seen": 38145104, "step": 17665 }, { "epoch": 2.8825448613376836, "grad_norm": 1.4757142066955566, "learning_rate": 4.9704117533971193e-05, "loss": 0.2283, "num_input_tokens_seen": 38155984, "step": 17670 }, { "epoch": 2.8833605220228384, "grad_norm": 1.136480450630188, "learning_rate": 4.970357134643283e-05, "loss": 0.1995, "num_input_tokens_seen": 38167408, "step": 17675 }, { "epoch": 2.8841761827079937, "grad_norm": 1.1027251482009888, "learning_rate": 4.970302465824525e-05, "loss": 0.0783, "num_input_tokens_seen": 38178096, "step": 17680 }, { "epoch": 2.8849918433931485, "grad_norm": 0.6670579314231873, "learning_rate": 4.970247746941953e-05, "loss": 0.0538, "num_input_tokens_seen": 38188304, "step": 17685 }, { "epoch": 2.8858075040783033, "grad_norm": 1.1266913414001465, "learning_rate": 4.970192977996677e-05, "loss": 0.1129, "num_input_tokens_seen": 38198512, "step": 17690 }, { "epoch": 2.886623164763458, "grad_norm": 0.1507110595703125, "learning_rate": 4.970138158989806e-05, "loss": 0.1224, "num_input_tokens_seen": 38209840, "step": 17695 }, { "epoch": 2.8874388254486134, "grad_norm": 0.25025051832199097, "learning_rate": 4.970083289922453e-05, "loss": 0.1172, "num_input_tokens_seen": 38219760, "step": 17700 }, { "epoch": 2.8882544861337682, "grad_norm": 0.42621979117393494, "learning_rate": 4.9700283707957275e-05, "loss": 0.0563, "num_input_tokens_seen": 38231376, "step": 17705 }, { "epoch": 2.8890701468189235, "grad_norm": 1.9409741163253784, "learning_rate": 4.969973401610744e-05, "loss": 0.2338, "num_input_tokens_seen": 38242096, "step": 17710 }, { "epoch": 2.8898858075040783, "grad_norm": 0.07411835342645645, "learning_rate": 4.969918382368616e-05, "loss": 0.0638, "num_input_tokens_seen": 38252624, "step": 17715 }, { "epoch": 2.890701468189233, "grad_norm": 0.9166484475135803, "learning_rate": 4.9698633130704586e-05, "loss": 0.1147, "num_input_tokens_seen": 38262864, "step": 17720 }, { "epoch": 2.8915171288743884, "grad_norm": 0.18061546981334686, "learning_rate": 4.969808193717388e-05, "loss": 0.0558, "num_input_tokens_seen": 38274640, "step": 17725 }, { "epoch": 2.892332789559543, "grad_norm": 0.07313041388988495, "learning_rate": 4.969753024310522e-05, "loss": 0.0417, "num_input_tokens_seen": 38286512, "step": 17730 }, { "epoch": 2.8931484502446985, "grad_norm": 0.03911302611231804, "learning_rate": 4.969697804850977e-05, "loss": 0.0864, "num_input_tokens_seen": 38297456, "step": 17735 }, { "epoch": 2.8939641109298533, "grad_norm": 1.1830817461013794, "learning_rate": 4.969642535339872e-05, "loss": 0.1573, "num_input_tokens_seen": 38308304, "step": 17740 }, { "epoch": 2.894779771615008, "grad_norm": 0.9044387936592102, "learning_rate": 4.9695872157783295e-05, "loss": 0.2721, "num_input_tokens_seen": 38318640, "step": 17745 }, { "epoch": 2.895595432300163, "grad_norm": 0.8858193159103394, "learning_rate": 4.969531846167469e-05, "loss": 0.0592, "num_input_tokens_seen": 38330320, "step": 17750 }, { "epoch": 2.896411092985318, "grad_norm": 0.3953441083431244, "learning_rate": 4.969476426508412e-05, "loss": 0.0559, "num_input_tokens_seen": 38340432, "step": 17755 }, { "epoch": 2.897226753670473, "grad_norm": 0.8888004422187805, "learning_rate": 4.969420956802284e-05, "loss": 0.1221, "num_input_tokens_seen": 38351760, "step": 17760 }, { "epoch": 2.8980424143556283, "grad_norm": 0.35999608039855957, "learning_rate": 4.9693654370502066e-05, "loss": 0.0621, "num_input_tokens_seen": 38361136, "step": 17765 }, { "epoch": 2.898858075040783, "grad_norm": 0.11340724676847458, "learning_rate": 4.969309867253306e-05, "loss": 0.0383, "num_input_tokens_seen": 38371408, "step": 17770 }, { "epoch": 2.899673735725938, "grad_norm": 1.1317050457000732, "learning_rate": 4.969254247412709e-05, "loss": 0.0972, "num_input_tokens_seen": 38383056, "step": 17775 }, { "epoch": 2.9004893964110927, "grad_norm": 0.34824642539024353, "learning_rate": 4.969198577529541e-05, "loss": 0.0707, "num_input_tokens_seen": 38393840, "step": 17780 }, { "epoch": 2.901305057096248, "grad_norm": 0.471093088388443, "learning_rate": 4.9691428576049326e-05, "loss": 0.0842, "num_input_tokens_seen": 38406096, "step": 17785 }, { "epoch": 2.902120717781403, "grad_norm": 0.2579663097858429, "learning_rate": 4.9690870876400115e-05, "loss": 0.1275, "num_input_tokens_seen": 38416912, "step": 17790 }, { "epoch": 2.902936378466558, "grad_norm": 1.6372424364089966, "learning_rate": 4.969031267635909e-05, "loss": 0.1349, "num_input_tokens_seen": 38428016, "step": 17795 }, { "epoch": 2.903752039151713, "grad_norm": 0.9729329347610474, "learning_rate": 4.968975397593754e-05, "loss": 0.1436, "num_input_tokens_seen": 38438480, "step": 17800 }, { "epoch": 2.9045676998368677, "grad_norm": 0.7580547332763672, "learning_rate": 4.9689194775146815e-05, "loss": 0.062, "num_input_tokens_seen": 38448112, "step": 17805 }, { "epoch": 2.905383360522023, "grad_norm": 0.7057550549507141, "learning_rate": 4.9688635073998235e-05, "loss": 0.2246, "num_input_tokens_seen": 38457872, "step": 17810 }, { "epoch": 2.9061990212071778, "grad_norm": 0.22652149200439453, "learning_rate": 4.9688074872503145e-05, "loss": 0.1503, "num_input_tokens_seen": 38469040, "step": 17815 }, { "epoch": 2.907014681892333, "grad_norm": 2.4358725547790527, "learning_rate": 4.96875141706729e-05, "loss": 0.2379, "num_input_tokens_seen": 38479344, "step": 17820 }, { "epoch": 2.907830342577488, "grad_norm": 1.733978033065796, "learning_rate": 4.968695296851886e-05, "loss": 0.1335, "num_input_tokens_seen": 38489552, "step": 17825 }, { "epoch": 2.9086460032626427, "grad_norm": 0.8782638311386108, "learning_rate": 4.9686391266052406e-05, "loss": 0.1666, "num_input_tokens_seen": 38500752, "step": 17830 }, { "epoch": 2.9094616639477975, "grad_norm": 1.6202337741851807, "learning_rate": 4.968582906328492e-05, "loss": 0.1749, "num_input_tokens_seen": 38512432, "step": 17835 }, { "epoch": 2.9102773246329527, "grad_norm": 0.026386253535747528, "learning_rate": 4.9685266360227775e-05, "loss": 0.1034, "num_input_tokens_seen": 38522256, "step": 17840 }, { "epoch": 2.9110929853181076, "grad_norm": 0.39469650387763977, "learning_rate": 4.9684703156892406e-05, "loss": 0.153, "num_input_tokens_seen": 38532784, "step": 17845 }, { "epoch": 2.911908646003263, "grad_norm": 1.8728200197219849, "learning_rate": 4.9684139453290204e-05, "loss": 0.2518, "num_input_tokens_seen": 38542640, "step": 17850 }, { "epoch": 2.9127243066884176, "grad_norm": 0.952305257320404, "learning_rate": 4.9683575249432614e-05, "loss": 0.1665, "num_input_tokens_seen": 38553008, "step": 17855 }, { "epoch": 2.9135399673735725, "grad_norm": 1.0594700574874878, "learning_rate": 4.9683010545331046e-05, "loss": 0.2553, "num_input_tokens_seen": 38565200, "step": 17860 }, { "epoch": 2.9143556280587277, "grad_norm": 0.26008105278015137, "learning_rate": 4.9682445340996966e-05, "loss": 0.1578, "num_input_tokens_seen": 38574736, "step": 17865 }, { "epoch": 2.9151712887438825, "grad_norm": 0.3352341651916504, "learning_rate": 4.9681879636441805e-05, "loss": 0.0369, "num_input_tokens_seen": 38584496, "step": 17870 }, { "epoch": 2.9159869494290374, "grad_norm": 0.0984460785984993, "learning_rate": 4.968131343167706e-05, "loss": 0.1314, "num_input_tokens_seen": 38594448, "step": 17875 }, { "epoch": 2.9168026101141926, "grad_norm": 0.15714547038078308, "learning_rate": 4.968074672671417e-05, "loss": 0.0824, "num_input_tokens_seen": 38604976, "step": 17880 }, { "epoch": 2.9176182707993474, "grad_norm": 0.1419895887374878, "learning_rate": 4.968017952156465e-05, "loss": 0.2476, "num_input_tokens_seen": 38615792, "step": 17885 }, { "epoch": 2.9184339314845023, "grad_norm": 1.4294267892837524, "learning_rate": 4.967961181623998e-05, "loss": 0.2107, "num_input_tokens_seen": 38626320, "step": 17890 }, { "epoch": 2.9192495921696575, "grad_norm": 0.5808171033859253, "learning_rate": 4.9679043610751664e-05, "loss": 0.1154, "num_input_tokens_seen": 38637008, "step": 17895 }, { "epoch": 2.9200652528548123, "grad_norm": 0.7101060748100281, "learning_rate": 4.967847490511123e-05, "loss": 0.0782, "num_input_tokens_seen": 38647056, "step": 17900 }, { "epoch": 2.9208809135399676, "grad_norm": 1.3668502569198608, "learning_rate": 4.967790569933019e-05, "loss": 0.4158, "num_input_tokens_seen": 38658512, "step": 17905 }, { "epoch": 2.9216965742251224, "grad_norm": 0.42118144035339355, "learning_rate": 4.9677335993420083e-05, "loss": 0.1487, "num_input_tokens_seen": 38669328, "step": 17910 }, { "epoch": 2.9225122349102772, "grad_norm": 0.5431349277496338, "learning_rate": 4.9676765787392466e-05, "loss": 0.1214, "num_input_tokens_seen": 38680048, "step": 17915 }, { "epoch": 2.923327895595432, "grad_norm": 0.3918423652648926, "learning_rate": 4.9676195081258876e-05, "loss": 0.2573, "num_input_tokens_seen": 38691952, "step": 17920 }, { "epoch": 2.9241435562805873, "grad_norm": 0.22365230321884155, "learning_rate": 4.967562387503089e-05, "loss": 0.0918, "num_input_tokens_seen": 38701872, "step": 17925 }, { "epoch": 2.924959216965742, "grad_norm": 0.06948385387659073, "learning_rate": 4.967505216872008e-05, "loss": 0.0528, "num_input_tokens_seen": 38713104, "step": 17930 }, { "epoch": 2.9257748776508974, "grad_norm": 0.22032999992370605, "learning_rate": 4.967447996233804e-05, "loss": 0.1798, "num_input_tokens_seen": 38723952, "step": 17935 }, { "epoch": 2.926590538336052, "grad_norm": 0.1738394945859909, "learning_rate": 4.967390725589637e-05, "loss": 0.0519, "num_input_tokens_seen": 38735216, "step": 17940 }, { "epoch": 2.927406199021207, "grad_norm": 0.8852603435516357, "learning_rate": 4.9673334049406657e-05, "loss": 0.1088, "num_input_tokens_seen": 38746032, "step": 17945 }, { "epoch": 2.9282218597063623, "grad_norm": 1.020583152770996, "learning_rate": 4.967276034288053e-05, "loss": 0.1789, "num_input_tokens_seen": 38756464, "step": 17950 }, { "epoch": 2.929037520391517, "grad_norm": 0.6498134732246399, "learning_rate": 4.967218613632962e-05, "loss": 0.1779, "num_input_tokens_seen": 38767952, "step": 17955 }, { "epoch": 2.9298531810766724, "grad_norm": 0.11423951387405396, "learning_rate": 4.9671611429765555e-05, "loss": 0.0766, "num_input_tokens_seen": 38779312, "step": 17960 }, { "epoch": 2.930668841761827, "grad_norm": 0.1320653259754181, "learning_rate": 4.967103622319998e-05, "loss": 0.0669, "num_input_tokens_seen": 38790576, "step": 17965 }, { "epoch": 2.931484502446982, "grad_norm": 0.48872342705726624, "learning_rate": 4.967046051664457e-05, "loss": 0.0969, "num_input_tokens_seen": 38802032, "step": 17970 }, { "epoch": 2.932300163132137, "grad_norm": 1.352181077003479, "learning_rate": 4.966988431011098e-05, "loss": 0.2659, "num_input_tokens_seen": 38812080, "step": 17975 }, { "epoch": 2.933115823817292, "grad_norm": 1.320924997329712, "learning_rate": 4.966930760361088e-05, "loss": 0.2081, "num_input_tokens_seen": 38823664, "step": 17980 }, { "epoch": 2.933931484502447, "grad_norm": 1.996290683746338, "learning_rate": 4.966873039715598e-05, "loss": 0.1269, "num_input_tokens_seen": 38833840, "step": 17985 }, { "epoch": 2.934747145187602, "grad_norm": 1.2550230026245117, "learning_rate": 4.966815269075795e-05, "loss": 0.2056, "num_input_tokens_seen": 38844400, "step": 17990 }, { "epoch": 2.935562805872757, "grad_norm": 0.2190944254398346, "learning_rate": 4.966757448442852e-05, "loss": 0.0939, "num_input_tokens_seen": 38855248, "step": 17995 }, { "epoch": 2.936378466557912, "grad_norm": 0.18698784708976746, "learning_rate": 4.96669957781794e-05, "loss": 0.1096, "num_input_tokens_seen": 38867472, "step": 18000 }, { "epoch": 2.9371941272430666, "grad_norm": 0.30852100253105164, "learning_rate": 4.9666416572022315e-05, "loss": 0.0824, "num_input_tokens_seen": 38878864, "step": 18005 }, { "epoch": 2.938009787928222, "grad_norm": 1.7624353170394897, "learning_rate": 4.9665836865969006e-05, "loss": 0.1924, "num_input_tokens_seen": 38890032, "step": 18010 }, { "epoch": 2.9388254486133767, "grad_norm": 0.23119784891605377, "learning_rate": 4.966525666003122e-05, "loss": 0.1167, "num_input_tokens_seen": 38900624, "step": 18015 }, { "epoch": 2.939641109298532, "grad_norm": 0.07830218225717545, "learning_rate": 4.966467595422072e-05, "loss": 0.0561, "num_input_tokens_seen": 38911984, "step": 18020 }, { "epoch": 2.9404567699836868, "grad_norm": 0.7828739285469055, "learning_rate": 4.9664094748549275e-05, "loss": 0.0805, "num_input_tokens_seen": 38922896, "step": 18025 }, { "epoch": 2.9412724306688416, "grad_norm": 2.3643782138824463, "learning_rate": 4.966351304302866e-05, "loss": 0.2748, "num_input_tokens_seen": 38933872, "step": 18030 }, { "epoch": 2.942088091353997, "grad_norm": 0.5619105100631714, "learning_rate": 4.966293083767067e-05, "loss": 0.0469, "num_input_tokens_seen": 38943088, "step": 18035 }, { "epoch": 2.9429037520391517, "grad_norm": 1.4021399021148682, "learning_rate": 4.966234813248709e-05, "loss": 0.2266, "num_input_tokens_seen": 38954064, "step": 18040 }, { "epoch": 2.943719412724307, "grad_norm": 0.5018907189369202, "learning_rate": 4.966176492748975e-05, "loss": 0.1006, "num_input_tokens_seen": 38965904, "step": 18045 }, { "epoch": 2.9445350734094617, "grad_norm": 0.6063344478607178, "learning_rate": 4.966118122269044e-05, "loss": 0.1199, "num_input_tokens_seen": 38976080, "step": 18050 }, { "epoch": 2.9453507340946166, "grad_norm": 0.15892519056797028, "learning_rate": 4.966059701810103e-05, "loss": 0.0924, "num_input_tokens_seen": 38987344, "step": 18055 }, { "epoch": 2.9461663947797714, "grad_norm": 3.2631032466888428, "learning_rate": 4.966001231373332e-05, "loss": 0.0979, "num_input_tokens_seen": 38999472, "step": 18060 }, { "epoch": 2.9469820554649266, "grad_norm": 0.16215908527374268, "learning_rate": 4.965942710959919e-05, "loss": 0.1358, "num_input_tokens_seen": 39010352, "step": 18065 }, { "epoch": 2.9477977161500815, "grad_norm": 0.13825465738773346, "learning_rate": 4.965884140571048e-05, "loss": 0.1404, "num_input_tokens_seen": 39020816, "step": 18070 }, { "epoch": 2.9486133768352367, "grad_norm": 0.7976029515266418, "learning_rate": 4.9658255202079065e-05, "loss": 0.1095, "num_input_tokens_seen": 39032112, "step": 18075 }, { "epoch": 2.9494290375203915, "grad_norm": 0.055461086332798004, "learning_rate": 4.965766849871683e-05, "loss": 0.0283, "num_input_tokens_seen": 39043536, "step": 18080 }, { "epoch": 2.9502446982055464, "grad_norm": 0.8388701677322388, "learning_rate": 4.965708129563565e-05, "loss": 0.1475, "num_input_tokens_seen": 39053232, "step": 18085 }, { "epoch": 2.9510603588907016, "grad_norm": 0.6103430986404419, "learning_rate": 4.9656493592847456e-05, "loss": 0.0418, "num_input_tokens_seen": 39064368, "step": 18090 }, { "epoch": 2.9518760195758564, "grad_norm": 0.44472604990005493, "learning_rate": 4.965590539036413e-05, "loss": 0.351, "num_input_tokens_seen": 39072560, "step": 18095 }, { "epoch": 2.9526916802610113, "grad_norm": 0.6626334190368652, "learning_rate": 4.965531668819761e-05, "loss": 0.1187, "num_input_tokens_seen": 39084080, "step": 18100 }, { "epoch": 2.9535073409461665, "grad_norm": 0.8074207901954651, "learning_rate": 4.965472748635982e-05, "loss": 0.1953, "num_input_tokens_seen": 39095056, "step": 18105 }, { "epoch": 2.9543230016313213, "grad_norm": 0.14162248373031616, "learning_rate": 4.96541377848627e-05, "loss": 0.1431, "num_input_tokens_seen": 39106000, "step": 18110 }, { "epoch": 2.955138662316476, "grad_norm": 0.9112270474433899, "learning_rate": 4.96535475837182e-05, "loss": 0.2385, "num_input_tokens_seen": 39115760, "step": 18115 }, { "epoch": 2.9559543230016314, "grad_norm": 2.6828341484069824, "learning_rate": 4.9652956882938274e-05, "loss": 0.2367, "num_input_tokens_seen": 39126160, "step": 18120 }, { "epoch": 2.9567699836867862, "grad_norm": 0.5949549674987793, "learning_rate": 4.965236568253491e-05, "loss": 0.1039, "num_input_tokens_seen": 39137264, "step": 18125 }, { "epoch": 2.9575856443719415, "grad_norm": 1.2245110273361206, "learning_rate": 4.9651773982520086e-05, "loss": 0.0869, "num_input_tokens_seen": 39147824, "step": 18130 }, { "epoch": 2.9584013050570963, "grad_norm": 0.2313842922449112, "learning_rate": 4.965118178290579e-05, "loss": 0.2, "num_input_tokens_seen": 39158768, "step": 18135 }, { "epoch": 2.959216965742251, "grad_norm": 0.5231760144233704, "learning_rate": 4.965058908370401e-05, "loss": 0.085, "num_input_tokens_seen": 39168752, "step": 18140 }, { "epoch": 2.960032626427406, "grad_norm": 0.2525261640548706, "learning_rate": 4.964999588492678e-05, "loss": 0.0306, "num_input_tokens_seen": 39180528, "step": 18145 }, { "epoch": 2.960848287112561, "grad_norm": 1.1965709924697876, "learning_rate": 4.9649402186586117e-05, "loss": 0.2914, "num_input_tokens_seen": 39192336, "step": 18150 }, { "epoch": 2.961663947797716, "grad_norm": 0.6340651512145996, "learning_rate": 4.964880798869404e-05, "loss": 0.117, "num_input_tokens_seen": 39203056, "step": 18155 }, { "epoch": 2.9624796084828713, "grad_norm": 0.38590261340141296, "learning_rate": 4.9648213291262605e-05, "loss": 0.1261, "num_input_tokens_seen": 39213104, "step": 18160 }, { "epoch": 2.963295269168026, "grad_norm": 1.4334486722946167, "learning_rate": 4.9647618094303856e-05, "loss": 0.2545, "num_input_tokens_seen": 39223792, "step": 18165 }, { "epoch": 2.964110929853181, "grad_norm": 2.429401159286499, "learning_rate": 4.9647022397829855e-05, "loss": 0.1348, "num_input_tokens_seen": 39235248, "step": 18170 }, { "epoch": 2.964926590538336, "grad_norm": 0.05691371485590935, "learning_rate": 4.964642620185269e-05, "loss": 0.1462, "num_input_tokens_seen": 39245424, "step": 18175 }, { "epoch": 2.965742251223491, "grad_norm": 0.40623217821121216, "learning_rate": 4.964582950638442e-05, "loss": 0.2018, "num_input_tokens_seen": 39255344, "step": 18180 }, { "epoch": 2.9665579119086463, "grad_norm": 1.3663816452026367, "learning_rate": 4.964523231143715e-05, "loss": 0.1213, "num_input_tokens_seen": 39266192, "step": 18185 }, { "epoch": 2.967373572593801, "grad_norm": 0.9745461940765381, "learning_rate": 4.9644634617022996e-05, "loss": 0.1976, "num_input_tokens_seen": 39277040, "step": 18190 }, { "epoch": 2.968189233278956, "grad_norm": 1.916093349456787, "learning_rate": 4.964403642315405e-05, "loss": 0.1916, "num_input_tokens_seen": 39286160, "step": 18195 }, { "epoch": 2.9690048939641107, "grad_norm": 0.30416393280029297, "learning_rate": 4.964343772984244e-05, "loss": 0.0575, "num_input_tokens_seen": 39296496, "step": 18200 }, { "epoch": 2.969820554649266, "grad_norm": 1.347125768661499, "learning_rate": 4.96428385371003e-05, "loss": 0.1664, "num_input_tokens_seen": 39306672, "step": 18205 }, { "epoch": 2.970636215334421, "grad_norm": 0.11967674642801285, "learning_rate": 4.964223884493978e-05, "loss": 0.0357, "num_input_tokens_seen": 39316816, "step": 18210 }, { "epoch": 2.971451876019576, "grad_norm": 0.5783727765083313, "learning_rate": 4.9641638653373026e-05, "loss": 0.1618, "num_input_tokens_seen": 39327984, "step": 18215 }, { "epoch": 2.972267536704731, "grad_norm": 0.45981982350349426, "learning_rate": 4.964103796241221e-05, "loss": 0.1308, "num_input_tokens_seen": 39338832, "step": 18220 }, { "epoch": 2.9730831973898857, "grad_norm": 0.5367425084114075, "learning_rate": 4.96404367720695e-05, "loss": 0.1842, "num_input_tokens_seen": 39349104, "step": 18225 }, { "epoch": 2.9738988580750405, "grad_norm": 1.535853385925293, "learning_rate": 4.9639835082357076e-05, "loss": 0.3555, "num_input_tokens_seen": 39358640, "step": 18230 }, { "epoch": 2.9747145187601958, "grad_norm": 0.24178457260131836, "learning_rate": 4.963923289328714e-05, "loss": 0.0967, "num_input_tokens_seen": 39368688, "step": 18235 }, { "epoch": 2.9755301794453506, "grad_norm": 0.9747298359870911, "learning_rate": 4.9638630204871884e-05, "loss": 0.1158, "num_input_tokens_seen": 39378224, "step": 18240 }, { "epoch": 2.976345840130506, "grad_norm": 0.43444138765335083, "learning_rate": 4.963802701712353e-05, "loss": 0.0844, "num_input_tokens_seen": 39388368, "step": 18245 }, { "epoch": 2.9771615008156607, "grad_norm": 1.0829076766967773, "learning_rate": 4.963742333005431e-05, "loss": 0.1379, "num_input_tokens_seen": 39399120, "step": 18250 }, { "epoch": 2.9779771615008155, "grad_norm": 0.3717573881149292, "learning_rate": 4.963681914367645e-05, "loss": 0.1073, "num_input_tokens_seen": 39408560, "step": 18255 }, { "epoch": 2.9787928221859707, "grad_norm": 0.4161696135997772, "learning_rate": 4.9636214458002196e-05, "loss": 0.1275, "num_input_tokens_seen": 39419664, "step": 18260 }, { "epoch": 2.9796084828711256, "grad_norm": 0.9309355616569519, "learning_rate": 4.96356092730438e-05, "loss": 0.0719, "num_input_tokens_seen": 39430480, "step": 18265 }, { "epoch": 2.980424143556281, "grad_norm": 0.35328808426856995, "learning_rate": 4.9635003588813533e-05, "loss": 0.0926, "num_input_tokens_seen": 39441008, "step": 18270 }, { "epoch": 2.9812398042414356, "grad_norm": 0.7587143778800964, "learning_rate": 4.9634397405323675e-05, "loss": 0.0862, "num_input_tokens_seen": 39451984, "step": 18275 }, { "epoch": 2.9820554649265905, "grad_norm": 1.4651280641555786, "learning_rate": 4.963379072258649e-05, "loss": 0.2851, "num_input_tokens_seen": 39461904, "step": 18280 }, { "epoch": 2.9828711256117453, "grad_norm": 0.43235617876052856, "learning_rate": 4.963318354061429e-05, "loss": 0.049, "num_input_tokens_seen": 39472656, "step": 18285 }, { "epoch": 2.9836867862969005, "grad_norm": 0.11779060959815979, "learning_rate": 4.9632575859419376e-05, "loss": 0.2132, "num_input_tokens_seen": 39483376, "step": 18290 }, { "epoch": 2.9845024469820554, "grad_norm": 0.34661757946014404, "learning_rate": 4.963196767901406e-05, "loss": 0.0944, "num_input_tokens_seen": 39496144, "step": 18295 }, { "epoch": 2.9853181076672106, "grad_norm": 0.40790992975234985, "learning_rate": 4.963135899941068e-05, "loss": 0.2089, "num_input_tokens_seen": 39506128, "step": 18300 }, { "epoch": 2.9861337683523654, "grad_norm": 1.2800570726394653, "learning_rate": 4.963074982062155e-05, "loss": 0.4316, "num_input_tokens_seen": 39515440, "step": 18305 }, { "epoch": 2.9869494290375203, "grad_norm": 0.14346164464950562, "learning_rate": 4.9630140142659045e-05, "loss": 0.0462, "num_input_tokens_seen": 39526512, "step": 18310 }, { "epoch": 2.9877650897226755, "grad_norm": 0.050717543810606, "learning_rate": 4.9629529965535495e-05, "loss": 0.0267, "num_input_tokens_seen": 39537136, "step": 18315 }, { "epoch": 2.9885807504078303, "grad_norm": 0.5006159543991089, "learning_rate": 4.962891928926327e-05, "loss": 0.0924, "num_input_tokens_seen": 39545808, "step": 18320 }, { "epoch": 2.9893964110929856, "grad_norm": 0.6888518333435059, "learning_rate": 4.962830811385476e-05, "loss": 0.0716, "num_input_tokens_seen": 39558000, "step": 18325 }, { "epoch": 2.9902120717781404, "grad_norm": 0.47732266783714294, "learning_rate": 4.9627696439322334e-05, "loss": 0.1945, "num_input_tokens_seen": 39570000, "step": 18330 }, { "epoch": 2.9910277324632952, "grad_norm": 0.2922027111053467, "learning_rate": 4.9627084265678404e-05, "loss": 0.1392, "num_input_tokens_seen": 39580560, "step": 18335 }, { "epoch": 2.99184339314845, "grad_norm": 0.7589938044548035, "learning_rate": 4.962647159293537e-05, "loss": 0.041, "num_input_tokens_seen": 39591408, "step": 18340 }, { "epoch": 2.9926590538336053, "grad_norm": 1.425829529762268, "learning_rate": 4.962585842110565e-05, "loss": 0.1073, "num_input_tokens_seen": 39601872, "step": 18345 }, { "epoch": 2.99347471451876, "grad_norm": 0.5289862155914307, "learning_rate": 4.9625244750201666e-05, "loss": 0.0899, "num_input_tokens_seen": 39613008, "step": 18350 }, { "epoch": 2.9942903752039154, "grad_norm": 0.8151041865348816, "learning_rate": 4.962463058023585e-05, "loss": 0.1116, "num_input_tokens_seen": 39624272, "step": 18355 }, { "epoch": 2.99510603588907, "grad_norm": 0.2872672379016876, "learning_rate": 4.962401591122067e-05, "loss": 0.0813, "num_input_tokens_seen": 39635216, "step": 18360 }, { "epoch": 2.995921696574225, "grad_norm": 0.4721495509147644, "learning_rate": 4.962340074316857e-05, "loss": 0.1146, "num_input_tokens_seen": 39645808, "step": 18365 }, { "epoch": 2.99673735725938, "grad_norm": 0.3734080493450165, "learning_rate": 4.9622785076092006e-05, "loss": 0.1426, "num_input_tokens_seen": 39656560, "step": 18370 }, { "epoch": 2.997553017944535, "grad_norm": 1.079584002494812, "learning_rate": 4.962216891000348e-05, "loss": 0.0768, "num_input_tokens_seen": 39666704, "step": 18375 }, { "epoch": 2.99836867862969, "grad_norm": 1.4916282892227173, "learning_rate": 4.9621552244915445e-05, "loss": 0.1072, "num_input_tokens_seen": 39677008, "step": 18380 }, { "epoch": 2.999184339314845, "grad_norm": 0.46853378415107727, "learning_rate": 4.962093508084044e-05, "loss": 0.0382, "num_input_tokens_seen": 39686416, "step": 18385 }, { "epoch": 3.0, "grad_norm": 0.014542927034199238, "learning_rate": 4.962031741779094e-05, "loss": 0.133, "num_input_tokens_seen": 39694112, "step": 18390 }, { "epoch": 3.0, "eval_loss": 0.14694160223007202, "eval_runtime": 90.6098, "eval_samples_per_second": 30.074, "eval_steps_per_second": 7.527, "num_input_tokens_seen": 39694112, "step": 18390 }, { "epoch": 3.000815660685155, "grad_norm": 0.09970025718212128, "learning_rate": 4.961969925577948e-05, "loss": 0.0211, "num_input_tokens_seen": 39703936, "step": 18395 }, { "epoch": 3.00163132137031, "grad_norm": 0.10174940526485443, "learning_rate": 4.961908059481858e-05, "loss": 0.0756, "num_input_tokens_seen": 39715968, "step": 18400 }, { "epoch": 3.002446982055465, "grad_norm": 0.37441056966781616, "learning_rate": 4.9618461434920775e-05, "loss": 0.1189, "num_input_tokens_seen": 39726912, "step": 18405 }, { "epoch": 3.0032626427406197, "grad_norm": 1.5522252321243286, "learning_rate": 4.961784177609863e-05, "loss": 0.2491, "num_input_tokens_seen": 39737504, "step": 18410 }, { "epoch": 3.004078303425775, "grad_norm": 2.109215259552002, "learning_rate": 4.961722161836469e-05, "loss": 0.3743, "num_input_tokens_seen": 39749792, "step": 18415 }, { "epoch": 3.00489396411093, "grad_norm": 1.2134950160980225, "learning_rate": 4.961660096173151e-05, "loss": 0.0663, "num_input_tokens_seen": 39760928, "step": 18420 }, { "epoch": 3.0057096247960846, "grad_norm": 1.4583772420883179, "learning_rate": 4.9615979806211695e-05, "loss": 0.0734, "num_input_tokens_seen": 39772096, "step": 18425 }, { "epoch": 3.00652528548124, "grad_norm": 0.37403926253318787, "learning_rate": 4.961535815181781e-05, "loss": 0.0787, "num_input_tokens_seen": 39782528, "step": 18430 }, { "epoch": 3.0073409461663947, "grad_norm": 0.17740899324417114, "learning_rate": 4.961473599856248e-05, "loss": 0.0127, "num_input_tokens_seen": 39794336, "step": 18435 }, { "epoch": 3.00815660685155, "grad_norm": 1.5574673414230347, "learning_rate": 4.9614113346458294e-05, "loss": 0.2343, "num_input_tokens_seen": 39805376, "step": 18440 }, { "epoch": 3.0089722675367048, "grad_norm": 2.0178678035736084, "learning_rate": 4.961349019551788e-05, "loss": 0.1425, "num_input_tokens_seen": 39816768, "step": 18445 }, { "epoch": 3.0097879282218596, "grad_norm": 0.16500937938690186, "learning_rate": 4.961286654575385e-05, "loss": 0.1977, "num_input_tokens_seen": 39827520, "step": 18450 }, { "epoch": 3.010603588907015, "grad_norm": 0.3473271131515503, "learning_rate": 4.9612242397178866e-05, "loss": 0.155, "num_input_tokens_seen": 39839072, "step": 18455 }, { "epoch": 3.0114192495921697, "grad_norm": 0.4564594626426697, "learning_rate": 4.961161774980557e-05, "loss": 0.1123, "num_input_tokens_seen": 39850432, "step": 18460 }, { "epoch": 3.0122349102773245, "grad_norm": 0.06909140199422836, "learning_rate": 4.961099260364661e-05, "loss": 0.0917, "num_input_tokens_seen": 39861600, "step": 18465 }, { "epoch": 3.0130505709624797, "grad_norm": 0.9588829874992371, "learning_rate": 4.9610366958714664e-05, "loss": 0.1075, "num_input_tokens_seen": 39872288, "step": 18470 }, { "epoch": 3.0138662316476346, "grad_norm": 0.09918234497308731, "learning_rate": 4.9609740815022423e-05, "loss": 0.2916, "num_input_tokens_seen": 39881984, "step": 18475 }, { "epoch": 3.0146818923327894, "grad_norm": 0.05905257165431976, "learning_rate": 4.960911417258255e-05, "loss": 0.0675, "num_input_tokens_seen": 39893184, "step": 18480 }, { "epoch": 3.0154975530179446, "grad_norm": 0.19324755668640137, "learning_rate": 4.960848703140777e-05, "loss": 0.0414, "num_input_tokens_seen": 39903200, "step": 18485 }, { "epoch": 3.0163132137030995, "grad_norm": 0.2803761065006256, "learning_rate": 4.960785939151077e-05, "loss": 0.1311, "num_input_tokens_seen": 39913728, "step": 18490 }, { "epoch": 3.0171288743882543, "grad_norm": 0.9363030195236206, "learning_rate": 4.960723125290429e-05, "loss": 0.0647, "num_input_tokens_seen": 39924576, "step": 18495 }, { "epoch": 3.0179445350734095, "grad_norm": 1.010617733001709, "learning_rate": 4.960660261560105e-05, "loss": 0.1482, "num_input_tokens_seen": 39935808, "step": 18500 }, { "epoch": 3.0187601957585644, "grad_norm": 0.20309537649154663, "learning_rate": 4.96059734796138e-05, "loss": 0.1308, "num_input_tokens_seen": 39946592, "step": 18505 }, { "epoch": 3.0195758564437196, "grad_norm": 0.23624058067798615, "learning_rate": 4.960534384495528e-05, "loss": 0.0509, "num_input_tokens_seen": 39958592, "step": 18510 }, { "epoch": 3.0203915171288744, "grad_norm": 0.9581770300865173, "learning_rate": 4.960471371163824e-05, "loss": 0.1483, "num_input_tokens_seen": 39969760, "step": 18515 }, { "epoch": 3.0212071778140293, "grad_norm": 0.7092381715774536, "learning_rate": 4.960408307967548e-05, "loss": 0.115, "num_input_tokens_seen": 39980064, "step": 18520 }, { "epoch": 3.0220228384991845, "grad_norm": 1.3015117645263672, "learning_rate": 4.960345194907975e-05, "loss": 0.1053, "num_input_tokens_seen": 39991840, "step": 18525 }, { "epoch": 3.0228384991843393, "grad_norm": 0.04057583585381508, "learning_rate": 4.9602820319863866e-05, "loss": 0.1121, "num_input_tokens_seen": 40002912, "step": 18530 }, { "epoch": 3.023654159869494, "grad_norm": 0.18497052788734436, "learning_rate": 4.960218819204061e-05, "loss": 0.2058, "num_input_tokens_seen": 40014624, "step": 18535 }, { "epoch": 3.0244698205546494, "grad_norm": 0.5895796418190002, "learning_rate": 4.9601555565622805e-05, "loss": 0.2773, "num_input_tokens_seen": 40025408, "step": 18540 }, { "epoch": 3.0252854812398042, "grad_norm": 0.7431747317314148, "learning_rate": 4.960092244062327e-05, "loss": 0.1229, "num_input_tokens_seen": 40036352, "step": 18545 }, { "epoch": 3.026101141924959, "grad_norm": 0.46731722354888916, "learning_rate": 4.960028881705482e-05, "loss": 0.0842, "num_input_tokens_seen": 40045888, "step": 18550 }, { "epoch": 3.0269168026101143, "grad_norm": 0.08565615117549896, "learning_rate": 4.959965469493032e-05, "loss": 0.1533, "num_input_tokens_seen": 40056928, "step": 18555 }, { "epoch": 3.027732463295269, "grad_norm": 0.12781494855880737, "learning_rate": 4.95990200742626e-05, "loss": 0.0441, "num_input_tokens_seen": 40067456, "step": 18560 }, { "epoch": 3.028548123980424, "grad_norm": 0.17192411422729492, "learning_rate": 4.9598384955064545e-05, "loss": 0.1183, "num_input_tokens_seen": 40077760, "step": 18565 }, { "epoch": 3.029363784665579, "grad_norm": 0.4375765919685364, "learning_rate": 4.9597749337349006e-05, "loss": 0.0795, "num_input_tokens_seen": 40089376, "step": 18570 }, { "epoch": 3.030179445350734, "grad_norm": 1.3411352634429932, "learning_rate": 4.9597113221128876e-05, "loss": 0.1154, "num_input_tokens_seen": 40100544, "step": 18575 }, { "epoch": 3.0309951060358893, "grad_norm": 0.15594948828220367, "learning_rate": 4.959647660641704e-05, "loss": 0.0261, "num_input_tokens_seen": 40111616, "step": 18580 }, { "epoch": 3.031810766721044, "grad_norm": 0.4085423946380615, "learning_rate": 4.959583949322641e-05, "loss": 0.1967, "num_input_tokens_seen": 40123712, "step": 18585 }, { "epoch": 3.032626427406199, "grad_norm": 0.5179871320724487, "learning_rate": 4.959520188156989e-05, "loss": 0.053, "num_input_tokens_seen": 40134976, "step": 18590 }, { "epoch": 3.033442088091354, "grad_norm": 1.5645172595977783, "learning_rate": 4.959456377146039e-05, "loss": 0.1607, "num_input_tokens_seen": 40144992, "step": 18595 }, { "epoch": 3.034257748776509, "grad_norm": 0.03547582030296326, "learning_rate": 4.9593925162910865e-05, "loss": 0.0495, "num_input_tokens_seen": 40156928, "step": 18600 }, { "epoch": 3.035073409461664, "grad_norm": 0.4290485680103302, "learning_rate": 4.959328605593425e-05, "loss": 0.0932, "num_input_tokens_seen": 40167264, "step": 18605 }, { "epoch": 3.035889070146819, "grad_norm": 0.8473891615867615, "learning_rate": 4.959264645054348e-05, "loss": 0.2198, "num_input_tokens_seen": 40178944, "step": 18610 }, { "epoch": 3.036704730831974, "grad_norm": 0.5067782998085022, "learning_rate": 4.959200634675154e-05, "loss": 0.2029, "num_input_tokens_seen": 40190240, "step": 18615 }, { "epoch": 3.0375203915171287, "grad_norm": 1.317222237586975, "learning_rate": 4.95913657445714e-05, "loss": 0.1336, "num_input_tokens_seen": 40201696, "step": 18620 }, { "epoch": 3.038336052202284, "grad_norm": 0.2292579561471939, "learning_rate": 4.959072464401603e-05, "loss": 0.225, "num_input_tokens_seen": 40211424, "step": 18625 }, { "epoch": 3.039151712887439, "grad_norm": 1.4080429077148438, "learning_rate": 4.959008304509843e-05, "loss": 0.1348, "num_input_tokens_seen": 40221888, "step": 18630 }, { "epoch": 3.0399673735725936, "grad_norm": 0.15380558371543884, "learning_rate": 4.958944094783161e-05, "loss": 0.0629, "num_input_tokens_seen": 40233120, "step": 18635 }, { "epoch": 3.040783034257749, "grad_norm": 1.8761152029037476, "learning_rate": 4.9588798352228564e-05, "loss": 0.1288, "num_input_tokens_seen": 40243104, "step": 18640 }, { "epoch": 3.0415986949429037, "grad_norm": 0.0825396403670311, "learning_rate": 4.9588155258302336e-05, "loss": 0.143, "num_input_tokens_seen": 40253344, "step": 18645 }, { "epoch": 3.0424143556280585, "grad_norm": 1.484968900680542, "learning_rate": 4.9587511666065946e-05, "loss": 0.2884, "num_input_tokens_seen": 40265280, "step": 18650 }, { "epoch": 3.0432300163132138, "grad_norm": 0.7431877255439758, "learning_rate": 4.9586867575532434e-05, "loss": 0.1044, "num_input_tokens_seen": 40276224, "step": 18655 }, { "epoch": 3.0440456769983686, "grad_norm": 0.15280628204345703, "learning_rate": 4.958622298671487e-05, "loss": 0.2013, "num_input_tokens_seen": 40287168, "step": 18660 }, { "epoch": 3.044861337683524, "grad_norm": 0.879505455493927, "learning_rate": 4.95855778996263e-05, "loss": 0.0557, "num_input_tokens_seen": 40297184, "step": 18665 }, { "epoch": 3.0456769983686787, "grad_norm": 1.034510612487793, "learning_rate": 4.9584932314279806e-05, "loss": 0.0752, "num_input_tokens_seen": 40308128, "step": 18670 }, { "epoch": 3.0464926590538335, "grad_norm": 0.6935213804244995, "learning_rate": 4.958428623068847e-05, "loss": 0.0937, "num_input_tokens_seen": 40320352, "step": 18675 }, { "epoch": 3.0473083197389887, "grad_norm": 0.6650720834732056, "learning_rate": 4.9583639648865385e-05, "loss": 0.1882, "num_input_tokens_seen": 40331424, "step": 18680 }, { "epoch": 3.0481239804241436, "grad_norm": 0.38172948360443115, "learning_rate": 4.958299256882367e-05, "loss": 0.0801, "num_input_tokens_seen": 40343264, "step": 18685 }, { "epoch": 3.0489396411092984, "grad_norm": 1.1209537982940674, "learning_rate": 4.958234499057641e-05, "loss": 0.1159, "num_input_tokens_seen": 40356608, "step": 18690 }, { "epoch": 3.0497553017944536, "grad_norm": 1.4076194763183594, "learning_rate": 4.958169691413674e-05, "loss": 0.1651, "num_input_tokens_seen": 40367456, "step": 18695 }, { "epoch": 3.0505709624796085, "grad_norm": 1.1534968614578247, "learning_rate": 4.958104833951781e-05, "loss": 0.0898, "num_input_tokens_seen": 40377728, "step": 18700 }, { "epoch": 3.0513866231647633, "grad_norm": 0.7264713644981384, "learning_rate": 4.958039926673275e-05, "loss": 0.0986, "num_input_tokens_seen": 40389184, "step": 18705 }, { "epoch": 3.0522022838499185, "grad_norm": 0.717922568321228, "learning_rate": 4.9579749695794716e-05, "loss": 0.0868, "num_input_tokens_seen": 40400512, "step": 18710 }, { "epoch": 3.0530179445350734, "grad_norm": 0.6956278085708618, "learning_rate": 4.957909962671687e-05, "loss": 0.1163, "num_input_tokens_seen": 40411008, "step": 18715 }, { "epoch": 3.053833605220228, "grad_norm": 1.5566141605377197, "learning_rate": 4.9578449059512386e-05, "loss": 0.1318, "num_input_tokens_seen": 40422528, "step": 18720 }, { "epoch": 3.0546492659053834, "grad_norm": 0.03294513374567032, "learning_rate": 4.9577797994194456e-05, "loss": 0.0973, "num_input_tokens_seen": 40432384, "step": 18725 }, { "epoch": 3.0554649265905383, "grad_norm": 0.15924644470214844, "learning_rate": 4.9577146430776274e-05, "loss": 0.2073, "num_input_tokens_seen": 40443424, "step": 18730 }, { "epoch": 3.0562805872756935, "grad_norm": 0.11434194445610046, "learning_rate": 4.957649436927104e-05, "loss": 0.0539, "num_input_tokens_seen": 40453760, "step": 18735 }, { "epoch": 3.0570962479608483, "grad_norm": 1.5330877304077148, "learning_rate": 4.957584180969197e-05, "loss": 0.119, "num_input_tokens_seen": 40464832, "step": 18740 }, { "epoch": 3.057911908646003, "grad_norm": 3.7938413619995117, "learning_rate": 4.957518875205228e-05, "loss": 0.2733, "num_input_tokens_seen": 40475136, "step": 18745 }, { "epoch": 3.0587275693311584, "grad_norm": 1.1684212684631348, "learning_rate": 4.957453519636522e-05, "loss": 0.2401, "num_input_tokens_seen": 40485856, "step": 18750 }, { "epoch": 3.0595432300163132, "grad_norm": 0.35272589325904846, "learning_rate": 4.957388114264403e-05, "loss": 0.1501, "num_input_tokens_seen": 40497120, "step": 18755 }, { "epoch": 3.060358890701468, "grad_norm": 0.7713191509246826, "learning_rate": 4.957322659090197e-05, "loss": 0.0888, "num_input_tokens_seen": 40508928, "step": 18760 }, { "epoch": 3.0611745513866233, "grad_norm": 0.17858757078647614, "learning_rate": 4.957257154115229e-05, "loss": 0.1961, "num_input_tokens_seen": 40519904, "step": 18765 }, { "epoch": 3.061990212071778, "grad_norm": 1.906164288520813, "learning_rate": 4.957191599340828e-05, "loss": 0.1997, "num_input_tokens_seen": 40529568, "step": 18770 }, { "epoch": 3.062805872756933, "grad_norm": 0.7929359078407288, "learning_rate": 4.957125994768322e-05, "loss": 0.1267, "num_input_tokens_seen": 40540512, "step": 18775 }, { "epoch": 3.063621533442088, "grad_norm": 0.19163629412651062, "learning_rate": 4.957060340399041e-05, "loss": 0.151, "num_input_tokens_seen": 40550592, "step": 18780 }, { "epoch": 3.064437194127243, "grad_norm": 0.5993306040763855, "learning_rate": 4.9569946362343145e-05, "loss": 0.0685, "num_input_tokens_seen": 40561824, "step": 18785 }, { "epoch": 3.065252854812398, "grad_norm": 0.5070936679840088, "learning_rate": 4.9569288822754754e-05, "loss": 0.1094, "num_input_tokens_seen": 40572416, "step": 18790 }, { "epoch": 3.066068515497553, "grad_norm": 1.472542405128479, "learning_rate": 4.9568630785238554e-05, "loss": 0.2282, "num_input_tokens_seen": 40583712, "step": 18795 }, { "epoch": 3.066884176182708, "grad_norm": 1.4970425367355347, "learning_rate": 4.956797224980788e-05, "loss": 0.1888, "num_input_tokens_seen": 40594560, "step": 18800 }, { "epoch": 3.067699836867863, "grad_norm": 1.3525822162628174, "learning_rate": 4.956731321647609e-05, "loss": 0.1417, "num_input_tokens_seen": 40605536, "step": 18805 }, { "epoch": 3.068515497553018, "grad_norm": 0.618307888507843, "learning_rate": 4.9566653685256524e-05, "loss": 0.1054, "num_input_tokens_seen": 40615328, "step": 18810 }, { "epoch": 3.069331158238173, "grad_norm": 0.892562210559845, "learning_rate": 4.9565993656162564e-05, "loss": 0.0909, "num_input_tokens_seen": 40625504, "step": 18815 }, { "epoch": 3.070146818923328, "grad_norm": 1.407273769378662, "learning_rate": 4.956533312920757e-05, "loss": 0.2421, "num_input_tokens_seen": 40636896, "step": 18820 }, { "epoch": 3.070962479608483, "grad_norm": 1.246140956878662, "learning_rate": 4.956467210440494e-05, "loss": 0.2121, "num_input_tokens_seen": 40647456, "step": 18825 }, { "epoch": 3.0717781402936377, "grad_norm": 0.6678705215454102, "learning_rate": 4.9564010581768075e-05, "loss": 0.1353, "num_input_tokens_seen": 40656832, "step": 18830 }, { "epoch": 3.072593800978793, "grad_norm": 1.0923020839691162, "learning_rate": 4.956334856131036e-05, "loss": 0.0735, "num_input_tokens_seen": 40666752, "step": 18835 }, { "epoch": 3.073409461663948, "grad_norm": 0.9662344455718994, "learning_rate": 4.956268604304524e-05, "loss": 0.0685, "num_input_tokens_seen": 40678368, "step": 18840 }, { "epoch": 3.0742251223491026, "grad_norm": 2.002080202102661, "learning_rate": 4.956202302698612e-05, "loss": 0.2336, "num_input_tokens_seen": 40689632, "step": 18845 }, { "epoch": 3.075040783034258, "grad_norm": 0.7028719782829285, "learning_rate": 4.956135951314644e-05, "loss": 0.0888, "num_input_tokens_seen": 40700576, "step": 18850 }, { "epoch": 3.0758564437194127, "grad_norm": 1.174317479133606, "learning_rate": 4.9560695501539654e-05, "loss": 0.1736, "num_input_tokens_seen": 40711488, "step": 18855 }, { "epoch": 3.0766721044045675, "grad_norm": 0.5433820486068726, "learning_rate": 4.956003099217922e-05, "loss": 0.1631, "num_input_tokens_seen": 40720832, "step": 18860 }, { "epoch": 3.0774877650897228, "grad_norm": 2.3543527126312256, "learning_rate": 4.95593659850786e-05, "loss": 0.1775, "num_input_tokens_seen": 40731200, "step": 18865 }, { "epoch": 3.0783034257748776, "grad_norm": 1.165195345878601, "learning_rate": 4.9558700480251266e-05, "loss": 0.0976, "num_input_tokens_seen": 40742432, "step": 18870 }, { "epoch": 3.0791190864600324, "grad_norm": 0.24401745200157166, "learning_rate": 4.955803447771072e-05, "loss": 0.2623, "num_input_tokens_seen": 40753856, "step": 18875 }, { "epoch": 3.0799347471451877, "grad_norm": 0.6391231417655945, "learning_rate": 4.955736797747045e-05, "loss": 0.1685, "num_input_tokens_seen": 40762976, "step": 18880 }, { "epoch": 3.0807504078303425, "grad_norm": 1.5304064750671387, "learning_rate": 4.955670097954396e-05, "loss": 0.1288, "num_input_tokens_seen": 40773504, "step": 18885 }, { "epoch": 3.0815660685154977, "grad_norm": 0.2747229337692261, "learning_rate": 4.9556033483944775e-05, "loss": 0.0283, "num_input_tokens_seen": 40784576, "step": 18890 }, { "epoch": 3.0823817292006526, "grad_norm": 0.27506884932518005, "learning_rate": 4.9555365490686424e-05, "loss": 0.0401, "num_input_tokens_seen": 40794240, "step": 18895 }, { "epoch": 3.0831973898858074, "grad_norm": 0.0753101110458374, "learning_rate": 4.9554696999782435e-05, "loss": 0.0742, "num_input_tokens_seen": 40805600, "step": 18900 }, { "epoch": 3.0840130505709626, "grad_norm": 0.42316845059394836, "learning_rate": 4.9554028011246365e-05, "loss": 0.1119, "num_input_tokens_seen": 40816352, "step": 18905 }, { "epoch": 3.0848287112561175, "grad_norm": 0.3635318875312805, "learning_rate": 4.955335852509177e-05, "loss": 0.0683, "num_input_tokens_seen": 40826720, "step": 18910 }, { "epoch": 3.0856443719412723, "grad_norm": 0.9209347367286682, "learning_rate": 4.95526885413322e-05, "loss": 0.2152, "num_input_tokens_seen": 40837216, "step": 18915 }, { "epoch": 3.0864600326264275, "grad_norm": 2.3221330642700195, "learning_rate": 4.955201805998127e-05, "loss": 0.0884, "num_input_tokens_seen": 40847904, "step": 18920 }, { "epoch": 3.0872756933115824, "grad_norm": 0.2187131941318512, "learning_rate": 4.955134708105253e-05, "loss": 0.1594, "num_input_tokens_seen": 40858304, "step": 18925 }, { "epoch": 3.088091353996737, "grad_norm": 0.23830819129943848, "learning_rate": 4.955067560455961e-05, "loss": 0.0644, "num_input_tokens_seen": 40869088, "step": 18930 }, { "epoch": 3.0889070146818924, "grad_norm": 0.7068062424659729, "learning_rate": 4.9550003630516106e-05, "loss": 0.0784, "num_input_tokens_seen": 40878976, "step": 18935 }, { "epoch": 3.0897226753670473, "grad_norm": 1.4774718284606934, "learning_rate": 4.9549331158935624e-05, "loss": 0.1751, "num_input_tokens_seen": 40890144, "step": 18940 }, { "epoch": 3.090538336052202, "grad_norm": 1.0593514442443848, "learning_rate": 4.954865818983181e-05, "loss": 0.141, "num_input_tokens_seen": 40900448, "step": 18945 }, { "epoch": 3.0913539967373573, "grad_norm": 0.17874115705490112, "learning_rate": 4.954798472321829e-05, "loss": 0.0466, "num_input_tokens_seen": 40910880, "step": 18950 }, { "epoch": 3.092169657422512, "grad_norm": 0.249684676527977, "learning_rate": 4.9547310759108725e-05, "loss": 0.0547, "num_input_tokens_seen": 40922752, "step": 18955 }, { "epoch": 3.0929853181076674, "grad_norm": 0.8448081016540527, "learning_rate": 4.9546636297516754e-05, "loss": 0.1077, "num_input_tokens_seen": 40932640, "step": 18960 }, { "epoch": 3.0938009787928222, "grad_norm": 0.2504555881023407, "learning_rate": 4.954596133845607e-05, "loss": 0.1209, "num_input_tokens_seen": 40943264, "step": 18965 }, { "epoch": 3.094616639477977, "grad_norm": 0.771717369556427, "learning_rate": 4.954528588194035e-05, "loss": 0.1016, "num_input_tokens_seen": 40953664, "step": 18970 }, { "epoch": 3.0954323001631323, "grad_norm": 1.736933708190918, "learning_rate": 4.9544609927983254e-05, "loss": 0.1888, "num_input_tokens_seen": 40964544, "step": 18975 }, { "epoch": 3.096247960848287, "grad_norm": 0.05269375815987587, "learning_rate": 4.954393347659851e-05, "loss": 0.033, "num_input_tokens_seen": 40975520, "step": 18980 }, { "epoch": 3.097063621533442, "grad_norm": 0.7886321544647217, "learning_rate": 4.954325652779982e-05, "loss": 0.0986, "num_input_tokens_seen": 40985728, "step": 18985 }, { "epoch": 3.097879282218597, "grad_norm": 0.3193559944629669, "learning_rate": 4.9542579081600896e-05, "loss": 0.153, "num_input_tokens_seen": 40996064, "step": 18990 }, { "epoch": 3.098694942903752, "grad_norm": 0.06400563567876816, "learning_rate": 4.954190113801548e-05, "loss": 0.1034, "num_input_tokens_seen": 41007456, "step": 18995 }, { "epoch": 3.099510603588907, "grad_norm": 1.7705724239349365, "learning_rate": 4.954122269705729e-05, "loss": 0.2429, "num_input_tokens_seen": 41017792, "step": 19000 }, { "epoch": 3.100326264274062, "grad_norm": 0.1603657752275467, "learning_rate": 4.95405437587401e-05, "loss": 0.12, "num_input_tokens_seen": 41028512, "step": 19005 }, { "epoch": 3.101141924959217, "grad_norm": 0.903112530708313, "learning_rate": 4.9539864323077656e-05, "loss": 0.0911, "num_input_tokens_seen": 41039520, "step": 19010 }, { "epoch": 3.1019575856443717, "grad_norm": 0.16501715779304504, "learning_rate": 4.9539184390083735e-05, "loss": 0.1755, "num_input_tokens_seen": 41050336, "step": 19015 }, { "epoch": 3.102773246329527, "grad_norm": 0.22855861485004425, "learning_rate": 4.9538503959772106e-05, "loss": 0.018, "num_input_tokens_seen": 41061088, "step": 19020 }, { "epoch": 3.103588907014682, "grad_norm": 0.045484818518161774, "learning_rate": 4.953782303215657e-05, "loss": 0.2294, "num_input_tokens_seen": 41071680, "step": 19025 }, { "epoch": 3.104404567699837, "grad_norm": 0.07366777211427689, "learning_rate": 4.953714160725091e-05, "loss": 0.1498, "num_input_tokens_seen": 41083744, "step": 19030 }, { "epoch": 3.105220228384992, "grad_norm": 1.0015872716903687, "learning_rate": 4.953645968506896e-05, "loss": 0.1504, "num_input_tokens_seen": 41094688, "step": 19035 }, { "epoch": 3.1060358890701467, "grad_norm": 0.0640719011425972, "learning_rate": 4.953577726562453e-05, "loss": 0.2425, "num_input_tokens_seen": 41106656, "step": 19040 }, { "epoch": 3.106851549755302, "grad_norm": 0.2816218137741089, "learning_rate": 4.9535094348931445e-05, "loss": 0.0942, "num_input_tokens_seen": 41117280, "step": 19045 }, { "epoch": 3.107667210440457, "grad_norm": 0.43357738852500916, "learning_rate": 4.953441093500354e-05, "loss": 0.1277, "num_input_tokens_seen": 41128064, "step": 19050 }, { "epoch": 3.1084828711256116, "grad_norm": 0.5504752993583679, "learning_rate": 4.953372702385468e-05, "loss": 0.1238, "num_input_tokens_seen": 41137440, "step": 19055 }, { "epoch": 3.109298531810767, "grad_norm": 1.063521385192871, "learning_rate": 4.953304261549872e-05, "loss": 0.1191, "num_input_tokens_seen": 41147648, "step": 19060 }, { "epoch": 3.1101141924959217, "grad_norm": 0.6556297540664673, "learning_rate": 4.953235770994953e-05, "loss": 0.1712, "num_input_tokens_seen": 41158688, "step": 19065 }, { "epoch": 3.1109298531810765, "grad_norm": 0.31831005215644836, "learning_rate": 4.953167230722098e-05, "loss": 0.0192, "num_input_tokens_seen": 41168160, "step": 19070 }, { "epoch": 3.1117455138662318, "grad_norm": 1.4418621063232422, "learning_rate": 4.9530986407326976e-05, "loss": 0.1402, "num_input_tokens_seen": 41179936, "step": 19075 }, { "epoch": 3.1125611745513866, "grad_norm": 0.12248639762401581, "learning_rate": 4.953030001028142e-05, "loss": 0.1794, "num_input_tokens_seen": 41190528, "step": 19080 }, { "epoch": 3.1133768352365414, "grad_norm": 0.7443529963493347, "learning_rate": 4.952961311609821e-05, "loss": 0.0874, "num_input_tokens_seen": 41200128, "step": 19085 }, { "epoch": 3.1141924959216967, "grad_norm": 1.3052303791046143, "learning_rate": 4.952892572479126e-05, "loss": 0.1008, "num_input_tokens_seen": 41211040, "step": 19090 }, { "epoch": 3.1150081566068515, "grad_norm": 0.5318034291267395, "learning_rate": 4.952823783637453e-05, "loss": 0.4115, "num_input_tokens_seen": 41223392, "step": 19095 }, { "epoch": 3.1158238172920063, "grad_norm": 0.8186287879943848, "learning_rate": 4.952754945086193e-05, "loss": 0.1008, "num_input_tokens_seen": 41233344, "step": 19100 }, { "epoch": 3.1166394779771616, "grad_norm": 2.1827096939086914, "learning_rate": 4.952686056826744e-05, "loss": 0.3333, "num_input_tokens_seen": 41244512, "step": 19105 }, { "epoch": 3.1174551386623164, "grad_norm": 0.3623180687427521, "learning_rate": 4.9526171188605e-05, "loss": 0.1853, "num_input_tokens_seen": 41255360, "step": 19110 }, { "epoch": 3.1182707993474716, "grad_norm": 1.0442841053009033, "learning_rate": 4.9525481311888577e-05, "loss": 0.0967, "num_input_tokens_seen": 41265856, "step": 19115 }, { "epoch": 3.1190864600326265, "grad_norm": 0.1178278923034668, "learning_rate": 4.952479093813217e-05, "loss": 0.1925, "num_input_tokens_seen": 41277216, "step": 19120 }, { "epoch": 3.1199021207177813, "grad_norm": 2.9703025817871094, "learning_rate": 4.952410006734977e-05, "loss": 0.3537, "num_input_tokens_seen": 41288256, "step": 19125 }, { "epoch": 3.1207177814029365, "grad_norm": 0.2570064067840576, "learning_rate": 4.952340869955536e-05, "loss": 0.1365, "num_input_tokens_seen": 41298784, "step": 19130 }, { "epoch": 3.1215334420880914, "grad_norm": 0.21580469608306885, "learning_rate": 4.9522716834762973e-05, "loss": 0.1842, "num_input_tokens_seen": 41309792, "step": 19135 }, { "epoch": 3.122349102773246, "grad_norm": 0.8323580026626587, "learning_rate": 4.952202447298661e-05, "loss": 0.1382, "num_input_tokens_seen": 41320320, "step": 19140 }, { "epoch": 3.1231647634584014, "grad_norm": 0.9010335803031921, "learning_rate": 4.9521331614240315e-05, "loss": 0.076, "num_input_tokens_seen": 41331008, "step": 19145 }, { "epoch": 3.1239804241435563, "grad_norm": 0.3006012439727783, "learning_rate": 4.952063825853813e-05, "loss": 0.2254, "num_input_tokens_seen": 41341888, "step": 19150 }, { "epoch": 3.124796084828711, "grad_norm": 0.26157864928245544, "learning_rate": 4.95199444058941e-05, "loss": 0.128, "num_input_tokens_seen": 41351840, "step": 19155 }, { "epoch": 3.1256117455138663, "grad_norm": 0.38469022512435913, "learning_rate": 4.95192500563223e-05, "loss": 0.0507, "num_input_tokens_seen": 41362592, "step": 19160 }, { "epoch": 3.126427406199021, "grad_norm": 0.5178022980690002, "learning_rate": 4.951855520983679e-05, "loss": 0.1373, "num_input_tokens_seen": 41372704, "step": 19165 }, { "epoch": 3.1272430668841764, "grad_norm": 0.39222103357315063, "learning_rate": 4.951785986645165e-05, "loss": 0.1538, "num_input_tokens_seen": 41383584, "step": 19170 }, { "epoch": 3.1280587275693312, "grad_norm": 2.077378749847412, "learning_rate": 4.951716402618097e-05, "loss": 0.3766, "num_input_tokens_seen": 41394976, "step": 19175 }, { "epoch": 3.128874388254486, "grad_norm": 1.281856894493103, "learning_rate": 4.951646768903887e-05, "loss": 0.1629, "num_input_tokens_seen": 41406016, "step": 19180 }, { "epoch": 3.1296900489396413, "grad_norm": 0.07331240177154541, "learning_rate": 4.951577085503945e-05, "loss": 0.0803, "num_input_tokens_seen": 41417376, "step": 19185 }, { "epoch": 3.130505709624796, "grad_norm": 1.0309315919876099, "learning_rate": 4.951507352419683e-05, "loss": 0.2094, "num_input_tokens_seen": 41428416, "step": 19190 }, { "epoch": 3.131321370309951, "grad_norm": 1.11721670627594, "learning_rate": 4.951437569652515e-05, "loss": 0.2499, "num_input_tokens_seen": 41439552, "step": 19195 }, { "epoch": 3.132137030995106, "grad_norm": 0.225084587931633, "learning_rate": 4.9513677372038545e-05, "loss": 0.0398, "num_input_tokens_seen": 41448928, "step": 19200 }, { "epoch": 3.132952691680261, "grad_norm": 0.554641842842102, "learning_rate": 4.951297855075117e-05, "loss": 0.1201, "num_input_tokens_seen": 41460096, "step": 19205 }, { "epoch": 3.133768352365416, "grad_norm": 1.417564034461975, "learning_rate": 4.9512279232677185e-05, "loss": 0.1343, "num_input_tokens_seen": 41469696, "step": 19210 }, { "epoch": 3.134584013050571, "grad_norm": 0.1993839144706726, "learning_rate": 4.9511579417830766e-05, "loss": 0.0455, "num_input_tokens_seen": 41480224, "step": 19215 }, { "epoch": 3.135399673735726, "grad_norm": 0.3111076354980469, "learning_rate": 4.951087910622611e-05, "loss": 0.1162, "num_input_tokens_seen": 41490176, "step": 19220 }, { "epoch": 3.1362153344208807, "grad_norm": 0.9046695232391357, "learning_rate": 4.9510178297877376e-05, "loss": 0.1898, "num_input_tokens_seen": 41500928, "step": 19225 }, { "epoch": 3.137030995106036, "grad_norm": 0.18078558146953583, "learning_rate": 4.9509476992798785e-05, "loss": 0.1188, "num_input_tokens_seen": 41512608, "step": 19230 }, { "epoch": 3.137846655791191, "grad_norm": 0.5601058602333069, "learning_rate": 4.9508775191004555e-05, "loss": 0.0609, "num_input_tokens_seen": 41522880, "step": 19235 }, { "epoch": 3.1386623164763456, "grad_norm": 0.6050176024436951, "learning_rate": 4.9508072892508904e-05, "loss": 0.2043, "num_input_tokens_seen": 41533632, "step": 19240 }, { "epoch": 3.139477977161501, "grad_norm": 0.9686516523361206, "learning_rate": 4.9507370097326075e-05, "loss": 0.0986, "num_input_tokens_seen": 41546112, "step": 19245 }, { "epoch": 3.1402936378466557, "grad_norm": 1.2494823932647705, "learning_rate": 4.950666680547029e-05, "loss": 0.0743, "num_input_tokens_seen": 41557376, "step": 19250 }, { "epoch": 3.141109298531811, "grad_norm": 0.5755285620689392, "learning_rate": 4.950596301695581e-05, "loss": 0.2585, "num_input_tokens_seen": 41568160, "step": 19255 }, { "epoch": 3.141924959216966, "grad_norm": 0.916634202003479, "learning_rate": 4.9505258731796914e-05, "loss": 0.1707, "num_input_tokens_seen": 41577696, "step": 19260 }, { "epoch": 3.1427406199021206, "grad_norm": 0.2146979719400406, "learning_rate": 4.950455395000785e-05, "loss": 0.0625, "num_input_tokens_seen": 41589088, "step": 19265 }, { "epoch": 3.143556280587276, "grad_norm": 0.6145527362823486, "learning_rate": 4.950384867160293e-05, "loss": 0.1083, "num_input_tokens_seen": 41599904, "step": 19270 }, { "epoch": 3.1443719412724307, "grad_norm": 0.16554659605026245, "learning_rate": 4.9503142896596415e-05, "loss": 0.053, "num_input_tokens_seen": 41610784, "step": 19275 }, { "epoch": 3.1451876019575855, "grad_norm": 0.12114576995372772, "learning_rate": 4.9502436625002634e-05, "loss": 0.0909, "num_input_tokens_seen": 41620768, "step": 19280 }, { "epoch": 3.1460032626427408, "grad_norm": 0.18925631046295166, "learning_rate": 4.950172985683589e-05, "loss": 0.1, "num_input_tokens_seen": 41632384, "step": 19285 }, { "epoch": 3.1468189233278956, "grad_norm": 1.5698938369750977, "learning_rate": 4.95010225921105e-05, "loss": 0.1207, "num_input_tokens_seen": 41643680, "step": 19290 }, { "epoch": 3.1476345840130504, "grad_norm": 1.3839261531829834, "learning_rate": 4.950031483084081e-05, "loss": 0.1106, "num_input_tokens_seen": 41655424, "step": 19295 }, { "epoch": 3.1484502446982057, "grad_norm": 0.6759423017501831, "learning_rate": 4.9499606573041166e-05, "loss": 0.0972, "num_input_tokens_seen": 41666144, "step": 19300 }, { "epoch": 3.1492659053833605, "grad_norm": 0.7091131210327148, "learning_rate": 4.9498897818725906e-05, "loss": 0.0532, "num_input_tokens_seen": 41677312, "step": 19305 }, { "epoch": 3.1500815660685153, "grad_norm": 1.5706981420516968, "learning_rate": 4.949818856790941e-05, "loss": 0.0793, "num_input_tokens_seen": 41689344, "step": 19310 }, { "epoch": 3.1508972267536706, "grad_norm": 0.11824019253253937, "learning_rate": 4.949747882060603e-05, "loss": 0.0451, "num_input_tokens_seen": 41700032, "step": 19315 }, { "epoch": 3.1517128874388254, "grad_norm": 0.04297294467687607, "learning_rate": 4.9496768576830185e-05, "loss": 0.0878, "num_input_tokens_seen": 41711040, "step": 19320 }, { "epoch": 3.15252854812398, "grad_norm": 0.3148386776447296, "learning_rate": 4.949605783659624e-05, "loss": 0.1568, "num_input_tokens_seen": 41721184, "step": 19325 }, { "epoch": 3.1533442088091355, "grad_norm": 1.3961981534957886, "learning_rate": 4.94953465999186e-05, "loss": 0.1895, "num_input_tokens_seen": 41731616, "step": 19330 }, { "epoch": 3.1541598694942903, "grad_norm": 0.23324933648109436, "learning_rate": 4.9494634866811694e-05, "loss": 0.1421, "num_input_tokens_seen": 41741568, "step": 19335 }, { "epoch": 3.1549755301794455, "grad_norm": 1.7242072820663452, "learning_rate": 4.9493922637289935e-05, "loss": 0.169, "num_input_tokens_seen": 41752512, "step": 19340 }, { "epoch": 3.1557911908646004, "grad_norm": 1.339334487915039, "learning_rate": 4.949320991136776e-05, "loss": 0.1408, "num_input_tokens_seen": 41763040, "step": 19345 }, { "epoch": 3.156606851549755, "grad_norm": 0.2639526128768921, "learning_rate": 4.9492496689059625e-05, "loss": 0.2807, "num_input_tokens_seen": 41773472, "step": 19350 }, { "epoch": 3.1574225122349104, "grad_norm": 0.8988432884216309, "learning_rate": 4.949178297037996e-05, "loss": 0.0489, "num_input_tokens_seen": 41784096, "step": 19355 }, { "epoch": 3.1582381729200653, "grad_norm": 0.13519281148910522, "learning_rate": 4.9491068755343254e-05, "loss": 0.0362, "num_input_tokens_seen": 41795616, "step": 19360 }, { "epoch": 3.15905383360522, "grad_norm": 0.33990931510925293, "learning_rate": 4.949035404396397e-05, "loss": 0.0186, "num_input_tokens_seen": 41806784, "step": 19365 }, { "epoch": 3.1598694942903753, "grad_norm": 1.0272772312164307, "learning_rate": 4.948963883625659e-05, "loss": 0.1362, "num_input_tokens_seen": 41817920, "step": 19370 }, { "epoch": 3.16068515497553, "grad_norm": 1.285616159439087, "learning_rate": 4.948892313223561e-05, "loss": 0.2848, "num_input_tokens_seen": 41827520, "step": 19375 }, { "epoch": 3.161500815660685, "grad_norm": 0.5525727868080139, "learning_rate": 4.9488206931915537e-05, "loss": 0.1308, "num_input_tokens_seen": 41838496, "step": 19380 }, { "epoch": 3.1623164763458402, "grad_norm": 0.8650786280632019, "learning_rate": 4.9487490235310885e-05, "loss": 0.1361, "num_input_tokens_seen": 41848800, "step": 19385 }, { "epoch": 3.163132137030995, "grad_norm": 0.19178374111652374, "learning_rate": 4.9486773042436176e-05, "loss": 0.3169, "num_input_tokens_seen": 41859616, "step": 19390 }, { "epoch": 3.1639477977161503, "grad_norm": 0.467532217502594, "learning_rate": 4.948605535330595e-05, "loss": 0.108, "num_input_tokens_seen": 41870432, "step": 19395 }, { "epoch": 3.164763458401305, "grad_norm": 0.5583205819129944, "learning_rate": 4.948533716793475e-05, "loss": 0.2097, "num_input_tokens_seen": 41880960, "step": 19400 }, { "epoch": 3.16557911908646, "grad_norm": 1.0445327758789062, "learning_rate": 4.948461848633713e-05, "loss": 0.2681, "num_input_tokens_seen": 41891456, "step": 19405 }, { "epoch": 3.166394779771615, "grad_norm": 0.6009696125984192, "learning_rate": 4.948389930852766e-05, "loss": 0.0509, "num_input_tokens_seen": 41903040, "step": 19410 }, { "epoch": 3.16721044045677, "grad_norm": 0.9539790749549866, "learning_rate": 4.94831796345209e-05, "loss": 0.1305, "num_input_tokens_seen": 41914272, "step": 19415 }, { "epoch": 3.168026101141925, "grad_norm": 0.7413142323493958, "learning_rate": 4.9482459464331455e-05, "loss": 0.1915, "num_input_tokens_seen": 41925184, "step": 19420 }, { "epoch": 3.16884176182708, "grad_norm": 2.417978525161743, "learning_rate": 4.948173879797391e-05, "loss": 0.2218, "num_input_tokens_seen": 41936288, "step": 19425 }, { "epoch": 3.169657422512235, "grad_norm": 1.527023196220398, "learning_rate": 4.948101763546287e-05, "loss": 0.1225, "num_input_tokens_seen": 41945984, "step": 19430 }, { "epoch": 3.1704730831973897, "grad_norm": 0.6753785014152527, "learning_rate": 4.948029597681295e-05, "loss": 0.2439, "num_input_tokens_seen": 41956256, "step": 19435 }, { "epoch": 3.171288743882545, "grad_norm": 1.3183648586273193, "learning_rate": 4.947957382203877e-05, "loss": 0.1299, "num_input_tokens_seen": 41967040, "step": 19440 }, { "epoch": 3.1721044045677, "grad_norm": 0.03903450816869736, "learning_rate": 4.9478851171154985e-05, "loss": 0.062, "num_input_tokens_seen": 41977664, "step": 19445 }, { "epoch": 3.1729200652528546, "grad_norm": 0.8988629579544067, "learning_rate": 4.947812802417622e-05, "loss": 0.0974, "num_input_tokens_seen": 41988384, "step": 19450 }, { "epoch": 3.17373572593801, "grad_norm": 1.576564073562622, "learning_rate": 4.9477404381117134e-05, "loss": 0.2876, "num_input_tokens_seen": 42000576, "step": 19455 }, { "epoch": 3.1745513866231647, "grad_norm": 0.2510572373867035, "learning_rate": 4.94766802419924e-05, "loss": 0.1207, "num_input_tokens_seen": 42010176, "step": 19460 }, { "epoch": 3.1753670473083195, "grad_norm": 0.6887116432189941, "learning_rate": 4.9475955606816696e-05, "loss": 0.1129, "num_input_tokens_seen": 42021248, "step": 19465 }, { "epoch": 3.176182707993475, "grad_norm": 0.17441341280937195, "learning_rate": 4.9475230475604696e-05, "loss": 0.1208, "num_input_tokens_seen": 42031712, "step": 19470 }, { "epoch": 3.1769983686786296, "grad_norm": 0.3749530017375946, "learning_rate": 4.94745048483711e-05, "loss": 0.1666, "num_input_tokens_seen": 42042848, "step": 19475 }, { "epoch": 3.177814029363785, "grad_norm": 1.4288713932037354, "learning_rate": 4.947377872513062e-05, "loss": 0.1327, "num_input_tokens_seen": 42053856, "step": 19480 }, { "epoch": 3.1786296900489397, "grad_norm": 1.1500355005264282, "learning_rate": 4.9473052105897964e-05, "loss": 0.1324, "num_input_tokens_seen": 42064864, "step": 19485 }, { "epoch": 3.1794453507340945, "grad_norm": 1.0060371160507202, "learning_rate": 4.9472324990687854e-05, "loss": 0.0837, "num_input_tokens_seen": 42076864, "step": 19490 }, { "epoch": 3.1802610114192498, "grad_norm": 0.26184967160224915, "learning_rate": 4.9471597379515044e-05, "loss": 0.0839, "num_input_tokens_seen": 42086592, "step": 19495 }, { "epoch": 3.1810766721044046, "grad_norm": 0.3976326882839203, "learning_rate": 4.947086927239427e-05, "loss": 0.1793, "num_input_tokens_seen": 42096736, "step": 19500 }, { "epoch": 3.1818923327895594, "grad_norm": 0.1839563548564911, "learning_rate": 4.9470140669340284e-05, "loss": 0.1272, "num_input_tokens_seen": 42106592, "step": 19505 }, { "epoch": 3.1827079934747147, "grad_norm": 0.32212087512016296, "learning_rate": 4.9469411570367844e-05, "loss": 0.1095, "num_input_tokens_seen": 42117024, "step": 19510 }, { "epoch": 3.1835236541598695, "grad_norm": 0.4462459683418274, "learning_rate": 4.9468681975491746e-05, "loss": 0.0964, "num_input_tokens_seen": 42127776, "step": 19515 }, { "epoch": 3.1843393148450243, "grad_norm": 0.3417637050151825, "learning_rate": 4.946795188472677e-05, "loss": 0.0756, "num_input_tokens_seen": 42138976, "step": 19520 }, { "epoch": 3.1851549755301796, "grad_norm": 1.4264713525772095, "learning_rate": 4.94672212980877e-05, "loss": 0.0678, "num_input_tokens_seen": 42150752, "step": 19525 }, { "epoch": 3.1859706362153344, "grad_norm": 0.4940296411514282, "learning_rate": 4.946649021558936e-05, "loss": 0.0822, "num_input_tokens_seen": 42161696, "step": 19530 }, { "epoch": 3.186786296900489, "grad_norm": 1.155530571937561, "learning_rate": 4.946575863724655e-05, "loss": 0.1518, "num_input_tokens_seen": 42173312, "step": 19535 }, { "epoch": 3.1876019575856445, "grad_norm": 0.3499072790145874, "learning_rate": 4.9465026563074116e-05, "loss": 0.2746, "num_input_tokens_seen": 42185344, "step": 19540 }, { "epoch": 3.1884176182707993, "grad_norm": 0.41059738397598267, "learning_rate": 4.946429399308687e-05, "loss": 0.1223, "num_input_tokens_seen": 42196480, "step": 19545 }, { "epoch": 3.189233278955954, "grad_norm": 0.3496206998825073, "learning_rate": 4.946356092729968e-05, "loss": 0.0518, "num_input_tokens_seen": 42206624, "step": 19550 }, { "epoch": 3.1900489396411094, "grad_norm": 1.990110993385315, "learning_rate": 4.946282736572739e-05, "loss": 0.1981, "num_input_tokens_seen": 42218688, "step": 19555 }, { "epoch": 3.190864600326264, "grad_norm": 0.059559617191553116, "learning_rate": 4.946209330838487e-05, "loss": 0.0278, "num_input_tokens_seen": 42228896, "step": 19560 }, { "epoch": 3.1916802610114194, "grad_norm": 0.5010003447532654, "learning_rate": 4.946135875528699e-05, "loss": 0.2881, "num_input_tokens_seen": 42239424, "step": 19565 }, { "epoch": 3.1924959216965743, "grad_norm": 0.6716383695602417, "learning_rate": 4.9460623706448655e-05, "loss": 0.1844, "num_input_tokens_seen": 42250272, "step": 19570 }, { "epoch": 3.193311582381729, "grad_norm": 0.5395756959915161, "learning_rate": 4.945988816188475e-05, "loss": 0.1217, "num_input_tokens_seen": 42261024, "step": 19575 }, { "epoch": 3.1941272430668843, "grad_norm": 0.06774783879518509, "learning_rate": 4.945915212161018e-05, "loss": 0.1058, "num_input_tokens_seen": 42271680, "step": 19580 }, { "epoch": 3.194942903752039, "grad_norm": 0.0700925886631012, "learning_rate": 4.945841558563985e-05, "loss": 0.0974, "num_input_tokens_seen": 42281792, "step": 19585 }, { "epoch": 3.195758564437194, "grad_norm": 0.12846826016902924, "learning_rate": 4.945767855398872e-05, "loss": 0.0238, "num_input_tokens_seen": 42292128, "step": 19590 }, { "epoch": 3.1965742251223492, "grad_norm": 0.558779776096344, "learning_rate": 4.9456941026671695e-05, "loss": 0.1015, "num_input_tokens_seen": 42303232, "step": 19595 }, { "epoch": 3.197389885807504, "grad_norm": 0.1141200140118599, "learning_rate": 4.945620300370374e-05, "loss": 0.0532, "num_input_tokens_seen": 42313056, "step": 19600 }, { "epoch": 3.198205546492659, "grad_norm": 0.37517908215522766, "learning_rate": 4.9455464485099803e-05, "loss": 0.0827, "num_input_tokens_seen": 42324320, "step": 19605 }, { "epoch": 3.199021207177814, "grad_norm": 0.29492220282554626, "learning_rate": 4.945472547087485e-05, "loss": 0.0721, "num_input_tokens_seen": 42332800, "step": 19610 }, { "epoch": 3.199836867862969, "grad_norm": 2.0240566730499268, "learning_rate": 4.945398596104387e-05, "loss": 0.3688, "num_input_tokens_seen": 42344448, "step": 19615 }, { "epoch": 3.200652528548124, "grad_norm": 0.18531227111816406, "learning_rate": 4.945324595562184e-05, "loss": 0.0803, "num_input_tokens_seen": 42355520, "step": 19620 }, { "epoch": 3.201468189233279, "grad_norm": 0.5349664688110352, "learning_rate": 4.945250545462376e-05, "loss": 0.0887, "num_input_tokens_seen": 42366656, "step": 19625 }, { "epoch": 3.202283849918434, "grad_norm": 0.4401543140411377, "learning_rate": 4.945176445806463e-05, "loss": 0.0795, "num_input_tokens_seen": 42377824, "step": 19630 }, { "epoch": 3.203099510603589, "grad_norm": 1.1260027885437012, "learning_rate": 4.945102296595948e-05, "loss": 0.0977, "num_input_tokens_seen": 42388064, "step": 19635 }, { "epoch": 3.203915171288744, "grad_norm": 0.11727247387170792, "learning_rate": 4.945028097832334e-05, "loss": 0.1226, "num_input_tokens_seen": 42399648, "step": 19640 }, { "epoch": 3.2047308319738987, "grad_norm": 0.03859243541955948, "learning_rate": 4.9449538495171225e-05, "loss": 0.0898, "num_input_tokens_seen": 42409024, "step": 19645 }, { "epoch": 3.205546492659054, "grad_norm": 0.2064598947763443, "learning_rate": 4.9448795516518204e-05, "loss": 0.0305, "num_input_tokens_seen": 42419552, "step": 19650 }, { "epoch": 3.206362153344209, "grad_norm": 1.0877948999404907, "learning_rate": 4.944805204237932e-05, "loss": 0.2319, "num_input_tokens_seen": 42428992, "step": 19655 }, { "epoch": 3.2071778140293636, "grad_norm": 0.20169886946678162, "learning_rate": 4.944730807276965e-05, "loss": 0.1207, "num_input_tokens_seen": 42440736, "step": 19660 }, { "epoch": 3.207993474714519, "grad_norm": 0.28356286883354187, "learning_rate": 4.944656360770427e-05, "loss": 0.0781, "num_input_tokens_seen": 42451168, "step": 19665 }, { "epoch": 3.2088091353996737, "grad_norm": 2.051271677017212, "learning_rate": 4.944581864719827e-05, "loss": 0.1206, "num_input_tokens_seen": 42462528, "step": 19670 }, { "epoch": 3.2096247960848285, "grad_norm": 0.14194221794605255, "learning_rate": 4.944507319126673e-05, "loss": 0.0826, "num_input_tokens_seen": 42471648, "step": 19675 }, { "epoch": 3.210440456769984, "grad_norm": 0.7419605255126953, "learning_rate": 4.9444327239924783e-05, "loss": 0.1036, "num_input_tokens_seen": 42482944, "step": 19680 }, { "epoch": 3.2112561174551386, "grad_norm": 1.1890034675598145, "learning_rate": 4.9443580793187524e-05, "loss": 0.1894, "num_input_tokens_seen": 42493152, "step": 19685 }, { "epoch": 3.2120717781402934, "grad_norm": 1.596082091331482, "learning_rate": 4.94428338510701e-05, "loss": 0.1166, "num_input_tokens_seen": 42504544, "step": 19690 }, { "epoch": 3.2128874388254487, "grad_norm": 0.9483875036239624, "learning_rate": 4.944208641358764e-05, "loss": 0.1874, "num_input_tokens_seen": 42514496, "step": 19695 }, { "epoch": 3.2137030995106035, "grad_norm": 0.3938862681388855, "learning_rate": 4.9441338480755295e-05, "loss": 0.0738, "num_input_tokens_seen": 42524704, "step": 19700 }, { "epoch": 3.2145187601957588, "grad_norm": 2.694547176361084, "learning_rate": 4.944059005258821e-05, "loss": 0.244, "num_input_tokens_seen": 42534688, "step": 19705 }, { "epoch": 3.2153344208809136, "grad_norm": 0.7291055917739868, "learning_rate": 4.943984112910157e-05, "loss": 0.228, "num_input_tokens_seen": 42546016, "step": 19710 }, { "epoch": 3.2161500815660684, "grad_norm": 1.1291714906692505, "learning_rate": 4.9439091710310535e-05, "loss": 0.143, "num_input_tokens_seen": 42556672, "step": 19715 }, { "epoch": 3.2169657422512237, "grad_norm": 0.38121312856674194, "learning_rate": 4.943834179623031e-05, "loss": 0.0849, "num_input_tokens_seen": 42567168, "step": 19720 }, { "epoch": 3.2177814029363785, "grad_norm": 0.11091160774230957, "learning_rate": 4.943759138687609e-05, "loss": 0.0237, "num_input_tokens_seen": 42577760, "step": 19725 }, { "epoch": 3.2185970636215333, "grad_norm": 1.2881686687469482, "learning_rate": 4.943684048226308e-05, "loss": 0.1496, "num_input_tokens_seen": 42588672, "step": 19730 }, { "epoch": 3.2194127243066886, "grad_norm": 1.5323867797851562, "learning_rate": 4.943608908240649e-05, "loss": 0.1104, "num_input_tokens_seen": 42600480, "step": 19735 }, { "epoch": 3.2202283849918434, "grad_norm": 0.01832229271531105, "learning_rate": 4.9435337187321565e-05, "loss": 0.0966, "num_input_tokens_seen": 42612992, "step": 19740 }, { "epoch": 3.221044045676998, "grad_norm": 1.2267186641693115, "learning_rate": 4.943458479702352e-05, "loss": 0.0884, "num_input_tokens_seen": 42624576, "step": 19745 }, { "epoch": 3.2218597063621535, "grad_norm": 0.22037330269813538, "learning_rate": 4.943383191152763e-05, "loss": 0.1497, "num_input_tokens_seen": 42636128, "step": 19750 }, { "epoch": 3.2226753670473083, "grad_norm": 0.1233893558382988, "learning_rate": 4.9433078530849134e-05, "loss": 0.1276, "num_input_tokens_seen": 42646304, "step": 19755 }, { "epoch": 3.223491027732463, "grad_norm": 0.835167407989502, "learning_rate": 4.9432324655003304e-05, "loss": 0.2398, "num_input_tokens_seen": 42656576, "step": 19760 }, { "epoch": 3.2243066884176184, "grad_norm": 0.6570693254470825, "learning_rate": 4.943157028400542e-05, "loss": 0.0237, "num_input_tokens_seen": 42666624, "step": 19765 }, { "epoch": 3.225122349102773, "grad_norm": 0.4581691026687622, "learning_rate": 4.943081541787077e-05, "loss": 0.1265, "num_input_tokens_seen": 42676096, "step": 19770 }, { "epoch": 3.225938009787928, "grad_norm": 0.944169282913208, "learning_rate": 4.9430060056614656e-05, "loss": 0.1496, "num_input_tokens_seen": 42686752, "step": 19775 }, { "epoch": 3.2267536704730833, "grad_norm": 1.0853384733200073, "learning_rate": 4.942930420025239e-05, "loss": 0.2284, "num_input_tokens_seen": 42697792, "step": 19780 }, { "epoch": 3.227569331158238, "grad_norm": 0.6626856923103333, "learning_rate": 4.9428547848799266e-05, "loss": 0.1334, "num_input_tokens_seen": 42708384, "step": 19785 }, { "epoch": 3.2283849918433933, "grad_norm": 0.5382961630821228, "learning_rate": 4.942779100227064e-05, "loss": 0.1374, "num_input_tokens_seen": 42719136, "step": 19790 }, { "epoch": 3.229200652528548, "grad_norm": 0.1217556968331337, "learning_rate": 4.942703366068184e-05, "loss": 0.1672, "num_input_tokens_seen": 42728896, "step": 19795 }, { "epoch": 3.230016313213703, "grad_norm": 1.0875204801559448, "learning_rate": 4.942627582404822e-05, "loss": 0.1781, "num_input_tokens_seen": 42740512, "step": 19800 }, { "epoch": 3.2308319738988582, "grad_norm": 0.08751396089792252, "learning_rate": 4.9425517492385134e-05, "loss": 0.023, "num_input_tokens_seen": 42750240, "step": 19805 }, { "epoch": 3.231647634584013, "grad_norm": 0.10919368267059326, "learning_rate": 4.942475866570794e-05, "loss": 0.0245, "num_input_tokens_seen": 42760672, "step": 19810 }, { "epoch": 3.232463295269168, "grad_norm": 1.8616183996200562, "learning_rate": 4.942399934403204e-05, "loss": 0.213, "num_input_tokens_seen": 42771680, "step": 19815 }, { "epoch": 3.233278955954323, "grad_norm": 0.8663862347602844, "learning_rate": 4.9423239527372794e-05, "loss": 0.181, "num_input_tokens_seen": 42782048, "step": 19820 }, { "epoch": 3.234094616639478, "grad_norm": 1.3972079753875732, "learning_rate": 4.942247921574562e-05, "loss": 0.1175, "num_input_tokens_seen": 42792864, "step": 19825 }, { "epoch": 3.2349102773246328, "grad_norm": 0.05408838018774986, "learning_rate": 4.942171840916593e-05, "loss": 0.1945, "num_input_tokens_seen": 42803904, "step": 19830 }, { "epoch": 3.235725938009788, "grad_norm": 1.2521597146987915, "learning_rate": 4.942095710764913e-05, "loss": 0.1233, "num_input_tokens_seen": 42814720, "step": 19835 }, { "epoch": 3.236541598694943, "grad_norm": 0.3947064280509949, "learning_rate": 4.942019531121066e-05, "loss": 0.0449, "num_input_tokens_seen": 42825856, "step": 19840 }, { "epoch": 3.237357259380098, "grad_norm": 0.671553373336792, "learning_rate": 4.941943301986594e-05, "loss": 0.2991, "num_input_tokens_seen": 42837408, "step": 19845 }, { "epoch": 3.238172920065253, "grad_norm": 0.6500518918037415, "learning_rate": 4.9418670233630436e-05, "loss": 0.0865, "num_input_tokens_seen": 42848960, "step": 19850 }, { "epoch": 3.2389885807504077, "grad_norm": 0.08271671086549759, "learning_rate": 4.941790695251961e-05, "loss": 0.2298, "num_input_tokens_seen": 42859168, "step": 19855 }, { "epoch": 3.239804241435563, "grad_norm": 0.20133298635482788, "learning_rate": 4.9417143176548916e-05, "loss": 0.1365, "num_input_tokens_seen": 42867872, "step": 19860 }, { "epoch": 3.240619902120718, "grad_norm": 1.6413354873657227, "learning_rate": 4.9416378905733845e-05, "loss": 0.2697, "num_input_tokens_seen": 42879264, "step": 19865 }, { "epoch": 3.2414355628058726, "grad_norm": 0.2550680339336395, "learning_rate": 4.9415614140089885e-05, "loss": 0.0438, "num_input_tokens_seen": 42890816, "step": 19870 }, { "epoch": 3.242251223491028, "grad_norm": 1.9260947704315186, "learning_rate": 4.9414848879632526e-05, "loss": 0.1924, "num_input_tokens_seen": 42902400, "step": 19875 }, { "epoch": 3.2430668841761827, "grad_norm": 1.4433443546295166, "learning_rate": 4.941408312437727e-05, "loss": 0.372, "num_input_tokens_seen": 42913152, "step": 19880 }, { "epoch": 3.2438825448613375, "grad_norm": 1.5071953535079956, "learning_rate": 4.9413316874339664e-05, "loss": 0.174, "num_input_tokens_seen": 42924672, "step": 19885 }, { "epoch": 3.244698205546493, "grad_norm": 0.322938472032547, "learning_rate": 4.9412550129535216e-05, "loss": 0.2463, "num_input_tokens_seen": 42934880, "step": 19890 }, { "epoch": 3.2455138662316476, "grad_norm": 0.07809556275606155, "learning_rate": 4.9411782889979474e-05, "loss": 0.0509, "num_input_tokens_seen": 42946176, "step": 19895 }, { "epoch": 3.2463295269168024, "grad_norm": 0.23136551678180695, "learning_rate": 4.9411015155687976e-05, "loss": 0.2033, "num_input_tokens_seen": 42958016, "step": 19900 }, { "epoch": 3.2471451876019577, "grad_norm": 0.3675045371055603, "learning_rate": 4.941024692667629e-05, "loss": 0.0758, "num_input_tokens_seen": 42968320, "step": 19905 }, { "epoch": 3.2479608482871125, "grad_norm": 1.5067777633666992, "learning_rate": 4.940947820295999e-05, "loss": 0.18, "num_input_tokens_seen": 42980096, "step": 19910 }, { "epoch": 3.2487765089722673, "grad_norm": 0.8049182295799255, "learning_rate": 4.9408708984554644e-05, "loss": 0.2219, "num_input_tokens_seen": 42990304, "step": 19915 }, { "epoch": 3.2495921696574226, "grad_norm": 0.0928252711892128, "learning_rate": 4.940793927147585e-05, "loss": 0.257, "num_input_tokens_seen": 43001280, "step": 19920 }, { "epoch": 3.2504078303425774, "grad_norm": 0.37516817450523376, "learning_rate": 4.9407169063739194e-05, "loss": 0.0389, "num_input_tokens_seen": 43011584, "step": 19925 }, { "epoch": 3.2512234910277327, "grad_norm": 0.1340334415435791, "learning_rate": 4.94063983613603e-05, "loss": 0.1082, "num_input_tokens_seen": 43022432, "step": 19930 }, { "epoch": 3.2520391517128875, "grad_norm": 0.49872592091560364, "learning_rate": 4.9405627164354784e-05, "loss": 0.1785, "num_input_tokens_seen": 43033792, "step": 19935 }, { "epoch": 3.2528548123980423, "grad_norm": 0.4004969596862793, "learning_rate": 4.9404855472738265e-05, "loss": 0.2518, "num_input_tokens_seen": 43045504, "step": 19940 }, { "epoch": 3.2536704730831976, "grad_norm": 0.20464546978473663, "learning_rate": 4.9404083286526394e-05, "loss": 0.1584, "num_input_tokens_seen": 43056544, "step": 19945 }, { "epoch": 3.2544861337683524, "grad_norm": 1.3738988637924194, "learning_rate": 4.940331060573482e-05, "loss": 0.2118, "num_input_tokens_seen": 43065184, "step": 19950 }, { "epoch": 3.255301794453507, "grad_norm": 0.6279741525650024, "learning_rate": 4.940253743037919e-05, "loss": 0.09, "num_input_tokens_seen": 43074784, "step": 19955 }, { "epoch": 3.2561174551386625, "grad_norm": 0.7955570816993713, "learning_rate": 4.9401763760475197e-05, "loss": 0.0671, "num_input_tokens_seen": 43084864, "step": 19960 }, { "epoch": 3.2569331158238173, "grad_norm": 0.22099782526493073, "learning_rate": 4.940098959603849e-05, "loss": 0.2084, "num_input_tokens_seen": 43096576, "step": 19965 }, { "epoch": 3.257748776508972, "grad_norm": 1.4639079570770264, "learning_rate": 4.940021493708478e-05, "loss": 0.1712, "num_input_tokens_seen": 43106816, "step": 19970 }, { "epoch": 3.2585644371941274, "grad_norm": 1.7776422500610352, "learning_rate": 4.9399439783629767e-05, "loss": 0.0936, "num_input_tokens_seen": 43118304, "step": 19975 }, { "epoch": 3.259380097879282, "grad_norm": 0.049912407994270325, "learning_rate": 4.939866413568914e-05, "loss": 0.1695, "num_input_tokens_seen": 43128384, "step": 19980 }, { "epoch": 3.2601957585644374, "grad_norm": 0.954504132270813, "learning_rate": 4.939788799327865e-05, "loss": 0.1989, "num_input_tokens_seen": 43139136, "step": 19985 }, { "epoch": 3.2610114192495923, "grad_norm": 0.6989971995353699, "learning_rate": 4.9397111356413993e-05, "loss": 0.0891, "num_input_tokens_seen": 43150048, "step": 19990 }, { "epoch": 3.261827079934747, "grad_norm": 1.724759817123413, "learning_rate": 4.939633422511093e-05, "loss": 0.2352, "num_input_tokens_seen": 43159456, "step": 19995 }, { "epoch": 3.262642740619902, "grad_norm": 1.5915251970291138, "learning_rate": 4.9395556599385204e-05, "loss": 0.2567, "num_input_tokens_seen": 43170816, "step": 20000 }, { "epoch": 3.263458401305057, "grad_norm": 1.2352460622787476, "learning_rate": 4.939477847925258e-05, "loss": 0.222, "num_input_tokens_seen": 43182336, "step": 20005 }, { "epoch": 3.264274061990212, "grad_norm": 0.5290535092353821, "learning_rate": 4.939399986472882e-05, "loss": 0.256, "num_input_tokens_seen": 43192384, "step": 20010 }, { "epoch": 3.2650897226753672, "grad_norm": 0.10805080831050873, "learning_rate": 4.9393220755829706e-05, "loss": 0.1726, "num_input_tokens_seen": 43203744, "step": 20015 }, { "epoch": 3.265905383360522, "grad_norm": 0.7813723087310791, "learning_rate": 4.939244115257103e-05, "loss": 0.1011, "num_input_tokens_seen": 43213376, "step": 20020 }, { "epoch": 3.266721044045677, "grad_norm": 1.6267516613006592, "learning_rate": 4.9391661054968586e-05, "loss": 0.2565, "num_input_tokens_seen": 43223744, "step": 20025 }, { "epoch": 3.267536704730832, "grad_norm": 0.8364593982696533, "learning_rate": 4.939088046303819e-05, "loss": 0.0863, "num_input_tokens_seen": 43234752, "step": 20030 }, { "epoch": 3.268352365415987, "grad_norm": 1.0889769792556763, "learning_rate": 4.9390099376795665e-05, "loss": 0.1603, "num_input_tokens_seen": 43245280, "step": 20035 }, { "epoch": 3.2691680261011418, "grad_norm": 1.2166780233383179, "learning_rate": 4.938931779625683e-05, "loss": 0.1045, "num_input_tokens_seen": 43256672, "step": 20040 }, { "epoch": 3.269983686786297, "grad_norm": 0.6806462407112122, "learning_rate": 4.938853572143753e-05, "loss": 0.3126, "num_input_tokens_seen": 43268288, "step": 20045 }, { "epoch": 3.270799347471452, "grad_norm": 0.30268383026123047, "learning_rate": 4.9387753152353614e-05, "loss": 0.0315, "num_input_tokens_seen": 43278080, "step": 20050 }, { "epoch": 3.2716150081566067, "grad_norm": 0.15010178089141846, "learning_rate": 4.9386970089020945e-05, "loss": 0.0584, "num_input_tokens_seen": 43287392, "step": 20055 }, { "epoch": 3.272430668841762, "grad_norm": 0.7915643453598022, "learning_rate": 4.938618653145538e-05, "loss": 0.0908, "num_input_tokens_seen": 43297824, "step": 20060 }, { "epoch": 3.2732463295269167, "grad_norm": 0.7331327199935913, "learning_rate": 4.938540247967282e-05, "loss": 0.0725, "num_input_tokens_seen": 43308864, "step": 20065 }, { "epoch": 3.274061990212072, "grad_norm": 1.2465251684188843, "learning_rate": 4.938461793368914e-05, "loss": 0.1819, "num_input_tokens_seen": 43318912, "step": 20070 }, { "epoch": 3.274877650897227, "grad_norm": 0.06338854879140854, "learning_rate": 4.9383832893520244e-05, "loss": 0.1138, "num_input_tokens_seen": 43328896, "step": 20075 }, { "epoch": 3.2756933115823816, "grad_norm": 0.8511970043182373, "learning_rate": 4.9383047359182044e-05, "loss": 0.1041, "num_input_tokens_seen": 43340512, "step": 20080 }, { "epoch": 3.2765089722675365, "grad_norm": 0.6288235783576965, "learning_rate": 4.9382261330690455e-05, "loss": 0.1506, "num_input_tokens_seen": 43351840, "step": 20085 }, { "epoch": 3.2773246329526917, "grad_norm": 1.7932865619659424, "learning_rate": 4.938147480806141e-05, "loss": 0.2977, "num_input_tokens_seen": 43363328, "step": 20090 }, { "epoch": 3.2781402936378465, "grad_norm": 0.47574156522750854, "learning_rate": 4.938068779131084e-05, "loss": 0.1972, "num_input_tokens_seen": 43374688, "step": 20095 }, { "epoch": 3.278955954323002, "grad_norm": 0.3000944256782532, "learning_rate": 4.9379900280454714e-05, "loss": 0.1303, "num_input_tokens_seen": 43384832, "step": 20100 }, { "epoch": 3.2797716150081566, "grad_norm": 0.6218522191047668, "learning_rate": 4.9379112275508975e-05, "loss": 0.2278, "num_input_tokens_seen": 43396352, "step": 20105 }, { "epoch": 3.2805872756933114, "grad_norm": 2.381213426589966, "learning_rate": 4.93783237764896e-05, "loss": 0.1146, "num_input_tokens_seen": 43407104, "step": 20110 }, { "epoch": 3.2814029363784667, "grad_norm": 0.4411902129650116, "learning_rate": 4.937753478341257e-05, "loss": 0.0678, "num_input_tokens_seen": 43418816, "step": 20115 }, { "epoch": 3.2822185970636215, "grad_norm": 0.0981764942407608, "learning_rate": 4.937674529629387e-05, "loss": 0.1369, "num_input_tokens_seen": 43429088, "step": 20120 }, { "epoch": 3.2830342577487763, "grad_norm": 0.5844553112983704, "learning_rate": 4.9375955315149504e-05, "loss": 0.0877, "num_input_tokens_seen": 43439744, "step": 20125 }, { "epoch": 3.2838499184339316, "grad_norm": 0.7035700082778931, "learning_rate": 4.937516483999548e-05, "loss": 0.0559, "num_input_tokens_seen": 43451264, "step": 20130 }, { "epoch": 3.2846655791190864, "grad_norm": 0.12205472588539124, "learning_rate": 4.937437387084781e-05, "loss": 0.1846, "num_input_tokens_seen": 43461696, "step": 20135 }, { "epoch": 3.2854812398042412, "grad_norm": 1.0024017095565796, "learning_rate": 4.937358240772254e-05, "loss": 0.0569, "num_input_tokens_seen": 43471904, "step": 20140 }, { "epoch": 3.2862969004893965, "grad_norm": 0.2635960876941681, "learning_rate": 4.9372790450635706e-05, "loss": 0.1775, "num_input_tokens_seen": 43482400, "step": 20145 }, { "epoch": 3.2871125611745513, "grad_norm": 0.6386902332305908, "learning_rate": 4.937199799960335e-05, "loss": 0.2082, "num_input_tokens_seen": 43492160, "step": 20150 }, { "epoch": 3.2879282218597066, "grad_norm": 0.6968396306037903, "learning_rate": 4.937120505464153e-05, "loss": 0.1174, "num_input_tokens_seen": 43502688, "step": 20155 }, { "epoch": 3.2887438825448614, "grad_norm": 0.8102860450744629, "learning_rate": 4.9370411615766336e-05, "loss": 0.2275, "num_input_tokens_seen": 43513184, "step": 20160 }, { "epoch": 3.289559543230016, "grad_norm": 1.5865896940231323, "learning_rate": 4.936961768299383e-05, "loss": 0.222, "num_input_tokens_seen": 43522592, "step": 20165 }, { "epoch": 3.2903752039151715, "grad_norm": 1.0362486839294434, "learning_rate": 4.9368823256340105e-05, "loss": 0.0977, "num_input_tokens_seen": 43533312, "step": 20170 }, { "epoch": 3.2911908646003263, "grad_norm": 1.067967414855957, "learning_rate": 4.936802833582126e-05, "loss": 0.1622, "num_input_tokens_seen": 43543808, "step": 20175 }, { "epoch": 3.292006525285481, "grad_norm": 1.8029967546463013, "learning_rate": 4.936723292145341e-05, "loss": 0.2588, "num_input_tokens_seen": 43554528, "step": 20180 }, { "epoch": 3.2928221859706364, "grad_norm": 0.21062734723091125, "learning_rate": 4.9366437013252674e-05, "loss": 0.0639, "num_input_tokens_seen": 43564960, "step": 20185 }, { "epoch": 3.293637846655791, "grad_norm": 0.07996685802936554, "learning_rate": 4.936564061123517e-05, "loss": 0.0909, "num_input_tokens_seen": 43576576, "step": 20190 }, { "epoch": 3.294453507340946, "grad_norm": 0.9784523844718933, "learning_rate": 4.936484371541706e-05, "loss": 0.1721, "num_input_tokens_seen": 43588576, "step": 20195 }, { "epoch": 3.2952691680261013, "grad_norm": 0.5493991374969482, "learning_rate": 4.936404632581449e-05, "loss": 0.2698, "num_input_tokens_seen": 43598912, "step": 20200 }, { "epoch": 3.296084828711256, "grad_norm": 0.9721677899360657, "learning_rate": 4.936324844244361e-05, "loss": 0.1541, "num_input_tokens_seen": 43609568, "step": 20205 }, { "epoch": 3.2969004893964113, "grad_norm": 1.1132228374481201, "learning_rate": 4.936245006532059e-05, "loss": 0.1439, "num_input_tokens_seen": 43619904, "step": 20210 }, { "epoch": 3.297716150081566, "grad_norm": 0.16866564750671387, "learning_rate": 4.9361651194461605e-05, "loss": 0.1211, "num_input_tokens_seen": 43630976, "step": 20215 }, { "epoch": 3.298531810766721, "grad_norm": 0.5983760356903076, "learning_rate": 4.936085182988286e-05, "loss": 0.0614, "num_input_tokens_seen": 43640384, "step": 20220 }, { "epoch": 3.299347471451876, "grad_norm": 0.6205356121063232, "learning_rate": 4.9360051971600555e-05, "loss": 0.1756, "num_input_tokens_seen": 43650912, "step": 20225 }, { "epoch": 3.300163132137031, "grad_norm": 0.4880039095878601, "learning_rate": 4.9359251619630886e-05, "loss": 0.1572, "num_input_tokens_seen": 43662368, "step": 20230 }, { "epoch": 3.300978792822186, "grad_norm": 1.6585912704467773, "learning_rate": 4.9358450773990086e-05, "loss": 0.1516, "num_input_tokens_seen": 43674048, "step": 20235 }, { "epoch": 3.301794453507341, "grad_norm": 0.40074044466018677, "learning_rate": 4.935764943469438e-05, "loss": 0.1702, "num_input_tokens_seen": 43685120, "step": 20240 }, { "epoch": 3.302610114192496, "grad_norm": 0.10116878896951675, "learning_rate": 4.935684760176e-05, "loss": 0.121, "num_input_tokens_seen": 43696288, "step": 20245 }, { "epoch": 3.3034257748776508, "grad_norm": 0.1698395162820816, "learning_rate": 4.935604527520321e-05, "loss": 0.183, "num_input_tokens_seen": 43708000, "step": 20250 }, { "epoch": 3.304241435562806, "grad_norm": 0.170126810669899, "learning_rate": 4.935524245504026e-05, "loss": 0.0745, "num_input_tokens_seen": 43720416, "step": 20255 }, { "epoch": 3.305057096247961, "grad_norm": 0.09041358530521393, "learning_rate": 4.9354439141287436e-05, "loss": 0.1929, "num_input_tokens_seen": 43730080, "step": 20260 }, { "epoch": 3.3058727569331157, "grad_norm": 1.115763783454895, "learning_rate": 4.9353635333961e-05, "loss": 0.072, "num_input_tokens_seen": 43740896, "step": 20265 }, { "epoch": 3.306688417618271, "grad_norm": 1.8469187021255493, "learning_rate": 4.9352831033077244e-05, "loss": 0.1444, "num_input_tokens_seen": 43752128, "step": 20270 }, { "epoch": 3.3075040783034257, "grad_norm": 0.15132687985897064, "learning_rate": 4.9352026238652484e-05, "loss": 0.0685, "num_input_tokens_seen": 43763008, "step": 20275 }, { "epoch": 3.3083197389885806, "grad_norm": 0.568570077419281, "learning_rate": 4.9351220950703015e-05, "loss": 0.1843, "num_input_tokens_seen": 43774144, "step": 20280 }, { "epoch": 3.309135399673736, "grad_norm": 0.5660076141357422, "learning_rate": 4.9350415169245154e-05, "loss": 0.0804, "num_input_tokens_seen": 43785280, "step": 20285 }, { "epoch": 3.3099510603588906, "grad_norm": 0.10992153733968735, "learning_rate": 4.934960889429525e-05, "loss": 0.0576, "num_input_tokens_seen": 43796288, "step": 20290 }, { "epoch": 3.310766721044046, "grad_norm": 0.6000072360038757, "learning_rate": 4.934880212586963e-05, "loss": 0.0929, "num_input_tokens_seen": 43807392, "step": 20295 }, { "epoch": 3.3115823817292007, "grad_norm": 1.284555196762085, "learning_rate": 4.934799486398465e-05, "loss": 0.2501, "num_input_tokens_seen": 43819328, "step": 20300 }, { "epoch": 3.3123980424143555, "grad_norm": 0.35470589995384216, "learning_rate": 4.934718710865665e-05, "loss": 0.0655, "num_input_tokens_seen": 43829632, "step": 20305 }, { "epoch": 3.3132137030995104, "grad_norm": 0.3390854597091675, "learning_rate": 4.934637885990203e-05, "loss": 0.1344, "num_input_tokens_seen": 43839520, "step": 20310 }, { "epoch": 3.3140293637846656, "grad_norm": 1.365157127380371, "learning_rate": 4.934557011773716e-05, "loss": 0.0893, "num_input_tokens_seen": 43850048, "step": 20315 }, { "epoch": 3.3148450244698204, "grad_norm": 0.5240082144737244, "learning_rate": 4.934476088217842e-05, "loss": 0.0468, "num_input_tokens_seen": 43860480, "step": 20320 }, { "epoch": 3.3156606851549757, "grad_norm": 0.23375310003757477, "learning_rate": 4.934395115324222e-05, "loss": 0.0689, "num_input_tokens_seen": 43871456, "step": 20325 }, { "epoch": 3.3164763458401305, "grad_norm": 1.1596828699111938, "learning_rate": 4.9343140930944965e-05, "loss": 0.2215, "num_input_tokens_seen": 43882208, "step": 20330 }, { "epoch": 3.3172920065252853, "grad_norm": 0.024076975882053375, "learning_rate": 4.9342330215303075e-05, "loss": 0.0352, "num_input_tokens_seen": 43893568, "step": 20335 }, { "epoch": 3.3181076672104406, "grad_norm": 0.6348887085914612, "learning_rate": 4.9341519006332995e-05, "loss": 0.1929, "num_input_tokens_seen": 43904736, "step": 20340 }, { "epoch": 3.3189233278955954, "grad_norm": 0.6568289399147034, "learning_rate": 4.9340707304051136e-05, "loss": 0.0822, "num_input_tokens_seen": 43915232, "step": 20345 }, { "epoch": 3.3197389885807502, "grad_norm": 0.12987934052944183, "learning_rate": 4.933989510847398e-05, "loss": 0.2682, "num_input_tokens_seen": 43925664, "step": 20350 }, { "epoch": 3.3205546492659055, "grad_norm": 0.6635408401489258, "learning_rate": 4.933908241961797e-05, "loss": 0.1047, "num_input_tokens_seen": 43936736, "step": 20355 }, { "epoch": 3.3213703099510603, "grad_norm": 1.6445404291152954, "learning_rate": 4.9338269237499565e-05, "loss": 0.2664, "num_input_tokens_seen": 43948320, "step": 20360 }, { "epoch": 3.322185970636215, "grad_norm": 0.22988250851631165, "learning_rate": 4.933745556213527e-05, "loss": 0.0205, "num_input_tokens_seen": 43958304, "step": 20365 }, { "epoch": 3.3230016313213704, "grad_norm": 0.11142954230308533, "learning_rate": 4.933664139354156e-05, "loss": 0.1441, "num_input_tokens_seen": 43969120, "step": 20370 }, { "epoch": 3.323817292006525, "grad_norm": 0.17900297045707703, "learning_rate": 4.9335826731734945e-05, "loss": 0.0994, "num_input_tokens_seen": 43980768, "step": 20375 }, { "epoch": 3.3246329526916805, "grad_norm": 0.5475744605064392, "learning_rate": 4.933501157673192e-05, "loss": 0.1413, "num_input_tokens_seen": 43991360, "step": 20380 }, { "epoch": 3.3254486133768353, "grad_norm": 0.6194255948066711, "learning_rate": 4.933419592854902e-05, "loss": 0.075, "num_input_tokens_seen": 44002464, "step": 20385 }, { "epoch": 3.32626427406199, "grad_norm": 0.5152600407600403, "learning_rate": 4.933337978720277e-05, "loss": 0.0527, "num_input_tokens_seen": 44011264, "step": 20390 }, { "epoch": 3.3270799347471454, "grad_norm": 0.2088240385055542, "learning_rate": 4.933256315270971e-05, "loss": 0.0759, "num_input_tokens_seen": 44021632, "step": 20395 }, { "epoch": 3.3278955954323, "grad_norm": 0.055345334112644196, "learning_rate": 4.9331746025086385e-05, "loss": 0.1315, "num_input_tokens_seen": 44031872, "step": 20400 }, { "epoch": 3.328711256117455, "grad_norm": 0.499603271484375, "learning_rate": 4.933092840434936e-05, "loss": 0.2137, "num_input_tokens_seen": 44044064, "step": 20405 }, { "epoch": 3.3295269168026103, "grad_norm": 2.570492744445801, "learning_rate": 4.933011029051521e-05, "loss": 0.3386, "num_input_tokens_seen": 44055488, "step": 20410 }, { "epoch": 3.330342577487765, "grad_norm": 0.3573610186576843, "learning_rate": 4.932929168360051e-05, "loss": 0.0408, "num_input_tokens_seen": 44066496, "step": 20415 }, { "epoch": 3.33115823817292, "grad_norm": 0.11242088675498962, "learning_rate": 4.9328472583621846e-05, "loss": 0.1891, "num_input_tokens_seen": 44077664, "step": 20420 }, { "epoch": 3.331973898858075, "grad_norm": 0.31037670373916626, "learning_rate": 4.932765299059582e-05, "loss": 0.1564, "num_input_tokens_seen": 44089344, "step": 20425 }, { "epoch": 3.33278955954323, "grad_norm": 0.3771257996559143, "learning_rate": 4.932683290453905e-05, "loss": 0.0199, "num_input_tokens_seen": 44100288, "step": 20430 }, { "epoch": 3.3336052202283852, "grad_norm": 0.8764845132827759, "learning_rate": 4.932601232546815e-05, "loss": 0.0462, "num_input_tokens_seen": 44111040, "step": 20435 }, { "epoch": 3.33442088091354, "grad_norm": 0.2869792878627777, "learning_rate": 4.932519125339975e-05, "loss": 0.1163, "num_input_tokens_seen": 44122144, "step": 20440 }, { "epoch": 3.335236541598695, "grad_norm": 0.37279778718948364, "learning_rate": 4.9324369688350486e-05, "loss": 0.1343, "num_input_tokens_seen": 44133344, "step": 20445 }, { "epoch": 3.3360522022838497, "grad_norm": 1.0866787433624268, "learning_rate": 4.932354763033702e-05, "loss": 0.0881, "num_input_tokens_seen": 44143520, "step": 20450 }, { "epoch": 3.336867862969005, "grad_norm": 0.087325818836689, "learning_rate": 4.9322725079375996e-05, "loss": 0.0456, "num_input_tokens_seen": 44154496, "step": 20455 }, { "epoch": 3.3376835236541598, "grad_norm": 0.1798875629901886, "learning_rate": 4.93219020354841e-05, "loss": 0.03, "num_input_tokens_seen": 44165504, "step": 20460 }, { "epoch": 3.338499184339315, "grad_norm": 0.6157485246658325, "learning_rate": 4.9321078498677994e-05, "loss": 0.2202, "num_input_tokens_seen": 44177472, "step": 20465 }, { "epoch": 3.33931484502447, "grad_norm": 0.7097435593605042, "learning_rate": 4.932025446897439e-05, "loss": 0.1712, "num_input_tokens_seen": 44187712, "step": 20470 }, { "epoch": 3.3401305057096247, "grad_norm": 0.8837709426879883, "learning_rate": 4.931942994638998e-05, "loss": 0.3115, "num_input_tokens_seen": 44198816, "step": 20475 }, { "epoch": 3.34094616639478, "grad_norm": 1.4708235263824463, "learning_rate": 4.931860493094146e-05, "loss": 0.2689, "num_input_tokens_seen": 44209600, "step": 20480 }, { "epoch": 3.3417618270799347, "grad_norm": 1.2550053596496582, "learning_rate": 4.9317779422645575e-05, "loss": 0.1198, "num_input_tokens_seen": 44220768, "step": 20485 }, { "epoch": 3.3425774877650896, "grad_norm": 0.16184528172016144, "learning_rate": 4.931695342151902e-05, "loss": 0.1961, "num_input_tokens_seen": 44231936, "step": 20490 }, { "epoch": 3.343393148450245, "grad_norm": 0.17625391483306885, "learning_rate": 4.931612692757857e-05, "loss": 0.1391, "num_input_tokens_seen": 44242912, "step": 20495 }, { "epoch": 3.3442088091353996, "grad_norm": 0.300262451171875, "learning_rate": 4.9315299940840954e-05, "loss": 0.0982, "num_input_tokens_seen": 44254144, "step": 20500 }, { "epoch": 3.3450244698205545, "grad_norm": 0.41610395908355713, "learning_rate": 4.931447246132295e-05, "loss": 0.103, "num_input_tokens_seen": 44264832, "step": 20505 }, { "epoch": 3.3458401305057097, "grad_norm": 1.4647939205169678, "learning_rate": 4.93136444890413e-05, "loss": 0.1557, "num_input_tokens_seen": 44276800, "step": 20510 }, { "epoch": 3.3466557911908645, "grad_norm": 0.9289339184761047, "learning_rate": 4.931281602401281e-05, "loss": 0.1365, "num_input_tokens_seen": 44287584, "step": 20515 }, { "epoch": 3.34747145187602, "grad_norm": 1.0565778017044067, "learning_rate": 4.931198706625426e-05, "loss": 0.0853, "num_input_tokens_seen": 44298688, "step": 20520 }, { "epoch": 3.3482871125611746, "grad_norm": 0.3686668872833252, "learning_rate": 4.931115761578245e-05, "loss": 0.062, "num_input_tokens_seen": 44309184, "step": 20525 }, { "epoch": 3.3491027732463294, "grad_norm": 0.4433179199695587, "learning_rate": 4.931032767261419e-05, "loss": 0.2154, "num_input_tokens_seen": 44319872, "step": 20530 }, { "epoch": 3.3499184339314847, "grad_norm": 1.6032280921936035, "learning_rate": 4.93094972367663e-05, "loss": 0.1681, "num_input_tokens_seen": 44330432, "step": 20535 }, { "epoch": 3.3507340946166395, "grad_norm": 0.365969717502594, "learning_rate": 4.93086663082556e-05, "loss": 0.2318, "num_input_tokens_seen": 44341920, "step": 20540 }, { "epoch": 3.3515497553017943, "grad_norm": 0.8010940551757812, "learning_rate": 4.930783488709895e-05, "loss": 0.0769, "num_input_tokens_seen": 44352096, "step": 20545 }, { "epoch": 3.3523654159869496, "grad_norm": 0.9931228160858154, "learning_rate": 4.9307002973313186e-05, "loss": 0.1183, "num_input_tokens_seen": 44363456, "step": 20550 }, { "epoch": 3.3531810766721044, "grad_norm": 0.23592984676361084, "learning_rate": 4.930617056691518e-05, "loss": 0.0659, "num_input_tokens_seen": 44375168, "step": 20555 }, { "epoch": 3.3539967373572592, "grad_norm": 0.10971885919570923, "learning_rate": 4.930533766792178e-05, "loss": 0.0856, "num_input_tokens_seen": 44385824, "step": 20560 }, { "epoch": 3.3548123980424145, "grad_norm": 0.504206657409668, "learning_rate": 4.9304504276349886e-05, "loss": 0.2485, "num_input_tokens_seen": 44396736, "step": 20565 }, { "epoch": 3.3556280587275693, "grad_norm": 0.10495748370885849, "learning_rate": 4.930367039221638e-05, "loss": 0.1245, "num_input_tokens_seen": 44407648, "step": 20570 }, { "epoch": 3.356443719412724, "grad_norm": 0.4044572114944458, "learning_rate": 4.930283601553816e-05, "loss": 0.1193, "num_input_tokens_seen": 44418336, "step": 20575 }, { "epoch": 3.3572593800978794, "grad_norm": 1.9130940437316895, "learning_rate": 4.930200114633214e-05, "loss": 0.1753, "num_input_tokens_seen": 44428960, "step": 20580 }, { "epoch": 3.358075040783034, "grad_norm": 0.4159456491470337, "learning_rate": 4.930116578461523e-05, "loss": 0.0764, "num_input_tokens_seen": 44440608, "step": 20585 }, { "epoch": 3.358890701468189, "grad_norm": 0.11435119062662125, "learning_rate": 4.930032993040437e-05, "loss": 0.1447, "num_input_tokens_seen": 44451040, "step": 20590 }, { "epoch": 3.3597063621533443, "grad_norm": 0.9565105438232422, "learning_rate": 4.9299493583716505e-05, "loss": 0.1044, "num_input_tokens_seen": 44461792, "step": 20595 }, { "epoch": 3.360522022838499, "grad_norm": 0.07179048657417297, "learning_rate": 4.9298656744568574e-05, "loss": 0.0465, "num_input_tokens_seen": 44472000, "step": 20600 }, { "epoch": 3.3613376835236544, "grad_norm": 0.7455434203147888, "learning_rate": 4.929781941297753e-05, "loss": 0.0848, "num_input_tokens_seen": 44482752, "step": 20605 }, { "epoch": 3.362153344208809, "grad_norm": 0.2826980948448181, "learning_rate": 4.9296981588960366e-05, "loss": 0.1549, "num_input_tokens_seen": 44494624, "step": 20610 }, { "epoch": 3.362969004893964, "grad_norm": 1.2088862657546997, "learning_rate": 4.929614327253403e-05, "loss": 0.2028, "num_input_tokens_seen": 44504256, "step": 20615 }, { "epoch": 3.3637846655791193, "grad_norm": 0.0686989352107048, "learning_rate": 4.929530446371554e-05, "loss": 0.032, "num_input_tokens_seen": 44515680, "step": 20620 }, { "epoch": 3.364600326264274, "grad_norm": 0.6889645457267761, "learning_rate": 4.929446516252188e-05, "loss": 0.0934, "num_input_tokens_seen": 44527360, "step": 20625 }, { "epoch": 3.365415986949429, "grad_norm": 0.09433780610561371, "learning_rate": 4.929362536897007e-05, "loss": 0.16, "num_input_tokens_seen": 44537600, "step": 20630 }, { "epoch": 3.366231647634584, "grad_norm": 0.797422468662262, "learning_rate": 4.929278508307712e-05, "loss": 0.0441, "num_input_tokens_seen": 44548704, "step": 20635 }, { "epoch": 3.367047308319739, "grad_norm": 0.17734862864017487, "learning_rate": 4.929194430486007e-05, "loss": 0.1833, "num_input_tokens_seen": 44559584, "step": 20640 }, { "epoch": 3.367862969004894, "grad_norm": 0.5199423432350159, "learning_rate": 4.929110303433594e-05, "loss": 0.0215, "num_input_tokens_seen": 44568576, "step": 20645 }, { "epoch": 3.368678629690049, "grad_norm": 0.8720648288726807, "learning_rate": 4.9290261271521796e-05, "loss": 0.2411, "num_input_tokens_seen": 44579968, "step": 20650 }, { "epoch": 3.369494290375204, "grad_norm": 0.13628318905830383, "learning_rate": 4.928941901643469e-05, "loss": 0.2342, "num_input_tokens_seen": 44591552, "step": 20655 }, { "epoch": 3.370309951060359, "grad_norm": 0.5390905737876892, "learning_rate": 4.92885762690917e-05, "loss": 0.13, "num_input_tokens_seen": 44602016, "step": 20660 }, { "epoch": 3.371125611745514, "grad_norm": 0.07007692009210587, "learning_rate": 4.92877330295099e-05, "loss": 0.076, "num_input_tokens_seen": 44611936, "step": 20665 }, { "epoch": 3.3719412724306688, "grad_norm": 0.17196156084537506, "learning_rate": 4.928688929770637e-05, "loss": 0.1312, "num_input_tokens_seen": 44623232, "step": 20670 }, { "epoch": 3.3727569331158236, "grad_norm": 0.2037929743528366, "learning_rate": 4.928604507369822e-05, "loss": 0.0522, "num_input_tokens_seen": 44634592, "step": 20675 }, { "epoch": 3.373572593800979, "grad_norm": 0.026663076132535934, "learning_rate": 4.928520035750256e-05, "loss": 0.0467, "num_input_tokens_seen": 44645568, "step": 20680 }, { "epoch": 3.3743882544861337, "grad_norm": 2.2157719135284424, "learning_rate": 4.928435514913651e-05, "loss": 0.2174, "num_input_tokens_seen": 44656992, "step": 20685 }, { "epoch": 3.375203915171289, "grad_norm": 0.2704499065876007, "learning_rate": 4.928350944861719e-05, "loss": 0.0676, "num_input_tokens_seen": 44669248, "step": 20690 }, { "epoch": 3.3760195758564437, "grad_norm": 0.3681941628456116, "learning_rate": 4.9282663255961756e-05, "loss": 0.1031, "num_input_tokens_seen": 44680864, "step": 20695 }, { "epoch": 3.3768352365415986, "grad_norm": 0.1011476144194603, "learning_rate": 4.928181657118734e-05, "loss": 0.1653, "num_input_tokens_seen": 44691456, "step": 20700 }, { "epoch": 3.377650897226754, "grad_norm": 0.11486637592315674, "learning_rate": 4.92809693943111e-05, "loss": 0.0402, "num_input_tokens_seen": 44702336, "step": 20705 }, { "epoch": 3.3784665579119086, "grad_norm": 1.3302189111709595, "learning_rate": 4.928012172535023e-05, "loss": 0.3789, "num_input_tokens_seen": 44713728, "step": 20710 }, { "epoch": 3.3792822185970635, "grad_norm": 0.4676877558231354, "learning_rate": 4.9279273564321885e-05, "loss": 0.1157, "num_input_tokens_seen": 44723968, "step": 20715 }, { "epoch": 3.3800978792822187, "grad_norm": 0.11652758717536926, "learning_rate": 4.927842491124325e-05, "loss": 0.1028, "num_input_tokens_seen": 44734304, "step": 20720 }, { "epoch": 3.3809135399673735, "grad_norm": 1.178504467010498, "learning_rate": 4.9277575766131545e-05, "loss": 0.2052, "num_input_tokens_seen": 44745568, "step": 20725 }, { "epoch": 3.3817292006525284, "grad_norm": 0.7744405269622803, "learning_rate": 4.9276726129003965e-05, "loss": 0.0747, "num_input_tokens_seen": 44757216, "step": 20730 }, { "epoch": 3.3825448613376836, "grad_norm": 0.06369943916797638, "learning_rate": 4.927587599987773e-05, "loss": 0.0703, "num_input_tokens_seen": 44767808, "step": 20735 }, { "epoch": 3.3833605220228384, "grad_norm": 1.3416672945022583, "learning_rate": 4.927502537877008e-05, "loss": 0.1758, "num_input_tokens_seen": 44778208, "step": 20740 }, { "epoch": 3.3841761827079937, "grad_norm": 0.12421882152557373, "learning_rate": 4.9274174265698236e-05, "loss": 0.1258, "num_input_tokens_seen": 44788032, "step": 20745 }, { "epoch": 3.3849918433931485, "grad_norm": 1.1912999153137207, "learning_rate": 4.9273322660679466e-05, "loss": 0.2174, "num_input_tokens_seen": 44797440, "step": 20750 }, { "epoch": 3.3858075040783033, "grad_norm": 1.7778575420379639, "learning_rate": 4.927247056373101e-05, "loss": 0.2119, "num_input_tokens_seen": 44809248, "step": 20755 }, { "epoch": 3.3866231647634586, "grad_norm": 0.8096874952316284, "learning_rate": 4.927161797487015e-05, "loss": 0.1722, "num_input_tokens_seen": 44820768, "step": 20760 }, { "epoch": 3.3874388254486134, "grad_norm": 0.2676880359649658, "learning_rate": 4.9270764894114167e-05, "loss": 0.0522, "num_input_tokens_seen": 44832128, "step": 20765 }, { "epoch": 3.3882544861337682, "grad_norm": 0.586317241191864, "learning_rate": 4.926991132148034e-05, "loss": 0.1122, "num_input_tokens_seen": 44840992, "step": 20770 }, { "epoch": 3.3890701468189235, "grad_norm": 0.2974552810192108, "learning_rate": 4.9269057256985976e-05, "loss": 0.1095, "num_input_tokens_seen": 44851232, "step": 20775 }, { "epoch": 3.3898858075040783, "grad_norm": 0.15827816724777222, "learning_rate": 4.926820270064837e-05, "loss": 0.0509, "num_input_tokens_seen": 44860672, "step": 20780 }, { "epoch": 3.390701468189233, "grad_norm": 0.4761854410171509, "learning_rate": 4.926734765248485e-05, "loss": 0.0476, "num_input_tokens_seen": 44870688, "step": 20785 }, { "epoch": 3.3915171288743884, "grad_norm": 0.6542661190032959, "learning_rate": 4.926649211251275e-05, "loss": 0.0728, "num_input_tokens_seen": 44881760, "step": 20790 }, { "epoch": 3.392332789559543, "grad_norm": 0.33454617857933044, "learning_rate": 4.92656360807494e-05, "loss": 0.052, "num_input_tokens_seen": 44892896, "step": 20795 }, { "epoch": 3.393148450244698, "grad_norm": 1.2458750009536743, "learning_rate": 4.926477955721215e-05, "loss": 0.1413, "num_input_tokens_seen": 44903296, "step": 20800 }, { "epoch": 3.3939641109298533, "grad_norm": 0.42355960607528687, "learning_rate": 4.9263922541918366e-05, "loss": 0.1319, "num_input_tokens_seen": 44913664, "step": 20805 }, { "epoch": 3.394779771615008, "grad_norm": 1.2976281642913818, "learning_rate": 4.926306503488541e-05, "loss": 0.1891, "num_input_tokens_seen": 44924864, "step": 20810 }, { "epoch": 3.395595432300163, "grad_norm": 0.36249691247940063, "learning_rate": 4.926220703613066e-05, "loss": 0.1404, "num_input_tokens_seen": 44935008, "step": 20815 }, { "epoch": 3.396411092985318, "grad_norm": 0.035219769924879074, "learning_rate": 4.9261348545671506e-05, "loss": 0.1106, "num_input_tokens_seen": 44945280, "step": 20820 }, { "epoch": 3.397226753670473, "grad_norm": 0.190832257270813, "learning_rate": 4.926048956352534e-05, "loss": 0.0708, "num_input_tokens_seen": 44956064, "step": 20825 }, { "epoch": 3.3980424143556283, "grad_norm": 0.18741770088672638, "learning_rate": 4.9259630089709585e-05, "loss": 0.0888, "num_input_tokens_seen": 44967424, "step": 20830 }, { "epoch": 3.398858075040783, "grad_norm": 0.34962210059165955, "learning_rate": 4.925877012424164e-05, "loss": 0.1031, "num_input_tokens_seen": 44977952, "step": 20835 }, { "epoch": 3.399673735725938, "grad_norm": 0.1267140954732895, "learning_rate": 4.925790966713896e-05, "loss": 0.1275, "num_input_tokens_seen": 44988384, "step": 20840 }, { "epoch": 3.400489396411093, "grad_norm": 0.02182278037071228, "learning_rate": 4.9257048718418954e-05, "loss": 0.0404, "num_input_tokens_seen": 44998816, "step": 20845 }, { "epoch": 3.401305057096248, "grad_norm": 0.9959958791732788, "learning_rate": 4.9256187278099086e-05, "loss": 0.1565, "num_input_tokens_seen": 45010112, "step": 20850 }, { "epoch": 3.402120717781403, "grad_norm": 2.060236692428589, "learning_rate": 4.925532534619681e-05, "loss": 0.237, "num_input_tokens_seen": 45021184, "step": 20855 }, { "epoch": 3.402936378466558, "grad_norm": 0.660027265548706, "learning_rate": 4.92544629227296e-05, "loss": 0.2352, "num_input_tokens_seen": 45030976, "step": 20860 }, { "epoch": 3.403752039151713, "grad_norm": 1.1797107458114624, "learning_rate": 4.925360000771493e-05, "loss": 0.2124, "num_input_tokens_seen": 45042400, "step": 20865 }, { "epoch": 3.4045676998368677, "grad_norm": 0.0812731385231018, "learning_rate": 4.925273660117029e-05, "loss": 0.0986, "num_input_tokens_seen": 45052352, "step": 20870 }, { "epoch": 3.405383360522023, "grad_norm": 0.5809255242347717, "learning_rate": 4.9251872703113164e-05, "loss": 0.0386, "num_input_tokens_seen": 45063520, "step": 20875 }, { "epoch": 3.4061990212071778, "grad_norm": 0.9309435486793518, "learning_rate": 4.925100831356109e-05, "loss": 0.0797, "num_input_tokens_seen": 45074144, "step": 20880 }, { "epoch": 3.407014681892333, "grad_norm": 1.2278939485549927, "learning_rate": 4.9250143432531556e-05, "loss": 0.143, "num_input_tokens_seen": 45084704, "step": 20885 }, { "epoch": 3.407830342577488, "grad_norm": 0.9621807336807251, "learning_rate": 4.92492780600421e-05, "loss": 0.1053, "num_input_tokens_seen": 45095744, "step": 20890 }, { "epoch": 3.4086460032626427, "grad_norm": 0.23199523985385895, "learning_rate": 4.924841219611027e-05, "loss": 0.0907, "num_input_tokens_seen": 45105952, "step": 20895 }, { "epoch": 3.4094616639477975, "grad_norm": 0.23832257091999054, "learning_rate": 4.92475458407536e-05, "loss": 0.0337, "num_input_tokens_seen": 45118080, "step": 20900 }, { "epoch": 3.4102773246329527, "grad_norm": 0.04110715910792351, "learning_rate": 4.924667899398966e-05, "loss": 0.0368, "num_input_tokens_seen": 45129024, "step": 20905 }, { "epoch": 3.4110929853181076, "grad_norm": 2.2017691135406494, "learning_rate": 4.924581165583601e-05, "loss": 0.0676, "num_input_tokens_seen": 45139200, "step": 20910 }, { "epoch": 3.411908646003263, "grad_norm": 0.042746298015117645, "learning_rate": 4.9244943826310216e-05, "loss": 0.0749, "num_input_tokens_seen": 45150752, "step": 20915 }, { "epoch": 3.4127243066884176, "grad_norm": 0.20249050855636597, "learning_rate": 4.924407550542989e-05, "loss": 0.1757, "num_input_tokens_seen": 45162016, "step": 20920 }, { "epoch": 3.4135399673735725, "grad_norm": 0.35714587569236755, "learning_rate": 4.924320669321262e-05, "loss": 0.1134, "num_input_tokens_seen": 45171904, "step": 20925 }, { "epoch": 3.4143556280587277, "grad_norm": 2.0313148498535156, "learning_rate": 4.9242337389676005e-05, "loss": 0.2045, "num_input_tokens_seen": 45183712, "step": 20930 }, { "epoch": 3.4151712887438825, "grad_norm": 1.2277969121932983, "learning_rate": 4.924146759483767e-05, "loss": 0.0907, "num_input_tokens_seen": 45194656, "step": 20935 }, { "epoch": 3.4159869494290374, "grad_norm": 0.04852783679962158, "learning_rate": 4.924059730871524e-05, "loss": 0.1886, "num_input_tokens_seen": 45204800, "step": 20940 }, { "epoch": 3.4168026101141926, "grad_norm": 1.116856575012207, "learning_rate": 4.923972653132636e-05, "loss": 0.1413, "num_input_tokens_seen": 45215776, "step": 20945 }, { "epoch": 3.4176182707993474, "grad_norm": 0.30821672081947327, "learning_rate": 4.9238855262688665e-05, "loss": 0.0805, "num_input_tokens_seen": 45225952, "step": 20950 }, { "epoch": 3.4184339314845023, "grad_norm": 0.4447128474712372, "learning_rate": 4.923798350281983e-05, "loss": 0.0844, "num_input_tokens_seen": 45237312, "step": 20955 }, { "epoch": 3.4192495921696575, "grad_norm": 0.5265255570411682, "learning_rate": 4.92371112517375e-05, "loss": 0.0574, "num_input_tokens_seen": 45246912, "step": 20960 }, { "epoch": 3.4200652528548123, "grad_norm": 0.2088308185338974, "learning_rate": 4.923623850945936e-05, "loss": 0.0916, "num_input_tokens_seen": 45257440, "step": 20965 }, { "epoch": 3.4208809135399676, "grad_norm": 0.32016971707344055, "learning_rate": 4.923536527600312e-05, "loss": 0.1366, "num_input_tokens_seen": 45268416, "step": 20970 }, { "epoch": 3.4216965742251224, "grad_norm": 0.14470084011554718, "learning_rate": 4.9234491551386444e-05, "loss": 0.127, "num_input_tokens_seen": 45278848, "step": 20975 }, { "epoch": 3.4225122349102772, "grad_norm": 0.3285987079143524, "learning_rate": 4.9233617335627057e-05, "loss": 0.0461, "num_input_tokens_seen": 45291072, "step": 20980 }, { "epoch": 3.4233278955954325, "grad_norm": 0.5919251441955566, "learning_rate": 4.9232742628742665e-05, "loss": 0.1276, "num_input_tokens_seen": 45302336, "step": 20985 }, { "epoch": 3.4241435562805873, "grad_norm": 0.1753751039505005, "learning_rate": 4.923186743075101e-05, "loss": 0.0393, "num_input_tokens_seen": 45312864, "step": 20990 }, { "epoch": 3.424959216965742, "grad_norm": 0.5712305307388306, "learning_rate": 4.923099174166982e-05, "loss": 0.1704, "num_input_tokens_seen": 45323744, "step": 20995 }, { "epoch": 3.4257748776508974, "grad_norm": 0.4674236476421356, "learning_rate": 4.9230115561516846e-05, "loss": 0.1991, "num_input_tokens_seen": 45333760, "step": 21000 }, { "epoch": 3.426590538336052, "grad_norm": 0.6522476077079773, "learning_rate": 4.9229238890309836e-05, "loss": 0.2175, "num_input_tokens_seen": 45344896, "step": 21005 }, { "epoch": 3.427406199021207, "grad_norm": 1.7360479831695557, "learning_rate": 4.9228361728066564e-05, "loss": 0.122, "num_input_tokens_seen": 45354336, "step": 21010 }, { "epoch": 3.4282218597063623, "grad_norm": 0.10865502804517746, "learning_rate": 4.9227484074804805e-05, "loss": 0.2676, "num_input_tokens_seen": 45365376, "step": 21015 }, { "epoch": 3.429037520391517, "grad_norm": 0.7162960171699524, "learning_rate": 4.922660593054235e-05, "loss": 0.0727, "num_input_tokens_seen": 45376416, "step": 21020 }, { "epoch": 3.429853181076672, "grad_norm": 0.2447802871465683, "learning_rate": 4.922572729529699e-05, "loss": 0.1636, "num_input_tokens_seen": 45386272, "step": 21025 }, { "epoch": 3.430668841761827, "grad_norm": 0.053496118634939194, "learning_rate": 4.9224848169086534e-05, "loss": 0.0672, "num_input_tokens_seen": 45395936, "step": 21030 }, { "epoch": 3.431484502446982, "grad_norm": 0.09587855637073517, "learning_rate": 4.9223968551928804e-05, "loss": 0.0698, "num_input_tokens_seen": 45407680, "step": 21035 }, { "epoch": 3.432300163132137, "grad_norm": 1.3890730142593384, "learning_rate": 4.922308844384161e-05, "loss": 0.0602, "num_input_tokens_seen": 45417088, "step": 21040 }, { "epoch": 3.433115823817292, "grad_norm": 0.35637184977531433, "learning_rate": 4.92222078448428e-05, "loss": 0.0958, "num_input_tokens_seen": 45429536, "step": 21045 }, { "epoch": 3.433931484502447, "grad_norm": 0.12269078940153122, "learning_rate": 4.9221326754950234e-05, "loss": 0.0476, "num_input_tokens_seen": 45440320, "step": 21050 }, { "epoch": 3.434747145187602, "grad_norm": 0.9106990098953247, "learning_rate": 4.9220445174181745e-05, "loss": 0.1955, "num_input_tokens_seen": 45451008, "step": 21055 }, { "epoch": 3.435562805872757, "grad_norm": 0.6593723297119141, "learning_rate": 4.921956310255521e-05, "loss": 0.07, "num_input_tokens_seen": 45462432, "step": 21060 }, { "epoch": 3.436378466557912, "grad_norm": 1.9816616773605347, "learning_rate": 4.9218680540088504e-05, "loss": 0.1826, "num_input_tokens_seen": 45472544, "step": 21065 }, { "epoch": 3.437194127243067, "grad_norm": 0.23073381185531616, "learning_rate": 4.921779748679951e-05, "loss": 0.0791, "num_input_tokens_seen": 45483168, "step": 21070 }, { "epoch": 3.438009787928222, "grad_norm": 0.0877004936337471, "learning_rate": 4.9216913942706134e-05, "loss": 0.0314, "num_input_tokens_seen": 45492832, "step": 21075 }, { "epoch": 3.4388254486133767, "grad_norm": 1.2250310182571411, "learning_rate": 4.921602990782627e-05, "loss": 0.2698, "num_input_tokens_seen": 45504064, "step": 21080 }, { "epoch": 3.439641109298532, "grad_norm": 1.0154272317886353, "learning_rate": 4.921514538217784e-05, "loss": 0.1537, "num_input_tokens_seen": 45514336, "step": 21085 }, { "epoch": 3.4404567699836868, "grad_norm": 0.2562998831272125, "learning_rate": 4.9214260365778776e-05, "loss": 0.0946, "num_input_tokens_seen": 45524480, "step": 21090 }, { "epoch": 3.4412724306688416, "grad_norm": 2.8437559604644775, "learning_rate": 4.9213374858646996e-05, "loss": 0.297, "num_input_tokens_seen": 45535264, "step": 21095 }, { "epoch": 3.442088091353997, "grad_norm": 1.0652090311050415, "learning_rate": 4.921248886080046e-05, "loss": 0.0689, "num_input_tokens_seen": 45546560, "step": 21100 }, { "epoch": 3.4429037520391517, "grad_norm": 0.24715092778205872, "learning_rate": 4.921160237225714e-05, "loss": 0.0587, "num_input_tokens_seen": 45558016, "step": 21105 }, { "epoch": 3.443719412724307, "grad_norm": 0.15084248781204224, "learning_rate": 4.9210715393034964e-05, "loss": 0.1233, "num_input_tokens_seen": 45568512, "step": 21110 }, { "epoch": 3.4445350734094617, "grad_norm": 0.46105244755744934, "learning_rate": 4.920982792315193e-05, "loss": 0.1342, "num_input_tokens_seen": 45579968, "step": 21115 }, { "epoch": 3.4453507340946166, "grad_norm": 0.5575847625732422, "learning_rate": 4.9208939962626025e-05, "loss": 0.0728, "num_input_tokens_seen": 45592064, "step": 21120 }, { "epoch": 3.4461663947797714, "grad_norm": 0.11723986268043518, "learning_rate": 4.920805151147524e-05, "loss": 0.1283, "num_input_tokens_seen": 45603040, "step": 21125 }, { "epoch": 3.4469820554649266, "grad_norm": 0.12115319073200226, "learning_rate": 4.920716256971758e-05, "loss": 0.0475, "num_input_tokens_seen": 45613536, "step": 21130 }, { "epoch": 3.4477977161500815, "grad_norm": 0.3562172055244446, "learning_rate": 4.920627313737106e-05, "loss": 0.0731, "num_input_tokens_seen": 45625120, "step": 21135 }, { "epoch": 3.4486133768352367, "grad_norm": 2.353394031524658, "learning_rate": 4.920538321445371e-05, "loss": 0.1637, "num_input_tokens_seen": 45635744, "step": 21140 }, { "epoch": 3.4494290375203915, "grad_norm": 0.14208434522151947, "learning_rate": 4.920449280098356e-05, "loss": 0.1682, "num_input_tokens_seen": 45646432, "step": 21145 }, { "epoch": 3.4502446982055464, "grad_norm": 1.3083699941635132, "learning_rate": 4.920360189697866e-05, "loss": 0.1165, "num_input_tokens_seen": 45656576, "step": 21150 }, { "epoch": 3.4510603588907016, "grad_norm": 0.5978566408157349, "learning_rate": 4.920271050245706e-05, "loss": 0.2591, "num_input_tokens_seen": 45667904, "step": 21155 }, { "epoch": 3.4518760195758564, "grad_norm": 0.0809687077999115, "learning_rate": 4.920181861743683e-05, "loss": 0.039, "num_input_tokens_seen": 45678528, "step": 21160 }, { "epoch": 3.4526916802610113, "grad_norm": 1.5871587991714478, "learning_rate": 4.920092624193604e-05, "loss": 0.2186, "num_input_tokens_seen": 45689312, "step": 21165 }, { "epoch": 3.4535073409461665, "grad_norm": 0.878819465637207, "learning_rate": 4.9200033375972786e-05, "loss": 0.0492, "num_input_tokens_seen": 45700160, "step": 21170 }, { "epoch": 3.4543230016313213, "grad_norm": 2.0213193893432617, "learning_rate": 4.919914001956515e-05, "loss": 0.2384, "num_input_tokens_seen": 45710432, "step": 21175 }, { "epoch": 3.455138662316476, "grad_norm": 0.9824541211128235, "learning_rate": 4.9198246172731246e-05, "loss": 0.0521, "num_input_tokens_seen": 45721248, "step": 21180 }, { "epoch": 3.4559543230016314, "grad_norm": 0.48651352524757385, "learning_rate": 4.919735183548918e-05, "loss": 0.0984, "num_input_tokens_seen": 45732768, "step": 21185 }, { "epoch": 3.4567699836867862, "grad_norm": 0.15703308582305908, "learning_rate": 4.919645700785709e-05, "loss": 0.0281, "num_input_tokens_seen": 45743200, "step": 21190 }, { "epoch": 3.4575856443719415, "grad_norm": 0.23493210971355438, "learning_rate": 4.9195561689853096e-05, "loss": 0.1579, "num_input_tokens_seen": 45753376, "step": 21195 }, { "epoch": 3.4584013050570963, "grad_norm": 0.5266757011413574, "learning_rate": 4.919466588149536e-05, "loss": 0.0826, "num_input_tokens_seen": 45764064, "step": 21200 }, { "epoch": 3.459216965742251, "grad_norm": 1.1793731451034546, "learning_rate": 4.919376958280202e-05, "loss": 0.2203, "num_input_tokens_seen": 45774944, "step": 21205 }, { "epoch": 3.4600326264274064, "grad_norm": 0.6976194977760315, "learning_rate": 4.919287279379125e-05, "loss": 0.1873, "num_input_tokens_seen": 45785856, "step": 21210 }, { "epoch": 3.460848287112561, "grad_norm": 2.131650686264038, "learning_rate": 4.919197551448122e-05, "loss": 0.3744, "num_input_tokens_seen": 45796352, "step": 21215 }, { "epoch": 3.461663947797716, "grad_norm": 1.6144546270370483, "learning_rate": 4.919107774489012e-05, "loss": 0.0927, "num_input_tokens_seen": 45808160, "step": 21220 }, { "epoch": 3.4624796084828713, "grad_norm": 0.21935252845287323, "learning_rate": 4.9190179485036136e-05, "loss": 0.0652, "num_input_tokens_seen": 45819744, "step": 21225 }, { "epoch": 3.463295269168026, "grad_norm": 0.04006725922226906, "learning_rate": 4.918928073493748e-05, "loss": 0.1866, "num_input_tokens_seen": 45830048, "step": 21230 }, { "epoch": 3.464110929853181, "grad_norm": 1.3308347463607788, "learning_rate": 4.918838149461236e-05, "loss": 0.0984, "num_input_tokens_seen": 45841216, "step": 21235 }, { "epoch": 3.464926590538336, "grad_norm": 0.2210499346256256, "learning_rate": 4.918748176407901e-05, "loss": 0.078, "num_input_tokens_seen": 45850720, "step": 21240 }, { "epoch": 3.465742251223491, "grad_norm": 1.484281063079834, "learning_rate": 4.918658154335565e-05, "loss": 0.2369, "num_input_tokens_seen": 45862272, "step": 21245 }, { "epoch": 3.466557911908646, "grad_norm": 0.24084331095218658, "learning_rate": 4.918568083246054e-05, "loss": 0.2069, "num_input_tokens_seen": 45872576, "step": 21250 }, { "epoch": 3.467373572593801, "grad_norm": 0.06577011197805405, "learning_rate": 4.918477963141193e-05, "loss": 0.175, "num_input_tokens_seen": 45882816, "step": 21255 }, { "epoch": 3.468189233278956, "grad_norm": 1.3409090042114258, "learning_rate": 4.918387794022807e-05, "loss": 0.1755, "num_input_tokens_seen": 45893856, "step": 21260 }, { "epoch": 3.4690048939641107, "grad_norm": 1.28896963596344, "learning_rate": 4.918297575892725e-05, "loss": 0.1164, "num_input_tokens_seen": 45904128, "step": 21265 }, { "epoch": 3.469820554649266, "grad_norm": 0.9940065741539001, "learning_rate": 4.918207308752774e-05, "loss": 0.1625, "num_input_tokens_seen": 45914624, "step": 21270 }, { "epoch": 3.470636215334421, "grad_norm": 1.335495114326477, "learning_rate": 4.918116992604785e-05, "loss": 0.175, "num_input_tokens_seen": 45926432, "step": 21275 }, { "epoch": 3.471451876019576, "grad_norm": 0.9640886187553406, "learning_rate": 4.918026627450587e-05, "loss": 0.0727, "num_input_tokens_seen": 45936608, "step": 21280 }, { "epoch": 3.472267536704731, "grad_norm": 0.1474667489528656, "learning_rate": 4.9179362132920125e-05, "loss": 0.1671, "num_input_tokens_seen": 45948768, "step": 21285 }, { "epoch": 3.4730831973898857, "grad_norm": 0.03318638354539871, "learning_rate": 4.917845750130893e-05, "loss": 0.0708, "num_input_tokens_seen": 45958080, "step": 21290 }, { "epoch": 3.473898858075041, "grad_norm": 1.0662577152252197, "learning_rate": 4.917755237969062e-05, "loss": 0.1878, "num_input_tokens_seen": 45968768, "step": 21295 }, { "epoch": 3.4747145187601958, "grad_norm": 1.439199686050415, "learning_rate": 4.917664676808354e-05, "loss": 0.212, "num_input_tokens_seen": 45979360, "step": 21300 }, { "epoch": 3.4755301794453506, "grad_norm": 0.36989647150039673, "learning_rate": 4.917574066650604e-05, "loss": 0.0577, "num_input_tokens_seen": 45990880, "step": 21305 }, { "epoch": 3.476345840130506, "grad_norm": 0.03675192594528198, "learning_rate": 4.917483407497649e-05, "loss": 0.1678, "num_input_tokens_seen": 46001344, "step": 21310 }, { "epoch": 3.4771615008156607, "grad_norm": 1.3426170349121094, "learning_rate": 4.917392699351327e-05, "loss": 0.1054, "num_input_tokens_seen": 46013152, "step": 21315 }, { "epoch": 3.4779771615008155, "grad_norm": 0.29717519879341125, "learning_rate": 4.9173019422134736e-05, "loss": 0.1638, "num_input_tokens_seen": 46023424, "step": 21320 }, { "epoch": 3.4787928221859707, "grad_norm": 0.6890867352485657, "learning_rate": 4.91721113608593e-05, "loss": 0.0709, "num_input_tokens_seen": 46033568, "step": 21325 }, { "epoch": 3.4796084828711256, "grad_norm": 0.5649540424346924, "learning_rate": 4.9171202809705364e-05, "loss": 0.0543, "num_input_tokens_seen": 46045088, "step": 21330 }, { "epoch": 3.480424143556281, "grad_norm": 0.16652360558509827, "learning_rate": 4.9170293768691344e-05, "loss": 0.1201, "num_input_tokens_seen": 46056640, "step": 21335 }, { "epoch": 3.4812398042414356, "grad_norm": 0.2547214925289154, "learning_rate": 4.916938423783566e-05, "loss": 0.0995, "num_input_tokens_seen": 46067008, "step": 21340 }, { "epoch": 3.4820554649265905, "grad_norm": 0.08171091973781586, "learning_rate": 4.9168474217156734e-05, "loss": 0.0667, "num_input_tokens_seen": 46078048, "step": 21345 }, { "epoch": 3.4828711256117453, "grad_norm": 0.07674629241228104, "learning_rate": 4.916756370667303e-05, "loss": 0.1208, "num_input_tokens_seen": 46089568, "step": 21350 }, { "epoch": 3.4836867862969005, "grad_norm": 1.2676514387130737, "learning_rate": 4.916665270640298e-05, "loss": 0.3227, "num_input_tokens_seen": 46101024, "step": 21355 }, { "epoch": 3.4845024469820554, "grad_norm": 1.4517178535461426, "learning_rate": 4.916574121636506e-05, "loss": 0.1853, "num_input_tokens_seen": 46112064, "step": 21360 }, { "epoch": 3.4853181076672106, "grad_norm": 1.2320101261138916, "learning_rate": 4.916482923657774e-05, "loss": 0.1323, "num_input_tokens_seen": 46122080, "step": 21365 }, { "epoch": 3.4861337683523654, "grad_norm": 1.6223441362380981, "learning_rate": 4.9163916767059494e-05, "loss": 0.1835, "num_input_tokens_seen": 46132992, "step": 21370 }, { "epoch": 3.4869494290375203, "grad_norm": 1.6718087196350098, "learning_rate": 4.9163003807828825e-05, "loss": 0.1914, "num_input_tokens_seen": 46144576, "step": 21375 }, { "epoch": 3.4877650897226755, "grad_norm": 0.25454169511795044, "learning_rate": 4.9162090358904225e-05, "loss": 0.2482, "num_input_tokens_seen": 46155296, "step": 21380 }, { "epoch": 3.4885807504078303, "grad_norm": 2.130370855331421, "learning_rate": 4.9161176420304214e-05, "loss": 0.1637, "num_input_tokens_seen": 46166528, "step": 21385 }, { "epoch": 3.489396411092985, "grad_norm": 0.8201485872268677, "learning_rate": 4.9160261992047316e-05, "loss": 0.1623, "num_input_tokens_seen": 46176672, "step": 21390 }, { "epoch": 3.4902120717781404, "grad_norm": 0.2519601285457611, "learning_rate": 4.915934707415206e-05, "loss": 0.0766, "num_input_tokens_seen": 46186880, "step": 21395 }, { "epoch": 3.4910277324632952, "grad_norm": 1.358967900276184, "learning_rate": 4.9158431666636984e-05, "loss": 0.1496, "num_input_tokens_seen": 46196320, "step": 21400 }, { "epoch": 3.49184339314845, "grad_norm": 0.051032207906246185, "learning_rate": 4.915751576952064e-05, "loss": 0.0657, "num_input_tokens_seen": 46207040, "step": 21405 }, { "epoch": 3.4926590538336053, "grad_norm": 0.6579643487930298, "learning_rate": 4.91565993828216e-05, "loss": 0.1166, "num_input_tokens_seen": 46217248, "step": 21410 }, { "epoch": 3.49347471451876, "grad_norm": 0.2831958532333374, "learning_rate": 4.9155682506558423e-05, "loss": 0.1435, "num_input_tokens_seen": 46226688, "step": 21415 }, { "epoch": 3.4942903752039154, "grad_norm": 0.39995890855789185, "learning_rate": 4.9154765140749706e-05, "loss": 0.1472, "num_input_tokens_seen": 46236768, "step": 21420 }, { "epoch": 3.49510603588907, "grad_norm": 0.4011920988559723, "learning_rate": 4.915384728541402e-05, "loss": 0.0909, "num_input_tokens_seen": 46247872, "step": 21425 }, { "epoch": 3.495921696574225, "grad_norm": 1.0616942644119263, "learning_rate": 4.9152928940569974e-05, "loss": 0.2066, "num_input_tokens_seen": 46258688, "step": 21430 }, { "epoch": 3.4967373572593803, "grad_norm": 0.5784423351287842, "learning_rate": 4.915201010623619e-05, "loss": 0.1218, "num_input_tokens_seen": 46269504, "step": 21435 }, { "epoch": 3.497553017944535, "grad_norm": 0.8058150410652161, "learning_rate": 4.9151090782431286e-05, "loss": 0.1183, "num_input_tokens_seen": 46280928, "step": 21440 }, { "epoch": 3.49836867862969, "grad_norm": 0.09901560097932816, "learning_rate": 4.9150170969173886e-05, "loss": 0.1625, "num_input_tokens_seen": 46291648, "step": 21445 }, { "epoch": 3.499184339314845, "grad_norm": 1.0297198295593262, "learning_rate": 4.914925066648264e-05, "loss": 0.1156, "num_input_tokens_seen": 46302592, "step": 21450 }, { "epoch": 3.5, "grad_norm": 0.24719612300395966, "learning_rate": 4.914832987437618e-05, "loss": 0.135, "num_input_tokens_seen": 46313216, "step": 21455 }, { "epoch": 3.500815660685155, "grad_norm": 0.21549320220947266, "learning_rate": 4.914740859287319e-05, "loss": 0.2005, "num_input_tokens_seen": 46322656, "step": 21460 }, { "epoch": 3.50163132137031, "grad_norm": 0.5100536346435547, "learning_rate": 4.914648682199233e-05, "loss": 0.1081, "num_input_tokens_seen": 46332576, "step": 21465 }, { "epoch": 3.502446982055465, "grad_norm": 0.014011728577315807, "learning_rate": 4.9145564561752285e-05, "loss": 0.1575, "num_input_tokens_seen": 46343584, "step": 21470 }, { "epoch": 3.50326264274062, "grad_norm": 0.06753502041101456, "learning_rate": 4.9144641812171744e-05, "loss": 0.0363, "num_input_tokens_seen": 46354240, "step": 21475 }, { "epoch": 3.504078303425775, "grad_norm": 1.0516259670257568, "learning_rate": 4.91437185732694e-05, "loss": 0.1426, "num_input_tokens_seen": 46364736, "step": 21480 }, { "epoch": 3.50489396411093, "grad_norm": 0.11671498417854309, "learning_rate": 4.914279484506398e-05, "loss": 0.1978, "num_input_tokens_seen": 46374976, "step": 21485 }, { "epoch": 3.5057096247960846, "grad_norm": 1.3983601331710815, "learning_rate": 4.914187062757418e-05, "loss": 0.1768, "num_input_tokens_seen": 46386080, "step": 21490 }, { "epoch": 3.50652528548124, "grad_norm": 1.0579516887664795, "learning_rate": 4.914094592081876e-05, "loss": 0.1104, "num_input_tokens_seen": 46397152, "step": 21495 }, { "epoch": 3.5073409461663947, "grad_norm": 0.489810049533844, "learning_rate": 4.914002072481644e-05, "loss": 0.1172, "num_input_tokens_seen": 46408480, "step": 21500 }, { "epoch": 3.50815660685155, "grad_norm": 0.17876535654067993, "learning_rate": 4.913909503958598e-05, "loss": 0.1436, "num_input_tokens_seen": 46419712, "step": 21505 }, { "epoch": 3.5089722675367048, "grad_norm": 2.2291688919067383, "learning_rate": 4.913816886514614e-05, "loss": 0.231, "num_input_tokens_seen": 46431072, "step": 21510 }, { "epoch": 3.5097879282218596, "grad_norm": 0.31355342268943787, "learning_rate": 4.913724220151567e-05, "loss": 0.0897, "num_input_tokens_seen": 46441280, "step": 21515 }, { "epoch": 3.5106035889070144, "grad_norm": 0.24677103757858276, "learning_rate": 4.913631504871339e-05, "loss": 0.1693, "num_input_tokens_seen": 46451808, "step": 21520 }, { "epoch": 3.5114192495921697, "grad_norm": 1.9879478216171265, "learning_rate": 4.913538740675805e-05, "loss": 0.1883, "num_input_tokens_seen": 46463136, "step": 21525 }, { "epoch": 3.5122349102773245, "grad_norm": 0.4451484680175781, "learning_rate": 4.913445927566847e-05, "loss": 0.1371, "num_input_tokens_seen": 46473280, "step": 21530 }, { "epoch": 3.5130505709624797, "grad_norm": 0.9696035385131836, "learning_rate": 4.913353065546346e-05, "loss": 0.1068, "num_input_tokens_seen": 46484032, "step": 21535 }, { "epoch": 3.5138662316476346, "grad_norm": 0.12333637475967407, "learning_rate": 4.913260154616183e-05, "loss": 0.0412, "num_input_tokens_seen": 46495488, "step": 21540 }, { "epoch": 3.5146818923327894, "grad_norm": 0.9000478982925415, "learning_rate": 4.913167194778242e-05, "loss": 0.194, "num_input_tokens_seen": 46506944, "step": 21545 }, { "epoch": 3.5154975530179446, "grad_norm": 0.6587016582489014, "learning_rate": 4.9130741860344056e-05, "loss": 0.1397, "num_input_tokens_seen": 46517216, "step": 21550 }, { "epoch": 3.5163132137030995, "grad_norm": 1.4533073902130127, "learning_rate": 4.91298112838656e-05, "loss": 0.2081, "num_input_tokens_seen": 46528864, "step": 21555 }, { "epoch": 3.5171288743882547, "grad_norm": 0.2431551218032837, "learning_rate": 4.9128880218365916e-05, "loss": 0.1623, "num_input_tokens_seen": 46539072, "step": 21560 }, { "epoch": 3.5179445350734095, "grad_norm": 0.34753531217575073, "learning_rate": 4.9127948663863854e-05, "loss": 0.092, "num_input_tokens_seen": 46550016, "step": 21565 }, { "epoch": 3.5187601957585644, "grad_norm": 0.7114242315292358, "learning_rate": 4.912701662037831e-05, "loss": 0.1118, "num_input_tokens_seen": 46559968, "step": 21570 }, { "epoch": 3.519575856443719, "grad_norm": 0.5086819529533386, "learning_rate": 4.9126084087928166e-05, "loss": 0.0537, "num_input_tokens_seen": 46571456, "step": 21575 }, { "epoch": 3.5203915171288744, "grad_norm": 0.178904727101326, "learning_rate": 4.912515106653232e-05, "loss": 0.1023, "num_input_tokens_seen": 46583104, "step": 21580 }, { "epoch": 3.5212071778140293, "grad_norm": 0.6150252819061279, "learning_rate": 4.912421755620969e-05, "loss": 0.1617, "num_input_tokens_seen": 46592384, "step": 21585 }, { "epoch": 3.5220228384991845, "grad_norm": 0.6297048926353455, "learning_rate": 4.912328355697917e-05, "loss": 0.0661, "num_input_tokens_seen": 46603488, "step": 21590 }, { "epoch": 3.5228384991843393, "grad_norm": 0.0940733328461647, "learning_rate": 4.9122349068859716e-05, "loss": 0.1909, "num_input_tokens_seen": 46614528, "step": 21595 }, { "epoch": 3.523654159869494, "grad_norm": 0.7645249366760254, "learning_rate": 4.912141409187026e-05, "loss": 0.068, "num_input_tokens_seen": 46625920, "step": 21600 }, { "epoch": 3.5244698205546494, "grad_norm": 0.026764972135424614, "learning_rate": 4.912047862602975e-05, "loss": 0.0567, "num_input_tokens_seen": 46636288, "step": 21605 }, { "epoch": 3.5252854812398042, "grad_norm": 1.587603211402893, "learning_rate": 4.911954267135713e-05, "loss": 0.2229, "num_input_tokens_seen": 46645536, "step": 21610 }, { "epoch": 3.5261011419249595, "grad_norm": 0.14652010798454285, "learning_rate": 4.911860622787139e-05, "loss": 0.0649, "num_input_tokens_seen": 46656224, "step": 21615 }, { "epoch": 3.5269168026101143, "grad_norm": 1.0311726331710815, "learning_rate": 4.9117669295591494e-05, "loss": 0.0823, "num_input_tokens_seen": 46665920, "step": 21620 }, { "epoch": 3.527732463295269, "grad_norm": 0.45013999938964844, "learning_rate": 4.9116731874536434e-05, "loss": 0.0786, "num_input_tokens_seen": 46676864, "step": 21625 }, { "epoch": 3.528548123980424, "grad_norm": 0.09988947212696075, "learning_rate": 4.911579396472521e-05, "loss": 0.1384, "num_input_tokens_seen": 46688192, "step": 21630 }, { "epoch": 3.529363784665579, "grad_norm": 0.0818880945444107, "learning_rate": 4.911485556617683e-05, "loss": 0.1205, "num_input_tokens_seen": 46698848, "step": 21635 }, { "epoch": 3.530179445350734, "grad_norm": 1.4498074054718018, "learning_rate": 4.911391667891031e-05, "loss": 0.1169, "num_input_tokens_seen": 46709920, "step": 21640 }, { "epoch": 3.5309951060358893, "grad_norm": 0.7466558218002319, "learning_rate": 4.9112977302944674e-05, "loss": 0.1506, "num_input_tokens_seen": 46721152, "step": 21645 }, { "epoch": 3.531810766721044, "grad_norm": 0.546223521232605, "learning_rate": 4.911203743829896e-05, "loss": 0.0916, "num_input_tokens_seen": 46732320, "step": 21650 }, { "epoch": 3.532626427406199, "grad_norm": 0.15598686039447784, "learning_rate": 4.911109708499223e-05, "loss": 0.1471, "num_input_tokens_seen": 46744032, "step": 21655 }, { "epoch": 3.5334420880913537, "grad_norm": 0.6142042279243469, "learning_rate": 4.911015624304352e-05, "loss": 0.1724, "num_input_tokens_seen": 46755584, "step": 21660 }, { "epoch": 3.534257748776509, "grad_norm": 1.3723957538604736, "learning_rate": 4.910921491247192e-05, "loss": 0.1362, "num_input_tokens_seen": 46766368, "step": 21665 }, { "epoch": 3.535073409461664, "grad_norm": 0.399192750453949, "learning_rate": 4.910827309329649e-05, "loss": 0.1562, "num_input_tokens_seen": 46777920, "step": 21670 }, { "epoch": 3.535889070146819, "grad_norm": 0.08329088240861893, "learning_rate": 4.9107330785536314e-05, "loss": 0.197, "num_input_tokens_seen": 46789312, "step": 21675 }, { "epoch": 3.536704730831974, "grad_norm": 0.08235541731119156, "learning_rate": 4.91063879892105e-05, "loss": 0.0694, "num_input_tokens_seen": 46800736, "step": 21680 }, { "epoch": 3.5375203915171287, "grad_norm": 2.350956678390503, "learning_rate": 4.910544470433816e-05, "loss": 0.2502, "num_input_tokens_seen": 46812096, "step": 21685 }, { "epoch": 3.538336052202284, "grad_norm": 0.41069334745407104, "learning_rate": 4.91045009309384e-05, "loss": 0.2188, "num_input_tokens_seen": 46822624, "step": 21690 }, { "epoch": 3.539151712887439, "grad_norm": 2.338827133178711, "learning_rate": 4.910355666903035e-05, "loss": 0.2241, "num_input_tokens_seen": 46833760, "step": 21695 }, { "epoch": 3.539967373572594, "grad_norm": 1.1498494148254395, "learning_rate": 4.910261191863315e-05, "loss": 0.1764, "num_input_tokens_seen": 46845312, "step": 21700 }, { "epoch": 3.540783034257749, "grad_norm": 0.6733505129814148, "learning_rate": 4.9101666679765934e-05, "loss": 0.1244, "num_input_tokens_seen": 46855904, "step": 21705 }, { "epoch": 3.5415986949429037, "grad_norm": 0.3555249869823456, "learning_rate": 4.910072095244787e-05, "loss": 0.2118, "num_input_tokens_seen": 46865760, "step": 21710 }, { "epoch": 3.5424143556280585, "grad_norm": 0.3836488127708435, "learning_rate": 4.9099774736698126e-05, "loss": 0.0385, "num_input_tokens_seen": 46877888, "step": 21715 }, { "epoch": 3.5432300163132138, "grad_norm": 0.564711332321167, "learning_rate": 4.909882803253587e-05, "loss": 0.1036, "num_input_tokens_seen": 46889312, "step": 21720 }, { "epoch": 3.5440456769983686, "grad_norm": 1.3316755294799805, "learning_rate": 4.9097880839980295e-05, "loss": 0.1685, "num_input_tokens_seen": 46899392, "step": 21725 }, { "epoch": 3.544861337683524, "grad_norm": 0.5340169072151184, "learning_rate": 4.909693315905059e-05, "loss": 0.0845, "num_input_tokens_seen": 46910784, "step": 21730 }, { "epoch": 3.5456769983686787, "grad_norm": 0.42043593525886536, "learning_rate": 4.9095984989765976e-05, "loss": 0.2468, "num_input_tokens_seen": 46921920, "step": 21735 }, { "epoch": 3.5464926590538335, "grad_norm": 1.1721806526184082, "learning_rate": 4.909503633214565e-05, "loss": 0.1105, "num_input_tokens_seen": 46931776, "step": 21740 }, { "epoch": 3.5473083197389887, "grad_norm": 1.796076774597168, "learning_rate": 4.909408718620885e-05, "loss": 0.2466, "num_input_tokens_seen": 46942848, "step": 21745 }, { "epoch": 3.5481239804241436, "grad_norm": 0.08600377291440964, "learning_rate": 4.90931375519748e-05, "loss": 0.0444, "num_input_tokens_seen": 46953408, "step": 21750 }, { "epoch": 3.5489396411092984, "grad_norm": 0.34943556785583496, "learning_rate": 4.909218742946276e-05, "loss": 0.0762, "num_input_tokens_seen": 46964416, "step": 21755 }, { "epoch": 3.5497553017944536, "grad_norm": 0.18862368166446686, "learning_rate": 4.909123681869198e-05, "loss": 0.1152, "num_input_tokens_seen": 46974304, "step": 21760 }, { "epoch": 3.5505709624796085, "grad_norm": 0.25069770216941833, "learning_rate": 4.909028571968172e-05, "loss": 0.0796, "num_input_tokens_seen": 46984256, "step": 21765 }, { "epoch": 3.5513866231647633, "grad_norm": 0.5615801811218262, "learning_rate": 4.908933413245126e-05, "loss": 0.197, "num_input_tokens_seen": 46995424, "step": 21770 }, { "epoch": 3.5522022838499185, "grad_norm": 0.16498002409934998, "learning_rate": 4.908838205701988e-05, "loss": 0.2705, "num_input_tokens_seen": 47006880, "step": 21775 }, { "epoch": 3.5530179445350734, "grad_norm": 0.7990198135375977, "learning_rate": 4.908742949340689e-05, "loss": 0.1721, "num_input_tokens_seen": 47016864, "step": 21780 }, { "epoch": 3.5538336052202286, "grad_norm": 1.6254130601882935, "learning_rate": 4.9086476441631574e-05, "loss": 0.207, "num_input_tokens_seen": 47029248, "step": 21785 }, { "epoch": 3.5546492659053834, "grad_norm": 0.24589283764362335, "learning_rate": 4.9085522901713264e-05, "loss": 0.0804, "num_input_tokens_seen": 47040480, "step": 21790 }, { "epoch": 3.5554649265905383, "grad_norm": 0.3499661087989807, "learning_rate": 4.908456887367127e-05, "loss": 0.0761, "num_input_tokens_seen": 47051872, "step": 21795 }, { "epoch": 3.556280587275693, "grad_norm": 0.3557310998439789, "learning_rate": 4.908361435752494e-05, "loss": 0.0444, "num_input_tokens_seen": 47063008, "step": 21800 }, { "epoch": 3.5570962479608483, "grad_norm": 0.13736286759376526, "learning_rate": 4.908265935329361e-05, "loss": 0.1046, "num_input_tokens_seen": 47073792, "step": 21805 }, { "epoch": 3.557911908646003, "grad_norm": 0.17411018908023834, "learning_rate": 4.908170386099664e-05, "loss": 0.093, "num_input_tokens_seen": 47084832, "step": 21810 }, { "epoch": 3.5587275693311584, "grad_norm": 0.5836873650550842, "learning_rate": 4.9080747880653394e-05, "loss": 0.0439, "num_input_tokens_seen": 47096032, "step": 21815 }, { "epoch": 3.5595432300163132, "grad_norm": 0.15276670455932617, "learning_rate": 4.907979141228324e-05, "loss": 0.0942, "num_input_tokens_seen": 47106720, "step": 21820 }, { "epoch": 3.560358890701468, "grad_norm": 0.1600760817527771, "learning_rate": 4.9078834455905565e-05, "loss": 0.0732, "num_input_tokens_seen": 47118592, "step": 21825 }, { "epoch": 3.5611745513866233, "grad_norm": 2.3742423057556152, "learning_rate": 4.9077877011539764e-05, "loss": 0.1244, "num_input_tokens_seen": 47129792, "step": 21830 }, { "epoch": 3.561990212071778, "grad_norm": 0.3113363981246948, "learning_rate": 4.907691907920524e-05, "loss": 0.056, "num_input_tokens_seen": 47140224, "step": 21835 }, { "epoch": 3.5628058727569334, "grad_norm": 0.45663928985595703, "learning_rate": 4.907596065892141e-05, "loss": 0.0718, "num_input_tokens_seen": 47151904, "step": 21840 }, { "epoch": 3.563621533442088, "grad_norm": 1.025681495666504, "learning_rate": 4.907500175070769e-05, "loss": 0.0698, "num_input_tokens_seen": 47161920, "step": 21845 }, { "epoch": 3.564437194127243, "grad_norm": 0.16752757132053375, "learning_rate": 4.907404235458353e-05, "loss": 0.1601, "num_input_tokens_seen": 47172128, "step": 21850 }, { "epoch": 3.565252854812398, "grad_norm": 2.004420042037964, "learning_rate": 4.907308247056834e-05, "loss": 0.2342, "num_input_tokens_seen": 47182784, "step": 21855 }, { "epoch": 3.566068515497553, "grad_norm": 0.03431382030248642, "learning_rate": 4.9072122098681616e-05, "loss": 0.0377, "num_input_tokens_seen": 47194336, "step": 21860 }, { "epoch": 3.566884176182708, "grad_norm": 0.6060144901275635, "learning_rate": 4.907116123894279e-05, "loss": 0.1227, "num_input_tokens_seen": 47204832, "step": 21865 }, { "epoch": 3.567699836867863, "grad_norm": 0.6627119779586792, "learning_rate": 4.907019989137135e-05, "loss": 0.1664, "num_input_tokens_seen": 47215648, "step": 21870 }, { "epoch": 3.568515497553018, "grad_norm": 0.06678557395935059, "learning_rate": 4.906923805598677e-05, "loss": 0.1255, "num_input_tokens_seen": 47226080, "step": 21875 }, { "epoch": 3.569331158238173, "grad_norm": 0.6995757222175598, "learning_rate": 4.9068275732808546e-05, "loss": 0.1977, "num_input_tokens_seen": 47238112, "step": 21880 }, { "epoch": 3.5701468189233276, "grad_norm": 1.7430342435836792, "learning_rate": 4.906731292185618e-05, "loss": 0.1251, "num_input_tokens_seen": 47249216, "step": 21885 }, { "epoch": 3.570962479608483, "grad_norm": 0.05286649987101555, "learning_rate": 4.9066349623149185e-05, "loss": 0.0898, "num_input_tokens_seen": 47259296, "step": 21890 }, { "epoch": 3.5717781402936377, "grad_norm": 0.17474733293056488, "learning_rate": 4.906538583670709e-05, "loss": 0.1597, "num_input_tokens_seen": 47270336, "step": 21895 }, { "epoch": 3.572593800978793, "grad_norm": 0.9212033152580261, "learning_rate": 4.9064421562549425e-05, "loss": 0.0894, "num_input_tokens_seen": 47280736, "step": 21900 }, { "epoch": 3.573409461663948, "grad_norm": 1.454761028289795, "learning_rate": 4.906345680069572e-05, "loss": 0.1672, "num_input_tokens_seen": 47291328, "step": 21905 }, { "epoch": 3.5742251223491026, "grad_norm": 0.27025577425956726, "learning_rate": 4.906249155116555e-05, "loss": 0.1315, "num_input_tokens_seen": 47302368, "step": 21910 }, { "epoch": 3.575040783034258, "grad_norm": 1.980770468711853, "learning_rate": 4.906152581397845e-05, "loss": 0.0953, "num_input_tokens_seen": 47312768, "step": 21915 }, { "epoch": 3.5758564437194127, "grad_norm": 0.11731445044279099, "learning_rate": 4.906055958915401e-05, "loss": 0.0571, "num_input_tokens_seen": 47323520, "step": 21920 }, { "epoch": 3.576672104404568, "grad_norm": 1.7392977476119995, "learning_rate": 4.90595928767118e-05, "loss": 0.1811, "num_input_tokens_seen": 47334688, "step": 21925 }, { "epoch": 3.5774877650897228, "grad_norm": 1.2337753772735596, "learning_rate": 4.905862567667143e-05, "loss": 0.1315, "num_input_tokens_seen": 47345184, "step": 21930 }, { "epoch": 3.5783034257748776, "grad_norm": 0.5473402738571167, "learning_rate": 4.905765798905249e-05, "loss": 0.0582, "num_input_tokens_seen": 47356608, "step": 21935 }, { "epoch": 3.5791190864600324, "grad_norm": 1.7093538045883179, "learning_rate": 4.905668981387458e-05, "loss": 0.2946, "num_input_tokens_seen": 47367520, "step": 21940 }, { "epoch": 3.5799347471451877, "grad_norm": 0.5464286804199219, "learning_rate": 4.905572115115734e-05, "loss": 0.2101, "num_input_tokens_seen": 47376288, "step": 21945 }, { "epoch": 3.5807504078303425, "grad_norm": 0.4745708107948303, "learning_rate": 4.905475200092039e-05, "loss": 0.0638, "num_input_tokens_seen": 47386592, "step": 21950 }, { "epoch": 3.5815660685154977, "grad_norm": 1.184088945388794, "learning_rate": 4.905378236318338e-05, "loss": 0.2343, "num_input_tokens_seen": 47397792, "step": 21955 }, { "epoch": 3.5823817292006526, "grad_norm": 0.07583410292863846, "learning_rate": 4.9052812237965956e-05, "loss": 0.0312, "num_input_tokens_seen": 47409280, "step": 21960 }, { "epoch": 3.5831973898858074, "grad_norm": 0.05526482313871384, "learning_rate": 4.9051841625287774e-05, "loss": 0.0681, "num_input_tokens_seen": 47419680, "step": 21965 }, { "epoch": 3.5840130505709626, "grad_norm": 0.9894033670425415, "learning_rate": 4.905087052516851e-05, "loss": 0.1002, "num_input_tokens_seen": 47431296, "step": 21970 }, { "epoch": 3.5848287112561175, "grad_norm": 1.9677786827087402, "learning_rate": 4.904989893762785e-05, "loss": 0.1828, "num_input_tokens_seen": 47441824, "step": 21975 }, { "epoch": 3.5856443719412723, "grad_norm": 2.8318943977355957, "learning_rate": 4.9048926862685474e-05, "loss": 0.0627, "num_input_tokens_seen": 47452192, "step": 21980 }, { "epoch": 3.5864600326264275, "grad_norm": 1.3995014429092407, "learning_rate": 4.904795430036109e-05, "loss": 0.0922, "num_input_tokens_seen": 47461696, "step": 21985 }, { "epoch": 3.5872756933115824, "grad_norm": 0.29913845658302307, "learning_rate": 4.904698125067441e-05, "loss": 0.1543, "num_input_tokens_seen": 47472960, "step": 21990 }, { "epoch": 3.588091353996737, "grad_norm": 0.4062105715274811, "learning_rate": 4.9046007713645136e-05, "loss": 0.228, "num_input_tokens_seen": 47483712, "step": 21995 }, { "epoch": 3.5889070146818924, "grad_norm": 0.32364755868911743, "learning_rate": 4.9045033689293016e-05, "loss": 0.2079, "num_input_tokens_seen": 47493728, "step": 22000 }, { "epoch": 3.5897226753670473, "grad_norm": 0.164579376578331, "learning_rate": 4.904405917763779e-05, "loss": 0.0217, "num_input_tokens_seen": 47504928, "step": 22005 }, { "epoch": 3.5905383360522025, "grad_norm": 0.7003077268600464, "learning_rate": 4.90430841786992e-05, "loss": 0.1277, "num_input_tokens_seen": 47514880, "step": 22010 }, { "epoch": 3.5913539967373573, "grad_norm": 1.0309163331985474, "learning_rate": 4.904210869249701e-05, "loss": 0.0626, "num_input_tokens_seen": 47526688, "step": 22015 }, { "epoch": 3.592169657422512, "grad_norm": 0.13141769170761108, "learning_rate": 4.9041132719050987e-05, "loss": 0.0376, "num_input_tokens_seen": 47536768, "step": 22020 }, { "epoch": 3.592985318107667, "grad_norm": 0.06957880407571793, "learning_rate": 4.904015625838091e-05, "loss": 0.1667, "num_input_tokens_seen": 47548768, "step": 22025 }, { "epoch": 3.5938009787928222, "grad_norm": 0.07292338460683823, "learning_rate": 4.903917931050657e-05, "loss": 0.0973, "num_input_tokens_seen": 47559968, "step": 22030 }, { "epoch": 3.594616639477977, "grad_norm": 1.9557501077651978, "learning_rate": 4.903820187544776e-05, "loss": 0.136, "num_input_tokens_seen": 47571040, "step": 22035 }, { "epoch": 3.5954323001631323, "grad_norm": 0.7442717552185059, "learning_rate": 4.90372239532243e-05, "loss": 0.1152, "num_input_tokens_seen": 47582816, "step": 22040 }, { "epoch": 3.596247960848287, "grad_norm": 0.23692575097084045, "learning_rate": 4.9036245543856e-05, "loss": 0.0822, "num_input_tokens_seen": 47594112, "step": 22045 }, { "epoch": 3.597063621533442, "grad_norm": 0.41498860716819763, "learning_rate": 4.90352666473627e-05, "loss": 0.1453, "num_input_tokens_seen": 47604320, "step": 22050 }, { "epoch": 3.597879282218597, "grad_norm": 0.07991795986890793, "learning_rate": 4.903428726376422e-05, "loss": 0.1549, "num_input_tokens_seen": 47613472, "step": 22055 }, { "epoch": 3.598694942903752, "grad_norm": 2.1871445178985596, "learning_rate": 4.903330739308042e-05, "loss": 0.3132, "num_input_tokens_seen": 47624096, "step": 22060 }, { "epoch": 3.5995106035889073, "grad_norm": 1.261765480041504, "learning_rate": 4.903232703533116e-05, "loss": 0.0876, "num_input_tokens_seen": 47635456, "step": 22065 }, { "epoch": 3.600326264274062, "grad_norm": 1.043070912361145, "learning_rate": 4.9031346190536306e-05, "loss": 0.0538, "num_input_tokens_seen": 47644448, "step": 22070 }, { "epoch": 3.601141924959217, "grad_norm": 1.075421929359436, "learning_rate": 4.903036485871574e-05, "loss": 0.176, "num_input_tokens_seen": 47654144, "step": 22075 }, { "epoch": 3.6019575856443717, "grad_norm": 0.05546325445175171, "learning_rate": 4.9029383039889345e-05, "loss": 0.053, "num_input_tokens_seen": 47664672, "step": 22080 }, { "epoch": 3.602773246329527, "grad_norm": 1.506855845451355, "learning_rate": 4.9028400734077004e-05, "loss": 0.1307, "num_input_tokens_seen": 47676512, "step": 22085 }, { "epoch": 3.603588907014682, "grad_norm": 1.1175609827041626, "learning_rate": 4.902741794129866e-05, "loss": 0.2359, "num_input_tokens_seen": 47687168, "step": 22090 }, { "epoch": 3.604404567699837, "grad_norm": 2.204224109649658, "learning_rate": 4.90264346615742e-05, "loss": 0.2271, "num_input_tokens_seen": 47698208, "step": 22095 }, { "epoch": 3.605220228384992, "grad_norm": 0.6884468197822571, "learning_rate": 4.902545089492356e-05, "loss": 0.0799, "num_input_tokens_seen": 47708448, "step": 22100 }, { "epoch": 3.6060358890701467, "grad_norm": 0.556822657585144, "learning_rate": 4.9024466641366685e-05, "loss": 0.0979, "num_input_tokens_seen": 47718624, "step": 22105 }, { "epoch": 3.6068515497553015, "grad_norm": 1.3924988508224487, "learning_rate": 4.902348190092352e-05, "loss": 0.325, "num_input_tokens_seen": 47729184, "step": 22110 }, { "epoch": 3.607667210440457, "grad_norm": 0.8015451431274414, "learning_rate": 4.9022496673614006e-05, "loss": 0.1016, "num_input_tokens_seen": 47740992, "step": 22115 }, { "epoch": 3.6084828711256116, "grad_norm": 0.46785494685173035, "learning_rate": 4.9021510959458125e-05, "loss": 0.1503, "num_input_tokens_seen": 47751520, "step": 22120 }, { "epoch": 3.609298531810767, "grad_norm": 0.3539682626724243, "learning_rate": 4.902052475847586e-05, "loss": 0.1791, "num_input_tokens_seen": 47761920, "step": 22125 }, { "epoch": 3.6101141924959217, "grad_norm": 0.32748278975486755, "learning_rate": 4.901953807068718e-05, "loss": 0.2076, "num_input_tokens_seen": 47773376, "step": 22130 }, { "epoch": 3.6109298531810765, "grad_norm": 1.1772730350494385, "learning_rate": 4.90185508961121e-05, "loss": 0.0914, "num_input_tokens_seen": 47784064, "step": 22135 }, { "epoch": 3.6117455138662318, "grad_norm": 1.0425413846969604, "learning_rate": 4.9017563234770606e-05, "loss": 0.0675, "num_input_tokens_seen": 47794592, "step": 22140 }, { "epoch": 3.6125611745513866, "grad_norm": 0.6279290914535522, "learning_rate": 4.901657508668273e-05, "loss": 0.0944, "num_input_tokens_seen": 47805504, "step": 22145 }, { "epoch": 3.613376835236542, "grad_norm": 0.29264456033706665, "learning_rate": 4.901558645186849e-05, "loss": 0.1367, "num_input_tokens_seen": 47816128, "step": 22150 }, { "epoch": 3.6141924959216967, "grad_norm": 1.0637959241867065, "learning_rate": 4.901459733034792e-05, "loss": 0.2057, "num_input_tokens_seen": 47827264, "step": 22155 }, { "epoch": 3.6150081566068515, "grad_norm": 0.3634956181049347, "learning_rate": 4.9013607722141084e-05, "loss": 0.106, "num_input_tokens_seen": 47838208, "step": 22160 }, { "epoch": 3.6158238172920063, "grad_norm": 0.42940354347229004, "learning_rate": 4.901261762726801e-05, "loss": 0.31, "num_input_tokens_seen": 47848192, "step": 22165 }, { "epoch": 3.6166394779771616, "grad_norm": 0.3984166979789734, "learning_rate": 4.901162704574879e-05, "loss": 0.1099, "num_input_tokens_seen": 47858688, "step": 22170 }, { "epoch": 3.6174551386623164, "grad_norm": 0.30523011088371277, "learning_rate": 4.901063597760348e-05, "loss": 0.0232, "num_input_tokens_seen": 47870048, "step": 22175 }, { "epoch": 3.6182707993474716, "grad_norm": 1.4494048357009888, "learning_rate": 4.900964442285217e-05, "loss": 0.2257, "num_input_tokens_seen": 47881984, "step": 22180 }, { "epoch": 3.6190864600326265, "grad_norm": 0.6365559697151184, "learning_rate": 4.900865238151496e-05, "loss": 0.0592, "num_input_tokens_seen": 47893376, "step": 22185 }, { "epoch": 3.6199021207177813, "grad_norm": 0.35686561465263367, "learning_rate": 4.900765985361196e-05, "loss": 0.1473, "num_input_tokens_seen": 47903680, "step": 22190 }, { "epoch": 3.6207177814029365, "grad_norm": 1.4890305995941162, "learning_rate": 4.900666683916326e-05, "loss": 0.2072, "num_input_tokens_seen": 47914144, "step": 22195 }, { "epoch": 3.6215334420880914, "grad_norm": 0.20691226422786713, "learning_rate": 4.900567333818902e-05, "loss": 0.0193, "num_input_tokens_seen": 47924224, "step": 22200 }, { "epoch": 3.622349102773246, "grad_norm": 0.30627018213272095, "learning_rate": 4.900467935070934e-05, "loss": 0.0667, "num_input_tokens_seen": 47935264, "step": 22205 }, { "epoch": 3.6231647634584014, "grad_norm": 2.2872872352600098, "learning_rate": 4.900368487674439e-05, "loss": 0.109, "num_input_tokens_seen": 47946432, "step": 22210 }, { "epoch": 3.6239804241435563, "grad_norm": 0.17705579102039337, "learning_rate": 4.9002689916314314e-05, "loss": 0.0796, "num_input_tokens_seen": 47956608, "step": 22215 }, { "epoch": 3.624796084828711, "grad_norm": 0.35562509298324585, "learning_rate": 4.900169446943928e-05, "loss": 0.1886, "num_input_tokens_seen": 47967552, "step": 22220 }, { "epoch": 3.6256117455138663, "grad_norm": 1.4846993684768677, "learning_rate": 4.900069853613945e-05, "loss": 0.1873, "num_input_tokens_seen": 47977888, "step": 22225 }, { "epoch": 3.626427406199021, "grad_norm": 0.8446660041809082, "learning_rate": 4.899970211643503e-05, "loss": 0.0597, "num_input_tokens_seen": 47988576, "step": 22230 }, { "epoch": 3.6272430668841764, "grad_norm": 1.2072560787200928, "learning_rate": 4.8998705210346195e-05, "loss": 0.2846, "num_input_tokens_seen": 47999776, "step": 22235 }, { "epoch": 3.6280587275693312, "grad_norm": 0.054911211133003235, "learning_rate": 4.8997707817893156e-05, "loss": 0.173, "num_input_tokens_seen": 48010688, "step": 22240 }, { "epoch": 3.628874388254486, "grad_norm": 0.24793586134910583, "learning_rate": 4.899670993909612e-05, "loss": 0.029, "num_input_tokens_seen": 48021152, "step": 22245 }, { "epoch": 3.629690048939641, "grad_norm": 1.2096197605133057, "learning_rate": 4.899571157397532e-05, "loss": 0.2143, "num_input_tokens_seen": 48030176, "step": 22250 }, { "epoch": 3.630505709624796, "grad_norm": 1.1013888120651245, "learning_rate": 4.8994712722550986e-05, "loss": 0.1812, "num_input_tokens_seen": 48039232, "step": 22255 }, { "epoch": 3.631321370309951, "grad_norm": 0.12403422594070435, "learning_rate": 4.8993713384843355e-05, "loss": 0.1186, "num_input_tokens_seen": 48049984, "step": 22260 }, { "epoch": 3.632137030995106, "grad_norm": 0.18670982122421265, "learning_rate": 4.899271356087268e-05, "loss": 0.0809, "num_input_tokens_seen": 48061088, "step": 22265 }, { "epoch": 3.632952691680261, "grad_norm": 1.2386791706085205, "learning_rate": 4.899171325065923e-05, "loss": 0.1182, "num_input_tokens_seen": 48072704, "step": 22270 }, { "epoch": 3.633768352365416, "grad_norm": 0.1514204442501068, "learning_rate": 4.899071245422328e-05, "loss": 0.1059, "num_input_tokens_seen": 48082848, "step": 22275 }, { "epoch": 3.634584013050571, "grad_norm": 1.3343032598495483, "learning_rate": 4.8989711171585104e-05, "loss": 0.1338, "num_input_tokens_seen": 48094464, "step": 22280 }, { "epoch": 3.635399673735726, "grad_norm": 1.0495954751968384, "learning_rate": 4.8988709402765e-05, "loss": 0.1572, "num_input_tokens_seen": 48105088, "step": 22285 }, { "epoch": 3.636215334420881, "grad_norm": 1.4532068967819214, "learning_rate": 4.8987707147783266e-05, "loss": 0.1386, "num_input_tokens_seen": 48114432, "step": 22290 }, { "epoch": 3.637030995106036, "grad_norm": 0.882151186466217, "learning_rate": 4.898670440666022e-05, "loss": 0.0912, "num_input_tokens_seen": 48124288, "step": 22295 }, { "epoch": 3.637846655791191, "grad_norm": 0.8889555931091309, "learning_rate": 4.898570117941618e-05, "loss": 0.2478, "num_input_tokens_seen": 48134144, "step": 22300 }, { "epoch": 3.6386623164763456, "grad_norm": 0.22998471558094025, "learning_rate": 4.8984697466071474e-05, "loss": 0.108, "num_input_tokens_seen": 48144416, "step": 22305 }, { "epoch": 3.639477977161501, "grad_norm": 0.2782689034938812, "learning_rate": 4.8983693266646444e-05, "loss": 0.0562, "num_input_tokens_seen": 48155584, "step": 22310 }, { "epoch": 3.6402936378466557, "grad_norm": 2.4429948329925537, "learning_rate": 4.898268858116145e-05, "loss": 0.2546, "num_input_tokens_seen": 48167232, "step": 22315 }, { "epoch": 3.641109298531811, "grad_norm": 1.0080848932266235, "learning_rate": 4.898168340963685e-05, "loss": 0.2119, "num_input_tokens_seen": 48178208, "step": 22320 }, { "epoch": 3.641924959216966, "grad_norm": 0.15024414658546448, "learning_rate": 4.898067775209301e-05, "loss": 0.0428, "num_input_tokens_seen": 48188896, "step": 22325 }, { "epoch": 3.6427406199021206, "grad_norm": 1.38578462600708, "learning_rate": 4.897967160855031e-05, "loss": 0.3493, "num_input_tokens_seen": 48200000, "step": 22330 }, { "epoch": 3.6435562805872754, "grad_norm": 1.2224078178405762, "learning_rate": 4.897866497902914e-05, "loss": 0.1711, "num_input_tokens_seen": 48211200, "step": 22335 }, { "epoch": 3.6443719412724307, "grad_norm": 1.57594633102417, "learning_rate": 4.897765786354992e-05, "loss": 0.294, "num_input_tokens_seen": 48221504, "step": 22340 }, { "epoch": 3.6451876019575855, "grad_norm": 0.6341546773910522, "learning_rate": 4.897665026213304e-05, "loss": 0.1127, "num_input_tokens_seen": 48230976, "step": 22345 }, { "epoch": 3.6460032626427408, "grad_norm": 0.8553803563117981, "learning_rate": 4.897564217479892e-05, "loss": 0.0728, "num_input_tokens_seen": 48242368, "step": 22350 }, { "epoch": 3.6468189233278956, "grad_norm": 0.9147049188613892, "learning_rate": 4.8974633601568e-05, "loss": 0.1555, "num_input_tokens_seen": 48253696, "step": 22355 }, { "epoch": 3.6476345840130504, "grad_norm": 0.22291506826877594, "learning_rate": 4.8973624542460717e-05, "loss": 0.1033, "num_input_tokens_seen": 48264768, "step": 22360 }, { "epoch": 3.6484502446982057, "grad_norm": 0.29286694526672363, "learning_rate": 4.897261499749752e-05, "loss": 0.1933, "num_input_tokens_seen": 48275520, "step": 22365 }, { "epoch": 3.6492659053833605, "grad_norm": 0.3539755642414093, "learning_rate": 4.897160496669886e-05, "loss": 0.1315, "num_input_tokens_seen": 48284960, "step": 22370 }, { "epoch": 3.6500815660685157, "grad_norm": 0.5780181288719177, "learning_rate": 4.8970594450085226e-05, "loss": 0.0334, "num_input_tokens_seen": 48295840, "step": 22375 }, { "epoch": 3.6508972267536706, "grad_norm": 0.5650975108146667, "learning_rate": 4.8969583447677085e-05, "loss": 0.08, "num_input_tokens_seen": 48306880, "step": 22380 }, { "epoch": 3.6517128874388254, "grad_norm": 1.9426519870758057, "learning_rate": 4.8968571959494926e-05, "loss": 0.1465, "num_input_tokens_seen": 48318336, "step": 22385 }, { "epoch": 3.65252854812398, "grad_norm": 2.0340523719787598, "learning_rate": 4.896755998555925e-05, "loss": 0.1894, "num_input_tokens_seen": 48329152, "step": 22390 }, { "epoch": 3.6533442088091355, "grad_norm": 0.46811628341674805, "learning_rate": 4.8966547525890564e-05, "loss": 0.034, "num_input_tokens_seen": 48339136, "step": 22395 }, { "epoch": 3.6541598694942903, "grad_norm": 0.21975211799144745, "learning_rate": 4.89655345805094e-05, "loss": 0.1135, "num_input_tokens_seen": 48350208, "step": 22400 }, { "epoch": 3.6549755301794455, "grad_norm": 0.49361005425453186, "learning_rate": 4.896452114943626e-05, "loss": 0.2898, "num_input_tokens_seen": 48362016, "step": 22405 }, { "epoch": 3.6557911908646004, "grad_norm": 0.6949596405029297, "learning_rate": 4.8963507232691706e-05, "loss": 0.1233, "num_input_tokens_seen": 48371424, "step": 22410 }, { "epoch": 3.656606851549755, "grad_norm": 0.16071027517318726, "learning_rate": 4.8962492830296277e-05, "loss": 0.2454, "num_input_tokens_seen": 48383328, "step": 22415 }, { "epoch": 3.6574225122349104, "grad_norm": 0.14040163159370422, "learning_rate": 4.8961477942270526e-05, "loss": 0.0537, "num_input_tokens_seen": 48394304, "step": 22420 }, { "epoch": 3.6582381729200653, "grad_norm": 0.29874423146247864, "learning_rate": 4.896046256863503e-05, "loss": 0.0693, "num_input_tokens_seen": 48405376, "step": 22425 }, { "epoch": 3.65905383360522, "grad_norm": 1.2699129581451416, "learning_rate": 4.895944670941037e-05, "loss": 0.0978, "num_input_tokens_seen": 48416352, "step": 22430 }, { "epoch": 3.6598694942903753, "grad_norm": 0.46694210171699524, "learning_rate": 4.895843036461713e-05, "loss": 0.2182, "num_input_tokens_seen": 48427488, "step": 22435 }, { "epoch": 3.66068515497553, "grad_norm": 0.1847778558731079, "learning_rate": 4.89574135342759e-05, "loss": 0.0818, "num_input_tokens_seen": 48439520, "step": 22440 }, { "epoch": 3.661500815660685, "grad_norm": 0.8479392528533936, "learning_rate": 4.895639621840728e-05, "loss": 0.0721, "num_input_tokens_seen": 48449984, "step": 22445 }, { "epoch": 3.6623164763458402, "grad_norm": 2.1738247871398926, "learning_rate": 4.895537841703192e-05, "loss": 0.1779, "num_input_tokens_seen": 48460672, "step": 22450 }, { "epoch": 3.663132137030995, "grad_norm": 0.9920687675476074, "learning_rate": 4.8954360130170415e-05, "loss": 0.0859, "num_input_tokens_seen": 48471328, "step": 22455 }, { "epoch": 3.6639477977161503, "grad_norm": 0.05870937183499336, "learning_rate": 4.895334135784342e-05, "loss": 0.0444, "num_input_tokens_seen": 48481664, "step": 22460 }, { "epoch": 3.664763458401305, "grad_norm": 0.13771997392177582, "learning_rate": 4.8952322100071565e-05, "loss": 0.1357, "num_input_tokens_seen": 48492896, "step": 22465 }, { "epoch": 3.66557911908646, "grad_norm": 0.2860208749771118, "learning_rate": 4.8951302356875525e-05, "loss": 0.0512, "num_input_tokens_seen": 48503648, "step": 22470 }, { "epoch": 3.6663947797716148, "grad_norm": 0.4709968864917755, "learning_rate": 4.895028212827596e-05, "loss": 0.0675, "num_input_tokens_seen": 48514368, "step": 22475 }, { "epoch": 3.66721044045677, "grad_norm": 0.1778234839439392, "learning_rate": 4.894926141429355e-05, "loss": 0.135, "num_input_tokens_seen": 48525504, "step": 22480 }, { "epoch": 3.668026101141925, "grad_norm": 0.8897945284843445, "learning_rate": 4.8948240214948965e-05, "loss": 0.1163, "num_input_tokens_seen": 48534880, "step": 22485 }, { "epoch": 3.66884176182708, "grad_norm": 0.7474585771560669, "learning_rate": 4.894721853026292e-05, "loss": 0.1447, "num_input_tokens_seen": 48546656, "step": 22490 }, { "epoch": 3.669657422512235, "grad_norm": 0.07380279153585434, "learning_rate": 4.89461963602561e-05, "loss": 0.0564, "num_input_tokens_seen": 48557952, "step": 22495 }, { "epoch": 3.6704730831973897, "grad_norm": 0.9902423620223999, "learning_rate": 4.894517370494924e-05, "loss": 0.0973, "num_input_tokens_seen": 48568832, "step": 22500 }, { "epoch": 3.671288743882545, "grad_norm": 1.3712178468704224, "learning_rate": 4.894415056436306e-05, "loss": 0.1668, "num_input_tokens_seen": 48578784, "step": 22505 }, { "epoch": 3.6721044045677, "grad_norm": 0.07927725464105606, "learning_rate": 4.894312693851829e-05, "loss": 0.1265, "num_input_tokens_seen": 48590016, "step": 22510 }, { "epoch": 3.672920065252855, "grad_norm": 0.7819048762321472, "learning_rate": 4.894210282743568e-05, "loss": 0.1029, "num_input_tokens_seen": 48600608, "step": 22515 }, { "epoch": 3.67373572593801, "grad_norm": 0.0612020343542099, "learning_rate": 4.8941078231135984e-05, "loss": 0.0356, "num_input_tokens_seen": 48610912, "step": 22520 }, { "epoch": 3.6745513866231647, "grad_norm": 0.19723863899707794, "learning_rate": 4.8940053149639965e-05, "loss": 0.0677, "num_input_tokens_seen": 48622272, "step": 22525 }, { "epoch": 3.6753670473083195, "grad_norm": 0.13922931253910065, "learning_rate": 4.89390275829684e-05, "loss": 0.054, "num_input_tokens_seen": 48633376, "step": 22530 }, { "epoch": 3.676182707993475, "grad_norm": 0.9817789793014526, "learning_rate": 4.893800153114207e-05, "loss": 0.1954, "num_input_tokens_seen": 48644608, "step": 22535 }, { "epoch": 3.6769983686786296, "grad_norm": 0.4128097891807556, "learning_rate": 4.893697499418177e-05, "loss": 0.0748, "num_input_tokens_seen": 48656448, "step": 22540 }, { "epoch": 3.677814029363785, "grad_norm": 0.9295444488525391, "learning_rate": 4.8935947972108314e-05, "loss": 0.2421, "num_input_tokens_seen": 48667936, "step": 22545 }, { "epoch": 3.6786296900489397, "grad_norm": 0.5178166031837463, "learning_rate": 4.89349204649425e-05, "loss": 0.2252, "num_input_tokens_seen": 48678240, "step": 22550 }, { "epoch": 3.6794453507340945, "grad_norm": 1.446751594543457, "learning_rate": 4.893389247270517e-05, "loss": 0.1395, "num_input_tokens_seen": 48687296, "step": 22555 }, { "epoch": 3.6802610114192493, "grad_norm": 0.08168289065361023, "learning_rate": 4.8932863995417134e-05, "loss": 0.0471, "num_input_tokens_seen": 48698720, "step": 22560 }, { "epoch": 3.6810766721044046, "grad_norm": 1.0151851177215576, "learning_rate": 4.893183503309925e-05, "loss": 0.0654, "num_input_tokens_seen": 48709696, "step": 22565 }, { "epoch": 3.6818923327895594, "grad_norm": 0.589176595211029, "learning_rate": 4.893080558577238e-05, "loss": 0.0776, "num_input_tokens_seen": 48720032, "step": 22570 }, { "epoch": 3.6827079934747147, "grad_norm": 1.9252194166183472, "learning_rate": 4.892977565345736e-05, "loss": 0.1634, "num_input_tokens_seen": 48729664, "step": 22575 }, { "epoch": 3.6835236541598695, "grad_norm": 0.32503843307495117, "learning_rate": 4.892874523617509e-05, "loss": 0.0634, "num_input_tokens_seen": 48742336, "step": 22580 }, { "epoch": 3.6843393148450243, "grad_norm": 0.31660836935043335, "learning_rate": 4.892771433394644e-05, "loss": 0.0409, "num_input_tokens_seen": 48754048, "step": 22585 }, { "epoch": 3.6851549755301796, "grad_norm": 0.6614958643913269, "learning_rate": 4.8926682946792305e-05, "loss": 0.1097, "num_input_tokens_seen": 48765344, "step": 22590 }, { "epoch": 3.6859706362153344, "grad_norm": 0.6568289399147034, "learning_rate": 4.892565107473358e-05, "loss": 0.2101, "num_input_tokens_seen": 48777152, "step": 22595 }, { "epoch": 3.6867862969004896, "grad_norm": 0.13978077471256256, "learning_rate": 4.8924618717791195e-05, "loss": 0.163, "num_input_tokens_seen": 48786880, "step": 22600 }, { "epoch": 3.6876019575856445, "grad_norm": 0.04606278985738754, "learning_rate": 4.892358587598606e-05, "loss": 0.0216, "num_input_tokens_seen": 48796256, "step": 22605 }, { "epoch": 3.6884176182707993, "grad_norm": 0.18111427128314972, "learning_rate": 4.89225525493391e-05, "loss": 0.1509, "num_input_tokens_seen": 48808096, "step": 22610 }, { "epoch": 3.689233278955954, "grad_norm": 0.4360113739967346, "learning_rate": 4.892151873787127e-05, "loss": 0.118, "num_input_tokens_seen": 48818848, "step": 22615 }, { "epoch": 3.6900489396411094, "grad_norm": 0.2437628209590912, "learning_rate": 4.892048444160352e-05, "loss": 0.0616, "num_input_tokens_seen": 48831296, "step": 22620 }, { "epoch": 3.690864600326264, "grad_norm": 0.4167708456516266, "learning_rate": 4.891944966055679e-05, "loss": 0.1841, "num_input_tokens_seen": 48841536, "step": 22625 }, { "epoch": 3.6916802610114194, "grad_norm": 1.309451937675476, "learning_rate": 4.891841439475209e-05, "loss": 0.3214, "num_input_tokens_seen": 48853152, "step": 22630 }, { "epoch": 3.6924959216965743, "grad_norm": 0.17077195644378662, "learning_rate": 4.8917378644210366e-05, "loss": 0.1297, "num_input_tokens_seen": 48862688, "step": 22635 }, { "epoch": 3.693311582381729, "grad_norm": 0.4474990963935852, "learning_rate": 4.891634240895263e-05, "loss": 0.0811, "num_input_tokens_seen": 48874688, "step": 22640 }, { "epoch": 3.6941272430668843, "grad_norm": 0.5504024624824524, "learning_rate": 4.891530568899987e-05, "loss": 0.0886, "num_input_tokens_seen": 48885856, "step": 22645 }, { "epoch": 3.694942903752039, "grad_norm": 0.920478880405426, "learning_rate": 4.891426848437311e-05, "loss": 0.1925, "num_input_tokens_seen": 48896096, "step": 22650 }, { "epoch": 3.695758564437194, "grad_norm": 1.3577384948730469, "learning_rate": 4.8913230795093354e-05, "loss": 0.221, "num_input_tokens_seen": 48907328, "step": 22655 }, { "epoch": 3.6965742251223492, "grad_norm": 1.5273995399475098, "learning_rate": 4.891219262118164e-05, "loss": 0.1369, "num_input_tokens_seen": 48918304, "step": 22660 }, { "epoch": 3.697389885807504, "grad_norm": 0.26975759863853455, "learning_rate": 4.891115396265901e-05, "loss": 0.0722, "num_input_tokens_seen": 48929440, "step": 22665 }, { "epoch": 3.698205546492659, "grad_norm": 0.4501023292541504, "learning_rate": 4.891011481954651e-05, "loss": 0.0285, "num_input_tokens_seen": 48940256, "step": 22670 }, { "epoch": 3.699021207177814, "grad_norm": 1.0950194597244263, "learning_rate": 4.890907519186521e-05, "loss": 0.0808, "num_input_tokens_seen": 48949856, "step": 22675 }, { "epoch": 3.699836867862969, "grad_norm": 0.4983436167240143, "learning_rate": 4.890803507963615e-05, "loss": 0.0881, "num_input_tokens_seen": 48960768, "step": 22680 }, { "epoch": 3.700652528548124, "grad_norm": 1.9883530139923096, "learning_rate": 4.890699448288045e-05, "loss": 0.2205, "num_input_tokens_seen": 48971264, "step": 22685 }, { "epoch": 3.701468189233279, "grad_norm": 0.06012396886944771, "learning_rate": 4.890595340161916e-05, "loss": 0.1522, "num_input_tokens_seen": 48981632, "step": 22690 }, { "epoch": 3.702283849918434, "grad_norm": 1.2955865859985352, "learning_rate": 4.8904911835873414e-05, "loss": 0.2961, "num_input_tokens_seen": 48993024, "step": 22695 }, { "epoch": 3.7030995106035887, "grad_norm": 0.04786119982600212, "learning_rate": 4.8903869785664305e-05, "loss": 0.2156, "num_input_tokens_seen": 49004064, "step": 22700 }, { "epoch": 3.703915171288744, "grad_norm": 1.3344610929489136, "learning_rate": 4.890282725101294e-05, "loss": 0.1898, "num_input_tokens_seen": 49016032, "step": 22705 }, { "epoch": 3.7047308319738987, "grad_norm": 0.591400682926178, "learning_rate": 4.890178423194046e-05, "loss": 0.0825, "num_input_tokens_seen": 49026336, "step": 22710 }, { "epoch": 3.705546492659054, "grad_norm": 0.9789397120475769, "learning_rate": 4.8900740728468e-05, "loss": 0.1251, "num_input_tokens_seen": 49037312, "step": 22715 }, { "epoch": 3.706362153344209, "grad_norm": 0.3658468425273895, "learning_rate": 4.889969674061671e-05, "loss": 0.1405, "num_input_tokens_seen": 49048480, "step": 22720 }, { "epoch": 3.7071778140293636, "grad_norm": 1.191593885421753, "learning_rate": 4.8898652268407755e-05, "loss": 0.4075, "num_input_tokens_seen": 49059840, "step": 22725 }, { "epoch": 3.707993474714519, "grad_norm": 0.24334631860256195, "learning_rate": 4.889760731186228e-05, "loss": 0.0956, "num_input_tokens_seen": 49070720, "step": 22730 }, { "epoch": 3.7088091353996737, "grad_norm": 0.11515223979949951, "learning_rate": 4.889656187100149e-05, "loss": 0.0965, "num_input_tokens_seen": 49081408, "step": 22735 }, { "epoch": 3.709624796084829, "grad_norm": 1.6285715103149414, "learning_rate": 4.889551594584655e-05, "loss": 0.0984, "num_input_tokens_seen": 49092416, "step": 22740 }, { "epoch": 3.710440456769984, "grad_norm": 0.40964576601982117, "learning_rate": 4.889446953641867e-05, "loss": 0.1232, "num_input_tokens_seen": 49102464, "step": 22745 }, { "epoch": 3.7112561174551386, "grad_norm": 0.1459839940071106, "learning_rate": 4.889342264273905e-05, "loss": 0.1863, "num_input_tokens_seen": 49113920, "step": 22750 }, { "epoch": 3.7120717781402934, "grad_norm": 0.1368878185749054, "learning_rate": 4.889237526482891e-05, "loss": 0.1302, "num_input_tokens_seen": 49124096, "step": 22755 }, { "epoch": 3.7128874388254487, "grad_norm": 0.2112441062927246, "learning_rate": 4.889132740270947e-05, "loss": 0.0955, "num_input_tokens_seen": 49134048, "step": 22760 }, { "epoch": 3.7137030995106035, "grad_norm": 1.0117672681808472, "learning_rate": 4.8890279056401984e-05, "loss": 0.0605, "num_input_tokens_seen": 49144800, "step": 22765 }, { "epoch": 3.7145187601957588, "grad_norm": 1.2968419790267944, "learning_rate": 4.8889230225927677e-05, "loss": 0.2651, "num_input_tokens_seen": 49156128, "step": 22770 }, { "epoch": 3.7153344208809136, "grad_norm": 0.17960742115974426, "learning_rate": 4.8888180911307816e-05, "loss": 0.0907, "num_input_tokens_seen": 49167168, "step": 22775 }, { "epoch": 3.7161500815660684, "grad_norm": 0.1531333178281784, "learning_rate": 4.8887131112563665e-05, "loss": 0.0567, "num_input_tokens_seen": 49177248, "step": 22780 }, { "epoch": 3.7169657422512232, "grad_norm": 0.5153030753135681, "learning_rate": 4.8886080829716495e-05, "loss": 0.0378, "num_input_tokens_seen": 49189568, "step": 22785 }, { "epoch": 3.7177814029363785, "grad_norm": 0.9736325144767761, "learning_rate": 4.8885030062787604e-05, "loss": 0.1435, "num_input_tokens_seen": 49198912, "step": 22790 }, { "epoch": 3.7185970636215333, "grad_norm": 0.3169333040714264, "learning_rate": 4.888397881179827e-05, "loss": 0.2193, "num_input_tokens_seen": 49210112, "step": 22795 }, { "epoch": 3.7194127243066886, "grad_norm": 1.2147022485733032, "learning_rate": 4.8882927076769814e-05, "loss": 0.1874, "num_input_tokens_seen": 49221344, "step": 22800 }, { "epoch": 3.7202283849918434, "grad_norm": 0.036469582468271255, "learning_rate": 4.888187485772354e-05, "loss": 0.124, "num_input_tokens_seen": 49232064, "step": 22805 }, { "epoch": 3.721044045676998, "grad_norm": 0.897459864616394, "learning_rate": 4.8880822154680774e-05, "loss": 0.3195, "num_input_tokens_seen": 49242144, "step": 22810 }, { "epoch": 3.7218597063621535, "grad_norm": 0.5780821442604065, "learning_rate": 4.887976896766285e-05, "loss": 0.1154, "num_input_tokens_seen": 49253248, "step": 22815 }, { "epoch": 3.7226753670473083, "grad_norm": 1.380658507347107, "learning_rate": 4.8878715296691116e-05, "loss": 0.1409, "num_input_tokens_seen": 49264384, "step": 22820 }, { "epoch": 3.7234910277324635, "grad_norm": 0.12900365889072418, "learning_rate": 4.887766114178693e-05, "loss": 0.069, "num_input_tokens_seen": 49276416, "step": 22825 }, { "epoch": 3.7243066884176184, "grad_norm": 0.2912899851799011, "learning_rate": 4.887660650297164e-05, "loss": 0.0863, "num_input_tokens_seen": 49288800, "step": 22830 }, { "epoch": 3.725122349102773, "grad_norm": 1.9251227378845215, "learning_rate": 4.8875551380266635e-05, "loss": 0.1633, "num_input_tokens_seen": 49299328, "step": 22835 }, { "epoch": 3.725938009787928, "grad_norm": 1.3005656003952026, "learning_rate": 4.88744957736933e-05, "loss": 0.1666, "num_input_tokens_seen": 49311360, "step": 22840 }, { "epoch": 3.7267536704730833, "grad_norm": 0.23763789236545563, "learning_rate": 4.887343968327302e-05, "loss": 0.17, "num_input_tokens_seen": 49321632, "step": 22845 }, { "epoch": 3.727569331158238, "grad_norm": 0.6586543917655945, "learning_rate": 4.8872383109027184e-05, "loss": 0.0903, "num_input_tokens_seen": 49332256, "step": 22850 }, { "epoch": 3.7283849918433933, "grad_norm": 1.5298476219177246, "learning_rate": 4.8871326050977226e-05, "loss": 0.2102, "num_input_tokens_seen": 49343360, "step": 22855 }, { "epoch": 3.729200652528548, "grad_norm": 0.10752401500940323, "learning_rate": 4.887026850914457e-05, "loss": 0.124, "num_input_tokens_seen": 49354112, "step": 22860 }, { "epoch": 3.730016313213703, "grad_norm": 0.2790384292602539, "learning_rate": 4.886921048355064e-05, "loss": 0.1167, "num_input_tokens_seen": 49365120, "step": 22865 }, { "epoch": 3.7308319738988582, "grad_norm": 1.2431206703186035, "learning_rate": 4.8868151974216876e-05, "loss": 0.3038, "num_input_tokens_seen": 49377600, "step": 22870 }, { "epoch": 3.731647634584013, "grad_norm": 0.12943199276924133, "learning_rate": 4.886709298116473e-05, "loss": 0.0814, "num_input_tokens_seen": 49387648, "step": 22875 }, { "epoch": 3.732463295269168, "grad_norm": 1.8992083072662354, "learning_rate": 4.8866033504415675e-05, "loss": 0.1903, "num_input_tokens_seen": 49398368, "step": 22880 }, { "epoch": 3.733278955954323, "grad_norm": 1.1680967807769775, "learning_rate": 4.886497354399117e-05, "loss": 0.1742, "num_input_tokens_seen": 49408416, "step": 22885 }, { "epoch": 3.734094616639478, "grad_norm": 0.617368757724762, "learning_rate": 4.886391309991271e-05, "loss": 0.2385, "num_input_tokens_seen": 49417888, "step": 22890 }, { "epoch": 3.7349102773246328, "grad_norm": 0.5306338667869568, "learning_rate": 4.886285217220176e-05, "loss": 0.2167, "num_input_tokens_seen": 49427296, "step": 22895 }, { "epoch": 3.735725938009788, "grad_norm": 0.22906161844730377, "learning_rate": 4.886179076087985e-05, "loss": 0.1578, "num_input_tokens_seen": 49437984, "step": 22900 }, { "epoch": 3.736541598694943, "grad_norm": 0.9673433303833008, "learning_rate": 4.886072886596847e-05, "loss": 0.164, "num_input_tokens_seen": 49448288, "step": 22905 }, { "epoch": 3.737357259380098, "grad_norm": 0.11228462308645248, "learning_rate": 4.885966648748916e-05, "loss": 0.1224, "num_input_tokens_seen": 49458624, "step": 22910 }, { "epoch": 3.738172920065253, "grad_norm": 1.1978957653045654, "learning_rate": 4.885860362546344e-05, "loss": 0.1636, "num_input_tokens_seen": 49468032, "step": 22915 }, { "epoch": 3.7389885807504077, "grad_norm": 0.30993372201919556, "learning_rate": 4.885754027991285e-05, "loss": 0.0427, "num_input_tokens_seen": 49479744, "step": 22920 }, { "epoch": 3.7398042414355626, "grad_norm": 1.412005066871643, "learning_rate": 4.885647645085894e-05, "loss": 0.1657, "num_input_tokens_seen": 49490688, "step": 22925 }, { "epoch": 3.740619902120718, "grad_norm": 0.2547204792499542, "learning_rate": 4.885541213832328e-05, "loss": 0.0923, "num_input_tokens_seen": 49499936, "step": 22930 }, { "epoch": 3.7414355628058726, "grad_norm": 0.07014963775873184, "learning_rate": 4.885434734232741e-05, "loss": 0.0476, "num_input_tokens_seen": 49510080, "step": 22935 }, { "epoch": 3.742251223491028, "grad_norm": 0.6474753618240356, "learning_rate": 4.885328206289295e-05, "loss": 0.1077, "num_input_tokens_seen": 49521056, "step": 22940 }, { "epoch": 3.7430668841761827, "grad_norm": 0.3119511902332306, "learning_rate": 4.8852216300041455e-05, "loss": 0.0498, "num_input_tokens_seen": 49531712, "step": 22945 }, { "epoch": 3.7438825448613375, "grad_norm": 0.70545893907547, "learning_rate": 4.8851150053794535e-05, "loss": 0.1144, "num_input_tokens_seen": 49542912, "step": 22950 }, { "epoch": 3.744698205546493, "grad_norm": 0.28534600138664246, "learning_rate": 4.885008332417381e-05, "loss": 0.0281, "num_input_tokens_seen": 49553888, "step": 22955 }, { "epoch": 3.7455138662316476, "grad_norm": 0.460050493478775, "learning_rate": 4.8849016111200894e-05, "loss": 0.1056, "num_input_tokens_seen": 49565376, "step": 22960 }, { "epoch": 3.746329526916803, "grad_norm": 0.578345537185669, "learning_rate": 4.884794841489741e-05, "loss": 0.1351, "num_input_tokens_seen": 49577184, "step": 22965 }, { "epoch": 3.7471451876019577, "grad_norm": 0.8792330026626587, "learning_rate": 4.884688023528499e-05, "loss": 0.0593, "num_input_tokens_seen": 49588608, "step": 22970 }, { "epoch": 3.7479608482871125, "grad_norm": 0.46271198987960815, "learning_rate": 4.8845811572385295e-05, "loss": 0.0413, "num_input_tokens_seen": 49599840, "step": 22975 }, { "epoch": 3.7487765089722673, "grad_norm": 0.17317412793636322, "learning_rate": 4.884474242621998e-05, "loss": 0.2299, "num_input_tokens_seen": 49611040, "step": 22980 }, { "epoch": 3.7495921696574226, "grad_norm": 0.34310367703437805, "learning_rate": 4.8843672796810715e-05, "loss": 0.0414, "num_input_tokens_seen": 49621472, "step": 22985 }, { "epoch": 3.7504078303425774, "grad_norm": 0.19318822026252747, "learning_rate": 4.8842602684179165e-05, "loss": 0.1353, "num_input_tokens_seen": 49632832, "step": 22990 }, { "epoch": 3.7512234910277327, "grad_norm": 1.161110520362854, "learning_rate": 4.8841532088347025e-05, "loss": 0.3456, "num_input_tokens_seen": 49643136, "step": 22995 }, { "epoch": 3.7520391517128875, "grad_norm": 0.5706380009651184, "learning_rate": 4.884046100933599e-05, "loss": 0.1439, "num_input_tokens_seen": 49653920, "step": 23000 }, { "epoch": 3.7528548123980423, "grad_norm": 0.0915636271238327, "learning_rate": 4.883938944716778e-05, "loss": 0.0833, "num_input_tokens_seen": 49664992, "step": 23005 }, { "epoch": 3.753670473083197, "grad_norm": 0.12041759490966797, "learning_rate": 4.8838317401864085e-05, "loss": 0.0312, "num_input_tokens_seen": 49676000, "step": 23010 }, { "epoch": 3.7544861337683524, "grad_norm": 0.27119436860084534, "learning_rate": 4.883724487344666e-05, "loss": 0.0615, "num_input_tokens_seen": 49687712, "step": 23015 }, { "epoch": 3.755301794453507, "grad_norm": 1.1880978345870972, "learning_rate": 4.883617186193722e-05, "loss": 0.0795, "num_input_tokens_seen": 49698944, "step": 23020 }, { "epoch": 3.7561174551386625, "grad_norm": 0.07419437915086746, "learning_rate": 4.883509836735752e-05, "loss": 0.108, "num_input_tokens_seen": 49710048, "step": 23025 }, { "epoch": 3.7569331158238173, "grad_norm": 0.539963960647583, "learning_rate": 4.8834024389729325e-05, "loss": 0.1365, "num_input_tokens_seen": 49720672, "step": 23030 }, { "epoch": 3.757748776508972, "grad_norm": 0.2769334614276886, "learning_rate": 4.8832949929074375e-05, "loss": 0.1063, "num_input_tokens_seen": 49731776, "step": 23035 }, { "epoch": 3.7585644371941274, "grad_norm": 0.24475081264972687, "learning_rate": 4.883187498541447e-05, "loss": 0.1736, "num_input_tokens_seen": 49742336, "step": 23040 }, { "epoch": 3.759380097879282, "grad_norm": 2.0424110889434814, "learning_rate": 4.883079955877138e-05, "loss": 0.2943, "num_input_tokens_seen": 49753408, "step": 23045 }, { "epoch": 3.7601957585644374, "grad_norm": 0.0859113559126854, "learning_rate": 4.882972364916691e-05, "loss": 0.0905, "num_input_tokens_seen": 49764192, "step": 23050 }, { "epoch": 3.7610114192495923, "grad_norm": 0.5445200204849243, "learning_rate": 4.882864725662286e-05, "loss": 0.0647, "num_input_tokens_seen": 49775840, "step": 23055 }, { "epoch": 3.761827079934747, "grad_norm": 0.11107270419597626, "learning_rate": 4.882757038116105e-05, "loss": 0.0772, "num_input_tokens_seen": 49785408, "step": 23060 }, { "epoch": 3.762642740619902, "grad_norm": 1.342705488204956, "learning_rate": 4.882649302280329e-05, "loss": 0.2499, "num_input_tokens_seen": 49796192, "step": 23065 }, { "epoch": 3.763458401305057, "grad_norm": 0.12053635716438293, "learning_rate": 4.882541518157142e-05, "loss": 0.0545, "num_input_tokens_seen": 49807296, "step": 23070 }, { "epoch": 3.764274061990212, "grad_norm": 1.533384084701538, "learning_rate": 4.88243368574873e-05, "loss": 0.2138, "num_input_tokens_seen": 49818368, "step": 23075 }, { "epoch": 3.7650897226753672, "grad_norm": 0.09707040339708328, "learning_rate": 4.882325805057275e-05, "loss": 0.1707, "num_input_tokens_seen": 49830368, "step": 23080 }, { "epoch": 3.765905383360522, "grad_norm": 1.0525099039077759, "learning_rate": 4.882217876084967e-05, "loss": 0.1949, "num_input_tokens_seen": 49840960, "step": 23085 }, { "epoch": 3.766721044045677, "grad_norm": 0.2490617036819458, "learning_rate": 4.882109898833991e-05, "loss": 0.32, "num_input_tokens_seen": 49851776, "step": 23090 }, { "epoch": 3.767536704730832, "grad_norm": 0.44872549176216125, "learning_rate": 4.882001873306536e-05, "loss": 0.1727, "num_input_tokens_seen": 49862336, "step": 23095 }, { "epoch": 3.768352365415987, "grad_norm": 1.6113433837890625, "learning_rate": 4.881893799504792e-05, "loss": 0.26, "num_input_tokens_seen": 49874208, "step": 23100 }, { "epoch": 3.7691680261011418, "grad_norm": 0.9030105471611023, "learning_rate": 4.8817856774309475e-05, "loss": 0.053, "num_input_tokens_seen": 49885440, "step": 23105 }, { "epoch": 3.769983686786297, "grad_norm": 0.5320064425468445, "learning_rate": 4.881677507087195e-05, "loss": 0.0985, "num_input_tokens_seen": 49896832, "step": 23110 }, { "epoch": 3.770799347471452, "grad_norm": 0.4255612790584564, "learning_rate": 4.8815692884757266e-05, "loss": 0.1085, "num_input_tokens_seen": 49907008, "step": 23115 }, { "epoch": 3.7716150081566067, "grad_norm": 0.5270166397094727, "learning_rate": 4.8814610215987356e-05, "loss": 0.0588, "num_input_tokens_seen": 49917600, "step": 23120 }, { "epoch": 3.772430668841762, "grad_norm": 0.8512138724327087, "learning_rate": 4.881352706458416e-05, "loss": 0.1345, "num_input_tokens_seen": 49929280, "step": 23125 }, { "epoch": 3.7732463295269167, "grad_norm": 1.4486632347106934, "learning_rate": 4.881244343056962e-05, "loss": 0.112, "num_input_tokens_seen": 49940384, "step": 23130 }, { "epoch": 3.774061990212072, "grad_norm": 0.029756590723991394, "learning_rate": 4.881135931396572e-05, "loss": 0.2112, "num_input_tokens_seen": 49949696, "step": 23135 }, { "epoch": 3.774877650897227, "grad_norm": 1.0294859409332275, "learning_rate": 4.88102747147944e-05, "loss": 0.1731, "num_input_tokens_seen": 49959680, "step": 23140 }, { "epoch": 3.7756933115823816, "grad_norm": 0.6388442516326904, "learning_rate": 4.8809189633077666e-05, "loss": 0.1452, "num_input_tokens_seen": 49970592, "step": 23145 }, { "epoch": 3.7765089722675365, "grad_norm": 0.593058705329895, "learning_rate": 4.88081040688375e-05, "loss": 0.2761, "num_input_tokens_seen": 49981440, "step": 23150 }, { "epoch": 3.7773246329526917, "grad_norm": 0.8040147423744202, "learning_rate": 4.88070180220959e-05, "loss": 0.1099, "num_input_tokens_seen": 49993024, "step": 23155 }, { "epoch": 3.7781402936378465, "grad_norm": 0.8919370770454407, "learning_rate": 4.880593149287488e-05, "loss": 0.1519, "num_input_tokens_seen": 50004704, "step": 23160 }, { "epoch": 3.778955954323002, "grad_norm": 0.40933844447135925, "learning_rate": 4.8804844481196454e-05, "loss": 0.1067, "num_input_tokens_seen": 50015424, "step": 23165 }, { "epoch": 3.7797716150081566, "grad_norm": 1.295617699623108, "learning_rate": 4.8803756987082664e-05, "loss": 0.0643, "num_input_tokens_seen": 50026368, "step": 23170 }, { "epoch": 3.7805872756933114, "grad_norm": 1.0244090557098389, "learning_rate": 4.8802669010555536e-05, "loss": 0.181, "num_input_tokens_seen": 50036512, "step": 23175 }, { "epoch": 3.7814029363784667, "grad_norm": 0.21488361060619354, "learning_rate": 4.880158055163713e-05, "loss": 0.1237, "num_input_tokens_seen": 50047776, "step": 23180 }, { "epoch": 3.7822185970636215, "grad_norm": 3.12371563911438, "learning_rate": 4.8800491610349496e-05, "loss": 0.2784, "num_input_tokens_seen": 50058048, "step": 23185 }, { "epoch": 3.7830342577487768, "grad_norm": 0.14391648769378662, "learning_rate": 4.87994021867147e-05, "loss": 0.053, "num_input_tokens_seen": 50068992, "step": 23190 }, { "epoch": 3.7838499184339316, "grad_norm": 0.7985276579856873, "learning_rate": 4.879831228075484e-05, "loss": 0.0832, "num_input_tokens_seen": 50079776, "step": 23195 }, { "epoch": 3.7846655791190864, "grad_norm": 0.0912056490778923, "learning_rate": 4.8797221892491984e-05, "loss": 0.0618, "num_input_tokens_seen": 50090592, "step": 23200 }, { "epoch": 3.7854812398042412, "grad_norm": 0.3276139199733734, "learning_rate": 4.879613102194823e-05, "loss": 0.1483, "num_input_tokens_seen": 50102240, "step": 23205 }, { "epoch": 3.7862969004893965, "grad_norm": 1.5799102783203125, "learning_rate": 4.8795039669145704e-05, "loss": 0.159, "num_input_tokens_seen": 50113472, "step": 23210 }, { "epoch": 3.7871125611745513, "grad_norm": 0.3268332779407501, "learning_rate": 4.8793947834106515e-05, "loss": 0.049, "num_input_tokens_seen": 50124224, "step": 23215 }, { "epoch": 3.7879282218597066, "grad_norm": 0.21901491284370422, "learning_rate": 4.8792855516852784e-05, "loss": 0.0908, "num_input_tokens_seen": 50135264, "step": 23220 }, { "epoch": 3.7887438825448614, "grad_norm": 1.9160346984863281, "learning_rate": 4.8791762717406656e-05, "loss": 0.2669, "num_input_tokens_seen": 50145824, "step": 23225 }, { "epoch": 3.789559543230016, "grad_norm": 0.016519449651241302, "learning_rate": 4.879066943579027e-05, "loss": 0.1437, "num_input_tokens_seen": 50156864, "step": 23230 }, { "epoch": 3.790375203915171, "grad_norm": 0.45183035731315613, "learning_rate": 4.878957567202578e-05, "loss": 0.0917, "num_input_tokens_seen": 50167872, "step": 23235 }, { "epoch": 3.7911908646003263, "grad_norm": 0.5010336637496948, "learning_rate": 4.878848142613537e-05, "loss": 0.1019, "num_input_tokens_seen": 50178880, "step": 23240 }, { "epoch": 3.792006525285481, "grad_norm": 0.4657081961631775, "learning_rate": 4.87873866981412e-05, "loss": 0.3136, "num_input_tokens_seen": 50189280, "step": 23245 }, { "epoch": 3.7928221859706364, "grad_norm": 0.13577568531036377, "learning_rate": 4.878629148806547e-05, "loss": 0.0536, "num_input_tokens_seen": 50200736, "step": 23250 }, { "epoch": 3.793637846655791, "grad_norm": 0.06193502992391586, "learning_rate": 4.878519579593036e-05, "loss": 0.0547, "num_input_tokens_seen": 50210208, "step": 23255 }, { "epoch": 3.794453507340946, "grad_norm": 0.3940984010696411, "learning_rate": 4.878409962175809e-05, "loss": 0.069, "num_input_tokens_seen": 50220960, "step": 23260 }, { "epoch": 3.7952691680261013, "grad_norm": 0.45776107907295227, "learning_rate": 4.878300296557087e-05, "loss": 0.1037, "num_input_tokens_seen": 50232384, "step": 23265 }, { "epoch": 3.796084828711256, "grad_norm": 1.2389851808547974, "learning_rate": 4.878190582739092e-05, "loss": 0.0694, "num_input_tokens_seen": 50243328, "step": 23270 }, { "epoch": 3.7969004893964113, "grad_norm": 0.29726725816726685, "learning_rate": 4.8780808207240476e-05, "loss": 0.131, "num_input_tokens_seen": 50254080, "step": 23275 }, { "epoch": 3.797716150081566, "grad_norm": 0.048933450132608414, "learning_rate": 4.8779710105141784e-05, "loss": 0.0512, "num_input_tokens_seen": 50264384, "step": 23280 }, { "epoch": 3.798531810766721, "grad_norm": 1.0443569421768188, "learning_rate": 4.877861152111711e-05, "loss": 0.1753, "num_input_tokens_seen": 50275232, "step": 23285 }, { "epoch": 3.799347471451876, "grad_norm": 0.6913309693336487, "learning_rate": 4.87775124551887e-05, "loss": 0.1216, "num_input_tokens_seen": 50286048, "step": 23290 }, { "epoch": 3.800163132137031, "grad_norm": 0.256753146648407, "learning_rate": 4.877641290737884e-05, "loss": 0.0893, "num_input_tokens_seen": 50296192, "step": 23295 }, { "epoch": 3.800978792822186, "grad_norm": 2.1453440189361572, "learning_rate": 4.877531287770981e-05, "loss": 0.1945, "num_input_tokens_seen": 50305888, "step": 23300 }, { "epoch": 3.801794453507341, "grad_norm": 0.7528964877128601, "learning_rate": 4.8774212366203904e-05, "loss": 0.1041, "num_input_tokens_seen": 50316640, "step": 23305 }, { "epoch": 3.802610114192496, "grad_norm": 0.5142259001731873, "learning_rate": 4.8773111372883425e-05, "loss": 0.1642, "num_input_tokens_seen": 50328832, "step": 23310 }, { "epoch": 3.8034257748776508, "grad_norm": 0.36977529525756836, "learning_rate": 4.877200989777068e-05, "loss": 0.2352, "num_input_tokens_seen": 50339136, "step": 23315 }, { "epoch": 3.804241435562806, "grad_norm": 0.042837467044591904, "learning_rate": 4.8770907940888e-05, "loss": 0.1828, "num_input_tokens_seen": 50350336, "step": 23320 }, { "epoch": 3.805057096247961, "grad_norm": 1.728467583656311, "learning_rate": 4.876980550225772e-05, "loss": 0.1772, "num_input_tokens_seen": 50361376, "step": 23325 }, { "epoch": 3.8058727569331157, "grad_norm": 0.0921860858798027, "learning_rate": 4.876870258190217e-05, "loss": 0.0879, "num_input_tokens_seen": 50372064, "step": 23330 }, { "epoch": 3.806688417618271, "grad_norm": 0.7660753726959229, "learning_rate": 4.876759917984372e-05, "loss": 0.1142, "num_input_tokens_seen": 50382304, "step": 23335 }, { "epoch": 3.8075040783034257, "grad_norm": 1.4491828680038452, "learning_rate": 4.876649529610471e-05, "loss": 0.1974, "num_input_tokens_seen": 50392416, "step": 23340 }, { "epoch": 3.8083197389885806, "grad_norm": 1.5634175539016724, "learning_rate": 4.876539093070753e-05, "loss": 0.251, "num_input_tokens_seen": 50402816, "step": 23345 }, { "epoch": 3.809135399673736, "grad_norm": 0.5308632850646973, "learning_rate": 4.876428608367455e-05, "loss": 0.1351, "num_input_tokens_seen": 50412224, "step": 23350 }, { "epoch": 3.8099510603588906, "grad_norm": 0.8386009931564331, "learning_rate": 4.8763180755028174e-05, "loss": 0.2511, "num_input_tokens_seen": 50423680, "step": 23355 }, { "epoch": 3.810766721044046, "grad_norm": 0.3252023756504059, "learning_rate": 4.876207494479078e-05, "loss": 0.0285, "num_input_tokens_seen": 50435072, "step": 23360 }, { "epoch": 3.8115823817292007, "grad_norm": 0.7533631324768066, "learning_rate": 4.87609686529848e-05, "loss": 0.2344, "num_input_tokens_seen": 50446336, "step": 23365 }, { "epoch": 3.8123980424143555, "grad_norm": 0.5031644105911255, "learning_rate": 4.875986187963265e-05, "loss": 0.0914, "num_input_tokens_seen": 50456928, "step": 23370 }, { "epoch": 3.8132137030995104, "grad_norm": 0.3334158658981323, "learning_rate": 4.8758754624756756e-05, "loss": 0.1452, "num_input_tokens_seen": 50467488, "step": 23375 }, { "epoch": 3.8140293637846656, "grad_norm": 0.5727564692497253, "learning_rate": 4.8757646888379554e-05, "loss": 0.0503, "num_input_tokens_seen": 50479168, "step": 23380 }, { "epoch": 3.8148450244698204, "grad_norm": 0.23320846259593964, "learning_rate": 4.87565386705235e-05, "loss": 0.0185, "num_input_tokens_seen": 50490048, "step": 23385 }, { "epoch": 3.8156606851549757, "grad_norm": 1.3647315502166748, "learning_rate": 4.8755429971211064e-05, "loss": 0.1469, "num_input_tokens_seen": 50500192, "step": 23390 }, { "epoch": 3.8164763458401305, "grad_norm": 0.08271143585443497, "learning_rate": 4.875432079046469e-05, "loss": 0.1278, "num_input_tokens_seen": 50510944, "step": 23395 }, { "epoch": 3.8172920065252853, "grad_norm": 0.15958254039287567, "learning_rate": 4.875321112830687e-05, "loss": 0.1604, "num_input_tokens_seen": 50521344, "step": 23400 }, { "epoch": 3.8181076672104406, "grad_norm": 0.15735375881195068, "learning_rate": 4.875210098476011e-05, "loss": 0.0861, "num_input_tokens_seen": 50532064, "step": 23405 }, { "epoch": 3.8189233278955954, "grad_norm": 0.5777471661567688, "learning_rate": 4.8750990359846885e-05, "loss": 0.0998, "num_input_tokens_seen": 50542816, "step": 23410 }, { "epoch": 3.8197389885807507, "grad_norm": 0.15533919632434845, "learning_rate": 4.8749879253589706e-05, "loss": 0.0363, "num_input_tokens_seen": 50553088, "step": 23415 }, { "epoch": 3.8205546492659055, "grad_norm": 0.162482351064682, "learning_rate": 4.874876766601109e-05, "loss": 0.0578, "num_input_tokens_seen": 50561792, "step": 23420 }, { "epoch": 3.8213703099510603, "grad_norm": 0.47792109847068787, "learning_rate": 4.874765559713358e-05, "loss": 0.1177, "num_input_tokens_seen": 50573280, "step": 23425 }, { "epoch": 3.822185970636215, "grad_norm": 0.12074803560972214, "learning_rate": 4.8746543046979696e-05, "loss": 0.0993, "num_input_tokens_seen": 50584384, "step": 23430 }, { "epoch": 3.8230016313213704, "grad_norm": 0.07367822527885437, "learning_rate": 4.8745430015571994e-05, "loss": 0.0736, "num_input_tokens_seen": 50595904, "step": 23435 }, { "epoch": 3.823817292006525, "grad_norm": 1.0981420278549194, "learning_rate": 4.8744316502933026e-05, "loss": 0.0398, "num_input_tokens_seen": 50608224, "step": 23440 }, { "epoch": 3.8246329526916805, "grad_norm": 0.7143940329551697, "learning_rate": 4.874320250908537e-05, "loss": 0.217, "num_input_tokens_seen": 50618912, "step": 23445 }, { "epoch": 3.8254486133768353, "grad_norm": 0.24798858165740967, "learning_rate": 4.8742088034051594e-05, "loss": 0.0979, "num_input_tokens_seen": 50629728, "step": 23450 }, { "epoch": 3.82626427406199, "grad_norm": 0.16746105253696442, "learning_rate": 4.874097307785429e-05, "loss": 0.0906, "num_input_tokens_seen": 50640096, "step": 23455 }, { "epoch": 3.827079934747145, "grad_norm": 0.10548070818185806, "learning_rate": 4.873985764051604e-05, "loss": 0.1306, "num_input_tokens_seen": 50650176, "step": 23460 }, { "epoch": 3.8278955954323, "grad_norm": 0.09989024698734283, "learning_rate": 4.873874172205946e-05, "loss": 0.1418, "num_input_tokens_seen": 50661504, "step": 23465 }, { "epoch": 3.828711256117455, "grad_norm": 0.32332345843315125, "learning_rate": 4.8737625322507165e-05, "loss": 0.0349, "num_input_tokens_seen": 50672384, "step": 23470 }, { "epoch": 3.8295269168026103, "grad_norm": 0.13893654942512512, "learning_rate": 4.873650844188178e-05, "loss": 0.1121, "num_input_tokens_seen": 50683168, "step": 23475 }, { "epoch": 3.830342577487765, "grad_norm": 1.4226661920547485, "learning_rate": 4.873539108020594e-05, "loss": 0.1712, "num_input_tokens_seen": 50692512, "step": 23480 }, { "epoch": 3.83115823817292, "grad_norm": 0.3665957748889923, "learning_rate": 4.873427323750229e-05, "loss": 0.1818, "num_input_tokens_seen": 50703168, "step": 23485 }, { "epoch": 3.831973898858075, "grad_norm": 0.4256265461444855, "learning_rate": 4.873315491379348e-05, "loss": 0.2184, "num_input_tokens_seen": 50714112, "step": 23490 }, { "epoch": 3.83278955954323, "grad_norm": 1.577055811882019, "learning_rate": 4.873203610910217e-05, "loss": 0.1824, "num_input_tokens_seen": 50724576, "step": 23495 }, { "epoch": 3.8336052202283852, "grad_norm": 0.511153519153595, "learning_rate": 4.8730916823451064e-05, "loss": 0.1254, "num_input_tokens_seen": 50735392, "step": 23500 }, { "epoch": 3.83442088091354, "grad_norm": 0.2437548190355301, "learning_rate": 4.8729797056862806e-05, "loss": 0.0788, "num_input_tokens_seen": 50747488, "step": 23505 }, { "epoch": 3.835236541598695, "grad_norm": 0.6640286445617676, "learning_rate": 4.872867680936012e-05, "loss": 0.1037, "num_input_tokens_seen": 50757120, "step": 23510 }, { "epoch": 3.8360522022838497, "grad_norm": 0.0651819035410881, "learning_rate": 4.872755608096569e-05, "loss": 0.1435, "num_input_tokens_seen": 50767136, "step": 23515 }, { "epoch": 3.836867862969005, "grad_norm": 0.5489020943641663, "learning_rate": 4.8726434871702234e-05, "loss": 0.0347, "num_input_tokens_seen": 50777728, "step": 23520 }, { "epoch": 3.8376835236541598, "grad_norm": 1.1404528617858887, "learning_rate": 4.8725313181592474e-05, "loss": 0.213, "num_input_tokens_seen": 50787488, "step": 23525 }, { "epoch": 3.838499184339315, "grad_norm": 1.457155466079712, "learning_rate": 4.872419101065915e-05, "loss": 0.0971, "num_input_tokens_seen": 50797760, "step": 23530 }, { "epoch": 3.83931484502447, "grad_norm": 1.0871987342834473, "learning_rate": 4.8723068358924994e-05, "loss": 0.06, "num_input_tokens_seen": 50808832, "step": 23535 }, { "epoch": 3.8401305057096247, "grad_norm": 0.2668652832508087, "learning_rate": 4.872194522641277e-05, "loss": 0.1167, "num_input_tokens_seen": 50819296, "step": 23540 }, { "epoch": 3.84094616639478, "grad_norm": 0.6950357556343079, "learning_rate": 4.872082161314523e-05, "loss": 0.0958, "num_input_tokens_seen": 50831424, "step": 23545 }, { "epoch": 3.8417618270799347, "grad_norm": 0.7130895853042603, "learning_rate": 4.8719697519145144e-05, "loss": 0.134, "num_input_tokens_seen": 50841152, "step": 23550 }, { "epoch": 3.8425774877650896, "grad_norm": 0.7963153123855591, "learning_rate": 4.87185729444353e-05, "loss": 0.2681, "num_input_tokens_seen": 50850752, "step": 23555 }, { "epoch": 3.843393148450245, "grad_norm": 0.5029823184013367, "learning_rate": 4.871744788903848e-05, "loss": 0.0936, "num_input_tokens_seen": 50860896, "step": 23560 }, { "epoch": 3.8442088091353996, "grad_norm": 1.9143580198287964, "learning_rate": 4.87163223529775e-05, "loss": 0.1648, "num_input_tokens_seen": 50871072, "step": 23565 }, { "epoch": 3.8450244698205545, "grad_norm": 0.886123776435852, "learning_rate": 4.8715196336275156e-05, "loss": 0.1255, "num_input_tokens_seen": 50880960, "step": 23570 }, { "epoch": 3.8458401305057097, "grad_norm": 0.8443892002105713, "learning_rate": 4.871406983895428e-05, "loss": 0.095, "num_input_tokens_seen": 50890080, "step": 23575 }, { "epoch": 3.8466557911908645, "grad_norm": 0.21848656237125397, "learning_rate": 4.871294286103769e-05, "loss": 0.0288, "num_input_tokens_seen": 50901216, "step": 23580 }, { "epoch": 3.84747145187602, "grad_norm": 1.8628323078155518, "learning_rate": 4.871181540254823e-05, "loss": 0.1513, "num_input_tokens_seen": 50912416, "step": 23585 }, { "epoch": 3.8482871125611746, "grad_norm": 0.28286072611808777, "learning_rate": 4.8710687463508754e-05, "loss": 0.0689, "num_input_tokens_seen": 50922752, "step": 23590 }, { "epoch": 3.8491027732463294, "grad_norm": 0.1374901831150055, "learning_rate": 4.870955904394211e-05, "loss": 0.0241, "num_input_tokens_seen": 50935168, "step": 23595 }, { "epoch": 3.8499184339314843, "grad_norm": 1.2342320680618286, "learning_rate": 4.870843014387118e-05, "loss": 0.2069, "num_input_tokens_seen": 50946304, "step": 23600 }, { "epoch": 3.8507340946166395, "grad_norm": 0.4313907325267792, "learning_rate": 4.870730076331884e-05, "loss": 0.0704, "num_input_tokens_seen": 50955872, "step": 23605 }, { "epoch": 3.8515497553017943, "grad_norm": 0.3986704349517822, "learning_rate": 4.870617090230797e-05, "loss": 0.1723, "num_input_tokens_seen": 50967488, "step": 23610 }, { "epoch": 3.8523654159869496, "grad_norm": 0.1586218923330307, "learning_rate": 4.870504056086147e-05, "loss": 0.0675, "num_input_tokens_seen": 50977664, "step": 23615 }, { "epoch": 3.8531810766721044, "grad_norm": 0.4401833415031433, "learning_rate": 4.870390973900226e-05, "loss": 0.0503, "num_input_tokens_seen": 50989920, "step": 23620 }, { "epoch": 3.8539967373572592, "grad_norm": 1.3808057308197021, "learning_rate": 4.8702778436753244e-05, "loss": 0.1104, "num_input_tokens_seen": 50999136, "step": 23625 }, { "epoch": 3.8548123980424145, "grad_norm": 0.02585318498313427, "learning_rate": 4.8701646654137355e-05, "loss": 0.2298, "num_input_tokens_seen": 51010336, "step": 23630 }, { "epoch": 3.8556280587275693, "grad_norm": 1.2320977449417114, "learning_rate": 4.8700514391177525e-05, "loss": 0.214, "num_input_tokens_seen": 51021792, "step": 23635 }, { "epoch": 3.8564437194127246, "grad_norm": 0.07387757301330566, "learning_rate": 4.869938164789671e-05, "loss": 0.1344, "num_input_tokens_seen": 51032416, "step": 23640 }, { "epoch": 3.8572593800978794, "grad_norm": 0.9173959493637085, "learning_rate": 4.869824842431786e-05, "loss": 0.1156, "num_input_tokens_seen": 51042304, "step": 23645 }, { "epoch": 3.858075040783034, "grad_norm": 0.42532408237457275, "learning_rate": 4.869711472046394e-05, "loss": 0.1579, "num_input_tokens_seen": 51053664, "step": 23650 }, { "epoch": 3.858890701468189, "grad_norm": 0.2098056524991989, "learning_rate": 4.869598053635793e-05, "loss": 0.0596, "num_input_tokens_seen": 51064608, "step": 23655 }, { "epoch": 3.8597063621533443, "grad_norm": 0.08505488187074661, "learning_rate": 4.869484587202281e-05, "loss": 0.1346, "num_input_tokens_seen": 51074208, "step": 23660 }, { "epoch": 3.860522022838499, "grad_norm": 1.699433445930481, "learning_rate": 4.869371072748159e-05, "loss": 0.1143, "num_input_tokens_seen": 51085600, "step": 23665 }, { "epoch": 3.8613376835236544, "grad_norm": 0.4071124792098999, "learning_rate": 4.869257510275725e-05, "loss": 0.1205, "num_input_tokens_seen": 51097344, "step": 23670 }, { "epoch": 3.862153344208809, "grad_norm": 0.07520870864391327, "learning_rate": 4.869143899787283e-05, "loss": 0.0463, "num_input_tokens_seen": 51109184, "step": 23675 }, { "epoch": 3.862969004893964, "grad_norm": 0.22078801691532135, "learning_rate": 4.869030241285134e-05, "loss": 0.0981, "num_input_tokens_seen": 51120800, "step": 23680 }, { "epoch": 3.863784665579119, "grad_norm": 0.30536743998527527, "learning_rate": 4.868916534771582e-05, "loss": 0.063, "num_input_tokens_seen": 51131008, "step": 23685 }, { "epoch": 3.864600326264274, "grad_norm": 1.2538540363311768, "learning_rate": 4.868802780248931e-05, "loss": 0.2585, "num_input_tokens_seen": 51141600, "step": 23690 }, { "epoch": 3.865415986949429, "grad_norm": 0.768730640411377, "learning_rate": 4.8686889777194863e-05, "loss": 0.0592, "num_input_tokens_seen": 51152128, "step": 23695 }, { "epoch": 3.866231647634584, "grad_norm": 0.1259377896785736, "learning_rate": 4.868575127185555e-05, "loss": 0.1095, "num_input_tokens_seen": 51163840, "step": 23700 }, { "epoch": 3.867047308319739, "grad_norm": 0.536598801612854, "learning_rate": 4.8684612286494445e-05, "loss": 0.1119, "num_input_tokens_seen": 51173984, "step": 23705 }, { "epoch": 3.867862969004894, "grad_norm": 0.6792694330215454, "learning_rate": 4.868347282113462e-05, "loss": 0.1269, "num_input_tokens_seen": 51185120, "step": 23710 }, { "epoch": 3.868678629690049, "grad_norm": 0.03684035316109657, "learning_rate": 4.8682332875799176e-05, "loss": 0.0364, "num_input_tokens_seen": 51195968, "step": 23715 }, { "epoch": 3.869494290375204, "grad_norm": 0.388109415769577, "learning_rate": 4.868119245051121e-05, "loss": 0.0744, "num_input_tokens_seen": 51206368, "step": 23720 }, { "epoch": 3.870309951060359, "grad_norm": 0.2781147360801697, "learning_rate": 4.868005154529384e-05, "loss": 0.1337, "num_input_tokens_seen": 51216928, "step": 23725 }, { "epoch": 3.871125611745514, "grad_norm": 1.2345836162567139, "learning_rate": 4.867891016017019e-05, "loss": 0.3258, "num_input_tokens_seen": 51227936, "step": 23730 }, { "epoch": 3.8719412724306688, "grad_norm": 0.5188035368919373, "learning_rate": 4.867776829516337e-05, "loss": 0.1553, "num_input_tokens_seen": 51238880, "step": 23735 }, { "epoch": 3.8727569331158236, "grad_norm": 0.9882082343101501, "learning_rate": 4.867662595029656e-05, "loss": 0.1081, "num_input_tokens_seen": 51250560, "step": 23740 }, { "epoch": 3.873572593800979, "grad_norm": 0.09988279640674591, "learning_rate": 4.8675483125592874e-05, "loss": 0.0421, "num_input_tokens_seen": 51261440, "step": 23745 }, { "epoch": 3.8743882544861337, "grad_norm": 0.25800564885139465, "learning_rate": 4.8674339821075496e-05, "loss": 0.2068, "num_input_tokens_seen": 51272640, "step": 23750 }, { "epoch": 3.875203915171289, "grad_norm": 1.5256545543670654, "learning_rate": 4.867319603676758e-05, "loss": 0.3065, "num_input_tokens_seen": 51284064, "step": 23755 }, { "epoch": 3.8760195758564437, "grad_norm": 1.2848669290542603, "learning_rate": 4.867205177269232e-05, "loss": 0.1285, "num_input_tokens_seen": 51294368, "step": 23760 }, { "epoch": 3.8768352365415986, "grad_norm": 0.20715129375457764, "learning_rate": 4.86709070288729e-05, "loss": 0.0651, "num_input_tokens_seen": 51304704, "step": 23765 }, { "epoch": 3.877650897226754, "grad_norm": 1.0638889074325562, "learning_rate": 4.8669761805332525e-05, "loss": 0.107, "num_input_tokens_seen": 51316128, "step": 23770 }, { "epoch": 3.8784665579119086, "grad_norm": 0.016311075538396835, "learning_rate": 4.8668616102094386e-05, "loss": 0.0695, "num_input_tokens_seen": 51327712, "step": 23775 }, { "epoch": 3.8792822185970635, "grad_norm": 1.5118536949157715, "learning_rate": 4.866746991918173e-05, "loss": 0.1977, "num_input_tokens_seen": 51338624, "step": 23780 }, { "epoch": 3.8800978792822187, "grad_norm": 0.04088972881436348, "learning_rate": 4.866632325661777e-05, "loss": 0.0574, "num_input_tokens_seen": 51348928, "step": 23785 }, { "epoch": 3.8809135399673735, "grad_norm": 0.9135475754737854, "learning_rate": 4.866517611442574e-05, "loss": 0.1319, "num_input_tokens_seen": 51359744, "step": 23790 }, { "epoch": 3.8817292006525284, "grad_norm": 0.09472500532865524, "learning_rate": 4.86640284926289e-05, "loss": 0.1855, "num_input_tokens_seen": 51371616, "step": 23795 }, { "epoch": 3.8825448613376836, "grad_norm": 0.20128796994686127, "learning_rate": 4.86628803912505e-05, "loss": 0.1425, "num_input_tokens_seen": 51382752, "step": 23800 }, { "epoch": 3.8833605220228384, "grad_norm": 1.24740731716156, "learning_rate": 4.866173181031382e-05, "loss": 0.3158, "num_input_tokens_seen": 51393440, "step": 23805 }, { "epoch": 3.8841761827079937, "grad_norm": 0.26481062173843384, "learning_rate": 4.866058274984211e-05, "loss": 0.0392, "num_input_tokens_seen": 51401792, "step": 23810 }, { "epoch": 3.8849918433931485, "grad_norm": 0.5563282370567322, "learning_rate": 4.865943320985868e-05, "loss": 0.2842, "num_input_tokens_seen": 51413216, "step": 23815 }, { "epoch": 3.8858075040783033, "grad_norm": 0.3000420033931732, "learning_rate": 4.865828319038683e-05, "loss": 0.1837, "num_input_tokens_seen": 51423840, "step": 23820 }, { "epoch": 3.886623164763458, "grad_norm": 0.09976271539926529, "learning_rate": 4.865713269144986e-05, "loss": 0.0724, "num_input_tokens_seen": 51434464, "step": 23825 }, { "epoch": 3.8874388254486134, "grad_norm": 0.4903830289840698, "learning_rate": 4.865598171307107e-05, "loss": 0.1977, "num_input_tokens_seen": 51446112, "step": 23830 }, { "epoch": 3.8882544861337682, "grad_norm": 1.3759139776229858, "learning_rate": 4.865483025527381e-05, "loss": 0.2224, "num_input_tokens_seen": 51456704, "step": 23835 }, { "epoch": 3.8890701468189235, "grad_norm": 1.4435224533081055, "learning_rate": 4.865367831808141e-05, "loss": 0.1545, "num_input_tokens_seen": 51467424, "step": 23840 }, { "epoch": 3.8898858075040783, "grad_norm": 0.26527780294418335, "learning_rate": 4.865252590151721e-05, "loss": 0.055, "num_input_tokens_seen": 51477792, "step": 23845 }, { "epoch": 3.890701468189233, "grad_norm": 1.1295558214187622, "learning_rate": 4.8651373005604566e-05, "loss": 0.278, "num_input_tokens_seen": 51486368, "step": 23850 }, { "epoch": 3.8915171288743884, "grad_norm": 1.6989390850067139, "learning_rate": 4.865021963036684e-05, "loss": 0.1466, "num_input_tokens_seen": 51496224, "step": 23855 }, { "epoch": 3.892332789559543, "grad_norm": 0.08431312441825867, "learning_rate": 4.864906577582742e-05, "loss": 0.1364, "num_input_tokens_seen": 51507872, "step": 23860 }, { "epoch": 3.8931484502446985, "grad_norm": 0.7331239581108093, "learning_rate": 4.864791144200967e-05, "loss": 0.0884, "num_input_tokens_seen": 51518112, "step": 23865 }, { "epoch": 3.8939641109298533, "grad_norm": 1.6660135984420776, "learning_rate": 4.864675662893701e-05, "loss": 0.1161, "num_input_tokens_seen": 51529120, "step": 23870 }, { "epoch": 3.894779771615008, "grad_norm": 0.5132681727409363, "learning_rate": 4.864560133663282e-05, "loss": 0.1531, "num_input_tokens_seen": 51540768, "step": 23875 }, { "epoch": 3.895595432300163, "grad_norm": 0.31119826436042786, "learning_rate": 4.8644445565120516e-05, "loss": 0.1658, "num_input_tokens_seen": 51550304, "step": 23880 }, { "epoch": 3.896411092985318, "grad_norm": 0.43363669514656067, "learning_rate": 4.8643289314423535e-05, "loss": 0.1305, "num_input_tokens_seen": 51561088, "step": 23885 }, { "epoch": 3.897226753670473, "grad_norm": 0.48060673475265503, "learning_rate": 4.86421325845653e-05, "loss": 0.1785, "num_input_tokens_seen": 51571264, "step": 23890 }, { "epoch": 3.8980424143556283, "grad_norm": 0.11564712971448898, "learning_rate": 4.864097537556926e-05, "loss": 0.0745, "num_input_tokens_seen": 51582912, "step": 23895 }, { "epoch": 3.898858075040783, "grad_norm": 0.5092701315879822, "learning_rate": 4.863981768745885e-05, "loss": 0.0736, "num_input_tokens_seen": 51594144, "step": 23900 }, { "epoch": 3.899673735725938, "grad_norm": 0.1280420422554016, "learning_rate": 4.863865952025757e-05, "loss": 0.1105, "num_input_tokens_seen": 51604928, "step": 23905 }, { "epoch": 3.9004893964110927, "grad_norm": 1.7046122550964355, "learning_rate": 4.863750087398885e-05, "loss": 0.2584, "num_input_tokens_seen": 51614176, "step": 23910 }, { "epoch": 3.901305057096248, "grad_norm": 1.3845709562301636, "learning_rate": 4.863634174867619e-05, "loss": 0.1047, "num_input_tokens_seen": 51624992, "step": 23915 }, { "epoch": 3.902120717781403, "grad_norm": 0.6281778812408447, "learning_rate": 4.863518214434308e-05, "loss": 0.0413, "num_input_tokens_seen": 51635744, "step": 23920 }, { "epoch": 3.902936378466558, "grad_norm": 0.41115647554397583, "learning_rate": 4.863402206101302e-05, "loss": 0.0425, "num_input_tokens_seen": 51645472, "step": 23925 }, { "epoch": 3.903752039151713, "grad_norm": 1.8270596265792847, "learning_rate": 4.8632861498709526e-05, "loss": 0.2038, "num_input_tokens_seen": 51656128, "step": 23930 }, { "epoch": 3.9045676998368677, "grad_norm": 0.6637927889823914, "learning_rate": 4.8631700457456105e-05, "loss": 0.2691, "num_input_tokens_seen": 51666368, "step": 23935 }, { "epoch": 3.905383360522023, "grad_norm": 1.1025525331497192, "learning_rate": 4.8630538937276304e-05, "loss": 0.1194, "num_input_tokens_seen": 51677792, "step": 23940 }, { "epoch": 3.9061990212071778, "grad_norm": 0.3595436215400696, "learning_rate": 4.862937693819365e-05, "loss": 0.2038, "num_input_tokens_seen": 51689280, "step": 23945 }, { "epoch": 3.907014681892333, "grad_norm": 0.2334476262331009, "learning_rate": 4.8628214460231694e-05, "loss": 0.0688, "num_input_tokens_seen": 51699712, "step": 23950 }, { "epoch": 3.907830342577488, "grad_norm": 0.9843264222145081, "learning_rate": 4.8627051503414e-05, "loss": 0.2189, "num_input_tokens_seen": 51710624, "step": 23955 }, { "epoch": 3.9086460032626427, "grad_norm": 1.0030276775360107, "learning_rate": 4.862588806776414e-05, "loss": 0.1198, "num_input_tokens_seen": 51721952, "step": 23960 }, { "epoch": 3.9094616639477975, "grad_norm": 0.12240396440029144, "learning_rate": 4.862472415330568e-05, "loss": 0.1723, "num_input_tokens_seen": 51731584, "step": 23965 }, { "epoch": 3.9102773246329527, "grad_norm": 0.13153919577598572, "learning_rate": 4.862355976006221e-05, "loss": 0.163, "num_input_tokens_seen": 51743232, "step": 23970 }, { "epoch": 3.9110929853181076, "grad_norm": 0.2585553824901581, "learning_rate": 4.862239488805734e-05, "loss": 0.1201, "num_input_tokens_seen": 51755712, "step": 23975 }, { "epoch": 3.911908646003263, "grad_norm": 0.3018258810043335, "learning_rate": 4.862122953731467e-05, "loss": 0.061, "num_input_tokens_seen": 51767776, "step": 23980 }, { "epoch": 3.9127243066884176, "grad_norm": 0.7180498242378235, "learning_rate": 4.8620063707857815e-05, "loss": 0.1514, "num_input_tokens_seen": 51778176, "step": 23985 }, { "epoch": 3.9135399673735725, "grad_norm": 0.4749462902545929, "learning_rate": 4.8618897399710404e-05, "loss": 0.145, "num_input_tokens_seen": 51789024, "step": 23990 }, { "epoch": 3.9143556280587277, "grad_norm": 0.37742486596107483, "learning_rate": 4.861773061289608e-05, "loss": 0.0725, "num_input_tokens_seen": 51798752, "step": 23995 }, { "epoch": 3.9151712887438825, "grad_norm": 0.8864458203315735, "learning_rate": 4.8616563347438485e-05, "loss": 0.1477, "num_input_tokens_seen": 51809856, "step": 24000 }, { "epoch": 3.9159869494290374, "grad_norm": 0.20103232562541962, "learning_rate": 4.861539560336127e-05, "loss": 0.0373, "num_input_tokens_seen": 51819584, "step": 24005 }, { "epoch": 3.9168026101141926, "grad_norm": 1.143982172012329, "learning_rate": 4.86142273806881e-05, "loss": 0.1161, "num_input_tokens_seen": 51831168, "step": 24010 }, { "epoch": 3.9176182707993474, "grad_norm": 0.49579980969429016, "learning_rate": 4.861305867944266e-05, "loss": 0.0758, "num_input_tokens_seen": 51839776, "step": 24015 }, { "epoch": 3.9184339314845023, "grad_norm": 0.07092585414648056, "learning_rate": 4.861188949964863e-05, "loss": 0.1542, "num_input_tokens_seen": 51850368, "step": 24020 }, { "epoch": 3.9192495921696575, "grad_norm": 1.4956413507461548, "learning_rate": 4.861071984132971e-05, "loss": 0.1628, "num_input_tokens_seen": 51861248, "step": 24025 }, { "epoch": 3.9200652528548123, "grad_norm": 1.084952712059021, "learning_rate": 4.8609549704509595e-05, "loss": 0.2906, "num_input_tokens_seen": 51872928, "step": 24030 }, { "epoch": 3.9208809135399676, "grad_norm": 1.9083212614059448, "learning_rate": 4.8608379089212007e-05, "loss": 0.2781, "num_input_tokens_seen": 51883680, "step": 24035 }, { "epoch": 3.9216965742251224, "grad_norm": 1.389940619468689, "learning_rate": 4.860720799546067e-05, "loss": 0.3293, "num_input_tokens_seen": 51895808, "step": 24040 }, { "epoch": 3.9225122349102772, "grad_norm": 0.5132884979248047, "learning_rate": 4.860603642327931e-05, "loss": 0.1063, "num_input_tokens_seen": 51905760, "step": 24045 }, { "epoch": 3.923327895595432, "grad_norm": 0.6909608840942383, "learning_rate": 4.860486437269167e-05, "loss": 0.2096, "num_input_tokens_seen": 51916288, "step": 24050 }, { "epoch": 3.9241435562805873, "grad_norm": 1.413378357887268, "learning_rate": 4.860369184372152e-05, "loss": 0.0602, "num_input_tokens_seen": 51927040, "step": 24055 }, { "epoch": 3.924959216965742, "grad_norm": 0.09869332611560822, "learning_rate": 4.8602518836392604e-05, "loss": 0.0443, "num_input_tokens_seen": 51937760, "step": 24060 }, { "epoch": 3.9257748776508974, "grad_norm": 0.7104409337043762, "learning_rate": 4.860134535072871e-05, "loss": 0.0558, "num_input_tokens_seen": 51948384, "step": 24065 }, { "epoch": 3.926590538336052, "grad_norm": 0.6089458465576172, "learning_rate": 4.86001713867536e-05, "loss": 0.1865, "num_input_tokens_seen": 51958496, "step": 24070 }, { "epoch": 3.927406199021207, "grad_norm": 0.13863356411457062, "learning_rate": 4.859899694449109e-05, "loss": 0.0526, "num_input_tokens_seen": 51969216, "step": 24075 }, { "epoch": 3.9282218597063623, "grad_norm": 0.11451926082372665, "learning_rate": 4.859782202396496e-05, "loss": 0.0681, "num_input_tokens_seen": 51980768, "step": 24080 }, { "epoch": 3.929037520391517, "grad_norm": 0.30907008051872253, "learning_rate": 4.859664662519904e-05, "loss": 0.0684, "num_input_tokens_seen": 51990976, "step": 24085 }, { "epoch": 3.9298531810766724, "grad_norm": 0.8615435361862183, "learning_rate": 4.859547074821713e-05, "loss": 0.1143, "num_input_tokens_seen": 52002240, "step": 24090 }, { "epoch": 3.930668841761827, "grad_norm": 1.4475327730178833, "learning_rate": 4.859429439304308e-05, "loss": 0.1106, "num_input_tokens_seen": 52013760, "step": 24095 }, { "epoch": 3.931484502446982, "grad_norm": 0.25273507833480835, "learning_rate": 4.8593117559700724e-05, "loss": 0.099, "num_input_tokens_seen": 52024960, "step": 24100 }, { "epoch": 3.932300163132137, "grad_norm": 0.12584678828716278, "learning_rate": 4.8591940248213906e-05, "loss": 0.1014, "num_input_tokens_seen": 52035872, "step": 24105 }, { "epoch": 3.933115823817292, "grad_norm": 0.07327371835708618, "learning_rate": 4.85907624586065e-05, "loss": 0.1955, "num_input_tokens_seen": 52047136, "step": 24110 }, { "epoch": 3.933931484502447, "grad_norm": 0.06107325106859207, "learning_rate": 4.858958419090236e-05, "loss": 0.25, "num_input_tokens_seen": 52059040, "step": 24115 }, { "epoch": 3.934747145187602, "grad_norm": 0.12498847395181656, "learning_rate": 4.858840544512536e-05, "loss": 0.1085, "num_input_tokens_seen": 52070176, "step": 24120 }, { "epoch": 3.935562805872757, "grad_norm": 0.17158271372318268, "learning_rate": 4.858722622129941e-05, "loss": 0.171, "num_input_tokens_seen": 52080576, "step": 24125 }, { "epoch": 3.936378466557912, "grad_norm": 0.5164604187011719, "learning_rate": 4.8586046519448394e-05, "loss": 0.096, "num_input_tokens_seen": 52091744, "step": 24130 }, { "epoch": 3.9371941272430666, "grad_norm": 0.22655023634433746, "learning_rate": 4.858486633959623e-05, "loss": 0.2513, "num_input_tokens_seen": 52102432, "step": 24135 }, { "epoch": 3.938009787928222, "grad_norm": 0.10102704167366028, "learning_rate": 4.858368568176683e-05, "loss": 0.1367, "num_input_tokens_seen": 52111968, "step": 24140 }, { "epoch": 3.9388254486133767, "grad_norm": 0.4001148045063019, "learning_rate": 4.858250454598412e-05, "loss": 0.1268, "num_input_tokens_seen": 52123872, "step": 24145 }, { "epoch": 3.939641109298532, "grad_norm": 0.117256760597229, "learning_rate": 4.8581322932272036e-05, "loss": 0.0454, "num_input_tokens_seen": 52134400, "step": 24150 }, { "epoch": 3.9404567699836868, "grad_norm": 0.07822098582983017, "learning_rate": 4.8580140840654534e-05, "loss": 0.0354, "num_input_tokens_seen": 52144448, "step": 24155 }, { "epoch": 3.9412724306688416, "grad_norm": 1.3245583772659302, "learning_rate": 4.857895827115555e-05, "loss": 0.236, "num_input_tokens_seen": 52155840, "step": 24160 }, { "epoch": 3.942088091353997, "grad_norm": 0.20563960075378418, "learning_rate": 4.8577775223799074e-05, "loss": 0.0742, "num_input_tokens_seen": 52167968, "step": 24165 }, { "epoch": 3.9429037520391517, "grad_norm": 0.8194819092750549, "learning_rate": 4.857659169860907e-05, "loss": 0.0674, "num_input_tokens_seen": 52177792, "step": 24170 }, { "epoch": 3.943719412724307, "grad_norm": 0.7573646903038025, "learning_rate": 4.857540769560954e-05, "loss": 0.0863, "num_input_tokens_seen": 52188992, "step": 24175 }, { "epoch": 3.9445350734094617, "grad_norm": 1.4784516096115112, "learning_rate": 4.857422321482445e-05, "loss": 0.1522, "num_input_tokens_seen": 52200192, "step": 24180 }, { "epoch": 3.9453507340946166, "grad_norm": 0.9921059608459473, "learning_rate": 4.857303825627782e-05, "loss": 0.1771, "num_input_tokens_seen": 52210944, "step": 24185 }, { "epoch": 3.9461663947797714, "grad_norm": 0.5805777907371521, "learning_rate": 4.857185281999368e-05, "loss": 0.1658, "num_input_tokens_seen": 52220960, "step": 24190 }, { "epoch": 3.9469820554649266, "grad_norm": 1.8002961874008179, "learning_rate": 4.857066690599603e-05, "loss": 0.1733, "num_input_tokens_seen": 52231232, "step": 24195 }, { "epoch": 3.9477977161500815, "grad_norm": 0.053408749401569366, "learning_rate": 4.856948051430891e-05, "loss": 0.1279, "num_input_tokens_seen": 52240032, "step": 24200 }, { "epoch": 3.9486133768352367, "grad_norm": 0.14620386064052582, "learning_rate": 4.856829364495637e-05, "loss": 0.0804, "num_input_tokens_seen": 52251968, "step": 24205 }, { "epoch": 3.9494290375203915, "grad_norm": 0.022369202226400375, "learning_rate": 4.856710629796246e-05, "loss": 0.0366, "num_input_tokens_seen": 52263072, "step": 24210 }, { "epoch": 3.9502446982055464, "grad_norm": 1.7633388042449951, "learning_rate": 4.856591847335125e-05, "loss": 0.2157, "num_input_tokens_seen": 52274336, "step": 24215 }, { "epoch": 3.9510603588907016, "grad_norm": 1.3484148979187012, "learning_rate": 4.8564730171146796e-05, "loss": 0.1557, "num_input_tokens_seen": 52284992, "step": 24220 }, { "epoch": 3.9518760195758564, "grad_norm": 0.75909823179245, "learning_rate": 4.85635413913732e-05, "loss": 0.0811, "num_input_tokens_seen": 52296032, "step": 24225 }, { "epoch": 3.9526916802610113, "grad_norm": 1.348628044128418, "learning_rate": 4.856235213405455e-05, "loss": 0.2605, "num_input_tokens_seen": 52306432, "step": 24230 }, { "epoch": 3.9535073409461665, "grad_norm": 0.1822085976600647, "learning_rate": 4.856116239921493e-05, "loss": 0.0669, "num_input_tokens_seen": 52316640, "step": 24235 }, { "epoch": 3.9543230016313213, "grad_norm": 1.369482398033142, "learning_rate": 4.8559972186878466e-05, "loss": 0.1601, "num_input_tokens_seen": 52327104, "step": 24240 }, { "epoch": 3.955138662316476, "grad_norm": 0.09587424248456955, "learning_rate": 4.8558781497069286e-05, "loss": 0.1831, "num_input_tokens_seen": 52338080, "step": 24245 }, { "epoch": 3.9559543230016314, "grad_norm": 0.22601820528507233, "learning_rate": 4.8557590329811506e-05, "loss": 0.3611, "num_input_tokens_seen": 52348576, "step": 24250 }, { "epoch": 3.9567699836867862, "grad_norm": 0.09399721026420593, "learning_rate": 4.855639868512928e-05, "loss": 0.1792, "num_input_tokens_seen": 52358848, "step": 24255 }, { "epoch": 3.9575856443719415, "grad_norm": 0.12939664721488953, "learning_rate": 4.855520656304674e-05, "loss": 0.0618, "num_input_tokens_seen": 52369248, "step": 24260 }, { "epoch": 3.9584013050570963, "grad_norm": 0.9254763126373291, "learning_rate": 4.855401396358807e-05, "loss": 0.1916, "num_input_tokens_seen": 52380640, "step": 24265 }, { "epoch": 3.959216965742251, "grad_norm": 0.654782772064209, "learning_rate": 4.8552820886777415e-05, "loss": 0.1571, "num_input_tokens_seen": 52391136, "step": 24270 }, { "epoch": 3.960032626427406, "grad_norm": 0.3535853624343872, "learning_rate": 4.855162733263897e-05, "loss": 0.0384, "num_input_tokens_seen": 52401824, "step": 24275 }, { "epoch": 3.960848287112561, "grad_norm": 1.1049911975860596, "learning_rate": 4.855043330119693e-05, "loss": 0.4094, "num_input_tokens_seen": 52412576, "step": 24280 }, { "epoch": 3.961663947797716, "grad_norm": 1.0037401914596558, "learning_rate": 4.854923879247547e-05, "loss": 0.1411, "num_input_tokens_seen": 52422720, "step": 24285 }, { "epoch": 3.9624796084828713, "grad_norm": 1.294602394104004, "learning_rate": 4.854804380649882e-05, "loss": 0.1421, "num_input_tokens_seen": 52432800, "step": 24290 }, { "epoch": 3.963295269168026, "grad_norm": 0.24088802933692932, "learning_rate": 4.854684834329118e-05, "loss": 0.069, "num_input_tokens_seen": 52443680, "step": 24295 }, { "epoch": 3.964110929853181, "grad_norm": 0.07097306847572327, "learning_rate": 4.8545652402876795e-05, "loss": 0.1827, "num_input_tokens_seen": 52454976, "step": 24300 }, { "epoch": 3.964926590538336, "grad_norm": 0.8504570722579956, "learning_rate": 4.854445598527989e-05, "loss": 0.3069, "num_input_tokens_seen": 52465376, "step": 24305 }, { "epoch": 3.965742251223491, "grad_norm": 0.7326509952545166, "learning_rate": 4.8543259090524716e-05, "loss": 0.2096, "num_input_tokens_seen": 52477408, "step": 24310 }, { "epoch": 3.9665579119086463, "grad_norm": 0.7348350286483765, "learning_rate": 4.8542061718635537e-05, "loss": 0.1091, "num_input_tokens_seen": 52489472, "step": 24315 }, { "epoch": 3.967373572593801, "grad_norm": 0.27091753482818604, "learning_rate": 4.85408638696366e-05, "loss": 0.0496, "num_input_tokens_seen": 52499648, "step": 24320 }, { "epoch": 3.968189233278956, "grad_norm": 0.25421392917633057, "learning_rate": 4.8539665543552206e-05, "loss": 0.2862, "num_input_tokens_seen": 52508096, "step": 24325 }, { "epoch": 3.9690048939641107, "grad_norm": 0.3270585536956787, "learning_rate": 4.853846674040662e-05, "loss": 0.061, "num_input_tokens_seen": 52520224, "step": 24330 }, { "epoch": 3.969820554649266, "grad_norm": 0.14287807047367096, "learning_rate": 4.853726746022415e-05, "loss": 0.0829, "num_input_tokens_seen": 52531520, "step": 24335 }, { "epoch": 3.970636215334421, "grad_norm": 0.15925724804401398, "learning_rate": 4.853606770302909e-05, "loss": 0.1472, "num_input_tokens_seen": 52543040, "step": 24340 }, { "epoch": 3.971451876019576, "grad_norm": 1.0992846488952637, "learning_rate": 4.853486746884576e-05, "loss": 0.3508, "num_input_tokens_seen": 52553760, "step": 24345 }, { "epoch": 3.972267536704731, "grad_norm": 0.575660228729248, "learning_rate": 4.853366675769848e-05, "loss": 0.118, "num_input_tokens_seen": 52564416, "step": 24350 }, { "epoch": 3.9730831973898857, "grad_norm": 1.1004000902175903, "learning_rate": 4.85324655696116e-05, "loss": 0.1824, "num_input_tokens_seen": 52574880, "step": 24355 }, { "epoch": 3.9738988580750405, "grad_norm": 0.4502919316291809, "learning_rate": 4.853126390460945e-05, "loss": 0.2965, "num_input_tokens_seen": 52586464, "step": 24360 }, { "epoch": 3.9747145187601958, "grad_norm": 0.0904790386557579, "learning_rate": 4.8530061762716385e-05, "loss": 0.043, "num_input_tokens_seen": 52596608, "step": 24365 }, { "epoch": 3.9755301794453506, "grad_norm": 0.36433303356170654, "learning_rate": 4.852885914395676e-05, "loss": 0.1197, "num_input_tokens_seen": 52607680, "step": 24370 }, { "epoch": 3.976345840130506, "grad_norm": 1.3011747598648071, "learning_rate": 4.852765604835496e-05, "loss": 0.2087, "num_input_tokens_seen": 52619648, "step": 24375 }, { "epoch": 3.9771615008156607, "grad_norm": 0.10893422365188599, "learning_rate": 4.852645247593537e-05, "loss": 0.092, "num_input_tokens_seen": 52630528, "step": 24380 }, { "epoch": 3.9779771615008155, "grad_norm": 0.08489523828029633, "learning_rate": 4.852524842672237e-05, "loss": 0.0468, "num_input_tokens_seen": 52640608, "step": 24385 }, { "epoch": 3.9787928221859707, "grad_norm": 1.246726393699646, "learning_rate": 4.852404390074037e-05, "loss": 0.1726, "num_input_tokens_seen": 52651872, "step": 24390 }, { "epoch": 3.9796084828711256, "grad_norm": 0.466124951839447, "learning_rate": 4.852283889801377e-05, "loss": 0.1176, "num_input_tokens_seen": 52663904, "step": 24395 }, { "epoch": 3.980424143556281, "grad_norm": 0.9400461912155151, "learning_rate": 4.8521633418567006e-05, "loss": 0.1832, "num_input_tokens_seen": 52674848, "step": 24400 }, { "epoch": 3.9812398042414356, "grad_norm": 0.9882106781005859, "learning_rate": 4.85204274624245e-05, "loss": 0.3457, "num_input_tokens_seen": 52686656, "step": 24405 }, { "epoch": 3.9820554649265905, "grad_norm": 0.099921815097332, "learning_rate": 4.85192210296107e-05, "loss": 0.0754, "num_input_tokens_seen": 52696544, "step": 24410 }, { "epoch": 3.9828711256117453, "grad_norm": 0.06943023949861526, "learning_rate": 4.8518014120150035e-05, "loss": 0.0925, "num_input_tokens_seen": 52706912, "step": 24415 }, { "epoch": 3.9836867862969005, "grad_norm": 0.10315938293933868, "learning_rate": 4.851680673406699e-05, "loss": 0.113, "num_input_tokens_seen": 52718016, "step": 24420 }, { "epoch": 3.9845024469820554, "grad_norm": 1.640471339225769, "learning_rate": 4.851559887138603e-05, "loss": 0.4514, "num_input_tokens_seen": 52728288, "step": 24425 }, { "epoch": 3.9853181076672106, "grad_norm": 0.3034103810787201, "learning_rate": 4.851439053213161e-05, "loss": 0.102, "num_input_tokens_seen": 52739104, "step": 24430 }, { "epoch": 3.9861337683523654, "grad_norm": 1.1463980674743652, "learning_rate": 4.8513181716328245e-05, "loss": 0.0825, "num_input_tokens_seen": 52749536, "step": 24435 }, { "epoch": 3.9869494290375203, "grad_norm": 1.196864128112793, "learning_rate": 4.8511972424000417e-05, "loss": 0.2239, "num_input_tokens_seen": 52760416, "step": 24440 }, { "epoch": 3.9877650897226755, "grad_norm": 0.14048953354358673, "learning_rate": 4.8510762655172645e-05, "loss": 0.1201, "num_input_tokens_seen": 52772160, "step": 24445 }, { "epoch": 3.9885807504078303, "grad_norm": 0.5526257157325745, "learning_rate": 4.850955240986945e-05, "loss": 0.078, "num_input_tokens_seen": 52781984, "step": 24450 }, { "epoch": 3.9893964110929856, "grad_norm": 1.3113502264022827, "learning_rate": 4.8508341688115346e-05, "loss": 0.1349, "num_input_tokens_seen": 52791776, "step": 24455 }, { "epoch": 3.9902120717781404, "grad_norm": 0.14215165376663208, "learning_rate": 4.850713048993487e-05, "loss": 0.098, "num_input_tokens_seen": 52802528, "step": 24460 }, { "epoch": 3.9910277324632952, "grad_norm": 0.34314975142478943, "learning_rate": 4.850591881535258e-05, "loss": 0.1264, "num_input_tokens_seen": 52812640, "step": 24465 }, { "epoch": 3.99184339314845, "grad_norm": 0.4464413523674011, "learning_rate": 4.8504706664393026e-05, "loss": 0.1226, "num_input_tokens_seen": 52823744, "step": 24470 }, { "epoch": 3.9926590538336053, "grad_norm": 1.10756516456604, "learning_rate": 4.850349403708076e-05, "loss": 0.0594, "num_input_tokens_seen": 52834016, "step": 24475 }, { "epoch": 3.99347471451876, "grad_norm": 1.7710952758789062, "learning_rate": 4.8502280933440394e-05, "loss": 0.1926, "num_input_tokens_seen": 52845248, "step": 24480 }, { "epoch": 3.9942903752039154, "grad_norm": 0.4272669851779938, "learning_rate": 4.850106735349647e-05, "loss": 0.0512, "num_input_tokens_seen": 52855104, "step": 24485 }, { "epoch": 3.99510603588907, "grad_norm": 0.2958374321460724, "learning_rate": 4.849985329727362e-05, "loss": 0.0644, "num_input_tokens_seen": 52866048, "step": 24490 }, { "epoch": 3.995921696574225, "grad_norm": 0.12445589900016785, "learning_rate": 4.849863876479642e-05, "loss": 0.0509, "num_input_tokens_seen": 52876768, "step": 24495 }, { "epoch": 3.99673735725938, "grad_norm": 0.25940778851509094, "learning_rate": 4.84974237560895e-05, "loss": 0.0849, "num_input_tokens_seen": 52887872, "step": 24500 }, { "epoch": 3.997553017944535, "grad_norm": 0.6069428324699402, "learning_rate": 4.8496208271177476e-05, "loss": 0.1287, "num_input_tokens_seen": 52898624, "step": 24505 }, { "epoch": 3.99836867862969, "grad_norm": 0.4317142963409424, "learning_rate": 4.8494992310084986e-05, "loss": 0.0985, "num_input_tokens_seen": 52908768, "step": 24510 }, { "epoch": 3.999184339314845, "grad_norm": 0.08230955898761749, "learning_rate": 4.8493775872836666e-05, "loss": 0.0542, "num_input_tokens_seen": 52920064, "step": 24515 }, { "epoch": 4.0, "grad_norm": 0.05795013904571533, "learning_rate": 4.849255895945718e-05, "loss": 0.0205, "num_input_tokens_seen": 52929744, "step": 24520 }, { "epoch": 4.0, "eval_loss": 0.13726525008678436, "eval_runtime": 90.5791, "eval_samples_per_second": 30.084, "eval_steps_per_second": 7.529, "num_input_tokens_seen": 52929744, "step": 24520 }, { "epoch": 4.000815660685155, "grad_norm": 1.2390477657318115, "learning_rate": 4.849134156997118e-05, "loss": 0.1702, "num_input_tokens_seen": 52941840, "step": 24525 }, { "epoch": 4.00163132137031, "grad_norm": 0.3865814208984375, "learning_rate": 4.849012370440335e-05, "loss": 0.1376, "num_input_tokens_seen": 52952560, "step": 24530 }, { "epoch": 4.002446982055465, "grad_norm": 0.42790064215660095, "learning_rate": 4.848890536277836e-05, "loss": 0.1424, "num_input_tokens_seen": 52961968, "step": 24535 }, { "epoch": 4.00326264274062, "grad_norm": 1.961512804031372, "learning_rate": 4.84876865451209e-05, "loss": 0.1742, "num_input_tokens_seen": 52971408, "step": 24540 }, { "epoch": 4.004078303425775, "grad_norm": 0.06101518124341965, "learning_rate": 4.848646725145568e-05, "loss": 0.198, "num_input_tokens_seen": 52982832, "step": 24545 }, { "epoch": 4.00489396411093, "grad_norm": 1.229259967803955, "learning_rate": 4.848524748180741e-05, "loss": 0.0989, "num_input_tokens_seen": 52993648, "step": 24550 }, { "epoch": 4.005709624796085, "grad_norm": 0.20980128645896912, "learning_rate": 4.84840272362008e-05, "loss": 0.1643, "num_input_tokens_seen": 53004272, "step": 24555 }, { "epoch": 4.006525285481239, "grad_norm": 1.0083997249603271, "learning_rate": 4.848280651466059e-05, "loss": 0.2135, "num_input_tokens_seen": 53013872, "step": 24560 }, { "epoch": 4.007340946166395, "grad_norm": 0.10408058762550354, "learning_rate": 4.8481585317211514e-05, "loss": 0.043, "num_input_tokens_seen": 53024912, "step": 24565 }, { "epoch": 4.00815660685155, "grad_norm": 1.8101847171783447, "learning_rate": 4.8480363643878325e-05, "loss": 0.1959, "num_input_tokens_seen": 53037360, "step": 24570 }, { "epoch": 4.008972267536705, "grad_norm": 0.9993314743041992, "learning_rate": 4.8479141494685775e-05, "loss": 0.1921, "num_input_tokens_seen": 53048688, "step": 24575 }, { "epoch": 4.00978792822186, "grad_norm": 0.5382652282714844, "learning_rate": 4.847791886965864e-05, "loss": 0.076, "num_input_tokens_seen": 53059568, "step": 24580 }, { "epoch": 4.010603588907014, "grad_norm": 0.642628014087677, "learning_rate": 4.84766957688217e-05, "loss": 0.0888, "num_input_tokens_seen": 53071312, "step": 24585 }, { "epoch": 4.011419249592169, "grad_norm": 1.0912840366363525, "learning_rate": 4.8475472192199735e-05, "loss": 0.1258, "num_input_tokens_seen": 53082544, "step": 24590 }, { "epoch": 4.012234910277325, "grad_norm": 0.09475713223218918, "learning_rate": 4.847424813981755e-05, "loss": 0.0318, "num_input_tokens_seen": 53093168, "step": 24595 }, { "epoch": 4.01305057096248, "grad_norm": 0.11851049214601517, "learning_rate": 4.847302361169994e-05, "loss": 0.1884, "num_input_tokens_seen": 53104688, "step": 24600 }, { "epoch": 4.013866231647635, "grad_norm": 0.33718055486679077, "learning_rate": 4.8471798607871724e-05, "loss": 0.19, "num_input_tokens_seen": 53116528, "step": 24605 }, { "epoch": 4.014681892332789, "grad_norm": 0.5046378970146179, "learning_rate": 4.847057312835774e-05, "loss": 0.1425, "num_input_tokens_seen": 53128016, "step": 24610 }, { "epoch": 4.015497553017944, "grad_norm": 0.1510583758354187, "learning_rate": 4.846934717318282e-05, "loss": 0.0832, "num_input_tokens_seen": 53139120, "step": 24615 }, { "epoch": 4.0163132137031, "grad_norm": 0.2040131539106369, "learning_rate": 4.84681207423718e-05, "loss": 0.045, "num_input_tokens_seen": 53149392, "step": 24620 }, { "epoch": 4.017128874388255, "grad_norm": 2.0451676845550537, "learning_rate": 4.846689383594955e-05, "loss": 0.1069, "num_input_tokens_seen": 53160304, "step": 24625 }, { "epoch": 4.0179445350734095, "grad_norm": 0.5681758522987366, "learning_rate": 4.846566645394092e-05, "loss": 0.1182, "num_input_tokens_seen": 53171248, "step": 24630 }, { "epoch": 4.018760195758564, "grad_norm": 0.6331433653831482, "learning_rate": 4.846443859637079e-05, "loss": 0.1999, "num_input_tokens_seen": 53183312, "step": 24635 }, { "epoch": 4.019575856443719, "grad_norm": 0.07646098732948303, "learning_rate": 4.8463210263264055e-05, "loss": 0.1574, "num_input_tokens_seen": 53194192, "step": 24640 }, { "epoch": 4.020391517128874, "grad_norm": 0.3499090075492859, "learning_rate": 4.846198145464559e-05, "loss": 0.355, "num_input_tokens_seen": 53205296, "step": 24645 }, { "epoch": 4.02120717781403, "grad_norm": 0.11364054679870605, "learning_rate": 4.846075217054031e-05, "loss": 0.1142, "num_input_tokens_seen": 53215632, "step": 24650 }, { "epoch": 4.0220228384991845, "grad_norm": 0.42556628584861755, "learning_rate": 4.845952241097312e-05, "loss": 0.1434, "num_input_tokens_seen": 53226704, "step": 24655 }, { "epoch": 4.022838499184339, "grad_norm": 0.09201809763908386, "learning_rate": 4.8458292175968955e-05, "loss": 0.0409, "num_input_tokens_seen": 53237328, "step": 24660 }, { "epoch": 4.023654159869494, "grad_norm": 0.26888391375541687, "learning_rate": 4.8457061465552734e-05, "loss": 0.0594, "num_input_tokens_seen": 53248912, "step": 24665 }, { "epoch": 4.024469820554649, "grad_norm": 0.078078992664814, "learning_rate": 4.845583027974941e-05, "loss": 0.0741, "num_input_tokens_seen": 53260528, "step": 24670 }, { "epoch": 4.025285481239805, "grad_norm": 0.6879124641418457, "learning_rate": 4.845459861858392e-05, "loss": 0.1154, "num_input_tokens_seen": 53272016, "step": 24675 }, { "epoch": 4.0261011419249595, "grad_norm": 1.0574870109558105, "learning_rate": 4.845336648208125e-05, "loss": 0.0994, "num_input_tokens_seen": 53282640, "step": 24680 }, { "epoch": 4.026916802610114, "grad_norm": 0.022667566314339638, "learning_rate": 4.845213387026635e-05, "loss": 0.0935, "num_input_tokens_seen": 53293904, "step": 24685 }, { "epoch": 4.027732463295269, "grad_norm": 0.07475796341896057, "learning_rate": 4.84509007831642e-05, "loss": 0.0542, "num_input_tokens_seen": 53305904, "step": 24690 }, { "epoch": 4.028548123980424, "grad_norm": 0.06270977109670639, "learning_rate": 4.84496672207998e-05, "loss": 0.0306, "num_input_tokens_seen": 53316528, "step": 24695 }, { "epoch": 4.029363784665579, "grad_norm": 0.8526743650436401, "learning_rate": 4.844843318319815e-05, "loss": 0.1403, "num_input_tokens_seen": 53328464, "step": 24700 }, { "epoch": 4.0301794453507345, "grad_norm": 0.8049693703651428, "learning_rate": 4.844719867038425e-05, "loss": 0.1392, "num_input_tokens_seen": 53339440, "step": 24705 }, { "epoch": 4.030995106035889, "grad_norm": 0.21053288877010345, "learning_rate": 4.844596368238312e-05, "loss": 0.1215, "num_input_tokens_seen": 53350000, "step": 24710 }, { "epoch": 4.031810766721044, "grad_norm": 0.06842613965272903, "learning_rate": 4.84447282192198e-05, "loss": 0.0424, "num_input_tokens_seen": 53360528, "step": 24715 }, { "epoch": 4.032626427406199, "grad_norm": 0.058896102011203766, "learning_rate": 4.844349228091932e-05, "loss": 0.0711, "num_input_tokens_seen": 53371600, "step": 24720 }, { "epoch": 4.033442088091354, "grad_norm": 0.3280218839645386, "learning_rate": 4.844225586750672e-05, "loss": 0.0996, "num_input_tokens_seen": 53382032, "step": 24725 }, { "epoch": 4.034257748776509, "grad_norm": 0.687187135219574, "learning_rate": 4.844101897900708e-05, "loss": 0.203, "num_input_tokens_seen": 53392816, "step": 24730 }, { "epoch": 4.035073409461664, "grad_norm": 0.4513944983482361, "learning_rate": 4.8439781615445443e-05, "loss": 0.0823, "num_input_tokens_seen": 53404624, "step": 24735 }, { "epoch": 4.035889070146819, "grad_norm": 1.1197946071624756, "learning_rate": 4.84385437768469e-05, "loss": 0.1804, "num_input_tokens_seen": 53416048, "step": 24740 }, { "epoch": 4.036704730831974, "grad_norm": 0.9654941558837891, "learning_rate": 4.843730546323654e-05, "loss": 0.0633, "num_input_tokens_seen": 53426704, "step": 24745 }, { "epoch": 4.037520391517129, "grad_norm": 1.8280507326126099, "learning_rate": 4.8436066674639444e-05, "loss": 0.1541, "num_input_tokens_seen": 53439248, "step": 24750 }, { "epoch": 4.0383360522022835, "grad_norm": 0.584265947341919, "learning_rate": 4.843482741108072e-05, "loss": 0.2551, "num_input_tokens_seen": 53448400, "step": 24755 }, { "epoch": 4.039151712887439, "grad_norm": 0.9496404528617859, "learning_rate": 4.84335876725855e-05, "loss": 0.2738, "num_input_tokens_seen": 53460336, "step": 24760 }, { "epoch": 4.039967373572594, "grad_norm": 1.2620375156402588, "learning_rate": 4.8432347459178894e-05, "loss": 0.1834, "num_input_tokens_seen": 53471568, "step": 24765 }, { "epoch": 4.040783034257749, "grad_norm": 0.17349164187908173, "learning_rate": 4.843110677088605e-05, "loss": 0.0865, "num_input_tokens_seen": 53483568, "step": 24770 }, { "epoch": 4.041598694942904, "grad_norm": 0.7499136328697205, "learning_rate": 4.842986560773209e-05, "loss": 0.1294, "num_input_tokens_seen": 53492848, "step": 24775 }, { "epoch": 4.0424143556280585, "grad_norm": 1.2427994012832642, "learning_rate": 4.842862396974218e-05, "loss": 0.1934, "num_input_tokens_seen": 53502992, "step": 24780 }, { "epoch": 4.043230016313213, "grad_norm": 0.8813715577125549, "learning_rate": 4.842738185694149e-05, "loss": 0.1373, "num_input_tokens_seen": 53513264, "step": 24785 }, { "epoch": 4.044045676998369, "grad_norm": 0.4638904631137848, "learning_rate": 4.842613926935518e-05, "loss": 0.111, "num_input_tokens_seen": 53523984, "step": 24790 }, { "epoch": 4.044861337683524, "grad_norm": 0.2965966463088989, "learning_rate": 4.842489620700845e-05, "loss": 0.1654, "num_input_tokens_seen": 53534672, "step": 24795 }, { "epoch": 4.045676998368679, "grad_norm": 0.24077251553535461, "learning_rate": 4.842365266992649e-05, "loss": 0.1067, "num_input_tokens_seen": 53546224, "step": 24800 }, { "epoch": 4.0464926590538335, "grad_norm": 0.10239093750715256, "learning_rate": 4.842240865813448e-05, "loss": 0.0208, "num_input_tokens_seen": 53555952, "step": 24805 }, { "epoch": 4.047308319738988, "grad_norm": 0.14856979250907898, "learning_rate": 4.842116417165764e-05, "loss": 0.1123, "num_input_tokens_seen": 53567024, "step": 24810 }, { "epoch": 4.048123980424143, "grad_norm": 0.2145828902721405, "learning_rate": 4.84199192105212e-05, "loss": 0.0658, "num_input_tokens_seen": 53578032, "step": 24815 }, { "epoch": 4.048939641109299, "grad_norm": 0.49712246656417847, "learning_rate": 4.841867377475039e-05, "loss": 0.1203, "num_input_tokens_seen": 53589968, "step": 24820 }, { "epoch": 4.049755301794454, "grad_norm": 0.39501282572746277, "learning_rate": 4.841742786437045e-05, "loss": 0.0541, "num_input_tokens_seen": 53599472, "step": 24825 }, { "epoch": 4.0505709624796085, "grad_norm": 0.048464562743902206, "learning_rate": 4.841618147940662e-05, "loss": 0.1911, "num_input_tokens_seen": 53611376, "step": 24830 }, { "epoch": 4.051386623164763, "grad_norm": 0.9105134606361389, "learning_rate": 4.841493461988417e-05, "loss": 0.0754, "num_input_tokens_seen": 53623760, "step": 24835 }, { "epoch": 4.052202283849918, "grad_norm": 1.8744897842407227, "learning_rate": 4.8413687285828366e-05, "loss": 0.3412, "num_input_tokens_seen": 53634416, "step": 24840 }, { "epoch": 4.053017944535074, "grad_norm": 0.05914391204714775, "learning_rate": 4.841243947726449e-05, "loss": 0.0922, "num_input_tokens_seen": 53644528, "step": 24845 }, { "epoch": 4.053833605220229, "grad_norm": 0.32628241181373596, "learning_rate": 4.841119119421783e-05, "loss": 0.0896, "num_input_tokens_seen": 53654480, "step": 24850 }, { "epoch": 4.054649265905383, "grad_norm": 0.08046825975179672, "learning_rate": 4.8409942436713674e-05, "loss": 0.1896, "num_input_tokens_seen": 53665136, "step": 24855 }, { "epoch": 4.055464926590538, "grad_norm": 0.2512708604335785, "learning_rate": 4.8408693204777334e-05, "loss": 0.1763, "num_input_tokens_seen": 53677104, "step": 24860 }, { "epoch": 4.056280587275693, "grad_norm": 0.2882387936115265, "learning_rate": 4.840744349843414e-05, "loss": 0.0555, "num_input_tokens_seen": 53688016, "step": 24865 }, { "epoch": 4.057096247960848, "grad_norm": 0.4553638994693756, "learning_rate": 4.8406193317709406e-05, "loss": 0.1447, "num_input_tokens_seen": 53698544, "step": 24870 }, { "epoch": 4.057911908646004, "grad_norm": 0.8717124462127686, "learning_rate": 4.840494266262846e-05, "loss": 0.0766, "num_input_tokens_seen": 53710000, "step": 24875 }, { "epoch": 4.058727569331158, "grad_norm": 0.08370260149240494, "learning_rate": 4.840369153321668e-05, "loss": 0.088, "num_input_tokens_seen": 53720656, "step": 24880 }, { "epoch": 4.059543230016313, "grad_norm": 0.1042085513472557, "learning_rate": 4.8402439929499386e-05, "loss": 0.0741, "num_input_tokens_seen": 53731280, "step": 24885 }, { "epoch": 4.060358890701468, "grad_norm": 0.6005653738975525, "learning_rate": 4.840118785150196e-05, "loss": 0.102, "num_input_tokens_seen": 53741712, "step": 24890 }, { "epoch": 4.061174551386623, "grad_norm": 0.28950855135917664, "learning_rate": 4.839993529924978e-05, "loss": 0.093, "num_input_tokens_seen": 53753360, "step": 24895 }, { "epoch": 4.061990212071779, "grad_norm": 1.442753553390503, "learning_rate": 4.839868227276822e-05, "loss": 0.1849, "num_input_tokens_seen": 53764112, "step": 24900 }, { "epoch": 4.062805872756933, "grad_norm": 0.1601802408695221, "learning_rate": 4.839742877208269e-05, "loss": 0.1022, "num_input_tokens_seen": 53774320, "step": 24905 }, { "epoch": 4.063621533442088, "grad_norm": 0.10450714826583862, "learning_rate": 4.8396174797218585e-05, "loss": 0.1417, "num_input_tokens_seen": 53784848, "step": 24910 }, { "epoch": 4.064437194127243, "grad_norm": 0.12338897585868835, "learning_rate": 4.8394920348201314e-05, "loss": 0.0893, "num_input_tokens_seen": 53795632, "step": 24915 }, { "epoch": 4.065252854812398, "grad_norm": 0.554076075553894, "learning_rate": 4.83936654250563e-05, "loss": 0.1266, "num_input_tokens_seen": 53806480, "step": 24920 }, { "epoch": 4.066068515497553, "grad_norm": 0.48778945207595825, "learning_rate": 4.8392410027808974e-05, "loss": 0.0657, "num_input_tokens_seen": 53816816, "step": 24925 }, { "epoch": 4.066884176182708, "grad_norm": 0.07524674385786057, "learning_rate": 4.8391154156484796e-05, "loss": 0.1308, "num_input_tokens_seen": 53828720, "step": 24930 }, { "epoch": 4.067699836867863, "grad_norm": 0.3508504331111908, "learning_rate": 4.8389897811109197e-05, "loss": 0.1151, "num_input_tokens_seen": 53840816, "step": 24935 }, { "epoch": 4.068515497553018, "grad_norm": 0.6318705677986145, "learning_rate": 4.8388640991707654e-05, "loss": 0.0793, "num_input_tokens_seen": 53851120, "step": 24940 }, { "epoch": 4.069331158238173, "grad_norm": 0.36804836988449097, "learning_rate": 4.838738369830562e-05, "loss": 0.1407, "num_input_tokens_seen": 53862288, "step": 24945 }, { "epoch": 4.070146818923328, "grad_norm": 1.2077112197875977, "learning_rate": 4.8386125930928595e-05, "loss": 0.2383, "num_input_tokens_seen": 53873584, "step": 24950 }, { "epoch": 4.0709624796084825, "grad_norm": 1.399795413017273, "learning_rate": 4.8384867689602056e-05, "loss": 0.294, "num_input_tokens_seen": 53885328, "step": 24955 }, { "epoch": 4.071778140293638, "grad_norm": 0.4655003547668457, "learning_rate": 4.838360897435151e-05, "loss": 0.0233, "num_input_tokens_seen": 53895376, "step": 24960 }, { "epoch": 4.072593800978793, "grad_norm": 0.21874824166297913, "learning_rate": 4.838234978520246e-05, "loss": 0.0952, "num_input_tokens_seen": 53906640, "step": 24965 }, { "epoch": 4.073409461663948, "grad_norm": 0.8326969742774963, "learning_rate": 4.838109012218044e-05, "loss": 0.1687, "num_input_tokens_seen": 53917264, "step": 24970 }, { "epoch": 4.074225122349103, "grad_norm": 0.175938218832016, "learning_rate": 4.837982998531096e-05, "loss": 0.1765, "num_input_tokens_seen": 53927984, "step": 24975 }, { "epoch": 4.075040783034257, "grad_norm": 0.31162992119789124, "learning_rate": 4.8378569374619565e-05, "loss": 0.0601, "num_input_tokens_seen": 53938672, "step": 24980 }, { "epoch": 4.075856443719413, "grad_norm": 0.8562030792236328, "learning_rate": 4.837730829013181e-05, "loss": 0.102, "num_input_tokens_seen": 53951120, "step": 24985 }, { "epoch": 4.076672104404568, "grad_norm": 0.47846370935440063, "learning_rate": 4.837604673187324e-05, "loss": 0.0904, "num_input_tokens_seen": 53962192, "step": 24990 }, { "epoch": 4.077487765089723, "grad_norm": 0.6318864226341248, "learning_rate": 4.837478469986942e-05, "loss": 0.2127, "num_input_tokens_seen": 53973488, "step": 24995 }, { "epoch": 4.078303425774878, "grad_norm": 0.28333836793899536, "learning_rate": 4.837352219414595e-05, "loss": 0.1362, "num_input_tokens_seen": 53984272, "step": 25000 }, { "epoch": 4.079119086460032, "grad_norm": 1.9318684339523315, "learning_rate": 4.83722592147284e-05, "loss": 0.2096, "num_input_tokens_seen": 53994704, "step": 25005 }, { "epoch": 4.079934747145187, "grad_norm": 0.11702197045087814, "learning_rate": 4.837099576164236e-05, "loss": 0.0983, "num_input_tokens_seen": 54005552, "step": 25010 }, { "epoch": 4.080750407830343, "grad_norm": 0.5329996347427368, "learning_rate": 4.836973183491345e-05, "loss": 0.0425, "num_input_tokens_seen": 54016560, "step": 25015 }, { "epoch": 4.081566068515498, "grad_norm": 0.9947211146354675, "learning_rate": 4.8368467434567264e-05, "loss": 0.1425, "num_input_tokens_seen": 54026736, "step": 25020 }, { "epoch": 4.082381729200653, "grad_norm": 0.9129197597503662, "learning_rate": 4.8367202560629456e-05, "loss": 0.1983, "num_input_tokens_seen": 54036592, "step": 25025 }, { "epoch": 4.083197389885807, "grad_norm": 0.6762126088142395, "learning_rate": 4.836593721312563e-05, "loss": 0.3493, "num_input_tokens_seen": 54046864, "step": 25030 }, { "epoch": 4.084013050570962, "grad_norm": 0.7182666063308716, "learning_rate": 4.8364671392081463e-05, "loss": 0.0531, "num_input_tokens_seen": 54058448, "step": 25035 }, { "epoch": 4.084828711256117, "grad_norm": 1.8293851613998413, "learning_rate": 4.8363405097522575e-05, "loss": 0.2535, "num_input_tokens_seen": 54068848, "step": 25040 }, { "epoch": 4.085644371941273, "grad_norm": 0.18967507779598236, "learning_rate": 4.836213832947465e-05, "loss": 0.2757, "num_input_tokens_seen": 54079504, "step": 25045 }, { "epoch": 4.0864600326264275, "grad_norm": 0.14686642587184906, "learning_rate": 4.8360871087963356e-05, "loss": 0.0736, "num_input_tokens_seen": 54089424, "step": 25050 }, { "epoch": 4.087275693311582, "grad_norm": 0.1140715554356575, "learning_rate": 4.8359603373014376e-05, "loss": 0.0435, "num_input_tokens_seen": 54100848, "step": 25055 }, { "epoch": 4.088091353996737, "grad_norm": 0.8723439574241638, "learning_rate": 4.835833518465339e-05, "loss": 0.1038, "num_input_tokens_seen": 54111376, "step": 25060 }, { "epoch": 4.088907014681892, "grad_norm": 0.47681552171707153, "learning_rate": 4.835706652290612e-05, "loss": 0.043, "num_input_tokens_seen": 54123408, "step": 25065 }, { "epoch": 4.089722675367048, "grad_norm": 0.1808377206325531, "learning_rate": 4.835579738779826e-05, "loss": 0.0948, "num_input_tokens_seen": 54133296, "step": 25070 }, { "epoch": 4.0905383360522025, "grad_norm": 0.11057557165622711, "learning_rate": 4.835452777935554e-05, "loss": 0.0613, "num_input_tokens_seen": 54144240, "step": 25075 }, { "epoch": 4.091353996737357, "grad_norm": 0.06661710888147354, "learning_rate": 4.835325769760368e-05, "loss": 0.0673, "num_input_tokens_seen": 54157008, "step": 25080 }, { "epoch": 4.092169657422512, "grad_norm": 0.2627677023410797, "learning_rate": 4.835198714256844e-05, "loss": 0.1228, "num_input_tokens_seen": 54167792, "step": 25085 }, { "epoch": 4.092985318107667, "grad_norm": 0.3470664620399475, "learning_rate": 4.8350716114275554e-05, "loss": 0.1067, "num_input_tokens_seen": 54178672, "step": 25090 }, { "epoch": 4.093800978792822, "grad_norm": 0.07151133567094803, "learning_rate": 4.834944461275077e-05, "loss": 0.0845, "num_input_tokens_seen": 54190288, "step": 25095 }, { "epoch": 4.0946166394779775, "grad_norm": 1.3638098239898682, "learning_rate": 4.8348172638019886e-05, "loss": 0.162, "num_input_tokens_seen": 54199728, "step": 25100 }, { "epoch": 4.095432300163132, "grad_norm": 0.398383766412735, "learning_rate": 4.8346900190108654e-05, "loss": 0.0563, "num_input_tokens_seen": 54209840, "step": 25105 }, { "epoch": 4.096247960848287, "grad_norm": 0.03253304958343506, "learning_rate": 4.834562726904287e-05, "loss": 0.1847, "num_input_tokens_seen": 54219856, "step": 25110 }, { "epoch": 4.097063621533442, "grad_norm": 0.09162694215774536, "learning_rate": 4.834435387484834e-05, "loss": 0.0854, "num_input_tokens_seen": 54230096, "step": 25115 }, { "epoch": 4.097879282218597, "grad_norm": 0.1257130652666092, "learning_rate": 4.8343080007550865e-05, "loss": 0.0367, "num_input_tokens_seen": 54241840, "step": 25120 }, { "epoch": 4.0986949429037525, "grad_norm": 0.7900670170783997, "learning_rate": 4.834180566717626e-05, "loss": 0.0812, "num_input_tokens_seen": 54251920, "step": 25125 }, { "epoch": 4.099510603588907, "grad_norm": 0.22079625725746155, "learning_rate": 4.834053085375036e-05, "loss": 0.122, "num_input_tokens_seen": 54263344, "step": 25130 }, { "epoch": 4.100326264274062, "grad_norm": 0.8551899194717407, "learning_rate": 4.833925556729897e-05, "loss": 0.1441, "num_input_tokens_seen": 54274384, "step": 25135 }, { "epoch": 4.101141924959217, "grad_norm": 0.09511423856019974, "learning_rate": 4.8337979807847976e-05, "loss": 0.0479, "num_input_tokens_seen": 54285296, "step": 25140 }, { "epoch": 4.101957585644372, "grad_norm": 0.35256248712539673, "learning_rate": 4.833670357542321e-05, "loss": 0.1275, "num_input_tokens_seen": 54295504, "step": 25145 }, { "epoch": 4.102773246329527, "grad_norm": 0.5797849893569946, "learning_rate": 4.8335426870050535e-05, "loss": 0.1196, "num_input_tokens_seen": 54306704, "step": 25150 }, { "epoch": 4.103588907014682, "grad_norm": 0.025364406406879425, "learning_rate": 4.8334149691755846e-05, "loss": 0.1617, "num_input_tokens_seen": 54316720, "step": 25155 }, { "epoch": 4.104404567699837, "grad_norm": 0.9075347185134888, "learning_rate": 4.8332872040565e-05, "loss": 0.1424, "num_input_tokens_seen": 54327408, "step": 25160 }, { "epoch": 4.105220228384992, "grad_norm": 0.4990581274032593, "learning_rate": 4.833159391650391e-05, "loss": 0.0933, "num_input_tokens_seen": 54337104, "step": 25165 }, { "epoch": 4.106035889070147, "grad_norm": 0.3790759742259979, "learning_rate": 4.833031531959846e-05, "loss": 0.2253, "num_input_tokens_seen": 54347408, "step": 25170 }, { "epoch": 4.1068515497553015, "grad_norm": 1.401467204093933, "learning_rate": 4.832903624987459e-05, "loss": 0.118, "num_input_tokens_seen": 54357264, "step": 25175 }, { "epoch": 4.107667210440456, "grad_norm": 0.7727828025817871, "learning_rate": 4.83277567073582e-05, "loss": 0.1537, "num_input_tokens_seen": 54368816, "step": 25180 }, { "epoch": 4.108482871125612, "grad_norm": 0.3727436661720276, "learning_rate": 4.8326476692075215e-05, "loss": 0.1352, "num_input_tokens_seen": 54380848, "step": 25185 }, { "epoch": 4.109298531810767, "grad_norm": 0.43981868028640747, "learning_rate": 4.83251962040516e-05, "loss": 0.0822, "num_input_tokens_seen": 54391760, "step": 25190 }, { "epoch": 4.110114192495922, "grad_norm": 0.3807206451892853, "learning_rate": 4.832391524331329e-05, "loss": 0.2451, "num_input_tokens_seen": 54402384, "step": 25195 }, { "epoch": 4.1109298531810765, "grad_norm": 0.16328981518745422, "learning_rate": 4.832263380988625e-05, "loss": 0.0398, "num_input_tokens_seen": 54413680, "step": 25200 }, { "epoch": 4.111745513866231, "grad_norm": 1.6635528802871704, "learning_rate": 4.832135190379645e-05, "loss": 0.1378, "num_input_tokens_seen": 54422640, "step": 25205 }, { "epoch": 4.112561174551387, "grad_norm": 1.1667325496673584, "learning_rate": 4.832006952506986e-05, "loss": 0.2122, "num_input_tokens_seen": 54434576, "step": 25210 }, { "epoch": 4.113376835236542, "grad_norm": 0.11901155859231949, "learning_rate": 4.8318786673732484e-05, "loss": 0.0443, "num_input_tokens_seen": 54444272, "step": 25215 }, { "epoch": 4.114192495921697, "grad_norm": 0.5416437983512878, "learning_rate": 4.831750334981032e-05, "loss": 0.0979, "num_input_tokens_seen": 54456016, "step": 25220 }, { "epoch": 4.1150081566068515, "grad_norm": 0.032690055668354034, "learning_rate": 4.8316219553329364e-05, "loss": 0.0222, "num_input_tokens_seen": 54466448, "step": 25225 }, { "epoch": 4.115823817292006, "grad_norm": 0.5244950652122498, "learning_rate": 4.831493528431564e-05, "loss": 0.1497, "num_input_tokens_seen": 54476592, "step": 25230 }, { "epoch": 4.116639477977161, "grad_norm": 0.024196850135922432, "learning_rate": 4.831365054279517e-05, "loss": 0.1077, "num_input_tokens_seen": 54488048, "step": 25235 }, { "epoch": 4.117455138662317, "grad_norm": 0.3533271849155426, "learning_rate": 4.8312365328794005e-05, "loss": 0.0746, "num_input_tokens_seen": 54499120, "step": 25240 }, { "epoch": 4.118270799347472, "grad_norm": 1.1687722206115723, "learning_rate": 4.831107964233818e-05, "loss": 0.1712, "num_input_tokens_seen": 54510576, "step": 25245 }, { "epoch": 4.1190864600326265, "grad_norm": 0.20404940843582153, "learning_rate": 4.8309793483453755e-05, "loss": 0.1322, "num_input_tokens_seen": 54522160, "step": 25250 }, { "epoch": 4.119902120717781, "grad_norm": 0.1116340234875679, "learning_rate": 4.830850685216679e-05, "loss": 0.0261, "num_input_tokens_seen": 54533872, "step": 25255 }, { "epoch": 4.120717781402936, "grad_norm": 0.4795306921005249, "learning_rate": 4.8307219748503374e-05, "loss": 0.1466, "num_input_tokens_seen": 54545200, "step": 25260 }, { "epoch": 4.121533442088092, "grad_norm": 0.184200257062912, "learning_rate": 4.8305932172489575e-05, "loss": 0.1217, "num_input_tokens_seen": 54554960, "step": 25265 }, { "epoch": 4.122349102773247, "grad_norm": 0.4306926429271698, "learning_rate": 4.8304644124151495e-05, "loss": 0.1681, "num_input_tokens_seen": 54565456, "step": 25270 }, { "epoch": 4.123164763458401, "grad_norm": 0.23351731896400452, "learning_rate": 4.8303355603515236e-05, "loss": 0.1645, "num_input_tokens_seen": 54575536, "step": 25275 }, { "epoch": 4.123980424143556, "grad_norm": 0.41413846611976624, "learning_rate": 4.830206661060692e-05, "loss": 0.1117, "num_input_tokens_seen": 54585296, "step": 25280 }, { "epoch": 4.124796084828711, "grad_norm": 0.33930784463882446, "learning_rate": 4.830077714545266e-05, "loss": 0.1387, "num_input_tokens_seen": 54595600, "step": 25285 }, { "epoch": 4.125611745513866, "grad_norm": 0.10662368685007095, "learning_rate": 4.829948720807859e-05, "loss": 0.0834, "num_input_tokens_seen": 54606160, "step": 25290 }, { "epoch": 4.126427406199022, "grad_norm": 0.25668197870254517, "learning_rate": 4.8298196798510855e-05, "loss": 0.0601, "num_input_tokens_seen": 54617232, "step": 25295 }, { "epoch": 4.127243066884176, "grad_norm": 1.8394815921783447, "learning_rate": 4.829690591677561e-05, "loss": 0.1088, "num_input_tokens_seen": 54628688, "step": 25300 }, { "epoch": 4.128058727569331, "grad_norm": 0.3434017300605774, "learning_rate": 4.829561456289901e-05, "loss": 0.1552, "num_input_tokens_seen": 54640016, "step": 25305 }, { "epoch": 4.128874388254486, "grad_norm": 0.08774887770414352, "learning_rate": 4.829432273690723e-05, "loss": 0.0799, "num_input_tokens_seen": 54648848, "step": 25310 }, { "epoch": 4.129690048939641, "grad_norm": 1.455442190170288, "learning_rate": 4.829303043882644e-05, "loss": 0.2405, "num_input_tokens_seen": 54659376, "step": 25315 }, { "epoch": 4.130505709624796, "grad_norm": 0.33711811900138855, "learning_rate": 4.829173766868285e-05, "loss": 0.0566, "num_input_tokens_seen": 54670928, "step": 25320 }, { "epoch": 4.131321370309951, "grad_norm": 0.19699116051197052, "learning_rate": 4.829044442650264e-05, "loss": 0.1274, "num_input_tokens_seen": 54681584, "step": 25325 }, { "epoch": 4.132137030995106, "grad_norm": 0.08831019699573517, "learning_rate": 4.8289150712312033e-05, "loss": 0.1399, "num_input_tokens_seen": 54692432, "step": 25330 }, { "epoch": 4.132952691680261, "grad_norm": 0.15820400416851044, "learning_rate": 4.828785652613724e-05, "loss": 0.169, "num_input_tokens_seen": 54703152, "step": 25335 }, { "epoch": 4.133768352365416, "grad_norm": 0.19227172434329987, "learning_rate": 4.828656186800449e-05, "loss": 0.0323, "num_input_tokens_seen": 54714032, "step": 25340 }, { "epoch": 4.134584013050571, "grad_norm": 0.6822611689567566, "learning_rate": 4.828526673794003e-05, "loss": 0.0743, "num_input_tokens_seen": 54724272, "step": 25345 }, { "epoch": 4.135399673735726, "grad_norm": 0.43784257769584656, "learning_rate": 4.8283971135970094e-05, "loss": 0.0319, "num_input_tokens_seen": 54734864, "step": 25350 }, { "epoch": 4.136215334420881, "grad_norm": 0.13785099983215332, "learning_rate": 4.8282675062120945e-05, "loss": 0.1737, "num_input_tokens_seen": 54744464, "step": 25355 }, { "epoch": 4.137030995106036, "grad_norm": 1.2863675355911255, "learning_rate": 4.828137851641885e-05, "loss": 0.2146, "num_input_tokens_seen": 54754672, "step": 25360 }, { "epoch": 4.137846655791191, "grad_norm": 0.22446036338806152, "learning_rate": 4.828008149889009e-05, "loss": 0.1955, "num_input_tokens_seen": 54766256, "step": 25365 }, { "epoch": 4.138662316476346, "grad_norm": 0.22922493517398834, "learning_rate": 4.827878400956094e-05, "loss": 0.0882, "num_input_tokens_seen": 54777296, "step": 25370 }, { "epoch": 4.1394779771615005, "grad_norm": 1.3001384735107422, "learning_rate": 4.82774860484577e-05, "loss": 0.2126, "num_input_tokens_seen": 54787728, "step": 25375 }, { "epoch": 4.140293637846656, "grad_norm": 0.49001678824424744, "learning_rate": 4.827618761560668e-05, "loss": 0.1305, "num_input_tokens_seen": 54798864, "step": 25380 }, { "epoch": 4.141109298531811, "grad_norm": 0.03981742262840271, "learning_rate": 4.8274888711034186e-05, "loss": 0.071, "num_input_tokens_seen": 54809872, "step": 25385 }, { "epoch": 4.141924959216966, "grad_norm": 0.08988548070192337, "learning_rate": 4.827358933476655e-05, "loss": 0.1814, "num_input_tokens_seen": 54819248, "step": 25390 }, { "epoch": 4.142740619902121, "grad_norm": 0.47611308097839355, "learning_rate": 4.8272289486830105e-05, "loss": 0.1072, "num_input_tokens_seen": 54830448, "step": 25395 }, { "epoch": 4.143556280587275, "grad_norm": 0.2209615260362625, "learning_rate": 4.827098916725118e-05, "loss": 0.1092, "num_input_tokens_seen": 54841776, "step": 25400 }, { "epoch": 4.14437194127243, "grad_norm": 2.0154688358306885, "learning_rate": 4.8269688376056144e-05, "loss": 0.1964, "num_input_tokens_seen": 54851888, "step": 25405 }, { "epoch": 4.145187601957586, "grad_norm": 1.6464593410491943, "learning_rate": 4.826838711327135e-05, "loss": 0.2899, "num_input_tokens_seen": 54861968, "step": 25410 }, { "epoch": 4.146003262642741, "grad_norm": 0.21722325682640076, "learning_rate": 4.8267085378923174e-05, "loss": 0.1287, "num_input_tokens_seen": 54873104, "step": 25415 }, { "epoch": 4.146818923327896, "grad_norm": 0.07644158601760864, "learning_rate": 4.8265783173038004e-05, "loss": 0.0181, "num_input_tokens_seen": 54883600, "step": 25420 }, { "epoch": 4.14763458401305, "grad_norm": 0.1495608538389206, "learning_rate": 4.8264480495642214e-05, "loss": 0.0281, "num_input_tokens_seen": 54895184, "step": 25425 }, { "epoch": 4.148450244698205, "grad_norm": 0.23275752365589142, "learning_rate": 4.8263177346762217e-05, "loss": 0.1276, "num_input_tokens_seen": 54906640, "step": 25430 }, { "epoch": 4.149265905383361, "grad_norm": 0.3854096829891205, "learning_rate": 4.826187372642442e-05, "loss": 0.1282, "num_input_tokens_seen": 54918192, "step": 25435 }, { "epoch": 4.150081566068516, "grad_norm": 0.2172805815935135, "learning_rate": 4.8260569634655237e-05, "loss": 0.1114, "num_input_tokens_seen": 54927536, "step": 25440 }, { "epoch": 4.150897226753671, "grad_norm": 0.3645469546318054, "learning_rate": 4.8259265071481115e-05, "loss": 0.11, "num_input_tokens_seen": 54938224, "step": 25445 }, { "epoch": 4.151712887438825, "grad_norm": 0.6026934385299683, "learning_rate": 4.825796003692847e-05, "loss": 0.0405, "num_input_tokens_seen": 54949104, "step": 25450 }, { "epoch": 4.15252854812398, "grad_norm": 0.10066911578178406, "learning_rate": 4.825665453102376e-05, "loss": 0.0526, "num_input_tokens_seen": 54960848, "step": 25455 }, { "epoch": 4.153344208809135, "grad_norm": 1.5659480094909668, "learning_rate": 4.8255348553793444e-05, "loss": 0.1373, "num_input_tokens_seen": 54971600, "step": 25460 }, { "epoch": 4.154159869494291, "grad_norm": 0.2677914500236511, "learning_rate": 4.8254042105263994e-05, "loss": 0.0361, "num_input_tokens_seen": 54981936, "step": 25465 }, { "epoch": 4.1549755301794455, "grad_norm": 1.1059741973876953, "learning_rate": 4.825273518546187e-05, "loss": 0.0968, "num_input_tokens_seen": 54993552, "step": 25470 }, { "epoch": 4.1557911908646, "grad_norm": 0.1598374843597412, "learning_rate": 4.825142779441358e-05, "loss": 0.0764, "num_input_tokens_seen": 55005200, "step": 25475 }, { "epoch": 4.156606851549755, "grad_norm": 0.08861181885004044, "learning_rate": 4.8250119932145595e-05, "loss": 0.0179, "num_input_tokens_seen": 55016464, "step": 25480 }, { "epoch": 4.15742251223491, "grad_norm": 0.15354077517986298, "learning_rate": 4.8248811598684454e-05, "loss": 0.2431, "num_input_tokens_seen": 55026576, "step": 25485 }, { "epoch": 4.158238172920065, "grad_norm": 0.029778137803077698, "learning_rate": 4.824750279405664e-05, "loss": 0.0781, "num_input_tokens_seen": 55037904, "step": 25490 }, { "epoch": 4.1590538336052205, "grad_norm": 0.3246137797832489, "learning_rate": 4.824619351828869e-05, "loss": 0.088, "num_input_tokens_seen": 55047408, "step": 25495 }, { "epoch": 4.159869494290375, "grad_norm": 1.2364931106567383, "learning_rate": 4.8244883771407145e-05, "loss": 0.1842, "num_input_tokens_seen": 55058064, "step": 25500 }, { "epoch": 4.16068515497553, "grad_norm": 0.48683154582977295, "learning_rate": 4.8243573553438534e-05, "loss": 0.0675, "num_input_tokens_seen": 55069008, "step": 25505 }, { "epoch": 4.161500815660685, "grad_norm": 1.210530400276184, "learning_rate": 4.824226286440943e-05, "loss": 0.1908, "num_input_tokens_seen": 55079792, "step": 25510 }, { "epoch": 4.16231647634584, "grad_norm": 0.7862509489059448, "learning_rate": 4.824095170434637e-05, "loss": 0.1781, "num_input_tokens_seen": 55090864, "step": 25515 }, { "epoch": 4.1631321370309955, "grad_norm": 0.7609314322471619, "learning_rate": 4.823964007327595e-05, "loss": 0.1694, "num_input_tokens_seen": 55101904, "step": 25520 }, { "epoch": 4.16394779771615, "grad_norm": 1.401738166809082, "learning_rate": 4.823832797122474e-05, "loss": 0.0937, "num_input_tokens_seen": 55112656, "step": 25525 }, { "epoch": 4.164763458401305, "grad_norm": 0.03998737782239914, "learning_rate": 4.823701539821933e-05, "loss": 0.2366, "num_input_tokens_seen": 55123024, "step": 25530 }, { "epoch": 4.16557911908646, "grad_norm": 0.1779637634754181, "learning_rate": 4.823570235428633e-05, "loss": 0.191, "num_input_tokens_seen": 55133136, "step": 25535 }, { "epoch": 4.166394779771615, "grad_norm": 0.5109360814094543, "learning_rate": 4.823438883945235e-05, "loss": 0.134, "num_input_tokens_seen": 55142864, "step": 25540 }, { "epoch": 4.16721044045677, "grad_norm": 0.8223886489868164, "learning_rate": 4.8233074853743996e-05, "loss": 0.1538, "num_input_tokens_seen": 55152848, "step": 25545 }, { "epoch": 4.168026101141925, "grad_norm": 0.8902885913848877, "learning_rate": 4.823176039718791e-05, "loss": 0.1407, "num_input_tokens_seen": 55162032, "step": 25550 }, { "epoch": 4.16884176182708, "grad_norm": 0.4789060354232788, "learning_rate": 4.823044546981073e-05, "loss": 0.1576, "num_input_tokens_seen": 55173616, "step": 25555 }, { "epoch": 4.169657422512235, "grad_norm": 1.1193679571151733, "learning_rate": 4.8229130071639096e-05, "loss": 0.1263, "num_input_tokens_seen": 55185328, "step": 25560 }, { "epoch": 4.17047308319739, "grad_norm": 0.7691474556922913, "learning_rate": 4.822781420269968e-05, "loss": 0.1882, "num_input_tokens_seen": 55197072, "step": 25565 }, { "epoch": 4.171288743882545, "grad_norm": 0.5005616545677185, "learning_rate": 4.822649786301914e-05, "loss": 0.0283, "num_input_tokens_seen": 55205936, "step": 25570 }, { "epoch": 4.1721044045677, "grad_norm": 0.5817008018493652, "learning_rate": 4.822518105262415e-05, "loss": 0.0366, "num_input_tokens_seen": 55216560, "step": 25575 }, { "epoch": 4.172920065252855, "grad_norm": 0.3377082347869873, "learning_rate": 4.822386377154141e-05, "loss": 0.0424, "num_input_tokens_seen": 55227504, "step": 25580 }, { "epoch": 4.17373572593801, "grad_norm": 0.8229416608810425, "learning_rate": 4.8222546019797607e-05, "loss": 0.2159, "num_input_tokens_seen": 55238032, "step": 25585 }, { "epoch": 4.174551386623165, "grad_norm": 0.06919066607952118, "learning_rate": 4.8221227797419455e-05, "loss": 0.0926, "num_input_tokens_seen": 55248080, "step": 25590 }, { "epoch": 4.1753670473083195, "grad_norm": 1.487457275390625, "learning_rate": 4.821990910443366e-05, "loss": 0.1636, "num_input_tokens_seen": 55259760, "step": 25595 }, { "epoch": 4.176182707993474, "grad_norm": 0.4260543882846832, "learning_rate": 4.821858994086694e-05, "loss": 0.057, "num_input_tokens_seen": 55270128, "step": 25600 }, { "epoch": 4.17699836867863, "grad_norm": 0.6955088376998901, "learning_rate": 4.821727030674605e-05, "loss": 0.1716, "num_input_tokens_seen": 55280592, "step": 25605 }, { "epoch": 4.177814029363785, "grad_norm": 0.311104953289032, "learning_rate": 4.821595020209773e-05, "loss": 0.0339, "num_input_tokens_seen": 55291344, "step": 25610 }, { "epoch": 4.17862969004894, "grad_norm": 1.4969018697738647, "learning_rate": 4.821462962694871e-05, "loss": 0.1171, "num_input_tokens_seen": 55301360, "step": 25615 }, { "epoch": 4.1794453507340945, "grad_norm": 1.4129279851913452, "learning_rate": 4.821330858132578e-05, "loss": 0.0938, "num_input_tokens_seen": 55311696, "step": 25620 }, { "epoch": 4.180261011419249, "grad_norm": 0.0895453691482544, "learning_rate": 4.821198706525571e-05, "loss": 0.1873, "num_input_tokens_seen": 55322704, "step": 25625 }, { "epoch": 4.181076672104404, "grad_norm": 1.113512396812439, "learning_rate": 4.821066507876527e-05, "loss": 0.1153, "num_input_tokens_seen": 55334704, "step": 25630 }, { "epoch": 4.18189233278956, "grad_norm": 0.19313859939575195, "learning_rate": 4.820934262188126e-05, "loss": 0.1368, "num_input_tokens_seen": 55344912, "step": 25635 }, { "epoch": 4.182707993474715, "grad_norm": 0.1653708517551422, "learning_rate": 4.820801969463047e-05, "loss": 0.1218, "num_input_tokens_seen": 55354832, "step": 25640 }, { "epoch": 4.1835236541598695, "grad_norm": 0.24996820092201233, "learning_rate": 4.8206696297039726e-05, "loss": 0.0641, "num_input_tokens_seen": 55365776, "step": 25645 }, { "epoch": 4.184339314845024, "grad_norm": 0.10244464874267578, "learning_rate": 4.820537242913584e-05, "loss": 0.2197, "num_input_tokens_seen": 55376208, "step": 25650 }, { "epoch": 4.185154975530179, "grad_norm": 1.0420174598693848, "learning_rate": 4.820404809094565e-05, "loss": 0.1752, "num_input_tokens_seen": 55386512, "step": 25655 }, { "epoch": 4.185970636215335, "grad_norm": 0.2862887680530548, "learning_rate": 4.820272328249598e-05, "loss": 0.0626, "num_input_tokens_seen": 55397168, "step": 25660 }, { "epoch": 4.18678629690049, "grad_norm": 0.051897019147872925, "learning_rate": 4.820139800381369e-05, "loss": 0.2502, "num_input_tokens_seen": 55408912, "step": 25665 }, { "epoch": 4.1876019575856445, "grad_norm": 0.41625943779945374, "learning_rate": 4.820007225492564e-05, "loss": 0.1626, "num_input_tokens_seen": 55418672, "step": 25670 }, { "epoch": 4.188417618270799, "grad_norm": 0.8024109601974487, "learning_rate": 4.8198746035858695e-05, "loss": 0.0427, "num_input_tokens_seen": 55430768, "step": 25675 }, { "epoch": 4.189233278955954, "grad_norm": 1.224628210067749, "learning_rate": 4.819741934663973e-05, "loss": 0.1833, "num_input_tokens_seen": 55441232, "step": 25680 }, { "epoch": 4.190048939641109, "grad_norm": 1.373414397239685, "learning_rate": 4.819609218729563e-05, "loss": 0.144, "num_input_tokens_seen": 55453040, "step": 25685 }, { "epoch": 4.190864600326265, "grad_norm": 0.08149244636297226, "learning_rate": 4.81947645578533e-05, "loss": 0.1012, "num_input_tokens_seen": 55462768, "step": 25690 }, { "epoch": 4.191680261011419, "grad_norm": 0.27010467648506165, "learning_rate": 4.8193436458339635e-05, "loss": 0.0768, "num_input_tokens_seen": 55474416, "step": 25695 }, { "epoch": 4.192495921696574, "grad_norm": 0.10846472531557083, "learning_rate": 4.819210788878157e-05, "loss": 0.0977, "num_input_tokens_seen": 55484272, "step": 25700 }, { "epoch": 4.193311582381729, "grad_norm": 0.5620893836021423, "learning_rate": 4.8190778849206006e-05, "loss": 0.0936, "num_input_tokens_seen": 55496336, "step": 25705 }, { "epoch": 4.194127243066884, "grad_norm": 0.33480048179626465, "learning_rate": 4.8189449339639894e-05, "loss": 0.0514, "num_input_tokens_seen": 55508656, "step": 25710 }, { "epoch": 4.19494290375204, "grad_norm": 1.5882704257965088, "learning_rate": 4.8188119360110175e-05, "loss": 0.2716, "num_input_tokens_seen": 55519664, "step": 25715 }, { "epoch": 4.195758564437194, "grad_norm": 0.3093855381011963, "learning_rate": 4.818678891064379e-05, "loss": 0.1103, "num_input_tokens_seen": 55529744, "step": 25720 }, { "epoch": 4.196574225122349, "grad_norm": 0.5376486778259277, "learning_rate": 4.818545799126773e-05, "loss": 0.0626, "num_input_tokens_seen": 55540400, "step": 25725 }, { "epoch": 4.197389885807504, "grad_norm": 2.2375223636627197, "learning_rate": 4.818412660200894e-05, "loss": 0.2813, "num_input_tokens_seen": 55551312, "step": 25730 }, { "epoch": 4.198205546492659, "grad_norm": 0.1187276616692543, "learning_rate": 4.818279474289441e-05, "loss": 0.0783, "num_input_tokens_seen": 55562448, "step": 25735 }, { "epoch": 4.199021207177814, "grad_norm": 0.11240336298942566, "learning_rate": 4.818146241395114e-05, "loss": 0.0371, "num_input_tokens_seen": 55572848, "step": 25740 }, { "epoch": 4.199836867862969, "grad_norm": 0.27698227763175964, "learning_rate": 4.818012961520613e-05, "loss": 0.1318, "num_input_tokens_seen": 55582800, "step": 25745 }, { "epoch": 4.200652528548124, "grad_norm": 0.7018848657608032, "learning_rate": 4.817879634668639e-05, "loss": 0.1284, "num_input_tokens_seen": 55594096, "step": 25750 }, { "epoch": 4.201468189233279, "grad_norm": 0.2515712082386017, "learning_rate": 4.817746260841893e-05, "loss": 0.1011, "num_input_tokens_seen": 55605456, "step": 25755 }, { "epoch": 4.202283849918434, "grad_norm": 0.15787823498249054, "learning_rate": 4.8176128400430785e-05, "loss": 0.0282, "num_input_tokens_seen": 55617072, "step": 25760 }, { "epoch": 4.203099510603589, "grad_norm": 0.45656469464302063, "learning_rate": 4.8174793722749e-05, "loss": 0.3286, "num_input_tokens_seen": 55628176, "step": 25765 }, { "epoch": 4.2039151712887435, "grad_norm": 0.10315176844596863, "learning_rate": 4.817345857540062e-05, "loss": 0.0837, "num_input_tokens_seen": 55637744, "step": 25770 }, { "epoch": 4.204730831973899, "grad_norm": 0.35969844460487366, "learning_rate": 4.817212295841271e-05, "loss": 0.0247, "num_input_tokens_seen": 55649104, "step": 25775 }, { "epoch": 4.205546492659054, "grad_norm": 1.2403157949447632, "learning_rate": 4.817078687181233e-05, "loss": 0.1322, "num_input_tokens_seen": 55660272, "step": 25780 }, { "epoch": 4.206362153344209, "grad_norm": 0.43393826484680176, "learning_rate": 4.8169450315626563e-05, "loss": 0.1398, "num_input_tokens_seen": 55670864, "step": 25785 }, { "epoch": 4.207177814029364, "grad_norm": 1.2778226137161255, "learning_rate": 4.816811328988249e-05, "loss": 0.0802, "num_input_tokens_seen": 55681456, "step": 25790 }, { "epoch": 4.2079934747145185, "grad_norm": 0.6422109007835388, "learning_rate": 4.8166775794607204e-05, "loss": 0.0699, "num_input_tokens_seen": 55692464, "step": 25795 }, { "epoch": 4.208809135399674, "grad_norm": 0.19294600188732147, "learning_rate": 4.816543782982782e-05, "loss": 0.1455, "num_input_tokens_seen": 55702704, "step": 25800 }, { "epoch": 4.209624796084829, "grad_norm": 0.045873068273067474, "learning_rate": 4.816409939557145e-05, "loss": 0.123, "num_input_tokens_seen": 55713040, "step": 25805 }, { "epoch": 4.210440456769984, "grad_norm": 0.32150059938430786, "learning_rate": 4.8162760491865225e-05, "loss": 0.1032, "num_input_tokens_seen": 55723920, "step": 25810 }, { "epoch": 4.211256117455139, "grad_norm": 1.8636517524719238, "learning_rate": 4.816142111873628e-05, "loss": 0.1238, "num_input_tokens_seen": 55735440, "step": 25815 }, { "epoch": 4.212071778140293, "grad_norm": 0.7137307524681091, "learning_rate": 4.8160081276211747e-05, "loss": 0.1119, "num_input_tokens_seen": 55745680, "step": 25820 }, { "epoch": 4.212887438825448, "grad_norm": 0.8633635640144348, "learning_rate": 4.815874096431878e-05, "loss": 0.1406, "num_input_tokens_seen": 55757680, "step": 25825 }, { "epoch": 4.213703099510604, "grad_norm": 1.6914756298065186, "learning_rate": 4.815740018308455e-05, "loss": 0.1011, "num_input_tokens_seen": 55767536, "step": 25830 }, { "epoch": 4.214518760195759, "grad_norm": 0.3737911283969879, "learning_rate": 4.8156058932536244e-05, "loss": 0.1053, "num_input_tokens_seen": 55778032, "step": 25835 }, { "epoch": 4.215334420880914, "grad_norm": 0.5211580991744995, "learning_rate": 4.815471721270101e-05, "loss": 0.1191, "num_input_tokens_seen": 55790000, "step": 25840 }, { "epoch": 4.216150081566068, "grad_norm": 0.09817782044410706, "learning_rate": 4.815337502360606e-05, "loss": 0.125, "num_input_tokens_seen": 55800688, "step": 25845 }, { "epoch": 4.216965742251223, "grad_norm": 0.20631104707717896, "learning_rate": 4.81520323652786e-05, "loss": 0.0451, "num_input_tokens_seen": 55812208, "step": 25850 }, { "epoch": 4.217781402936378, "grad_norm": 1.3304810523986816, "learning_rate": 4.8150689237745834e-05, "loss": 0.0871, "num_input_tokens_seen": 55823120, "step": 25855 }, { "epoch": 4.218597063621534, "grad_norm": 0.06083489954471588, "learning_rate": 4.814934564103498e-05, "loss": 0.0929, "num_input_tokens_seen": 55834416, "step": 25860 }, { "epoch": 4.219412724306689, "grad_norm": 0.12017836421728134, "learning_rate": 4.814800157517326e-05, "loss": 0.1502, "num_input_tokens_seen": 55846480, "step": 25865 }, { "epoch": 4.220228384991843, "grad_norm": 1.471412181854248, "learning_rate": 4.814665704018794e-05, "loss": 0.1112, "num_input_tokens_seen": 55857200, "step": 25870 }, { "epoch": 4.221044045676998, "grad_norm": 0.28538841009140015, "learning_rate": 4.814531203610624e-05, "loss": 0.084, "num_input_tokens_seen": 55868976, "step": 25875 }, { "epoch": 4.221859706362153, "grad_norm": 2.166480541229248, "learning_rate": 4.8143966562955436e-05, "loss": 0.1359, "num_input_tokens_seen": 55881360, "step": 25880 }, { "epoch": 4.222675367047309, "grad_norm": 0.3719070851802826, "learning_rate": 4.8142620620762776e-05, "loss": 0.1251, "num_input_tokens_seen": 55892208, "step": 25885 }, { "epoch": 4.2234910277324635, "grad_norm": 0.020220894366502762, "learning_rate": 4.8141274209555566e-05, "loss": 0.072, "num_input_tokens_seen": 55901936, "step": 25890 }, { "epoch": 4.224306688417618, "grad_norm": 0.8207300305366516, "learning_rate": 4.8139927329361074e-05, "loss": 0.0475, "num_input_tokens_seen": 55912784, "step": 25895 }, { "epoch": 4.225122349102773, "grad_norm": 0.3232976496219635, "learning_rate": 4.81385799802066e-05, "loss": 0.2852, "num_input_tokens_seen": 55924432, "step": 25900 }, { "epoch": 4.225938009787928, "grad_norm": 0.11138935387134552, "learning_rate": 4.813723216211944e-05, "loss": 0.1373, "num_input_tokens_seen": 55935216, "step": 25905 }, { "epoch": 4.226753670473083, "grad_norm": 0.04580778628587723, "learning_rate": 4.813588387512693e-05, "loss": 0.0872, "num_input_tokens_seen": 55945680, "step": 25910 }, { "epoch": 4.2275693311582385, "grad_norm": 0.5010867118835449, "learning_rate": 4.813453511925637e-05, "loss": 0.1962, "num_input_tokens_seen": 55956496, "step": 25915 }, { "epoch": 4.228384991843393, "grad_norm": 0.03530971333384514, "learning_rate": 4.813318589453512e-05, "loss": 0.0494, "num_input_tokens_seen": 55967280, "step": 25920 }, { "epoch": 4.229200652528548, "grad_norm": 0.2475200891494751, "learning_rate": 4.813183620099051e-05, "loss": 0.0859, "num_input_tokens_seen": 55978160, "step": 25925 }, { "epoch": 4.230016313213703, "grad_norm": 0.41330716013908386, "learning_rate": 4.8130486038649894e-05, "loss": 0.0625, "num_input_tokens_seen": 55989808, "step": 25930 }, { "epoch": 4.230831973898858, "grad_norm": 0.06574757397174835, "learning_rate": 4.812913540754063e-05, "loss": 0.1323, "num_input_tokens_seen": 56000944, "step": 25935 }, { "epoch": 4.231647634584013, "grad_norm": 0.05195915326476097, "learning_rate": 4.81277843076901e-05, "loss": 0.0854, "num_input_tokens_seen": 56011408, "step": 25940 }, { "epoch": 4.232463295269168, "grad_norm": 1.544836163520813, "learning_rate": 4.812643273912568e-05, "loss": 0.049, "num_input_tokens_seen": 56020944, "step": 25945 }, { "epoch": 4.233278955954323, "grad_norm": 0.07099777460098267, "learning_rate": 4.812508070187476e-05, "loss": 0.0147, "num_input_tokens_seen": 56031792, "step": 25950 }, { "epoch": 4.234094616639478, "grad_norm": 0.4166511595249176, "learning_rate": 4.812372819596474e-05, "loss": 0.0286, "num_input_tokens_seen": 56041936, "step": 25955 }, { "epoch": 4.234910277324633, "grad_norm": 0.5317455530166626, "learning_rate": 4.812237522142304e-05, "loss": 0.0316, "num_input_tokens_seen": 56052272, "step": 25960 }, { "epoch": 4.235725938009788, "grad_norm": 0.02910112403333187, "learning_rate": 4.812102177827708e-05, "loss": 0.0093, "num_input_tokens_seen": 56061808, "step": 25965 }, { "epoch": 4.236541598694943, "grad_norm": 1.387721300125122, "learning_rate": 4.8119667866554274e-05, "loss": 0.1916, "num_input_tokens_seen": 56071536, "step": 25970 }, { "epoch": 4.237357259380098, "grad_norm": 0.20591223239898682, "learning_rate": 4.8118313486282074e-05, "loss": 0.0904, "num_input_tokens_seen": 56083568, "step": 25975 }, { "epoch": 4.238172920065253, "grad_norm": 0.2925944924354553, "learning_rate": 4.811695863748792e-05, "loss": 0.0919, "num_input_tokens_seen": 56094000, "step": 25980 }, { "epoch": 4.238988580750408, "grad_norm": 1.4076511859893799, "learning_rate": 4.8115603320199276e-05, "loss": 0.2807, "num_input_tokens_seen": 56103952, "step": 25985 }, { "epoch": 4.239804241435563, "grad_norm": 0.2650325894355774, "learning_rate": 4.81142475344436e-05, "loss": 0.0442, "num_input_tokens_seen": 56114224, "step": 25990 }, { "epoch": 4.240619902120717, "grad_norm": 0.08853321522474289, "learning_rate": 4.811289128024838e-05, "loss": 0.0438, "num_input_tokens_seen": 56124464, "step": 25995 }, { "epoch": 4.241435562805873, "grad_norm": 0.06272365897893906, "learning_rate": 4.81115345576411e-05, "loss": 0.049, "num_input_tokens_seen": 56135280, "step": 26000 }, { "epoch": 4.242251223491028, "grad_norm": 0.26545265316963196, "learning_rate": 4.8110177366649246e-05, "loss": 0.0985, "num_input_tokens_seen": 56144976, "step": 26005 }, { "epoch": 4.243066884176183, "grad_norm": 1.0652645826339722, "learning_rate": 4.8108819707300336e-05, "loss": 0.2769, "num_input_tokens_seen": 56156688, "step": 26010 }, { "epoch": 4.2438825448613375, "grad_norm": 0.2511719763278961, "learning_rate": 4.8107461579621874e-05, "loss": 0.0717, "num_input_tokens_seen": 56167856, "step": 26015 }, { "epoch": 4.244698205546492, "grad_norm": 0.7370490431785583, "learning_rate": 4.810610298364139e-05, "loss": 0.0891, "num_input_tokens_seen": 56178928, "step": 26020 }, { "epoch": 4.245513866231648, "grad_norm": 0.11978798359632492, "learning_rate": 4.8104743919386425e-05, "loss": 0.1723, "num_input_tokens_seen": 56189584, "step": 26025 }, { "epoch": 4.246329526916803, "grad_norm": 1.2521419525146484, "learning_rate": 4.810338438688451e-05, "loss": 0.1805, "num_input_tokens_seen": 56200080, "step": 26030 }, { "epoch": 4.247145187601958, "grad_norm": 0.04089481756091118, "learning_rate": 4.8102024386163195e-05, "loss": 0.3394, "num_input_tokens_seen": 56210288, "step": 26035 }, { "epoch": 4.2479608482871125, "grad_norm": 0.37163427472114563, "learning_rate": 4.810066391725006e-05, "loss": 0.0544, "num_input_tokens_seen": 56220976, "step": 26040 }, { "epoch": 4.248776508972267, "grad_norm": 0.5724367499351501, "learning_rate": 4.8099302980172656e-05, "loss": 0.1053, "num_input_tokens_seen": 56232176, "step": 26045 }, { "epoch": 4.249592169657422, "grad_norm": 1.1793181896209717, "learning_rate": 4.8097941574958576e-05, "loss": 0.2729, "num_input_tokens_seen": 56244400, "step": 26050 }, { "epoch": 4.250407830342578, "grad_norm": 0.46193957328796387, "learning_rate": 4.809657970163541e-05, "loss": 0.0423, "num_input_tokens_seen": 56255952, "step": 26055 }, { "epoch": 4.251223491027733, "grad_norm": 0.2222469002008438, "learning_rate": 4.809521736023075e-05, "loss": 0.1095, "num_input_tokens_seen": 56267760, "step": 26060 }, { "epoch": 4.2520391517128875, "grad_norm": 0.8625044822692871, "learning_rate": 4.809385455077222e-05, "loss": 0.0645, "num_input_tokens_seen": 56278800, "step": 26065 }, { "epoch": 4.252854812398042, "grad_norm": 0.03604457154870033, "learning_rate": 4.809249127328743e-05, "loss": 0.077, "num_input_tokens_seen": 56289552, "step": 26070 }, { "epoch": 4.253670473083197, "grad_norm": 0.4146474599838257, "learning_rate": 4.8091127527804e-05, "loss": 0.1689, "num_input_tokens_seen": 56301808, "step": 26075 }, { "epoch": 4.254486133768353, "grad_norm": 1.2348175048828125, "learning_rate": 4.808976331434959e-05, "loss": 0.0818, "num_input_tokens_seen": 56311536, "step": 26080 }, { "epoch": 4.255301794453508, "grad_norm": 0.8971097469329834, "learning_rate": 4.8088398632951834e-05, "loss": 0.1116, "num_input_tokens_seen": 56323344, "step": 26085 }, { "epoch": 4.2561174551386625, "grad_norm": 0.7345790863037109, "learning_rate": 4.8087033483638383e-05, "loss": 0.1229, "num_input_tokens_seen": 56333904, "step": 26090 }, { "epoch": 4.256933115823817, "grad_norm": 1.0284110307693481, "learning_rate": 4.808566786643691e-05, "loss": 0.0651, "num_input_tokens_seen": 56343312, "step": 26095 }, { "epoch": 4.257748776508972, "grad_norm": 0.5735546350479126, "learning_rate": 4.80843017813751e-05, "loss": 0.0812, "num_input_tokens_seen": 56354960, "step": 26100 }, { "epoch": 4.258564437194127, "grad_norm": 0.20935113728046417, "learning_rate": 4.808293522848062e-05, "loss": 0.0969, "num_input_tokens_seen": 56366032, "step": 26105 }, { "epoch": 4.259380097879283, "grad_norm": 0.870002806186676, "learning_rate": 4.808156820778118e-05, "loss": 0.1166, "num_input_tokens_seen": 56376496, "step": 26110 }, { "epoch": 4.260195758564437, "grad_norm": 0.06772430986166, "learning_rate": 4.808020071930448e-05, "loss": 0.1544, "num_input_tokens_seen": 56387408, "step": 26115 }, { "epoch": 4.261011419249592, "grad_norm": 0.8058800101280212, "learning_rate": 4.807883276307823e-05, "loss": 0.1745, "num_input_tokens_seen": 56397328, "step": 26120 }, { "epoch": 4.261827079934747, "grad_norm": 0.5669854879379272, "learning_rate": 4.807746433913016e-05, "loss": 0.1792, "num_input_tokens_seen": 56406640, "step": 26125 }, { "epoch": 4.262642740619902, "grad_norm": 1.0350654125213623, "learning_rate": 4.8076095447488e-05, "loss": 0.1185, "num_input_tokens_seen": 56416400, "step": 26130 }, { "epoch": 4.263458401305057, "grad_norm": 1.8997101783752441, "learning_rate": 4.807472608817949e-05, "loss": 0.1638, "num_input_tokens_seen": 56426320, "step": 26135 }, { "epoch": 4.264274061990212, "grad_norm": 0.40563997626304626, "learning_rate": 4.807335626123238e-05, "loss": 0.3817, "num_input_tokens_seen": 56436400, "step": 26140 }, { "epoch": 4.265089722675367, "grad_norm": 1.546213150024414, "learning_rate": 4.8071985966674436e-05, "loss": 0.2117, "num_input_tokens_seen": 56445712, "step": 26145 }, { "epoch": 4.265905383360522, "grad_norm": 0.11302388459444046, "learning_rate": 4.807061520453343e-05, "loss": 0.1355, "num_input_tokens_seen": 56456720, "step": 26150 }, { "epoch": 4.266721044045677, "grad_norm": 0.38464462757110596, "learning_rate": 4.806924397483714e-05, "loss": 0.1087, "num_input_tokens_seen": 56468464, "step": 26155 }, { "epoch": 4.267536704730832, "grad_norm": 2.1493349075317383, "learning_rate": 4.806787227761336e-05, "loss": 0.2634, "num_input_tokens_seen": 56478096, "step": 26160 }, { "epoch": 4.268352365415987, "grad_norm": 1.8247092962265015, "learning_rate": 4.806650011288987e-05, "loss": 0.375, "num_input_tokens_seen": 56489776, "step": 26165 }, { "epoch": 4.269168026101142, "grad_norm": 0.4195447266101837, "learning_rate": 4.8065127480694507e-05, "loss": 0.1278, "num_input_tokens_seen": 56501232, "step": 26170 }, { "epoch": 4.269983686786297, "grad_norm": 0.22090935707092285, "learning_rate": 4.806375438105507e-05, "loss": 0.0673, "num_input_tokens_seen": 56511664, "step": 26175 }, { "epoch": 4.270799347471452, "grad_norm": 0.6064654588699341, "learning_rate": 4.806238081399939e-05, "loss": 0.1024, "num_input_tokens_seen": 56522032, "step": 26180 }, { "epoch": 4.271615008156607, "grad_norm": 0.07104453444480896, "learning_rate": 4.80610067795553e-05, "loss": 0.1935, "num_input_tokens_seen": 56533008, "step": 26185 }, { "epoch": 4.2724306688417615, "grad_norm": 0.5718966126441956, "learning_rate": 4.805963227775067e-05, "loss": 0.0555, "num_input_tokens_seen": 56544176, "step": 26190 }, { "epoch": 4.273246329526917, "grad_norm": 0.5399571061134338, "learning_rate": 4.805825730861333e-05, "loss": 0.1533, "num_input_tokens_seen": 56555376, "step": 26195 }, { "epoch": 4.274061990212072, "grad_norm": 0.3615507483482361, "learning_rate": 4.8056881872171156e-05, "loss": 0.144, "num_input_tokens_seen": 56566256, "step": 26200 }, { "epoch": 4.274877650897227, "grad_norm": 0.8026504516601562, "learning_rate": 4.805550596845202e-05, "loss": 0.1146, "num_input_tokens_seen": 56576976, "step": 26205 }, { "epoch": 4.275693311582382, "grad_norm": 0.17811210453510284, "learning_rate": 4.8054129597483804e-05, "loss": 0.1175, "num_input_tokens_seen": 56587504, "step": 26210 }, { "epoch": 4.2765089722675365, "grad_norm": 1.072396159172058, "learning_rate": 4.8052752759294405e-05, "loss": 0.1504, "num_input_tokens_seen": 56598800, "step": 26215 }, { "epoch": 4.277324632952691, "grad_norm": 2.3205347061157227, "learning_rate": 4.805137545391173e-05, "loss": 0.1651, "num_input_tokens_seen": 56609584, "step": 26220 }, { "epoch": 4.278140293637847, "grad_norm": 0.04567793756723404, "learning_rate": 4.804999768136369e-05, "loss": 0.1037, "num_input_tokens_seen": 56619984, "step": 26225 }, { "epoch": 4.278955954323002, "grad_norm": 0.16899140179157257, "learning_rate": 4.8048619441678206e-05, "loss": 0.1503, "num_input_tokens_seen": 56630128, "step": 26230 }, { "epoch": 4.279771615008157, "grad_norm": 1.5654263496398926, "learning_rate": 4.804724073488321e-05, "loss": 0.1563, "num_input_tokens_seen": 56641264, "step": 26235 }, { "epoch": 4.280587275693311, "grad_norm": 0.22006098926067352, "learning_rate": 4.8045861561006645e-05, "loss": 0.0426, "num_input_tokens_seen": 56652688, "step": 26240 }, { "epoch": 4.281402936378466, "grad_norm": 1.157105803489685, "learning_rate": 4.804448192007646e-05, "loss": 0.1423, "num_input_tokens_seen": 56663600, "step": 26245 }, { "epoch": 4.282218597063622, "grad_norm": 0.06865498423576355, "learning_rate": 4.8043101812120604e-05, "loss": 0.05, "num_input_tokens_seen": 56674544, "step": 26250 }, { "epoch": 4.283034257748777, "grad_norm": 0.27554336190223694, "learning_rate": 4.8041721237167066e-05, "loss": 0.0329, "num_input_tokens_seen": 56685680, "step": 26255 }, { "epoch": 4.283849918433932, "grad_norm": 1.4981695413589478, "learning_rate": 4.8040340195243816e-05, "loss": 0.1343, "num_input_tokens_seen": 56696752, "step": 26260 }, { "epoch": 4.284665579119086, "grad_norm": 0.19654826819896698, "learning_rate": 4.803895868637884e-05, "loss": 0.144, "num_input_tokens_seen": 56707600, "step": 26265 }, { "epoch": 4.285481239804241, "grad_norm": 0.39263829588890076, "learning_rate": 4.803757671060014e-05, "loss": 0.1729, "num_input_tokens_seen": 56718608, "step": 26270 }, { "epoch": 4.286296900489396, "grad_norm": 0.1465264856815338, "learning_rate": 4.803619426793572e-05, "loss": 0.2023, "num_input_tokens_seen": 56729840, "step": 26275 }, { "epoch": 4.287112561174552, "grad_norm": 0.4457131028175354, "learning_rate": 4.803481135841361e-05, "loss": 0.1174, "num_input_tokens_seen": 56739984, "step": 26280 }, { "epoch": 4.287928221859707, "grad_norm": 0.28897130489349365, "learning_rate": 4.803342798206182e-05, "loss": 0.299, "num_input_tokens_seen": 56751248, "step": 26285 }, { "epoch": 4.288743882544861, "grad_norm": 0.22609871625900269, "learning_rate": 4.803204413890839e-05, "loss": 0.1503, "num_input_tokens_seen": 56762480, "step": 26290 }, { "epoch": 4.289559543230016, "grad_norm": 1.7748870849609375, "learning_rate": 4.803065982898137e-05, "loss": 0.1789, "num_input_tokens_seen": 56772016, "step": 26295 }, { "epoch": 4.290375203915171, "grad_norm": 0.7779238820075989, "learning_rate": 4.802927505230881e-05, "loss": 0.0716, "num_input_tokens_seen": 56781168, "step": 26300 }, { "epoch": 4.291190864600326, "grad_norm": 1.1238784790039062, "learning_rate": 4.802788980891878e-05, "loss": 0.1281, "num_input_tokens_seen": 56792720, "step": 26305 }, { "epoch": 4.2920065252854815, "grad_norm": 0.04922053590416908, "learning_rate": 4.802650409883935e-05, "loss": 0.0956, "num_input_tokens_seen": 56803856, "step": 26310 }, { "epoch": 4.292822185970636, "grad_norm": 0.20025864243507385, "learning_rate": 4.80251179220986e-05, "loss": 0.101, "num_input_tokens_seen": 56814480, "step": 26315 }, { "epoch": 4.293637846655791, "grad_norm": 0.14762133359909058, "learning_rate": 4.8023731278724625e-05, "loss": 0.1399, "num_input_tokens_seen": 56824816, "step": 26320 }, { "epoch": 4.294453507340946, "grad_norm": 0.5963190793991089, "learning_rate": 4.8022344168745534e-05, "loss": 0.2656, "num_input_tokens_seen": 56834928, "step": 26325 }, { "epoch": 4.295269168026101, "grad_norm": 0.20524193346500397, "learning_rate": 4.8020956592189424e-05, "loss": 0.0443, "num_input_tokens_seen": 56845680, "step": 26330 }, { "epoch": 4.2960848287112565, "grad_norm": 0.059612076729536057, "learning_rate": 4.801956854908443e-05, "loss": 0.0889, "num_input_tokens_seen": 56856464, "step": 26335 }, { "epoch": 4.296900489396411, "grad_norm": 0.25603246688842773, "learning_rate": 4.8018180039458677e-05, "loss": 0.1508, "num_input_tokens_seen": 56867920, "step": 26340 }, { "epoch": 4.297716150081566, "grad_norm": 0.10952114313840866, "learning_rate": 4.80167910633403e-05, "loss": 0.0682, "num_input_tokens_seen": 56878672, "step": 26345 }, { "epoch": 4.298531810766721, "grad_norm": 0.4214134216308594, "learning_rate": 4.8015401620757464e-05, "loss": 0.0528, "num_input_tokens_seen": 56889616, "step": 26350 }, { "epoch": 4.299347471451876, "grad_norm": 0.13861463963985443, "learning_rate": 4.801401171173831e-05, "loss": 0.1121, "num_input_tokens_seen": 56899184, "step": 26355 }, { "epoch": 4.300163132137031, "grad_norm": 0.31867697834968567, "learning_rate": 4.8012621336311016e-05, "loss": 0.1188, "num_input_tokens_seen": 56910480, "step": 26360 }, { "epoch": 4.300978792822186, "grad_norm": 0.3953915536403656, "learning_rate": 4.801123049450375e-05, "loss": 0.2514, "num_input_tokens_seen": 56921040, "step": 26365 }, { "epoch": 4.301794453507341, "grad_norm": 1.0748251676559448, "learning_rate": 4.800983918634471e-05, "loss": 0.1783, "num_input_tokens_seen": 56931920, "step": 26370 }, { "epoch": 4.302610114192496, "grad_norm": 0.6693459749221802, "learning_rate": 4.800844741186209e-05, "loss": 0.116, "num_input_tokens_seen": 56942960, "step": 26375 }, { "epoch": 4.303425774877651, "grad_norm": 0.5343934297561646, "learning_rate": 4.8007055171084094e-05, "loss": 0.1608, "num_input_tokens_seen": 56955728, "step": 26380 }, { "epoch": 4.304241435562806, "grad_norm": 1.8360040187835693, "learning_rate": 4.800566246403894e-05, "loss": 0.0984, "num_input_tokens_seen": 56966224, "step": 26385 }, { "epoch": 4.30505709624796, "grad_norm": 1.2843599319458008, "learning_rate": 4.800426929075485e-05, "loss": 0.2213, "num_input_tokens_seen": 56975440, "step": 26390 }, { "epoch": 4.305872756933116, "grad_norm": 1.2595045566558838, "learning_rate": 4.800287565126006e-05, "loss": 0.0795, "num_input_tokens_seen": 56986416, "step": 26395 }, { "epoch": 4.306688417618271, "grad_norm": 0.532971203327179, "learning_rate": 4.800148154558282e-05, "loss": 0.1308, "num_input_tokens_seen": 56997392, "step": 26400 }, { "epoch": 4.307504078303426, "grad_norm": 0.44628965854644775, "learning_rate": 4.800008697375137e-05, "loss": 0.1215, "num_input_tokens_seen": 57008144, "step": 26405 }, { "epoch": 4.308319738988581, "grad_norm": 0.5848250985145569, "learning_rate": 4.799869193579397e-05, "loss": 0.1019, "num_input_tokens_seen": 57019472, "step": 26410 }, { "epoch": 4.309135399673735, "grad_norm": 0.11433353275060654, "learning_rate": 4.799729643173891e-05, "loss": 0.1821, "num_input_tokens_seen": 57029136, "step": 26415 }, { "epoch": 4.309951060358891, "grad_norm": 1.0691466331481934, "learning_rate": 4.7995900461614476e-05, "loss": 0.1, "num_input_tokens_seen": 57039504, "step": 26420 }, { "epoch": 4.310766721044046, "grad_norm": 0.6712072491645813, "learning_rate": 4.7994504025448936e-05, "loss": 0.1584, "num_input_tokens_seen": 57050064, "step": 26425 }, { "epoch": 4.311582381729201, "grad_norm": 0.5378607511520386, "learning_rate": 4.7993107123270594e-05, "loss": 0.0476, "num_input_tokens_seen": 57061232, "step": 26430 }, { "epoch": 4.3123980424143555, "grad_norm": 0.06208248436450958, "learning_rate": 4.799170975510778e-05, "loss": 0.2119, "num_input_tokens_seen": 57072016, "step": 26435 }, { "epoch": 4.31321370309951, "grad_norm": 0.4202030301094055, "learning_rate": 4.7990311920988794e-05, "loss": 0.083, "num_input_tokens_seen": 57082576, "step": 26440 }, { "epoch": 4.314029363784665, "grad_norm": 0.5966748595237732, "learning_rate": 4.798891362094198e-05, "loss": 0.1055, "num_input_tokens_seen": 57093712, "step": 26445 }, { "epoch": 4.314845024469821, "grad_norm": 0.5190337300300598, "learning_rate": 4.7987514854995666e-05, "loss": 0.0979, "num_input_tokens_seen": 57104432, "step": 26450 }, { "epoch": 4.315660685154976, "grad_norm": 0.4694775342941284, "learning_rate": 4.79861156231782e-05, "loss": 0.0524, "num_input_tokens_seen": 57114576, "step": 26455 }, { "epoch": 4.3164763458401305, "grad_norm": 1.538251280784607, "learning_rate": 4.798471592551793e-05, "loss": 0.2989, "num_input_tokens_seen": 57125744, "step": 26460 }, { "epoch": 4.317292006525285, "grad_norm": 0.11368045210838318, "learning_rate": 4.7983315762043247e-05, "loss": 0.1803, "num_input_tokens_seen": 57136784, "step": 26465 }, { "epoch": 4.31810766721044, "grad_norm": 1.7783496379852295, "learning_rate": 4.798191513278251e-05, "loss": 0.0894, "num_input_tokens_seen": 57147888, "step": 26470 }, { "epoch": 4.318923327895595, "grad_norm": 0.06657412648200989, "learning_rate": 4.79805140377641e-05, "loss": 0.0834, "num_input_tokens_seen": 57158704, "step": 26475 }, { "epoch": 4.319738988580751, "grad_norm": 0.028807159513235092, "learning_rate": 4.797911247701643e-05, "loss": 0.045, "num_input_tokens_seen": 57170096, "step": 26480 }, { "epoch": 4.3205546492659055, "grad_norm": 0.22919422388076782, "learning_rate": 4.7977710450567894e-05, "loss": 0.0737, "num_input_tokens_seen": 57181008, "step": 26485 }, { "epoch": 4.32137030995106, "grad_norm": 0.7959059476852417, "learning_rate": 4.79763079584469e-05, "loss": 0.1201, "num_input_tokens_seen": 57192752, "step": 26490 }, { "epoch": 4.322185970636215, "grad_norm": 0.6220296025276184, "learning_rate": 4.797490500068188e-05, "loss": 0.1973, "num_input_tokens_seen": 57204976, "step": 26495 }, { "epoch": 4.32300163132137, "grad_norm": 0.8968259692192078, "learning_rate": 4.7973501577301275e-05, "loss": 0.1792, "num_input_tokens_seen": 57216592, "step": 26500 }, { "epoch": 4.323817292006526, "grad_norm": 1.5164811611175537, "learning_rate": 4.7972097688333504e-05, "loss": 0.0681, "num_input_tokens_seen": 57227568, "step": 26505 }, { "epoch": 4.3246329526916805, "grad_norm": 1.1288363933563232, "learning_rate": 4.797069333380704e-05, "loss": 0.0863, "num_input_tokens_seen": 57238096, "step": 26510 }, { "epoch": 4.325448613376835, "grad_norm": 0.26089197397232056, "learning_rate": 4.7969288513750326e-05, "loss": 0.0576, "num_input_tokens_seen": 57250416, "step": 26515 }, { "epoch": 4.32626427406199, "grad_norm": 0.07772478461265564, "learning_rate": 4.796788322819185e-05, "loss": 0.1744, "num_input_tokens_seen": 57261008, "step": 26520 }, { "epoch": 4.327079934747145, "grad_norm": 1.6897026300430298, "learning_rate": 4.796647747716008e-05, "loss": 0.1748, "num_input_tokens_seen": 57271344, "step": 26525 }, { "epoch": 4.327895595432301, "grad_norm": 1.8950369358062744, "learning_rate": 4.7965071260683504e-05, "loss": 0.1183, "num_input_tokens_seen": 57280752, "step": 26530 }, { "epoch": 4.328711256117455, "grad_norm": 0.24709345400333405, "learning_rate": 4.7963664578790635e-05, "loss": 0.1959, "num_input_tokens_seen": 57290736, "step": 26535 }, { "epoch": 4.32952691680261, "grad_norm": 0.572606086730957, "learning_rate": 4.796225743150996e-05, "loss": 0.0996, "num_input_tokens_seen": 57301680, "step": 26540 }, { "epoch": 4.330342577487765, "grad_norm": 0.08464416116476059, "learning_rate": 4.796084981887002e-05, "loss": 0.1501, "num_input_tokens_seen": 57313200, "step": 26545 }, { "epoch": 4.33115823817292, "grad_norm": 0.9316269159317017, "learning_rate": 4.7959441740899325e-05, "loss": 0.1103, "num_input_tokens_seen": 57323856, "step": 26550 }, { "epoch": 4.331973898858075, "grad_norm": 1.011710286140442, "learning_rate": 4.7958033197626415e-05, "loss": 0.0622, "num_input_tokens_seen": 57334768, "step": 26555 }, { "epoch": 4.33278955954323, "grad_norm": 1.4949054718017578, "learning_rate": 4.7956624189079844e-05, "loss": 0.1814, "num_input_tokens_seen": 57344720, "step": 26560 }, { "epoch": 4.333605220228385, "grad_norm": 0.8746983408927917, "learning_rate": 4.795521471528816e-05, "loss": 0.1972, "num_input_tokens_seen": 57356784, "step": 26565 }, { "epoch": 4.33442088091354, "grad_norm": 0.5189886093139648, "learning_rate": 4.7953804776279925e-05, "loss": 0.0799, "num_input_tokens_seen": 57367184, "step": 26570 }, { "epoch": 4.335236541598695, "grad_norm": 0.22267931699752808, "learning_rate": 4.795239437208372e-05, "loss": 0.1469, "num_input_tokens_seen": 57377840, "step": 26575 }, { "epoch": 4.33605220228385, "grad_norm": 0.11173642426729202, "learning_rate": 4.795098350272813e-05, "loss": 0.1108, "num_input_tokens_seen": 57389232, "step": 26580 }, { "epoch": 4.3368678629690045, "grad_norm": 1.4190404415130615, "learning_rate": 4.794957216824173e-05, "loss": 0.148, "num_input_tokens_seen": 57400176, "step": 26585 }, { "epoch": 4.33768352365416, "grad_norm": 0.043128155171871185, "learning_rate": 4.794816036865315e-05, "loss": 0.1463, "num_input_tokens_seen": 57410768, "step": 26590 }, { "epoch": 4.338499184339315, "grad_norm": 0.98707115650177, "learning_rate": 4.7946748103990985e-05, "loss": 0.1578, "num_input_tokens_seen": 57421680, "step": 26595 }, { "epoch": 4.33931484502447, "grad_norm": 2.3808774948120117, "learning_rate": 4.794533537428386e-05, "loss": 0.1638, "num_input_tokens_seen": 57431600, "step": 26600 }, { "epoch": 4.340130505709625, "grad_norm": 0.1058688685297966, "learning_rate": 4.794392217956041e-05, "loss": 0.0384, "num_input_tokens_seen": 57442640, "step": 26605 }, { "epoch": 4.3409461663947795, "grad_norm": 1.2410411834716797, "learning_rate": 4.794250851984926e-05, "loss": 0.1356, "num_input_tokens_seen": 57453072, "step": 26610 }, { "epoch": 4.341761827079935, "grad_norm": 0.4639667868614197, "learning_rate": 4.794109439517908e-05, "loss": 0.0885, "num_input_tokens_seen": 57463792, "step": 26615 }, { "epoch": 4.34257748776509, "grad_norm": 0.5237468481063843, "learning_rate": 4.7939679805578505e-05, "loss": 0.1364, "num_input_tokens_seen": 57474448, "step": 26620 }, { "epoch": 4.343393148450245, "grad_norm": 0.7102389335632324, "learning_rate": 4.793826475107623e-05, "loss": 0.0817, "num_input_tokens_seen": 57485904, "step": 26625 }, { "epoch": 4.3442088091354, "grad_norm": 0.3473008871078491, "learning_rate": 4.793684923170092e-05, "loss": 0.2344, "num_input_tokens_seen": 57497008, "step": 26630 }, { "epoch": 4.3450244698205545, "grad_norm": 0.19898109138011932, "learning_rate": 4.7935433247481254e-05, "loss": 0.0821, "num_input_tokens_seen": 57507920, "step": 26635 }, { "epoch": 4.345840130505709, "grad_norm": 0.3917064666748047, "learning_rate": 4.793401679844595e-05, "loss": 0.1231, "num_input_tokens_seen": 57518832, "step": 26640 }, { "epoch": 4.346655791190865, "grad_norm": 0.08607026934623718, "learning_rate": 4.79325998846237e-05, "loss": 0.1128, "num_input_tokens_seen": 57529808, "step": 26645 }, { "epoch": 4.34747145187602, "grad_norm": 1.578014612197876, "learning_rate": 4.793118250604321e-05, "loss": 0.1041, "num_input_tokens_seen": 57540944, "step": 26650 }, { "epoch": 4.348287112561175, "grad_norm": 0.21928492188453674, "learning_rate": 4.7929764662733226e-05, "loss": 0.1712, "num_input_tokens_seen": 57552176, "step": 26655 }, { "epoch": 4.349102773246329, "grad_norm": 0.11503029614686966, "learning_rate": 4.792834635472246e-05, "loss": 0.0831, "num_input_tokens_seen": 57563184, "step": 26660 }, { "epoch": 4.349918433931484, "grad_norm": 0.4783203601837158, "learning_rate": 4.792692758203968e-05, "loss": 0.3111, "num_input_tokens_seen": 57574320, "step": 26665 }, { "epoch": 4.350734094616639, "grad_norm": 0.19066907465457916, "learning_rate": 4.792550834471363e-05, "loss": 0.1126, "num_input_tokens_seen": 57584944, "step": 26670 }, { "epoch": 4.351549755301795, "grad_norm": 0.7715592980384827, "learning_rate": 4.792408864277307e-05, "loss": 0.073, "num_input_tokens_seen": 57595600, "step": 26675 }, { "epoch": 4.35236541598695, "grad_norm": 2.6488006114959717, "learning_rate": 4.792266847624677e-05, "loss": 0.3377, "num_input_tokens_seen": 57607472, "step": 26680 }, { "epoch": 4.353181076672104, "grad_norm": 1.9662446975708008, "learning_rate": 4.792124784516351e-05, "loss": 0.3082, "num_input_tokens_seen": 57618800, "step": 26685 }, { "epoch": 4.353996737357259, "grad_norm": 0.4172869622707367, "learning_rate": 4.7919826749552085e-05, "loss": 0.0366, "num_input_tokens_seen": 57629040, "step": 26690 }, { "epoch": 4.354812398042414, "grad_norm": 1.2370604276657104, "learning_rate": 4.7918405189441296e-05, "loss": 0.2336, "num_input_tokens_seen": 57639728, "step": 26695 }, { "epoch": 4.35562805872757, "grad_norm": 0.9561865329742432, "learning_rate": 4.791698316485995e-05, "loss": 0.1327, "num_input_tokens_seen": 57650512, "step": 26700 }, { "epoch": 4.356443719412725, "grad_norm": 0.180534228682518, "learning_rate": 4.791556067583688e-05, "loss": 0.1727, "num_input_tokens_seen": 57662096, "step": 26705 }, { "epoch": 4.357259380097879, "grad_norm": 0.18550752103328705, "learning_rate": 4.791413772240089e-05, "loss": 0.2137, "num_input_tokens_seen": 57672080, "step": 26710 }, { "epoch": 4.358075040783034, "grad_norm": 0.28234127163887024, "learning_rate": 4.791271430458083e-05, "loss": 0.2679, "num_input_tokens_seen": 57682704, "step": 26715 }, { "epoch": 4.358890701468189, "grad_norm": 1.2657414674758911, "learning_rate": 4.7911290422405555e-05, "loss": 0.1736, "num_input_tokens_seen": 57693520, "step": 26720 }, { "epoch": 4.359706362153344, "grad_norm": 1.009494662284851, "learning_rate": 4.790986607590391e-05, "loss": 0.1536, "num_input_tokens_seen": 57703312, "step": 26725 }, { "epoch": 4.3605220228384995, "grad_norm": 1.7354490756988525, "learning_rate": 4.790844126510477e-05, "loss": 0.1636, "num_input_tokens_seen": 57714672, "step": 26730 }, { "epoch": 4.361337683523654, "grad_norm": 0.0368332676589489, "learning_rate": 4.7907015990037e-05, "loss": 0.0508, "num_input_tokens_seen": 57724464, "step": 26735 }, { "epoch": 4.362153344208809, "grad_norm": 0.764736533164978, "learning_rate": 4.7905590250729494e-05, "loss": 0.055, "num_input_tokens_seen": 57735856, "step": 26740 }, { "epoch": 4.362969004893964, "grad_norm": 0.3687175512313843, "learning_rate": 4.790416404721114e-05, "loss": 0.1203, "num_input_tokens_seen": 57747440, "step": 26745 }, { "epoch": 4.363784665579119, "grad_norm": 0.7986684441566467, "learning_rate": 4.7902737379510855e-05, "loss": 0.1206, "num_input_tokens_seen": 57757168, "step": 26750 }, { "epoch": 4.364600326264274, "grad_norm": 0.10932793468236923, "learning_rate": 4.7901310247657535e-05, "loss": 0.0422, "num_input_tokens_seen": 57768112, "step": 26755 }, { "epoch": 4.365415986949429, "grad_norm": 0.3923684060573578, "learning_rate": 4.7899882651680115e-05, "loss": 0.138, "num_input_tokens_seen": 57778480, "step": 26760 }, { "epoch": 4.366231647634584, "grad_norm": 0.13486547768115997, "learning_rate": 4.789845459160752e-05, "loss": 0.0564, "num_input_tokens_seen": 57789328, "step": 26765 }, { "epoch": 4.367047308319739, "grad_norm": 0.1581922471523285, "learning_rate": 4.78970260674687e-05, "loss": 0.0951, "num_input_tokens_seen": 57800112, "step": 26770 }, { "epoch": 4.367862969004894, "grad_norm": 0.26419132947921753, "learning_rate": 4.789559707929259e-05, "loss": 0.2262, "num_input_tokens_seen": 57810064, "step": 26775 }, { "epoch": 4.368678629690049, "grad_norm": 0.9482823014259338, "learning_rate": 4.7894167627108164e-05, "loss": 0.0717, "num_input_tokens_seen": 57821680, "step": 26780 }, { "epoch": 4.369494290375204, "grad_norm": 1.3786282539367676, "learning_rate": 4.789273771094439e-05, "loss": 0.1815, "num_input_tokens_seen": 57832496, "step": 26785 }, { "epoch": 4.370309951060359, "grad_norm": 0.3135034143924713, "learning_rate": 4.7891307330830245e-05, "loss": 0.1799, "num_input_tokens_seen": 57842768, "step": 26790 }, { "epoch": 4.371125611745514, "grad_norm": 0.9906368255615234, "learning_rate": 4.788987648679472e-05, "loss": 0.2528, "num_input_tokens_seen": 57853008, "step": 26795 }, { "epoch": 4.371941272430669, "grad_norm": 0.9210572838783264, "learning_rate": 4.78884451788668e-05, "loss": 0.1031, "num_input_tokens_seen": 57863312, "step": 26800 }, { "epoch": 4.372756933115824, "grad_norm": 1.0770577192306519, "learning_rate": 4.788701340707551e-05, "loss": 0.1233, "num_input_tokens_seen": 57873552, "step": 26805 }, { "epoch": 4.373572593800978, "grad_norm": 1.1412591934204102, "learning_rate": 4.788558117144986e-05, "loss": 0.1947, "num_input_tokens_seen": 57884336, "step": 26810 }, { "epoch": 4.374388254486134, "grad_norm": 0.9860005974769592, "learning_rate": 4.7884148472018863e-05, "loss": 0.2201, "num_input_tokens_seen": 57894512, "step": 26815 }, { "epoch": 4.375203915171289, "grad_norm": 0.06437418609857559, "learning_rate": 4.788271530881158e-05, "loss": 0.1132, "num_input_tokens_seen": 57906576, "step": 26820 }, { "epoch": 4.376019575856444, "grad_norm": 0.22139890491962433, "learning_rate": 4.7881281681857036e-05, "loss": 0.1264, "num_input_tokens_seen": 57916816, "step": 26825 }, { "epoch": 4.376835236541599, "grad_norm": 0.07457280904054642, "learning_rate": 4.787984759118429e-05, "loss": 0.0874, "num_input_tokens_seen": 57927088, "step": 26830 }, { "epoch": 4.377650897226753, "grad_norm": 0.3821512460708618, "learning_rate": 4.7878413036822414e-05, "loss": 0.0835, "num_input_tokens_seen": 57939120, "step": 26835 }, { "epoch": 4.378466557911908, "grad_norm": 0.6623501777648926, "learning_rate": 4.7876978018800475e-05, "loss": 0.1596, "num_input_tokens_seen": 57949264, "step": 26840 }, { "epoch": 4.379282218597064, "grad_norm": 0.9815965294837952, "learning_rate": 4.787554253714755e-05, "loss": 0.2065, "num_input_tokens_seen": 57961328, "step": 26845 }, { "epoch": 4.380097879282219, "grad_norm": 0.9595609307289124, "learning_rate": 4.787410659189274e-05, "loss": 0.1941, "num_input_tokens_seen": 57972816, "step": 26850 }, { "epoch": 4.3809135399673735, "grad_norm": 0.8846819400787354, "learning_rate": 4.7872670183065136e-05, "loss": 0.1143, "num_input_tokens_seen": 57983440, "step": 26855 }, { "epoch": 4.381729200652528, "grad_norm": 0.9127613306045532, "learning_rate": 4.787123331069386e-05, "loss": 0.1889, "num_input_tokens_seen": 57994384, "step": 26860 }, { "epoch": 4.382544861337683, "grad_norm": 0.7951414585113525, "learning_rate": 4.786979597480802e-05, "loss": 0.0907, "num_input_tokens_seen": 58006192, "step": 26865 }, { "epoch": 4.383360522022839, "grad_norm": 0.9666707515716553, "learning_rate": 4.786835817543676e-05, "loss": 0.0572, "num_input_tokens_seen": 58016784, "step": 26870 }, { "epoch": 4.384176182707994, "grad_norm": 1.0345607995986938, "learning_rate": 4.786691991260921e-05, "loss": 0.1108, "num_input_tokens_seen": 58026736, "step": 26875 }, { "epoch": 4.3849918433931485, "grad_norm": 0.15325278043746948, "learning_rate": 4.786548118635452e-05, "loss": 0.0849, "num_input_tokens_seen": 58037200, "step": 26880 }, { "epoch": 4.385807504078303, "grad_norm": 0.2709228992462158, "learning_rate": 4.786404199670184e-05, "loss": 0.06, "num_input_tokens_seen": 58049200, "step": 26885 }, { "epoch": 4.386623164763458, "grad_norm": 0.3433147668838501, "learning_rate": 4.786260234368035e-05, "loss": 0.1085, "num_input_tokens_seen": 58060144, "step": 26890 }, { "epoch": 4.387438825448613, "grad_norm": 0.30326759815216064, "learning_rate": 4.786116222731922e-05, "loss": 0.1434, "num_input_tokens_seen": 58071600, "step": 26895 }, { "epoch": 4.388254486133769, "grad_norm": 0.49713844060897827, "learning_rate": 4.7859721647647635e-05, "loss": 0.2056, "num_input_tokens_seen": 58081616, "step": 26900 }, { "epoch": 4.3890701468189235, "grad_norm": 0.4039410948753357, "learning_rate": 4.785828060469478e-05, "loss": 0.2282, "num_input_tokens_seen": 58092976, "step": 26905 }, { "epoch": 4.389885807504078, "grad_norm": 0.5501205325126648, "learning_rate": 4.785683909848989e-05, "loss": 0.0801, "num_input_tokens_seen": 58103536, "step": 26910 }, { "epoch": 4.390701468189233, "grad_norm": 0.1747772991657257, "learning_rate": 4.7855397129062154e-05, "loss": 0.0355, "num_input_tokens_seen": 58115504, "step": 26915 }, { "epoch": 4.391517128874388, "grad_norm": 0.23450756072998047, "learning_rate": 4.7853954696440796e-05, "loss": 0.1126, "num_input_tokens_seen": 58125648, "step": 26920 }, { "epoch": 4.392332789559543, "grad_norm": 1.7035034894943237, "learning_rate": 4.7852511800655054e-05, "loss": 0.1511, "num_input_tokens_seen": 58136816, "step": 26925 }, { "epoch": 4.3931484502446985, "grad_norm": 1.5246707201004028, "learning_rate": 4.785106844173417e-05, "loss": 0.2106, "num_input_tokens_seen": 58146832, "step": 26930 }, { "epoch": 4.393964110929853, "grad_norm": 0.6717075109481812, "learning_rate": 4.78496246197074e-05, "loss": 0.238, "num_input_tokens_seen": 58156912, "step": 26935 }, { "epoch": 4.394779771615008, "grad_norm": 1.471940040588379, "learning_rate": 4.7848180334604e-05, "loss": 0.1037, "num_input_tokens_seen": 58167856, "step": 26940 }, { "epoch": 4.395595432300163, "grad_norm": 0.8031310439109802, "learning_rate": 4.7846735586453235e-05, "loss": 0.1688, "num_input_tokens_seen": 58178928, "step": 26945 }, { "epoch": 4.396411092985318, "grad_norm": 0.08575073629617691, "learning_rate": 4.784529037528439e-05, "loss": 0.0647, "num_input_tokens_seen": 58187888, "step": 26950 }, { "epoch": 4.397226753670473, "grad_norm": 0.8157945275306702, "learning_rate": 4.784384470112676e-05, "loss": 0.1142, "num_input_tokens_seen": 58198160, "step": 26955 }, { "epoch": 4.398042414355628, "grad_norm": 0.14942340552806854, "learning_rate": 4.784239856400963e-05, "loss": 0.0538, "num_input_tokens_seen": 58209808, "step": 26960 }, { "epoch": 4.398858075040783, "grad_norm": 1.5534133911132812, "learning_rate": 4.784095196396232e-05, "loss": 0.1073, "num_input_tokens_seen": 58221104, "step": 26965 }, { "epoch": 4.399673735725938, "grad_norm": 1.4032514095306396, "learning_rate": 4.7839504901014146e-05, "loss": 0.16, "num_input_tokens_seen": 58230832, "step": 26970 }, { "epoch": 4.400489396411093, "grad_norm": 0.36136534810066223, "learning_rate": 4.783805737519443e-05, "loss": 0.1655, "num_input_tokens_seen": 58241328, "step": 26975 }, { "epoch": 4.401305057096248, "grad_norm": 0.09028610587120056, "learning_rate": 4.783660938653251e-05, "loss": 0.032, "num_input_tokens_seen": 58252720, "step": 26980 }, { "epoch": 4.402120717781403, "grad_norm": 1.7320040464401245, "learning_rate": 4.7835160935057725e-05, "loss": 0.1164, "num_input_tokens_seen": 58263216, "step": 26985 }, { "epoch": 4.402936378466558, "grad_norm": 0.13350515067577362, "learning_rate": 4.783371202079944e-05, "loss": 0.0245, "num_input_tokens_seen": 58274320, "step": 26990 }, { "epoch": 4.403752039151713, "grad_norm": 0.1415248066186905, "learning_rate": 4.7832262643787014e-05, "loss": 0.2271, "num_input_tokens_seen": 58285392, "step": 26995 }, { "epoch": 4.404567699836868, "grad_norm": 1.0443214178085327, "learning_rate": 4.7830812804049816e-05, "loss": 0.1903, "num_input_tokens_seen": 58295728, "step": 27000 }, { "epoch": 4.4053833605220225, "grad_norm": 0.17118291556835175, "learning_rate": 4.782936250161724e-05, "loss": 0.1418, "num_input_tokens_seen": 58306704, "step": 27005 }, { "epoch": 4.406199021207178, "grad_norm": 0.44026583433151245, "learning_rate": 4.782791173651867e-05, "loss": 0.1657, "num_input_tokens_seen": 58318288, "step": 27010 }, { "epoch": 4.407014681892333, "grad_norm": 0.6082284450531006, "learning_rate": 4.78264605087835e-05, "loss": 0.1263, "num_input_tokens_seen": 58330416, "step": 27015 }, { "epoch": 4.407830342577488, "grad_norm": 1.5495795011520386, "learning_rate": 4.7825008818441166e-05, "loss": 0.1348, "num_input_tokens_seen": 58341456, "step": 27020 }, { "epoch": 4.408646003262643, "grad_norm": 0.18916422128677368, "learning_rate": 4.7823556665521065e-05, "loss": 0.1401, "num_input_tokens_seen": 58351568, "step": 27025 }, { "epoch": 4.4094616639477975, "grad_norm": 1.2324349880218506, "learning_rate": 4.782210405005263e-05, "loss": 0.237, "num_input_tokens_seen": 58361968, "step": 27030 }, { "epoch": 4.410277324632952, "grad_norm": 0.5123143792152405, "learning_rate": 4.782065097206531e-05, "loss": 0.1256, "num_input_tokens_seen": 58373200, "step": 27035 }, { "epoch": 4.411092985318108, "grad_norm": 0.2821430563926697, "learning_rate": 4.7819197431588545e-05, "loss": 0.0645, "num_input_tokens_seen": 58383632, "step": 27040 }, { "epoch": 4.411908646003263, "grad_norm": 1.1920396089553833, "learning_rate": 4.78177434286518e-05, "loss": 0.1657, "num_input_tokens_seen": 58394768, "step": 27045 }, { "epoch": 4.412724306688418, "grad_norm": 1.2426166534423828, "learning_rate": 4.781628896328454e-05, "loss": 0.0737, "num_input_tokens_seen": 58403856, "step": 27050 }, { "epoch": 4.4135399673735725, "grad_norm": 1.637803077697754, "learning_rate": 4.781483403551623e-05, "loss": 0.2022, "num_input_tokens_seen": 58415152, "step": 27055 }, { "epoch": 4.414355628058727, "grad_norm": 1.3782222270965576, "learning_rate": 4.781337864537637e-05, "loss": 0.0792, "num_input_tokens_seen": 58425584, "step": 27060 }, { "epoch": 4.415171288743883, "grad_norm": 0.17152522504329681, "learning_rate": 4.781192279289445e-05, "loss": 0.1469, "num_input_tokens_seen": 58435088, "step": 27065 }, { "epoch": 4.415986949429038, "grad_norm": 0.24295952916145325, "learning_rate": 4.781046647809998e-05, "loss": 0.112, "num_input_tokens_seen": 58447088, "step": 27070 }, { "epoch": 4.416802610114193, "grad_norm": 1.8803216218948364, "learning_rate": 4.7809009701022465e-05, "loss": 0.1924, "num_input_tokens_seen": 58458928, "step": 27075 }, { "epoch": 4.417618270799347, "grad_norm": 0.28720822930336, "learning_rate": 4.7807552461691444e-05, "loss": 0.1427, "num_input_tokens_seen": 58470224, "step": 27080 }, { "epoch": 4.418433931484502, "grad_norm": 0.21097737550735474, "learning_rate": 4.7806094760136427e-05, "loss": 0.0273, "num_input_tokens_seen": 58480816, "step": 27085 }, { "epoch": 4.419249592169657, "grad_norm": 0.7412786483764648, "learning_rate": 4.780463659638698e-05, "loss": 0.2057, "num_input_tokens_seen": 58492528, "step": 27090 }, { "epoch": 4.420065252854813, "grad_norm": 0.16054661571979523, "learning_rate": 4.7803177970472624e-05, "loss": 0.0562, "num_input_tokens_seen": 58502640, "step": 27095 }, { "epoch": 4.420880913539968, "grad_norm": 0.7733790874481201, "learning_rate": 4.7801718882422955e-05, "loss": 0.0811, "num_input_tokens_seen": 58514064, "step": 27100 }, { "epoch": 4.421696574225122, "grad_norm": 0.36783739924430847, "learning_rate": 4.7800259332267517e-05, "loss": 0.0289, "num_input_tokens_seen": 58526032, "step": 27105 }, { "epoch": 4.422512234910277, "grad_norm": 0.5868462324142456, "learning_rate": 4.7798799320035906e-05, "loss": 0.0409, "num_input_tokens_seen": 58537904, "step": 27110 }, { "epoch": 4.423327895595432, "grad_norm": 1.4805965423583984, "learning_rate": 4.77973388457577e-05, "loss": 0.235, "num_input_tokens_seen": 58548208, "step": 27115 }, { "epoch": 4.424143556280587, "grad_norm": 0.18107926845550537, "learning_rate": 4.77958779094625e-05, "loss": 0.1939, "num_input_tokens_seen": 58559568, "step": 27120 }, { "epoch": 4.424959216965743, "grad_norm": 0.29472115635871887, "learning_rate": 4.779441651117992e-05, "loss": 0.1387, "num_input_tokens_seen": 58570576, "step": 27125 }, { "epoch": 4.425774877650897, "grad_norm": 0.18683582544326782, "learning_rate": 4.779295465093957e-05, "loss": 0.1086, "num_input_tokens_seen": 58581680, "step": 27130 }, { "epoch": 4.426590538336052, "grad_norm": 0.10873006284236908, "learning_rate": 4.7791492328771084e-05, "loss": 0.0972, "num_input_tokens_seen": 58593040, "step": 27135 }, { "epoch": 4.427406199021207, "grad_norm": 0.03475244343280792, "learning_rate": 4.779002954470409e-05, "loss": 0.0372, "num_input_tokens_seen": 58603728, "step": 27140 }, { "epoch": 4.428221859706362, "grad_norm": 0.09906939417123795, "learning_rate": 4.778856629876823e-05, "loss": 0.048, "num_input_tokens_seen": 58614640, "step": 27145 }, { "epoch": 4.4290375203915175, "grad_norm": 0.11250431835651398, "learning_rate": 4.778710259099317e-05, "loss": 0.0582, "num_input_tokens_seen": 58623536, "step": 27150 }, { "epoch": 4.429853181076672, "grad_norm": 1.5196529626846313, "learning_rate": 4.778563842140858e-05, "loss": 0.1277, "num_input_tokens_seen": 58633648, "step": 27155 }, { "epoch": 4.430668841761827, "grad_norm": 0.07294535636901855, "learning_rate": 4.77841737900441e-05, "loss": 0.1459, "num_input_tokens_seen": 58644624, "step": 27160 }, { "epoch": 4.431484502446982, "grad_norm": 0.2931632101535797, "learning_rate": 4.778270869692945e-05, "loss": 0.063, "num_input_tokens_seen": 58655344, "step": 27165 }, { "epoch": 4.432300163132137, "grad_norm": 0.8141847252845764, "learning_rate": 4.7781243142094304e-05, "loss": 0.0985, "num_input_tokens_seen": 58666448, "step": 27170 }, { "epoch": 4.433115823817292, "grad_norm": 0.20269951224327087, "learning_rate": 4.777977712556836e-05, "loss": 0.1158, "num_input_tokens_seen": 58678672, "step": 27175 }, { "epoch": 4.433931484502447, "grad_norm": 0.3381054401397705, "learning_rate": 4.777831064738133e-05, "loss": 0.0532, "num_input_tokens_seen": 58690672, "step": 27180 }, { "epoch": 4.434747145187602, "grad_norm": 2.14404034614563, "learning_rate": 4.7776843707562946e-05, "loss": 0.1965, "num_input_tokens_seen": 58702320, "step": 27185 }, { "epoch": 4.435562805872757, "grad_norm": 0.3539816737174988, "learning_rate": 4.777537630614293e-05, "loss": 0.1372, "num_input_tokens_seen": 58714192, "step": 27190 }, { "epoch": 4.436378466557912, "grad_norm": 0.07276631146669388, "learning_rate": 4.7773908443151025e-05, "loss": 0.1538, "num_input_tokens_seen": 58725488, "step": 27195 }, { "epoch": 4.437194127243067, "grad_norm": 0.846887469291687, "learning_rate": 4.777244011861697e-05, "loss": 0.1381, "num_input_tokens_seen": 58736208, "step": 27200 }, { "epoch": 4.438009787928221, "grad_norm": 0.35970792174339294, "learning_rate": 4.777097133257053e-05, "loss": 0.0482, "num_input_tokens_seen": 58746608, "step": 27205 }, { "epoch": 4.438825448613377, "grad_norm": 0.32409825921058655, "learning_rate": 4.776950208504146e-05, "loss": 0.2167, "num_input_tokens_seen": 58756880, "step": 27210 }, { "epoch": 4.439641109298532, "grad_norm": 0.6081352233886719, "learning_rate": 4.776803237605955e-05, "loss": 0.0248, "num_input_tokens_seen": 58768080, "step": 27215 }, { "epoch": 4.440456769983687, "grad_norm": 1.4595203399658203, "learning_rate": 4.776656220565458e-05, "loss": 0.2005, "num_input_tokens_seen": 58778896, "step": 27220 }, { "epoch": 4.441272430668842, "grad_norm": 3.083221912384033, "learning_rate": 4.776509157385635e-05, "loss": 0.3278, "num_input_tokens_seen": 58790032, "step": 27225 }, { "epoch": 4.442088091353996, "grad_norm": 0.37306615710258484, "learning_rate": 4.776362048069465e-05, "loss": 0.076, "num_input_tokens_seen": 58800880, "step": 27230 }, { "epoch": 4.442903752039152, "grad_norm": 0.3728920817375183, "learning_rate": 4.7762148926199316e-05, "loss": 0.0618, "num_input_tokens_seen": 58811760, "step": 27235 }, { "epoch": 4.443719412724307, "grad_norm": 0.42764222621917725, "learning_rate": 4.7760676910400145e-05, "loss": 0.1829, "num_input_tokens_seen": 58821808, "step": 27240 }, { "epoch": 4.444535073409462, "grad_norm": 0.5750018358230591, "learning_rate": 4.775920443332699e-05, "loss": 0.1706, "num_input_tokens_seen": 58832080, "step": 27245 }, { "epoch": 4.445350734094617, "grad_norm": 0.28977900743484497, "learning_rate": 4.775773149500969e-05, "loss": 0.1974, "num_input_tokens_seen": 58842480, "step": 27250 }, { "epoch": 4.446166394779771, "grad_norm": 2.047966718673706, "learning_rate": 4.775625809547808e-05, "loss": 0.3233, "num_input_tokens_seen": 58852752, "step": 27255 }, { "epoch": 4.446982055464926, "grad_norm": 0.1225549578666687, "learning_rate": 4.775478423476204e-05, "loss": 0.0935, "num_input_tokens_seen": 58863312, "step": 27260 }, { "epoch": 4.447797716150082, "grad_norm": 0.8841878175735474, "learning_rate": 4.775330991289143e-05, "loss": 0.1166, "num_input_tokens_seen": 58873104, "step": 27265 }, { "epoch": 4.448613376835237, "grad_norm": 0.9411643743515015, "learning_rate": 4.775183512989613e-05, "loss": 0.0592, "num_input_tokens_seen": 58884624, "step": 27270 }, { "epoch": 4.4494290375203915, "grad_norm": 0.5647889375686646, "learning_rate": 4.775035988580602e-05, "loss": 0.0632, "num_input_tokens_seen": 58894864, "step": 27275 }, { "epoch": 4.450244698205546, "grad_norm": 1.9947912693023682, "learning_rate": 4.774888418065101e-05, "loss": 0.211, "num_input_tokens_seen": 58906512, "step": 27280 }, { "epoch": 4.451060358890701, "grad_norm": 0.06608348339796066, "learning_rate": 4.774740801446101e-05, "loss": 0.1257, "num_input_tokens_seen": 58917904, "step": 27285 }, { "epoch": 4.451876019575856, "grad_norm": 0.9250381588935852, "learning_rate": 4.774593138726592e-05, "loss": 0.1408, "num_input_tokens_seen": 58928304, "step": 27290 }, { "epoch": 4.452691680261012, "grad_norm": 0.11096963286399841, "learning_rate": 4.7744454299095686e-05, "loss": 0.0465, "num_input_tokens_seen": 58939056, "step": 27295 }, { "epoch": 4.4535073409461665, "grad_norm": 0.3891056478023529, "learning_rate": 4.7742976749980225e-05, "loss": 0.1585, "num_input_tokens_seen": 58950384, "step": 27300 }, { "epoch": 4.454323001631321, "grad_norm": 0.044976238161325455, "learning_rate": 4.774149873994949e-05, "loss": 0.0779, "num_input_tokens_seen": 58959952, "step": 27305 }, { "epoch": 4.455138662316476, "grad_norm": 1.0760101079940796, "learning_rate": 4.7740020269033434e-05, "loss": 0.0789, "num_input_tokens_seen": 58970288, "step": 27310 }, { "epoch": 4.455954323001631, "grad_norm": 0.3007746636867523, "learning_rate": 4.7738541337262013e-05, "loss": 0.1351, "num_input_tokens_seen": 58981936, "step": 27315 }, { "epoch": 4.456769983686787, "grad_norm": 0.028072934597730637, "learning_rate": 4.773706194466521e-05, "loss": 0.0597, "num_input_tokens_seen": 58992432, "step": 27320 }, { "epoch": 4.4575856443719415, "grad_norm": 0.16962593793869019, "learning_rate": 4.7735582091273006e-05, "loss": 0.0182, "num_input_tokens_seen": 59002832, "step": 27325 }, { "epoch": 4.458401305057096, "grad_norm": 0.6297242045402527, "learning_rate": 4.773410177711539e-05, "loss": 0.054, "num_input_tokens_seen": 59013936, "step": 27330 }, { "epoch": 4.459216965742251, "grad_norm": 0.20799407362937927, "learning_rate": 4.773262100222235e-05, "loss": 0.071, "num_input_tokens_seen": 59023440, "step": 27335 }, { "epoch": 4.460032626427406, "grad_norm": 1.016848087310791, "learning_rate": 4.773113976662392e-05, "loss": 0.111, "num_input_tokens_seen": 59034224, "step": 27340 }, { "epoch": 4.460848287112561, "grad_norm": 0.5375759601593018, "learning_rate": 4.77296580703501e-05, "loss": 0.1265, "num_input_tokens_seen": 59044016, "step": 27345 }, { "epoch": 4.4616639477977165, "grad_norm": 0.05509293079376221, "learning_rate": 4.772817591343093e-05, "loss": 0.0686, "num_input_tokens_seen": 59053648, "step": 27350 }, { "epoch": 4.462479608482871, "grad_norm": 0.2277602106332779, "learning_rate": 4.772669329589644e-05, "loss": 0.1368, "num_input_tokens_seen": 59064080, "step": 27355 }, { "epoch": 4.463295269168026, "grad_norm": 1.0372071266174316, "learning_rate": 4.772521021777667e-05, "loss": 0.2553, "num_input_tokens_seen": 59074672, "step": 27360 }, { "epoch": 4.464110929853181, "grad_norm": 1.3771793842315674, "learning_rate": 4.7723726679101696e-05, "loss": 0.2177, "num_input_tokens_seen": 59084784, "step": 27365 }, { "epoch": 4.464926590538336, "grad_norm": 0.9086936116218567, "learning_rate": 4.772224267990157e-05, "loss": 0.1099, "num_input_tokens_seen": 59095504, "step": 27370 }, { "epoch": 4.465742251223491, "grad_norm": 0.0864446833729744, "learning_rate": 4.772075822020637e-05, "loss": 0.0373, "num_input_tokens_seen": 59105616, "step": 27375 }, { "epoch": 4.466557911908646, "grad_norm": 6.0511860847473145, "learning_rate": 4.7719273300046184e-05, "loss": 0.2058, "num_input_tokens_seen": 59114576, "step": 27380 }, { "epoch": 4.467373572593801, "grad_norm": 0.05670009180903435, "learning_rate": 4.771778791945111e-05, "loss": 0.1654, "num_input_tokens_seen": 59124976, "step": 27385 }, { "epoch": 4.468189233278956, "grad_norm": 0.15267369151115417, "learning_rate": 4.771630207845124e-05, "loss": 0.0722, "num_input_tokens_seen": 59136336, "step": 27390 }, { "epoch": 4.469004893964111, "grad_norm": 0.32670968770980835, "learning_rate": 4.771481577707668e-05, "loss": 0.1247, "num_input_tokens_seen": 59146960, "step": 27395 }, { "epoch": 4.4698205546492655, "grad_norm": 0.8717701435089111, "learning_rate": 4.771332901535757e-05, "loss": 0.0518, "num_input_tokens_seen": 59158096, "step": 27400 }, { "epoch": 4.470636215334421, "grad_norm": 0.06635864078998566, "learning_rate": 4.7711841793324036e-05, "loss": 0.0911, "num_input_tokens_seen": 59168912, "step": 27405 }, { "epoch": 4.471451876019576, "grad_norm": 0.08291113376617432, "learning_rate": 4.771035411100622e-05, "loss": 0.0452, "num_input_tokens_seen": 59179728, "step": 27410 }, { "epoch": 4.472267536704731, "grad_norm": 1.785387396812439, "learning_rate": 4.770886596843426e-05, "loss": 0.2428, "num_input_tokens_seen": 59191312, "step": 27415 }, { "epoch": 4.473083197389886, "grad_norm": 1.8664906024932861, "learning_rate": 4.770737736563832e-05, "loss": 0.0718, "num_input_tokens_seen": 59202352, "step": 27420 }, { "epoch": 4.4738988580750405, "grad_norm": 0.1406281590461731, "learning_rate": 4.770588830264858e-05, "loss": 0.0779, "num_input_tokens_seen": 59212560, "step": 27425 }, { "epoch": 4.474714518760196, "grad_norm": 0.5497645735740662, "learning_rate": 4.77043987794952e-05, "loss": 0.1456, "num_input_tokens_seen": 59221968, "step": 27430 }, { "epoch": 4.475530179445351, "grad_norm": 0.6181632876396179, "learning_rate": 4.7702908796208384e-05, "loss": 0.1779, "num_input_tokens_seen": 59232848, "step": 27435 }, { "epoch": 4.476345840130506, "grad_norm": 0.09318330883979797, "learning_rate": 4.770141835281832e-05, "loss": 0.1443, "num_input_tokens_seen": 59243984, "step": 27440 }, { "epoch": 4.477161500815661, "grad_norm": 0.27848318219184875, "learning_rate": 4.769992744935521e-05, "loss": 0.1218, "num_input_tokens_seen": 59253840, "step": 27445 }, { "epoch": 4.4779771615008155, "grad_norm": 0.09469717741012573, "learning_rate": 4.769843608584927e-05, "loss": 0.0908, "num_input_tokens_seen": 59264400, "step": 27450 }, { "epoch": 4.47879282218597, "grad_norm": 1.4506529569625854, "learning_rate": 4.769694426233073e-05, "loss": 0.2449, "num_input_tokens_seen": 59275728, "step": 27455 }, { "epoch": 4.479608482871126, "grad_norm": 0.09809502959251404, "learning_rate": 4.769545197882983e-05, "loss": 0.1382, "num_input_tokens_seen": 59286896, "step": 27460 }, { "epoch": 4.480424143556281, "grad_norm": 1.5166939496994019, "learning_rate": 4.76939592353768e-05, "loss": 0.1324, "num_input_tokens_seen": 59297808, "step": 27465 }, { "epoch": 4.481239804241436, "grad_norm": 0.3474573791027069, "learning_rate": 4.769246603200189e-05, "loss": 0.1022, "num_input_tokens_seen": 59308752, "step": 27470 }, { "epoch": 4.4820554649265905, "grad_norm": 1.1345326900482178, "learning_rate": 4.769097236873538e-05, "loss": 0.1109, "num_input_tokens_seen": 59318320, "step": 27475 }, { "epoch": 4.482871125611745, "grad_norm": 0.9466695785522461, "learning_rate": 4.768947824560752e-05, "loss": 0.103, "num_input_tokens_seen": 59328464, "step": 27480 }, { "epoch": 4.4836867862969, "grad_norm": 0.8013778924942017, "learning_rate": 4.768798366264861e-05, "loss": 0.1449, "num_input_tokens_seen": 59338000, "step": 27485 }, { "epoch": 4.484502446982056, "grad_norm": 0.540586531162262, "learning_rate": 4.768648861988892e-05, "loss": 0.1387, "num_input_tokens_seen": 59347792, "step": 27490 }, { "epoch": 4.485318107667211, "grad_norm": 0.2574921250343323, "learning_rate": 4.768499311735876e-05, "loss": 0.042, "num_input_tokens_seen": 59357648, "step": 27495 }, { "epoch": 4.486133768352365, "grad_norm": 0.06413564085960388, "learning_rate": 4.768349715508844e-05, "loss": 0.0465, "num_input_tokens_seen": 59367760, "step": 27500 }, { "epoch": 4.48694942903752, "grad_norm": 0.387024849653244, "learning_rate": 4.7682000733108265e-05, "loss": 0.0932, "num_input_tokens_seen": 59378416, "step": 27505 }, { "epoch": 4.487765089722675, "grad_norm": 0.514755368232727, "learning_rate": 4.7680503851448584e-05, "loss": 0.085, "num_input_tokens_seen": 59388944, "step": 27510 }, { "epoch": 4.488580750407831, "grad_norm": 0.12754039466381073, "learning_rate": 4.767900651013971e-05, "loss": 0.1178, "num_input_tokens_seen": 59397776, "step": 27515 }, { "epoch": 4.489396411092986, "grad_norm": 0.311491996049881, "learning_rate": 4.767750870921201e-05, "loss": 0.0121, "num_input_tokens_seen": 59408464, "step": 27520 }, { "epoch": 4.49021207177814, "grad_norm": 0.11097798496484756, "learning_rate": 4.767601044869581e-05, "loss": 0.2347, "num_input_tokens_seen": 59419888, "step": 27525 }, { "epoch": 4.491027732463295, "grad_norm": 0.02774953283369541, "learning_rate": 4.7674511728621505e-05, "loss": 0.1282, "num_input_tokens_seen": 59429712, "step": 27530 }, { "epoch": 4.49184339314845, "grad_norm": 0.3390287756919861, "learning_rate": 4.767301254901945e-05, "loss": 0.0995, "num_input_tokens_seen": 59440688, "step": 27535 }, { "epoch": 4.492659053833605, "grad_norm": 0.058956801891326904, "learning_rate": 4.767151290992003e-05, "loss": 0.0814, "num_input_tokens_seen": 59451120, "step": 27540 }, { "epoch": 4.493474714518761, "grad_norm": 0.04521201178431511, "learning_rate": 4.7670012811353645e-05, "loss": 0.0907, "num_input_tokens_seen": 59461776, "step": 27545 }, { "epoch": 4.494290375203915, "grad_norm": 0.14937207102775574, "learning_rate": 4.76685122533507e-05, "loss": 0.2136, "num_input_tokens_seen": 59472464, "step": 27550 }, { "epoch": 4.49510603588907, "grad_norm": 0.564771831035614, "learning_rate": 4.766701123594158e-05, "loss": 0.2577, "num_input_tokens_seen": 59482704, "step": 27555 }, { "epoch": 4.495921696574225, "grad_norm": 0.8387866616249084, "learning_rate": 4.766550975915673e-05, "loss": 0.1739, "num_input_tokens_seen": 59492976, "step": 27560 }, { "epoch": 4.49673735725938, "grad_norm": 0.9412037134170532, "learning_rate": 4.766400782302658e-05, "loss": 0.1287, "num_input_tokens_seen": 59503344, "step": 27565 }, { "epoch": 4.497553017944535, "grad_norm": 0.5063627362251282, "learning_rate": 4.766250542758155e-05, "loss": 0.0325, "num_input_tokens_seen": 59514896, "step": 27570 }, { "epoch": 4.49836867862969, "grad_norm": 0.8215790390968323, "learning_rate": 4.76610025728521e-05, "loss": 0.2656, "num_input_tokens_seen": 59527408, "step": 27575 }, { "epoch": 4.499184339314845, "grad_norm": 1.1922849416732788, "learning_rate": 4.765949925886869e-05, "loss": 0.1919, "num_input_tokens_seen": 59537520, "step": 27580 }, { "epoch": 4.5, "grad_norm": 0.7057920098304749, "learning_rate": 4.765799548566177e-05, "loss": 0.3201, "num_input_tokens_seen": 59549072, "step": 27585 }, { "epoch": 4.500815660685155, "grad_norm": 0.17023736238479614, "learning_rate": 4.765649125326184e-05, "loss": 0.0704, "num_input_tokens_seen": 59559824, "step": 27590 }, { "epoch": 4.50163132137031, "grad_norm": 0.4980206787586212, "learning_rate": 4.765498656169937e-05, "loss": 0.1257, "num_input_tokens_seen": 59569776, "step": 27595 }, { "epoch": 4.502446982055465, "grad_norm": 0.7252302169799805, "learning_rate": 4.765348141100485e-05, "loss": 0.1147, "num_input_tokens_seen": 59579184, "step": 27600 }, { "epoch": 4.50326264274062, "grad_norm": 0.19444702565670013, "learning_rate": 4.76519758012088e-05, "loss": 0.1022, "num_input_tokens_seen": 59590416, "step": 27605 }, { "epoch": 4.504078303425775, "grad_norm": 0.138977512717247, "learning_rate": 4.765046973234172e-05, "loss": 0.1091, "num_input_tokens_seen": 59601136, "step": 27610 }, { "epoch": 4.50489396411093, "grad_norm": 0.9477666020393372, "learning_rate": 4.7648963204434135e-05, "loss": 0.1668, "num_input_tokens_seen": 59612816, "step": 27615 }, { "epoch": 4.505709624796085, "grad_norm": 0.2016691118478775, "learning_rate": 4.7647456217516584e-05, "loss": 0.0909, "num_input_tokens_seen": 59624080, "step": 27620 }, { "epoch": 4.506525285481239, "grad_norm": 0.4715433418750763, "learning_rate": 4.76459487716196e-05, "loss": 0.0498, "num_input_tokens_seen": 59634608, "step": 27625 }, { "epoch": 4.507340946166395, "grad_norm": 1.383105754852295, "learning_rate": 4.764444086677373e-05, "loss": 0.0885, "num_input_tokens_seen": 59645200, "step": 27630 }, { "epoch": 4.50815660685155, "grad_norm": 0.2470850646495819, "learning_rate": 4.764293250300955e-05, "loss": 0.1454, "num_input_tokens_seen": 59654736, "step": 27635 }, { "epoch": 4.508972267536705, "grad_norm": 0.06396646797657013, "learning_rate": 4.764142368035761e-05, "loss": 0.1076, "num_input_tokens_seen": 59664624, "step": 27640 }, { "epoch": 4.50978792822186, "grad_norm": 0.06418019533157349, "learning_rate": 4.76399143988485e-05, "loss": 0.2042, "num_input_tokens_seen": 59674480, "step": 27645 }, { "epoch": 4.510603588907014, "grad_norm": 0.7032028436660767, "learning_rate": 4.76384046585128e-05, "loss": 0.0558, "num_input_tokens_seen": 59685392, "step": 27650 }, { "epoch": 4.511419249592169, "grad_norm": 0.6718917489051819, "learning_rate": 4.763689445938112e-05, "loss": 0.06, "num_input_tokens_seen": 59695280, "step": 27655 }, { "epoch": 4.512234910277325, "grad_norm": 0.4106779992580414, "learning_rate": 4.7635383801484043e-05, "loss": 0.1314, "num_input_tokens_seen": 59705872, "step": 27660 }, { "epoch": 4.51305057096248, "grad_norm": 0.08838459104299545, "learning_rate": 4.763387268485221e-05, "loss": 0.0526, "num_input_tokens_seen": 59717936, "step": 27665 }, { "epoch": 4.513866231647635, "grad_norm": 2.386263608932495, "learning_rate": 4.763236110951623e-05, "loss": 0.0985, "num_input_tokens_seen": 59728144, "step": 27670 }, { "epoch": 4.514681892332789, "grad_norm": 0.7425671815872192, "learning_rate": 4.7630849075506745e-05, "loss": 0.118, "num_input_tokens_seen": 59739888, "step": 27675 }, { "epoch": 4.515497553017944, "grad_norm": 0.018214331939816475, "learning_rate": 4.762933658285439e-05, "loss": 0.0339, "num_input_tokens_seen": 59750320, "step": 27680 }, { "epoch": 4.5163132137031, "grad_norm": 0.3005729913711548, "learning_rate": 4.762782363158982e-05, "loss": 0.1144, "num_input_tokens_seen": 59761520, "step": 27685 }, { "epoch": 4.517128874388255, "grad_norm": 0.7116439342498779, "learning_rate": 4.76263102217437e-05, "loss": 0.1094, "num_input_tokens_seen": 59771632, "step": 27690 }, { "epoch": 4.5179445350734095, "grad_norm": 0.8959900140762329, "learning_rate": 4.76247963533467e-05, "loss": 0.1142, "num_input_tokens_seen": 59781744, "step": 27695 }, { "epoch": 4.518760195758564, "grad_norm": 0.9553235173225403, "learning_rate": 4.76232820264295e-05, "loss": 0.103, "num_input_tokens_seen": 59791760, "step": 27700 }, { "epoch": 4.519575856443719, "grad_norm": 0.29405948519706726, "learning_rate": 4.762176724102279e-05, "loss": 0.1161, "num_input_tokens_seen": 59801872, "step": 27705 }, { "epoch": 4.520391517128875, "grad_norm": 0.39772748947143555, "learning_rate": 4.7620251997157274e-05, "loss": 0.159, "num_input_tokens_seen": 59811152, "step": 27710 }, { "epoch": 4.52120717781403, "grad_norm": 0.22550728917121887, "learning_rate": 4.7618736294863645e-05, "loss": 0.0538, "num_input_tokens_seen": 59821264, "step": 27715 }, { "epoch": 4.5220228384991845, "grad_norm": 1.040710210800171, "learning_rate": 4.761722013417264e-05, "loss": 0.1417, "num_input_tokens_seen": 59832336, "step": 27720 }, { "epoch": 4.522838499184339, "grad_norm": 0.7190937399864197, "learning_rate": 4.7615703515114976e-05, "loss": 0.0575, "num_input_tokens_seen": 59842288, "step": 27725 }, { "epoch": 4.523654159869494, "grad_norm": 1.8138775825500488, "learning_rate": 4.761418643772139e-05, "loss": 0.1367, "num_input_tokens_seen": 59852912, "step": 27730 }, { "epoch": 4.524469820554649, "grad_norm": 0.17630623281002045, "learning_rate": 4.761266890202263e-05, "loss": 0.0729, "num_input_tokens_seen": 59863280, "step": 27735 }, { "epoch": 4.525285481239804, "grad_norm": 0.2412528246641159, "learning_rate": 4.761115090804944e-05, "loss": 0.0709, "num_input_tokens_seen": 59874288, "step": 27740 }, { "epoch": 4.5261011419249595, "grad_norm": 0.8583831191062927, "learning_rate": 4.7609632455832596e-05, "loss": 0.0904, "num_input_tokens_seen": 59886000, "step": 27745 }, { "epoch": 4.526916802610114, "grad_norm": 0.7889488935470581, "learning_rate": 4.760811354540287e-05, "loss": 0.0579, "num_input_tokens_seen": 59896432, "step": 27750 }, { "epoch": 4.527732463295269, "grad_norm": 0.630608320236206, "learning_rate": 4.760659417679104e-05, "loss": 0.2006, "num_input_tokens_seen": 59906704, "step": 27755 }, { "epoch": 4.528548123980424, "grad_norm": 0.9978504180908203, "learning_rate": 4.76050743500279e-05, "loss": 0.0745, "num_input_tokens_seen": 59917392, "step": 27760 }, { "epoch": 4.529363784665579, "grad_norm": 0.23397177457809448, "learning_rate": 4.760355406514426e-05, "loss": 0.1915, "num_input_tokens_seen": 59927984, "step": 27765 }, { "epoch": 4.5301794453507345, "grad_norm": 0.30118611454963684, "learning_rate": 4.760203332217091e-05, "loss": 0.136, "num_input_tokens_seen": 59938160, "step": 27770 }, { "epoch": 4.530995106035889, "grad_norm": 0.45711421966552734, "learning_rate": 4.7600512121138694e-05, "loss": 0.0517, "num_input_tokens_seen": 59950000, "step": 27775 }, { "epoch": 4.531810766721044, "grad_norm": 0.4893368184566498, "learning_rate": 4.7598990462078416e-05, "loss": 0.0489, "num_input_tokens_seen": 59960720, "step": 27780 }, { "epoch": 4.532626427406199, "grad_norm": 0.04574708268046379, "learning_rate": 4.7597468345020947e-05, "loss": 0.0118, "num_input_tokens_seen": 59970864, "step": 27785 }, { "epoch": 4.533442088091354, "grad_norm": 1.7494392395019531, "learning_rate": 4.7595945769997096e-05, "loss": 0.1542, "num_input_tokens_seen": 59982256, "step": 27790 }, { "epoch": 4.5342577487765094, "grad_norm": 0.053753770887851715, "learning_rate": 4.759442273703775e-05, "loss": 0.2169, "num_input_tokens_seen": 59993584, "step": 27795 }, { "epoch": 4.535073409461664, "grad_norm": 0.17806610465049744, "learning_rate": 4.759289924617376e-05, "loss": 0.0528, "num_input_tokens_seen": 60004720, "step": 27800 }, { "epoch": 4.535889070146819, "grad_norm": 0.5505130887031555, "learning_rate": 4.7591375297436004e-05, "loss": 0.1992, "num_input_tokens_seen": 60015120, "step": 27805 }, { "epoch": 4.536704730831974, "grad_norm": 0.08610443770885468, "learning_rate": 4.758985089085537e-05, "loss": 0.0335, "num_input_tokens_seen": 60024752, "step": 27810 }, { "epoch": 4.537520391517129, "grad_norm": 1.1522552967071533, "learning_rate": 4.758832602646275e-05, "loss": 0.1193, "num_input_tokens_seen": 60037072, "step": 27815 }, { "epoch": 4.5383360522022835, "grad_norm": 0.11262308061122894, "learning_rate": 4.758680070428905e-05, "loss": 0.1113, "num_input_tokens_seen": 60048144, "step": 27820 }, { "epoch": 4.539151712887438, "grad_norm": 0.23833836615085602, "learning_rate": 4.7585274924365174e-05, "loss": 0.1606, "num_input_tokens_seen": 60058544, "step": 27825 }, { "epoch": 4.539967373572594, "grad_norm": 0.7899617552757263, "learning_rate": 4.758374868672206e-05, "loss": 0.0521, "num_input_tokens_seen": 60068688, "step": 27830 }, { "epoch": 4.540783034257749, "grad_norm": 0.32803040742874146, "learning_rate": 4.758222199139062e-05, "loss": 0.0588, "num_input_tokens_seen": 60079664, "step": 27835 }, { "epoch": 4.541598694942904, "grad_norm": 0.8555923700332642, "learning_rate": 4.758069483840181e-05, "loss": 0.1828, "num_input_tokens_seen": 60089296, "step": 27840 }, { "epoch": 4.5424143556280585, "grad_norm": 0.03936029225587845, "learning_rate": 4.757916722778657e-05, "loss": 0.1287, "num_input_tokens_seen": 60099568, "step": 27845 }, { "epoch": 4.543230016313213, "grad_norm": 0.531146764755249, "learning_rate": 4.757763915957587e-05, "loss": 0.0988, "num_input_tokens_seen": 60110160, "step": 27850 }, { "epoch": 4.544045676998369, "grad_norm": 0.8889708518981934, "learning_rate": 4.7576110633800664e-05, "loss": 0.0466, "num_input_tokens_seen": 60120880, "step": 27855 }, { "epoch": 4.544861337683524, "grad_norm": 0.06399574130773544, "learning_rate": 4.757458165049193e-05, "loss": 0.0557, "num_input_tokens_seen": 60132336, "step": 27860 }, { "epoch": 4.545676998368679, "grad_norm": 0.26854485273361206, "learning_rate": 4.757305220968067e-05, "loss": 0.0333, "num_input_tokens_seen": 60142608, "step": 27865 }, { "epoch": 4.5464926590538335, "grad_norm": 1.1440833806991577, "learning_rate": 4.757152231139787e-05, "loss": 0.0677, "num_input_tokens_seen": 60153744, "step": 27870 }, { "epoch": 4.547308319738988, "grad_norm": 0.06752360612154007, "learning_rate": 4.7569991955674534e-05, "loss": 0.0646, "num_input_tokens_seen": 60166288, "step": 27875 }, { "epoch": 4.548123980424144, "grad_norm": 1.995922565460205, "learning_rate": 4.756846114254168e-05, "loss": 0.217, "num_input_tokens_seen": 60176208, "step": 27880 }, { "epoch": 4.548939641109299, "grad_norm": 1.0567954778671265, "learning_rate": 4.756692987203033e-05, "loss": 0.0803, "num_input_tokens_seen": 60187088, "step": 27885 }, { "epoch": 4.549755301794454, "grad_norm": 0.3324156701564789, "learning_rate": 4.7565398144171525e-05, "loss": 0.2648, "num_input_tokens_seen": 60197488, "step": 27890 }, { "epoch": 4.5505709624796085, "grad_norm": 0.8638294339179993, "learning_rate": 4.756386595899629e-05, "loss": 0.0638, "num_input_tokens_seen": 60208816, "step": 27895 }, { "epoch": 4.551386623164763, "grad_norm": 0.6948354840278625, "learning_rate": 4.756233331653569e-05, "loss": 0.0977, "num_input_tokens_seen": 60219728, "step": 27900 }, { "epoch": 4.552202283849918, "grad_norm": 0.6608632802963257, "learning_rate": 4.756080021682079e-05, "loss": 0.0949, "num_input_tokens_seen": 60230288, "step": 27905 }, { "epoch": 4.553017944535073, "grad_norm": 0.05612850934267044, "learning_rate": 4.755926665988264e-05, "loss": 0.0299, "num_input_tokens_seen": 60242384, "step": 27910 }, { "epoch": 4.553833605220229, "grad_norm": 0.026820389553904533, "learning_rate": 4.755773264575234e-05, "loss": 0.1407, "num_input_tokens_seen": 60253360, "step": 27915 }, { "epoch": 4.554649265905383, "grad_norm": 0.136022686958313, "learning_rate": 4.7556198174460976e-05, "loss": 0.2432, "num_input_tokens_seen": 60263440, "step": 27920 }, { "epoch": 4.555464926590538, "grad_norm": 1.3185187578201294, "learning_rate": 4.755466324603963e-05, "loss": 0.0588, "num_input_tokens_seen": 60273392, "step": 27925 }, { "epoch": 4.556280587275693, "grad_norm": 0.8123053312301636, "learning_rate": 4.755312786051943e-05, "loss": 0.2115, "num_input_tokens_seen": 60285424, "step": 27930 }, { "epoch": 4.557096247960848, "grad_norm": 1.2683604955673218, "learning_rate": 4.755159201793148e-05, "loss": 0.1013, "num_input_tokens_seen": 60295248, "step": 27935 }, { "epoch": 4.557911908646004, "grad_norm": 0.07606402784585953, "learning_rate": 4.755005571830691e-05, "loss": 0.2259, "num_input_tokens_seen": 60306320, "step": 27940 }, { "epoch": 4.558727569331158, "grad_norm": 0.1155787855386734, "learning_rate": 4.754851896167686e-05, "loss": 0.0777, "num_input_tokens_seen": 60315600, "step": 27945 }, { "epoch": 4.559543230016313, "grad_norm": 0.0498870313167572, "learning_rate": 4.754698174807246e-05, "loss": 0.2022, "num_input_tokens_seen": 60326608, "step": 27950 }, { "epoch": 4.560358890701468, "grad_norm": 0.14799945056438446, "learning_rate": 4.7545444077524874e-05, "loss": 0.2558, "num_input_tokens_seen": 60336944, "step": 27955 }, { "epoch": 4.561174551386623, "grad_norm": 0.352129191160202, "learning_rate": 4.754390595006527e-05, "loss": 0.1742, "num_input_tokens_seen": 60348656, "step": 27960 }, { "epoch": 4.561990212071779, "grad_norm": 1.0503349304199219, "learning_rate": 4.75423673657248e-05, "loss": 0.085, "num_input_tokens_seen": 60359568, "step": 27965 }, { "epoch": 4.562805872756933, "grad_norm": 1.183427333831787, "learning_rate": 4.7540828324534664e-05, "loss": 0.1373, "num_input_tokens_seen": 60370192, "step": 27970 }, { "epoch": 4.563621533442088, "grad_norm": 0.4884597063064575, "learning_rate": 4.753928882652605e-05, "loss": 0.0946, "num_input_tokens_seen": 60381136, "step": 27975 }, { "epoch": 4.564437194127243, "grad_norm": 0.23700866103172302, "learning_rate": 4.7537748871730146e-05, "loss": 0.0932, "num_input_tokens_seen": 60392304, "step": 27980 }, { "epoch": 4.565252854812398, "grad_norm": 0.07145748287439346, "learning_rate": 4.7536208460178176e-05, "loss": 0.2222, "num_input_tokens_seen": 60402544, "step": 27985 }, { "epoch": 4.566068515497553, "grad_norm": 3.4054815769195557, "learning_rate": 4.753466759190136e-05, "loss": 0.0853, "num_input_tokens_seen": 60413488, "step": 27990 }, { "epoch": 4.566884176182708, "grad_norm": 0.5142728686332703, "learning_rate": 4.7533126266930905e-05, "loss": 0.1232, "num_input_tokens_seen": 60423696, "step": 27995 }, { "epoch": 4.567699836867863, "grad_norm": 0.5965099334716797, "learning_rate": 4.753158448529807e-05, "loss": 0.0668, "num_input_tokens_seen": 60434576, "step": 28000 }, { "epoch": 4.568515497553018, "grad_norm": 1.7780296802520752, "learning_rate": 4.753004224703409e-05, "loss": 0.2896, "num_input_tokens_seen": 60443856, "step": 28005 }, { "epoch": 4.569331158238173, "grad_norm": 1.521911382675171, "learning_rate": 4.752849955217022e-05, "loss": 0.1287, "num_input_tokens_seen": 60454864, "step": 28010 }, { "epoch": 4.570146818923328, "grad_norm": 0.9742722511291504, "learning_rate": 4.7526956400737725e-05, "loss": 0.1688, "num_input_tokens_seen": 60465456, "step": 28015 }, { "epoch": 4.5709624796084825, "grad_norm": 0.37661072611808777, "learning_rate": 4.752541279276789e-05, "loss": 0.112, "num_input_tokens_seen": 60476816, "step": 28020 }, { "epoch": 4.571778140293638, "grad_norm": 0.6799007654190063, "learning_rate": 4.752386872829198e-05, "loss": 0.1614, "num_input_tokens_seen": 60488784, "step": 28025 }, { "epoch": 4.572593800978793, "grad_norm": 0.5833417773246765, "learning_rate": 4.7522324207341295e-05, "loss": 0.0566, "num_input_tokens_seen": 60500656, "step": 28030 }, { "epoch": 4.573409461663948, "grad_norm": 0.35155364871025085, "learning_rate": 4.752077922994715e-05, "loss": 0.052, "num_input_tokens_seen": 60511664, "step": 28035 }, { "epoch": 4.574225122349103, "grad_norm": 2.114008903503418, "learning_rate": 4.751923379614084e-05, "loss": 0.2285, "num_input_tokens_seen": 60522000, "step": 28040 }, { "epoch": 4.575040783034257, "grad_norm": 0.42847663164138794, "learning_rate": 4.751768790595368e-05, "loss": 0.1074, "num_input_tokens_seen": 60532688, "step": 28045 }, { "epoch": 4.575856443719413, "grad_norm": 0.21487653255462646, "learning_rate": 4.751614155941702e-05, "loss": 0.1794, "num_input_tokens_seen": 60543152, "step": 28050 }, { "epoch": 4.576672104404568, "grad_norm": 0.3504626750946045, "learning_rate": 4.751459475656218e-05, "loss": 0.1311, "num_input_tokens_seen": 60552272, "step": 28055 }, { "epoch": 4.577487765089723, "grad_norm": 0.12311635911464691, "learning_rate": 4.751304749742052e-05, "loss": 0.1428, "num_input_tokens_seen": 60563888, "step": 28060 }, { "epoch": 4.578303425774878, "grad_norm": 0.1538054496049881, "learning_rate": 4.75114997820234e-05, "loss": 0.3369, "num_input_tokens_seen": 60574576, "step": 28065 }, { "epoch": 4.579119086460032, "grad_norm": 1.0044608116149902, "learning_rate": 4.750995161040217e-05, "loss": 0.0753, "num_input_tokens_seen": 60584848, "step": 28070 }, { "epoch": 4.579934747145187, "grad_norm": 1.1021254062652588, "learning_rate": 4.750840298258821e-05, "loss": 0.1311, "num_input_tokens_seen": 60595312, "step": 28075 }, { "epoch": 4.580750407830343, "grad_norm": 0.9352368116378784, "learning_rate": 4.750685389861291e-05, "loss": 0.1418, "num_input_tokens_seen": 60604848, "step": 28080 }, { "epoch": 4.581566068515498, "grad_norm": 0.13500581681728363, "learning_rate": 4.7505304358507676e-05, "loss": 0.1297, "num_input_tokens_seen": 60614640, "step": 28085 }, { "epoch": 4.582381729200653, "grad_norm": 1.8922009468078613, "learning_rate": 4.750375436230389e-05, "loss": 0.0982, "num_input_tokens_seen": 60624912, "step": 28090 }, { "epoch": 4.583197389885807, "grad_norm": 0.7176320552825928, "learning_rate": 4.750220391003297e-05, "loss": 0.0337, "num_input_tokens_seen": 60637968, "step": 28095 }, { "epoch": 4.584013050570962, "grad_norm": 0.03133801370859146, "learning_rate": 4.750065300172635e-05, "loss": 0.1324, "num_input_tokens_seen": 60648816, "step": 28100 }, { "epoch": 4.584828711256117, "grad_norm": 0.5618240833282471, "learning_rate": 4.749910163741546e-05, "loss": 0.1259, "num_input_tokens_seen": 60659888, "step": 28105 }, { "epoch": 4.585644371941273, "grad_norm": 1.236391305923462, "learning_rate": 4.749754981713171e-05, "loss": 0.1556, "num_input_tokens_seen": 60671472, "step": 28110 }, { "epoch": 4.5864600326264275, "grad_norm": 1.3541643619537354, "learning_rate": 4.749599754090659e-05, "loss": 0.0897, "num_input_tokens_seen": 60683696, "step": 28115 }, { "epoch": 4.587275693311582, "grad_norm": 0.4251953065395355, "learning_rate": 4.7494444808771545e-05, "loss": 0.0955, "num_input_tokens_seen": 60694832, "step": 28120 }, { "epoch": 4.588091353996737, "grad_norm": 0.3981759548187256, "learning_rate": 4.749289162075803e-05, "loss": 0.1137, "num_input_tokens_seen": 60706192, "step": 28125 }, { "epoch": 4.588907014681892, "grad_norm": 0.12120061367750168, "learning_rate": 4.749133797689753e-05, "loss": 0.0348, "num_input_tokens_seen": 60716880, "step": 28130 }, { "epoch": 4.589722675367048, "grad_norm": 1.6036018133163452, "learning_rate": 4.748978387722154e-05, "loss": 0.3409, "num_input_tokens_seen": 60727792, "step": 28135 }, { "epoch": 4.5905383360522025, "grad_norm": 0.6257677674293518, "learning_rate": 4.748822932176155e-05, "loss": 0.151, "num_input_tokens_seen": 60738800, "step": 28140 }, { "epoch": 4.591353996737357, "grad_norm": 1.0507439374923706, "learning_rate": 4.748667431054906e-05, "loss": 0.1715, "num_input_tokens_seen": 60748336, "step": 28145 }, { "epoch": 4.592169657422512, "grad_norm": 0.12246230989694595, "learning_rate": 4.7485118843615596e-05, "loss": 0.1655, "num_input_tokens_seen": 60759728, "step": 28150 }, { "epoch": 4.592985318107667, "grad_norm": 1.3494540452957153, "learning_rate": 4.748356292099266e-05, "loss": 0.1353, "num_input_tokens_seen": 60771184, "step": 28155 }, { "epoch": 4.593800978792823, "grad_norm": 0.1207166314125061, "learning_rate": 4.748200654271182e-05, "loss": 0.0407, "num_input_tokens_seen": 60782864, "step": 28160 }, { "epoch": 4.5946166394779775, "grad_norm": 2.3037493228912354, "learning_rate": 4.748044970880458e-05, "loss": 0.2234, "num_input_tokens_seen": 60794800, "step": 28165 }, { "epoch": 4.595432300163132, "grad_norm": 0.04427940770983696, "learning_rate": 4.7478892419302514e-05, "loss": 0.0342, "num_input_tokens_seen": 60806224, "step": 28170 }, { "epoch": 4.596247960848287, "grad_norm": 0.020773814991116524, "learning_rate": 4.7477334674237176e-05, "loss": 0.1464, "num_input_tokens_seen": 60817232, "step": 28175 }, { "epoch": 4.597063621533442, "grad_norm": 0.3827217221260071, "learning_rate": 4.7475776473640134e-05, "loss": 0.1461, "num_input_tokens_seen": 60828400, "step": 28180 }, { "epoch": 4.597879282218597, "grad_norm": 0.34084993600845337, "learning_rate": 4.7474217817542974e-05, "loss": 0.3422, "num_input_tokens_seen": 60840080, "step": 28185 }, { "epoch": 4.598694942903752, "grad_norm": 0.7264421582221985, "learning_rate": 4.747265870597727e-05, "loss": 0.073, "num_input_tokens_seen": 60851088, "step": 28190 }, { "epoch": 4.599510603588907, "grad_norm": 0.3453024923801422, "learning_rate": 4.747109913897464e-05, "loss": 0.0988, "num_input_tokens_seen": 60862096, "step": 28195 }, { "epoch": 4.600326264274062, "grad_norm": 0.07636333256959915, "learning_rate": 4.746953911656667e-05, "loss": 0.1978, "num_input_tokens_seen": 60872848, "step": 28200 }, { "epoch": 4.601141924959217, "grad_norm": 0.303439736366272, "learning_rate": 4.746797863878499e-05, "loss": 0.0143, "num_input_tokens_seen": 60883760, "step": 28205 }, { "epoch": 4.601957585644372, "grad_norm": 1.705543041229248, "learning_rate": 4.746641770566123e-05, "loss": 0.154, "num_input_tokens_seen": 60893456, "step": 28210 }, { "epoch": 4.602773246329527, "grad_norm": 1.1563094854354858, "learning_rate": 4.7464856317227005e-05, "loss": 0.0949, "num_input_tokens_seen": 60905136, "step": 28215 }, { "epoch": 4.603588907014682, "grad_norm": 0.7861053943634033, "learning_rate": 4.746329447351396e-05, "loss": 0.1968, "num_input_tokens_seen": 60915088, "step": 28220 }, { "epoch": 4.604404567699837, "grad_norm": 0.4123803377151489, "learning_rate": 4.7461732174553766e-05, "loss": 0.0904, "num_input_tokens_seen": 60926704, "step": 28225 }, { "epoch": 4.605220228384992, "grad_norm": 2.079204797744751, "learning_rate": 4.746016942037807e-05, "loss": 0.0799, "num_input_tokens_seen": 60937808, "step": 28230 }, { "epoch": 4.606035889070147, "grad_norm": 0.759182870388031, "learning_rate": 4.745860621101855e-05, "loss": 0.0726, "num_input_tokens_seen": 60948112, "step": 28235 }, { "epoch": 4.6068515497553015, "grad_norm": 0.41231051087379456, "learning_rate": 4.745704254650689e-05, "loss": 0.3477, "num_input_tokens_seen": 60959312, "step": 28240 }, { "epoch": 4.607667210440457, "grad_norm": 0.9647340178489685, "learning_rate": 4.745547842687477e-05, "loss": 0.0897, "num_input_tokens_seen": 60970256, "step": 28245 }, { "epoch": 4.608482871125612, "grad_norm": 0.03657524660229683, "learning_rate": 4.7453913852153884e-05, "loss": 0.0501, "num_input_tokens_seen": 60980752, "step": 28250 }, { "epoch": 4.609298531810767, "grad_norm": 0.08832980692386627, "learning_rate": 4.745234882237596e-05, "loss": 0.0574, "num_input_tokens_seen": 60991344, "step": 28255 }, { "epoch": 4.610114192495922, "grad_norm": 0.27132648229599, "learning_rate": 4.74507833375727e-05, "loss": 0.0337, "num_input_tokens_seen": 61001264, "step": 28260 }, { "epoch": 4.6109298531810765, "grad_norm": 0.05121004953980446, "learning_rate": 4.744921739777583e-05, "loss": 0.0149, "num_input_tokens_seen": 61011408, "step": 28265 }, { "epoch": 4.611745513866231, "grad_norm": 0.5728257894515991, "learning_rate": 4.74476510030171e-05, "loss": 0.064, "num_input_tokens_seen": 61020944, "step": 28270 }, { "epoch": 4.612561174551386, "grad_norm": 0.6389811635017395, "learning_rate": 4.7446084153328246e-05, "loss": 0.0845, "num_input_tokens_seen": 61031248, "step": 28275 }, { "epoch": 4.613376835236542, "grad_norm": 1.846967339515686, "learning_rate": 4.744451684874102e-05, "loss": 0.231, "num_input_tokens_seen": 61042320, "step": 28280 }, { "epoch": 4.614192495921697, "grad_norm": 1.010853886604309, "learning_rate": 4.744294908928718e-05, "loss": 0.2509, "num_input_tokens_seen": 61053520, "step": 28285 }, { "epoch": 4.6150081566068515, "grad_norm": 0.33275124430656433, "learning_rate": 4.744138087499851e-05, "loss": 0.0779, "num_input_tokens_seen": 61062960, "step": 28290 }, { "epoch": 4.615823817292006, "grad_norm": 0.9747361540794373, "learning_rate": 4.743981220590678e-05, "loss": 0.0459, "num_input_tokens_seen": 61073392, "step": 28295 }, { "epoch": 4.616639477977161, "grad_norm": 0.029049072414636612, "learning_rate": 4.743824308204381e-05, "loss": 0.0203, "num_input_tokens_seen": 61083984, "step": 28300 }, { "epoch": 4.617455138662317, "grad_norm": 2.0131773948669434, "learning_rate": 4.743667350344136e-05, "loss": 0.0985, "num_input_tokens_seen": 61095280, "step": 28305 }, { "epoch": 4.618270799347472, "grad_norm": 0.16979782283306122, "learning_rate": 4.743510347013127e-05, "loss": 0.0445, "num_input_tokens_seen": 61107024, "step": 28310 }, { "epoch": 4.6190864600326265, "grad_norm": 0.16469626128673553, "learning_rate": 4.743353298214534e-05, "loss": 0.0709, "num_input_tokens_seen": 61117552, "step": 28315 }, { "epoch": 4.619902120717781, "grad_norm": 1.0755044221878052, "learning_rate": 4.743196203951541e-05, "loss": 0.0898, "num_input_tokens_seen": 61129040, "step": 28320 }, { "epoch": 4.620717781402936, "grad_norm": 0.25937822461128235, "learning_rate": 4.74303906422733e-05, "loss": 0.0626, "num_input_tokens_seen": 61139664, "step": 28325 }, { "epoch": 4.621533442088092, "grad_norm": 2.141061305999756, "learning_rate": 4.742881879045088e-05, "loss": 0.3725, "num_input_tokens_seen": 61151728, "step": 28330 }, { "epoch": 4.622349102773247, "grad_norm": 1.7440627813339233, "learning_rate": 4.742724648408e-05, "loss": 0.2773, "num_input_tokens_seen": 61162448, "step": 28335 }, { "epoch": 4.623164763458401, "grad_norm": 1.8872275352478027, "learning_rate": 4.7425673723192504e-05, "loss": 0.1967, "num_input_tokens_seen": 61173136, "step": 28340 }, { "epoch": 4.623980424143556, "grad_norm": 1.883088231086731, "learning_rate": 4.742410050782029e-05, "loss": 0.1878, "num_input_tokens_seen": 61183120, "step": 28345 }, { "epoch": 4.624796084828711, "grad_norm": 0.13097819685935974, "learning_rate": 4.742252683799523e-05, "loss": 0.0536, "num_input_tokens_seen": 61194000, "step": 28350 }, { "epoch": 4.625611745513866, "grad_norm": 1.0745267868041992, "learning_rate": 4.7420952713749224e-05, "loss": 0.085, "num_input_tokens_seen": 61204688, "step": 28355 }, { "epoch": 4.626427406199021, "grad_norm": 1.389063835144043, "learning_rate": 4.741937813511416e-05, "loss": 0.1901, "num_input_tokens_seen": 61215376, "step": 28360 }, { "epoch": 4.627243066884176, "grad_norm": 0.5507066249847412, "learning_rate": 4.741780310212196e-05, "loss": 0.0588, "num_input_tokens_seen": 61226256, "step": 28365 }, { "epoch": 4.628058727569331, "grad_norm": 0.5277431607246399, "learning_rate": 4.741622761480454e-05, "loss": 0.0563, "num_input_tokens_seen": 61238192, "step": 28370 }, { "epoch": 4.628874388254486, "grad_norm": 0.1034453734755516, "learning_rate": 4.741465167319383e-05, "loss": 0.1402, "num_input_tokens_seen": 61248944, "step": 28375 }, { "epoch": 4.629690048939641, "grad_norm": 0.080253966152668, "learning_rate": 4.741307527732177e-05, "loss": 0.0978, "num_input_tokens_seen": 61260752, "step": 28380 }, { "epoch": 4.630505709624796, "grad_norm": 1.3950355052947998, "learning_rate": 4.741149842722031e-05, "loss": 0.3119, "num_input_tokens_seen": 61271984, "step": 28385 }, { "epoch": 4.631321370309951, "grad_norm": 1.1463536024093628, "learning_rate": 4.74099211229214e-05, "loss": 0.0845, "num_input_tokens_seen": 61283248, "step": 28390 }, { "epoch": 4.632137030995106, "grad_norm": 0.3146193027496338, "learning_rate": 4.740834336445701e-05, "loss": 0.0775, "num_input_tokens_seen": 61294512, "step": 28395 }, { "epoch": 4.632952691680261, "grad_norm": 1.55314040184021, "learning_rate": 4.740676515185911e-05, "loss": 0.1198, "num_input_tokens_seen": 61304944, "step": 28400 }, { "epoch": 4.633768352365416, "grad_norm": 0.11849198490381241, "learning_rate": 4.7405186485159696e-05, "loss": 0.0244, "num_input_tokens_seen": 61315408, "step": 28405 }, { "epoch": 4.634584013050571, "grad_norm": 0.8358097076416016, "learning_rate": 4.7403607364390755e-05, "loss": 0.1149, "num_input_tokens_seen": 61326000, "step": 28410 }, { "epoch": 4.635399673735726, "grad_norm": 0.12056897580623627, "learning_rate": 4.740202778958429e-05, "loss": 0.1551, "num_input_tokens_seen": 61335984, "step": 28415 }, { "epoch": 4.636215334420881, "grad_norm": 0.22775417566299438, "learning_rate": 4.74004477607723e-05, "loss": 0.0637, "num_input_tokens_seen": 61347344, "step": 28420 }, { "epoch": 4.637030995106036, "grad_norm": 1.2160851955413818, "learning_rate": 4.7398867277986836e-05, "loss": 0.218, "num_input_tokens_seen": 61358352, "step": 28425 }, { "epoch": 4.637846655791191, "grad_norm": 0.11932533979415894, "learning_rate": 4.739728634125991e-05, "loss": 0.0501, "num_input_tokens_seen": 61369168, "step": 28430 }, { "epoch": 4.638662316476346, "grad_norm": 1.7146462202072144, "learning_rate": 4.7395704950623554e-05, "loss": 0.2357, "num_input_tokens_seen": 61379984, "step": 28435 }, { "epoch": 4.6394779771615005, "grad_norm": 1.503872036933899, "learning_rate": 4.7394123106109834e-05, "loss": 0.2477, "num_input_tokens_seen": 61391760, "step": 28440 }, { "epoch": 4.640293637846656, "grad_norm": 0.5323691368103027, "learning_rate": 4.73925408077508e-05, "loss": 0.1868, "num_input_tokens_seen": 61401680, "step": 28445 }, { "epoch": 4.641109298531811, "grad_norm": 0.060420360416173935, "learning_rate": 4.739095805557851e-05, "loss": 0.0669, "num_input_tokens_seen": 61411696, "step": 28450 }, { "epoch": 4.641924959216966, "grad_norm": 0.25916996598243713, "learning_rate": 4.738937484962506e-05, "loss": 0.1557, "num_input_tokens_seen": 61421968, "step": 28455 }, { "epoch": 4.642740619902121, "grad_norm": 0.8132188320159912, "learning_rate": 4.738779118992252e-05, "loss": 0.1358, "num_input_tokens_seen": 61430960, "step": 28460 }, { "epoch": 4.643556280587275, "grad_norm": 0.9018223285675049, "learning_rate": 4.738620707650299e-05, "loss": 0.2932, "num_input_tokens_seen": 61442032, "step": 28465 }, { "epoch": 4.64437194127243, "grad_norm": 0.8387150764465332, "learning_rate": 4.738462250939858e-05, "loss": 0.172, "num_input_tokens_seen": 61453136, "step": 28470 }, { "epoch": 4.645187601957586, "grad_norm": 0.6125106811523438, "learning_rate": 4.73830374886414e-05, "loss": 0.1422, "num_input_tokens_seen": 61464048, "step": 28475 }, { "epoch": 4.646003262642741, "grad_norm": 0.07514593750238419, "learning_rate": 4.738145201426356e-05, "loss": 0.1784, "num_input_tokens_seen": 61475344, "step": 28480 }, { "epoch": 4.646818923327896, "grad_norm": 0.4676531255245209, "learning_rate": 4.737986608629721e-05, "loss": 0.0492, "num_input_tokens_seen": 61485168, "step": 28485 }, { "epoch": 4.64763458401305, "grad_norm": 1.070404291152954, "learning_rate": 4.737827970477448e-05, "loss": 0.1731, "num_input_tokens_seen": 61496304, "step": 28490 }, { "epoch": 4.648450244698205, "grad_norm": 0.4948771595954895, "learning_rate": 4.737669286972752e-05, "loss": 0.0585, "num_input_tokens_seen": 61507088, "step": 28495 }, { "epoch": 4.649265905383361, "grad_norm": 1.0993419885635376, "learning_rate": 4.73751055811885e-05, "loss": 0.1549, "num_input_tokens_seen": 61518032, "step": 28500 }, { "epoch": 4.650081566068516, "grad_norm": 0.6627869606018066, "learning_rate": 4.7373517839189574e-05, "loss": 0.1031, "num_input_tokens_seen": 61529424, "step": 28505 }, { "epoch": 4.650897226753671, "grad_norm": 0.6236154437065125, "learning_rate": 4.737192964376293e-05, "loss": 0.0551, "num_input_tokens_seen": 61540976, "step": 28510 }, { "epoch": 4.651712887438825, "grad_norm": 0.1940660923719406, "learning_rate": 4.7370340994940755e-05, "loss": 0.0518, "num_input_tokens_seen": 61552368, "step": 28515 }, { "epoch": 4.65252854812398, "grad_norm": 0.11094251275062561, "learning_rate": 4.736875189275523e-05, "loss": 0.0433, "num_input_tokens_seen": 61563184, "step": 28520 }, { "epoch": 4.653344208809135, "grad_norm": 2.3374359607696533, "learning_rate": 4.736716233723858e-05, "loss": 0.2595, "num_input_tokens_seen": 61572720, "step": 28525 }, { "epoch": 4.654159869494291, "grad_norm": 0.10050688683986664, "learning_rate": 4.7365572328423e-05, "loss": 0.0982, "num_input_tokens_seen": 61584176, "step": 28530 }, { "epoch": 4.6549755301794455, "grad_norm": 0.20121844112873077, "learning_rate": 4.7363981866340735e-05, "loss": 0.0558, "num_input_tokens_seen": 61595152, "step": 28535 }, { "epoch": 4.6557911908646, "grad_norm": 0.9322694540023804, "learning_rate": 4.736239095102401e-05, "loss": 0.1237, "num_input_tokens_seen": 61606032, "step": 28540 }, { "epoch": 4.656606851549755, "grad_norm": 0.13100680708885193, "learning_rate": 4.736079958250506e-05, "loss": 0.0987, "num_input_tokens_seen": 61617680, "step": 28545 }, { "epoch": 4.65742251223491, "grad_norm": 0.28402355313301086, "learning_rate": 4.7359207760816125e-05, "loss": 0.1091, "num_input_tokens_seen": 61628080, "step": 28550 }, { "epoch": 4.658238172920065, "grad_norm": 0.9491446614265442, "learning_rate": 4.7357615485989493e-05, "loss": 0.2406, "num_input_tokens_seen": 61639440, "step": 28555 }, { "epoch": 4.6590538336052205, "grad_norm": 0.9680479764938354, "learning_rate": 4.7356022758057414e-05, "loss": 0.1378, "num_input_tokens_seen": 61650544, "step": 28560 }, { "epoch": 4.659869494290375, "grad_norm": 0.6650114059448242, "learning_rate": 4.735442957705218e-05, "loss": 0.1966, "num_input_tokens_seen": 61661232, "step": 28565 }, { "epoch": 4.66068515497553, "grad_norm": 1.5517138242721558, "learning_rate": 4.735283594300608e-05, "loss": 0.0436, "num_input_tokens_seen": 61671984, "step": 28570 }, { "epoch": 4.661500815660685, "grad_norm": 0.6718622446060181, "learning_rate": 4.735124185595139e-05, "loss": 0.0809, "num_input_tokens_seen": 61682160, "step": 28575 }, { "epoch": 4.66231647634584, "grad_norm": 0.26535555720329285, "learning_rate": 4.734964731592043e-05, "loss": 0.1321, "num_input_tokens_seen": 61692112, "step": 28580 }, { "epoch": 4.6631321370309955, "grad_norm": 0.04336816817522049, "learning_rate": 4.734805232294552e-05, "loss": 0.1639, "num_input_tokens_seen": 61703312, "step": 28585 }, { "epoch": 4.66394779771615, "grad_norm": 0.5781800150871277, "learning_rate": 4.7346456877058975e-05, "loss": 0.044, "num_input_tokens_seen": 61713616, "step": 28590 }, { "epoch": 4.664763458401305, "grad_norm": 0.08473583310842514, "learning_rate": 4.734486097829313e-05, "loss": 0.2359, "num_input_tokens_seen": 61724080, "step": 28595 }, { "epoch": 4.66557911908646, "grad_norm": 0.6738030910491943, "learning_rate": 4.734326462668034e-05, "loss": 0.16, "num_input_tokens_seen": 61735216, "step": 28600 }, { "epoch": 4.666394779771615, "grad_norm": 1.4539538621902466, "learning_rate": 4.734166782225294e-05, "loss": 0.0811, "num_input_tokens_seen": 61746736, "step": 28605 }, { "epoch": 4.6672104404567705, "grad_norm": 1.6495949029922485, "learning_rate": 4.73400705650433e-05, "loss": 0.1653, "num_input_tokens_seen": 61758032, "step": 28610 }, { "epoch": 4.668026101141925, "grad_norm": 0.436447411775589, "learning_rate": 4.733847285508379e-05, "loss": 0.2491, "num_input_tokens_seen": 61768624, "step": 28615 }, { "epoch": 4.66884176182708, "grad_norm": 0.6804488301277161, "learning_rate": 4.73368746924068e-05, "loss": 0.0526, "num_input_tokens_seen": 61779408, "step": 28620 }, { "epoch": 4.669657422512235, "grad_norm": 0.551805853843689, "learning_rate": 4.7335276077044696e-05, "loss": 0.0518, "num_input_tokens_seen": 61790320, "step": 28625 }, { "epoch": 4.67047308319739, "grad_norm": 0.19601291418075562, "learning_rate": 4.733367700902989e-05, "loss": 0.0579, "num_input_tokens_seen": 61801968, "step": 28630 }, { "epoch": 4.671288743882545, "grad_norm": 0.25970467925071716, "learning_rate": 4.733207748839479e-05, "loss": 0.1883, "num_input_tokens_seen": 61812944, "step": 28635 }, { "epoch": 4.672104404567699, "grad_norm": 1.2714217901229858, "learning_rate": 4.73304775151718e-05, "loss": 0.1203, "num_input_tokens_seen": 61824912, "step": 28640 }, { "epoch": 4.672920065252855, "grad_norm": 0.03594202548265457, "learning_rate": 4.732887708939337e-05, "loss": 0.1681, "num_input_tokens_seen": 61835120, "step": 28645 }, { "epoch": 4.67373572593801, "grad_norm": 0.10176603496074677, "learning_rate": 4.732727621109191e-05, "loss": 0.0834, "num_input_tokens_seen": 61845680, "step": 28650 }, { "epoch": 4.674551386623165, "grad_norm": 1.0786736011505127, "learning_rate": 4.732567488029988e-05, "loss": 0.1424, "num_input_tokens_seen": 61856368, "step": 28655 }, { "epoch": 4.6753670473083195, "grad_norm": 0.39616623520851135, "learning_rate": 4.732407309704972e-05, "loss": 0.1069, "num_input_tokens_seen": 61866448, "step": 28660 }, { "epoch": 4.676182707993474, "grad_norm": 0.5581190586090088, "learning_rate": 4.732247086137389e-05, "loss": 0.1482, "num_input_tokens_seen": 61877040, "step": 28665 }, { "epoch": 4.67699836867863, "grad_norm": 1.2937877178192139, "learning_rate": 4.732086817330489e-05, "loss": 0.1678, "num_input_tokens_seen": 61887472, "step": 28670 }, { "epoch": 4.677814029363785, "grad_norm": 1.0548644065856934, "learning_rate": 4.731926503287517e-05, "loss": 0.0653, "num_input_tokens_seen": 61898864, "step": 28675 }, { "epoch": 4.67862969004894, "grad_norm": 0.2449779361486435, "learning_rate": 4.7317661440117224e-05, "loss": 0.0415, "num_input_tokens_seen": 61910160, "step": 28680 }, { "epoch": 4.6794453507340945, "grad_norm": 1.2876278162002563, "learning_rate": 4.731605739506356e-05, "loss": 0.3589, "num_input_tokens_seen": 61920176, "step": 28685 }, { "epoch": 4.680261011419249, "grad_norm": 1.5887141227722168, "learning_rate": 4.731445289774669e-05, "loss": 0.1943, "num_input_tokens_seen": 61930992, "step": 28690 }, { "epoch": 4.681076672104405, "grad_norm": 0.2962372303009033, "learning_rate": 4.731284794819912e-05, "loss": 0.1353, "num_input_tokens_seen": 61941744, "step": 28695 }, { "epoch": 4.68189233278956, "grad_norm": 0.21707294881343842, "learning_rate": 4.731124254645338e-05, "loss": 0.1273, "num_input_tokens_seen": 61951760, "step": 28700 }, { "epoch": 4.682707993474715, "grad_norm": 1.0661993026733398, "learning_rate": 4.7309636692542015e-05, "loss": 0.356, "num_input_tokens_seen": 61961040, "step": 28705 }, { "epoch": 4.6835236541598695, "grad_norm": 0.09470579028129578, "learning_rate": 4.7308030386497546e-05, "loss": 0.0507, "num_input_tokens_seen": 61971856, "step": 28710 }, { "epoch": 4.684339314845024, "grad_norm": 0.17003345489501953, "learning_rate": 4.7306423628352555e-05, "loss": 0.0289, "num_input_tokens_seen": 61983760, "step": 28715 }, { "epoch": 4.685154975530179, "grad_norm": 0.5993517637252808, "learning_rate": 4.730481641813959e-05, "loss": 0.0496, "num_input_tokens_seen": 61994480, "step": 28720 }, { "epoch": 4.685970636215334, "grad_norm": 1.7256637811660767, "learning_rate": 4.730320875589121e-05, "loss": 0.1787, "num_input_tokens_seen": 62005360, "step": 28725 }, { "epoch": 4.68678629690049, "grad_norm": 0.2972213327884674, "learning_rate": 4.730160064164002e-05, "loss": 0.087, "num_input_tokens_seen": 62016976, "step": 28730 }, { "epoch": 4.6876019575856445, "grad_norm": 1.3956674337387085, "learning_rate": 4.72999920754186e-05, "loss": 0.1255, "num_input_tokens_seen": 62028176, "step": 28735 }, { "epoch": 4.688417618270799, "grad_norm": 0.19933557510375977, "learning_rate": 4.729838305725956e-05, "loss": 0.1226, "num_input_tokens_seen": 62038672, "step": 28740 }, { "epoch": 4.689233278955954, "grad_norm": 0.10173029452562332, "learning_rate": 4.7296773587195484e-05, "loss": 0.0275, "num_input_tokens_seen": 62049424, "step": 28745 }, { "epoch": 4.690048939641109, "grad_norm": 0.43837234377861023, "learning_rate": 4.7295163665259026e-05, "loss": 0.094, "num_input_tokens_seen": 62060112, "step": 28750 }, { "epoch": 4.690864600326265, "grad_norm": 0.15896859765052795, "learning_rate": 4.7293553291482776e-05, "loss": 0.097, "num_input_tokens_seen": 62071728, "step": 28755 }, { "epoch": 4.691680261011419, "grad_norm": 0.14650380611419678, "learning_rate": 4.7291942465899395e-05, "loss": 0.2704, "num_input_tokens_seen": 62081680, "step": 28760 }, { "epoch": 4.692495921696574, "grad_norm": 0.9621838927268982, "learning_rate": 4.729033118854151e-05, "loss": 0.1015, "num_input_tokens_seen": 62091984, "step": 28765 }, { "epoch": 4.693311582381729, "grad_norm": 0.04248065873980522, "learning_rate": 4.7288719459441796e-05, "loss": 0.0805, "num_input_tokens_seen": 62102864, "step": 28770 }, { "epoch": 4.694127243066884, "grad_norm": 1.1391061544418335, "learning_rate": 4.728710727863291e-05, "loss": 0.2391, "num_input_tokens_seen": 62114256, "step": 28775 }, { "epoch": 4.69494290375204, "grad_norm": 0.4483099579811096, "learning_rate": 4.7285494646147513e-05, "loss": 0.0372, "num_input_tokens_seen": 62124816, "step": 28780 }, { "epoch": 4.695758564437194, "grad_norm": 0.16230057179927826, "learning_rate": 4.7283881562018305e-05, "loss": 0.0574, "num_input_tokens_seen": 62136368, "step": 28785 }, { "epoch": 4.696574225122349, "grad_norm": 0.31456780433654785, "learning_rate": 4.728226802627796e-05, "loss": 0.2229, "num_input_tokens_seen": 62147024, "step": 28790 }, { "epoch": 4.697389885807504, "grad_norm": 0.7294688820838928, "learning_rate": 4.7280654038959196e-05, "loss": 0.0628, "num_input_tokens_seen": 62156848, "step": 28795 }, { "epoch": 4.698205546492659, "grad_norm": 0.35731184482574463, "learning_rate": 4.72790396000947e-05, "loss": 0.0734, "num_input_tokens_seen": 62168496, "step": 28800 }, { "epoch": 4.699021207177814, "grad_norm": 0.35883960127830505, "learning_rate": 4.727742470971721e-05, "loss": 0.1456, "num_input_tokens_seen": 62178800, "step": 28805 }, { "epoch": 4.699836867862969, "grad_norm": 0.12463554739952087, "learning_rate": 4.7275809367859447e-05, "loss": 0.0966, "num_input_tokens_seen": 62190768, "step": 28810 }, { "epoch": 4.700652528548124, "grad_norm": 0.06682997941970825, "learning_rate": 4.727419357455415e-05, "loss": 0.0281, "num_input_tokens_seen": 62201648, "step": 28815 }, { "epoch": 4.701468189233279, "grad_norm": 0.30435240268707275, "learning_rate": 4.727257732983406e-05, "loss": 0.1797, "num_input_tokens_seen": 62212816, "step": 28820 }, { "epoch": 4.702283849918434, "grad_norm": 0.10718075931072235, "learning_rate": 4.7270960633731944e-05, "loss": 0.1093, "num_input_tokens_seen": 62222992, "step": 28825 }, { "epoch": 4.703099510603589, "grad_norm": 1.2991302013397217, "learning_rate": 4.726934348628055e-05, "loss": 0.1551, "num_input_tokens_seen": 62234192, "step": 28830 }, { "epoch": 4.7039151712887435, "grad_norm": 0.025961367413401604, "learning_rate": 4.726772588751266e-05, "loss": 0.2022, "num_input_tokens_seen": 62244848, "step": 28835 }, { "epoch": 4.704730831973899, "grad_norm": 1.6317039728164673, "learning_rate": 4.726610783746105e-05, "loss": 0.1543, "num_input_tokens_seen": 62255376, "step": 28840 }, { "epoch": 4.705546492659054, "grad_norm": 0.4981129467487335, "learning_rate": 4.726448933615853e-05, "loss": 0.0717, "num_input_tokens_seen": 62266192, "step": 28845 }, { "epoch": 4.706362153344209, "grad_norm": 0.08160426467657089, "learning_rate": 4.726287038363788e-05, "loss": 0.0457, "num_input_tokens_seen": 62277232, "step": 28850 }, { "epoch": 4.707177814029364, "grad_norm": 0.0727953314781189, "learning_rate": 4.7261250979931926e-05, "loss": 0.0388, "num_input_tokens_seen": 62288368, "step": 28855 }, { "epoch": 4.7079934747145185, "grad_norm": 0.9624606966972351, "learning_rate": 4.725963112507348e-05, "loss": 0.2075, "num_input_tokens_seen": 62299024, "step": 28860 }, { "epoch": 4.708809135399674, "grad_norm": 1.1825013160705566, "learning_rate": 4.725801081909537e-05, "loss": 0.2118, "num_input_tokens_seen": 62309136, "step": 28865 }, { "epoch": 4.709624796084829, "grad_norm": 1.7779262065887451, "learning_rate": 4.725639006203043e-05, "loss": 0.1785, "num_input_tokens_seen": 62320656, "step": 28870 }, { "epoch": 4.710440456769984, "grad_norm": 0.09175239503383636, "learning_rate": 4.725476885391152e-05, "loss": 0.0302, "num_input_tokens_seen": 62330896, "step": 28875 }, { "epoch": 4.711256117455139, "grad_norm": 0.12210704386234283, "learning_rate": 4.725314719477148e-05, "loss": 0.0925, "num_input_tokens_seen": 62342032, "step": 28880 }, { "epoch": 4.712071778140293, "grad_norm": 0.07247887551784515, "learning_rate": 4.7251525084643184e-05, "loss": 0.0962, "num_input_tokens_seen": 62351984, "step": 28885 }, { "epoch": 4.712887438825448, "grad_norm": 1.1032787561416626, "learning_rate": 4.72499025235595e-05, "loss": 0.0818, "num_input_tokens_seen": 62362256, "step": 28890 }, { "epoch": 4.713703099510604, "grad_norm": 1.310519814491272, "learning_rate": 4.724827951155332e-05, "loss": 0.199, "num_input_tokens_seen": 62373040, "step": 28895 }, { "epoch": 4.714518760195759, "grad_norm": 0.04564826935529709, "learning_rate": 4.724665604865753e-05, "loss": 0.0882, "num_input_tokens_seen": 62384304, "step": 28900 }, { "epoch": 4.715334420880914, "grad_norm": 1.5952017307281494, "learning_rate": 4.724503213490504e-05, "loss": 0.0939, "num_input_tokens_seen": 62394864, "step": 28905 }, { "epoch": 4.716150081566068, "grad_norm": 0.6611859798431396, "learning_rate": 4.724340777032875e-05, "loss": 0.1286, "num_input_tokens_seen": 62405040, "step": 28910 }, { "epoch": 4.716965742251223, "grad_norm": 0.03463057801127434, "learning_rate": 4.724178295496158e-05, "loss": 0.1024, "num_input_tokens_seen": 62417040, "step": 28915 }, { "epoch": 4.717781402936378, "grad_norm": 0.8691680431365967, "learning_rate": 4.7240157688836464e-05, "loss": 0.2102, "num_input_tokens_seen": 62427088, "step": 28920 }, { "epoch": 4.718597063621534, "grad_norm": 0.2865733504295349, "learning_rate": 4.7238531971986336e-05, "loss": 0.166, "num_input_tokens_seen": 62436816, "step": 28925 }, { "epoch": 4.719412724306689, "grad_norm": 0.1091538667678833, "learning_rate": 4.723690580444415e-05, "loss": 0.0311, "num_input_tokens_seen": 62447248, "step": 28930 }, { "epoch": 4.720228384991843, "grad_norm": 0.5032589435577393, "learning_rate": 4.723527918624286e-05, "loss": 0.1712, "num_input_tokens_seen": 62456368, "step": 28935 }, { "epoch": 4.721044045676998, "grad_norm": 0.41188928484916687, "learning_rate": 4.7233652117415426e-05, "loss": 0.1059, "num_input_tokens_seen": 62467088, "step": 28940 }, { "epoch": 4.721859706362153, "grad_norm": 1.461401343345642, "learning_rate": 4.723202459799483e-05, "loss": 0.2719, "num_input_tokens_seen": 62476752, "step": 28945 }, { "epoch": 4.722675367047309, "grad_norm": 0.03647707775235176, "learning_rate": 4.723039662801405e-05, "loss": 0.0926, "num_input_tokens_seen": 62487152, "step": 28950 }, { "epoch": 4.7234910277324635, "grad_norm": 0.17062658071517944, "learning_rate": 4.722876820750608e-05, "loss": 0.0454, "num_input_tokens_seen": 62498544, "step": 28955 }, { "epoch": 4.724306688417618, "grad_norm": 0.02014424465596676, "learning_rate": 4.7227139336503924e-05, "loss": 0.0692, "num_input_tokens_seen": 62509488, "step": 28960 }, { "epoch": 4.725122349102773, "grad_norm": 0.22453829646110535, "learning_rate": 4.72255100150406e-05, "loss": 0.1043, "num_input_tokens_seen": 62520464, "step": 28965 }, { "epoch": 4.725938009787928, "grad_norm": 0.8243473172187805, "learning_rate": 4.722388024314911e-05, "loss": 0.3303, "num_input_tokens_seen": 62531536, "step": 28970 }, { "epoch": 4.726753670473083, "grad_norm": 2.507783889770508, "learning_rate": 4.722225002086249e-05, "loss": 0.1533, "num_input_tokens_seen": 62542320, "step": 28975 }, { "epoch": 4.7275693311582385, "grad_norm": 0.4484693706035614, "learning_rate": 4.722061934821379e-05, "loss": 0.0883, "num_input_tokens_seen": 62551760, "step": 28980 }, { "epoch": 4.728384991843393, "grad_norm": 0.6591301560401917, "learning_rate": 4.721898822523605e-05, "loss": 0.0771, "num_input_tokens_seen": 62562512, "step": 28985 }, { "epoch": 4.729200652528548, "grad_norm": 0.13226640224456787, "learning_rate": 4.721735665196233e-05, "loss": 0.1297, "num_input_tokens_seen": 62573648, "step": 28990 }, { "epoch": 4.730016313213703, "grad_norm": 0.3276924192905426, "learning_rate": 4.7215724628425685e-05, "loss": 0.2189, "num_input_tokens_seen": 62584592, "step": 28995 }, { "epoch": 4.730831973898858, "grad_norm": 0.03834255412220955, "learning_rate": 4.721409215465921e-05, "loss": 0.0579, "num_input_tokens_seen": 62596496, "step": 29000 }, { "epoch": 4.731647634584013, "grad_norm": 0.204558327794075, "learning_rate": 4.721245923069596e-05, "loss": 0.1051, "num_input_tokens_seen": 62607312, "step": 29005 }, { "epoch": 4.732463295269168, "grad_norm": 0.7929573059082031, "learning_rate": 4.7210825856569055e-05, "loss": 0.0743, "num_input_tokens_seen": 62617680, "step": 29010 }, { "epoch": 4.733278955954323, "grad_norm": 0.3777529001235962, "learning_rate": 4.720919203231159e-05, "loss": 0.0807, "num_input_tokens_seen": 62627664, "step": 29015 }, { "epoch": 4.734094616639478, "grad_norm": 1.2721196413040161, "learning_rate": 4.7207557757956666e-05, "loss": 0.2729, "num_input_tokens_seen": 62638352, "step": 29020 }, { "epoch": 4.734910277324633, "grad_norm": 0.2460135519504547, "learning_rate": 4.720592303353742e-05, "loss": 0.1713, "num_input_tokens_seen": 62649520, "step": 29025 }, { "epoch": 4.735725938009788, "grad_norm": 0.09438692778348923, "learning_rate": 4.7204287859086974e-05, "loss": 0.0351, "num_input_tokens_seen": 62660080, "step": 29030 }, { "epoch": 4.736541598694943, "grad_norm": 0.45205187797546387, "learning_rate": 4.720265223463846e-05, "loss": 0.2806, "num_input_tokens_seen": 62669808, "step": 29035 }, { "epoch": 4.737357259380098, "grad_norm": 0.4390513300895691, "learning_rate": 4.720101616022503e-05, "loss": 0.0793, "num_input_tokens_seen": 62681136, "step": 29040 }, { "epoch": 4.738172920065253, "grad_norm": 0.1954546719789505, "learning_rate": 4.719937963587985e-05, "loss": 0.2145, "num_input_tokens_seen": 62692144, "step": 29045 }, { "epoch": 4.738988580750408, "grad_norm": 0.18110665678977966, "learning_rate": 4.719774266163608e-05, "loss": 0.1534, "num_input_tokens_seen": 62701360, "step": 29050 }, { "epoch": 4.739804241435563, "grad_norm": 1.7700319290161133, "learning_rate": 4.719610523752689e-05, "loss": 0.1193, "num_input_tokens_seen": 62713488, "step": 29055 }, { "epoch": 4.740619902120718, "grad_norm": 0.13189592957496643, "learning_rate": 4.719446736358547e-05, "loss": 0.1703, "num_input_tokens_seen": 62725264, "step": 29060 }, { "epoch": 4.741435562805873, "grad_norm": 1.16636061668396, "learning_rate": 4.719282903984502e-05, "loss": 0.0957, "num_input_tokens_seen": 62735312, "step": 29065 }, { "epoch": 4.742251223491028, "grad_norm": 0.20533119142055511, "learning_rate": 4.719119026633873e-05, "loss": 0.1614, "num_input_tokens_seen": 62745744, "step": 29070 }, { "epoch": 4.743066884176183, "grad_norm": 0.5716477036476135, "learning_rate": 4.718955104309982e-05, "loss": 0.1533, "num_input_tokens_seen": 62754960, "step": 29075 }, { "epoch": 4.7438825448613375, "grad_norm": 0.6575483679771423, "learning_rate": 4.718791137016151e-05, "loss": 0.1484, "num_input_tokens_seen": 62765712, "step": 29080 }, { "epoch": 4.744698205546492, "grad_norm": 0.566233217716217, "learning_rate": 4.718627124755702e-05, "loss": 0.0911, "num_input_tokens_seen": 62776784, "step": 29085 }, { "epoch": 4.745513866231647, "grad_norm": 0.18670691549777985, "learning_rate": 4.718463067531961e-05, "loss": 0.2762, "num_input_tokens_seen": 62787984, "step": 29090 }, { "epoch": 4.746329526916803, "grad_norm": 0.13368381559848785, "learning_rate": 4.718298965348251e-05, "loss": 0.1015, "num_input_tokens_seen": 62797680, "step": 29095 }, { "epoch": 4.747145187601958, "grad_norm": 0.092383973300457, "learning_rate": 4.718134818207899e-05, "loss": 0.0772, "num_input_tokens_seen": 62808112, "step": 29100 }, { "epoch": 4.7479608482871125, "grad_norm": 0.35980790853500366, "learning_rate": 4.71797062611423e-05, "loss": 0.2762, "num_input_tokens_seen": 62818928, "step": 29105 }, { "epoch": 4.748776508972267, "grad_norm": 0.49156883358955383, "learning_rate": 4.717806389070573e-05, "loss": 0.1207, "num_input_tokens_seen": 62829872, "step": 29110 }, { "epoch": 4.749592169657422, "grad_norm": 0.2509070634841919, "learning_rate": 4.7176421070802564e-05, "loss": 0.0977, "num_input_tokens_seen": 62840144, "step": 29115 }, { "epoch": 4.750407830342578, "grad_norm": 0.27310800552368164, "learning_rate": 4.717477780146608e-05, "loss": 0.1113, "num_input_tokens_seen": 62851856, "step": 29120 }, { "epoch": 4.751223491027733, "grad_norm": 0.5677315592765808, "learning_rate": 4.717313408272961e-05, "loss": 0.1919, "num_input_tokens_seen": 62863056, "step": 29125 }, { "epoch": 4.7520391517128875, "grad_norm": 1.4748884439468384, "learning_rate": 4.7171489914626445e-05, "loss": 0.0931, "num_input_tokens_seen": 62872592, "step": 29130 }, { "epoch": 4.752854812398042, "grad_norm": 1.9725043773651123, "learning_rate": 4.71698452971899e-05, "loss": 0.1453, "num_input_tokens_seen": 62883600, "step": 29135 }, { "epoch": 4.753670473083197, "grad_norm": 2.029287338256836, "learning_rate": 4.7168200230453325e-05, "loss": 0.1723, "num_input_tokens_seen": 62894160, "step": 29140 }, { "epoch": 4.754486133768353, "grad_norm": 0.5314253568649292, "learning_rate": 4.716655471445004e-05, "loss": 0.1197, "num_input_tokens_seen": 62905648, "step": 29145 }, { "epoch": 4.755301794453508, "grad_norm": 0.12573298811912537, "learning_rate": 4.716490874921342e-05, "loss": 0.0627, "num_input_tokens_seen": 62918256, "step": 29150 }, { "epoch": 4.7561174551386625, "grad_norm": 1.6033653020858765, "learning_rate": 4.7163262334776795e-05, "loss": 0.2298, "num_input_tokens_seen": 62929200, "step": 29155 }, { "epoch": 4.756933115823817, "grad_norm": 0.8005926012992859, "learning_rate": 4.716161547117354e-05, "loss": 0.0811, "num_input_tokens_seen": 62940016, "step": 29160 }, { "epoch": 4.757748776508972, "grad_norm": 0.5327176451683044, "learning_rate": 4.715996815843704e-05, "loss": 0.0365, "num_input_tokens_seen": 62949744, "step": 29165 }, { "epoch": 4.758564437194127, "grad_norm": 0.11946872621774673, "learning_rate": 4.7158320396600674e-05, "loss": 0.1139, "num_input_tokens_seen": 62960848, "step": 29170 }, { "epoch": 4.759380097879282, "grad_norm": 0.18485911190509796, "learning_rate": 4.7156672185697824e-05, "loss": 0.2382, "num_input_tokens_seen": 62971984, "step": 29175 }, { "epoch": 4.760195758564437, "grad_norm": 0.4369773268699646, "learning_rate": 4.7155023525761924e-05, "loss": 0.0733, "num_input_tokens_seen": 62982224, "step": 29180 }, { "epoch": 4.761011419249592, "grad_norm": 1.260390281677246, "learning_rate": 4.715337441682635e-05, "loss": 0.1896, "num_input_tokens_seen": 62992464, "step": 29185 }, { "epoch": 4.761827079934747, "grad_norm": 0.7319538593292236, "learning_rate": 4.715172485892455e-05, "loss": 0.106, "num_input_tokens_seen": 63002960, "step": 29190 }, { "epoch": 4.762642740619902, "grad_norm": 0.28949111700057983, "learning_rate": 4.7150074852089934e-05, "loss": 0.1754, "num_input_tokens_seen": 63013520, "step": 29195 }, { "epoch": 4.763458401305057, "grad_norm": 1.8128609657287598, "learning_rate": 4.714842439635596e-05, "loss": 0.1939, "num_input_tokens_seen": 63023760, "step": 29200 }, { "epoch": 4.764274061990212, "grad_norm": 0.06074810028076172, "learning_rate": 4.7146773491756064e-05, "loss": 0.1004, "num_input_tokens_seen": 63035248, "step": 29205 }, { "epoch": 4.765089722675367, "grad_norm": 0.3411619961261749, "learning_rate": 4.71451221383237e-05, "loss": 0.154, "num_input_tokens_seen": 63045904, "step": 29210 }, { "epoch": 4.765905383360522, "grad_norm": 1.090049386024475, "learning_rate": 4.7143470336092365e-05, "loss": 0.2205, "num_input_tokens_seen": 63056336, "step": 29215 }, { "epoch": 4.766721044045677, "grad_norm": 1.2520278692245483, "learning_rate": 4.71418180850955e-05, "loss": 0.2072, "num_input_tokens_seen": 63067376, "step": 29220 }, { "epoch": 4.767536704730832, "grad_norm": 0.16722765564918518, "learning_rate": 4.7140165385366595e-05, "loss": 0.2341, "num_input_tokens_seen": 63077456, "step": 29225 }, { "epoch": 4.768352365415987, "grad_norm": 0.09004885703325272, "learning_rate": 4.7138512236939167e-05, "loss": 0.0333, "num_input_tokens_seen": 63087024, "step": 29230 }, { "epoch": 4.769168026101142, "grad_norm": 0.231357142329216, "learning_rate": 4.7136858639846694e-05, "loss": 0.1036, "num_input_tokens_seen": 63098608, "step": 29235 }, { "epoch": 4.769983686786297, "grad_norm": 0.04048900306224823, "learning_rate": 4.71352045941227e-05, "loss": 0.097, "num_input_tokens_seen": 63108144, "step": 29240 }, { "epoch": 4.770799347471452, "grad_norm": 0.5556743741035461, "learning_rate": 4.7133550099800704e-05, "loss": 0.2734, "num_input_tokens_seen": 63118736, "step": 29245 }, { "epoch": 4.771615008156607, "grad_norm": 0.26892995834350586, "learning_rate": 4.7131895156914244e-05, "loss": 0.0237, "num_input_tokens_seen": 63128656, "step": 29250 }, { "epoch": 4.7724306688417615, "grad_norm": 0.09168639034032822, "learning_rate": 4.713023976549684e-05, "loss": 0.1564, "num_input_tokens_seen": 63139632, "step": 29255 }, { "epoch": 4.773246329526917, "grad_norm": 0.48695963621139526, "learning_rate": 4.712858392558206e-05, "loss": 0.06, "num_input_tokens_seen": 63150128, "step": 29260 }, { "epoch": 4.774061990212072, "grad_norm": 0.2646927535533905, "learning_rate": 4.712692763720346e-05, "loss": 0.1104, "num_input_tokens_seen": 63160112, "step": 29265 }, { "epoch": 4.774877650897227, "grad_norm": 0.5699594020843506, "learning_rate": 4.7125270900394585e-05, "loss": 0.1348, "num_input_tokens_seen": 63172592, "step": 29270 }, { "epoch": 4.775693311582382, "grad_norm": 1.0407966375350952, "learning_rate": 4.712361371518904e-05, "loss": 0.1346, "num_input_tokens_seen": 63184304, "step": 29275 }, { "epoch": 4.7765089722675365, "grad_norm": 0.03708244487643242, "learning_rate": 4.712195608162039e-05, "loss": 0.0282, "num_input_tokens_seen": 63195184, "step": 29280 }, { "epoch": 4.777324632952691, "grad_norm": 0.1532144397497177, "learning_rate": 4.7120297999722246e-05, "loss": 0.0926, "num_input_tokens_seen": 63206064, "step": 29285 }, { "epoch": 4.778140293637847, "grad_norm": 0.21584849059581757, "learning_rate": 4.711863946952819e-05, "loss": 0.0433, "num_input_tokens_seen": 63215536, "step": 29290 }, { "epoch": 4.778955954323002, "grad_norm": 0.6004976630210876, "learning_rate": 4.711698049107186e-05, "loss": 0.1264, "num_input_tokens_seen": 63226480, "step": 29295 }, { "epoch": 4.779771615008157, "grad_norm": 0.47121885418891907, "learning_rate": 4.7115321064386854e-05, "loss": 0.1438, "num_input_tokens_seen": 63237200, "step": 29300 }, { "epoch": 4.780587275693311, "grad_norm": 0.540119469165802, "learning_rate": 4.7113661189506806e-05, "loss": 0.2342, "num_input_tokens_seen": 63247632, "step": 29305 }, { "epoch": 4.781402936378466, "grad_norm": 0.29343393445014954, "learning_rate": 4.7112000866465366e-05, "loss": 0.1688, "num_input_tokens_seen": 63258864, "step": 29310 }, { "epoch": 4.782218597063622, "grad_norm": 0.7164627909660339, "learning_rate": 4.711034009529618e-05, "loss": 0.1057, "num_input_tokens_seen": 63270928, "step": 29315 }, { "epoch": 4.783034257748777, "grad_norm": 1.4031683206558228, "learning_rate": 4.710867887603291e-05, "loss": 0.2333, "num_input_tokens_seen": 63281200, "step": 29320 }, { "epoch": 4.783849918433932, "grad_norm": 0.6205119490623474, "learning_rate": 4.7107017208709204e-05, "loss": 0.1013, "num_input_tokens_seen": 63291184, "step": 29325 }, { "epoch": 4.784665579119086, "grad_norm": 0.6910900473594666, "learning_rate": 4.7105355093358747e-05, "loss": 0.1159, "num_input_tokens_seen": 63302544, "step": 29330 }, { "epoch": 4.785481239804241, "grad_norm": 0.30253472924232483, "learning_rate": 4.710369253001524e-05, "loss": 0.139, "num_input_tokens_seen": 63312464, "step": 29335 }, { "epoch": 4.786296900489396, "grad_norm": 0.26335251331329346, "learning_rate": 4.7102029518712355e-05, "loss": 0.3066, "num_input_tokens_seen": 63323856, "step": 29340 }, { "epoch": 4.787112561174552, "grad_norm": 0.09857596457004547, "learning_rate": 4.7100366059483804e-05, "loss": 0.0608, "num_input_tokens_seen": 63335664, "step": 29345 }, { "epoch": 4.787928221859707, "grad_norm": 0.1530928909778595, "learning_rate": 4.709870215236329e-05, "loss": 0.0324, "num_input_tokens_seen": 63347632, "step": 29350 }, { "epoch": 4.788743882544861, "grad_norm": 1.3092817068099976, "learning_rate": 4.709703779738456e-05, "loss": 0.1367, "num_input_tokens_seen": 63359152, "step": 29355 }, { "epoch": 4.789559543230016, "grad_norm": 0.10425710678100586, "learning_rate": 4.709537299458131e-05, "loss": 0.0476, "num_input_tokens_seen": 63370224, "step": 29360 }, { "epoch": 4.790375203915171, "grad_norm": 0.6445820927619934, "learning_rate": 4.709370774398731e-05, "loss": 0.0997, "num_input_tokens_seen": 63380784, "step": 29365 }, { "epoch": 4.791190864600326, "grad_norm": 2.332106113433838, "learning_rate": 4.709204204563629e-05, "loss": 0.2771, "num_input_tokens_seen": 63392656, "step": 29370 }, { "epoch": 4.7920065252854815, "grad_norm": 0.3114228844642639, "learning_rate": 4.709037589956201e-05, "loss": 0.1537, "num_input_tokens_seen": 63403440, "step": 29375 }, { "epoch": 4.792822185970636, "grad_norm": 1.2768476009368896, "learning_rate": 4.708870930579824e-05, "loss": 0.1109, "num_input_tokens_seen": 63413968, "step": 29380 }, { "epoch": 4.793637846655791, "grad_norm": 1.834596037864685, "learning_rate": 4.7087042264378756e-05, "loss": 0.1514, "num_input_tokens_seen": 63424752, "step": 29385 }, { "epoch": 4.794453507340946, "grad_norm": 0.061855290085077286, "learning_rate": 4.708537477533734e-05, "loss": 0.0351, "num_input_tokens_seen": 63435792, "step": 29390 }, { "epoch": 4.795269168026101, "grad_norm": 0.544736921787262, "learning_rate": 4.708370683870779e-05, "loss": 0.0472, "num_input_tokens_seen": 63448016, "step": 29395 }, { "epoch": 4.7960848287112565, "grad_norm": 0.2862167954444885, "learning_rate": 4.708203845452389e-05, "loss": 0.0291, "num_input_tokens_seen": 63456880, "step": 29400 }, { "epoch": 4.796900489396411, "grad_norm": 1.757280707359314, "learning_rate": 4.708036962281949e-05, "loss": 0.1502, "num_input_tokens_seen": 63467504, "step": 29405 }, { "epoch": 4.797716150081566, "grad_norm": 0.5139182209968567, "learning_rate": 4.707870034362837e-05, "loss": 0.114, "num_input_tokens_seen": 63478288, "step": 29410 }, { "epoch": 4.798531810766721, "grad_norm": 0.1076585128903389, "learning_rate": 4.707703061698439e-05, "loss": 0.0457, "num_input_tokens_seen": 63487888, "step": 29415 }, { "epoch": 4.799347471451876, "grad_norm": 0.13992618024349213, "learning_rate": 4.707536044292137e-05, "loss": 0.088, "num_input_tokens_seen": 63498576, "step": 29420 }, { "epoch": 4.800163132137031, "grad_norm": 0.03044627048075199, "learning_rate": 4.707368982147318e-05, "loss": 0.116, "num_input_tokens_seen": 63509872, "step": 29425 }, { "epoch": 4.800978792822186, "grad_norm": 0.26967546343803406, "learning_rate": 4.7072018752673655e-05, "loss": 0.0298, "num_input_tokens_seen": 63521328, "step": 29430 }, { "epoch": 4.801794453507341, "grad_norm": 1.1697429418563843, "learning_rate": 4.707034723655667e-05, "loss": 0.1382, "num_input_tokens_seen": 63531248, "step": 29435 }, { "epoch": 4.802610114192496, "grad_norm": 2.6832778453826904, "learning_rate": 4.7068675273156095e-05, "loss": 0.2665, "num_input_tokens_seen": 63541648, "step": 29440 }, { "epoch": 4.803425774877651, "grad_norm": 0.6819345951080322, "learning_rate": 4.706700286250582e-05, "loss": 0.0915, "num_input_tokens_seen": 63551408, "step": 29445 }, { "epoch": 4.804241435562806, "grad_norm": 0.23487697541713715, "learning_rate": 4.706533000463974e-05, "loss": 0.0342, "num_input_tokens_seen": 63562160, "step": 29450 }, { "epoch": 4.80505709624796, "grad_norm": 0.057119548320770264, "learning_rate": 4.706365669959176e-05, "loss": 0.0835, "num_input_tokens_seen": 63573648, "step": 29455 }, { "epoch": 4.805872756933116, "grad_norm": 0.040985189378261566, "learning_rate": 4.706198294739579e-05, "loss": 0.1264, "num_input_tokens_seen": 63584048, "step": 29460 }, { "epoch": 4.806688417618271, "grad_norm": 0.4705166518688202, "learning_rate": 4.706030874808573e-05, "loss": 0.0493, "num_input_tokens_seen": 63595344, "step": 29465 }, { "epoch": 4.807504078303426, "grad_norm": 1.4797420501708984, "learning_rate": 4.7058634101695545e-05, "loss": 0.1487, "num_input_tokens_seen": 63606800, "step": 29470 }, { "epoch": 4.808319738988581, "grad_norm": 1.6213001012802124, "learning_rate": 4.7056959008259155e-05, "loss": 0.1945, "num_input_tokens_seen": 63617456, "step": 29475 }, { "epoch": 4.809135399673735, "grad_norm": 0.34204983711242676, "learning_rate": 4.7055283467810507e-05, "loss": 0.2821, "num_input_tokens_seen": 63628336, "step": 29480 }, { "epoch": 4.809951060358891, "grad_norm": 0.059022724628448486, "learning_rate": 4.7053607480383554e-05, "loss": 0.0612, "num_input_tokens_seen": 63640720, "step": 29485 }, { "epoch": 4.810766721044046, "grad_norm": 1.038050889968872, "learning_rate": 4.7051931046012274e-05, "loss": 0.0637, "num_input_tokens_seen": 63650960, "step": 29490 }, { "epoch": 4.811582381729201, "grad_norm": 1.6867083311080933, "learning_rate": 4.705025416473064e-05, "loss": 0.1322, "num_input_tokens_seen": 63662192, "step": 29495 }, { "epoch": 4.8123980424143555, "grad_norm": 1.594375729560852, "learning_rate": 4.7048576836572636e-05, "loss": 0.1119, "num_input_tokens_seen": 63673232, "step": 29500 }, { "epoch": 4.81321370309951, "grad_norm": 0.0402916856110096, "learning_rate": 4.704689906157224e-05, "loss": 0.0235, "num_input_tokens_seen": 63683728, "step": 29505 }, { "epoch": 4.814029363784666, "grad_norm": 1.0535213947296143, "learning_rate": 4.704522083976347e-05, "loss": 0.0824, "num_input_tokens_seen": 63694768, "step": 29510 }, { "epoch": 4.814845024469821, "grad_norm": 0.11521948128938675, "learning_rate": 4.704354217118033e-05, "loss": 0.1515, "num_input_tokens_seen": 63705488, "step": 29515 }, { "epoch": 4.815660685154976, "grad_norm": 1.0199522972106934, "learning_rate": 4.704186305585685e-05, "loss": 0.2551, "num_input_tokens_seen": 63715600, "step": 29520 }, { "epoch": 4.8164763458401305, "grad_norm": 1.4524260759353638, "learning_rate": 4.704018349382705e-05, "loss": 0.1983, "num_input_tokens_seen": 63726000, "step": 29525 }, { "epoch": 4.817292006525285, "grad_norm": 0.6973876357078552, "learning_rate": 4.7038503485124976e-05, "loss": 0.1579, "num_input_tokens_seen": 63736400, "step": 29530 }, { "epoch": 4.81810766721044, "grad_norm": 1.5380592346191406, "learning_rate": 4.7036823029784666e-05, "loss": 0.4154, "num_input_tokens_seen": 63747600, "step": 29535 }, { "epoch": 4.818923327895595, "grad_norm": 0.22237053513526917, "learning_rate": 4.7035142127840184e-05, "loss": 0.0374, "num_input_tokens_seen": 63758416, "step": 29540 }, { "epoch": 4.819738988580751, "grad_norm": 0.20592360198497772, "learning_rate": 4.7033460779325586e-05, "loss": 0.1835, "num_input_tokens_seen": 63768976, "step": 29545 }, { "epoch": 4.8205546492659055, "grad_norm": 0.20164382457733154, "learning_rate": 4.7031778984274964e-05, "loss": 0.0957, "num_input_tokens_seen": 63778896, "step": 29550 }, { "epoch": 4.82137030995106, "grad_norm": 0.6007541418075562, "learning_rate": 4.703009674272239e-05, "loss": 0.1146, "num_input_tokens_seen": 63790384, "step": 29555 }, { "epoch": 4.822185970636215, "grad_norm": 0.14642824232578278, "learning_rate": 4.7028414054701954e-05, "loss": 0.0325, "num_input_tokens_seen": 63800688, "step": 29560 }, { "epoch": 4.82300163132137, "grad_norm": 0.08106779307126999, "learning_rate": 4.702673092024776e-05, "loss": 0.0547, "num_input_tokens_seen": 63811856, "step": 29565 }, { "epoch": 4.823817292006526, "grad_norm": 0.2799558937549591, "learning_rate": 4.702504733939394e-05, "loss": 0.136, "num_input_tokens_seen": 63822576, "step": 29570 }, { "epoch": 4.8246329526916805, "grad_norm": 0.4305518865585327, "learning_rate": 4.7023363312174575e-05, "loss": 0.0362, "num_input_tokens_seen": 63833168, "step": 29575 }, { "epoch": 4.825448613376835, "grad_norm": 0.3057309687137604, "learning_rate": 4.702167883862382e-05, "loss": 0.0373, "num_input_tokens_seen": 63843312, "step": 29580 }, { "epoch": 4.82626427406199, "grad_norm": 0.7786309123039246, "learning_rate": 4.70199939187758e-05, "loss": 0.2692, "num_input_tokens_seen": 63853904, "step": 29585 }, { "epoch": 4.827079934747145, "grad_norm": 2.039823055267334, "learning_rate": 4.7018308552664683e-05, "loss": 0.432, "num_input_tokens_seen": 63865072, "step": 29590 }, { "epoch": 4.827895595432301, "grad_norm": 0.7063306570053101, "learning_rate": 4.7016622740324604e-05, "loss": 0.0613, "num_input_tokens_seen": 63875760, "step": 29595 }, { "epoch": 4.828711256117455, "grad_norm": 0.16111819446086884, "learning_rate": 4.7014936481789726e-05, "loss": 0.0539, "num_input_tokens_seen": 63887216, "step": 29600 }, { "epoch": 4.82952691680261, "grad_norm": 0.052664078772068024, "learning_rate": 4.701324977709424e-05, "loss": 0.1019, "num_input_tokens_seen": 63897104, "step": 29605 }, { "epoch": 4.830342577487765, "grad_norm": 1.1193126440048218, "learning_rate": 4.701156262627232e-05, "loss": 0.1917, "num_input_tokens_seen": 63907984, "step": 29610 }, { "epoch": 4.83115823817292, "grad_norm": 0.692622721195221, "learning_rate": 4.7009875029358166e-05, "loss": 0.1091, "num_input_tokens_seen": 63918640, "step": 29615 }, { "epoch": 4.831973898858075, "grad_norm": 1.6018550395965576, "learning_rate": 4.7008186986385964e-05, "loss": 0.1687, "num_input_tokens_seen": 63929136, "step": 29620 }, { "epoch": 4.8327895595432295, "grad_norm": 0.4202740788459778, "learning_rate": 4.7006498497389936e-05, "loss": 0.2651, "num_input_tokens_seen": 63940464, "step": 29625 }, { "epoch": 4.833605220228385, "grad_norm": 1.2740930318832397, "learning_rate": 4.700480956240431e-05, "loss": 0.1741, "num_input_tokens_seen": 63950512, "step": 29630 }, { "epoch": 4.83442088091354, "grad_norm": 1.7551636695861816, "learning_rate": 4.700312018146329e-05, "loss": 0.2844, "num_input_tokens_seen": 63961840, "step": 29635 }, { "epoch": 4.835236541598695, "grad_norm": 0.5743557810783386, "learning_rate": 4.700143035460113e-05, "loss": 0.0988, "num_input_tokens_seen": 63971440, "step": 29640 }, { "epoch": 4.83605220228385, "grad_norm": 0.6901082992553711, "learning_rate": 4.699974008185207e-05, "loss": 0.1138, "num_input_tokens_seen": 63982608, "step": 29645 }, { "epoch": 4.8368678629690045, "grad_norm": 0.11441025882959366, "learning_rate": 4.699804936325037e-05, "loss": 0.2061, "num_input_tokens_seen": 63994000, "step": 29650 }, { "epoch": 4.83768352365416, "grad_norm": 0.738784909248352, "learning_rate": 4.69963581988303e-05, "loss": 0.1275, "num_input_tokens_seen": 64005008, "step": 29655 }, { "epoch": 4.838499184339315, "grad_norm": 0.658086895942688, "learning_rate": 4.6994666588626125e-05, "loss": 0.1122, "num_input_tokens_seen": 64014448, "step": 29660 }, { "epoch": 4.83931484502447, "grad_norm": 0.3465605080127716, "learning_rate": 4.6992974532672116e-05, "loss": 0.0409, "num_input_tokens_seen": 64026672, "step": 29665 }, { "epoch": 4.840130505709625, "grad_norm": 0.2045782506465912, "learning_rate": 4.6991282031002594e-05, "loss": 0.1465, "num_input_tokens_seen": 64036752, "step": 29670 }, { "epoch": 4.8409461663947795, "grad_norm": 0.3209049105644226, "learning_rate": 4.698958908365184e-05, "loss": 0.1829, "num_input_tokens_seen": 64047760, "step": 29675 }, { "epoch": 4.841761827079935, "grad_norm": 0.10265351086854935, "learning_rate": 4.698789569065416e-05, "loss": 0.2097, "num_input_tokens_seen": 64059664, "step": 29680 }, { "epoch": 4.84257748776509, "grad_norm": 1.7905243635177612, "learning_rate": 4.698620185204389e-05, "loss": 0.1418, "num_input_tokens_seen": 64071408, "step": 29685 }, { "epoch": 4.843393148450245, "grad_norm": 0.4016946256160736, "learning_rate": 4.698450756785534e-05, "loss": 0.1065, "num_input_tokens_seen": 64082448, "step": 29690 }, { "epoch": 4.8442088091354, "grad_norm": 0.9970803260803223, "learning_rate": 4.698281283812286e-05, "loss": 0.1118, "num_input_tokens_seen": 64094320, "step": 29695 }, { "epoch": 4.8450244698205545, "grad_norm": 0.3755565583705902, "learning_rate": 4.698111766288079e-05, "loss": 0.1594, "num_input_tokens_seen": 64105392, "step": 29700 }, { "epoch": 4.845840130505709, "grad_norm": 0.47169366478919983, "learning_rate": 4.6979422042163484e-05, "loss": 0.1659, "num_input_tokens_seen": 64117008, "step": 29705 }, { "epoch": 4.846655791190865, "grad_norm": 0.9917578101158142, "learning_rate": 4.697772597600531e-05, "loss": 0.4538, "num_input_tokens_seen": 64128016, "step": 29710 }, { "epoch": 4.84747145187602, "grad_norm": 0.9186831712722778, "learning_rate": 4.697602946444064e-05, "loss": 0.1365, "num_input_tokens_seen": 64138448, "step": 29715 }, { "epoch": 4.848287112561175, "grad_norm": 0.21812604367733002, "learning_rate": 4.697433250750385e-05, "loss": 0.032, "num_input_tokens_seen": 64148848, "step": 29720 }, { "epoch": 4.849102773246329, "grad_norm": 1.0977602005004883, "learning_rate": 4.6972635105229336e-05, "loss": 0.1908, "num_input_tokens_seen": 64159280, "step": 29725 }, { "epoch": 4.849918433931484, "grad_norm": 0.1758122593164444, "learning_rate": 4.6970937257651494e-05, "loss": 0.1398, "num_input_tokens_seen": 64170384, "step": 29730 }, { "epoch": 4.850734094616639, "grad_norm": 0.07112445682287216, "learning_rate": 4.696923896480474e-05, "loss": 0.203, "num_input_tokens_seen": 64180112, "step": 29735 }, { "epoch": 4.851549755301795, "grad_norm": 0.7839836478233337, "learning_rate": 4.696754022672348e-05, "loss": 0.102, "num_input_tokens_seen": 64190032, "step": 29740 }, { "epoch": 4.85236541598695, "grad_norm": 1.0353426933288574, "learning_rate": 4.696584104344216e-05, "loss": 0.1304, "num_input_tokens_seen": 64201936, "step": 29745 }, { "epoch": 4.853181076672104, "grad_norm": 0.8015941381454468, "learning_rate": 4.696414141499521e-05, "loss": 0.0741, "num_input_tokens_seen": 64212784, "step": 29750 }, { "epoch": 4.853996737357259, "grad_norm": 0.0733376294374466, "learning_rate": 4.696244134141706e-05, "loss": 0.088, "num_input_tokens_seen": 64224208, "step": 29755 }, { "epoch": 4.854812398042414, "grad_norm": 0.338302880525589, "learning_rate": 4.696074082274218e-05, "loss": 0.1128, "num_input_tokens_seen": 64235792, "step": 29760 }, { "epoch": 4.85562805872757, "grad_norm": 0.06975313276052475, "learning_rate": 4.695903985900503e-05, "loss": 0.1254, "num_input_tokens_seen": 64246512, "step": 29765 }, { "epoch": 4.856443719412725, "grad_norm": 0.22261974215507507, "learning_rate": 4.6957338450240074e-05, "loss": 0.0723, "num_input_tokens_seen": 64256624, "step": 29770 }, { "epoch": 4.857259380097879, "grad_norm": 0.4034563899040222, "learning_rate": 4.69556365964818e-05, "loss": 0.1419, "num_input_tokens_seen": 64266544, "step": 29775 }, { "epoch": 4.858075040783034, "grad_norm": 1.1216943264007568, "learning_rate": 4.69539342977647e-05, "loss": 0.2407, "num_input_tokens_seen": 64277904, "step": 29780 }, { "epoch": 4.858890701468189, "grad_norm": 0.18468371033668518, "learning_rate": 4.6952231554123275e-05, "loss": 0.0854, "num_input_tokens_seen": 64288464, "step": 29785 }, { "epoch": 4.859706362153344, "grad_norm": 0.17365999519824982, "learning_rate": 4.695052836559203e-05, "loss": 0.2489, "num_input_tokens_seen": 64299408, "step": 29790 }, { "epoch": 4.8605220228384995, "grad_norm": 2.3754758834838867, "learning_rate": 4.6948824732205474e-05, "loss": 0.2745, "num_input_tokens_seen": 64310224, "step": 29795 }, { "epoch": 4.861337683523654, "grad_norm": 0.9858028888702393, "learning_rate": 4.694712065399814e-05, "loss": 0.0888, "num_input_tokens_seen": 64321520, "step": 29800 }, { "epoch": 4.862153344208809, "grad_norm": 2.1564009189605713, "learning_rate": 4.694541613100457e-05, "loss": 0.2342, "num_input_tokens_seen": 64332880, "step": 29805 }, { "epoch": 4.862969004893964, "grad_norm": 0.21615248918533325, "learning_rate": 4.69437111632593e-05, "loss": 0.1025, "num_input_tokens_seen": 64342576, "step": 29810 }, { "epoch": 4.863784665579119, "grad_norm": 0.35065123438835144, "learning_rate": 4.6942005750796885e-05, "loss": 0.0863, "num_input_tokens_seen": 64353936, "step": 29815 }, { "epoch": 4.864600326264274, "grad_norm": 0.02212863229215145, "learning_rate": 4.6940299893651885e-05, "loss": 0.044, "num_input_tokens_seen": 64364592, "step": 29820 }, { "epoch": 4.865415986949429, "grad_norm": 0.32805106043815613, "learning_rate": 4.6938593591858884e-05, "loss": 0.071, "num_input_tokens_seen": 64375696, "step": 29825 }, { "epoch": 4.866231647634584, "grad_norm": 0.14153459668159485, "learning_rate": 4.693688684545244e-05, "loss": 0.0469, "num_input_tokens_seen": 64387440, "step": 29830 }, { "epoch": 4.867047308319739, "grad_norm": 0.09733761101961136, "learning_rate": 4.693517965446716e-05, "loss": 0.069, "num_input_tokens_seen": 64399088, "step": 29835 }, { "epoch": 4.867862969004894, "grad_norm": 0.2635699212551117, "learning_rate": 4.693347201893765e-05, "loss": 0.3376, "num_input_tokens_seen": 64409712, "step": 29840 }, { "epoch": 4.868678629690049, "grad_norm": 0.2162070870399475, "learning_rate": 4.693176393889849e-05, "loss": 0.2265, "num_input_tokens_seen": 64421200, "step": 29845 }, { "epoch": 4.869494290375204, "grad_norm": 0.30295559763908386, "learning_rate": 4.693005541438432e-05, "loss": 0.2079, "num_input_tokens_seen": 64431344, "step": 29850 }, { "epoch": 4.870309951060359, "grad_norm": 0.14088885486125946, "learning_rate": 4.6928346445429746e-05, "loss": 0.0464, "num_input_tokens_seen": 64442928, "step": 29855 }, { "epoch": 4.871125611745514, "grad_norm": 0.05903793126344681, "learning_rate": 4.692663703206942e-05, "loss": 0.1515, "num_input_tokens_seen": 64453968, "step": 29860 }, { "epoch": 4.871941272430669, "grad_norm": 0.046468447893857956, "learning_rate": 4.6924927174337984e-05, "loss": 0.0559, "num_input_tokens_seen": 64465008, "step": 29865 }, { "epoch": 4.872756933115824, "grad_norm": 0.0661139041185379, "learning_rate": 4.692321687227008e-05, "loss": 0.1502, "num_input_tokens_seen": 64474832, "step": 29870 }, { "epoch": 4.873572593800979, "grad_norm": 0.8462587594985962, "learning_rate": 4.692150612590037e-05, "loss": 0.0952, "num_input_tokens_seen": 64485776, "step": 29875 }, { "epoch": 4.874388254486134, "grad_norm": 0.2533830404281616, "learning_rate": 4.691979493526353e-05, "loss": 0.1898, "num_input_tokens_seen": 64497392, "step": 29880 }, { "epoch": 4.875203915171289, "grad_norm": 0.36355483531951904, "learning_rate": 4.6918083300394245e-05, "loss": 0.0385, "num_input_tokens_seen": 64508176, "step": 29885 }, { "epoch": 4.876019575856444, "grad_norm": 2.0088255405426025, "learning_rate": 4.691637122132719e-05, "loss": 0.2794, "num_input_tokens_seen": 64517808, "step": 29890 }, { "epoch": 4.876835236541599, "grad_norm": 1.9321030378341675, "learning_rate": 4.691465869809708e-05, "loss": 0.1699, "num_input_tokens_seen": 64527696, "step": 29895 }, { "epoch": 4.877650897226753, "grad_norm": 0.4438783526420593, "learning_rate": 4.69129457307386e-05, "loss": 0.0589, "num_input_tokens_seen": 64538448, "step": 29900 }, { "epoch": 4.878466557911908, "grad_norm": 1.0368682146072388, "learning_rate": 4.691123231928648e-05, "loss": 0.0836, "num_input_tokens_seen": 64548784, "step": 29905 }, { "epoch": 4.879282218597064, "grad_norm": 0.052334416657686234, "learning_rate": 4.690951846377544e-05, "loss": 0.0728, "num_input_tokens_seen": 64559152, "step": 29910 }, { "epoch": 4.880097879282219, "grad_norm": 0.20015360414981842, "learning_rate": 4.690780416424021e-05, "loss": 0.0682, "num_input_tokens_seen": 64570096, "step": 29915 }, { "epoch": 4.8809135399673735, "grad_norm": 0.03733723238110542, "learning_rate": 4.690608942071555e-05, "loss": 0.0931, "num_input_tokens_seen": 64581744, "step": 29920 }, { "epoch": 4.881729200652528, "grad_norm": 0.6121270656585693, "learning_rate": 4.690437423323618e-05, "loss": 0.1004, "num_input_tokens_seen": 64591856, "step": 29925 }, { "epoch": 4.882544861337683, "grad_norm": 0.04389798268675804, "learning_rate": 4.690265860183689e-05, "loss": 0.0257, "num_input_tokens_seen": 64602192, "step": 29930 }, { "epoch": 4.883360522022839, "grad_norm": 0.1435101181268692, "learning_rate": 4.690094252655244e-05, "loss": 0.0936, "num_input_tokens_seen": 64612976, "step": 29935 }, { "epoch": 4.884176182707994, "grad_norm": 0.3115079998970032, "learning_rate": 4.6899226007417594e-05, "loss": 0.0615, "num_input_tokens_seen": 64623984, "step": 29940 }, { "epoch": 4.8849918433931485, "grad_norm": 0.8116230368614197, "learning_rate": 4.689750904446717e-05, "loss": 0.0796, "num_input_tokens_seen": 64634992, "step": 29945 }, { "epoch": 4.885807504078303, "grad_norm": 0.4434505105018616, "learning_rate": 4.689579163773593e-05, "loss": 0.1591, "num_input_tokens_seen": 64645840, "step": 29950 }, { "epoch": 4.886623164763458, "grad_norm": 0.18131975829601288, "learning_rate": 4.68940737872587e-05, "loss": 0.1154, "num_input_tokens_seen": 64656016, "step": 29955 }, { "epoch": 4.887438825448614, "grad_norm": 0.15037135779857635, "learning_rate": 4.6892355493070295e-05, "loss": 0.0699, "num_input_tokens_seen": 64667760, "step": 29960 }, { "epoch": 4.888254486133769, "grad_norm": 0.888028621673584, "learning_rate": 4.689063675520553e-05, "loss": 0.1622, "num_input_tokens_seen": 64677872, "step": 29965 }, { "epoch": 4.8890701468189235, "grad_norm": 0.03358515352010727, "learning_rate": 4.688891757369924e-05, "loss": 0.0215, "num_input_tokens_seen": 64688784, "step": 29970 }, { "epoch": 4.889885807504078, "grad_norm": 0.5088986754417419, "learning_rate": 4.688719794858627e-05, "loss": 0.1917, "num_input_tokens_seen": 64698576, "step": 29975 }, { "epoch": 4.890701468189233, "grad_norm": 0.060089703649282455, "learning_rate": 4.6885477879901465e-05, "loss": 0.0202, "num_input_tokens_seen": 64709520, "step": 29980 }, { "epoch": 4.891517128874388, "grad_norm": 0.573937714099884, "learning_rate": 4.688375736767969e-05, "loss": 0.0784, "num_input_tokens_seen": 64720592, "step": 29985 }, { "epoch": 4.892332789559543, "grad_norm": 1.4764937162399292, "learning_rate": 4.688203641195581e-05, "loss": 0.1626, "num_input_tokens_seen": 64731568, "step": 29990 }, { "epoch": 4.8931484502446985, "grad_norm": 2.1597371101379395, "learning_rate": 4.68803150127647e-05, "loss": 0.1612, "num_input_tokens_seen": 64742256, "step": 29995 }, { "epoch": 4.893964110929853, "grad_norm": 0.16540758311748505, "learning_rate": 4.687859317014125e-05, "loss": 0.0328, "num_input_tokens_seen": 64752208, "step": 30000 }, { "epoch": 4.894779771615008, "grad_norm": 0.09709916263818741, "learning_rate": 4.6876870884120356e-05, "loss": 0.148, "num_input_tokens_seen": 64761168, "step": 30005 }, { "epoch": 4.895595432300163, "grad_norm": 0.08936046808958054, "learning_rate": 4.687514815473692e-05, "loss": 0.0824, "num_input_tokens_seen": 64773040, "step": 30010 }, { "epoch": 4.896411092985318, "grad_norm": 0.4213910400867462, "learning_rate": 4.687342498202585e-05, "loss": 0.2404, "num_input_tokens_seen": 64785584, "step": 30015 }, { "epoch": 4.897226753670473, "grad_norm": 0.12143343687057495, "learning_rate": 4.687170136602208e-05, "loss": 0.109, "num_input_tokens_seen": 64797648, "step": 30020 }, { "epoch": 4.898042414355628, "grad_norm": 2.2951152324676514, "learning_rate": 4.686997730676054e-05, "loss": 0.0789, "num_input_tokens_seen": 64808912, "step": 30025 }, { "epoch": 4.898858075040783, "grad_norm": 0.15489932894706726, "learning_rate": 4.6868252804276156e-05, "loss": 0.2365, "num_input_tokens_seen": 64820336, "step": 30030 }, { "epoch": 4.899673735725938, "grad_norm": 0.033541131764650345, "learning_rate": 4.6866527858603895e-05, "loss": 0.0653, "num_input_tokens_seen": 64831600, "step": 30035 }, { "epoch": 4.900489396411093, "grad_norm": 0.4313296377658844, "learning_rate": 4.686480246977871e-05, "loss": 0.0461, "num_input_tokens_seen": 64842000, "step": 30040 }, { "epoch": 4.901305057096248, "grad_norm": 0.47934186458587646, "learning_rate": 4.686307663783556e-05, "loss": 0.1406, "num_input_tokens_seen": 64853456, "step": 30045 }, { "epoch": 4.902120717781403, "grad_norm": 0.39990487694740295, "learning_rate": 4.686135036280943e-05, "loss": 0.0837, "num_input_tokens_seen": 64863952, "step": 30050 }, { "epoch": 4.902936378466558, "grad_norm": 0.7886455655097961, "learning_rate": 4.685962364473529e-05, "loss": 0.1865, "num_input_tokens_seen": 64874128, "step": 30055 }, { "epoch": 4.903752039151713, "grad_norm": 0.10893242061138153, "learning_rate": 4.6857896483648155e-05, "loss": 0.0538, "num_input_tokens_seen": 64883856, "step": 30060 }, { "epoch": 4.904567699836868, "grad_norm": 1.380958914756775, "learning_rate": 4.685616887958302e-05, "loss": 0.1421, "num_input_tokens_seen": 64894736, "step": 30065 }, { "epoch": 4.9053833605220225, "grad_norm": 0.1148778572678566, "learning_rate": 4.68544408325749e-05, "loss": 0.0764, "num_input_tokens_seen": 64905808, "step": 30070 }, { "epoch": 4.906199021207177, "grad_norm": 0.8598161339759827, "learning_rate": 4.68527123426588e-05, "loss": 0.2387, "num_input_tokens_seen": 64917104, "step": 30075 }, { "epoch": 4.907014681892333, "grad_norm": 0.22455595433712006, "learning_rate": 4.685098340986977e-05, "loss": 0.0304, "num_input_tokens_seen": 64926800, "step": 30080 }, { "epoch": 4.907830342577488, "grad_norm": 0.05468887835741043, "learning_rate": 4.6849254034242845e-05, "loss": 0.1174, "num_input_tokens_seen": 64937904, "step": 30085 }, { "epoch": 4.908646003262643, "grad_norm": 0.01827608421444893, "learning_rate": 4.6847524215813065e-05, "loss": 0.2566, "num_input_tokens_seen": 64948592, "step": 30090 }, { "epoch": 4.9094616639477975, "grad_norm": 0.3342641592025757, "learning_rate": 4.684579395461549e-05, "loss": 0.071, "num_input_tokens_seen": 64959152, "step": 30095 }, { "epoch": 4.910277324632952, "grad_norm": 0.06854454427957535, "learning_rate": 4.684406325068519e-05, "loss": 0.3801, "num_input_tokens_seen": 64970736, "step": 30100 }, { "epoch": 4.911092985318108, "grad_norm": 0.903201162815094, "learning_rate": 4.6842332104057237e-05, "loss": 0.1499, "num_input_tokens_seen": 64982032, "step": 30105 }, { "epoch": 4.911908646003263, "grad_norm": 0.20803788304328918, "learning_rate": 4.684060051476671e-05, "loss": 0.0743, "num_input_tokens_seen": 64994448, "step": 30110 }, { "epoch": 4.912724306688418, "grad_norm": 0.21488440036773682, "learning_rate": 4.683886848284871e-05, "loss": 0.1433, "num_input_tokens_seen": 65005744, "step": 30115 }, { "epoch": 4.9135399673735725, "grad_norm": 0.5063382387161255, "learning_rate": 4.683713600833833e-05, "loss": 0.0434, "num_input_tokens_seen": 65016848, "step": 30120 }, { "epoch": 4.914355628058727, "grad_norm": 0.6384249329566956, "learning_rate": 4.6835403091270704e-05, "loss": 0.1816, "num_input_tokens_seen": 65027600, "step": 30125 }, { "epoch": 4.915171288743883, "grad_norm": 0.3490970730781555, "learning_rate": 4.683366973168092e-05, "loss": 0.1037, "num_input_tokens_seen": 65039120, "step": 30130 }, { "epoch": 4.915986949429038, "grad_norm": 1.2093321084976196, "learning_rate": 4.683193592960412e-05, "loss": 0.0752, "num_input_tokens_seen": 65050512, "step": 30135 }, { "epoch": 4.916802610114193, "grad_norm": 1.6190673112869263, "learning_rate": 4.683020168507545e-05, "loss": 0.1931, "num_input_tokens_seen": 65059888, "step": 30140 }, { "epoch": 4.917618270799347, "grad_norm": 0.3072167932987213, "learning_rate": 4.6828466998130046e-05, "loss": 0.076, "num_input_tokens_seen": 65071120, "step": 30145 }, { "epoch": 4.918433931484502, "grad_norm": 0.14025457203388214, "learning_rate": 4.6826731868803064e-05, "loss": 0.104, "num_input_tokens_seen": 65081744, "step": 30150 }, { "epoch": 4.919249592169657, "grad_norm": 0.1518782377243042, "learning_rate": 4.682499629712967e-05, "loss": 0.0838, "num_input_tokens_seen": 65093040, "step": 30155 }, { "epoch": 4.920065252854813, "grad_norm": 0.04298636317253113, "learning_rate": 4.682326028314504e-05, "loss": 0.2159, "num_input_tokens_seen": 65103056, "step": 30160 }, { "epoch": 4.920880913539968, "grad_norm": 0.36339080333709717, "learning_rate": 4.682152382688436e-05, "loss": 0.0367, "num_input_tokens_seen": 65113904, "step": 30165 }, { "epoch": 4.921696574225122, "grad_norm": 1.1975425481796265, "learning_rate": 4.6819786928382815e-05, "loss": 0.1838, "num_input_tokens_seen": 65124016, "step": 30170 }, { "epoch": 4.922512234910277, "grad_norm": 0.16066201031208038, "learning_rate": 4.681804958767561e-05, "loss": 0.0193, "num_input_tokens_seen": 65135120, "step": 30175 }, { "epoch": 4.923327895595432, "grad_norm": 0.3650929033756256, "learning_rate": 4.681631180479795e-05, "loss": 0.1732, "num_input_tokens_seen": 65144784, "step": 30180 }, { "epoch": 4.924143556280587, "grad_norm": 0.6397395133972168, "learning_rate": 4.681457357978506e-05, "loss": 0.1511, "num_input_tokens_seen": 65154928, "step": 30185 }, { "epoch": 4.924959216965743, "grad_norm": 0.2131216675043106, "learning_rate": 4.681283491267215e-05, "loss": 0.0376, "num_input_tokens_seen": 65165808, "step": 30190 }, { "epoch": 4.925774877650897, "grad_norm": 0.46770238876342773, "learning_rate": 4.6811095803494474e-05, "loss": 0.1537, "num_input_tokens_seen": 65175952, "step": 30195 }, { "epoch": 4.926590538336052, "grad_norm": 0.33575019240379333, "learning_rate": 4.6809356252287285e-05, "loss": 0.1461, "num_input_tokens_seen": 65185904, "step": 30200 }, { "epoch": 4.927406199021207, "grad_norm": 0.12662319839000702, "learning_rate": 4.680761625908581e-05, "loss": 0.0879, "num_input_tokens_seen": 65196848, "step": 30205 }, { "epoch": 4.928221859706362, "grad_norm": 0.17982237040996552, "learning_rate": 4.680587582392533e-05, "loss": 0.2163, "num_input_tokens_seen": 65207440, "step": 30210 }, { "epoch": 4.9290375203915175, "grad_norm": 0.1103384867310524, "learning_rate": 4.680413494684112e-05, "loss": 0.1203, "num_input_tokens_seen": 65217904, "step": 30215 }, { "epoch": 4.929853181076672, "grad_norm": 0.2875140309333801, "learning_rate": 4.680239362786845e-05, "loss": 0.0489, "num_input_tokens_seen": 65228464, "step": 30220 }, { "epoch": 4.930668841761827, "grad_norm": 1.9780782461166382, "learning_rate": 4.680065186704261e-05, "loss": 0.244, "num_input_tokens_seen": 65239024, "step": 30225 }, { "epoch": 4.931484502446982, "grad_norm": 2.019218921661377, "learning_rate": 4.67989096643989e-05, "loss": 0.1004, "num_input_tokens_seen": 65249872, "step": 30230 }, { "epoch": 4.932300163132137, "grad_norm": 0.14754198491573334, "learning_rate": 4.679716701997265e-05, "loss": 0.2284, "num_input_tokens_seen": 65260560, "step": 30235 }, { "epoch": 4.933115823817292, "grad_norm": 0.502670168876648, "learning_rate": 4.6795423933799145e-05, "loss": 0.0501, "num_input_tokens_seen": 65272016, "step": 30240 }, { "epoch": 4.933931484502447, "grad_norm": 0.2869475483894348, "learning_rate": 4.679368040591373e-05, "loss": 0.0354, "num_input_tokens_seen": 65281136, "step": 30245 }, { "epoch": 4.934747145187602, "grad_norm": 1.4889681339263916, "learning_rate": 4.679193643635173e-05, "loss": 0.1326, "num_input_tokens_seen": 65290640, "step": 30250 }, { "epoch": 4.935562805872757, "grad_norm": 1.792935848236084, "learning_rate": 4.679019202514849e-05, "loss": 0.3315, "num_input_tokens_seen": 65302032, "step": 30255 }, { "epoch": 4.936378466557912, "grad_norm": 0.30637162923812866, "learning_rate": 4.678844717233938e-05, "loss": 0.2585, "num_input_tokens_seen": 65313168, "step": 30260 }, { "epoch": 4.937194127243067, "grad_norm": 0.29504188895225525, "learning_rate": 4.678670187795974e-05, "loss": 0.0222, "num_input_tokens_seen": 65324528, "step": 30265 }, { "epoch": 4.938009787928221, "grad_norm": 0.9409607648849487, "learning_rate": 4.6784956142044945e-05, "loss": 0.0652, "num_input_tokens_seen": 65335568, "step": 30270 }, { "epoch": 4.938825448613377, "grad_norm": 0.6361716985702515, "learning_rate": 4.6783209964630374e-05, "loss": 0.2265, "num_input_tokens_seen": 65346032, "step": 30275 }, { "epoch": 4.939641109298532, "grad_norm": 0.1043815165758133, "learning_rate": 4.6781463345751434e-05, "loss": 0.0906, "num_input_tokens_seen": 65357712, "step": 30280 }, { "epoch": 4.940456769983687, "grad_norm": 0.0337226428091526, "learning_rate": 4.6779716285443495e-05, "loss": 0.0693, "num_input_tokens_seen": 65369552, "step": 30285 }, { "epoch": 4.941272430668842, "grad_norm": 0.7715280055999756, "learning_rate": 4.677796878374198e-05, "loss": 0.1804, "num_input_tokens_seen": 65380912, "step": 30290 }, { "epoch": 4.942088091353996, "grad_norm": 0.973589301109314, "learning_rate": 4.6776220840682297e-05, "loss": 0.3507, "num_input_tokens_seen": 65392080, "step": 30295 }, { "epoch": 4.942903752039152, "grad_norm": 0.050191547721624374, "learning_rate": 4.6774472456299875e-05, "loss": 0.0239, "num_input_tokens_seen": 65402256, "step": 30300 }, { "epoch": 4.943719412724307, "grad_norm": 1.4903210401535034, "learning_rate": 4.677272363063015e-05, "loss": 0.2196, "num_input_tokens_seen": 65413456, "step": 30305 }, { "epoch": 4.944535073409462, "grad_norm": 0.726422131061554, "learning_rate": 4.6770974363708553e-05, "loss": 0.1378, "num_input_tokens_seen": 65424912, "step": 30310 }, { "epoch": 4.945350734094617, "grad_norm": 0.14942707121372223, "learning_rate": 4.676922465557054e-05, "loss": 0.0188, "num_input_tokens_seen": 65434896, "step": 30315 }, { "epoch": 4.946166394779771, "grad_norm": 0.47937288880348206, "learning_rate": 4.676747450625159e-05, "loss": 0.0811, "num_input_tokens_seen": 65445424, "step": 30320 }, { "epoch": 4.946982055464927, "grad_norm": 0.18923023343086243, "learning_rate": 4.676572391578714e-05, "loss": 0.126, "num_input_tokens_seen": 65454672, "step": 30325 }, { "epoch": 4.947797716150082, "grad_norm": 0.07603135704994202, "learning_rate": 4.6763972884212684e-05, "loss": 0.0938, "num_input_tokens_seen": 65465776, "step": 30330 }, { "epoch": 4.948613376835237, "grad_norm": 1.8483303785324097, "learning_rate": 4.676222141156371e-05, "loss": 0.3387, "num_input_tokens_seen": 65477168, "step": 30335 }, { "epoch": 4.9494290375203915, "grad_norm": 1.3058111667633057, "learning_rate": 4.6760469497875716e-05, "loss": 0.0524, "num_input_tokens_seen": 65487248, "step": 30340 }, { "epoch": 4.950244698205546, "grad_norm": 0.27139562368392944, "learning_rate": 4.6758717143184194e-05, "loss": 0.0238, "num_input_tokens_seen": 65497360, "step": 30345 }, { "epoch": 4.951060358890701, "grad_norm": 0.4964219331741333, "learning_rate": 4.675696434752467e-05, "loss": 0.1149, "num_input_tokens_seen": 65507728, "step": 30350 }, { "epoch": 4.951876019575856, "grad_norm": 0.3774961233139038, "learning_rate": 4.6755211110932665e-05, "loss": 0.1061, "num_input_tokens_seen": 65517104, "step": 30355 }, { "epoch": 4.952691680261012, "grad_norm": 1.2868443727493286, "learning_rate": 4.6753457433443715e-05, "loss": 0.179, "num_input_tokens_seen": 65529168, "step": 30360 }, { "epoch": 4.9535073409461665, "grad_norm": 1.3595914840698242, "learning_rate": 4.6751703315093346e-05, "loss": 0.1019, "num_input_tokens_seen": 65538960, "step": 30365 }, { "epoch": 4.954323001631321, "grad_norm": 0.021685350686311722, "learning_rate": 4.6749948755917114e-05, "loss": 0.0445, "num_input_tokens_seen": 65549392, "step": 30370 }, { "epoch": 4.955138662316476, "grad_norm": 1.4207369089126587, "learning_rate": 4.6748193755950574e-05, "loss": 0.1218, "num_input_tokens_seen": 65559824, "step": 30375 }, { "epoch": 4.955954323001631, "grad_norm": 1.0508580207824707, "learning_rate": 4.6746438315229316e-05, "loss": 0.1596, "num_input_tokens_seen": 65570896, "step": 30380 }, { "epoch": 4.956769983686787, "grad_norm": 0.12840919196605682, "learning_rate": 4.674468243378888e-05, "loss": 0.1203, "num_input_tokens_seen": 65582384, "step": 30385 }, { "epoch": 4.9575856443719415, "grad_norm": 1.7832127809524536, "learning_rate": 4.6742926111664876e-05, "loss": 0.3107, "num_input_tokens_seen": 65593264, "step": 30390 }, { "epoch": 4.958401305057096, "grad_norm": 0.4867597818374634, "learning_rate": 4.6741169348892885e-05, "loss": 0.279, "num_input_tokens_seen": 65604016, "step": 30395 }, { "epoch": 4.959216965742251, "grad_norm": 0.8900495767593384, "learning_rate": 4.673941214550853e-05, "loss": 0.0961, "num_input_tokens_seen": 65613616, "step": 30400 }, { "epoch": 4.960032626427406, "grad_norm": 0.6721974015235901, "learning_rate": 4.6737654501547404e-05, "loss": 0.1991, "num_input_tokens_seen": 65624816, "step": 30405 }, { "epoch": 4.960848287112562, "grad_norm": 0.7850514650344849, "learning_rate": 4.673589641704513e-05, "loss": 0.2204, "num_input_tokens_seen": 65635792, "step": 30410 }, { "epoch": 4.9616639477977165, "grad_norm": 0.4298832416534424, "learning_rate": 4.6734137892037344e-05, "loss": 0.203, "num_input_tokens_seen": 65646512, "step": 30415 }, { "epoch": 4.962479608482871, "grad_norm": 0.6716561913490295, "learning_rate": 4.673237892655968e-05, "loss": 0.1228, "num_input_tokens_seen": 65656976, "step": 30420 }, { "epoch": 4.963295269168026, "grad_norm": 1.3639144897460938, "learning_rate": 4.673061952064779e-05, "loss": 0.1329, "num_input_tokens_seen": 65667792, "step": 30425 }, { "epoch": 4.964110929853181, "grad_norm": 1.1086128950119019, "learning_rate": 4.672885967433732e-05, "loss": 0.1327, "num_input_tokens_seen": 65679088, "step": 30430 }, { "epoch": 4.964926590538336, "grad_norm": 1.119969129562378, "learning_rate": 4.672709938766395e-05, "loss": 0.0978, "num_input_tokens_seen": 65689840, "step": 30435 }, { "epoch": 4.9657422512234906, "grad_norm": 0.7378960847854614, "learning_rate": 4.672533866066335e-05, "loss": 0.1558, "num_input_tokens_seen": 65700528, "step": 30440 }, { "epoch": 4.966557911908646, "grad_norm": 0.1566161811351776, "learning_rate": 4.6723577493371204e-05, "loss": 0.0589, "num_input_tokens_seen": 65710128, "step": 30445 }, { "epoch": 4.967373572593801, "grad_norm": 0.24900181591510773, "learning_rate": 4.672181588582319e-05, "loss": 0.2357, "num_input_tokens_seen": 65722064, "step": 30450 }, { "epoch": 4.968189233278956, "grad_norm": 0.4738323390483856, "learning_rate": 4.672005383805503e-05, "loss": 0.0621, "num_input_tokens_seen": 65732336, "step": 30455 }, { "epoch": 4.969004893964111, "grad_norm": 0.14732223749160767, "learning_rate": 4.6718291350102426e-05, "loss": 0.0723, "num_input_tokens_seen": 65743504, "step": 30460 }, { "epoch": 4.9698205546492655, "grad_norm": 0.535232663154602, "learning_rate": 4.67165284220011e-05, "loss": 0.1398, "num_input_tokens_seen": 65754608, "step": 30465 }, { "epoch": 4.970636215334421, "grad_norm": 0.8742254972457886, "learning_rate": 4.671476505378677e-05, "loss": 0.1578, "num_input_tokens_seen": 65765584, "step": 30470 }, { "epoch": 4.971451876019576, "grad_norm": 0.08087541162967682, "learning_rate": 4.671300124549518e-05, "loss": 0.2573, "num_input_tokens_seen": 65776816, "step": 30475 }, { "epoch": 4.972267536704731, "grad_norm": 1.4022300243377686, "learning_rate": 4.671123699716207e-05, "loss": 0.2014, "num_input_tokens_seen": 65788336, "step": 30480 }, { "epoch": 4.973083197389886, "grad_norm": 0.036545127630233765, "learning_rate": 4.670947230882321e-05, "loss": 0.2302, "num_input_tokens_seen": 65798704, "step": 30485 }, { "epoch": 4.9738988580750405, "grad_norm": 0.494326114654541, "learning_rate": 4.670770718051434e-05, "loss": 0.1115, "num_input_tokens_seen": 65808816, "step": 30490 }, { "epoch": 4.974714518760196, "grad_norm": 0.6960942149162292, "learning_rate": 4.6705941612271256e-05, "loss": 0.105, "num_input_tokens_seen": 65819088, "step": 30495 }, { "epoch": 4.975530179445351, "grad_norm": 2.0242743492126465, "learning_rate": 4.6704175604129727e-05, "loss": 0.2612, "num_input_tokens_seen": 65830256, "step": 30500 }, { "epoch": 4.976345840130506, "grad_norm": 0.03989420458674431, "learning_rate": 4.6702409156125546e-05, "loss": 0.1212, "num_input_tokens_seen": 65839792, "step": 30505 }, { "epoch": 4.977161500815661, "grad_norm": 0.35576727986335754, "learning_rate": 4.670064226829451e-05, "loss": 0.0474, "num_input_tokens_seen": 65850512, "step": 30510 }, { "epoch": 4.9779771615008155, "grad_norm": 0.2286641001701355, "learning_rate": 4.669887494067243e-05, "loss": 0.1072, "num_input_tokens_seen": 65861584, "step": 30515 }, { "epoch": 4.97879282218597, "grad_norm": 0.5687227249145508, "learning_rate": 4.6697107173295124e-05, "loss": 0.3088, "num_input_tokens_seen": 65872880, "step": 30520 }, { "epoch": 4.979608482871125, "grad_norm": 0.1305389553308487, "learning_rate": 4.669533896619841e-05, "loss": 0.1268, "num_input_tokens_seen": 65882544, "step": 30525 }, { "epoch": 4.980424143556281, "grad_norm": 0.1646229326725006, "learning_rate": 4.669357031941813e-05, "loss": 0.0411, "num_input_tokens_seen": 65893456, "step": 30530 }, { "epoch": 4.981239804241436, "grad_norm": 0.35719355940818787, "learning_rate": 4.669180123299014e-05, "loss": 0.0853, "num_input_tokens_seen": 65905264, "step": 30535 }, { "epoch": 4.9820554649265905, "grad_norm": 1.7877925634384155, "learning_rate": 4.6690031706950266e-05, "loss": 0.2819, "num_input_tokens_seen": 65916400, "step": 30540 }, { "epoch": 4.982871125611745, "grad_norm": 0.28815191984176636, "learning_rate": 4.6688261741334394e-05, "loss": 0.12, "num_input_tokens_seen": 65928272, "step": 30545 }, { "epoch": 4.9836867862969, "grad_norm": 0.6838453412055969, "learning_rate": 4.668649133617838e-05, "loss": 0.069, "num_input_tokens_seen": 65939792, "step": 30550 }, { "epoch": 4.984502446982056, "grad_norm": 0.14744532108306885, "learning_rate": 4.668472049151811e-05, "loss": 0.1114, "num_input_tokens_seen": 65950576, "step": 30555 }, { "epoch": 4.985318107667211, "grad_norm": 1.4554698467254639, "learning_rate": 4.6682949207389465e-05, "loss": 0.2068, "num_input_tokens_seen": 65960240, "step": 30560 }, { "epoch": 4.986133768352365, "grad_norm": 0.7556349039077759, "learning_rate": 4.668117748382835e-05, "loss": 0.1364, "num_input_tokens_seen": 65970544, "step": 30565 }, { "epoch": 4.98694942903752, "grad_norm": 0.11761646717786789, "learning_rate": 4.667940532087067e-05, "loss": 0.0247, "num_input_tokens_seen": 65982768, "step": 30570 }, { "epoch": 4.987765089722675, "grad_norm": 0.12840907275676727, "learning_rate": 4.6677632718552336e-05, "loss": 0.0703, "num_input_tokens_seen": 65993680, "step": 30575 }, { "epoch": 4.988580750407831, "grad_norm": 0.9820466041564941, "learning_rate": 4.667585967690927e-05, "loss": 0.1996, "num_input_tokens_seen": 66005136, "step": 30580 }, { "epoch": 4.989396411092986, "grad_norm": 1.215078592300415, "learning_rate": 4.667408619597742e-05, "loss": 0.1623, "num_input_tokens_seen": 66015376, "step": 30585 }, { "epoch": 4.99021207177814, "grad_norm": 0.3217698633670807, "learning_rate": 4.667231227579272e-05, "loss": 0.0449, "num_input_tokens_seen": 66025904, "step": 30590 }, { "epoch": 4.991027732463295, "grad_norm": 0.23447297513484955, "learning_rate": 4.667053791639111e-05, "loss": 0.1501, "num_input_tokens_seen": 66036560, "step": 30595 }, { "epoch": 4.99184339314845, "grad_norm": 0.7710341811180115, "learning_rate": 4.6668763117808556e-05, "loss": 0.1839, "num_input_tokens_seen": 66047856, "step": 30600 }, { "epoch": 4.992659053833605, "grad_norm": 0.11134620755910873, "learning_rate": 4.666698788008104e-05, "loss": 0.195, "num_input_tokens_seen": 66058736, "step": 30605 }, { "epoch": 4.993474714518761, "grad_norm": 0.8802415132522583, "learning_rate": 4.666521220324452e-05, "loss": 0.0919, "num_input_tokens_seen": 66068848, "step": 30610 }, { "epoch": 4.994290375203915, "grad_norm": 1.6640853881835938, "learning_rate": 4.666343608733499e-05, "loss": 0.1083, "num_input_tokens_seen": 66079760, "step": 30615 }, { "epoch": 4.99510603588907, "grad_norm": 0.07448239624500275, "learning_rate": 4.6661659532388446e-05, "loss": 0.1046, "num_input_tokens_seen": 66090128, "step": 30620 }, { "epoch": 4.995921696574225, "grad_norm": 1.7054840326309204, "learning_rate": 4.665988253844089e-05, "loss": 0.155, "num_input_tokens_seen": 66100304, "step": 30625 }, { "epoch": 4.99673735725938, "grad_norm": 1.056498408317566, "learning_rate": 4.6658105105528345e-05, "loss": 0.2654, "num_input_tokens_seen": 66111696, "step": 30630 }, { "epoch": 4.997553017944535, "grad_norm": 1.3935364484786987, "learning_rate": 4.665632723368682e-05, "loss": 0.2392, "num_input_tokens_seen": 66122064, "step": 30635 }, { "epoch": 4.99836867862969, "grad_norm": 0.7285448908805847, "learning_rate": 4.6654548922952344e-05, "loss": 0.038, "num_input_tokens_seen": 66131376, "step": 30640 }, { "epoch": 4.999184339314845, "grad_norm": 0.15490186214447021, "learning_rate": 4.665277017336097e-05, "loss": 0.2036, "num_input_tokens_seen": 66142800, "step": 30645 }, { "epoch": 5.0, "grad_norm": 0.16159304976463318, "learning_rate": 4.6650990984948746e-05, "loss": 0.1351, "num_input_tokens_seen": 66152480, "step": 30650 }, { "epoch": 5.0, "eval_loss": 0.13557404279708862, "eval_runtime": 90.5503, "eval_samples_per_second": 30.094, "eval_steps_per_second": 7.532, "num_input_tokens_seen": 66152480, "step": 30650 }, { "epoch": 5.000815660685155, "grad_norm": 0.6327893137931824, "learning_rate": 4.6649211357751715e-05, "loss": 0.0471, "num_input_tokens_seen": 66162912, "step": 30655 }, { "epoch": 5.00163132137031, "grad_norm": 0.8737130761146545, "learning_rate": 4.6647431291805955e-05, "loss": 0.1245, "num_input_tokens_seen": 66173312, "step": 30660 }, { "epoch": 5.002446982055465, "grad_norm": 0.28618210554122925, "learning_rate": 4.664565078714754e-05, "loss": 0.1163, "num_input_tokens_seen": 66184832, "step": 30665 }, { "epoch": 5.00326264274062, "grad_norm": 0.49705758690834045, "learning_rate": 4.6643869843812546e-05, "loss": 0.0685, "num_input_tokens_seen": 66196544, "step": 30670 }, { "epoch": 5.004078303425775, "grad_norm": 0.07729071378707886, "learning_rate": 4.664208846183708e-05, "loss": 0.0441, "num_input_tokens_seen": 66207904, "step": 30675 }, { "epoch": 5.00489396411093, "grad_norm": 0.194576233625412, "learning_rate": 4.6640306641257234e-05, "loss": 0.1, "num_input_tokens_seen": 66218752, "step": 30680 }, { "epoch": 5.005709624796085, "grad_norm": 0.1559964120388031, "learning_rate": 4.6638524382109115e-05, "loss": 0.0716, "num_input_tokens_seen": 66230528, "step": 30685 }, { "epoch": 5.006525285481239, "grad_norm": 1.3069568872451782, "learning_rate": 4.663674168442885e-05, "loss": 0.102, "num_input_tokens_seen": 66240768, "step": 30690 }, { "epoch": 5.007340946166395, "grad_norm": 1.9918864965438843, "learning_rate": 4.663495854825257e-05, "loss": 0.1778, "num_input_tokens_seen": 66252672, "step": 30695 }, { "epoch": 5.00815660685155, "grad_norm": 0.5340691804885864, "learning_rate": 4.663317497361641e-05, "loss": 0.1099, "num_input_tokens_seen": 66263520, "step": 30700 }, { "epoch": 5.008972267536705, "grad_norm": 0.04858335107564926, "learning_rate": 4.663139096055652e-05, "loss": 0.0314, "num_input_tokens_seen": 66274752, "step": 30705 }, { "epoch": 5.00978792822186, "grad_norm": 0.5591846108436584, "learning_rate": 4.6629606509109046e-05, "loss": 0.349, "num_input_tokens_seen": 66285440, "step": 30710 }, { "epoch": 5.010603588907014, "grad_norm": 0.3813514709472656, "learning_rate": 4.662782161931015e-05, "loss": 0.0262, "num_input_tokens_seen": 66295040, "step": 30715 }, { "epoch": 5.011419249592169, "grad_norm": 0.11312978714704514, "learning_rate": 4.6626036291196025e-05, "loss": 0.0311, "num_input_tokens_seen": 66304384, "step": 30720 }, { "epoch": 5.012234910277325, "grad_norm": 0.1965855360031128, "learning_rate": 4.6624250524802834e-05, "loss": 0.1431, "num_input_tokens_seen": 66315648, "step": 30725 }, { "epoch": 5.01305057096248, "grad_norm": 0.13545896112918854, "learning_rate": 4.662246432016677e-05, "loss": 0.0799, "num_input_tokens_seen": 66325664, "step": 30730 }, { "epoch": 5.013866231647635, "grad_norm": 0.528953492641449, "learning_rate": 4.6620677677324044e-05, "loss": 0.0617, "num_input_tokens_seen": 66336544, "step": 30735 }, { "epoch": 5.014681892332789, "grad_norm": 0.5287462472915649, "learning_rate": 4.661889059631085e-05, "loss": 0.0479, "num_input_tokens_seen": 66347360, "step": 30740 }, { "epoch": 5.015497553017944, "grad_norm": 0.32497531175613403, "learning_rate": 4.6617103077163413e-05, "loss": 0.058, "num_input_tokens_seen": 66356992, "step": 30745 }, { "epoch": 5.0163132137031, "grad_norm": 0.5522935390472412, "learning_rate": 4.6615315119917966e-05, "loss": 0.059, "num_input_tokens_seen": 66368032, "step": 30750 }, { "epoch": 5.017128874388255, "grad_norm": 0.38778242468833923, "learning_rate": 4.6613526724610735e-05, "loss": 0.2334, "num_input_tokens_seen": 66379424, "step": 30755 }, { "epoch": 5.0179445350734095, "grad_norm": 0.07001979649066925, "learning_rate": 4.6611737891277965e-05, "loss": 0.0501, "num_input_tokens_seen": 66390752, "step": 30760 }, { "epoch": 5.018760195758564, "grad_norm": 0.5980316400527954, "learning_rate": 4.660994861995591e-05, "loss": 0.1027, "num_input_tokens_seen": 66403200, "step": 30765 }, { "epoch": 5.019575856443719, "grad_norm": 0.03686553239822388, "learning_rate": 4.660815891068083e-05, "loss": 0.0748, "num_input_tokens_seen": 66412768, "step": 30770 }, { "epoch": 5.020391517128874, "grad_norm": 1.519347071647644, "learning_rate": 4.6606368763489e-05, "loss": 0.1493, "num_input_tokens_seen": 66424448, "step": 30775 }, { "epoch": 5.02120717781403, "grad_norm": 0.16549959778785706, "learning_rate": 4.660457817841669e-05, "loss": 0.2101, "num_input_tokens_seen": 66434400, "step": 30780 }, { "epoch": 5.0220228384991845, "grad_norm": 0.11867063492536545, "learning_rate": 4.6602787155500204e-05, "loss": 0.3295, "num_input_tokens_seen": 66445440, "step": 30785 }, { "epoch": 5.022838499184339, "grad_norm": 1.5397981405258179, "learning_rate": 4.660099569477583e-05, "loss": 0.2108, "num_input_tokens_seen": 66455776, "step": 30790 }, { "epoch": 5.023654159869494, "grad_norm": 0.10008013248443604, "learning_rate": 4.659920379627988e-05, "loss": 0.0142, "num_input_tokens_seen": 66467616, "step": 30795 }, { "epoch": 5.024469820554649, "grad_norm": 0.1716526746749878, "learning_rate": 4.659741146004866e-05, "loss": 0.1038, "num_input_tokens_seen": 66477056, "step": 30800 }, { "epoch": 5.025285481239805, "grad_norm": 0.15675172209739685, "learning_rate": 4.6595618686118494e-05, "loss": 0.1512, "num_input_tokens_seen": 66488352, "step": 30805 }, { "epoch": 5.0261011419249595, "grad_norm": 0.5425694584846497, "learning_rate": 4.659382547452572e-05, "loss": 0.1347, "num_input_tokens_seen": 66497824, "step": 30810 }, { "epoch": 5.026916802610114, "grad_norm": 0.33755698800086975, "learning_rate": 4.659203182530668e-05, "loss": 0.113, "num_input_tokens_seen": 66509920, "step": 30815 }, { "epoch": 5.027732463295269, "grad_norm": 0.5381590723991394, "learning_rate": 4.659023773849773e-05, "loss": 0.0543, "num_input_tokens_seen": 66519520, "step": 30820 }, { "epoch": 5.028548123980424, "grad_norm": 0.12120971083641052, "learning_rate": 4.658844321413522e-05, "loss": 0.063, "num_input_tokens_seen": 66529792, "step": 30825 }, { "epoch": 5.029363784665579, "grad_norm": 0.6566482782363892, "learning_rate": 4.6586648252255516e-05, "loss": 0.1701, "num_input_tokens_seen": 66540416, "step": 30830 }, { "epoch": 5.0301794453507345, "grad_norm": 0.714594304561615, "learning_rate": 4.658485285289501e-05, "loss": 0.0553, "num_input_tokens_seen": 66549280, "step": 30835 }, { "epoch": 5.030995106035889, "grad_norm": 0.21408547461032867, "learning_rate": 4.658305701609007e-05, "loss": 0.0381, "num_input_tokens_seen": 66560640, "step": 30840 }, { "epoch": 5.031810766721044, "grad_norm": 0.3480062186717987, "learning_rate": 4.6581260741877105e-05, "loss": 0.1568, "num_input_tokens_seen": 66571808, "step": 30845 }, { "epoch": 5.032626427406199, "grad_norm": 2.288520097732544, "learning_rate": 4.6579464030292505e-05, "loss": 0.2664, "num_input_tokens_seen": 66583424, "step": 30850 }, { "epoch": 5.033442088091354, "grad_norm": 1.2395212650299072, "learning_rate": 4.6577666881372706e-05, "loss": 0.1922, "num_input_tokens_seen": 66593984, "step": 30855 }, { "epoch": 5.034257748776509, "grad_norm": 0.33906275033950806, "learning_rate": 4.65758692951541e-05, "loss": 0.1216, "num_input_tokens_seen": 66604704, "step": 30860 }, { "epoch": 5.035073409461664, "grad_norm": 1.3158378601074219, "learning_rate": 4.657407127167314e-05, "loss": 0.2101, "num_input_tokens_seen": 66615808, "step": 30865 }, { "epoch": 5.035889070146819, "grad_norm": 1.0502896308898926, "learning_rate": 4.6572272810966255e-05, "loss": 0.0607, "num_input_tokens_seen": 66627008, "step": 30870 }, { "epoch": 5.036704730831974, "grad_norm": 0.16223156452178955, "learning_rate": 4.657047391306989e-05, "loss": 0.1077, "num_input_tokens_seen": 66638464, "step": 30875 }, { "epoch": 5.037520391517129, "grad_norm": 1.178747534751892, "learning_rate": 4.656867457802052e-05, "loss": 0.1049, "num_input_tokens_seen": 66648544, "step": 30880 }, { "epoch": 5.0383360522022835, "grad_norm": 0.09206897765398026, "learning_rate": 4.656687480585459e-05, "loss": 0.1288, "num_input_tokens_seen": 66660384, "step": 30885 }, { "epoch": 5.039151712887439, "grad_norm": 0.6340404152870178, "learning_rate": 4.656507459660858e-05, "loss": 0.1744, "num_input_tokens_seen": 66671136, "step": 30890 }, { "epoch": 5.039967373572594, "grad_norm": 1.3579363822937012, "learning_rate": 4.6563273950318974e-05, "loss": 0.18, "num_input_tokens_seen": 66681664, "step": 30895 }, { "epoch": 5.040783034257749, "grad_norm": 0.09411207586526871, "learning_rate": 4.656147286702227e-05, "loss": 0.0488, "num_input_tokens_seen": 66691968, "step": 30900 }, { "epoch": 5.041598694942904, "grad_norm": 0.6256566643714905, "learning_rate": 4.655967134675497e-05, "loss": 0.0863, "num_input_tokens_seen": 66703488, "step": 30905 }, { "epoch": 5.0424143556280585, "grad_norm": 0.13059760630130768, "learning_rate": 4.6557869389553575e-05, "loss": 0.0862, "num_input_tokens_seen": 66714304, "step": 30910 }, { "epoch": 5.043230016313213, "grad_norm": 0.4387310743331909, "learning_rate": 4.655606699545461e-05, "loss": 0.1644, "num_input_tokens_seen": 66724864, "step": 30915 }, { "epoch": 5.044045676998369, "grad_norm": 0.16584210097789764, "learning_rate": 4.65542641644946e-05, "loss": 0.1443, "num_input_tokens_seen": 66736736, "step": 30920 }, { "epoch": 5.044861337683524, "grad_norm": 0.4311765432357788, "learning_rate": 4.6552460896710084e-05, "loss": 0.2013, "num_input_tokens_seen": 66748128, "step": 30925 }, { "epoch": 5.045676998368679, "grad_norm": 0.09822637587785721, "learning_rate": 4.655065719213761e-05, "loss": 0.0335, "num_input_tokens_seen": 66759200, "step": 30930 }, { "epoch": 5.0464926590538335, "grad_norm": 0.2798604965209961, "learning_rate": 4.6548853050813726e-05, "loss": 0.1962, "num_input_tokens_seen": 66770720, "step": 30935 }, { "epoch": 5.047308319738988, "grad_norm": 1.120861291885376, "learning_rate": 4.6547048472775e-05, "loss": 0.1664, "num_input_tokens_seen": 66781312, "step": 30940 }, { "epoch": 5.048123980424143, "grad_norm": 0.5163009166717529, "learning_rate": 4.6545243458057997e-05, "loss": 0.0776, "num_input_tokens_seen": 66792864, "step": 30945 }, { "epoch": 5.048939641109299, "grad_norm": 1.1916285753250122, "learning_rate": 4.654343800669931e-05, "loss": 0.2206, "num_input_tokens_seen": 66802976, "step": 30950 }, { "epoch": 5.049755301794454, "grad_norm": 0.9042913317680359, "learning_rate": 4.6541632118735516e-05, "loss": 0.0707, "num_input_tokens_seen": 66812768, "step": 30955 }, { "epoch": 5.0505709624796085, "grad_norm": 0.27870240807533264, "learning_rate": 4.6539825794203224e-05, "loss": 0.0947, "num_input_tokens_seen": 66823072, "step": 30960 }, { "epoch": 5.051386623164763, "grad_norm": 1.5459007024765015, "learning_rate": 4.653801903313904e-05, "loss": 0.1085, "num_input_tokens_seen": 66835200, "step": 30965 }, { "epoch": 5.052202283849918, "grad_norm": 0.31412437558174133, "learning_rate": 4.653621183557957e-05, "loss": 0.0453, "num_input_tokens_seen": 66846528, "step": 30970 }, { "epoch": 5.053017944535074, "grad_norm": 0.7613025307655334, "learning_rate": 4.653440420156145e-05, "loss": 0.1576, "num_input_tokens_seen": 66858080, "step": 30975 }, { "epoch": 5.053833605220229, "grad_norm": 0.6877225637435913, "learning_rate": 4.653259613112131e-05, "loss": 0.1423, "num_input_tokens_seen": 66869888, "step": 30980 }, { "epoch": 5.054649265905383, "grad_norm": 1.9038989543914795, "learning_rate": 4.653078762429579e-05, "loss": 0.1919, "num_input_tokens_seen": 66880256, "step": 30985 }, { "epoch": 5.055464926590538, "grad_norm": 0.7208256125450134, "learning_rate": 4.652897868112155e-05, "loss": 0.1272, "num_input_tokens_seen": 66890784, "step": 30990 }, { "epoch": 5.056280587275693, "grad_norm": 0.2929104268550873, "learning_rate": 4.652716930163524e-05, "loss": 0.0442, "num_input_tokens_seen": 66901920, "step": 30995 }, { "epoch": 5.057096247960848, "grad_norm": 0.5476671457290649, "learning_rate": 4.652535948587354e-05, "loss": 0.1608, "num_input_tokens_seen": 66912608, "step": 31000 }, { "epoch": 5.057911908646004, "grad_norm": 0.15858589112758636, "learning_rate": 4.6523549233873125e-05, "loss": 0.1491, "num_input_tokens_seen": 66922336, "step": 31005 }, { "epoch": 5.058727569331158, "grad_norm": 0.501876175403595, "learning_rate": 4.6521738545670676e-05, "loss": 0.2498, "num_input_tokens_seen": 66932960, "step": 31010 }, { "epoch": 5.059543230016313, "grad_norm": 0.42598405480384827, "learning_rate": 4.65199274213029e-05, "loss": 0.1662, "num_input_tokens_seen": 66944448, "step": 31015 }, { "epoch": 5.060358890701468, "grad_norm": 0.3088875710964203, "learning_rate": 4.651811586080649e-05, "loss": 0.0584, "num_input_tokens_seen": 66955264, "step": 31020 }, { "epoch": 5.061174551386623, "grad_norm": 0.20174486935138702, "learning_rate": 4.6516303864218164e-05, "loss": 0.0478, "num_input_tokens_seen": 66965024, "step": 31025 }, { "epoch": 5.061990212071779, "grad_norm": 0.1379709243774414, "learning_rate": 4.651449143157465e-05, "loss": 0.1451, "num_input_tokens_seen": 66975072, "step": 31030 }, { "epoch": 5.062805872756933, "grad_norm": 0.05872043967247009, "learning_rate": 4.651267856291266e-05, "loss": 0.1362, "num_input_tokens_seen": 66987232, "step": 31035 }, { "epoch": 5.063621533442088, "grad_norm": 1.0110019445419312, "learning_rate": 4.651086525826896e-05, "loss": 0.2066, "num_input_tokens_seen": 66995968, "step": 31040 }, { "epoch": 5.064437194127243, "grad_norm": 2.247366189956665, "learning_rate": 4.650905151768028e-05, "loss": 0.2132, "num_input_tokens_seen": 67006912, "step": 31045 }, { "epoch": 5.065252854812398, "grad_norm": 0.521895170211792, "learning_rate": 4.650723734118339e-05, "loss": 0.1909, "num_input_tokens_seen": 67017824, "step": 31050 }, { "epoch": 5.066068515497553, "grad_norm": 0.11277255415916443, "learning_rate": 4.650542272881505e-05, "loss": 0.0179, "num_input_tokens_seen": 67028736, "step": 31055 }, { "epoch": 5.066884176182708, "grad_norm": 0.3594527244567871, "learning_rate": 4.650360768061204e-05, "loss": 0.0886, "num_input_tokens_seen": 67039936, "step": 31060 }, { "epoch": 5.067699836867863, "grad_norm": 0.13856175541877747, "learning_rate": 4.650179219661114e-05, "loss": 0.0476, "num_input_tokens_seen": 67051616, "step": 31065 }, { "epoch": 5.068515497553018, "grad_norm": 0.13442179560661316, "learning_rate": 4.649997627684914e-05, "loss": 0.0543, "num_input_tokens_seen": 67062240, "step": 31070 }, { "epoch": 5.069331158238173, "grad_norm": 0.19018295407295227, "learning_rate": 4.649815992136285e-05, "loss": 0.084, "num_input_tokens_seen": 67073408, "step": 31075 }, { "epoch": 5.070146818923328, "grad_norm": 0.20646090805530548, "learning_rate": 4.6496343130189074e-05, "loss": 0.0789, "num_input_tokens_seen": 67084896, "step": 31080 }, { "epoch": 5.0709624796084825, "grad_norm": 0.1277213841676712, "learning_rate": 4.649452590336464e-05, "loss": 0.0271, "num_input_tokens_seen": 67095040, "step": 31085 }, { "epoch": 5.071778140293638, "grad_norm": 0.046072062104940414, "learning_rate": 4.649270824092636e-05, "loss": 0.0787, "num_input_tokens_seen": 67106336, "step": 31090 }, { "epoch": 5.072593800978793, "grad_norm": 0.14047223329544067, "learning_rate": 4.649089014291109e-05, "loss": 0.2513, "num_input_tokens_seen": 67116832, "step": 31095 }, { "epoch": 5.073409461663948, "grad_norm": 1.8510023355484009, "learning_rate": 4.648907160935567e-05, "loss": 0.1579, "num_input_tokens_seen": 67128736, "step": 31100 }, { "epoch": 5.074225122349103, "grad_norm": 0.9819611310958862, "learning_rate": 4.6487252640296945e-05, "loss": 0.184, "num_input_tokens_seen": 67140128, "step": 31105 }, { "epoch": 5.075040783034257, "grad_norm": 0.1939404308795929, "learning_rate": 4.6485433235771794e-05, "loss": 0.0974, "num_input_tokens_seen": 67151232, "step": 31110 }, { "epoch": 5.075856443719413, "grad_norm": 0.31693458557128906, "learning_rate": 4.6483613395817086e-05, "loss": 0.1265, "num_input_tokens_seen": 67161984, "step": 31115 }, { "epoch": 5.076672104404568, "grad_norm": 1.6083751916885376, "learning_rate": 4.648179312046969e-05, "loss": 0.1411, "num_input_tokens_seen": 67172224, "step": 31120 }, { "epoch": 5.077487765089723, "grad_norm": 0.13198097050189972, "learning_rate": 4.6479972409766505e-05, "loss": 0.0283, "num_input_tokens_seen": 67182560, "step": 31125 }, { "epoch": 5.078303425774878, "grad_norm": 0.5936102271080017, "learning_rate": 4.647815126374443e-05, "loss": 0.0399, "num_input_tokens_seen": 67193024, "step": 31130 }, { "epoch": 5.079119086460032, "grad_norm": 0.051262591034173965, "learning_rate": 4.647632968244038e-05, "loss": 0.1989, "num_input_tokens_seen": 67204096, "step": 31135 }, { "epoch": 5.079934747145187, "grad_norm": 0.20647667348384857, "learning_rate": 4.6474507665891254e-05, "loss": 0.1132, "num_input_tokens_seen": 67214432, "step": 31140 }, { "epoch": 5.080750407830343, "grad_norm": 0.7005312442779541, "learning_rate": 4.6472685214133994e-05, "loss": 0.0509, "num_input_tokens_seen": 67225792, "step": 31145 }, { "epoch": 5.081566068515498, "grad_norm": 0.7542842030525208, "learning_rate": 4.647086232720552e-05, "loss": 0.166, "num_input_tokens_seen": 67236128, "step": 31150 }, { "epoch": 5.082381729200653, "grad_norm": 1.0308542251586914, "learning_rate": 4.6469039005142796e-05, "loss": 0.1216, "num_input_tokens_seen": 67247072, "step": 31155 }, { "epoch": 5.083197389885807, "grad_norm": 0.740594208240509, "learning_rate": 4.6467215247982755e-05, "loss": 0.1504, "num_input_tokens_seen": 67257344, "step": 31160 }, { "epoch": 5.084013050570962, "grad_norm": 0.5537754893302917, "learning_rate": 4.646539105576237e-05, "loss": 0.1124, "num_input_tokens_seen": 67266848, "step": 31165 }, { "epoch": 5.084828711256117, "grad_norm": 1.4403102397918701, "learning_rate": 4.646356642851859e-05, "loss": 0.2611, "num_input_tokens_seen": 67277056, "step": 31170 }, { "epoch": 5.085644371941273, "grad_norm": 0.172149196267128, "learning_rate": 4.646174136628842e-05, "loss": 0.057, "num_input_tokens_seen": 67288672, "step": 31175 }, { "epoch": 5.0864600326264275, "grad_norm": 0.05687402933835983, "learning_rate": 4.645991586910883e-05, "loss": 0.1039, "num_input_tokens_seen": 67299424, "step": 31180 }, { "epoch": 5.087275693311582, "grad_norm": 0.3955801725387573, "learning_rate": 4.6458089937016826e-05, "loss": 0.0635, "num_input_tokens_seen": 67309056, "step": 31185 }, { "epoch": 5.088091353996737, "grad_norm": 0.36901912093162537, "learning_rate": 4.64562635700494e-05, "loss": 0.0675, "num_input_tokens_seen": 67320320, "step": 31190 }, { "epoch": 5.088907014681892, "grad_norm": 0.1698140650987625, "learning_rate": 4.645443676824358e-05, "loss": 0.2395, "num_input_tokens_seen": 67330944, "step": 31195 }, { "epoch": 5.089722675367048, "grad_norm": 0.11013743281364441, "learning_rate": 4.645260953163638e-05, "loss": 0.039, "num_input_tokens_seen": 67342720, "step": 31200 }, { "epoch": 5.0905383360522025, "grad_norm": 1.3451999425888062, "learning_rate": 4.6450781860264836e-05, "loss": 0.1611, "num_input_tokens_seen": 67353856, "step": 31205 }, { "epoch": 5.091353996737357, "grad_norm": 0.052362605929374695, "learning_rate": 4.644895375416598e-05, "loss": 0.2364, "num_input_tokens_seen": 67364384, "step": 31210 }, { "epoch": 5.092169657422512, "grad_norm": 0.2628832459449768, "learning_rate": 4.644712521337687e-05, "loss": 0.2268, "num_input_tokens_seen": 67375328, "step": 31215 }, { "epoch": 5.092985318107667, "grad_norm": 0.23725031316280365, "learning_rate": 4.644529623793455e-05, "loss": 0.0957, "num_input_tokens_seen": 67386240, "step": 31220 }, { "epoch": 5.093800978792822, "grad_norm": 0.16549313068389893, "learning_rate": 4.6443466827876115e-05, "loss": 0.1118, "num_input_tokens_seen": 67397760, "step": 31225 }, { "epoch": 5.0946166394779775, "grad_norm": 0.10800191760063171, "learning_rate": 4.6441636983238614e-05, "loss": 0.0699, "num_input_tokens_seen": 67408768, "step": 31230 }, { "epoch": 5.095432300163132, "grad_norm": 1.255553960800171, "learning_rate": 4.643980670405914e-05, "loss": 0.1719, "num_input_tokens_seen": 67419456, "step": 31235 }, { "epoch": 5.096247960848287, "grad_norm": 0.8492396473884583, "learning_rate": 4.643797599037478e-05, "loss": 0.103, "num_input_tokens_seen": 67430528, "step": 31240 }, { "epoch": 5.097063621533442, "grad_norm": 0.6862136721611023, "learning_rate": 4.643614484222264e-05, "loss": 0.0931, "num_input_tokens_seen": 67441472, "step": 31245 }, { "epoch": 5.097879282218597, "grad_norm": 1.1011922359466553, "learning_rate": 4.6434313259639836e-05, "loss": 0.1444, "num_input_tokens_seen": 67451808, "step": 31250 }, { "epoch": 5.0986949429037525, "grad_norm": 0.3711557984352112, "learning_rate": 4.6432481242663475e-05, "loss": 0.1142, "num_input_tokens_seen": 67463104, "step": 31255 }, { "epoch": 5.099510603588907, "grad_norm": 0.10004636645317078, "learning_rate": 4.64306487913307e-05, "loss": 0.0678, "num_input_tokens_seen": 67474144, "step": 31260 }, { "epoch": 5.100326264274062, "grad_norm": 0.8005383610725403, "learning_rate": 4.6428815905678635e-05, "loss": 0.0897, "num_input_tokens_seen": 67484096, "step": 31265 }, { "epoch": 5.101141924959217, "grad_norm": 0.05032875016331673, "learning_rate": 4.642698258574444e-05, "loss": 0.1282, "num_input_tokens_seen": 67493664, "step": 31270 }, { "epoch": 5.101957585644372, "grad_norm": 2.291231393814087, "learning_rate": 4.642514883156525e-05, "loss": 0.2686, "num_input_tokens_seen": 67504768, "step": 31275 }, { "epoch": 5.102773246329527, "grad_norm": 1.1888705492019653, "learning_rate": 4.6423314643178244e-05, "loss": 0.119, "num_input_tokens_seen": 67514848, "step": 31280 }, { "epoch": 5.103588907014682, "grad_norm": 0.0716441422700882, "learning_rate": 4.642148002062059e-05, "loss": 0.1226, "num_input_tokens_seen": 67525248, "step": 31285 }, { "epoch": 5.104404567699837, "grad_norm": 0.8379150032997131, "learning_rate": 4.641964496392947e-05, "loss": 0.0763, "num_input_tokens_seen": 67534656, "step": 31290 }, { "epoch": 5.105220228384992, "grad_norm": 1.8851511478424072, "learning_rate": 4.641780947314207e-05, "loss": 0.2325, "num_input_tokens_seen": 67545024, "step": 31295 }, { "epoch": 5.106035889070147, "grad_norm": 1.6797783374786377, "learning_rate": 4.641597354829559e-05, "loss": 0.2272, "num_input_tokens_seen": 67556640, "step": 31300 }, { "epoch": 5.1068515497553015, "grad_norm": 0.19841545820236206, "learning_rate": 4.6414137189427244e-05, "loss": 0.0359, "num_input_tokens_seen": 67568032, "step": 31305 }, { "epoch": 5.107667210440456, "grad_norm": 0.09714306145906448, "learning_rate": 4.6412300396574235e-05, "loss": 0.0827, "num_input_tokens_seen": 67578688, "step": 31310 }, { "epoch": 5.108482871125612, "grad_norm": 0.19200772047042847, "learning_rate": 4.64104631697738e-05, "loss": 0.036, "num_input_tokens_seen": 67589664, "step": 31315 }, { "epoch": 5.109298531810767, "grad_norm": 0.8818573355674744, "learning_rate": 4.6408625509063166e-05, "loss": 0.1488, "num_input_tokens_seen": 67600736, "step": 31320 }, { "epoch": 5.110114192495922, "grad_norm": 0.303742915391922, "learning_rate": 4.640678741447957e-05, "loss": 0.2095, "num_input_tokens_seen": 67612288, "step": 31325 }, { "epoch": 5.1109298531810765, "grad_norm": 1.4447485208511353, "learning_rate": 4.640494888606027e-05, "loss": 0.2371, "num_input_tokens_seen": 67622592, "step": 31330 }, { "epoch": 5.111745513866231, "grad_norm": 0.3073584735393524, "learning_rate": 4.640310992384253e-05, "loss": 0.1709, "num_input_tokens_seen": 67633568, "step": 31335 }, { "epoch": 5.112561174551387, "grad_norm": 0.1250971108675003, "learning_rate": 4.6401270527863625e-05, "loss": 0.1533, "num_input_tokens_seen": 67644000, "step": 31340 }, { "epoch": 5.113376835236542, "grad_norm": 0.8646083474159241, "learning_rate": 4.639943069816082e-05, "loss": 0.1656, "num_input_tokens_seen": 67654816, "step": 31345 }, { "epoch": 5.114192495921697, "grad_norm": 1.293382167816162, "learning_rate": 4.63975904347714e-05, "loss": 0.3058, "num_input_tokens_seen": 67666304, "step": 31350 }, { "epoch": 5.1150081566068515, "grad_norm": 1.3397492170333862, "learning_rate": 4.639574973773267e-05, "loss": 0.0982, "num_input_tokens_seen": 67677536, "step": 31355 }, { "epoch": 5.115823817292006, "grad_norm": 0.2759677767753601, "learning_rate": 4.639390860708193e-05, "loss": 0.2154, "num_input_tokens_seen": 67688608, "step": 31360 }, { "epoch": 5.116639477977161, "grad_norm": 2.8510401248931885, "learning_rate": 4.639206704285648e-05, "loss": 0.2284, "num_input_tokens_seen": 67699200, "step": 31365 }, { "epoch": 5.117455138662317, "grad_norm": 0.20540539920330048, "learning_rate": 4.6390225045093666e-05, "loss": 0.0825, "num_input_tokens_seen": 67709568, "step": 31370 }, { "epoch": 5.118270799347472, "grad_norm": 0.48833906650543213, "learning_rate": 4.63883826138308e-05, "loss": 0.1385, "num_input_tokens_seen": 67719360, "step": 31375 }, { "epoch": 5.1190864600326265, "grad_norm": 0.38348251581192017, "learning_rate": 4.6386539749105226e-05, "loss": 0.0981, "num_input_tokens_seen": 67729472, "step": 31380 }, { "epoch": 5.119902120717781, "grad_norm": 0.7284020185470581, "learning_rate": 4.63846964509543e-05, "loss": 0.1347, "num_input_tokens_seen": 67739424, "step": 31385 }, { "epoch": 5.120717781402936, "grad_norm": 1.6764229536056519, "learning_rate": 4.6382852719415373e-05, "loss": 0.2079, "num_input_tokens_seen": 67750624, "step": 31390 }, { "epoch": 5.121533442088092, "grad_norm": 0.2651582956314087, "learning_rate": 4.63810085545258e-05, "loss": 0.0962, "num_input_tokens_seen": 67760960, "step": 31395 }, { "epoch": 5.122349102773247, "grad_norm": 0.4872835874557495, "learning_rate": 4.637916395632297e-05, "loss": 0.0682, "num_input_tokens_seen": 67771232, "step": 31400 }, { "epoch": 5.123164763458401, "grad_norm": 0.7393003702163696, "learning_rate": 4.637731892484426e-05, "loss": 0.0342, "num_input_tokens_seen": 67781408, "step": 31405 }, { "epoch": 5.123980424143556, "grad_norm": 0.09931474924087524, "learning_rate": 4.637547346012707e-05, "loss": 0.2069, "num_input_tokens_seen": 67793472, "step": 31410 }, { "epoch": 5.124796084828711, "grad_norm": 0.2958233058452606, "learning_rate": 4.637362756220878e-05, "loss": 0.031, "num_input_tokens_seen": 67804704, "step": 31415 }, { "epoch": 5.125611745513866, "grad_norm": 0.06948398798704147, "learning_rate": 4.637178123112682e-05, "loss": 0.0954, "num_input_tokens_seen": 67815840, "step": 31420 }, { "epoch": 5.126427406199022, "grad_norm": 0.09142705798149109, "learning_rate": 4.636993446691861e-05, "loss": 0.0452, "num_input_tokens_seen": 67827008, "step": 31425 }, { "epoch": 5.127243066884176, "grad_norm": 1.4127205610275269, "learning_rate": 4.636808726962155e-05, "loss": 0.2925, "num_input_tokens_seen": 67837024, "step": 31430 }, { "epoch": 5.128058727569331, "grad_norm": 0.1881181001663208, "learning_rate": 4.636623963927311e-05, "loss": 0.0691, "num_input_tokens_seen": 67847360, "step": 31435 }, { "epoch": 5.128874388254486, "grad_norm": 0.04733822122216225, "learning_rate": 4.636439157591071e-05, "loss": 0.284, "num_input_tokens_seen": 67857152, "step": 31440 }, { "epoch": 5.129690048939641, "grad_norm": 0.9515668153762817, "learning_rate": 4.636254307957182e-05, "loss": 0.1446, "num_input_tokens_seen": 67867680, "step": 31445 }, { "epoch": 5.130505709624796, "grad_norm": 0.17121808230876923, "learning_rate": 4.636069415029388e-05, "loss": 0.0405, "num_input_tokens_seen": 67877632, "step": 31450 }, { "epoch": 5.131321370309951, "grad_norm": 0.45181578397750854, "learning_rate": 4.635884478811439e-05, "loss": 0.2175, "num_input_tokens_seen": 67886912, "step": 31455 }, { "epoch": 5.132137030995106, "grad_norm": 0.26754850149154663, "learning_rate": 4.635699499307081e-05, "loss": 0.0537, "num_input_tokens_seen": 67895040, "step": 31460 }, { "epoch": 5.132952691680261, "grad_norm": 0.17804664373397827, "learning_rate": 4.635514476520063e-05, "loss": 0.1212, "num_input_tokens_seen": 67906336, "step": 31465 }, { "epoch": 5.133768352365416, "grad_norm": 0.03201794996857643, "learning_rate": 4.6353294104541354e-05, "loss": 0.1477, "num_input_tokens_seen": 67916992, "step": 31470 }, { "epoch": 5.134584013050571, "grad_norm": 0.1482342928647995, "learning_rate": 4.635144301113048e-05, "loss": 0.1384, "num_input_tokens_seen": 67928352, "step": 31475 }, { "epoch": 5.135399673735726, "grad_norm": 0.258518785238266, "learning_rate": 4.634959148500553e-05, "loss": 0.2208, "num_input_tokens_seen": 67940160, "step": 31480 }, { "epoch": 5.136215334420881, "grad_norm": 0.2161974161863327, "learning_rate": 4.634773952620402e-05, "loss": 0.1131, "num_input_tokens_seen": 67950304, "step": 31485 }, { "epoch": 5.137030995106036, "grad_norm": 0.9696845412254333, "learning_rate": 4.63458871347635e-05, "loss": 0.1897, "num_input_tokens_seen": 67960224, "step": 31490 }, { "epoch": 5.137846655791191, "grad_norm": 1.3973910808563232, "learning_rate": 4.634403431072149e-05, "loss": 0.1374, "num_input_tokens_seen": 67971616, "step": 31495 }, { "epoch": 5.138662316476346, "grad_norm": 0.528017520904541, "learning_rate": 4.6342181054115545e-05, "loss": 0.1864, "num_input_tokens_seen": 67982048, "step": 31500 }, { "epoch": 5.1394779771615005, "grad_norm": 0.44308897852897644, "learning_rate": 4.6340327364983225e-05, "loss": 0.1452, "num_input_tokens_seen": 67991584, "step": 31505 }, { "epoch": 5.140293637846656, "grad_norm": 1.3585844039916992, "learning_rate": 4.633847324336211e-05, "loss": 0.1681, "num_input_tokens_seen": 68002208, "step": 31510 }, { "epoch": 5.141109298531811, "grad_norm": 0.6031845808029175, "learning_rate": 4.633661868928976e-05, "loss": 0.0391, "num_input_tokens_seen": 68013344, "step": 31515 }, { "epoch": 5.141924959216966, "grad_norm": 0.7815961241722107, "learning_rate": 4.633476370280376e-05, "loss": 0.2491, "num_input_tokens_seen": 68025056, "step": 31520 }, { "epoch": 5.142740619902121, "grad_norm": 0.3212483823299408, "learning_rate": 4.6332908283941714e-05, "loss": 0.0419, "num_input_tokens_seen": 68035296, "step": 31525 }, { "epoch": 5.143556280587275, "grad_norm": 1.0046844482421875, "learning_rate": 4.6331052432741214e-05, "loss": 0.1288, "num_input_tokens_seen": 68046080, "step": 31530 }, { "epoch": 5.14437194127243, "grad_norm": 0.15179871022701263, "learning_rate": 4.632919614923987e-05, "loss": 0.1441, "num_input_tokens_seen": 68057344, "step": 31535 }, { "epoch": 5.145187601957586, "grad_norm": 0.13355562090873718, "learning_rate": 4.6327339433475324e-05, "loss": 0.1398, "num_input_tokens_seen": 68067488, "step": 31540 }, { "epoch": 5.146003262642741, "grad_norm": 1.5774831771850586, "learning_rate": 4.632548228548518e-05, "loss": 0.2103, "num_input_tokens_seen": 68079584, "step": 31545 }, { "epoch": 5.146818923327896, "grad_norm": 0.11565212905406952, "learning_rate": 4.632362470530709e-05, "loss": 0.2226, "num_input_tokens_seen": 68090816, "step": 31550 }, { "epoch": 5.14763458401305, "grad_norm": 0.22405076026916504, "learning_rate": 4.632176669297868e-05, "loss": 0.1052, "num_input_tokens_seen": 68102592, "step": 31555 }, { "epoch": 5.148450244698205, "grad_norm": 0.480864942073822, "learning_rate": 4.631990824853763e-05, "loss": 0.029, "num_input_tokens_seen": 68113664, "step": 31560 }, { "epoch": 5.149265905383361, "grad_norm": 0.3301009237766266, "learning_rate": 4.63180493720216e-05, "loss": 0.1156, "num_input_tokens_seen": 68124480, "step": 31565 }, { "epoch": 5.150081566068516, "grad_norm": 0.09856144338846207, "learning_rate": 4.631619006346825e-05, "loss": 0.1065, "num_input_tokens_seen": 68135424, "step": 31570 }, { "epoch": 5.150897226753671, "grad_norm": 0.5331529378890991, "learning_rate": 4.631433032291526e-05, "loss": 0.1787, "num_input_tokens_seen": 68146784, "step": 31575 }, { "epoch": 5.151712887438825, "grad_norm": 0.5990188717842102, "learning_rate": 4.6312470150400334e-05, "loss": 0.1774, "num_input_tokens_seen": 68155744, "step": 31580 }, { "epoch": 5.15252854812398, "grad_norm": 0.22054611146450043, "learning_rate": 4.631060954596116e-05, "loss": 0.0197, "num_input_tokens_seen": 68167168, "step": 31585 }, { "epoch": 5.153344208809135, "grad_norm": 1.1477999687194824, "learning_rate": 4.630874850963545e-05, "loss": 0.1591, "num_input_tokens_seen": 68178496, "step": 31590 }, { "epoch": 5.154159869494291, "grad_norm": 0.28500428795814514, "learning_rate": 4.6306887041460914e-05, "loss": 0.1201, "num_input_tokens_seen": 68189568, "step": 31595 }, { "epoch": 5.1549755301794455, "grad_norm": 0.2406606674194336, "learning_rate": 4.630502514147529e-05, "loss": 0.0972, "num_input_tokens_seen": 68198624, "step": 31600 }, { "epoch": 5.1557911908646, "grad_norm": 0.2588817775249481, "learning_rate": 4.6303162809716304e-05, "loss": 0.0314, "num_input_tokens_seen": 68210528, "step": 31605 }, { "epoch": 5.156606851549755, "grad_norm": 0.1183520182967186, "learning_rate": 4.63013000462217e-05, "loss": 0.0793, "num_input_tokens_seen": 68220384, "step": 31610 }, { "epoch": 5.15742251223491, "grad_norm": 0.11772780865430832, "learning_rate": 4.629943685102922e-05, "loss": 0.2311, "num_input_tokens_seen": 68230464, "step": 31615 }, { "epoch": 5.158238172920065, "grad_norm": 0.18493200838565826, "learning_rate": 4.6297573224176636e-05, "loss": 0.1627, "num_input_tokens_seen": 68240384, "step": 31620 }, { "epoch": 5.1590538336052205, "grad_norm": 0.2552637457847595, "learning_rate": 4.629570916570172e-05, "loss": 0.091, "num_input_tokens_seen": 68251840, "step": 31625 }, { "epoch": 5.159869494290375, "grad_norm": 0.5128335356712341, "learning_rate": 4.629384467564223e-05, "loss": 0.1963, "num_input_tokens_seen": 68263008, "step": 31630 }, { "epoch": 5.16068515497553, "grad_norm": 0.2429887056350708, "learning_rate": 4.629197975403597e-05, "loss": 0.1198, "num_input_tokens_seen": 68273920, "step": 31635 }, { "epoch": 5.161500815660685, "grad_norm": 0.7376108765602112, "learning_rate": 4.629011440092073e-05, "loss": 0.1596, "num_input_tokens_seen": 68284256, "step": 31640 }, { "epoch": 5.16231647634584, "grad_norm": 0.3216945230960846, "learning_rate": 4.6288248616334315e-05, "loss": 0.1975, "num_input_tokens_seen": 68296000, "step": 31645 }, { "epoch": 5.1631321370309955, "grad_norm": 0.3681519627571106, "learning_rate": 4.628638240031453e-05, "loss": 0.1308, "num_input_tokens_seen": 68308224, "step": 31650 }, { "epoch": 5.16394779771615, "grad_norm": 0.9425638318061829, "learning_rate": 4.628451575289921e-05, "loss": 0.1132, "num_input_tokens_seen": 68318560, "step": 31655 }, { "epoch": 5.164763458401305, "grad_norm": 0.10164476186037064, "learning_rate": 4.6282648674126175e-05, "loss": 0.0998, "num_input_tokens_seen": 68328256, "step": 31660 }, { "epoch": 5.16557911908646, "grad_norm": 0.09737706184387207, "learning_rate": 4.628078116403326e-05, "loss": 0.1376, "num_input_tokens_seen": 68339648, "step": 31665 }, { "epoch": 5.166394779771615, "grad_norm": 0.5156913995742798, "learning_rate": 4.627891322265832e-05, "loss": 0.077, "num_input_tokens_seen": 68351072, "step": 31670 }, { "epoch": 5.16721044045677, "grad_norm": 0.7803539633750916, "learning_rate": 4.627704485003921e-05, "loss": 0.1043, "num_input_tokens_seen": 68360832, "step": 31675 }, { "epoch": 5.168026101141925, "grad_norm": 0.8094354867935181, "learning_rate": 4.62751760462138e-05, "loss": 0.1148, "num_input_tokens_seen": 68370016, "step": 31680 }, { "epoch": 5.16884176182708, "grad_norm": 0.20028053224086761, "learning_rate": 4.627330681121995e-05, "loss": 0.0797, "num_input_tokens_seen": 68380768, "step": 31685 }, { "epoch": 5.169657422512235, "grad_norm": 0.4255612790584564, "learning_rate": 4.627143714509555e-05, "loss": 0.0336, "num_input_tokens_seen": 68392448, "step": 31690 }, { "epoch": 5.17047308319739, "grad_norm": 0.4334662854671478, "learning_rate": 4.6269567047878495e-05, "loss": 0.1513, "num_input_tokens_seen": 68402016, "step": 31695 }, { "epoch": 5.171288743882545, "grad_norm": 0.6464482545852661, "learning_rate": 4.626769651960668e-05, "loss": 0.1127, "num_input_tokens_seen": 68412704, "step": 31700 }, { "epoch": 5.1721044045677, "grad_norm": 0.3533793091773987, "learning_rate": 4.626582556031802e-05, "loss": 0.1446, "num_input_tokens_seen": 68424448, "step": 31705 }, { "epoch": 5.172920065252855, "grad_norm": 0.6420993804931641, "learning_rate": 4.6263954170050426e-05, "loss": 0.1706, "num_input_tokens_seen": 68435168, "step": 31710 }, { "epoch": 5.17373572593801, "grad_norm": 0.4176650643348694, "learning_rate": 4.6262082348841815e-05, "loss": 0.1239, "num_input_tokens_seen": 68445696, "step": 31715 }, { "epoch": 5.174551386623165, "grad_norm": 1.4729235172271729, "learning_rate": 4.626021009673015e-05, "loss": 0.1535, "num_input_tokens_seen": 68455936, "step": 31720 }, { "epoch": 5.1753670473083195, "grad_norm": 0.45723557472229004, "learning_rate": 4.625833741375335e-05, "loss": 0.0598, "num_input_tokens_seen": 68465344, "step": 31725 }, { "epoch": 5.176182707993474, "grad_norm": 1.0620759725570679, "learning_rate": 4.6256464299949364e-05, "loss": 0.0602, "num_input_tokens_seen": 68475936, "step": 31730 }, { "epoch": 5.17699836867863, "grad_norm": 0.6768863797187805, "learning_rate": 4.6254590755356175e-05, "loss": 0.0605, "num_input_tokens_seen": 68486496, "step": 31735 }, { "epoch": 5.177814029363785, "grad_norm": 0.4428384006023407, "learning_rate": 4.625271678001173e-05, "loss": 0.0608, "num_input_tokens_seen": 68497408, "step": 31740 }, { "epoch": 5.17862969004894, "grad_norm": 0.41196054220199585, "learning_rate": 4.625084237395403e-05, "loss": 0.0848, "num_input_tokens_seen": 68506720, "step": 31745 }, { "epoch": 5.1794453507340945, "grad_norm": 1.326654076576233, "learning_rate": 4.6248967537221045e-05, "loss": 0.2531, "num_input_tokens_seen": 68516800, "step": 31750 }, { "epoch": 5.180261011419249, "grad_norm": 0.7357861995697021, "learning_rate": 4.6247092269850776e-05, "loss": 0.0526, "num_input_tokens_seen": 68525120, "step": 31755 }, { "epoch": 5.181076672104404, "grad_norm": 0.12432894110679626, "learning_rate": 4.624521657188123e-05, "loss": 0.1713, "num_input_tokens_seen": 68535872, "step": 31760 }, { "epoch": 5.18189233278956, "grad_norm": 0.6800853610038757, "learning_rate": 4.624334044335042e-05, "loss": 0.1101, "num_input_tokens_seen": 68545760, "step": 31765 }, { "epoch": 5.182707993474715, "grad_norm": 0.32119014859199524, "learning_rate": 4.6241463884296366e-05, "loss": 0.0742, "num_input_tokens_seen": 68555552, "step": 31770 }, { "epoch": 5.1835236541598695, "grad_norm": 1.4526108503341675, "learning_rate": 4.62395868947571e-05, "loss": 0.0441, "num_input_tokens_seen": 68565760, "step": 31775 }, { "epoch": 5.184339314845024, "grad_norm": 1.9246265888214111, "learning_rate": 4.6237709474770665e-05, "loss": 0.0673, "num_input_tokens_seen": 68576160, "step": 31780 }, { "epoch": 5.185154975530179, "grad_norm": 1.8102390766143799, "learning_rate": 4.623583162437509e-05, "loss": 0.2405, "num_input_tokens_seen": 68587360, "step": 31785 }, { "epoch": 5.185970636215335, "grad_norm": 0.3968137204647064, "learning_rate": 4.623395334360846e-05, "loss": 0.0352, "num_input_tokens_seen": 68598336, "step": 31790 }, { "epoch": 5.18678629690049, "grad_norm": 0.26501134037971497, "learning_rate": 4.623207463250883e-05, "loss": 0.1724, "num_input_tokens_seen": 68610560, "step": 31795 }, { "epoch": 5.1876019575856445, "grad_norm": 0.12552575767040253, "learning_rate": 4.623019549111427e-05, "loss": 0.068, "num_input_tokens_seen": 68621472, "step": 31800 }, { "epoch": 5.188417618270799, "grad_norm": 1.7684308290481567, "learning_rate": 4.6228315919462864e-05, "loss": 0.2421, "num_input_tokens_seen": 68632064, "step": 31805 }, { "epoch": 5.189233278955954, "grad_norm": 0.7156065702438354, "learning_rate": 4.622643591759271e-05, "loss": 0.2145, "num_input_tokens_seen": 68642784, "step": 31810 }, { "epoch": 5.190048939641109, "grad_norm": 0.6346502900123596, "learning_rate": 4.62245554855419e-05, "loss": 0.1556, "num_input_tokens_seen": 68652448, "step": 31815 }, { "epoch": 5.190864600326265, "grad_norm": 0.7400847673416138, "learning_rate": 4.622267462334855e-05, "loss": 0.1577, "num_input_tokens_seen": 68662656, "step": 31820 }, { "epoch": 5.191680261011419, "grad_norm": 0.41265425086021423, "learning_rate": 4.622079333105077e-05, "loss": 0.0425, "num_input_tokens_seen": 68673248, "step": 31825 }, { "epoch": 5.192495921696574, "grad_norm": 0.8499594330787659, "learning_rate": 4.62189116086867e-05, "loss": 0.0995, "num_input_tokens_seen": 68683776, "step": 31830 }, { "epoch": 5.193311582381729, "grad_norm": 0.13283638656139374, "learning_rate": 4.621702945629447e-05, "loss": 0.1059, "num_input_tokens_seen": 68694112, "step": 31835 }, { "epoch": 5.194127243066884, "grad_norm": 1.4434984922409058, "learning_rate": 4.621514687391222e-05, "loss": 0.263, "num_input_tokens_seen": 68704512, "step": 31840 }, { "epoch": 5.19494290375204, "grad_norm": 0.14530883729457855, "learning_rate": 4.62132638615781e-05, "loss": 0.1177, "num_input_tokens_seen": 68715776, "step": 31845 }, { "epoch": 5.195758564437194, "grad_norm": 0.2445937842130661, "learning_rate": 4.621138041933028e-05, "loss": 0.1763, "num_input_tokens_seen": 68725888, "step": 31850 }, { "epoch": 5.196574225122349, "grad_norm": 0.08873042464256287, "learning_rate": 4.6209496547206934e-05, "loss": 0.088, "num_input_tokens_seen": 68736864, "step": 31855 }, { "epoch": 5.197389885807504, "grad_norm": 0.2356022745370865, "learning_rate": 4.620761224524622e-05, "loss": 0.0932, "num_input_tokens_seen": 68747264, "step": 31860 }, { "epoch": 5.198205546492659, "grad_norm": 0.17682573199272156, "learning_rate": 4.620572751348635e-05, "loss": 0.2308, "num_input_tokens_seen": 68757792, "step": 31865 }, { "epoch": 5.199021207177814, "grad_norm": 0.6010709404945374, "learning_rate": 4.620384235196551e-05, "loss": 0.2584, "num_input_tokens_seen": 68769440, "step": 31870 }, { "epoch": 5.199836867862969, "grad_norm": 0.9825226068496704, "learning_rate": 4.620195676072191e-05, "loss": 0.124, "num_input_tokens_seen": 68780224, "step": 31875 }, { "epoch": 5.200652528548124, "grad_norm": 0.6354921460151672, "learning_rate": 4.6200070739793754e-05, "loss": 0.0701, "num_input_tokens_seen": 68791008, "step": 31880 }, { "epoch": 5.201468189233279, "grad_norm": 0.06929666548967361, "learning_rate": 4.619818428921927e-05, "loss": 0.0857, "num_input_tokens_seen": 68801088, "step": 31885 }, { "epoch": 5.202283849918434, "grad_norm": 0.3280820846557617, "learning_rate": 4.619629740903669e-05, "loss": 0.0606, "num_input_tokens_seen": 68812128, "step": 31890 }, { "epoch": 5.203099510603589, "grad_norm": 0.9025071263313293, "learning_rate": 4.6194410099284255e-05, "loss": 0.0807, "num_input_tokens_seen": 68822688, "step": 31895 }, { "epoch": 5.2039151712887435, "grad_norm": 1.9848542213439941, "learning_rate": 4.619252236000021e-05, "loss": 0.2805, "num_input_tokens_seen": 68832896, "step": 31900 }, { "epoch": 5.204730831973899, "grad_norm": 0.5565356016159058, "learning_rate": 4.6190634191222815e-05, "loss": 0.0531, "num_input_tokens_seen": 68844224, "step": 31905 }, { "epoch": 5.205546492659054, "grad_norm": 0.2632216811180115, "learning_rate": 4.6188745592990335e-05, "loss": 0.1426, "num_input_tokens_seen": 68856512, "step": 31910 }, { "epoch": 5.206362153344209, "grad_norm": 0.3957381546497345, "learning_rate": 4.618685656534105e-05, "loss": 0.0932, "num_input_tokens_seen": 68867552, "step": 31915 }, { "epoch": 5.207177814029364, "grad_norm": 0.8521800637245178, "learning_rate": 4.6184967108313236e-05, "loss": 0.1671, "num_input_tokens_seen": 68877536, "step": 31920 }, { "epoch": 5.2079934747145185, "grad_norm": 1.824001669883728, "learning_rate": 4.618307722194519e-05, "loss": 0.3527, "num_input_tokens_seen": 68887360, "step": 31925 }, { "epoch": 5.208809135399674, "grad_norm": 0.5768232345581055, "learning_rate": 4.618118690627521e-05, "loss": 0.1681, "num_input_tokens_seen": 68899328, "step": 31930 }, { "epoch": 5.209624796084829, "grad_norm": 0.1683652698993683, "learning_rate": 4.617929616134161e-05, "loss": 0.0956, "num_input_tokens_seen": 68909888, "step": 31935 }, { "epoch": 5.210440456769984, "grad_norm": 1.6847217082977295, "learning_rate": 4.617740498718271e-05, "loss": 0.2258, "num_input_tokens_seen": 68920416, "step": 31940 }, { "epoch": 5.211256117455139, "grad_norm": 0.39744889736175537, "learning_rate": 4.617551338383682e-05, "loss": 0.0919, "num_input_tokens_seen": 68931936, "step": 31945 }, { "epoch": 5.212071778140293, "grad_norm": 1.8526365756988525, "learning_rate": 4.617362135134229e-05, "loss": 0.1782, "num_input_tokens_seen": 68942912, "step": 31950 }, { "epoch": 5.212887438825448, "grad_norm": 0.2993067800998688, "learning_rate": 4.617172888973747e-05, "loss": 0.3424, "num_input_tokens_seen": 68954400, "step": 31955 }, { "epoch": 5.213703099510604, "grad_norm": 1.0617986917495728, "learning_rate": 4.6169835999060706e-05, "loss": 0.0901, "num_input_tokens_seen": 68964032, "step": 31960 }, { "epoch": 5.214518760195759, "grad_norm": 0.28158190846443176, "learning_rate": 4.616794267935035e-05, "loss": 0.0622, "num_input_tokens_seen": 68974944, "step": 31965 }, { "epoch": 5.215334420880914, "grad_norm": 0.19539479911327362, "learning_rate": 4.616604893064479e-05, "loss": 0.0807, "num_input_tokens_seen": 68985856, "step": 31970 }, { "epoch": 5.216150081566068, "grad_norm": 1.1052826642990112, "learning_rate": 4.6164154752982395e-05, "loss": 0.1074, "num_input_tokens_seen": 68996768, "step": 31975 }, { "epoch": 5.216965742251223, "grad_norm": 1.1125597953796387, "learning_rate": 4.616226014640156e-05, "loss": 0.1135, "num_input_tokens_seen": 69008064, "step": 31980 }, { "epoch": 5.217781402936378, "grad_norm": 0.18910877406597137, "learning_rate": 4.616036511094067e-05, "loss": 0.1645, "num_input_tokens_seen": 69018240, "step": 31985 }, { "epoch": 5.218597063621534, "grad_norm": 0.7827879190444946, "learning_rate": 4.615846964663814e-05, "loss": 0.1705, "num_input_tokens_seen": 69028128, "step": 31990 }, { "epoch": 5.219412724306689, "grad_norm": 1.3235845565795898, "learning_rate": 4.6156573753532386e-05, "loss": 0.1327, "num_input_tokens_seen": 69038720, "step": 31995 }, { "epoch": 5.220228384991843, "grad_norm": 0.753350555896759, "learning_rate": 4.615467743166182e-05, "loss": 0.0252, "num_input_tokens_seen": 69050368, "step": 32000 }, { "epoch": 5.221044045676998, "grad_norm": 0.060498662292957306, "learning_rate": 4.615278068106488e-05, "loss": 0.0249, "num_input_tokens_seen": 69060672, "step": 32005 }, { "epoch": 5.221859706362153, "grad_norm": 0.5801933407783508, "learning_rate": 4.6150883501780006e-05, "loss": 0.2107, "num_input_tokens_seen": 69072480, "step": 32010 }, { "epoch": 5.222675367047309, "grad_norm": 0.1723068356513977, "learning_rate": 4.614898589384564e-05, "loss": 0.0683, "num_input_tokens_seen": 69083232, "step": 32015 }, { "epoch": 5.2234910277324635, "grad_norm": 0.8048961162567139, "learning_rate": 4.6147087857300256e-05, "loss": 0.0737, "num_input_tokens_seen": 69094912, "step": 32020 }, { "epoch": 5.224306688417618, "grad_norm": 0.4190601110458374, "learning_rate": 4.61451893921823e-05, "loss": 0.0418, "num_input_tokens_seen": 69106432, "step": 32025 }, { "epoch": 5.225122349102773, "grad_norm": 0.9349737763404846, "learning_rate": 4.614329049853027e-05, "loss": 0.0831, "num_input_tokens_seen": 69117184, "step": 32030 }, { "epoch": 5.225938009787928, "grad_norm": 0.34990742802619934, "learning_rate": 4.614139117638262e-05, "loss": 0.0569, "num_input_tokens_seen": 69127520, "step": 32035 }, { "epoch": 5.226753670473083, "grad_norm": 0.059068284928798676, "learning_rate": 4.613949142577787e-05, "loss": 0.1304, "num_input_tokens_seen": 69139104, "step": 32040 }, { "epoch": 5.2275693311582385, "grad_norm": 1.455747127532959, "learning_rate": 4.61375912467545e-05, "loss": 0.0709, "num_input_tokens_seen": 69149312, "step": 32045 }, { "epoch": 5.228384991843393, "grad_norm": 0.09505056589841843, "learning_rate": 4.613569063935104e-05, "loss": 0.1248, "num_input_tokens_seen": 69160576, "step": 32050 }, { "epoch": 5.229200652528548, "grad_norm": 0.39291828870773315, "learning_rate": 4.6133789603605983e-05, "loss": 0.0374, "num_input_tokens_seen": 69171360, "step": 32055 }, { "epoch": 5.230016313213703, "grad_norm": 1.3938125371932983, "learning_rate": 4.613188813955788e-05, "loss": 0.388, "num_input_tokens_seen": 69183360, "step": 32060 }, { "epoch": 5.230831973898858, "grad_norm": 0.20397521555423737, "learning_rate": 4.612998624724525e-05, "loss": 0.0996, "num_input_tokens_seen": 69195392, "step": 32065 }, { "epoch": 5.231647634584013, "grad_norm": 0.7215102910995483, "learning_rate": 4.6128083926706645e-05, "loss": 0.0279, "num_input_tokens_seen": 69206816, "step": 32070 }, { "epoch": 5.232463295269168, "grad_norm": 0.593165934085846, "learning_rate": 4.6126181177980616e-05, "loss": 0.0308, "num_input_tokens_seen": 69217248, "step": 32075 }, { "epoch": 5.233278955954323, "grad_norm": 1.0467758178710938, "learning_rate": 4.6124278001105725e-05, "loss": 0.1842, "num_input_tokens_seen": 69228480, "step": 32080 }, { "epoch": 5.234094616639478, "grad_norm": 0.9392780661582947, "learning_rate": 4.612237439612054e-05, "loss": 0.1862, "num_input_tokens_seen": 69240832, "step": 32085 }, { "epoch": 5.234910277324633, "grad_norm": 1.051395058631897, "learning_rate": 4.612047036306365e-05, "loss": 0.0662, "num_input_tokens_seen": 69250976, "step": 32090 }, { "epoch": 5.235725938009788, "grad_norm": 0.0683596134185791, "learning_rate": 4.611856590197363e-05, "loss": 0.0775, "num_input_tokens_seen": 69261184, "step": 32095 }, { "epoch": 5.236541598694943, "grad_norm": 0.6316969394683838, "learning_rate": 4.611666101288908e-05, "loss": 0.2277, "num_input_tokens_seen": 69273024, "step": 32100 }, { "epoch": 5.237357259380098, "grad_norm": 0.19625817239284515, "learning_rate": 4.611475569584861e-05, "loss": 0.0497, "num_input_tokens_seen": 69285280, "step": 32105 }, { "epoch": 5.238172920065253, "grad_norm": 0.18725551664829254, "learning_rate": 4.6112849950890826e-05, "loss": 0.0816, "num_input_tokens_seen": 69295680, "step": 32110 }, { "epoch": 5.238988580750408, "grad_norm": 1.2520610094070435, "learning_rate": 4.6110943778054355e-05, "loss": 0.1732, "num_input_tokens_seen": 69305888, "step": 32115 }, { "epoch": 5.239804241435563, "grad_norm": 0.7435758113861084, "learning_rate": 4.610903717737782e-05, "loss": 0.0983, "num_input_tokens_seen": 69317856, "step": 32120 }, { "epoch": 5.240619902120717, "grad_norm": 1.7720391750335693, "learning_rate": 4.610713014889988e-05, "loss": 0.3417, "num_input_tokens_seen": 69329088, "step": 32125 }, { "epoch": 5.241435562805873, "grad_norm": 1.3507895469665527, "learning_rate": 4.610522269265917e-05, "loss": 0.14, "num_input_tokens_seen": 69340544, "step": 32130 }, { "epoch": 5.242251223491028, "grad_norm": 1.264599084854126, "learning_rate": 4.610331480869434e-05, "loss": 0.2609, "num_input_tokens_seen": 69351104, "step": 32135 }, { "epoch": 5.243066884176183, "grad_norm": 0.03652459383010864, "learning_rate": 4.610140649704407e-05, "loss": 0.1092, "num_input_tokens_seen": 69362848, "step": 32140 }, { "epoch": 5.2438825448613375, "grad_norm": 1.0223482847213745, "learning_rate": 4.609949775774703e-05, "loss": 0.1313, "num_input_tokens_seen": 69373504, "step": 32145 }, { "epoch": 5.244698205546492, "grad_norm": 0.08102528750896454, "learning_rate": 4.60975885908419e-05, "loss": 0.0626, "num_input_tokens_seen": 69384256, "step": 32150 }, { "epoch": 5.245513866231648, "grad_norm": 0.42665979266166687, "learning_rate": 4.609567899636737e-05, "loss": 0.1109, "num_input_tokens_seen": 69394880, "step": 32155 }, { "epoch": 5.246329526916803, "grad_norm": 0.39485830068588257, "learning_rate": 4.6093768974362147e-05, "loss": 0.088, "num_input_tokens_seen": 69405120, "step": 32160 }, { "epoch": 5.247145187601958, "grad_norm": 0.03926697000861168, "learning_rate": 4.609185852486493e-05, "loss": 0.0664, "num_input_tokens_seen": 69416640, "step": 32165 }, { "epoch": 5.2479608482871125, "grad_norm": 0.5249419212341309, "learning_rate": 4.608994764791445e-05, "loss": 0.177, "num_input_tokens_seen": 69426912, "step": 32170 }, { "epoch": 5.248776508972267, "grad_norm": 0.05823593959212303, "learning_rate": 4.608803634354942e-05, "loss": 0.1924, "num_input_tokens_seen": 69436512, "step": 32175 }, { "epoch": 5.249592169657422, "grad_norm": 0.7618659138679504, "learning_rate": 4.608612461180859e-05, "loss": 0.3761, "num_input_tokens_seen": 69447424, "step": 32180 }, { "epoch": 5.250407830342578, "grad_norm": 0.6812909841537476, "learning_rate": 4.608421245273069e-05, "loss": 0.2027, "num_input_tokens_seen": 69459136, "step": 32185 }, { "epoch": 5.251223491027733, "grad_norm": 1.2513737678527832, "learning_rate": 4.6082299866354475e-05, "loss": 0.1771, "num_input_tokens_seen": 69470048, "step": 32190 }, { "epoch": 5.2520391517128875, "grad_norm": 0.5813425183296204, "learning_rate": 4.6080386852718715e-05, "loss": 0.1658, "num_input_tokens_seen": 69480992, "step": 32195 }, { "epoch": 5.252854812398042, "grad_norm": 2.825079917907715, "learning_rate": 4.607847341186216e-05, "loss": 0.1741, "num_input_tokens_seen": 69491936, "step": 32200 }, { "epoch": 5.253670473083197, "grad_norm": 0.8648921251296997, "learning_rate": 4.607655954382362e-05, "loss": 0.2264, "num_input_tokens_seen": 69500896, "step": 32205 }, { "epoch": 5.254486133768353, "grad_norm": 0.6059995889663696, "learning_rate": 4.607464524864185e-05, "loss": 0.0966, "num_input_tokens_seen": 69509888, "step": 32210 }, { "epoch": 5.255301794453508, "grad_norm": 0.2146986573934555, "learning_rate": 4.6072730526355664e-05, "loss": 0.0833, "num_input_tokens_seen": 69519840, "step": 32215 }, { "epoch": 5.2561174551386625, "grad_norm": 1.572229266166687, "learning_rate": 4.607081537700386e-05, "loss": 0.1042, "num_input_tokens_seen": 69530400, "step": 32220 }, { "epoch": 5.256933115823817, "grad_norm": 0.04848196357488632, "learning_rate": 4.606889980062525e-05, "loss": 0.0652, "num_input_tokens_seen": 69540992, "step": 32225 }, { "epoch": 5.257748776508972, "grad_norm": 0.4287551939487457, "learning_rate": 4.606698379725866e-05, "loss": 0.1492, "num_input_tokens_seen": 69552704, "step": 32230 }, { "epoch": 5.258564437194127, "grad_norm": 1.0210028886795044, "learning_rate": 4.606506736694292e-05, "loss": 0.0623, "num_input_tokens_seen": 69563648, "step": 32235 }, { "epoch": 5.259380097879283, "grad_norm": 1.957819938659668, "learning_rate": 4.606315050971687e-05, "loss": 0.1279, "num_input_tokens_seen": 69574880, "step": 32240 }, { "epoch": 5.260195758564437, "grad_norm": 0.9641340374946594, "learning_rate": 4.606123322561935e-05, "loss": 0.2422, "num_input_tokens_seen": 69586496, "step": 32245 }, { "epoch": 5.261011419249592, "grad_norm": 1.2452071905136108, "learning_rate": 4.605931551468922e-05, "loss": 0.1278, "num_input_tokens_seen": 69596704, "step": 32250 }, { "epoch": 5.261827079934747, "grad_norm": 2.0749528408050537, "learning_rate": 4.605739737696534e-05, "loss": 0.2215, "num_input_tokens_seen": 69606912, "step": 32255 }, { "epoch": 5.262642740619902, "grad_norm": 1.2287415266036987, "learning_rate": 4.60554788124866e-05, "loss": 0.3052, "num_input_tokens_seen": 69618880, "step": 32260 }, { "epoch": 5.263458401305057, "grad_norm": 0.8770024180412292, "learning_rate": 4.605355982129186e-05, "loss": 0.0883, "num_input_tokens_seen": 69629440, "step": 32265 }, { "epoch": 5.264274061990212, "grad_norm": 0.25182172656059265, "learning_rate": 4.605164040342003e-05, "loss": 0.1002, "num_input_tokens_seen": 69640128, "step": 32270 }, { "epoch": 5.265089722675367, "grad_norm": 0.8296898007392883, "learning_rate": 4.6049720558909996e-05, "loss": 0.1314, "num_input_tokens_seen": 69650816, "step": 32275 }, { "epoch": 5.265905383360522, "grad_norm": 0.27156245708465576, "learning_rate": 4.604780028780068e-05, "loss": 0.1171, "num_input_tokens_seen": 69661376, "step": 32280 }, { "epoch": 5.266721044045677, "grad_norm": 1.2693016529083252, "learning_rate": 4.604587959013098e-05, "loss": 0.0884, "num_input_tokens_seen": 69671200, "step": 32285 }, { "epoch": 5.267536704730832, "grad_norm": 1.0805584192276, "learning_rate": 4.6043958465939834e-05, "loss": 0.2077, "num_input_tokens_seen": 69682048, "step": 32290 }, { "epoch": 5.268352365415987, "grad_norm": 0.14630310237407684, "learning_rate": 4.604203691526618e-05, "loss": 0.0736, "num_input_tokens_seen": 69694208, "step": 32295 }, { "epoch": 5.269168026101142, "grad_norm": 0.2230111062526703, "learning_rate": 4.604011493814895e-05, "loss": 0.0869, "num_input_tokens_seen": 69702848, "step": 32300 }, { "epoch": 5.269983686786297, "grad_norm": 0.3902955949306488, "learning_rate": 4.603819253462709e-05, "loss": 0.0527, "num_input_tokens_seen": 69714080, "step": 32305 }, { "epoch": 5.270799347471452, "grad_norm": 0.054823510348796844, "learning_rate": 4.6036269704739585e-05, "loss": 0.0411, "num_input_tokens_seen": 69725408, "step": 32310 }, { "epoch": 5.271615008156607, "grad_norm": 0.4309535324573517, "learning_rate": 4.603434644852538e-05, "loss": 0.0336, "num_input_tokens_seen": 69736704, "step": 32315 }, { "epoch": 5.2724306688417615, "grad_norm": 0.4212989807128906, "learning_rate": 4.6032422766023465e-05, "loss": 0.0575, "num_input_tokens_seen": 69747264, "step": 32320 }, { "epoch": 5.273246329526917, "grad_norm": 0.8761297464370728, "learning_rate": 4.6030498657272814e-05, "loss": 0.0852, "num_input_tokens_seen": 69758592, "step": 32325 }, { "epoch": 5.274061990212072, "grad_norm": 0.46779245138168335, "learning_rate": 4.6028574122312436e-05, "loss": 0.0783, "num_input_tokens_seen": 69769088, "step": 32330 }, { "epoch": 5.274877650897227, "grad_norm": 0.5010861754417419, "learning_rate": 4.602664916118132e-05, "loss": 0.1178, "num_input_tokens_seen": 69780064, "step": 32335 }, { "epoch": 5.275693311582382, "grad_norm": 0.20119120180606842, "learning_rate": 4.602472377391849e-05, "loss": 0.1427, "num_input_tokens_seen": 69792064, "step": 32340 }, { "epoch": 5.2765089722675365, "grad_norm": 0.8759371638298035, "learning_rate": 4.6022797960562956e-05, "loss": 0.2365, "num_input_tokens_seen": 69802304, "step": 32345 }, { "epoch": 5.277324632952691, "grad_norm": 0.20244988799095154, "learning_rate": 4.602087172115376e-05, "loss": 0.0613, "num_input_tokens_seen": 69811104, "step": 32350 }, { "epoch": 5.278140293637847, "grad_norm": 0.0783318355679512, "learning_rate": 4.601894505572992e-05, "loss": 0.0572, "num_input_tokens_seen": 69823072, "step": 32355 }, { "epoch": 5.278955954323002, "grad_norm": 2.036367654800415, "learning_rate": 4.60170179643305e-05, "loss": 0.2364, "num_input_tokens_seen": 69835104, "step": 32360 }, { "epoch": 5.279771615008157, "grad_norm": 0.793684184551239, "learning_rate": 4.601509044699455e-05, "loss": 0.1267, "num_input_tokens_seen": 69846784, "step": 32365 }, { "epoch": 5.280587275693311, "grad_norm": 0.41909584403038025, "learning_rate": 4.6013162503761134e-05, "loss": 0.0686, "num_input_tokens_seen": 69857024, "step": 32370 }, { "epoch": 5.281402936378466, "grad_norm": 0.20173941552639008, "learning_rate": 4.6011234134669325e-05, "loss": 0.0223, "num_input_tokens_seen": 69866624, "step": 32375 }, { "epoch": 5.282218597063622, "grad_norm": 0.9229574799537659, "learning_rate": 4.600930533975819e-05, "loss": 0.1339, "num_input_tokens_seen": 69878816, "step": 32380 }, { "epoch": 5.283034257748777, "grad_norm": 0.19125568866729736, "learning_rate": 4.600737611906684e-05, "loss": 0.1142, "num_input_tokens_seen": 69889184, "step": 32385 }, { "epoch": 5.283849918433932, "grad_norm": 0.38573330640792847, "learning_rate": 4.600544647263436e-05, "loss": 0.2002, "num_input_tokens_seen": 69900160, "step": 32390 }, { "epoch": 5.284665579119086, "grad_norm": 0.03366551175713539, "learning_rate": 4.6003516400499856e-05, "loss": 0.1263, "num_input_tokens_seen": 69910848, "step": 32395 }, { "epoch": 5.285481239804241, "grad_norm": 0.08915787935256958, "learning_rate": 4.600158590270246e-05, "loss": 0.0246, "num_input_tokens_seen": 69921856, "step": 32400 }, { "epoch": 5.286296900489396, "grad_norm": 0.15938518941402435, "learning_rate": 4.599965497928127e-05, "loss": 0.0576, "num_input_tokens_seen": 69932768, "step": 32405 }, { "epoch": 5.287112561174552, "grad_norm": 1.2985330820083618, "learning_rate": 4.599772363027544e-05, "loss": 0.1535, "num_input_tokens_seen": 69943488, "step": 32410 }, { "epoch": 5.287928221859707, "grad_norm": 1.9683079719543457, "learning_rate": 4.59957918557241e-05, "loss": 0.1025, "num_input_tokens_seen": 69953312, "step": 32415 }, { "epoch": 5.288743882544861, "grad_norm": 0.8915193676948547, "learning_rate": 4.599385965566641e-05, "loss": 0.1137, "num_input_tokens_seen": 69963072, "step": 32420 }, { "epoch": 5.289559543230016, "grad_norm": 0.10550878942012787, "learning_rate": 4.599192703014151e-05, "loss": 0.1066, "num_input_tokens_seen": 69973600, "step": 32425 }, { "epoch": 5.290375203915171, "grad_norm": 0.10494202375411987, "learning_rate": 4.5989993979188586e-05, "loss": 0.1054, "num_input_tokens_seen": 69984384, "step": 32430 }, { "epoch": 5.291190864600326, "grad_norm": 0.23994506895542145, "learning_rate": 4.59880605028468e-05, "loss": 0.0709, "num_input_tokens_seen": 69995616, "step": 32435 }, { "epoch": 5.2920065252854815, "grad_norm": 1.382054328918457, "learning_rate": 4.5986126601155344e-05, "loss": 0.1569, "num_input_tokens_seen": 70006432, "step": 32440 }, { "epoch": 5.292822185970636, "grad_norm": 1.5500844717025757, "learning_rate": 4.598419227415341e-05, "loss": 0.2281, "num_input_tokens_seen": 70017120, "step": 32445 }, { "epoch": 5.293637846655791, "grad_norm": 0.518547773361206, "learning_rate": 4.59822575218802e-05, "loss": 0.1374, "num_input_tokens_seen": 70027744, "step": 32450 }, { "epoch": 5.294453507340946, "grad_norm": 1.1645076274871826, "learning_rate": 4.5980322344374924e-05, "loss": 0.1349, "num_input_tokens_seen": 70039360, "step": 32455 }, { "epoch": 5.295269168026101, "grad_norm": 0.06157531961798668, "learning_rate": 4.59783867416768e-05, "loss": 0.234, "num_input_tokens_seen": 70049952, "step": 32460 }, { "epoch": 5.2960848287112565, "grad_norm": 0.2365003377199173, "learning_rate": 4.597645071382506e-05, "loss": 0.1585, "num_input_tokens_seen": 70059904, "step": 32465 }, { "epoch": 5.296900489396411, "grad_norm": 0.23248501121997833, "learning_rate": 4.597451426085894e-05, "loss": 0.229, "num_input_tokens_seen": 70070464, "step": 32470 }, { "epoch": 5.297716150081566, "grad_norm": 0.8768717050552368, "learning_rate": 4.597257738281766e-05, "loss": 0.0637, "num_input_tokens_seen": 70081472, "step": 32475 }, { "epoch": 5.298531810766721, "grad_norm": 0.3234686553478241, "learning_rate": 4.5970640079740514e-05, "loss": 0.0866, "num_input_tokens_seen": 70091360, "step": 32480 }, { "epoch": 5.299347471451876, "grad_norm": 0.7453970909118652, "learning_rate": 4.596870235166674e-05, "loss": 0.1127, "num_input_tokens_seen": 70101088, "step": 32485 }, { "epoch": 5.300163132137031, "grad_norm": 0.4877297282218933, "learning_rate": 4.5966764198635606e-05, "loss": 0.0714, "num_input_tokens_seen": 70112608, "step": 32490 }, { "epoch": 5.300978792822186, "grad_norm": 0.03640981763601303, "learning_rate": 4.59648256206864e-05, "loss": 0.0366, "num_input_tokens_seen": 70124128, "step": 32495 }, { "epoch": 5.301794453507341, "grad_norm": 2.4926044940948486, "learning_rate": 4.59628866178584e-05, "loss": 0.3055, "num_input_tokens_seen": 70136736, "step": 32500 }, { "epoch": 5.302610114192496, "grad_norm": 0.2815510630607605, "learning_rate": 4.596094719019092e-05, "loss": 0.1125, "num_input_tokens_seen": 70148768, "step": 32505 }, { "epoch": 5.303425774877651, "grad_norm": 1.2572343349456787, "learning_rate": 4.595900733772325e-05, "loss": 0.1721, "num_input_tokens_seen": 70158592, "step": 32510 }, { "epoch": 5.304241435562806, "grad_norm": 0.09217408299446106, "learning_rate": 4.5957067060494704e-05, "loss": 0.0931, "num_input_tokens_seen": 70169856, "step": 32515 }, { "epoch": 5.30505709624796, "grad_norm": 0.5692400336265564, "learning_rate": 4.5955126358544616e-05, "loss": 0.0305, "num_input_tokens_seen": 70179680, "step": 32520 }, { "epoch": 5.305872756933116, "grad_norm": 0.42818009853363037, "learning_rate": 4.5953185231912306e-05, "loss": 0.1825, "num_input_tokens_seen": 70191104, "step": 32525 }, { "epoch": 5.306688417618271, "grad_norm": 0.12133246660232544, "learning_rate": 4.595124368063711e-05, "loss": 0.1475, "num_input_tokens_seen": 70201728, "step": 32530 }, { "epoch": 5.307504078303426, "grad_norm": 0.2854387164115906, "learning_rate": 4.5949301704758395e-05, "loss": 0.0236, "num_input_tokens_seen": 70213088, "step": 32535 }, { "epoch": 5.308319738988581, "grad_norm": 0.38481709361076355, "learning_rate": 4.594735930431549e-05, "loss": 0.0393, "num_input_tokens_seen": 70223648, "step": 32540 }, { "epoch": 5.309135399673735, "grad_norm": 0.6157909035682678, "learning_rate": 4.594541647934779e-05, "loss": 0.1289, "num_input_tokens_seen": 70234272, "step": 32545 }, { "epoch": 5.309951060358891, "grad_norm": 0.38039010763168335, "learning_rate": 4.594347322989464e-05, "loss": 0.0973, "num_input_tokens_seen": 70244800, "step": 32550 }, { "epoch": 5.310766721044046, "grad_norm": 0.06643366068601608, "learning_rate": 4.594152955599544e-05, "loss": 0.1421, "num_input_tokens_seen": 70255360, "step": 32555 }, { "epoch": 5.311582381729201, "grad_norm": 0.09141223132610321, "learning_rate": 4.5939585457689586e-05, "loss": 0.104, "num_input_tokens_seen": 70265888, "step": 32560 }, { "epoch": 5.3123980424143555, "grad_norm": 0.12939143180847168, "learning_rate": 4.593764093501647e-05, "loss": 0.1897, "num_input_tokens_seen": 70276544, "step": 32565 }, { "epoch": 5.31321370309951, "grad_norm": 0.14271140098571777, "learning_rate": 4.5935695988015484e-05, "loss": 0.0718, "num_input_tokens_seen": 70287904, "step": 32570 }, { "epoch": 5.314029363784665, "grad_norm": 0.2972182631492615, "learning_rate": 4.593375061672607e-05, "loss": 0.0771, "num_input_tokens_seen": 70299872, "step": 32575 }, { "epoch": 5.314845024469821, "grad_norm": 0.09219971299171448, "learning_rate": 4.593180482118764e-05, "loss": 0.1118, "num_input_tokens_seen": 70310624, "step": 32580 }, { "epoch": 5.315660685154976, "grad_norm": 0.25106140971183777, "learning_rate": 4.5929858601439634e-05, "loss": 0.038, "num_input_tokens_seen": 70321312, "step": 32585 }, { "epoch": 5.3164763458401305, "grad_norm": 0.10811485350131989, "learning_rate": 4.59279119575215e-05, "loss": 0.0587, "num_input_tokens_seen": 70332384, "step": 32590 }, { "epoch": 5.317292006525285, "grad_norm": 0.7063263654708862, "learning_rate": 4.592596488947267e-05, "loss": 0.1389, "num_input_tokens_seen": 70343232, "step": 32595 }, { "epoch": 5.31810766721044, "grad_norm": 1.5815714597702026, "learning_rate": 4.592401739733261e-05, "loss": 0.3436, "num_input_tokens_seen": 70355104, "step": 32600 }, { "epoch": 5.318923327895595, "grad_norm": 1.9801204204559326, "learning_rate": 4.59220694811408e-05, "loss": 0.2042, "num_input_tokens_seen": 70366400, "step": 32605 }, { "epoch": 5.319738988580751, "grad_norm": 1.0619800090789795, "learning_rate": 4.592012114093671e-05, "loss": 0.1627, "num_input_tokens_seen": 70378400, "step": 32610 }, { "epoch": 5.3205546492659055, "grad_norm": 0.2695389986038208, "learning_rate": 4.5918172376759835e-05, "loss": 0.0555, "num_input_tokens_seen": 70389952, "step": 32615 }, { "epoch": 5.32137030995106, "grad_norm": 0.3891667127609253, "learning_rate": 4.5916223188649656e-05, "loss": 0.0914, "num_input_tokens_seen": 70401408, "step": 32620 }, { "epoch": 5.322185970636215, "grad_norm": 0.16043990850448608, "learning_rate": 4.591427357664567e-05, "loss": 0.1429, "num_input_tokens_seen": 70412640, "step": 32625 }, { "epoch": 5.32300163132137, "grad_norm": 0.4916357696056366, "learning_rate": 4.5912323540787406e-05, "loss": 0.1009, "num_input_tokens_seen": 70422592, "step": 32630 }, { "epoch": 5.323817292006526, "grad_norm": 0.2769436538219452, "learning_rate": 4.5910373081114375e-05, "loss": 0.1309, "num_input_tokens_seen": 70433856, "step": 32635 }, { "epoch": 5.3246329526916805, "grad_norm": 2.072984457015991, "learning_rate": 4.590842219766611e-05, "loss": 0.1993, "num_input_tokens_seen": 70445376, "step": 32640 }, { "epoch": 5.325448613376835, "grad_norm": 1.876871109008789, "learning_rate": 4.590647089048214e-05, "loss": 0.2303, "num_input_tokens_seen": 70456384, "step": 32645 }, { "epoch": 5.32626427406199, "grad_norm": 0.14265695214271545, "learning_rate": 4.590451915960202e-05, "loss": 0.1831, "num_input_tokens_seen": 70467040, "step": 32650 }, { "epoch": 5.327079934747145, "grad_norm": 0.3428408205509186, "learning_rate": 4.59025670050653e-05, "loss": 0.1331, "num_input_tokens_seen": 70477248, "step": 32655 }, { "epoch": 5.327895595432301, "grad_norm": 0.12626925110816956, "learning_rate": 4.5900614426911535e-05, "loss": 0.2351, "num_input_tokens_seen": 70487648, "step": 32660 }, { "epoch": 5.328711256117455, "grad_norm": 0.5905224680900574, "learning_rate": 4.589866142518031e-05, "loss": 0.1804, "num_input_tokens_seen": 70499296, "step": 32665 }, { "epoch": 5.32952691680261, "grad_norm": 0.9747646450996399, "learning_rate": 4.5896707999911196e-05, "loss": 0.111, "num_input_tokens_seen": 70509664, "step": 32670 }, { "epoch": 5.330342577487765, "grad_norm": 0.22925004363059998, "learning_rate": 4.5894754151143794e-05, "loss": 0.0517, "num_input_tokens_seen": 70520064, "step": 32675 }, { "epoch": 5.33115823817292, "grad_norm": 1.2465779781341553, "learning_rate": 4.589279987891768e-05, "loss": 0.1503, "num_input_tokens_seen": 70531104, "step": 32680 }, { "epoch": 5.331973898858075, "grad_norm": 0.06766360253095627, "learning_rate": 4.589084518327248e-05, "loss": 0.0955, "num_input_tokens_seen": 70542528, "step": 32685 }, { "epoch": 5.33278955954323, "grad_norm": 0.9504028558731079, "learning_rate": 4.58888900642478e-05, "loss": 0.1266, "num_input_tokens_seen": 70553216, "step": 32690 }, { "epoch": 5.333605220228385, "grad_norm": 1.8089240789413452, "learning_rate": 4.588693452188326e-05, "loss": 0.2325, "num_input_tokens_seen": 70565504, "step": 32695 }, { "epoch": 5.33442088091354, "grad_norm": 0.6835708618164062, "learning_rate": 4.58849785562185e-05, "loss": 0.1289, "num_input_tokens_seen": 70577376, "step": 32700 }, { "epoch": 5.335236541598695, "grad_norm": 0.5063592791557312, "learning_rate": 4.5883022167293155e-05, "loss": 0.1661, "num_input_tokens_seen": 70588960, "step": 32705 }, { "epoch": 5.33605220228385, "grad_norm": 0.7412596940994263, "learning_rate": 4.588106535514687e-05, "loss": 0.0713, "num_input_tokens_seen": 70600192, "step": 32710 }, { "epoch": 5.3368678629690045, "grad_norm": 0.09792690724134445, "learning_rate": 4.58791081198193e-05, "loss": 0.1054, "num_input_tokens_seen": 70610816, "step": 32715 }, { "epoch": 5.33768352365416, "grad_norm": 1.306010365486145, "learning_rate": 4.587715046135013e-05, "loss": 0.3111, "num_input_tokens_seen": 70621280, "step": 32720 }, { "epoch": 5.338499184339315, "grad_norm": 0.7440460920333862, "learning_rate": 4.587519237977902e-05, "loss": 0.1239, "num_input_tokens_seen": 70631968, "step": 32725 }, { "epoch": 5.33931484502447, "grad_norm": 0.08963080495595932, "learning_rate": 4.587323387514565e-05, "loss": 0.1264, "num_input_tokens_seen": 70643296, "step": 32730 }, { "epoch": 5.340130505709625, "grad_norm": 0.35768815875053406, "learning_rate": 4.587127494748971e-05, "loss": 0.0487, "num_input_tokens_seen": 70652576, "step": 32735 }, { "epoch": 5.3409461663947795, "grad_norm": 0.22243738174438477, "learning_rate": 4.5869315596850917e-05, "loss": 0.0774, "num_input_tokens_seen": 70663424, "step": 32740 }, { "epoch": 5.341761827079935, "grad_norm": 1.4603707790374756, "learning_rate": 4.586735582326896e-05, "loss": 0.1572, "num_input_tokens_seen": 70673536, "step": 32745 }, { "epoch": 5.34257748776509, "grad_norm": 1.2817124128341675, "learning_rate": 4.586539562678357e-05, "loss": 0.122, "num_input_tokens_seen": 70683712, "step": 32750 }, { "epoch": 5.343393148450245, "grad_norm": 1.3942816257476807, "learning_rate": 4.5863435007434464e-05, "loss": 0.1727, "num_input_tokens_seen": 70693856, "step": 32755 }, { "epoch": 5.3442088091354, "grad_norm": 0.40611889958381653, "learning_rate": 4.586147396526138e-05, "loss": 0.0718, "num_input_tokens_seen": 70705024, "step": 32760 }, { "epoch": 5.3450244698205545, "grad_norm": 0.41552993655204773, "learning_rate": 4.585951250030407e-05, "loss": 0.0488, "num_input_tokens_seen": 70715040, "step": 32765 }, { "epoch": 5.345840130505709, "grad_norm": 1.2964589595794678, "learning_rate": 4.585755061260227e-05, "loss": 0.0643, "num_input_tokens_seen": 70726080, "step": 32770 }, { "epoch": 5.346655791190865, "grad_norm": 0.21601705253124237, "learning_rate": 4.585558830219575e-05, "loss": 0.0748, "num_input_tokens_seen": 70736800, "step": 32775 }, { "epoch": 5.34747145187602, "grad_norm": 1.947084665298462, "learning_rate": 4.5853625569124276e-05, "loss": 0.1107, "num_input_tokens_seen": 70748928, "step": 32780 }, { "epoch": 5.348287112561175, "grad_norm": 0.532584547996521, "learning_rate": 4.585166241342762e-05, "loss": 0.152, "num_input_tokens_seen": 70760736, "step": 32785 }, { "epoch": 5.349102773246329, "grad_norm": 1.1316721439361572, "learning_rate": 4.584969883514558e-05, "loss": 0.1221, "num_input_tokens_seen": 70771552, "step": 32790 }, { "epoch": 5.349918433931484, "grad_norm": 0.7541897892951965, "learning_rate": 4.584773483431794e-05, "loss": 0.0877, "num_input_tokens_seen": 70782656, "step": 32795 }, { "epoch": 5.350734094616639, "grad_norm": 1.1162041425704956, "learning_rate": 4.5845770410984505e-05, "loss": 0.1361, "num_input_tokens_seen": 70792736, "step": 32800 }, { "epoch": 5.351549755301795, "grad_norm": 0.26961249113082886, "learning_rate": 4.5843805565185096e-05, "loss": 0.1097, "num_input_tokens_seen": 70804032, "step": 32805 }, { "epoch": 5.35236541598695, "grad_norm": 0.9772354364395142, "learning_rate": 4.5841840296959514e-05, "loss": 0.0667, "num_input_tokens_seen": 70814208, "step": 32810 }, { "epoch": 5.353181076672104, "grad_norm": 1.1060516834259033, "learning_rate": 4.58398746063476e-05, "loss": 0.1851, "num_input_tokens_seen": 70825312, "step": 32815 }, { "epoch": 5.353996737357259, "grad_norm": 0.9207391142845154, "learning_rate": 4.583790849338919e-05, "loss": 0.1249, "num_input_tokens_seen": 70835680, "step": 32820 }, { "epoch": 5.354812398042414, "grad_norm": 1.196632742881775, "learning_rate": 4.583594195812414e-05, "loss": 0.1355, "num_input_tokens_seen": 70846336, "step": 32825 }, { "epoch": 5.35562805872757, "grad_norm": 0.7450544834136963, "learning_rate": 4.5833975000592286e-05, "loss": 0.1009, "num_input_tokens_seen": 70858016, "step": 32830 }, { "epoch": 5.356443719412725, "grad_norm": 1.0646940469741821, "learning_rate": 4.583200762083349e-05, "loss": 0.2527, "num_input_tokens_seen": 70868704, "step": 32835 }, { "epoch": 5.357259380097879, "grad_norm": 0.18351192772388458, "learning_rate": 4.583003981888765e-05, "loss": 0.1115, "num_input_tokens_seen": 70879392, "step": 32840 }, { "epoch": 5.358075040783034, "grad_norm": 1.187056064605713, "learning_rate": 4.5828071594794616e-05, "loss": 0.1946, "num_input_tokens_seen": 70890848, "step": 32845 }, { "epoch": 5.358890701468189, "grad_norm": 1.155763864517212, "learning_rate": 4.5826102948594295e-05, "loss": 0.0975, "num_input_tokens_seen": 70901472, "step": 32850 }, { "epoch": 5.359706362153344, "grad_norm": 1.5897570848464966, "learning_rate": 4.582413388032658e-05, "loss": 0.2126, "num_input_tokens_seen": 70911904, "step": 32855 }, { "epoch": 5.3605220228384995, "grad_norm": 0.22027142345905304, "learning_rate": 4.582216439003137e-05, "loss": 0.0381, "num_input_tokens_seen": 70922880, "step": 32860 }, { "epoch": 5.361337683523654, "grad_norm": 0.13092216849327087, "learning_rate": 4.582019447774858e-05, "loss": 0.0776, "num_input_tokens_seen": 70933472, "step": 32865 }, { "epoch": 5.362153344208809, "grad_norm": 0.07306994497776031, "learning_rate": 4.581822414351814e-05, "loss": 0.1158, "num_input_tokens_seen": 70944864, "step": 32870 }, { "epoch": 5.362969004893964, "grad_norm": 1.4754023551940918, "learning_rate": 4.581625338737998e-05, "loss": 0.0908, "num_input_tokens_seen": 70956512, "step": 32875 }, { "epoch": 5.363784665579119, "grad_norm": 0.2457178384065628, "learning_rate": 4.581428220937404e-05, "loss": 0.0317, "num_input_tokens_seen": 70966400, "step": 32880 }, { "epoch": 5.364600326264274, "grad_norm": 1.7422034740447998, "learning_rate": 4.581231060954026e-05, "loss": 0.0892, "num_input_tokens_seen": 70977120, "step": 32885 }, { "epoch": 5.365415986949429, "grad_norm": 0.9353575706481934, "learning_rate": 4.581033858791861e-05, "loss": 0.1489, "num_input_tokens_seen": 70987712, "step": 32890 }, { "epoch": 5.366231647634584, "grad_norm": 0.7240558862686157, "learning_rate": 4.580836614454904e-05, "loss": 0.0648, "num_input_tokens_seen": 70998080, "step": 32895 }, { "epoch": 5.367047308319739, "grad_norm": 0.6164636015892029, "learning_rate": 4.580639327947154e-05, "loss": 0.1844, "num_input_tokens_seen": 71009568, "step": 32900 }, { "epoch": 5.367862969004894, "grad_norm": 1.6957217454910278, "learning_rate": 4.580441999272608e-05, "loss": 0.2134, "num_input_tokens_seen": 71021728, "step": 32905 }, { "epoch": 5.368678629690049, "grad_norm": 0.3290434181690216, "learning_rate": 4.580244628435266e-05, "loss": 0.0579, "num_input_tokens_seen": 71032704, "step": 32910 }, { "epoch": 5.369494290375204, "grad_norm": 0.179179385304451, "learning_rate": 4.580047215439128e-05, "loss": 0.0433, "num_input_tokens_seen": 71043808, "step": 32915 }, { "epoch": 5.370309951060359, "grad_norm": 0.1822265237569809, "learning_rate": 4.579849760288194e-05, "loss": 0.1575, "num_input_tokens_seen": 71054752, "step": 32920 }, { "epoch": 5.371125611745514, "grad_norm": 0.5191752314567566, "learning_rate": 4.579652262986466e-05, "loss": 0.1485, "num_input_tokens_seen": 71065216, "step": 32925 }, { "epoch": 5.371941272430669, "grad_norm": 0.2003435492515564, "learning_rate": 4.579454723537947e-05, "loss": 0.1774, "num_input_tokens_seen": 71074752, "step": 32930 }, { "epoch": 5.372756933115824, "grad_norm": 0.06256669759750366, "learning_rate": 4.5792571419466396e-05, "loss": 0.0984, "num_input_tokens_seen": 71084864, "step": 32935 }, { "epoch": 5.373572593800978, "grad_norm": 0.6163271069526672, "learning_rate": 4.579059518216549e-05, "loss": 0.1617, "num_input_tokens_seen": 71095392, "step": 32940 }, { "epoch": 5.374388254486134, "grad_norm": 0.070979543030262, "learning_rate": 4.578861852351679e-05, "loss": 0.134, "num_input_tokens_seen": 71105472, "step": 32945 }, { "epoch": 5.375203915171289, "grad_norm": 0.12421977519989014, "learning_rate": 4.5786641443560375e-05, "loss": 0.1857, "num_input_tokens_seen": 71116448, "step": 32950 }, { "epoch": 5.376019575856444, "grad_norm": 0.30123305320739746, "learning_rate": 4.578466394233629e-05, "loss": 0.0481, "num_input_tokens_seen": 71128032, "step": 32955 }, { "epoch": 5.376835236541599, "grad_norm": 0.19324499368667603, "learning_rate": 4.5782686019884633e-05, "loss": 0.0774, "num_input_tokens_seen": 71140448, "step": 32960 }, { "epoch": 5.377650897226753, "grad_norm": 0.7906779050827026, "learning_rate": 4.578070767624547e-05, "loss": 0.0413, "num_input_tokens_seen": 71150720, "step": 32965 }, { "epoch": 5.378466557911908, "grad_norm": 0.281894326210022, "learning_rate": 4.577872891145891e-05, "loss": 0.1687, "num_input_tokens_seen": 71161920, "step": 32970 }, { "epoch": 5.379282218597064, "grad_norm": 0.3507463335990906, "learning_rate": 4.577674972556505e-05, "loss": 0.0566, "num_input_tokens_seen": 71173888, "step": 32975 }, { "epoch": 5.380097879282219, "grad_norm": 0.06012823060154915, "learning_rate": 4.577477011860399e-05, "loss": 0.0392, "num_input_tokens_seen": 71185984, "step": 32980 }, { "epoch": 5.3809135399673735, "grad_norm": 0.29690131545066833, "learning_rate": 4.577279009061587e-05, "loss": 0.0357, "num_input_tokens_seen": 71196640, "step": 32985 }, { "epoch": 5.381729200652528, "grad_norm": 0.08547184616327286, "learning_rate": 4.57708096416408e-05, "loss": 0.0181, "num_input_tokens_seen": 71208000, "step": 32990 }, { "epoch": 5.382544861337683, "grad_norm": 0.2839559018611908, "learning_rate": 4.576882877171893e-05, "loss": 0.0532, "num_input_tokens_seen": 71219616, "step": 32995 }, { "epoch": 5.383360522022839, "grad_norm": 0.6396917104721069, "learning_rate": 4.576684748089039e-05, "loss": 0.1372, "num_input_tokens_seen": 71230880, "step": 33000 }, { "epoch": 5.384176182707994, "grad_norm": 1.8767855167388916, "learning_rate": 4.576486576919534e-05, "loss": 0.2232, "num_input_tokens_seen": 71240800, "step": 33005 }, { "epoch": 5.3849918433931485, "grad_norm": 0.25033560395240784, "learning_rate": 4.576288363667395e-05, "loss": 0.1208, "num_input_tokens_seen": 71250784, "step": 33010 }, { "epoch": 5.385807504078303, "grad_norm": 0.1981716901063919, "learning_rate": 4.576090108336638e-05, "loss": 0.0753, "num_input_tokens_seen": 71260576, "step": 33015 }, { "epoch": 5.386623164763458, "grad_norm": 1.2269384860992432, "learning_rate": 4.575891810931282e-05, "loss": 0.055, "num_input_tokens_seen": 71270976, "step": 33020 }, { "epoch": 5.387438825448613, "grad_norm": 0.09527872502803802, "learning_rate": 4.575693471455344e-05, "loss": 0.1663, "num_input_tokens_seen": 71281216, "step": 33025 }, { "epoch": 5.388254486133769, "grad_norm": 0.49739551544189453, "learning_rate": 4.575495089912844e-05, "loss": 0.1593, "num_input_tokens_seen": 71292064, "step": 33030 }, { "epoch": 5.3890701468189235, "grad_norm": 0.4087796211242676, "learning_rate": 4.575296666307805e-05, "loss": 0.1546, "num_input_tokens_seen": 71303392, "step": 33035 }, { "epoch": 5.389885807504078, "grad_norm": 0.04409776255488396, "learning_rate": 4.575098200644246e-05, "loss": 0.0601, "num_input_tokens_seen": 71314016, "step": 33040 }, { "epoch": 5.390701468189233, "grad_norm": 0.2661793529987335, "learning_rate": 4.574899692926188e-05, "loss": 0.1447, "num_input_tokens_seen": 71326016, "step": 33045 }, { "epoch": 5.391517128874388, "grad_norm": 0.5142858624458313, "learning_rate": 4.574701143157657e-05, "loss": 0.1403, "num_input_tokens_seen": 71335840, "step": 33050 }, { "epoch": 5.392332789559543, "grad_norm": 0.1560516208410263, "learning_rate": 4.574502551342674e-05, "loss": 0.1183, "num_input_tokens_seen": 71347072, "step": 33055 }, { "epoch": 5.3931484502446985, "grad_norm": 0.9562426805496216, "learning_rate": 4.5743039174852665e-05, "loss": 0.0644, "num_input_tokens_seen": 71358464, "step": 33060 }, { "epoch": 5.393964110929853, "grad_norm": 0.6424078941345215, "learning_rate": 4.5741052415894583e-05, "loss": 0.0832, "num_input_tokens_seen": 71367776, "step": 33065 }, { "epoch": 5.394779771615008, "grad_norm": 0.49912625551223755, "learning_rate": 4.5739065236592756e-05, "loss": 0.0358, "num_input_tokens_seen": 71378368, "step": 33070 }, { "epoch": 5.395595432300163, "grad_norm": 1.8603971004486084, "learning_rate": 4.573707763698746e-05, "loss": 0.1555, "num_input_tokens_seen": 71389632, "step": 33075 }, { "epoch": 5.396411092985318, "grad_norm": 0.2593100666999817, "learning_rate": 4.5735089617118985e-05, "loss": 0.0251, "num_input_tokens_seen": 71400064, "step": 33080 }, { "epoch": 5.397226753670473, "grad_norm": 0.4939213991165161, "learning_rate": 4.5733101177027616e-05, "loss": 0.073, "num_input_tokens_seen": 71410784, "step": 33085 }, { "epoch": 5.398042414355628, "grad_norm": 2.003925323486328, "learning_rate": 4.573111231675365e-05, "loss": 0.2388, "num_input_tokens_seen": 71421632, "step": 33090 }, { "epoch": 5.398858075040783, "grad_norm": 0.08147677779197693, "learning_rate": 4.5729123036337393e-05, "loss": 0.221, "num_input_tokens_seen": 71432640, "step": 33095 }, { "epoch": 5.399673735725938, "grad_norm": 0.41113948822021484, "learning_rate": 4.572713333581916e-05, "loss": 0.0281, "num_input_tokens_seen": 71442816, "step": 33100 }, { "epoch": 5.400489396411093, "grad_norm": 0.302716463804245, "learning_rate": 4.5725143215239276e-05, "loss": 0.0482, "num_input_tokens_seen": 71453568, "step": 33105 }, { "epoch": 5.401305057096248, "grad_norm": 0.12646648287773132, "learning_rate": 4.572315267463807e-05, "loss": 0.0941, "num_input_tokens_seen": 71464448, "step": 33110 }, { "epoch": 5.402120717781403, "grad_norm": 0.04884849488735199, "learning_rate": 4.5721161714055895e-05, "loss": 0.1087, "num_input_tokens_seen": 71475264, "step": 33115 }, { "epoch": 5.402936378466558, "grad_norm": 0.8798056244850159, "learning_rate": 4.571917033353308e-05, "loss": 0.1006, "num_input_tokens_seen": 71484704, "step": 33120 }, { "epoch": 5.403752039151713, "grad_norm": 1.1506115198135376, "learning_rate": 4.571717853311e-05, "loss": 0.1017, "num_input_tokens_seen": 71494592, "step": 33125 }, { "epoch": 5.404567699836868, "grad_norm": 3.2007436752319336, "learning_rate": 4.571518631282701e-05, "loss": 0.2856, "num_input_tokens_seen": 71505568, "step": 33130 }, { "epoch": 5.4053833605220225, "grad_norm": 0.02526087500154972, "learning_rate": 4.57131936727245e-05, "loss": 0.0918, "num_input_tokens_seen": 71515872, "step": 33135 }, { "epoch": 5.406199021207178, "grad_norm": 0.11164279282093048, "learning_rate": 4.571120061284283e-05, "loss": 0.1225, "num_input_tokens_seen": 71526720, "step": 33140 }, { "epoch": 5.407014681892333, "grad_norm": 0.0642004907131195, "learning_rate": 4.570920713322242e-05, "loss": 0.1141, "num_input_tokens_seen": 71537856, "step": 33145 }, { "epoch": 5.407830342577488, "grad_norm": 0.37494125962257385, "learning_rate": 4.5707213233903646e-05, "loss": 0.1119, "num_input_tokens_seen": 71546624, "step": 33150 }, { "epoch": 5.408646003262643, "grad_norm": 0.05271501466631889, "learning_rate": 4.570521891492693e-05, "loss": 0.2497, "num_input_tokens_seen": 71556864, "step": 33155 }, { "epoch": 5.4094616639477975, "grad_norm": 0.6962012052536011, "learning_rate": 4.570322417633269e-05, "loss": 0.1174, "num_input_tokens_seen": 71566848, "step": 33160 }, { "epoch": 5.410277324632952, "grad_norm": 0.9026600122451782, "learning_rate": 4.570122901816134e-05, "loss": 0.0872, "num_input_tokens_seen": 71576544, "step": 33165 }, { "epoch": 5.411092985318108, "grad_norm": 0.13831564784049988, "learning_rate": 4.5699233440453326e-05, "loss": 0.1592, "num_input_tokens_seen": 71586816, "step": 33170 }, { "epoch": 5.411908646003263, "grad_norm": 0.7482017874717712, "learning_rate": 4.569723744324909e-05, "loss": 0.0517, "num_input_tokens_seen": 71596480, "step": 33175 }, { "epoch": 5.412724306688418, "grad_norm": 0.2453700304031372, "learning_rate": 4.5695241026589084e-05, "loss": 0.0888, "num_input_tokens_seen": 71607968, "step": 33180 }, { "epoch": 5.4135399673735725, "grad_norm": 1.6795244216918945, "learning_rate": 4.5693244190513755e-05, "loss": 0.1688, "num_input_tokens_seen": 71618560, "step": 33185 }, { "epoch": 5.414355628058727, "grad_norm": 0.06399615854024887, "learning_rate": 4.569124693506359e-05, "loss": 0.0377, "num_input_tokens_seen": 71629952, "step": 33190 }, { "epoch": 5.415171288743883, "grad_norm": 0.6209318041801453, "learning_rate": 4.568924926027905e-05, "loss": 0.1036, "num_input_tokens_seen": 71639296, "step": 33195 }, { "epoch": 5.415986949429038, "grad_norm": 0.8059556484222412, "learning_rate": 4.568725116620064e-05, "loss": 0.1204, "num_input_tokens_seen": 71649760, "step": 33200 }, { "epoch": 5.416802610114193, "grad_norm": 0.7565279603004456, "learning_rate": 4.568525265286883e-05, "loss": 0.0718, "num_input_tokens_seen": 71660800, "step": 33205 }, { "epoch": 5.417618270799347, "grad_norm": 0.4420849680900574, "learning_rate": 4.568325372032414e-05, "loss": 0.0613, "num_input_tokens_seen": 71671104, "step": 33210 }, { "epoch": 5.418433931484502, "grad_norm": 1.5806907415390015, "learning_rate": 4.568125436860706e-05, "loss": 0.1658, "num_input_tokens_seen": 71681568, "step": 33215 }, { "epoch": 5.419249592169657, "grad_norm": 0.41065219044685364, "learning_rate": 4.567925459775814e-05, "loss": 0.2186, "num_input_tokens_seen": 71691008, "step": 33220 }, { "epoch": 5.420065252854813, "grad_norm": 0.4699893593788147, "learning_rate": 4.5677254407817895e-05, "loss": 0.0522, "num_input_tokens_seen": 71702112, "step": 33225 }, { "epoch": 5.420880913539968, "grad_norm": 0.4882189631462097, "learning_rate": 4.567525379882684e-05, "loss": 0.0846, "num_input_tokens_seen": 71713696, "step": 33230 }, { "epoch": 5.421696574225122, "grad_norm": 1.4008885622024536, "learning_rate": 4.567325277082556e-05, "loss": 0.2803, "num_input_tokens_seen": 71724800, "step": 33235 }, { "epoch": 5.422512234910277, "grad_norm": 0.6191504001617432, "learning_rate": 4.567125132385457e-05, "loss": 0.0532, "num_input_tokens_seen": 71736128, "step": 33240 }, { "epoch": 5.423327895595432, "grad_norm": 0.43442463874816895, "learning_rate": 4.5669249457954455e-05, "loss": 0.1092, "num_input_tokens_seen": 71746912, "step": 33245 }, { "epoch": 5.424143556280587, "grad_norm": 0.987112820148468, "learning_rate": 4.566724717316579e-05, "loss": 0.2147, "num_input_tokens_seen": 71757888, "step": 33250 }, { "epoch": 5.424959216965743, "grad_norm": 0.10039960592985153, "learning_rate": 4.5665244469529124e-05, "loss": 0.0719, "num_input_tokens_seen": 71769504, "step": 33255 }, { "epoch": 5.425774877650897, "grad_norm": 1.5293049812316895, "learning_rate": 4.566324134708507e-05, "loss": 0.1236, "num_input_tokens_seen": 71780352, "step": 33260 }, { "epoch": 5.426590538336052, "grad_norm": 1.3954380750656128, "learning_rate": 4.566123780587422e-05, "loss": 0.1464, "num_input_tokens_seen": 71791488, "step": 33265 }, { "epoch": 5.427406199021207, "grad_norm": 0.880430281162262, "learning_rate": 4.565923384593717e-05, "loss": 0.0933, "num_input_tokens_seen": 71801728, "step": 33270 }, { "epoch": 5.428221859706362, "grad_norm": 1.9512732028961182, "learning_rate": 4.565722946731454e-05, "loss": 0.1221, "num_input_tokens_seen": 71812096, "step": 33275 }, { "epoch": 5.4290375203915175, "grad_norm": 0.38379284739494324, "learning_rate": 4.565522467004695e-05, "loss": 0.0412, "num_input_tokens_seen": 71822752, "step": 33280 }, { "epoch": 5.429853181076672, "grad_norm": 0.1427057832479477, "learning_rate": 4.565321945417503e-05, "loss": 0.0575, "num_input_tokens_seen": 71833056, "step": 33285 }, { "epoch": 5.430668841761827, "grad_norm": 0.06031941622495651, "learning_rate": 4.565121381973942e-05, "loss": 0.1446, "num_input_tokens_seen": 71842528, "step": 33290 }, { "epoch": 5.431484502446982, "grad_norm": 0.6110055446624756, "learning_rate": 4.564920776678076e-05, "loss": 0.0751, "num_input_tokens_seen": 71853440, "step": 33295 }, { "epoch": 5.432300163132137, "grad_norm": 0.12680284678936005, "learning_rate": 4.564720129533971e-05, "loss": 0.1039, "num_input_tokens_seen": 71863040, "step": 33300 }, { "epoch": 5.433115823817292, "grad_norm": 0.04045158997178078, "learning_rate": 4.564519440545693e-05, "loss": 0.1222, "num_input_tokens_seen": 71873792, "step": 33305 }, { "epoch": 5.433931484502447, "grad_norm": 1.2051235437393188, "learning_rate": 4.56431870971731e-05, "loss": 0.1342, "num_input_tokens_seen": 71885856, "step": 33310 }, { "epoch": 5.434747145187602, "grad_norm": 0.6741086840629578, "learning_rate": 4.56411793705289e-05, "loss": 0.0543, "num_input_tokens_seen": 71896416, "step": 33315 }, { "epoch": 5.435562805872757, "grad_norm": 0.12375372648239136, "learning_rate": 4.5639171225565005e-05, "loss": 0.0782, "num_input_tokens_seen": 71906976, "step": 33320 }, { "epoch": 5.436378466557912, "grad_norm": 0.06609894335269928, "learning_rate": 4.563716266232212e-05, "loss": 0.0936, "num_input_tokens_seen": 71916864, "step": 33325 }, { "epoch": 5.437194127243067, "grad_norm": 0.2652300298213959, "learning_rate": 4.563515368084096e-05, "loss": 0.0321, "num_input_tokens_seen": 71928288, "step": 33330 }, { "epoch": 5.438009787928221, "grad_norm": 0.16275011003017426, "learning_rate": 4.5633144281162235e-05, "loss": 0.0851, "num_input_tokens_seen": 71939104, "step": 33335 }, { "epoch": 5.438825448613377, "grad_norm": 0.36928030848503113, "learning_rate": 4.563113446332666e-05, "loss": 0.0482, "num_input_tokens_seen": 71950208, "step": 33340 }, { "epoch": 5.439641109298532, "grad_norm": 1.4309102296829224, "learning_rate": 4.562912422737498e-05, "loss": 0.3242, "num_input_tokens_seen": 71961312, "step": 33345 }, { "epoch": 5.440456769983687, "grad_norm": 1.4457374811172485, "learning_rate": 4.5627113573347926e-05, "loss": 0.3078, "num_input_tokens_seen": 71973248, "step": 33350 }, { "epoch": 5.441272430668842, "grad_norm": 0.06201687082648277, "learning_rate": 4.5625102501286245e-05, "loss": 0.0324, "num_input_tokens_seen": 71983648, "step": 33355 }, { "epoch": 5.442088091353996, "grad_norm": 0.05962385609745979, "learning_rate": 4.5623091011230704e-05, "loss": 0.0686, "num_input_tokens_seen": 71995072, "step": 33360 }, { "epoch": 5.442903752039152, "grad_norm": 1.6522209644317627, "learning_rate": 4.5621079103222056e-05, "loss": 0.1777, "num_input_tokens_seen": 72006112, "step": 33365 }, { "epoch": 5.443719412724307, "grad_norm": 0.1064079999923706, "learning_rate": 4.5619066777301074e-05, "loss": 0.0383, "num_input_tokens_seen": 72017184, "step": 33370 }, { "epoch": 5.444535073409462, "grad_norm": 0.6611090898513794, "learning_rate": 4.561705403350855e-05, "loss": 0.0996, "num_input_tokens_seen": 72028352, "step": 33375 }, { "epoch": 5.445350734094617, "grad_norm": 0.08553268015384674, "learning_rate": 4.561504087188527e-05, "loss": 0.1145, "num_input_tokens_seen": 72037664, "step": 33380 }, { "epoch": 5.446166394779771, "grad_norm": 0.1293068677186966, "learning_rate": 4.5613027292472046e-05, "loss": 0.0689, "num_input_tokens_seen": 72049120, "step": 33385 }, { "epoch": 5.446982055464926, "grad_norm": 0.9343870878219604, "learning_rate": 4.5611013295309665e-05, "loss": 0.2011, "num_input_tokens_seen": 72059104, "step": 33390 }, { "epoch": 5.447797716150082, "grad_norm": 1.9206254482269287, "learning_rate": 4.5608998880438955e-05, "loss": 0.1182, "num_input_tokens_seen": 72070048, "step": 33395 }, { "epoch": 5.448613376835237, "grad_norm": 0.25383350253105164, "learning_rate": 4.560698404790074e-05, "loss": 0.2326, "num_input_tokens_seen": 72080928, "step": 33400 }, { "epoch": 5.4494290375203915, "grad_norm": 0.07289256155490875, "learning_rate": 4.560496879773585e-05, "loss": 0.0566, "num_input_tokens_seen": 72091488, "step": 33405 }, { "epoch": 5.450244698205546, "grad_norm": 0.5942339897155762, "learning_rate": 4.560295312998513e-05, "loss": 0.0527, "num_input_tokens_seen": 72102528, "step": 33410 }, { "epoch": 5.451060358890701, "grad_norm": 0.7388920783996582, "learning_rate": 4.560093704468942e-05, "loss": 0.0813, "num_input_tokens_seen": 72112992, "step": 33415 }, { "epoch": 5.451876019575856, "grad_norm": 0.5812087059020996, "learning_rate": 4.559892054188959e-05, "loss": 0.0723, "num_input_tokens_seen": 72124096, "step": 33420 }, { "epoch": 5.452691680261012, "grad_norm": 0.3278709948062897, "learning_rate": 4.559690362162651e-05, "loss": 0.0669, "num_input_tokens_seen": 72134624, "step": 33425 }, { "epoch": 5.4535073409461665, "grad_norm": 1.1139470338821411, "learning_rate": 4.559488628394104e-05, "loss": 0.2032, "num_input_tokens_seen": 72145344, "step": 33430 }, { "epoch": 5.454323001631321, "grad_norm": 0.29748496413230896, "learning_rate": 4.559286852887408e-05, "loss": 0.0723, "num_input_tokens_seen": 72157312, "step": 33435 }, { "epoch": 5.455138662316476, "grad_norm": 1.4313807487487793, "learning_rate": 4.559085035646651e-05, "loss": 0.2274, "num_input_tokens_seen": 72168032, "step": 33440 }, { "epoch": 5.455954323001631, "grad_norm": 0.5401431918144226, "learning_rate": 4.5588831766759246e-05, "loss": 0.0754, "num_input_tokens_seen": 72178944, "step": 33445 }, { "epoch": 5.456769983686787, "grad_norm": 1.159019947052002, "learning_rate": 4.5586812759793177e-05, "loss": 0.0998, "num_input_tokens_seen": 72188640, "step": 33450 }, { "epoch": 5.4575856443719415, "grad_norm": 1.2194923162460327, "learning_rate": 4.558479333560923e-05, "loss": 0.194, "num_input_tokens_seen": 72198720, "step": 33455 }, { "epoch": 5.458401305057096, "grad_norm": 0.0384163036942482, "learning_rate": 4.5582773494248346e-05, "loss": 0.1165, "num_input_tokens_seen": 72208704, "step": 33460 }, { "epoch": 5.459216965742251, "grad_norm": 0.14276427030563354, "learning_rate": 4.5580753235751435e-05, "loss": 0.0862, "num_input_tokens_seen": 72218688, "step": 33465 }, { "epoch": 5.460032626427406, "grad_norm": 1.0505141019821167, "learning_rate": 4.5578732560159455e-05, "loss": 0.1044, "num_input_tokens_seen": 72230336, "step": 33470 }, { "epoch": 5.460848287112561, "grad_norm": 0.1617009937763214, "learning_rate": 4.557671146751335e-05, "loss": 0.0268, "num_input_tokens_seen": 72241024, "step": 33475 }, { "epoch": 5.4616639477977165, "grad_norm": 0.6048555374145508, "learning_rate": 4.557468995785409e-05, "loss": 0.0983, "num_input_tokens_seen": 72252064, "step": 33480 }, { "epoch": 5.462479608482871, "grad_norm": 1.467022180557251, "learning_rate": 4.5572668031222635e-05, "loss": 0.2372, "num_input_tokens_seen": 72263616, "step": 33485 }, { "epoch": 5.463295269168026, "grad_norm": 1.1630343198776245, "learning_rate": 4.5570645687659954e-05, "loss": 0.2637, "num_input_tokens_seen": 72274784, "step": 33490 }, { "epoch": 5.464110929853181, "grad_norm": 0.08083044737577438, "learning_rate": 4.556862292720706e-05, "loss": 0.0986, "num_input_tokens_seen": 72284064, "step": 33495 }, { "epoch": 5.464926590538336, "grad_norm": 1.1634522676467896, "learning_rate": 4.5566599749904915e-05, "loss": 0.1864, "num_input_tokens_seen": 72295616, "step": 33500 }, { "epoch": 5.465742251223491, "grad_norm": 0.33008283376693726, "learning_rate": 4.5564576155794535e-05, "loss": 0.2169, "num_input_tokens_seen": 72305952, "step": 33505 }, { "epoch": 5.466557911908646, "grad_norm": 0.29943084716796875, "learning_rate": 4.5562552144916934e-05, "loss": 0.1878, "num_input_tokens_seen": 72316512, "step": 33510 }, { "epoch": 5.467373572593801, "grad_norm": 0.12815679609775543, "learning_rate": 4.556052771731314e-05, "loss": 0.1782, "num_input_tokens_seen": 72327328, "step": 33515 }, { "epoch": 5.468189233278956, "grad_norm": 0.9897528290748596, "learning_rate": 4.555850287302415e-05, "loss": 0.079, "num_input_tokens_seen": 72338560, "step": 33520 }, { "epoch": 5.469004893964111, "grad_norm": 0.7703413367271423, "learning_rate": 4.555647761209104e-05, "loss": 0.11, "num_input_tokens_seen": 72349312, "step": 33525 }, { "epoch": 5.4698205546492655, "grad_norm": 0.7388231754302979, "learning_rate": 4.555445193455482e-05, "loss": 0.0709, "num_input_tokens_seen": 72361024, "step": 33530 }, { "epoch": 5.470636215334421, "grad_norm": 0.3065885305404663, "learning_rate": 4.555242584045656e-05, "loss": 0.2449, "num_input_tokens_seen": 72372352, "step": 33535 }, { "epoch": 5.471451876019576, "grad_norm": 0.17832717299461365, "learning_rate": 4.5550399329837314e-05, "loss": 0.0709, "num_input_tokens_seen": 72383136, "step": 33540 }, { "epoch": 5.472267536704731, "grad_norm": 0.2627333998680115, "learning_rate": 4.554837240273816e-05, "loss": 0.0211, "num_input_tokens_seen": 72392224, "step": 33545 }, { "epoch": 5.473083197389886, "grad_norm": 0.13815860450267792, "learning_rate": 4.554634505920017e-05, "loss": 0.0377, "num_input_tokens_seen": 72402176, "step": 33550 }, { "epoch": 5.4738988580750405, "grad_norm": 1.3691883087158203, "learning_rate": 4.554431729926443e-05, "loss": 0.1938, "num_input_tokens_seen": 72413792, "step": 33555 }, { "epoch": 5.474714518760196, "grad_norm": 1.1233795881271362, "learning_rate": 4.5542289122972036e-05, "loss": 0.1809, "num_input_tokens_seen": 72425792, "step": 33560 }, { "epoch": 5.475530179445351, "grad_norm": 0.07370603084564209, "learning_rate": 4.5540260530364095e-05, "loss": 0.035, "num_input_tokens_seen": 72436864, "step": 33565 }, { "epoch": 5.476345840130506, "grad_norm": 0.40495917201042175, "learning_rate": 4.5538231521481725e-05, "loss": 0.0413, "num_input_tokens_seen": 72448096, "step": 33570 }, { "epoch": 5.477161500815661, "grad_norm": 0.5716274380683899, "learning_rate": 4.553620209636603e-05, "loss": 0.0526, "num_input_tokens_seen": 72459776, "step": 33575 }, { "epoch": 5.4779771615008155, "grad_norm": 2.6226706504821777, "learning_rate": 4.5534172255058146e-05, "loss": 0.2563, "num_input_tokens_seen": 72470528, "step": 33580 }, { "epoch": 5.47879282218597, "grad_norm": 1.7030500173568726, "learning_rate": 4.553214199759922e-05, "loss": 0.3473, "num_input_tokens_seen": 72482144, "step": 33585 }, { "epoch": 5.479608482871126, "grad_norm": 1.1673073768615723, "learning_rate": 4.553011132403039e-05, "loss": 0.1492, "num_input_tokens_seen": 72493504, "step": 33590 }, { "epoch": 5.480424143556281, "grad_norm": 0.04003095254302025, "learning_rate": 4.552808023439281e-05, "loss": 0.1021, "num_input_tokens_seen": 72503904, "step": 33595 }, { "epoch": 5.481239804241436, "grad_norm": 1.2914644479751587, "learning_rate": 4.5526048728727634e-05, "loss": 0.2611, "num_input_tokens_seen": 72514560, "step": 33600 }, { "epoch": 5.4820554649265905, "grad_norm": 0.3696868419647217, "learning_rate": 4.552401680707604e-05, "loss": 0.1094, "num_input_tokens_seen": 72526080, "step": 33605 }, { "epoch": 5.482871125611745, "grad_norm": 0.12310423702001572, "learning_rate": 4.552198446947922e-05, "loss": 0.1298, "num_input_tokens_seen": 72536992, "step": 33610 }, { "epoch": 5.4836867862969, "grad_norm": 0.338973730802536, "learning_rate": 4.551995171597834e-05, "loss": 0.134, "num_input_tokens_seen": 72548224, "step": 33615 }, { "epoch": 5.484502446982056, "grad_norm": 0.12934274971485138, "learning_rate": 4.5517918546614616e-05, "loss": 0.0722, "num_input_tokens_seen": 72559744, "step": 33620 }, { "epoch": 5.485318107667211, "grad_norm": 0.5060495734214783, "learning_rate": 4.551588496142923e-05, "loss": 0.0716, "num_input_tokens_seen": 72570528, "step": 33625 }, { "epoch": 5.486133768352365, "grad_norm": 0.6496005058288574, "learning_rate": 4.551385096046342e-05, "loss": 0.168, "num_input_tokens_seen": 72582112, "step": 33630 }, { "epoch": 5.48694942903752, "grad_norm": 0.1418631374835968, "learning_rate": 4.55118165437584e-05, "loss": 0.1464, "num_input_tokens_seen": 72592736, "step": 33635 }, { "epoch": 5.487765089722675, "grad_norm": 0.10209762305021286, "learning_rate": 4.550978171135539e-05, "loss": 0.0683, "num_input_tokens_seen": 72604096, "step": 33640 }, { "epoch": 5.488580750407831, "grad_norm": 0.7374672889709473, "learning_rate": 4.5507746463295645e-05, "loss": 0.0853, "num_input_tokens_seen": 72614560, "step": 33645 }, { "epoch": 5.489396411092986, "grad_norm": 1.0800141096115112, "learning_rate": 4.550571079962039e-05, "loss": 0.2089, "num_input_tokens_seen": 72625536, "step": 33650 }, { "epoch": 5.49021207177814, "grad_norm": 0.2682197391986847, "learning_rate": 4.550367472037089e-05, "loss": 0.2478, "num_input_tokens_seen": 72636384, "step": 33655 }, { "epoch": 5.491027732463295, "grad_norm": 1.5706416368484497, "learning_rate": 4.5501638225588424e-05, "loss": 0.2305, "num_input_tokens_seen": 72646272, "step": 33660 }, { "epoch": 5.49184339314845, "grad_norm": 0.05705469846725464, "learning_rate": 4.5499601315314245e-05, "loss": 0.1456, "num_input_tokens_seen": 72657184, "step": 33665 }, { "epoch": 5.492659053833605, "grad_norm": 0.5903229713439941, "learning_rate": 4.549756398958964e-05, "loss": 0.1465, "num_input_tokens_seen": 72668544, "step": 33670 }, { "epoch": 5.493474714518761, "grad_norm": 0.07605362683534622, "learning_rate": 4.5495526248455895e-05, "loss": 0.065, "num_input_tokens_seen": 72679104, "step": 33675 }, { "epoch": 5.494290375203915, "grad_norm": 0.8388298749923706, "learning_rate": 4.549348809195431e-05, "loss": 0.2109, "num_input_tokens_seen": 72689856, "step": 33680 }, { "epoch": 5.49510603588907, "grad_norm": 0.8047215342521667, "learning_rate": 4.5491449520126196e-05, "loss": 0.0926, "num_input_tokens_seen": 72701312, "step": 33685 }, { "epoch": 5.495921696574225, "grad_norm": 0.0870635062456131, "learning_rate": 4.5489410533012856e-05, "loss": 0.0191, "num_input_tokens_seen": 72711808, "step": 33690 }, { "epoch": 5.49673735725938, "grad_norm": 0.3267684578895569, "learning_rate": 4.548737113065562e-05, "loss": 0.0688, "num_input_tokens_seen": 72722528, "step": 33695 }, { "epoch": 5.497553017944535, "grad_norm": 1.5717542171478271, "learning_rate": 4.548533131309582e-05, "loss": 0.1785, "num_input_tokens_seen": 72733408, "step": 33700 }, { "epoch": 5.49836867862969, "grad_norm": 1.1008641719818115, "learning_rate": 4.54832910803748e-05, "loss": 0.1802, "num_input_tokens_seen": 72744544, "step": 33705 }, { "epoch": 5.499184339314845, "grad_norm": 0.17544399201869965, "learning_rate": 4.5481250432533896e-05, "loss": 0.0667, "num_input_tokens_seen": 72755712, "step": 33710 }, { "epoch": 5.5, "grad_norm": 2.139970302581787, "learning_rate": 4.547920936961447e-05, "loss": 0.1491, "num_input_tokens_seen": 72765696, "step": 33715 }, { "epoch": 5.500815660685155, "grad_norm": 0.5838527679443359, "learning_rate": 4.547716789165788e-05, "loss": 0.0744, "num_input_tokens_seen": 72776576, "step": 33720 }, { "epoch": 5.50163132137031, "grad_norm": 0.9620661735534668, "learning_rate": 4.5475125998705516e-05, "loss": 0.0532, "num_input_tokens_seen": 72786112, "step": 33725 }, { "epoch": 5.502446982055465, "grad_norm": 0.019793029874563217, "learning_rate": 4.547308369079874e-05, "loss": 0.0624, "num_input_tokens_seen": 72796384, "step": 33730 }, { "epoch": 5.50326264274062, "grad_norm": 0.2356468141078949, "learning_rate": 4.5471040967978954e-05, "loss": 0.0691, "num_input_tokens_seen": 72806720, "step": 33735 }, { "epoch": 5.504078303425775, "grad_norm": 1.5066107511520386, "learning_rate": 4.546899783028755e-05, "loss": 0.1724, "num_input_tokens_seen": 72817408, "step": 33740 }, { "epoch": 5.50489396411093, "grad_norm": 0.9614988565444946, "learning_rate": 4.546695427776595e-05, "loss": 0.196, "num_input_tokens_seen": 72828096, "step": 33745 }, { "epoch": 5.505709624796085, "grad_norm": 0.13432538509368896, "learning_rate": 4.5464910310455546e-05, "loss": 0.033, "num_input_tokens_seen": 72839040, "step": 33750 }, { "epoch": 5.506525285481239, "grad_norm": 0.6834064722061157, "learning_rate": 4.546286592839778e-05, "loss": 0.106, "num_input_tokens_seen": 72849728, "step": 33755 }, { "epoch": 5.507340946166395, "grad_norm": 0.09949855506420135, "learning_rate": 4.546082113163407e-05, "loss": 0.1318, "num_input_tokens_seen": 72859872, "step": 33760 }, { "epoch": 5.50815660685155, "grad_norm": 1.7989386320114136, "learning_rate": 4.5458775920205865e-05, "loss": 0.331, "num_input_tokens_seen": 72871648, "step": 33765 }, { "epoch": 5.508972267536705, "grad_norm": 0.20165397226810455, "learning_rate": 4.545673029415462e-05, "loss": 0.0471, "num_input_tokens_seen": 72883136, "step": 33770 }, { "epoch": 5.50978792822186, "grad_norm": 1.1765320301055908, "learning_rate": 4.545468425352177e-05, "loss": 0.0658, "num_input_tokens_seen": 72892896, "step": 33775 }, { "epoch": 5.510603588907014, "grad_norm": 0.03966633975505829, "learning_rate": 4.545263779834881e-05, "loss": 0.1349, "num_input_tokens_seen": 72904576, "step": 33780 }, { "epoch": 5.511419249592169, "grad_norm": 0.08829495310783386, "learning_rate": 4.5450590928677193e-05, "loss": 0.1096, "num_input_tokens_seen": 72914016, "step": 33785 }, { "epoch": 5.512234910277325, "grad_norm": 0.14971274137496948, "learning_rate": 4.54485436445484e-05, "loss": 0.0654, "num_input_tokens_seen": 72925152, "step": 33790 }, { "epoch": 5.51305057096248, "grad_norm": 0.513634443283081, "learning_rate": 4.544649594600394e-05, "loss": 0.0496, "num_input_tokens_seen": 72936320, "step": 33795 }, { "epoch": 5.513866231647635, "grad_norm": 0.06763828545808792, "learning_rate": 4.54444478330853e-05, "loss": 0.0224, "num_input_tokens_seen": 72946848, "step": 33800 }, { "epoch": 5.514681892332789, "grad_norm": 0.17759712040424347, "learning_rate": 4.5442399305833986e-05, "loss": 0.0164, "num_input_tokens_seen": 72958208, "step": 33805 }, { "epoch": 5.515497553017944, "grad_norm": 0.6712028980255127, "learning_rate": 4.544035036429152e-05, "loss": 0.0436, "num_input_tokens_seen": 72969152, "step": 33810 }, { "epoch": 5.5163132137031, "grad_norm": 0.11229176819324493, "learning_rate": 4.543830100849942e-05, "loss": 0.2677, "num_input_tokens_seen": 72981664, "step": 33815 }, { "epoch": 5.517128874388255, "grad_norm": 0.5750674605369568, "learning_rate": 4.5436251238499226e-05, "loss": 0.0498, "num_input_tokens_seen": 72992224, "step": 33820 }, { "epoch": 5.5179445350734095, "grad_norm": 0.2557067573070526, "learning_rate": 4.543420105433247e-05, "loss": 0.0755, "num_input_tokens_seen": 73003712, "step": 33825 }, { "epoch": 5.518760195758564, "grad_norm": 0.6486923694610596, "learning_rate": 4.5432150456040714e-05, "loss": 0.3154, "num_input_tokens_seen": 73015328, "step": 33830 }, { "epoch": 5.519575856443719, "grad_norm": 1.2196077108383179, "learning_rate": 4.5430099443665505e-05, "loss": 0.1861, "num_input_tokens_seen": 73026304, "step": 33835 }, { "epoch": 5.520391517128875, "grad_norm": 0.3736896812915802, "learning_rate": 4.5428048017248414e-05, "loss": 0.2922, "num_input_tokens_seen": 73038688, "step": 33840 }, { "epoch": 5.52120717781403, "grad_norm": 1.1042076349258423, "learning_rate": 4.542599617683101e-05, "loss": 0.0744, "num_input_tokens_seen": 73049824, "step": 33845 }, { "epoch": 5.5220228384991845, "grad_norm": 0.11453214287757874, "learning_rate": 4.542394392245489e-05, "loss": 0.1638, "num_input_tokens_seen": 73061760, "step": 33850 }, { "epoch": 5.522838499184339, "grad_norm": 0.08903289586305618, "learning_rate": 4.542189125416163e-05, "loss": 0.1241, "num_input_tokens_seen": 73072704, "step": 33855 }, { "epoch": 5.523654159869494, "grad_norm": 0.8202807307243347, "learning_rate": 4.541983817199284e-05, "loss": 0.0886, "num_input_tokens_seen": 73084032, "step": 33860 }, { "epoch": 5.524469820554649, "grad_norm": 0.2659582197666168, "learning_rate": 4.541778467599012e-05, "loss": 0.1294, "num_input_tokens_seen": 73094592, "step": 33865 }, { "epoch": 5.525285481239804, "grad_norm": 0.5619381666183472, "learning_rate": 4.54157307661951e-05, "loss": 0.1162, "num_input_tokens_seen": 73106304, "step": 33870 }, { "epoch": 5.5261011419249595, "grad_norm": 0.07961256802082062, "learning_rate": 4.5413676442649386e-05, "loss": 0.104, "num_input_tokens_seen": 73117088, "step": 33875 }, { "epoch": 5.526916802610114, "grad_norm": 0.2612385153770447, "learning_rate": 4.5411621705394634e-05, "loss": 0.0608, "num_input_tokens_seen": 73127776, "step": 33880 }, { "epoch": 5.527732463295269, "grad_norm": 0.686100959777832, "learning_rate": 4.540956655447247e-05, "loss": 0.0372, "num_input_tokens_seen": 73139904, "step": 33885 }, { "epoch": 5.528548123980424, "grad_norm": 0.13796786963939667, "learning_rate": 4.540751098992455e-05, "loss": 0.0223, "num_input_tokens_seen": 73150688, "step": 33890 }, { "epoch": 5.529363784665579, "grad_norm": 0.1977275162935257, "learning_rate": 4.540545501179253e-05, "loss": 0.1632, "num_input_tokens_seen": 73160576, "step": 33895 }, { "epoch": 5.5301794453507345, "grad_norm": 0.28955623507499695, "learning_rate": 4.540339862011807e-05, "loss": 0.1403, "num_input_tokens_seen": 73170272, "step": 33900 }, { "epoch": 5.530995106035889, "grad_norm": 1.13251531124115, "learning_rate": 4.540134181494287e-05, "loss": 0.2186, "num_input_tokens_seen": 73181760, "step": 33905 }, { "epoch": 5.531810766721044, "grad_norm": 0.033691223710775375, "learning_rate": 4.539928459630858e-05, "loss": 0.0675, "num_input_tokens_seen": 73192096, "step": 33910 }, { "epoch": 5.532626427406199, "grad_norm": 1.4931755065917969, "learning_rate": 4.539722696425692e-05, "loss": 0.1292, "num_input_tokens_seen": 73201408, "step": 33915 }, { "epoch": 5.533442088091354, "grad_norm": 1.4401999711990356, "learning_rate": 4.5395168918829575e-05, "loss": 0.1286, "num_input_tokens_seen": 73211936, "step": 33920 }, { "epoch": 5.5342577487765094, "grad_norm": 1.449577808380127, "learning_rate": 4.5393110460068254e-05, "loss": 0.1491, "num_input_tokens_seen": 73223392, "step": 33925 }, { "epoch": 5.535073409461664, "grad_norm": 1.5506324768066406, "learning_rate": 4.539105158801469e-05, "loss": 0.1543, "num_input_tokens_seen": 73233920, "step": 33930 }, { "epoch": 5.535889070146819, "grad_norm": 0.42313164472579956, "learning_rate": 4.5388992302710595e-05, "loss": 0.0369, "num_input_tokens_seen": 73244288, "step": 33935 }, { "epoch": 5.536704730831974, "grad_norm": 0.7798886895179749, "learning_rate": 4.53869326041977e-05, "loss": 0.2037, "num_input_tokens_seen": 73256256, "step": 33940 }, { "epoch": 5.537520391517129, "grad_norm": 0.7562277317047119, "learning_rate": 4.538487249251776e-05, "loss": 0.2461, "num_input_tokens_seen": 73267264, "step": 33945 }, { "epoch": 5.5383360522022835, "grad_norm": 0.7038952708244324, "learning_rate": 4.5382811967712515e-05, "loss": 0.2475, "num_input_tokens_seen": 73279168, "step": 33950 }, { "epoch": 5.539151712887438, "grad_norm": 1.0150779485702515, "learning_rate": 4.538075102982373e-05, "loss": 0.0853, "num_input_tokens_seen": 73288128, "step": 33955 }, { "epoch": 5.539967373572594, "grad_norm": 1.3811246156692505, "learning_rate": 4.537868967889317e-05, "loss": 0.2767, "num_input_tokens_seen": 73297472, "step": 33960 }, { "epoch": 5.540783034257749, "grad_norm": 0.241582453250885, "learning_rate": 4.5376627914962614e-05, "loss": 0.1004, "num_input_tokens_seen": 73309504, "step": 33965 }, { "epoch": 5.541598694942904, "grad_norm": 0.20726199448108673, "learning_rate": 4.5374565738073836e-05, "loss": 0.1205, "num_input_tokens_seen": 73320192, "step": 33970 }, { "epoch": 5.5424143556280585, "grad_norm": 0.07779063284397125, "learning_rate": 4.537250314826864e-05, "loss": 0.1913, "num_input_tokens_seen": 73331168, "step": 33975 }, { "epoch": 5.543230016313213, "grad_norm": 1.408074140548706, "learning_rate": 4.537044014558882e-05, "loss": 0.0969, "num_input_tokens_seen": 73342944, "step": 33980 }, { "epoch": 5.544045676998369, "grad_norm": 0.09734564274549484, "learning_rate": 4.5368376730076195e-05, "loss": 0.0516, "num_input_tokens_seen": 73353920, "step": 33985 }, { "epoch": 5.544861337683524, "grad_norm": 1.604506015777588, "learning_rate": 4.536631290177258e-05, "loss": 0.3371, "num_input_tokens_seen": 73366432, "step": 33990 }, { "epoch": 5.545676998368679, "grad_norm": 0.12930390238761902, "learning_rate": 4.536424866071979e-05, "loss": 0.1003, "num_input_tokens_seen": 73377824, "step": 33995 }, { "epoch": 5.5464926590538335, "grad_norm": 0.5119082927703857, "learning_rate": 4.5362184006959664e-05, "loss": 0.0444, "num_input_tokens_seen": 73387840, "step": 34000 }, { "epoch": 5.547308319738988, "grad_norm": 0.7116658687591553, "learning_rate": 4.536011894053406e-05, "loss": 0.0914, "num_input_tokens_seen": 73398304, "step": 34005 }, { "epoch": 5.548123980424144, "grad_norm": 2.3316168785095215, "learning_rate": 4.535805346148481e-05, "loss": 0.1952, "num_input_tokens_seen": 73410016, "step": 34010 }, { "epoch": 5.548939641109299, "grad_norm": 1.1124497652053833, "learning_rate": 4.535598756985378e-05, "loss": 0.0918, "num_input_tokens_seen": 73420960, "step": 34015 }, { "epoch": 5.549755301794454, "grad_norm": 0.9391559958457947, "learning_rate": 4.535392126568283e-05, "loss": 0.0739, "num_input_tokens_seen": 73431712, "step": 34020 }, { "epoch": 5.5505709624796085, "grad_norm": 1.5839091539382935, "learning_rate": 4.535185454901386e-05, "loss": 0.1867, "num_input_tokens_seen": 73441376, "step": 34025 }, { "epoch": 5.551386623164763, "grad_norm": 0.3526662290096283, "learning_rate": 4.5349787419888735e-05, "loss": 0.1319, "num_input_tokens_seen": 73450752, "step": 34030 }, { "epoch": 5.552202283849918, "grad_norm": 1.516333818435669, "learning_rate": 4.5347719878349346e-05, "loss": 0.1335, "num_input_tokens_seen": 73461952, "step": 34035 }, { "epoch": 5.553017944535073, "grad_norm": 0.5991340279579163, "learning_rate": 4.5345651924437604e-05, "loss": 0.134, "num_input_tokens_seen": 73472064, "step": 34040 }, { "epoch": 5.553833605220229, "grad_norm": 0.11173912137746811, "learning_rate": 4.534358355819542e-05, "loss": 0.0293, "num_input_tokens_seen": 73483200, "step": 34045 }, { "epoch": 5.554649265905383, "grad_norm": 0.682728111743927, "learning_rate": 4.53415147796647e-05, "loss": 0.037, "num_input_tokens_seen": 73494080, "step": 34050 }, { "epoch": 5.555464926590538, "grad_norm": 0.4183976948261261, "learning_rate": 4.533944558888738e-05, "loss": 0.1971, "num_input_tokens_seen": 73503616, "step": 34055 }, { "epoch": 5.556280587275693, "grad_norm": 0.6272698044776917, "learning_rate": 4.53373759859054e-05, "loss": 0.1189, "num_input_tokens_seen": 73515328, "step": 34060 }, { "epoch": 5.557096247960848, "grad_norm": 0.11697318404912949, "learning_rate": 4.533530597076069e-05, "loss": 0.1716, "num_input_tokens_seen": 73527520, "step": 34065 }, { "epoch": 5.557911908646004, "grad_norm": 0.8917902112007141, "learning_rate": 4.5333235543495205e-05, "loss": 0.1874, "num_input_tokens_seen": 73538688, "step": 34070 }, { "epoch": 5.558727569331158, "grad_norm": 0.5031224489212036, "learning_rate": 4.5331164704150915e-05, "loss": 0.1363, "num_input_tokens_seen": 73548544, "step": 34075 }, { "epoch": 5.559543230016313, "grad_norm": 0.3982155919075012, "learning_rate": 4.532909345276978e-05, "loss": 0.0606, "num_input_tokens_seen": 73559712, "step": 34080 }, { "epoch": 5.560358890701468, "grad_norm": 0.42929279804229736, "learning_rate": 4.5327021789393764e-05, "loss": 0.0311, "num_input_tokens_seen": 73570304, "step": 34085 }, { "epoch": 5.561174551386623, "grad_norm": 0.7874147295951843, "learning_rate": 4.532494971406487e-05, "loss": 0.1564, "num_input_tokens_seen": 73581056, "step": 34090 }, { "epoch": 5.561990212071779, "grad_norm": 0.3011910617351532, "learning_rate": 4.532287722682509e-05, "loss": 0.0473, "num_input_tokens_seen": 73592064, "step": 34095 }, { "epoch": 5.562805872756933, "grad_norm": 0.1617438793182373, "learning_rate": 4.532080432771642e-05, "loss": 0.0499, "num_input_tokens_seen": 73603296, "step": 34100 }, { "epoch": 5.563621533442088, "grad_norm": 1.0128285884857178, "learning_rate": 4.531873101678088e-05, "loss": 0.074, "num_input_tokens_seen": 73612704, "step": 34105 }, { "epoch": 5.564437194127243, "grad_norm": 1.4082419872283936, "learning_rate": 4.531665729406047e-05, "loss": 0.1158, "num_input_tokens_seen": 73623168, "step": 34110 }, { "epoch": 5.565252854812398, "grad_norm": 0.17527762055397034, "learning_rate": 4.531458315959723e-05, "loss": 0.2516, "num_input_tokens_seen": 73633088, "step": 34115 }, { "epoch": 5.566068515497553, "grad_norm": 1.616111159324646, "learning_rate": 4.531250861343319e-05, "loss": 0.0728, "num_input_tokens_seen": 73644512, "step": 34120 }, { "epoch": 5.566884176182708, "grad_norm": 0.045169584453105927, "learning_rate": 4.5310433655610395e-05, "loss": 0.1081, "num_input_tokens_seen": 73655008, "step": 34125 }, { "epoch": 5.567699836867863, "grad_norm": 0.8098430037498474, "learning_rate": 4.5308358286170894e-05, "loss": 0.0949, "num_input_tokens_seen": 73665216, "step": 34130 }, { "epoch": 5.568515497553018, "grad_norm": 0.3406784236431122, "learning_rate": 4.530628250515675e-05, "loss": 0.0436, "num_input_tokens_seen": 73676384, "step": 34135 }, { "epoch": 5.569331158238173, "grad_norm": 0.20461800694465637, "learning_rate": 4.5304206312610034e-05, "loss": 0.1941, "num_input_tokens_seen": 73687456, "step": 34140 }, { "epoch": 5.570146818923328, "grad_norm": 0.33791419863700867, "learning_rate": 4.5302129708572815e-05, "loss": 0.1293, "num_input_tokens_seen": 73699424, "step": 34145 }, { "epoch": 5.5709624796084825, "grad_norm": 0.29383090138435364, "learning_rate": 4.530005269308719e-05, "loss": 0.0612, "num_input_tokens_seen": 73710400, "step": 34150 }, { "epoch": 5.571778140293638, "grad_norm": 0.2905932068824768, "learning_rate": 4.529797526619524e-05, "loss": 0.2146, "num_input_tokens_seen": 73721184, "step": 34155 }, { "epoch": 5.572593800978793, "grad_norm": 0.5095188021659851, "learning_rate": 4.529589742793906e-05, "loss": 0.1135, "num_input_tokens_seen": 73732032, "step": 34160 }, { "epoch": 5.573409461663948, "grad_norm": 1.7992886304855347, "learning_rate": 4.529381917836079e-05, "loss": 0.187, "num_input_tokens_seen": 73742944, "step": 34165 }, { "epoch": 5.574225122349103, "grad_norm": 0.10789517313241959, "learning_rate": 4.5291740517502526e-05, "loss": 0.0617, "num_input_tokens_seen": 73753312, "step": 34170 }, { "epoch": 5.575040783034257, "grad_norm": 0.5928254723548889, "learning_rate": 4.528966144540639e-05, "loss": 0.095, "num_input_tokens_seen": 73764480, "step": 34175 }, { "epoch": 5.575856443719413, "grad_norm": 0.3737611174583435, "learning_rate": 4.5287581962114534e-05, "loss": 0.1284, "num_input_tokens_seen": 73774496, "step": 34180 }, { "epoch": 5.576672104404568, "grad_norm": 0.8081221580505371, "learning_rate": 4.528550206766908e-05, "loss": 0.1179, "num_input_tokens_seen": 73786688, "step": 34185 }, { "epoch": 5.577487765089723, "grad_norm": 0.8823701739311218, "learning_rate": 4.528342176211221e-05, "loss": 0.1779, "num_input_tokens_seen": 73798080, "step": 34190 }, { "epoch": 5.578303425774878, "grad_norm": 0.8953506350517273, "learning_rate": 4.528134104548606e-05, "loss": 0.0414, "num_input_tokens_seen": 73808736, "step": 34195 }, { "epoch": 5.579119086460032, "grad_norm": 0.48798561096191406, "learning_rate": 4.52792599178328e-05, "loss": 0.1304, "num_input_tokens_seen": 73819232, "step": 34200 }, { "epoch": 5.579934747145187, "grad_norm": 0.35229161381721497, "learning_rate": 4.5277178379194615e-05, "loss": 0.0541, "num_input_tokens_seen": 73829312, "step": 34205 }, { "epoch": 5.580750407830343, "grad_norm": 0.24766376614570618, "learning_rate": 4.5275096429613686e-05, "loss": 0.0338, "num_input_tokens_seen": 73840320, "step": 34210 }, { "epoch": 5.581566068515498, "grad_norm": 0.26561295986175537, "learning_rate": 4.527301406913221e-05, "loss": 0.0564, "num_input_tokens_seen": 73850464, "step": 34215 }, { "epoch": 5.582381729200653, "grad_norm": 0.09172873198986053, "learning_rate": 4.527093129779239e-05, "loss": 0.0904, "num_input_tokens_seen": 73861152, "step": 34220 }, { "epoch": 5.583197389885807, "grad_norm": 0.14933250844478607, "learning_rate": 4.526884811563642e-05, "loss": 0.0131, "num_input_tokens_seen": 73872288, "step": 34225 }, { "epoch": 5.584013050570962, "grad_norm": 0.19864895939826965, "learning_rate": 4.526676452270653e-05, "loss": 0.1196, "num_input_tokens_seen": 73882720, "step": 34230 }, { "epoch": 5.584828711256117, "grad_norm": 0.26547664403915405, "learning_rate": 4.526468051904496e-05, "loss": 0.1357, "num_input_tokens_seen": 73892448, "step": 34235 }, { "epoch": 5.585644371941273, "grad_norm": 2.1065595149993896, "learning_rate": 4.526259610469392e-05, "loss": 0.2566, "num_input_tokens_seen": 73904224, "step": 34240 }, { "epoch": 5.5864600326264275, "grad_norm": 1.2860212326049805, "learning_rate": 4.5260511279695675e-05, "loss": 0.1282, "num_input_tokens_seen": 73915424, "step": 34245 }, { "epoch": 5.587275693311582, "grad_norm": 0.25461581349372864, "learning_rate": 4.525842604409245e-05, "loss": 0.1586, "num_input_tokens_seen": 73925344, "step": 34250 }, { "epoch": 5.588091353996737, "grad_norm": 0.23800355195999146, "learning_rate": 4.5256340397926536e-05, "loss": 0.1463, "num_input_tokens_seen": 73935904, "step": 34255 }, { "epoch": 5.588907014681892, "grad_norm": 0.0955689325928688, "learning_rate": 4.5254254341240185e-05, "loss": 0.0154, "num_input_tokens_seen": 73947296, "step": 34260 }, { "epoch": 5.589722675367048, "grad_norm": 0.4771307110786438, "learning_rate": 4.525216787407567e-05, "loss": 0.0362, "num_input_tokens_seen": 73955680, "step": 34265 }, { "epoch": 5.5905383360522025, "grad_norm": 0.9549001455307007, "learning_rate": 4.525008099647528e-05, "loss": 0.0636, "num_input_tokens_seen": 73966304, "step": 34270 }, { "epoch": 5.591353996737357, "grad_norm": 0.08330804854631424, "learning_rate": 4.524799370848132e-05, "loss": 0.0293, "num_input_tokens_seen": 73977120, "step": 34275 }, { "epoch": 5.592169657422512, "grad_norm": 0.10917381197214127, "learning_rate": 4.5245906010136074e-05, "loss": 0.059, "num_input_tokens_seen": 73987648, "step": 34280 }, { "epoch": 5.592985318107667, "grad_norm": 0.8977259397506714, "learning_rate": 4.524381790148187e-05, "loss": 0.1329, "num_input_tokens_seen": 73997312, "step": 34285 }, { "epoch": 5.593800978792823, "grad_norm": 0.2563439905643463, "learning_rate": 4.5241729382561e-05, "loss": 0.2424, "num_input_tokens_seen": 74009056, "step": 34290 }, { "epoch": 5.5946166394779775, "grad_norm": 0.7046748399734497, "learning_rate": 4.523964045341581e-05, "loss": 0.0472, "num_input_tokens_seen": 74019488, "step": 34295 }, { "epoch": 5.595432300163132, "grad_norm": 0.38031166791915894, "learning_rate": 4.523755111408863e-05, "loss": 0.0707, "num_input_tokens_seen": 74030880, "step": 34300 }, { "epoch": 5.596247960848287, "grad_norm": 0.516309380531311, "learning_rate": 4.523546136462181e-05, "loss": 0.1638, "num_input_tokens_seen": 74042080, "step": 34305 }, { "epoch": 5.597063621533442, "grad_norm": 0.30993571877479553, "learning_rate": 4.5233371205057685e-05, "loss": 0.0655, "num_input_tokens_seen": 74052448, "step": 34310 }, { "epoch": 5.597879282218597, "grad_norm": 1.511620044708252, "learning_rate": 4.5231280635438635e-05, "loss": 0.155, "num_input_tokens_seen": 74063392, "step": 34315 }, { "epoch": 5.598694942903752, "grad_norm": 1.7393540143966675, "learning_rate": 4.5229189655807006e-05, "loss": 0.1121, "num_input_tokens_seen": 74074720, "step": 34320 }, { "epoch": 5.599510603588907, "grad_norm": 1.4290456771850586, "learning_rate": 4.522709826620519e-05, "loss": 0.1488, "num_input_tokens_seen": 74084512, "step": 34325 }, { "epoch": 5.600326264274062, "grad_norm": 0.13349053263664246, "learning_rate": 4.522500646667557e-05, "loss": 0.1078, "num_input_tokens_seen": 74095520, "step": 34330 }, { "epoch": 5.601141924959217, "grad_norm": 0.400613933801651, "learning_rate": 4.522291425726054e-05, "loss": 0.1963, "num_input_tokens_seen": 74105792, "step": 34335 }, { "epoch": 5.601957585644372, "grad_norm": 0.05379409343004227, "learning_rate": 4.52208216380025e-05, "loss": 0.0692, "num_input_tokens_seen": 74116224, "step": 34340 }, { "epoch": 5.602773246329527, "grad_norm": 1.184421420097351, "learning_rate": 4.521872860894385e-05, "loss": 0.0864, "num_input_tokens_seen": 74127232, "step": 34345 }, { "epoch": 5.603588907014682, "grad_norm": 1.5650817155838013, "learning_rate": 4.521663517012702e-05, "loss": 0.1197, "num_input_tokens_seen": 74137088, "step": 34350 }, { "epoch": 5.604404567699837, "grad_norm": 0.04701405391097069, "learning_rate": 4.5214541321594427e-05, "loss": 0.0812, "num_input_tokens_seen": 74148128, "step": 34355 }, { "epoch": 5.605220228384992, "grad_norm": 0.20166559517383575, "learning_rate": 4.5212447063388506e-05, "loss": 0.0435, "num_input_tokens_seen": 74158432, "step": 34360 }, { "epoch": 5.606035889070147, "grad_norm": 1.6425755023956299, "learning_rate": 4.5210352395551713e-05, "loss": 0.1807, "num_input_tokens_seen": 74169376, "step": 34365 }, { "epoch": 5.6068515497553015, "grad_norm": 0.972305953502655, "learning_rate": 4.520825731812649e-05, "loss": 0.1028, "num_input_tokens_seen": 74180352, "step": 34370 }, { "epoch": 5.607667210440457, "grad_norm": 0.7358310222625732, "learning_rate": 4.5206161831155295e-05, "loss": 0.0661, "num_input_tokens_seen": 74190304, "step": 34375 }, { "epoch": 5.608482871125612, "grad_norm": 1.6266969442367554, "learning_rate": 4.52040659346806e-05, "loss": 0.1523, "num_input_tokens_seen": 74201504, "step": 34380 }, { "epoch": 5.609298531810767, "grad_norm": 1.4343876838684082, "learning_rate": 4.5201969628744866e-05, "loss": 0.3003, "num_input_tokens_seen": 74211968, "step": 34385 }, { "epoch": 5.610114192495922, "grad_norm": 0.021501891314983368, "learning_rate": 4.5199872913390603e-05, "loss": 0.1334, "num_input_tokens_seen": 74223232, "step": 34390 }, { "epoch": 5.6109298531810765, "grad_norm": 1.3998143672943115, "learning_rate": 4.519777578866028e-05, "loss": 0.124, "num_input_tokens_seen": 74234272, "step": 34395 }, { "epoch": 5.611745513866231, "grad_norm": 0.02137465961277485, "learning_rate": 4.519567825459642e-05, "loss": 0.0596, "num_input_tokens_seen": 74244608, "step": 34400 }, { "epoch": 5.612561174551386, "grad_norm": 0.9896433353424072, "learning_rate": 4.519358031124151e-05, "loss": 0.1156, "num_input_tokens_seen": 74255104, "step": 34405 }, { "epoch": 5.613376835236542, "grad_norm": 0.14417721331119537, "learning_rate": 4.519148195863808e-05, "loss": 0.0494, "num_input_tokens_seen": 74265792, "step": 34410 }, { "epoch": 5.614192495921697, "grad_norm": 0.29447922110557556, "learning_rate": 4.5189383196828656e-05, "loss": 0.2098, "num_input_tokens_seen": 74277120, "step": 34415 }, { "epoch": 5.6150081566068515, "grad_norm": 0.2333306223154068, "learning_rate": 4.518728402585577e-05, "loss": 0.0587, "num_input_tokens_seen": 74286848, "step": 34420 }, { "epoch": 5.615823817292006, "grad_norm": 0.8729727268218994, "learning_rate": 4.5185184445761956e-05, "loss": 0.0524, "num_input_tokens_seen": 74296992, "step": 34425 }, { "epoch": 5.616639477977161, "grad_norm": 0.2765652537345886, "learning_rate": 4.5183084456589784e-05, "loss": 0.0917, "num_input_tokens_seen": 74307680, "step": 34430 }, { "epoch": 5.617455138662317, "grad_norm": 2.1897518634796143, "learning_rate": 4.518098405838179e-05, "loss": 0.2788, "num_input_tokens_seen": 74318368, "step": 34435 }, { "epoch": 5.618270799347472, "grad_norm": 0.03208191320300102, "learning_rate": 4.517888325118056e-05, "loss": 0.065, "num_input_tokens_seen": 74328544, "step": 34440 }, { "epoch": 5.6190864600326265, "grad_norm": 0.6657669544219971, "learning_rate": 4.517678203502866e-05, "loss": 0.0792, "num_input_tokens_seen": 74339904, "step": 34445 }, { "epoch": 5.619902120717781, "grad_norm": 0.5646312832832336, "learning_rate": 4.517468040996868e-05, "loss": 0.0462, "num_input_tokens_seen": 74350784, "step": 34450 }, { "epoch": 5.620717781402936, "grad_norm": 0.08849679678678513, "learning_rate": 4.5172578376043206e-05, "loss": 0.1226, "num_input_tokens_seen": 74361408, "step": 34455 }, { "epoch": 5.621533442088092, "grad_norm": 0.78436678647995, "learning_rate": 4.517047593329483e-05, "loss": 0.2641, "num_input_tokens_seen": 74371904, "step": 34460 }, { "epoch": 5.622349102773247, "grad_norm": 0.10411543399095535, "learning_rate": 4.516837308176618e-05, "loss": 0.349, "num_input_tokens_seen": 74383232, "step": 34465 }, { "epoch": 5.623164763458401, "grad_norm": 0.7519554495811462, "learning_rate": 4.516626982149986e-05, "loss": 0.0973, "num_input_tokens_seen": 74394240, "step": 34470 }, { "epoch": 5.623980424143556, "grad_norm": 0.053145479410886765, "learning_rate": 4.5164166152538515e-05, "loss": 0.0187, "num_input_tokens_seen": 74404160, "step": 34475 }, { "epoch": 5.624796084828711, "grad_norm": 1.148764729499817, "learning_rate": 4.516206207492475e-05, "loss": 0.2953, "num_input_tokens_seen": 74415904, "step": 34480 }, { "epoch": 5.625611745513866, "grad_norm": 0.4541132152080536, "learning_rate": 4.515995758870122e-05, "loss": 0.0741, "num_input_tokens_seen": 74426336, "step": 34485 }, { "epoch": 5.626427406199021, "grad_norm": 2.5255911350250244, "learning_rate": 4.515785269391057e-05, "loss": 0.1436, "num_input_tokens_seen": 74434592, "step": 34490 }, { "epoch": 5.627243066884176, "grad_norm": 0.13813892006874084, "learning_rate": 4.515574739059547e-05, "loss": 0.1016, "num_input_tokens_seen": 74446464, "step": 34495 }, { "epoch": 5.628058727569331, "grad_norm": 0.07466185837984085, "learning_rate": 4.515364167879858e-05, "loss": 0.1094, "num_input_tokens_seen": 74457056, "step": 34500 }, { "epoch": 5.628874388254486, "grad_norm": 1.539862036705017, "learning_rate": 4.5151535558562574e-05, "loss": 0.0407, "num_input_tokens_seen": 74467584, "step": 34505 }, { "epoch": 5.629690048939641, "grad_norm": 0.24018406867980957, "learning_rate": 4.514942902993014e-05, "loss": 0.0684, "num_input_tokens_seen": 74477376, "step": 34510 }, { "epoch": 5.630505709624796, "grad_norm": 0.6342037916183472, "learning_rate": 4.514732209294396e-05, "loss": 0.2498, "num_input_tokens_seen": 74487648, "step": 34515 }, { "epoch": 5.631321370309951, "grad_norm": 1.2535279989242554, "learning_rate": 4.514521474764674e-05, "loss": 0.1011, "num_input_tokens_seen": 74498496, "step": 34520 }, { "epoch": 5.632137030995106, "grad_norm": 1.5586556196212769, "learning_rate": 4.514310699408118e-05, "loss": 0.1824, "num_input_tokens_seen": 74509664, "step": 34525 }, { "epoch": 5.632952691680261, "grad_norm": 0.1788191944360733, "learning_rate": 4.514099883229001e-05, "loss": 0.0866, "num_input_tokens_seen": 74519808, "step": 34530 }, { "epoch": 5.633768352365416, "grad_norm": 1.1768914461135864, "learning_rate": 4.513889026231595e-05, "loss": 0.0897, "num_input_tokens_seen": 74531840, "step": 34535 }, { "epoch": 5.634584013050571, "grad_norm": 0.387398898601532, "learning_rate": 4.513678128420173e-05, "loss": 0.1996, "num_input_tokens_seen": 74543168, "step": 34540 }, { "epoch": 5.635399673735726, "grad_norm": 0.7085121870040894, "learning_rate": 4.513467189799009e-05, "loss": 0.0474, "num_input_tokens_seen": 74553792, "step": 34545 }, { "epoch": 5.636215334420881, "grad_norm": 0.14115290343761444, "learning_rate": 4.513256210372378e-05, "loss": 0.0935, "num_input_tokens_seen": 74565312, "step": 34550 }, { "epoch": 5.637030995106036, "grad_norm": 0.069536492228508, "learning_rate": 4.513045190144556e-05, "loss": 0.3471, "num_input_tokens_seen": 74576256, "step": 34555 }, { "epoch": 5.637846655791191, "grad_norm": 0.136280819773674, "learning_rate": 4.5128341291198196e-05, "loss": 0.157, "num_input_tokens_seen": 74586560, "step": 34560 }, { "epoch": 5.638662316476346, "grad_norm": 1.6259853839874268, "learning_rate": 4.512623027302446e-05, "loss": 0.0572, "num_input_tokens_seen": 74597792, "step": 34565 }, { "epoch": 5.6394779771615005, "grad_norm": 1.7045459747314453, "learning_rate": 4.512411884696714e-05, "loss": 0.2947, "num_input_tokens_seen": 74609504, "step": 34570 }, { "epoch": 5.640293637846656, "grad_norm": 0.07375035434961319, "learning_rate": 4.512200701306902e-05, "loss": 0.1528, "num_input_tokens_seen": 74620160, "step": 34575 }, { "epoch": 5.641109298531811, "grad_norm": 2.5194942951202393, "learning_rate": 4.511989477137289e-05, "loss": 0.1618, "num_input_tokens_seen": 74631072, "step": 34580 }, { "epoch": 5.641924959216966, "grad_norm": 0.048124704509973526, "learning_rate": 4.5117782121921585e-05, "loss": 0.0342, "num_input_tokens_seen": 74641088, "step": 34585 }, { "epoch": 5.642740619902121, "grad_norm": 0.8185315728187561, "learning_rate": 4.5115669064757906e-05, "loss": 0.0932, "num_input_tokens_seen": 74650560, "step": 34590 }, { "epoch": 5.643556280587275, "grad_norm": 0.12292462587356567, "learning_rate": 4.511355559992466e-05, "loss": 0.0896, "num_input_tokens_seen": 74660608, "step": 34595 }, { "epoch": 5.64437194127243, "grad_norm": 0.3372400999069214, "learning_rate": 4.51114417274647e-05, "loss": 0.0545, "num_input_tokens_seen": 74672064, "step": 34600 }, { "epoch": 5.645187601957586, "grad_norm": 0.6985923647880554, "learning_rate": 4.510932744742087e-05, "loss": 0.0606, "num_input_tokens_seen": 74683200, "step": 34605 }, { "epoch": 5.646003262642741, "grad_norm": 0.22016723453998566, "learning_rate": 4.5107212759835995e-05, "loss": 0.1506, "num_input_tokens_seen": 74694016, "step": 34610 }, { "epoch": 5.646818923327896, "grad_norm": 0.9745835065841675, "learning_rate": 4.510509766475295e-05, "loss": 0.3901, "num_input_tokens_seen": 74704096, "step": 34615 }, { "epoch": 5.64763458401305, "grad_norm": 0.20770224928855896, "learning_rate": 4.510298216221459e-05, "loss": 0.1634, "num_input_tokens_seen": 74714528, "step": 34620 }, { "epoch": 5.648450244698205, "grad_norm": 0.3516792356967926, "learning_rate": 4.510086625226381e-05, "loss": 0.2118, "num_input_tokens_seen": 74725888, "step": 34625 }, { "epoch": 5.649265905383361, "grad_norm": 0.8189969658851624, "learning_rate": 4.5098749934943464e-05, "loss": 0.0818, "num_input_tokens_seen": 74736384, "step": 34630 }, { "epoch": 5.650081566068516, "grad_norm": 1.233279824256897, "learning_rate": 4.509663321029645e-05, "loss": 0.2352, "num_input_tokens_seen": 74748064, "step": 34635 }, { "epoch": 5.650897226753671, "grad_norm": 0.04857684671878815, "learning_rate": 4.5094516078365676e-05, "loss": 0.0665, "num_input_tokens_seen": 74758720, "step": 34640 }, { "epoch": 5.651712887438825, "grad_norm": 0.3853446841239929, "learning_rate": 4.509239853919404e-05, "loss": 0.0354, "num_input_tokens_seen": 74770208, "step": 34645 }, { "epoch": 5.65252854812398, "grad_norm": 0.12143392115831375, "learning_rate": 4.5090280592824465e-05, "loss": 0.0214, "num_input_tokens_seen": 74781568, "step": 34650 }, { "epoch": 5.653344208809135, "grad_norm": 0.030440691858530045, "learning_rate": 4.508816223929986e-05, "loss": 0.0927, "num_input_tokens_seen": 74792608, "step": 34655 }, { "epoch": 5.654159869494291, "grad_norm": 0.5168301463127136, "learning_rate": 4.508604347866316e-05, "loss": 0.0413, "num_input_tokens_seen": 74801760, "step": 34660 }, { "epoch": 5.6549755301794455, "grad_norm": 0.36041367053985596, "learning_rate": 4.508392431095732e-05, "loss": 0.0774, "num_input_tokens_seen": 74812256, "step": 34665 }, { "epoch": 5.6557911908646, "grad_norm": 0.1501273661851883, "learning_rate": 4.5081804736225266e-05, "loss": 0.0661, "num_input_tokens_seen": 74822848, "step": 34670 }, { "epoch": 5.656606851549755, "grad_norm": 0.13392284512519836, "learning_rate": 4.5079684754509964e-05, "loss": 0.1077, "num_input_tokens_seen": 74832992, "step": 34675 }, { "epoch": 5.65742251223491, "grad_norm": 0.07705061137676239, "learning_rate": 4.5077564365854386e-05, "loss": 0.0887, "num_input_tokens_seen": 74843136, "step": 34680 }, { "epoch": 5.658238172920065, "grad_norm": 0.04229428246617317, "learning_rate": 4.5075443570301486e-05, "loss": 0.0506, "num_input_tokens_seen": 74852800, "step": 34685 }, { "epoch": 5.6590538336052205, "grad_norm": 0.744158148765564, "learning_rate": 4.5073322367894254e-05, "loss": 0.1105, "num_input_tokens_seen": 74864512, "step": 34690 }, { "epoch": 5.659869494290375, "grad_norm": 0.4956861436367035, "learning_rate": 4.507120075867569e-05, "loss": 0.0287, "num_input_tokens_seen": 74875936, "step": 34695 }, { "epoch": 5.66068515497553, "grad_norm": 0.09212841093540192, "learning_rate": 4.506907874268877e-05, "loss": 0.155, "num_input_tokens_seen": 74885504, "step": 34700 }, { "epoch": 5.661500815660685, "grad_norm": 2.0629560947418213, "learning_rate": 4.5066956319976515e-05, "loss": 0.0982, "num_input_tokens_seen": 74897632, "step": 34705 }, { "epoch": 5.66231647634584, "grad_norm": 1.0868573188781738, "learning_rate": 4.506483349058194e-05, "loss": 0.0595, "num_input_tokens_seen": 74908672, "step": 34710 }, { "epoch": 5.6631321370309955, "grad_norm": 1.3521944284439087, "learning_rate": 4.506271025454804e-05, "loss": 0.1341, "num_input_tokens_seen": 74919104, "step": 34715 }, { "epoch": 5.66394779771615, "grad_norm": 0.19516031444072723, "learning_rate": 4.506058661191789e-05, "loss": 0.0538, "num_input_tokens_seen": 74929952, "step": 34720 }, { "epoch": 5.664763458401305, "grad_norm": 1.3225992918014526, "learning_rate": 4.505846256273449e-05, "loss": 0.0991, "num_input_tokens_seen": 74941760, "step": 34725 }, { "epoch": 5.66557911908646, "grad_norm": 0.601726233959198, "learning_rate": 4.50563381070409e-05, "loss": 0.0943, "num_input_tokens_seen": 74952000, "step": 34730 }, { "epoch": 5.666394779771615, "grad_norm": 0.3634214997291565, "learning_rate": 4.5054213244880175e-05, "loss": 0.028, "num_input_tokens_seen": 74963008, "step": 34735 }, { "epoch": 5.6672104404567705, "grad_norm": 0.582300066947937, "learning_rate": 4.505208797629538e-05, "loss": 0.0416, "num_input_tokens_seen": 74973408, "step": 34740 }, { "epoch": 5.668026101141925, "grad_norm": 1.2617093324661255, "learning_rate": 4.5049962301329586e-05, "loss": 0.3717, "num_input_tokens_seen": 74983744, "step": 34745 }, { "epoch": 5.66884176182708, "grad_norm": 0.1551613062620163, "learning_rate": 4.5047836220025866e-05, "loss": 0.1144, "num_input_tokens_seen": 74993952, "step": 34750 }, { "epoch": 5.669657422512235, "grad_norm": 0.4229947626590729, "learning_rate": 4.504570973242732e-05, "loss": 0.0727, "num_input_tokens_seen": 75004128, "step": 34755 }, { "epoch": 5.67047308319739, "grad_norm": 1.9002364873886108, "learning_rate": 4.504358283857703e-05, "loss": 0.1168, "num_input_tokens_seen": 75014496, "step": 34760 }, { "epoch": 5.671288743882545, "grad_norm": 0.645645797252655, "learning_rate": 4.5041455538518106e-05, "loss": 0.3904, "num_input_tokens_seen": 75024704, "step": 34765 }, { "epoch": 5.672104404567699, "grad_norm": 0.0816347673535347, "learning_rate": 4.503932783229367e-05, "loss": 0.1152, "num_input_tokens_seen": 75035488, "step": 34770 }, { "epoch": 5.672920065252855, "grad_norm": 0.04423374682664871, "learning_rate": 4.503719971994682e-05, "loss": 0.0893, "num_input_tokens_seen": 75048192, "step": 34775 }, { "epoch": 5.67373572593801, "grad_norm": 0.052043844014406204, "learning_rate": 4.503507120152071e-05, "loss": 0.1194, "num_input_tokens_seen": 75059680, "step": 34780 }, { "epoch": 5.674551386623165, "grad_norm": 1.4314239025115967, "learning_rate": 4.503294227705847e-05, "loss": 0.1451, "num_input_tokens_seen": 75069696, "step": 34785 }, { "epoch": 5.6753670473083195, "grad_norm": 1.4236637353897095, "learning_rate": 4.503081294660323e-05, "loss": 0.2762, "num_input_tokens_seen": 75080512, "step": 34790 }, { "epoch": 5.676182707993474, "grad_norm": 1.5152058601379395, "learning_rate": 4.502868321019815e-05, "loss": 0.1558, "num_input_tokens_seen": 75092000, "step": 34795 }, { "epoch": 5.67699836867863, "grad_norm": 0.0713457316160202, "learning_rate": 4.502655306788641e-05, "loss": 0.1738, "num_input_tokens_seen": 75102208, "step": 34800 }, { "epoch": 5.677814029363785, "grad_norm": 0.11929883807897568, "learning_rate": 4.502442251971116e-05, "loss": 0.1051, "num_input_tokens_seen": 75112960, "step": 34805 }, { "epoch": 5.67862969004894, "grad_norm": 0.3179865777492523, "learning_rate": 4.502229156571559e-05, "loss": 0.0653, "num_input_tokens_seen": 75125824, "step": 34810 }, { "epoch": 5.6794453507340945, "grad_norm": 0.1232743114233017, "learning_rate": 4.502016020594287e-05, "loss": 0.1454, "num_input_tokens_seen": 75136896, "step": 34815 }, { "epoch": 5.680261011419249, "grad_norm": 0.9902503490447998, "learning_rate": 4.501802844043622e-05, "loss": 0.0544, "num_input_tokens_seen": 75146528, "step": 34820 }, { "epoch": 5.681076672104405, "grad_norm": 0.41711413860321045, "learning_rate": 4.501589626923882e-05, "loss": 0.0896, "num_input_tokens_seen": 75157184, "step": 34825 }, { "epoch": 5.68189233278956, "grad_norm": 1.517073631286621, "learning_rate": 4.5013763692393894e-05, "loss": 0.0867, "num_input_tokens_seen": 75166016, "step": 34830 }, { "epoch": 5.682707993474715, "grad_norm": 0.10761014372110367, "learning_rate": 4.501163070994465e-05, "loss": 0.2569, "num_input_tokens_seen": 75177824, "step": 34835 }, { "epoch": 5.6835236541598695, "grad_norm": 0.49219945073127747, "learning_rate": 4.5009497321934335e-05, "loss": 0.1681, "num_input_tokens_seen": 75188128, "step": 34840 }, { "epoch": 5.684339314845024, "grad_norm": 0.7139567136764526, "learning_rate": 4.500736352840617e-05, "loss": 0.142, "num_input_tokens_seen": 75198016, "step": 34845 }, { "epoch": 5.685154975530179, "grad_norm": 0.056820016354322433, "learning_rate": 4.5005229329403395e-05, "loss": 0.0368, "num_input_tokens_seen": 75208768, "step": 34850 }, { "epoch": 5.685970636215334, "grad_norm": 0.24154870212078094, "learning_rate": 4.500309472496927e-05, "loss": 0.0664, "num_input_tokens_seen": 75220064, "step": 34855 }, { "epoch": 5.68678629690049, "grad_norm": 0.7831224203109741, "learning_rate": 4.5000959715147064e-05, "loss": 0.0839, "num_input_tokens_seen": 75231552, "step": 34860 }, { "epoch": 5.6876019575856445, "grad_norm": 0.5007439255714417, "learning_rate": 4.4998824299980024e-05, "loss": 0.0805, "num_input_tokens_seen": 75242080, "step": 34865 }, { "epoch": 5.688417618270799, "grad_norm": 0.021443895995616913, "learning_rate": 4.499668847951145e-05, "loss": 0.0493, "num_input_tokens_seen": 75253312, "step": 34870 }, { "epoch": 5.689233278955954, "grad_norm": 0.3110465705394745, "learning_rate": 4.4994552253784617e-05, "loss": 0.0613, "num_input_tokens_seen": 75265376, "step": 34875 }, { "epoch": 5.690048939641109, "grad_norm": 0.38874149322509766, "learning_rate": 4.499241562284281e-05, "loss": 0.1932, "num_input_tokens_seen": 75275872, "step": 34880 }, { "epoch": 5.690864600326265, "grad_norm": 0.16645875573158264, "learning_rate": 4.499027858672934e-05, "loss": 0.0414, "num_input_tokens_seen": 75288096, "step": 34885 }, { "epoch": 5.691680261011419, "grad_norm": 1.370161771774292, "learning_rate": 4.498814114548752e-05, "loss": 0.0697, "num_input_tokens_seen": 75298112, "step": 34890 }, { "epoch": 5.692495921696574, "grad_norm": 0.27817630767822266, "learning_rate": 4.4986003299160654e-05, "loss": 0.1738, "num_input_tokens_seen": 75308992, "step": 34895 }, { "epoch": 5.693311582381729, "grad_norm": 0.22351638972759247, "learning_rate": 4.4983865047792085e-05, "loss": 0.1768, "num_input_tokens_seen": 75320352, "step": 34900 }, { "epoch": 5.694127243066884, "grad_norm": 0.15430666506290436, "learning_rate": 4.4981726391425134e-05, "loss": 0.0517, "num_input_tokens_seen": 75333056, "step": 34905 }, { "epoch": 5.69494290375204, "grad_norm": 0.13723480701446533, "learning_rate": 4.497958733010315e-05, "loss": 0.1151, "num_input_tokens_seen": 75345248, "step": 34910 }, { "epoch": 5.695758564437194, "grad_norm": 1.8721712827682495, "learning_rate": 4.4977447863869485e-05, "loss": 0.2346, "num_input_tokens_seen": 75356576, "step": 34915 }, { "epoch": 5.696574225122349, "grad_norm": 0.14680080115795135, "learning_rate": 4.4975307992767495e-05, "loss": 0.0651, "num_input_tokens_seen": 75368000, "step": 34920 }, { "epoch": 5.697389885807504, "grad_norm": 1.1642742156982422, "learning_rate": 4.497316771684055e-05, "loss": 0.071, "num_input_tokens_seen": 75378208, "step": 34925 }, { "epoch": 5.698205546492659, "grad_norm": 1.5722367763519287, "learning_rate": 4.497102703613202e-05, "loss": 0.2018, "num_input_tokens_seen": 75390112, "step": 34930 }, { "epoch": 5.699021207177814, "grad_norm": 0.06954708695411682, "learning_rate": 4.496888595068529e-05, "loss": 0.038, "num_input_tokens_seen": 75401824, "step": 34935 }, { "epoch": 5.699836867862969, "grad_norm": 1.1450128555297852, "learning_rate": 4.4966744460543765e-05, "loss": 0.1952, "num_input_tokens_seen": 75412384, "step": 34940 }, { "epoch": 5.700652528548124, "grad_norm": 0.6756334900856018, "learning_rate": 4.496460256575083e-05, "loss": 0.0506, "num_input_tokens_seen": 75423648, "step": 34945 }, { "epoch": 5.701468189233279, "grad_norm": 1.3524930477142334, "learning_rate": 4.496246026634988e-05, "loss": 0.2462, "num_input_tokens_seen": 75435232, "step": 34950 }, { "epoch": 5.702283849918434, "grad_norm": 1.091137409210205, "learning_rate": 4.496031756238437e-05, "loss": 0.0533, "num_input_tokens_seen": 75445952, "step": 34955 }, { "epoch": 5.703099510603589, "grad_norm": 0.0939243882894516, "learning_rate": 4.49581744538977e-05, "loss": 0.0284, "num_input_tokens_seen": 75457696, "step": 34960 }, { "epoch": 5.7039151712887435, "grad_norm": 0.7510550022125244, "learning_rate": 4.495603094093329e-05, "loss": 0.1033, "num_input_tokens_seen": 75467424, "step": 34965 }, { "epoch": 5.704730831973899, "grad_norm": 0.08499466627836227, "learning_rate": 4.4953887023534615e-05, "loss": 0.1227, "num_input_tokens_seen": 75476928, "step": 34970 }, { "epoch": 5.705546492659054, "grad_norm": 1.7048890590667725, "learning_rate": 4.49517427017451e-05, "loss": 0.0823, "num_input_tokens_seen": 75486624, "step": 34975 }, { "epoch": 5.706362153344209, "grad_norm": 0.17836925387382507, "learning_rate": 4.4949597975608204e-05, "loss": 0.1544, "num_input_tokens_seen": 75497888, "step": 34980 }, { "epoch": 5.707177814029364, "grad_norm": 0.14827120304107666, "learning_rate": 4.4947452845167406e-05, "loss": 0.0741, "num_input_tokens_seen": 75508352, "step": 34985 }, { "epoch": 5.7079934747145185, "grad_norm": 0.162202388048172, "learning_rate": 4.494530731046617e-05, "loss": 0.07, "num_input_tokens_seen": 75520416, "step": 34990 }, { "epoch": 5.708809135399674, "grad_norm": 0.2984362244606018, "learning_rate": 4.4943161371547974e-05, "loss": 0.1868, "num_input_tokens_seen": 75531168, "step": 34995 }, { "epoch": 5.709624796084829, "grad_norm": 0.02682955376803875, "learning_rate": 4.4941015028456306e-05, "loss": 0.0761, "num_input_tokens_seen": 75541632, "step": 35000 }, { "epoch": 5.710440456769984, "grad_norm": 0.31321823596954346, "learning_rate": 4.493886828123468e-05, "loss": 0.0698, "num_input_tokens_seen": 75552224, "step": 35005 }, { "epoch": 5.711256117455139, "grad_norm": 0.06118397042155266, "learning_rate": 4.49367211299266e-05, "loss": 0.0128, "num_input_tokens_seen": 75562080, "step": 35010 }, { "epoch": 5.712071778140293, "grad_norm": 0.3912368416786194, "learning_rate": 4.493457357457556e-05, "loss": 0.1165, "num_input_tokens_seen": 75571616, "step": 35015 }, { "epoch": 5.712887438825448, "grad_norm": 0.26252180337905884, "learning_rate": 4.493242561522511e-05, "loss": 0.0332, "num_input_tokens_seen": 75581952, "step": 35020 }, { "epoch": 5.713703099510604, "grad_norm": 0.07425878942012787, "learning_rate": 4.493027725191876e-05, "loss": 0.0494, "num_input_tokens_seen": 75592448, "step": 35025 }, { "epoch": 5.714518760195759, "grad_norm": 1.0149801969528198, "learning_rate": 4.4928128484700067e-05, "loss": 0.083, "num_input_tokens_seen": 75603872, "step": 35030 }, { "epoch": 5.715334420880914, "grad_norm": 0.3649132549762726, "learning_rate": 4.4925979313612565e-05, "loss": 0.0946, "num_input_tokens_seen": 75614752, "step": 35035 }, { "epoch": 5.716150081566068, "grad_norm": 0.13826103508472443, "learning_rate": 4.4923829738699816e-05, "loss": 0.0197, "num_input_tokens_seen": 75625600, "step": 35040 }, { "epoch": 5.716965742251223, "grad_norm": 1.2956290245056152, "learning_rate": 4.4921679760005384e-05, "loss": 0.2112, "num_input_tokens_seen": 75635648, "step": 35045 }, { "epoch": 5.717781402936378, "grad_norm": 0.9595244526863098, "learning_rate": 4.491952937757283e-05, "loss": 0.166, "num_input_tokens_seen": 75647680, "step": 35050 }, { "epoch": 5.718597063621534, "grad_norm": 0.13404810428619385, "learning_rate": 4.4917378591445754e-05, "loss": 0.0744, "num_input_tokens_seen": 75658080, "step": 35055 }, { "epoch": 5.719412724306689, "grad_norm": 0.1713683158159256, "learning_rate": 4.491522740166773e-05, "loss": 0.152, "num_input_tokens_seen": 75669920, "step": 35060 }, { "epoch": 5.720228384991843, "grad_norm": 0.055517133325338364, "learning_rate": 4.4913075808282356e-05, "loss": 0.0609, "num_input_tokens_seen": 75681792, "step": 35065 }, { "epoch": 5.721044045676998, "grad_norm": 0.07071001082658768, "learning_rate": 4.491092381133324e-05, "loss": 0.0192, "num_input_tokens_seen": 75692992, "step": 35070 }, { "epoch": 5.721859706362153, "grad_norm": 0.15198908746242523, "learning_rate": 4.4908771410864e-05, "loss": 0.0459, "num_input_tokens_seen": 75704608, "step": 35075 }, { "epoch": 5.722675367047309, "grad_norm": 0.8605325818061829, "learning_rate": 4.490661860691825e-05, "loss": 0.0612, "num_input_tokens_seen": 75715328, "step": 35080 }, { "epoch": 5.7234910277324635, "grad_norm": 1.8884717226028442, "learning_rate": 4.490446539953961e-05, "loss": 0.1769, "num_input_tokens_seen": 75725376, "step": 35085 }, { "epoch": 5.724306688417618, "grad_norm": 0.039114292711019516, "learning_rate": 4.490231178877173e-05, "loss": 0.0875, "num_input_tokens_seen": 75736512, "step": 35090 }, { "epoch": 5.725122349102773, "grad_norm": 0.031515687704086304, "learning_rate": 4.490015777465827e-05, "loss": 0.0249, "num_input_tokens_seen": 75747552, "step": 35095 }, { "epoch": 5.725938009787928, "grad_norm": 1.0057933330535889, "learning_rate": 4.489800335724286e-05, "loss": 0.1587, "num_input_tokens_seen": 75759136, "step": 35100 }, { "epoch": 5.726753670473083, "grad_norm": 0.11092457175254822, "learning_rate": 4.489584853656916e-05, "loss": 0.1288, "num_input_tokens_seen": 75770208, "step": 35105 }, { "epoch": 5.7275693311582385, "grad_norm": 0.1102370172739029, "learning_rate": 4.489369331268085e-05, "loss": 0.0459, "num_input_tokens_seen": 75780800, "step": 35110 }, { "epoch": 5.728384991843393, "grad_norm": 0.4257639944553375, "learning_rate": 4.489153768562162e-05, "loss": 0.0372, "num_input_tokens_seen": 75791328, "step": 35115 }, { "epoch": 5.729200652528548, "grad_norm": 0.17583245038986206, "learning_rate": 4.488938165543514e-05, "loss": 0.2576, "num_input_tokens_seen": 75802688, "step": 35120 }, { "epoch": 5.730016313213703, "grad_norm": 0.11226793378591537, "learning_rate": 4.488722522216511e-05, "loss": 0.2116, "num_input_tokens_seen": 75812640, "step": 35125 }, { "epoch": 5.730831973898858, "grad_norm": 0.5602582097053528, "learning_rate": 4.488506838585522e-05, "loss": 0.0807, "num_input_tokens_seen": 75823680, "step": 35130 }, { "epoch": 5.731647634584013, "grad_norm": 0.14119701087474823, "learning_rate": 4.488291114654921e-05, "loss": 0.052, "num_input_tokens_seen": 75834208, "step": 35135 }, { "epoch": 5.732463295269168, "grad_norm": 0.021432343870401382, "learning_rate": 4.488075350429077e-05, "loss": 0.0725, "num_input_tokens_seen": 75846432, "step": 35140 }, { "epoch": 5.733278955954323, "grad_norm": 0.3877235949039459, "learning_rate": 4.487859545912365e-05, "loss": 0.2014, "num_input_tokens_seen": 75856960, "step": 35145 }, { "epoch": 5.734094616639478, "grad_norm": 0.7420638799667358, "learning_rate": 4.487643701109157e-05, "loss": 0.0495, "num_input_tokens_seen": 75868256, "step": 35150 }, { "epoch": 5.734910277324633, "grad_norm": 0.039295319467782974, "learning_rate": 4.487427816023828e-05, "loss": 0.0651, "num_input_tokens_seen": 75879808, "step": 35155 }, { "epoch": 5.735725938009788, "grad_norm": 3.0043866634368896, "learning_rate": 4.487211890660753e-05, "loss": 0.2969, "num_input_tokens_seen": 75891168, "step": 35160 }, { "epoch": 5.736541598694943, "grad_norm": 0.05522492527961731, "learning_rate": 4.486995925024308e-05, "loss": 0.2899, "num_input_tokens_seen": 75902080, "step": 35165 }, { "epoch": 5.737357259380098, "grad_norm": 0.9368324875831604, "learning_rate": 4.48677991911887e-05, "loss": 0.1565, "num_input_tokens_seen": 75913568, "step": 35170 }, { "epoch": 5.738172920065253, "grad_norm": 0.08245100826025009, "learning_rate": 4.486563872948817e-05, "loss": 0.1545, "num_input_tokens_seen": 75923552, "step": 35175 }, { "epoch": 5.738988580750408, "grad_norm": 1.3346164226531982, "learning_rate": 4.486347786518526e-05, "loss": 0.1975, "num_input_tokens_seen": 75933856, "step": 35180 }, { "epoch": 5.739804241435563, "grad_norm": 0.1068873256444931, "learning_rate": 4.486131659832378e-05, "loss": 0.1825, "num_input_tokens_seen": 75944896, "step": 35185 }, { "epoch": 5.740619902120718, "grad_norm": 1.7289294004440308, "learning_rate": 4.4859154928947525e-05, "loss": 0.1877, "num_input_tokens_seen": 75954880, "step": 35190 }, { "epoch": 5.741435562805873, "grad_norm": 0.10736056417226791, "learning_rate": 4.485699285710029e-05, "loss": 0.0842, "num_input_tokens_seen": 75964800, "step": 35195 }, { "epoch": 5.742251223491028, "grad_norm": 1.3219763040542603, "learning_rate": 4.485483038282592e-05, "loss": 0.2074, "num_input_tokens_seen": 75975264, "step": 35200 }, { "epoch": 5.743066884176183, "grad_norm": 0.48376306891441345, "learning_rate": 4.485266750616821e-05, "loss": 0.0601, "num_input_tokens_seen": 75986976, "step": 35205 }, { "epoch": 5.7438825448613375, "grad_norm": 0.13544213771820068, "learning_rate": 4.485050422717102e-05, "loss": 0.1022, "num_input_tokens_seen": 75998752, "step": 35210 }, { "epoch": 5.744698205546492, "grad_norm": 0.259528785943985, "learning_rate": 4.4848340545878174e-05, "loss": 0.1504, "num_input_tokens_seen": 76010688, "step": 35215 }, { "epoch": 5.745513866231647, "grad_norm": 1.0589373111724854, "learning_rate": 4.4846176462333526e-05, "loss": 0.288, "num_input_tokens_seen": 76021408, "step": 35220 }, { "epoch": 5.746329526916803, "grad_norm": 0.07023920118808746, "learning_rate": 4.4844011976580934e-05, "loss": 0.226, "num_input_tokens_seen": 76032480, "step": 35225 }, { "epoch": 5.747145187601958, "grad_norm": 1.467049241065979, "learning_rate": 4.4841847088664275e-05, "loss": 0.1125, "num_input_tokens_seen": 76043072, "step": 35230 }, { "epoch": 5.7479608482871125, "grad_norm": 0.1086091697216034, "learning_rate": 4.483968179862741e-05, "loss": 0.0874, "num_input_tokens_seen": 76054208, "step": 35235 }, { "epoch": 5.748776508972267, "grad_norm": 0.2532711625099182, "learning_rate": 4.483751610651422e-05, "loss": 0.1041, "num_input_tokens_seen": 76066080, "step": 35240 }, { "epoch": 5.749592169657422, "grad_norm": 0.7750874757766724, "learning_rate": 4.48353500123686e-05, "loss": 0.1675, "num_input_tokens_seen": 76077088, "step": 35245 }, { "epoch": 5.750407830342578, "grad_norm": 0.11692773550748825, "learning_rate": 4.483318351623446e-05, "loss": 0.1375, "num_input_tokens_seen": 76087104, "step": 35250 }, { "epoch": 5.751223491027733, "grad_norm": 0.8028681874275208, "learning_rate": 4.483101661815569e-05, "loss": 0.0704, "num_input_tokens_seen": 76098560, "step": 35255 }, { "epoch": 5.7520391517128875, "grad_norm": 0.6379098892211914, "learning_rate": 4.4828849318176205e-05, "loss": 0.0656, "num_input_tokens_seen": 76109472, "step": 35260 }, { "epoch": 5.752854812398042, "grad_norm": 0.8435890674591064, "learning_rate": 4.482668161633993e-05, "loss": 0.1528, "num_input_tokens_seen": 76119744, "step": 35265 }, { "epoch": 5.753670473083197, "grad_norm": 0.06179606169462204, "learning_rate": 4.4824513512690815e-05, "loss": 0.0542, "num_input_tokens_seen": 76129664, "step": 35270 }, { "epoch": 5.754486133768353, "grad_norm": 0.16587398946285248, "learning_rate": 4.482234500727277e-05, "loss": 0.0522, "num_input_tokens_seen": 76140384, "step": 35275 }, { "epoch": 5.755301794453508, "grad_norm": 1.2582770586013794, "learning_rate": 4.482017610012976e-05, "loss": 0.1389, "num_input_tokens_seen": 76150624, "step": 35280 }, { "epoch": 5.7561174551386625, "grad_norm": 1.073026180267334, "learning_rate": 4.481800679130575e-05, "loss": 0.0898, "num_input_tokens_seen": 76161952, "step": 35285 }, { "epoch": 5.756933115823817, "grad_norm": 0.10110930353403091, "learning_rate": 4.481583708084468e-05, "loss": 0.1338, "num_input_tokens_seen": 76173600, "step": 35290 }, { "epoch": 5.757748776508972, "grad_norm": 0.09915459156036377, "learning_rate": 4.481366696879053e-05, "loss": 0.0519, "num_input_tokens_seen": 76183808, "step": 35295 }, { "epoch": 5.758564437194127, "grad_norm": 1.7940146923065186, "learning_rate": 4.48114964551873e-05, "loss": 0.1789, "num_input_tokens_seen": 76194752, "step": 35300 }, { "epoch": 5.759380097879282, "grad_norm": 1.625160813331604, "learning_rate": 4.480932554007895e-05, "loss": 0.1247, "num_input_tokens_seen": 76205696, "step": 35305 }, { "epoch": 5.760195758564437, "grad_norm": 0.31766441464424133, "learning_rate": 4.48071542235095e-05, "loss": 0.1428, "num_input_tokens_seen": 76215072, "step": 35310 }, { "epoch": 5.761011419249592, "grad_norm": 0.8403124213218689, "learning_rate": 4.480498250552292e-05, "loss": 0.1933, "num_input_tokens_seen": 76225248, "step": 35315 }, { "epoch": 5.761827079934747, "grad_norm": 1.0129334926605225, "learning_rate": 4.480281038616326e-05, "loss": 0.1141, "num_input_tokens_seen": 76235552, "step": 35320 }, { "epoch": 5.762642740619902, "grad_norm": 1.2471435070037842, "learning_rate": 4.480063786547452e-05, "loss": 0.1782, "num_input_tokens_seen": 76246624, "step": 35325 }, { "epoch": 5.763458401305057, "grad_norm": 1.5988373756408691, "learning_rate": 4.479846494350074e-05, "loss": 0.0903, "num_input_tokens_seen": 76256352, "step": 35330 }, { "epoch": 5.764274061990212, "grad_norm": 0.20279906690120697, "learning_rate": 4.479629162028595e-05, "loss": 0.1436, "num_input_tokens_seen": 76266912, "step": 35335 }, { "epoch": 5.765089722675367, "grad_norm": 0.251762330532074, "learning_rate": 4.47941178958742e-05, "loss": 0.2136, "num_input_tokens_seen": 76277344, "step": 35340 }, { "epoch": 5.765905383360522, "grad_norm": 2.327842950820923, "learning_rate": 4.4791943770309535e-05, "loss": 0.2148, "num_input_tokens_seen": 76288544, "step": 35345 }, { "epoch": 5.766721044045677, "grad_norm": 1.7692168951034546, "learning_rate": 4.478976924363603e-05, "loss": 0.197, "num_input_tokens_seen": 76299744, "step": 35350 }, { "epoch": 5.767536704730832, "grad_norm": 0.6272241473197937, "learning_rate": 4.478759431589773e-05, "loss": 0.1851, "num_input_tokens_seen": 76309280, "step": 35355 }, { "epoch": 5.768352365415987, "grad_norm": 0.6464052200317383, "learning_rate": 4.478541898713874e-05, "loss": 0.0935, "num_input_tokens_seen": 76319072, "step": 35360 }, { "epoch": 5.769168026101142, "grad_norm": 0.631996214389801, "learning_rate": 4.4783243257403134e-05, "loss": 0.0958, "num_input_tokens_seen": 76329696, "step": 35365 }, { "epoch": 5.769983686786297, "grad_norm": 0.19464406371116638, "learning_rate": 4.478106712673501e-05, "loss": 0.2005, "num_input_tokens_seen": 76339904, "step": 35370 }, { "epoch": 5.770799347471452, "grad_norm": 0.637769341468811, "learning_rate": 4.477889059517846e-05, "loss": 0.0742, "num_input_tokens_seen": 76351776, "step": 35375 }, { "epoch": 5.771615008156607, "grad_norm": 0.1406434029340744, "learning_rate": 4.47767136627776e-05, "loss": 0.1444, "num_input_tokens_seen": 76362208, "step": 35380 }, { "epoch": 5.7724306688417615, "grad_norm": 0.4117233455181122, "learning_rate": 4.477453632957656e-05, "loss": 0.0619, "num_input_tokens_seen": 76374624, "step": 35385 }, { "epoch": 5.773246329526917, "grad_norm": 0.16040191054344177, "learning_rate": 4.4772358595619444e-05, "loss": 0.0814, "num_input_tokens_seen": 76384064, "step": 35390 }, { "epoch": 5.774061990212072, "grad_norm": 1.202393651008606, "learning_rate": 4.47701804609504e-05, "loss": 0.0715, "num_input_tokens_seen": 76394976, "step": 35395 }, { "epoch": 5.774877650897227, "grad_norm": 0.424609899520874, "learning_rate": 4.476800192561357e-05, "loss": 0.2253, "num_input_tokens_seen": 76407136, "step": 35400 }, { "epoch": 5.775693311582382, "grad_norm": 0.813328742980957, "learning_rate": 4.476582298965311e-05, "loss": 0.0953, "num_input_tokens_seen": 76417536, "step": 35405 }, { "epoch": 5.7765089722675365, "grad_norm": 0.06354502588510513, "learning_rate": 4.476364365311316e-05, "loss": 0.0268, "num_input_tokens_seen": 76428576, "step": 35410 }, { "epoch": 5.777324632952691, "grad_norm": 1.381139874458313, "learning_rate": 4.47614639160379e-05, "loss": 0.1255, "num_input_tokens_seen": 76439040, "step": 35415 }, { "epoch": 5.778140293637847, "grad_norm": 0.4918675124645233, "learning_rate": 4.4759283778471516e-05, "loss": 0.2898, "num_input_tokens_seen": 76450624, "step": 35420 }, { "epoch": 5.778955954323002, "grad_norm": 0.08213233947753906, "learning_rate": 4.475710324045817e-05, "loss": 0.1412, "num_input_tokens_seen": 76459840, "step": 35425 }, { "epoch": 5.779771615008157, "grad_norm": 1.3042789697647095, "learning_rate": 4.475492230204208e-05, "loss": 0.2166, "num_input_tokens_seen": 76470688, "step": 35430 }, { "epoch": 5.780587275693311, "grad_norm": 0.20693516731262207, "learning_rate": 4.475274096326741e-05, "loss": 0.1933, "num_input_tokens_seen": 76481728, "step": 35435 }, { "epoch": 5.781402936378466, "grad_norm": 2.2293992042541504, "learning_rate": 4.475055922417839e-05, "loss": 0.1472, "num_input_tokens_seen": 76492032, "step": 35440 }, { "epoch": 5.782218597063622, "grad_norm": 0.8855673670768738, "learning_rate": 4.474837708481924e-05, "loss": 0.0613, "num_input_tokens_seen": 76502784, "step": 35445 }, { "epoch": 5.783034257748777, "grad_norm": 0.1835489422082901, "learning_rate": 4.4746194545234165e-05, "loss": 0.055, "num_input_tokens_seen": 76513856, "step": 35450 }, { "epoch": 5.783849918433932, "grad_norm": 0.3125417232513428, "learning_rate": 4.474401160546742e-05, "loss": 0.047, "num_input_tokens_seen": 76525056, "step": 35455 }, { "epoch": 5.784665579119086, "grad_norm": 1.3860241174697876, "learning_rate": 4.474182826556322e-05, "loss": 0.1286, "num_input_tokens_seen": 76535584, "step": 35460 }, { "epoch": 5.785481239804241, "grad_norm": 0.6736984252929688, "learning_rate": 4.473964452556583e-05, "loss": 0.0971, "num_input_tokens_seen": 76547168, "step": 35465 }, { "epoch": 5.786296900489396, "grad_norm": 0.11326828598976135, "learning_rate": 4.4737460385519504e-05, "loss": 0.0486, "num_input_tokens_seen": 76557888, "step": 35470 }, { "epoch": 5.787112561174552, "grad_norm": 0.7946982383728027, "learning_rate": 4.473527584546851e-05, "loss": 0.0788, "num_input_tokens_seen": 76568864, "step": 35475 }, { "epoch": 5.787928221859707, "grad_norm": 0.3762643039226532, "learning_rate": 4.4733090905457105e-05, "loss": 0.1166, "num_input_tokens_seen": 76579296, "step": 35480 }, { "epoch": 5.788743882544861, "grad_norm": 0.11450184136629105, "learning_rate": 4.473090556552958e-05, "loss": 0.035, "num_input_tokens_seen": 76589824, "step": 35485 }, { "epoch": 5.789559543230016, "grad_norm": 0.5407349467277527, "learning_rate": 4.472871982573023e-05, "loss": 0.1982, "num_input_tokens_seen": 76599744, "step": 35490 }, { "epoch": 5.790375203915171, "grad_norm": 0.3186228573322296, "learning_rate": 4.4726533686103345e-05, "loss": 0.0922, "num_input_tokens_seen": 76612000, "step": 35495 }, { "epoch": 5.791190864600326, "grad_norm": 0.4515172839164734, "learning_rate": 4.472434714669322e-05, "loss": 0.079, "num_input_tokens_seen": 76623936, "step": 35500 }, { "epoch": 5.7920065252854815, "grad_norm": 0.1480158120393753, "learning_rate": 4.472216020754419e-05, "loss": 0.0712, "num_input_tokens_seen": 76634240, "step": 35505 }, { "epoch": 5.792822185970636, "grad_norm": 0.23734629154205322, "learning_rate": 4.4719972868700554e-05, "loss": 0.1587, "num_input_tokens_seen": 76645344, "step": 35510 }, { "epoch": 5.793637846655791, "grad_norm": 0.12398495525121689, "learning_rate": 4.4717785130206656e-05, "loss": 0.0175, "num_input_tokens_seen": 76656000, "step": 35515 }, { "epoch": 5.794453507340946, "grad_norm": 1.5213664770126343, "learning_rate": 4.471559699210683e-05, "loss": 0.1132, "num_input_tokens_seen": 76666976, "step": 35520 }, { "epoch": 5.795269168026101, "grad_norm": 3.539229393005371, "learning_rate": 4.471340845444542e-05, "loss": 0.2321, "num_input_tokens_seen": 76676192, "step": 35525 }, { "epoch": 5.7960848287112565, "grad_norm": 0.31373360753059387, "learning_rate": 4.4711219517266777e-05, "loss": 0.144, "num_input_tokens_seen": 76687104, "step": 35530 }, { "epoch": 5.796900489396411, "grad_norm": 1.5069242715835571, "learning_rate": 4.470903018061527e-05, "loss": 0.1355, "num_input_tokens_seen": 76699104, "step": 35535 }, { "epoch": 5.797716150081566, "grad_norm": 0.06342776864767075, "learning_rate": 4.470684044453526e-05, "loss": 0.209, "num_input_tokens_seen": 76710464, "step": 35540 }, { "epoch": 5.798531810766721, "grad_norm": 0.374445378780365, "learning_rate": 4.470465030907113e-05, "loss": 0.0497, "num_input_tokens_seen": 76720864, "step": 35545 }, { "epoch": 5.799347471451876, "grad_norm": 0.06380236893892288, "learning_rate": 4.4702459774267256e-05, "loss": 0.1022, "num_input_tokens_seen": 76731520, "step": 35550 }, { "epoch": 5.800163132137031, "grad_norm": 2.2240920066833496, "learning_rate": 4.4700268840168045e-05, "loss": 0.1986, "num_input_tokens_seen": 76741536, "step": 35555 }, { "epoch": 5.800978792822186, "grad_norm": 0.09950514882802963, "learning_rate": 4.4698077506817904e-05, "loss": 0.0524, "num_input_tokens_seen": 76751904, "step": 35560 }, { "epoch": 5.801794453507341, "grad_norm": 1.4752988815307617, "learning_rate": 4.469588577426123e-05, "loss": 0.2839, "num_input_tokens_seen": 76761504, "step": 35565 }, { "epoch": 5.802610114192496, "grad_norm": 0.40847402811050415, "learning_rate": 4.469369364254243e-05, "loss": 0.0374, "num_input_tokens_seen": 76771968, "step": 35570 }, { "epoch": 5.803425774877651, "grad_norm": 1.4366742372512817, "learning_rate": 4.4691501111705966e-05, "loss": 0.1397, "num_input_tokens_seen": 76782432, "step": 35575 }, { "epoch": 5.804241435562806, "grad_norm": 1.1904263496398926, "learning_rate": 4.468930818179624e-05, "loss": 0.1589, "num_input_tokens_seen": 76793152, "step": 35580 }, { "epoch": 5.80505709624796, "grad_norm": 0.9160469770431519, "learning_rate": 4.46871148528577e-05, "loss": 0.1054, "num_input_tokens_seen": 76804000, "step": 35585 }, { "epoch": 5.805872756933116, "grad_norm": 0.3360448479652405, "learning_rate": 4.468492112493481e-05, "loss": 0.067, "num_input_tokens_seen": 76814432, "step": 35590 }, { "epoch": 5.806688417618271, "grad_norm": 0.39547932147979736, "learning_rate": 4.468272699807203e-05, "loss": 0.1371, "num_input_tokens_seen": 76825472, "step": 35595 }, { "epoch": 5.807504078303426, "grad_norm": 0.1637336164712906, "learning_rate": 4.4680532472313806e-05, "loss": 0.0191, "num_input_tokens_seen": 76837568, "step": 35600 }, { "epoch": 5.808319738988581, "grad_norm": 0.3348437249660492, "learning_rate": 4.467833754770463e-05, "loss": 0.0438, "num_input_tokens_seen": 76848768, "step": 35605 }, { "epoch": 5.809135399673735, "grad_norm": 1.6104875802993774, "learning_rate": 4.467614222428898e-05, "loss": 0.095, "num_input_tokens_seen": 76860000, "step": 35610 }, { "epoch": 5.809951060358891, "grad_norm": 0.12363220751285553, "learning_rate": 4.467394650211134e-05, "loss": 0.0921, "num_input_tokens_seen": 76871904, "step": 35615 }, { "epoch": 5.810766721044046, "grad_norm": 0.7879684567451477, "learning_rate": 4.467175038121623e-05, "loss": 0.0961, "num_input_tokens_seen": 76881504, "step": 35620 }, { "epoch": 5.811582381729201, "grad_norm": 2.4726340770721436, "learning_rate": 4.466955386164813e-05, "loss": 0.2356, "num_input_tokens_seen": 76894080, "step": 35625 }, { "epoch": 5.8123980424143555, "grad_norm": 0.7755162119865417, "learning_rate": 4.466735694345157e-05, "loss": 0.0922, "num_input_tokens_seen": 76904672, "step": 35630 }, { "epoch": 5.81321370309951, "grad_norm": 1.3352270126342773, "learning_rate": 4.466515962667107e-05, "loss": 0.0826, "num_input_tokens_seen": 76914976, "step": 35635 }, { "epoch": 5.814029363784666, "grad_norm": 1.2640283107757568, "learning_rate": 4.466296191135118e-05, "loss": 0.2228, "num_input_tokens_seen": 76925824, "step": 35640 }, { "epoch": 5.814845024469821, "grad_norm": 0.36062443256378174, "learning_rate": 4.46607637975364e-05, "loss": 0.0656, "num_input_tokens_seen": 76936160, "step": 35645 }, { "epoch": 5.815660685154976, "grad_norm": 1.0706791877746582, "learning_rate": 4.4658565285271315e-05, "loss": 0.1103, "num_input_tokens_seen": 76946016, "step": 35650 }, { "epoch": 5.8164763458401305, "grad_norm": 1.8493905067443848, "learning_rate": 4.465636637460046e-05, "loss": 0.1454, "num_input_tokens_seen": 76956704, "step": 35655 }, { "epoch": 5.817292006525285, "grad_norm": 0.8930352330207825, "learning_rate": 4.465416706556841e-05, "loss": 0.0585, "num_input_tokens_seen": 76967360, "step": 35660 }, { "epoch": 5.81810766721044, "grad_norm": 0.5949097871780396, "learning_rate": 4.465196735821973e-05, "loss": 0.1086, "num_input_tokens_seen": 76979008, "step": 35665 }, { "epoch": 5.818923327895595, "grad_norm": 0.04608462378382683, "learning_rate": 4.4649767252599e-05, "loss": 0.157, "num_input_tokens_seen": 76988800, "step": 35670 }, { "epoch": 5.819738988580751, "grad_norm": 1.8748780488967896, "learning_rate": 4.46475667487508e-05, "loss": 0.321, "num_input_tokens_seen": 76999616, "step": 35675 }, { "epoch": 5.8205546492659055, "grad_norm": 0.18878388404846191, "learning_rate": 4.4645365846719744e-05, "loss": 0.0255, "num_input_tokens_seen": 77010656, "step": 35680 }, { "epoch": 5.82137030995106, "grad_norm": 0.043484319001436234, "learning_rate": 4.464316454655042e-05, "loss": 0.1498, "num_input_tokens_seen": 77021920, "step": 35685 }, { "epoch": 5.822185970636215, "grad_norm": 1.835942029953003, "learning_rate": 4.4640962848287455e-05, "loss": 0.075, "num_input_tokens_seen": 77032416, "step": 35690 }, { "epoch": 5.82300163132137, "grad_norm": 1.5570921897888184, "learning_rate": 4.463876075197546e-05, "loss": 0.115, "num_input_tokens_seen": 77042944, "step": 35695 }, { "epoch": 5.823817292006526, "grad_norm": 0.8075902462005615, "learning_rate": 4.4636558257659065e-05, "loss": 0.0509, "num_input_tokens_seen": 77051584, "step": 35700 }, { "epoch": 5.8246329526916805, "grad_norm": 1.6108978986740112, "learning_rate": 4.4634355365382906e-05, "loss": 0.1663, "num_input_tokens_seen": 77062048, "step": 35705 }, { "epoch": 5.825448613376835, "grad_norm": 0.29567068815231323, "learning_rate": 4.463215207519163e-05, "loss": 0.2179, "num_input_tokens_seen": 77074080, "step": 35710 }, { "epoch": 5.82626427406199, "grad_norm": 0.4036219120025635, "learning_rate": 4.4629948387129885e-05, "loss": 0.3015, "num_input_tokens_seen": 77085632, "step": 35715 }, { "epoch": 5.827079934747145, "grad_norm": 0.34364938735961914, "learning_rate": 4.462774430124233e-05, "loss": 0.021, "num_input_tokens_seen": 77097664, "step": 35720 }, { "epoch": 5.827895595432301, "grad_norm": 0.17851491272449493, "learning_rate": 4.462553981757364e-05, "loss": 0.045, "num_input_tokens_seen": 77109568, "step": 35725 }, { "epoch": 5.828711256117455, "grad_norm": 1.5134023427963257, "learning_rate": 4.4623334936168484e-05, "loss": 0.1619, "num_input_tokens_seen": 77119648, "step": 35730 }, { "epoch": 5.82952691680261, "grad_norm": 1.2406336069107056, "learning_rate": 4.4621129657071556e-05, "loss": 0.0585, "num_input_tokens_seen": 77132064, "step": 35735 }, { "epoch": 5.830342577487765, "grad_norm": 0.3949146866798401, "learning_rate": 4.461892398032755e-05, "loss": 0.0447, "num_input_tokens_seen": 77142880, "step": 35740 }, { "epoch": 5.83115823817292, "grad_norm": 0.9480941295623779, "learning_rate": 4.461671790598115e-05, "loss": 0.0821, "num_input_tokens_seen": 77154272, "step": 35745 }, { "epoch": 5.831973898858075, "grad_norm": 1.637811303138733, "learning_rate": 4.461451143407709e-05, "loss": 0.2828, "num_input_tokens_seen": 77165280, "step": 35750 }, { "epoch": 5.8327895595432295, "grad_norm": 1.153654932975769, "learning_rate": 4.4612304564660065e-05, "loss": 0.0617, "num_input_tokens_seen": 77176032, "step": 35755 }, { "epoch": 5.833605220228385, "grad_norm": 0.13042819499969482, "learning_rate": 4.4610097297774805e-05, "loss": 0.1303, "num_input_tokens_seen": 77186880, "step": 35760 }, { "epoch": 5.83442088091354, "grad_norm": 1.0998870134353638, "learning_rate": 4.460788963346605e-05, "loss": 0.1631, "num_input_tokens_seen": 77196704, "step": 35765 }, { "epoch": 5.835236541598695, "grad_norm": 0.9854596257209778, "learning_rate": 4.460568157177853e-05, "loss": 0.0831, "num_input_tokens_seen": 77208064, "step": 35770 }, { "epoch": 5.83605220228385, "grad_norm": 0.6697385311126709, "learning_rate": 4.460347311275701e-05, "loss": 0.0928, "num_input_tokens_seen": 77219328, "step": 35775 }, { "epoch": 5.8368678629690045, "grad_norm": 0.1305035501718521, "learning_rate": 4.460126425644624e-05, "loss": 0.0485, "num_input_tokens_seen": 77230592, "step": 35780 }, { "epoch": 5.83768352365416, "grad_norm": 0.6358627080917358, "learning_rate": 4.459905500289098e-05, "loss": 0.131, "num_input_tokens_seen": 77240992, "step": 35785 }, { "epoch": 5.838499184339315, "grad_norm": 0.9306478500366211, "learning_rate": 4.459684535213601e-05, "loss": 0.1467, "num_input_tokens_seen": 77250144, "step": 35790 }, { "epoch": 5.83931484502447, "grad_norm": 0.03926185891032219, "learning_rate": 4.4594635304226104e-05, "loss": 0.0965, "num_input_tokens_seen": 77262464, "step": 35795 }, { "epoch": 5.840130505709625, "grad_norm": 1.5809259414672852, "learning_rate": 4.459242485920605e-05, "loss": 0.2235, "num_input_tokens_seen": 77273248, "step": 35800 }, { "epoch": 5.8409461663947795, "grad_norm": 1.3968883752822876, "learning_rate": 4.459021401712067e-05, "loss": 0.2226, "num_input_tokens_seen": 77284032, "step": 35805 }, { "epoch": 5.841761827079935, "grad_norm": 0.5720195770263672, "learning_rate": 4.458800277801474e-05, "loss": 0.071, "num_input_tokens_seen": 77295072, "step": 35810 }, { "epoch": 5.84257748776509, "grad_norm": 1.9953107833862305, "learning_rate": 4.458579114193308e-05, "loss": 0.2531, "num_input_tokens_seen": 77306208, "step": 35815 }, { "epoch": 5.843393148450245, "grad_norm": 0.2192753404378891, "learning_rate": 4.458357910892052e-05, "loss": 0.0481, "num_input_tokens_seen": 77317024, "step": 35820 }, { "epoch": 5.8442088091354, "grad_norm": 1.7317790985107422, "learning_rate": 4.458136667902189e-05, "loss": 0.1784, "num_input_tokens_seen": 77327648, "step": 35825 }, { "epoch": 5.8450244698205545, "grad_norm": 0.5529813170433044, "learning_rate": 4.457915385228202e-05, "loss": 0.0593, "num_input_tokens_seen": 77338304, "step": 35830 }, { "epoch": 5.845840130505709, "grad_norm": 0.29800257086753845, "learning_rate": 4.457694062874576e-05, "loss": 0.0859, "num_input_tokens_seen": 77348768, "step": 35835 }, { "epoch": 5.846655791190865, "grad_norm": 0.23184683918952942, "learning_rate": 4.457472700845796e-05, "loss": 0.1133, "num_input_tokens_seen": 77358208, "step": 35840 }, { "epoch": 5.84747145187602, "grad_norm": 0.060414232313632965, "learning_rate": 4.457251299146349e-05, "loss": 0.0584, "num_input_tokens_seen": 77368896, "step": 35845 }, { "epoch": 5.848287112561175, "grad_norm": 0.2500130236148834, "learning_rate": 4.4570298577807215e-05, "loss": 0.0756, "num_input_tokens_seen": 77379616, "step": 35850 }, { "epoch": 5.849102773246329, "grad_norm": 0.9189040660858154, "learning_rate": 4.456808376753401e-05, "loss": 0.0777, "num_input_tokens_seen": 77390400, "step": 35855 }, { "epoch": 5.849918433931484, "grad_norm": 1.1258652210235596, "learning_rate": 4.456586856068876e-05, "loss": 0.1348, "num_input_tokens_seen": 77400544, "step": 35860 }, { "epoch": 5.850734094616639, "grad_norm": 0.37024426460266113, "learning_rate": 4.456365295731637e-05, "loss": 0.0604, "num_input_tokens_seen": 77411136, "step": 35865 }, { "epoch": 5.851549755301795, "grad_norm": 0.22611235082149506, "learning_rate": 4.4561436957461725e-05, "loss": 0.2022, "num_input_tokens_seen": 77422240, "step": 35870 }, { "epoch": 5.85236541598695, "grad_norm": 0.29980576038360596, "learning_rate": 4.4559220561169756e-05, "loss": 0.0752, "num_input_tokens_seen": 77431648, "step": 35875 }, { "epoch": 5.853181076672104, "grad_norm": 0.43066510558128357, "learning_rate": 4.455700376848536e-05, "loss": 0.0609, "num_input_tokens_seen": 77443680, "step": 35880 }, { "epoch": 5.853996737357259, "grad_norm": 0.3324006199836731, "learning_rate": 4.4554786579453475e-05, "loss": 0.0828, "num_input_tokens_seen": 77453824, "step": 35885 }, { "epoch": 5.854812398042414, "grad_norm": 0.07713599503040314, "learning_rate": 4.4552568994119036e-05, "loss": 0.1863, "num_input_tokens_seen": 77464512, "step": 35890 }, { "epoch": 5.85562805872757, "grad_norm": 1.3573846817016602, "learning_rate": 4.455035101252698e-05, "loss": 0.2467, "num_input_tokens_seen": 77475584, "step": 35895 }, { "epoch": 5.856443719412725, "grad_norm": 1.273814082145691, "learning_rate": 4.4548132634722264e-05, "loss": 0.1509, "num_input_tokens_seen": 77486464, "step": 35900 }, { "epoch": 5.857259380097879, "grad_norm": 1.7486984729766846, "learning_rate": 4.454591386074983e-05, "loss": 0.1291, "num_input_tokens_seen": 77495168, "step": 35905 }, { "epoch": 5.858075040783034, "grad_norm": 2.1534762382507324, "learning_rate": 4.454369469065467e-05, "loss": 0.1398, "num_input_tokens_seen": 77505632, "step": 35910 }, { "epoch": 5.858890701468189, "grad_norm": 0.22716529667377472, "learning_rate": 4.4541475124481736e-05, "loss": 0.1058, "num_input_tokens_seen": 77516512, "step": 35915 }, { "epoch": 5.859706362153344, "grad_norm": 1.4707144498825073, "learning_rate": 4.453925516227602e-05, "loss": 0.2011, "num_input_tokens_seen": 77528352, "step": 35920 }, { "epoch": 5.8605220228384995, "grad_norm": 0.6062422394752502, "learning_rate": 4.453703480408251e-05, "loss": 0.1738, "num_input_tokens_seen": 77537728, "step": 35925 }, { "epoch": 5.861337683523654, "grad_norm": 0.0660415068268776, "learning_rate": 4.453481404994621e-05, "loss": 0.1804, "num_input_tokens_seen": 77549216, "step": 35930 }, { "epoch": 5.862153344208809, "grad_norm": 1.4677802324295044, "learning_rate": 4.4532592899912116e-05, "loss": 0.2314, "num_input_tokens_seen": 77560288, "step": 35935 }, { "epoch": 5.862969004893964, "grad_norm": 1.9867557287216187, "learning_rate": 4.453037135402525e-05, "loss": 0.1832, "num_input_tokens_seen": 77571968, "step": 35940 }, { "epoch": 5.863784665579119, "grad_norm": 0.09984593838453293, "learning_rate": 4.452814941233063e-05, "loss": 0.0391, "num_input_tokens_seen": 77583872, "step": 35945 }, { "epoch": 5.864600326264274, "grad_norm": 0.10886699706315994, "learning_rate": 4.4525927074873295e-05, "loss": 0.1273, "num_input_tokens_seen": 77595232, "step": 35950 }, { "epoch": 5.865415986949429, "grad_norm": 0.07926511019468307, "learning_rate": 4.4523704341698274e-05, "loss": 0.0242, "num_input_tokens_seen": 77605696, "step": 35955 }, { "epoch": 5.866231647634584, "grad_norm": 0.176889106631279, "learning_rate": 4.4521481212850616e-05, "loss": 0.0845, "num_input_tokens_seen": 77616192, "step": 35960 }, { "epoch": 5.867047308319739, "grad_norm": 0.7150045037269592, "learning_rate": 4.4519257688375384e-05, "loss": 0.0518, "num_input_tokens_seen": 77626688, "step": 35965 }, { "epoch": 5.867862969004894, "grad_norm": 0.14000153541564941, "learning_rate": 4.451703376831762e-05, "loss": 0.0675, "num_input_tokens_seen": 77637152, "step": 35970 }, { "epoch": 5.868678629690049, "grad_norm": 1.044627070426941, "learning_rate": 4.4514809452722414e-05, "loss": 0.0697, "num_input_tokens_seen": 77647328, "step": 35975 }, { "epoch": 5.869494290375204, "grad_norm": 0.2412264496088028, "learning_rate": 4.4512584741634836e-05, "loss": 0.0348, "num_input_tokens_seen": 77657824, "step": 35980 }, { "epoch": 5.870309951060359, "grad_norm": 0.08114472031593323, "learning_rate": 4.451035963509998e-05, "loss": 0.0583, "num_input_tokens_seen": 77668768, "step": 35985 }, { "epoch": 5.871125611745514, "grad_norm": 2.297060489654541, "learning_rate": 4.450813413316293e-05, "loss": 0.131, "num_input_tokens_seen": 77679040, "step": 35990 }, { "epoch": 5.871941272430669, "grad_norm": 0.07110206037759781, "learning_rate": 4.45059082358688e-05, "loss": 0.136, "num_input_tokens_seen": 77690304, "step": 35995 }, { "epoch": 5.872756933115824, "grad_norm": 1.545137882232666, "learning_rate": 4.450368194326269e-05, "loss": 0.0768, "num_input_tokens_seen": 77700480, "step": 36000 }, { "epoch": 5.873572593800979, "grad_norm": 0.5102595686912537, "learning_rate": 4.450145525538972e-05, "loss": 0.09, "num_input_tokens_seen": 77711552, "step": 36005 }, { "epoch": 5.874388254486134, "grad_norm": 0.5317001938819885, "learning_rate": 4.4499228172295025e-05, "loss": 0.1781, "num_input_tokens_seen": 77722944, "step": 36010 }, { "epoch": 5.875203915171289, "grad_norm": 0.09068028628826141, "learning_rate": 4.449700069402374e-05, "loss": 0.0405, "num_input_tokens_seen": 77735072, "step": 36015 }, { "epoch": 5.876019575856444, "grad_norm": 0.04275471344590187, "learning_rate": 4.4494772820620986e-05, "loss": 0.0326, "num_input_tokens_seen": 77746848, "step": 36020 }, { "epoch": 5.876835236541599, "grad_norm": 0.20415519177913666, "learning_rate": 4.4492544552131943e-05, "loss": 0.0921, "num_input_tokens_seen": 77757664, "step": 36025 }, { "epoch": 5.877650897226753, "grad_norm": 0.11931298673152924, "learning_rate": 4.449031588860175e-05, "loss": 0.0761, "num_input_tokens_seen": 77768288, "step": 36030 }, { "epoch": 5.878466557911908, "grad_norm": 0.04092282056808472, "learning_rate": 4.4488086830075585e-05, "loss": 0.1241, "num_input_tokens_seen": 77778080, "step": 36035 }, { "epoch": 5.879282218597064, "grad_norm": 1.411371111869812, "learning_rate": 4.448585737659862e-05, "loss": 0.1797, "num_input_tokens_seen": 77790784, "step": 36040 }, { "epoch": 5.880097879282219, "grad_norm": 0.7710120677947998, "learning_rate": 4.448362752821603e-05, "loss": 0.1368, "num_input_tokens_seen": 77801344, "step": 36045 }, { "epoch": 5.8809135399673735, "grad_norm": 0.641048789024353, "learning_rate": 4.4481397284973016e-05, "loss": 0.086, "num_input_tokens_seen": 77811392, "step": 36050 }, { "epoch": 5.881729200652528, "grad_norm": 1.868038535118103, "learning_rate": 4.447916664691477e-05, "loss": 0.1226, "num_input_tokens_seen": 77821312, "step": 36055 }, { "epoch": 5.882544861337683, "grad_norm": 0.1881672739982605, "learning_rate": 4.44769356140865e-05, "loss": 0.2453, "num_input_tokens_seen": 77831872, "step": 36060 }, { "epoch": 5.883360522022839, "grad_norm": 0.3798012435436249, "learning_rate": 4.447470418653342e-05, "loss": 0.1339, "num_input_tokens_seen": 77842752, "step": 36065 }, { "epoch": 5.884176182707994, "grad_norm": 0.3176012337207794, "learning_rate": 4.4472472364300754e-05, "loss": 0.0813, "num_input_tokens_seen": 77853696, "step": 36070 }, { "epoch": 5.8849918433931485, "grad_norm": 1.4137767553329468, "learning_rate": 4.447024014743374e-05, "loss": 0.1898, "num_input_tokens_seen": 77864288, "step": 36075 }, { "epoch": 5.885807504078303, "grad_norm": 0.18151451647281647, "learning_rate": 4.446800753597761e-05, "loss": 0.0924, "num_input_tokens_seen": 77874880, "step": 36080 }, { "epoch": 5.886623164763458, "grad_norm": 0.29608607292175293, "learning_rate": 4.44657745299776e-05, "loss": 0.0605, "num_input_tokens_seen": 77886688, "step": 36085 }, { "epoch": 5.887438825448614, "grad_norm": 0.6969850063323975, "learning_rate": 4.446354112947898e-05, "loss": 0.1071, "num_input_tokens_seen": 77897376, "step": 36090 }, { "epoch": 5.888254486133769, "grad_norm": 1.2314599752426147, "learning_rate": 4.446130733452701e-05, "loss": 0.0799, "num_input_tokens_seen": 77908736, "step": 36095 }, { "epoch": 5.8890701468189235, "grad_norm": 0.07812488079071045, "learning_rate": 4.4459073145166955e-05, "loss": 0.2543, "num_input_tokens_seen": 77920352, "step": 36100 }, { "epoch": 5.889885807504078, "grad_norm": 0.40592122077941895, "learning_rate": 4.4456838561444095e-05, "loss": 0.1345, "num_input_tokens_seen": 77931360, "step": 36105 }, { "epoch": 5.890701468189233, "grad_norm": 0.4546952545642853, "learning_rate": 4.445460358340373e-05, "loss": 0.0886, "num_input_tokens_seen": 77942208, "step": 36110 }, { "epoch": 5.891517128874388, "grad_norm": 0.02813972532749176, "learning_rate": 4.4452368211091135e-05, "loss": 0.0314, "num_input_tokens_seen": 77953248, "step": 36115 }, { "epoch": 5.892332789559543, "grad_norm": 0.2122194766998291, "learning_rate": 4.445013244455162e-05, "loss": 0.0269, "num_input_tokens_seen": 77963584, "step": 36120 }, { "epoch": 5.8931484502446985, "grad_norm": 1.1496316194534302, "learning_rate": 4.4447896283830496e-05, "loss": 0.0687, "num_input_tokens_seen": 77973920, "step": 36125 }, { "epoch": 5.893964110929853, "grad_norm": 0.09126190096139908, "learning_rate": 4.444565972897309e-05, "loss": 0.0607, "num_input_tokens_seen": 77984736, "step": 36130 }, { "epoch": 5.894779771615008, "grad_norm": 2.1606929302215576, "learning_rate": 4.4443422780024715e-05, "loss": 0.2827, "num_input_tokens_seen": 77996160, "step": 36135 }, { "epoch": 5.895595432300163, "grad_norm": 1.1211555004119873, "learning_rate": 4.444118543703071e-05, "loss": 0.1962, "num_input_tokens_seen": 78006592, "step": 36140 }, { "epoch": 5.896411092985318, "grad_norm": 0.03130100294947624, "learning_rate": 4.4438947700036425e-05, "loss": 0.1007, "num_input_tokens_seen": 78015936, "step": 36145 }, { "epoch": 5.897226753670473, "grad_norm": 1.6786967515945435, "learning_rate": 4.4436709569087196e-05, "loss": 0.284, "num_input_tokens_seen": 78028160, "step": 36150 }, { "epoch": 5.898042414355628, "grad_norm": 0.04350470006465912, "learning_rate": 4.44344710442284e-05, "loss": 0.0532, "num_input_tokens_seen": 78039712, "step": 36155 }, { "epoch": 5.898858075040783, "grad_norm": 0.15903295576572418, "learning_rate": 4.443223212550539e-05, "loss": 0.066, "num_input_tokens_seen": 78051936, "step": 36160 }, { "epoch": 5.899673735725938, "grad_norm": 0.8708640336990356, "learning_rate": 4.4429992812963545e-05, "loss": 0.2742, "num_input_tokens_seen": 78062464, "step": 36165 }, { "epoch": 5.900489396411093, "grad_norm": 0.5016657114028931, "learning_rate": 4.4427753106648244e-05, "loss": 0.064, "num_input_tokens_seen": 78072352, "step": 36170 }, { "epoch": 5.901305057096248, "grad_norm": 0.05675860494375229, "learning_rate": 4.4425513006604884e-05, "loss": 0.114, "num_input_tokens_seen": 78082816, "step": 36175 }, { "epoch": 5.902120717781403, "grad_norm": 1.9314631223678589, "learning_rate": 4.442327251287886e-05, "loss": 0.2442, "num_input_tokens_seen": 78092288, "step": 36180 }, { "epoch": 5.902936378466558, "grad_norm": 0.5762593746185303, "learning_rate": 4.4421031625515575e-05, "loss": 0.1027, "num_input_tokens_seen": 78104608, "step": 36185 }, { "epoch": 5.903752039151713, "grad_norm": 0.06175912544131279, "learning_rate": 4.441879034456045e-05, "loss": 0.0826, "num_input_tokens_seen": 78116160, "step": 36190 }, { "epoch": 5.904567699836868, "grad_norm": 0.38092121481895447, "learning_rate": 4.44165486700589e-05, "loss": 0.2181, "num_input_tokens_seen": 78127200, "step": 36195 }, { "epoch": 5.9053833605220225, "grad_norm": 0.7446348667144775, "learning_rate": 4.4414306602056364e-05, "loss": 0.1494, "num_input_tokens_seen": 78136928, "step": 36200 }, { "epoch": 5.906199021207177, "grad_norm": 0.07037612795829773, "learning_rate": 4.441206414059828e-05, "loss": 0.068, "num_input_tokens_seen": 78148704, "step": 36205 }, { "epoch": 5.907014681892333, "grad_norm": 1.2441257238388062, "learning_rate": 4.440982128573008e-05, "loss": 0.1306, "num_input_tokens_seen": 78158752, "step": 36210 }, { "epoch": 5.907830342577488, "grad_norm": 1.0284357070922852, "learning_rate": 4.440757803749723e-05, "loss": 0.3572, "num_input_tokens_seen": 78167968, "step": 36215 }, { "epoch": 5.908646003262643, "grad_norm": 1.768786072731018, "learning_rate": 4.44053343959452e-05, "loss": 0.1214, "num_input_tokens_seen": 78178688, "step": 36220 }, { "epoch": 5.9094616639477975, "grad_norm": 1.668748140335083, "learning_rate": 4.440309036111945e-05, "loss": 0.1358, "num_input_tokens_seen": 78189376, "step": 36225 }, { "epoch": 5.910277324632952, "grad_norm": 0.10290797799825668, "learning_rate": 4.440084593306545e-05, "loss": 0.0788, "num_input_tokens_seen": 78200000, "step": 36230 }, { "epoch": 5.911092985318108, "grad_norm": 1.142715573310852, "learning_rate": 4.43986011118287e-05, "loss": 0.0968, "num_input_tokens_seen": 78210592, "step": 36235 }, { "epoch": 5.911908646003263, "grad_norm": 0.7523385286331177, "learning_rate": 4.439635589745469e-05, "loss": 0.1057, "num_input_tokens_seen": 78220896, "step": 36240 }, { "epoch": 5.912724306688418, "grad_norm": 0.9977962970733643, "learning_rate": 4.439411028998892e-05, "loss": 0.0617, "num_input_tokens_seen": 78233088, "step": 36245 }, { "epoch": 5.9135399673735725, "grad_norm": 0.9561271071434021, "learning_rate": 4.4391864289476904e-05, "loss": 0.2748, "num_input_tokens_seen": 78243680, "step": 36250 }, { "epoch": 5.914355628058727, "grad_norm": 0.03123829886317253, "learning_rate": 4.4389617895964156e-05, "loss": 0.095, "num_input_tokens_seen": 78253344, "step": 36255 }, { "epoch": 5.915171288743883, "grad_norm": 0.06063534691929817, "learning_rate": 4.438737110949621e-05, "loss": 0.138, "num_input_tokens_seen": 78264864, "step": 36260 }, { "epoch": 5.915986949429038, "grad_norm": 0.2761831283569336, "learning_rate": 4.438512393011859e-05, "loss": 0.0595, "num_input_tokens_seen": 78276000, "step": 36265 }, { "epoch": 5.916802610114193, "grad_norm": 0.6046199798583984, "learning_rate": 4.4382876357876834e-05, "loss": 0.1842, "num_input_tokens_seen": 78287360, "step": 36270 }, { "epoch": 5.917618270799347, "grad_norm": 0.07503233850002289, "learning_rate": 4.4380628392816506e-05, "loss": 0.0946, "num_input_tokens_seen": 78296608, "step": 36275 }, { "epoch": 5.918433931484502, "grad_norm": 0.4639696776866913, "learning_rate": 4.437838003498316e-05, "loss": 0.0798, "num_input_tokens_seen": 78306848, "step": 36280 }, { "epoch": 5.919249592169657, "grad_norm": 0.588028609752655, "learning_rate": 4.4376131284422356e-05, "loss": 0.0339, "num_input_tokens_seen": 78317504, "step": 36285 }, { "epoch": 5.920065252854813, "grad_norm": 1.3446779251098633, "learning_rate": 4.437388214117967e-05, "loss": 0.1797, "num_input_tokens_seen": 78327360, "step": 36290 }, { "epoch": 5.920880913539968, "grad_norm": 0.07741666585206985, "learning_rate": 4.437163260530069e-05, "loss": 0.1189, "num_input_tokens_seen": 78338656, "step": 36295 }, { "epoch": 5.921696574225122, "grad_norm": 0.541043221950531, "learning_rate": 4.4369382676830994e-05, "loss": 0.2043, "num_input_tokens_seen": 78350464, "step": 36300 }, { "epoch": 5.922512234910277, "grad_norm": 1.6260051727294922, "learning_rate": 4.436713235581619e-05, "loss": 0.1419, "num_input_tokens_seen": 78360480, "step": 36305 }, { "epoch": 5.923327895595432, "grad_norm": 1.3335908651351929, "learning_rate": 4.436488164230188e-05, "loss": 0.0668, "num_input_tokens_seen": 78371264, "step": 36310 }, { "epoch": 5.924143556280587, "grad_norm": 1.4185476303100586, "learning_rate": 4.436263053633368e-05, "loss": 0.1848, "num_input_tokens_seen": 78382176, "step": 36315 }, { "epoch": 5.924959216965743, "grad_norm": 0.34402310848236084, "learning_rate": 4.436037903795721e-05, "loss": 0.2637, "num_input_tokens_seen": 78392704, "step": 36320 }, { "epoch": 5.925774877650897, "grad_norm": 0.5999917387962341, "learning_rate": 4.4358127147218086e-05, "loss": 0.3269, "num_input_tokens_seen": 78404544, "step": 36325 }, { "epoch": 5.926590538336052, "grad_norm": 0.1398361772298813, "learning_rate": 4.4355874864161975e-05, "loss": 0.0944, "num_input_tokens_seen": 78414400, "step": 36330 }, { "epoch": 5.927406199021207, "grad_norm": 0.1306641399860382, "learning_rate": 4.435362218883449e-05, "loss": 0.0965, "num_input_tokens_seen": 78425664, "step": 36335 }, { "epoch": 5.928221859706362, "grad_norm": 1.4355292320251465, "learning_rate": 4.435136912128131e-05, "loss": 0.0694, "num_input_tokens_seen": 78437120, "step": 36340 }, { "epoch": 5.9290375203915175, "grad_norm": 0.4513325095176697, "learning_rate": 4.434911566154808e-05, "loss": 0.0905, "num_input_tokens_seen": 78447456, "step": 36345 }, { "epoch": 5.929853181076672, "grad_norm": 0.21928483247756958, "learning_rate": 4.434686180968048e-05, "loss": 0.0849, "num_input_tokens_seen": 78457760, "step": 36350 }, { "epoch": 5.930668841761827, "grad_norm": 0.13776874542236328, "learning_rate": 4.434460756572418e-05, "loss": 0.0363, "num_input_tokens_seen": 78469248, "step": 36355 }, { "epoch": 5.931484502446982, "grad_norm": 0.0549340657889843, "learning_rate": 4.4342352929724864e-05, "loss": 0.1879, "num_input_tokens_seen": 78480864, "step": 36360 }, { "epoch": 5.932300163132137, "grad_norm": 1.5218135118484497, "learning_rate": 4.434009790172823e-05, "loss": 0.153, "num_input_tokens_seen": 78492224, "step": 36365 }, { "epoch": 5.933115823817292, "grad_norm": 0.711027204990387, "learning_rate": 4.4337842481779976e-05, "loss": 0.0627, "num_input_tokens_seen": 78503072, "step": 36370 }, { "epoch": 5.933931484502447, "grad_norm": 0.3905428647994995, "learning_rate": 4.433558666992582e-05, "loss": 0.1952, "num_input_tokens_seen": 78511744, "step": 36375 }, { "epoch": 5.934747145187602, "grad_norm": 0.8077658414840698, "learning_rate": 4.433333046621147e-05, "loss": 0.1598, "num_input_tokens_seen": 78522176, "step": 36380 }, { "epoch": 5.935562805872757, "grad_norm": 0.18015077710151672, "learning_rate": 4.4331073870682635e-05, "loss": 0.0645, "num_input_tokens_seen": 78532928, "step": 36385 }, { "epoch": 5.936378466557912, "grad_norm": 0.45212864875793457, "learning_rate": 4.432881688338508e-05, "loss": 0.1354, "num_input_tokens_seen": 78543552, "step": 36390 }, { "epoch": 5.937194127243067, "grad_norm": 0.8059157729148865, "learning_rate": 4.4326559504364526e-05, "loss": 0.1168, "num_input_tokens_seen": 78553824, "step": 36395 }, { "epoch": 5.938009787928221, "grad_norm": 1.4911442995071411, "learning_rate": 4.4324301733666736e-05, "loss": 0.1195, "num_input_tokens_seen": 78565248, "step": 36400 }, { "epoch": 5.938825448613377, "grad_norm": 0.7609632611274719, "learning_rate": 4.432204357133744e-05, "loss": 0.1211, "num_input_tokens_seen": 78577440, "step": 36405 }, { "epoch": 5.939641109298532, "grad_norm": 1.6600499153137207, "learning_rate": 4.431978501742243e-05, "loss": 0.2061, "num_input_tokens_seen": 78588160, "step": 36410 }, { "epoch": 5.940456769983687, "grad_norm": 1.2222763299942017, "learning_rate": 4.431752607196747e-05, "loss": 0.1214, "num_input_tokens_seen": 78598976, "step": 36415 }, { "epoch": 5.941272430668842, "grad_norm": 0.9572247862815857, "learning_rate": 4.4315266735018335e-05, "loss": 0.251, "num_input_tokens_seen": 78608768, "step": 36420 }, { "epoch": 5.942088091353996, "grad_norm": 0.25749650597572327, "learning_rate": 4.4313007006620816e-05, "loss": 0.0944, "num_input_tokens_seen": 78620736, "step": 36425 }, { "epoch": 5.942903752039152, "grad_norm": 0.1766812950372696, "learning_rate": 4.431074688682071e-05, "loss": 0.1456, "num_input_tokens_seen": 78629216, "step": 36430 }, { "epoch": 5.943719412724307, "grad_norm": 0.2658027708530426, "learning_rate": 4.4308486375663814e-05, "loss": 0.0578, "num_input_tokens_seen": 78639040, "step": 36435 }, { "epoch": 5.944535073409462, "grad_norm": 0.12991639971733093, "learning_rate": 4.430622547319596e-05, "loss": 0.1156, "num_input_tokens_seen": 78650624, "step": 36440 }, { "epoch": 5.945350734094617, "grad_norm": 1.2012734413146973, "learning_rate": 4.430396417946295e-05, "loss": 0.0806, "num_input_tokens_seen": 78662368, "step": 36445 }, { "epoch": 5.946166394779771, "grad_norm": 0.8924797177314758, "learning_rate": 4.430170249451061e-05, "loss": 0.1403, "num_input_tokens_seen": 78673984, "step": 36450 }, { "epoch": 5.946982055464927, "grad_norm": 0.22804315388202667, "learning_rate": 4.429944041838479e-05, "loss": 0.0434, "num_input_tokens_seen": 78683328, "step": 36455 }, { "epoch": 5.947797716150082, "grad_norm": 1.446772575378418, "learning_rate": 4.429717795113133e-05, "loss": 0.1457, "num_input_tokens_seen": 78694496, "step": 36460 }, { "epoch": 5.948613376835237, "grad_norm": 0.7540379762649536, "learning_rate": 4.4294915092796074e-05, "loss": 0.0673, "num_input_tokens_seen": 78704832, "step": 36465 }, { "epoch": 5.9494290375203915, "grad_norm": 2.289381742477417, "learning_rate": 4.429265184342488e-05, "loss": 0.1332, "num_input_tokens_seen": 78715840, "step": 36470 }, { "epoch": 5.950244698205546, "grad_norm": 0.27123403549194336, "learning_rate": 4.429038820306363e-05, "loss": 0.0578, "num_input_tokens_seen": 78726592, "step": 36475 }, { "epoch": 5.951060358890701, "grad_norm": 0.3181007206439972, "learning_rate": 4.4288124171758196e-05, "loss": 0.0939, "num_input_tokens_seen": 78738048, "step": 36480 }, { "epoch": 5.951876019575856, "grad_norm": 0.23285655677318573, "learning_rate": 4.428585974955445e-05, "loss": 0.0874, "num_input_tokens_seen": 78749312, "step": 36485 }, { "epoch": 5.952691680261012, "grad_norm": 0.12746359407901764, "learning_rate": 4.428359493649828e-05, "loss": 0.2274, "num_input_tokens_seen": 78760512, "step": 36490 }, { "epoch": 5.9535073409461665, "grad_norm": 1.0957772731781006, "learning_rate": 4.428132973263561e-05, "loss": 0.1183, "num_input_tokens_seen": 78771072, "step": 36495 }, { "epoch": 5.954323001631321, "grad_norm": 0.02795455977320671, "learning_rate": 4.4279064138012325e-05, "loss": 0.0461, "num_input_tokens_seen": 78781376, "step": 36500 }, { "epoch": 5.955138662316476, "grad_norm": 0.5104379653930664, "learning_rate": 4.427679815267436e-05, "loss": 0.1804, "num_input_tokens_seen": 78792640, "step": 36505 }, { "epoch": 5.955954323001631, "grad_norm": 1.0534443855285645, "learning_rate": 4.427453177666762e-05, "loss": 0.1051, "num_input_tokens_seen": 78803776, "step": 36510 }, { "epoch": 5.956769983686787, "grad_norm": 2.009937047958374, "learning_rate": 4.4272265010038036e-05, "loss": 0.1174, "num_input_tokens_seen": 78813440, "step": 36515 }, { "epoch": 5.9575856443719415, "grad_norm": 0.18132176995277405, "learning_rate": 4.426999785283156e-05, "loss": 0.1955, "num_input_tokens_seen": 78824736, "step": 36520 }, { "epoch": 5.958401305057096, "grad_norm": 0.11106869578361511, "learning_rate": 4.426773030509413e-05, "loss": 0.0958, "num_input_tokens_seen": 78833920, "step": 36525 }, { "epoch": 5.959216965742251, "grad_norm": 0.08628716319799423, "learning_rate": 4.42654623668717e-05, "loss": 0.0762, "num_input_tokens_seen": 78845376, "step": 36530 }, { "epoch": 5.960032626427406, "grad_norm": 0.946697473526001, "learning_rate": 4.426319403821024e-05, "loss": 0.0838, "num_input_tokens_seen": 78858016, "step": 36535 }, { "epoch": 5.960848287112562, "grad_norm": 0.8740711808204651, "learning_rate": 4.426092531915571e-05, "loss": 0.0895, "num_input_tokens_seen": 78869216, "step": 36540 }, { "epoch": 5.9616639477977165, "grad_norm": 0.3048674464225769, "learning_rate": 4.42586562097541e-05, "loss": 0.1106, "num_input_tokens_seen": 78880096, "step": 36545 }, { "epoch": 5.962479608482871, "grad_norm": 0.7188835144042969, "learning_rate": 4.425638671005139e-05, "loss": 0.2013, "num_input_tokens_seen": 78889664, "step": 36550 }, { "epoch": 5.963295269168026, "grad_norm": 0.05904955416917801, "learning_rate": 4.425411682009357e-05, "loss": 0.0631, "num_input_tokens_seen": 78902240, "step": 36555 }, { "epoch": 5.964110929853181, "grad_norm": 0.9458028078079224, "learning_rate": 4.4251846539926646e-05, "loss": 0.0786, "num_input_tokens_seen": 78911872, "step": 36560 }, { "epoch": 5.964926590538336, "grad_norm": 1.114089846611023, "learning_rate": 4.424957586959664e-05, "loss": 0.0449, "num_input_tokens_seen": 78922304, "step": 36565 }, { "epoch": 5.9657422512234906, "grad_norm": 0.18142274022102356, "learning_rate": 4.424730480914955e-05, "loss": 0.0628, "num_input_tokens_seen": 78933408, "step": 36570 }, { "epoch": 5.966557911908646, "grad_norm": 0.4060054421424866, "learning_rate": 4.42450333586314e-05, "loss": 0.1253, "num_input_tokens_seen": 78944704, "step": 36575 }, { "epoch": 5.967373572593801, "grad_norm": 0.09663692861795425, "learning_rate": 4.4242761518088254e-05, "loss": 0.0607, "num_input_tokens_seen": 78955584, "step": 36580 }, { "epoch": 5.968189233278956, "grad_norm": 0.5890234708786011, "learning_rate": 4.4240489287566135e-05, "loss": 0.1433, "num_input_tokens_seen": 78966272, "step": 36585 }, { "epoch": 5.969004893964111, "grad_norm": 0.15831749141216278, "learning_rate": 4.4238216667111084e-05, "loss": 0.1722, "num_input_tokens_seen": 78976608, "step": 36590 }, { "epoch": 5.9698205546492655, "grad_norm": 0.09546737372875214, "learning_rate": 4.4235943656769166e-05, "loss": 0.0565, "num_input_tokens_seen": 78987360, "step": 36595 }, { "epoch": 5.970636215334421, "grad_norm": 0.05693449079990387, "learning_rate": 4.423367025658645e-05, "loss": 0.0695, "num_input_tokens_seen": 78999040, "step": 36600 }, { "epoch": 5.971451876019576, "grad_norm": 0.9782765507698059, "learning_rate": 4.423139646660901e-05, "loss": 0.0287, "num_input_tokens_seen": 79010112, "step": 36605 }, { "epoch": 5.972267536704731, "grad_norm": 0.36882418394088745, "learning_rate": 4.422912228688292e-05, "loss": 0.054, "num_input_tokens_seen": 79020416, "step": 36610 }, { "epoch": 5.973083197389886, "grad_norm": 0.7781259417533875, "learning_rate": 4.422684771745427e-05, "loss": 0.207, "num_input_tokens_seen": 79031648, "step": 36615 }, { "epoch": 5.9738988580750405, "grad_norm": 0.14162971079349518, "learning_rate": 4.4224572758369164e-05, "loss": 0.074, "num_input_tokens_seen": 79042560, "step": 36620 }, { "epoch": 5.974714518760196, "grad_norm": 0.5861718058586121, "learning_rate": 4.42222974096737e-05, "loss": 0.1532, "num_input_tokens_seen": 79052672, "step": 36625 }, { "epoch": 5.975530179445351, "grad_norm": 1.5569989681243896, "learning_rate": 4.4220021671414006e-05, "loss": 0.325, "num_input_tokens_seen": 79063744, "step": 36630 }, { "epoch": 5.976345840130506, "grad_norm": 1.4765052795410156, "learning_rate": 4.421774554363617e-05, "loss": 0.0629, "num_input_tokens_seen": 79075424, "step": 36635 }, { "epoch": 5.977161500815661, "grad_norm": 0.14532916247844696, "learning_rate": 4.421546902638635e-05, "loss": 0.1119, "num_input_tokens_seen": 79085792, "step": 36640 }, { "epoch": 5.9779771615008155, "grad_norm": 2.230847120285034, "learning_rate": 4.4213192119710675e-05, "loss": 0.1345, "num_input_tokens_seen": 79096704, "step": 36645 }, { "epoch": 5.97879282218597, "grad_norm": 1.2103842496871948, "learning_rate": 4.421091482365529e-05, "loss": 0.1487, "num_input_tokens_seen": 79107328, "step": 36650 }, { "epoch": 5.979608482871125, "grad_norm": 0.3830009698867798, "learning_rate": 4.420863713826635e-05, "loss": 0.0805, "num_input_tokens_seen": 79118144, "step": 36655 }, { "epoch": 5.980424143556281, "grad_norm": 0.0313827320933342, "learning_rate": 4.4206359063589994e-05, "loss": 0.2706, "num_input_tokens_seen": 79129568, "step": 36660 }, { "epoch": 5.981239804241436, "grad_norm": 2.327314615249634, "learning_rate": 4.420408059967243e-05, "loss": 0.1934, "num_input_tokens_seen": 79139936, "step": 36665 }, { "epoch": 5.9820554649265905, "grad_norm": 1.1551262140274048, "learning_rate": 4.420180174655979e-05, "loss": 0.1109, "num_input_tokens_seen": 79149664, "step": 36670 }, { "epoch": 5.982871125611745, "grad_norm": 0.0952669307589531, "learning_rate": 4.4199522504298286e-05, "loss": 0.1113, "num_input_tokens_seen": 79161280, "step": 36675 }, { "epoch": 5.9836867862969, "grad_norm": 0.09595562517642975, "learning_rate": 4.41972428729341e-05, "loss": 0.0959, "num_input_tokens_seen": 79171488, "step": 36680 }, { "epoch": 5.984502446982056, "grad_norm": 1.0996251106262207, "learning_rate": 4.4194962852513436e-05, "loss": 0.0855, "num_input_tokens_seen": 79182656, "step": 36685 }, { "epoch": 5.985318107667211, "grad_norm": 0.2047969400882721, "learning_rate": 4.41926824430825e-05, "loss": 0.2438, "num_input_tokens_seen": 79194016, "step": 36690 }, { "epoch": 5.986133768352365, "grad_norm": 1.549515962600708, "learning_rate": 4.419040164468751e-05, "loss": 0.2188, "num_input_tokens_seen": 79205472, "step": 36695 }, { "epoch": 5.98694942903752, "grad_norm": 0.1502038836479187, "learning_rate": 4.418812045737468e-05, "loss": 0.1043, "num_input_tokens_seen": 79215904, "step": 36700 }, { "epoch": 5.987765089722675, "grad_norm": 0.26717129349708557, "learning_rate": 4.4185838881190246e-05, "loss": 0.1172, "num_input_tokens_seen": 79227776, "step": 36705 }, { "epoch": 5.988580750407831, "grad_norm": 0.04878146946430206, "learning_rate": 4.418355691618045e-05, "loss": 0.1509, "num_input_tokens_seen": 79239488, "step": 36710 }, { "epoch": 5.989396411092986, "grad_norm": 1.4891809225082397, "learning_rate": 4.418127456239154e-05, "loss": 0.2798, "num_input_tokens_seen": 79249600, "step": 36715 }, { "epoch": 5.99021207177814, "grad_norm": 0.5516091585159302, "learning_rate": 4.4178991819869767e-05, "loss": 0.1853, "num_input_tokens_seen": 79259840, "step": 36720 }, { "epoch": 5.991027732463295, "grad_norm": 1.1561371088027954, "learning_rate": 4.417670868866138e-05, "loss": 0.1405, "num_input_tokens_seen": 79271488, "step": 36725 }, { "epoch": 5.99184339314845, "grad_norm": 0.19318972527980804, "learning_rate": 4.417442516881268e-05, "loss": 0.0665, "num_input_tokens_seen": 79282752, "step": 36730 }, { "epoch": 5.992659053833605, "grad_norm": 0.8866488337516785, "learning_rate": 4.4172141260369934e-05, "loss": 0.0808, "num_input_tokens_seen": 79294336, "step": 36735 }, { "epoch": 5.993474714518761, "grad_norm": 0.8060554265975952, "learning_rate": 4.416985696337941e-05, "loss": 0.2546, "num_input_tokens_seen": 79306272, "step": 36740 }, { "epoch": 5.994290375203915, "grad_norm": 0.448366641998291, "learning_rate": 4.4167572277887424e-05, "loss": 0.0818, "num_input_tokens_seen": 79317120, "step": 36745 }, { "epoch": 5.99510603588907, "grad_norm": 0.07474969327449799, "learning_rate": 4.416528720394027e-05, "loss": 0.0215, "num_input_tokens_seen": 79327200, "step": 36750 }, { "epoch": 5.995921696574225, "grad_norm": 0.832240104675293, "learning_rate": 4.416300174158425e-05, "loss": 0.0964, "num_input_tokens_seen": 79337888, "step": 36755 }, { "epoch": 5.99673735725938, "grad_norm": 0.33477646112442017, "learning_rate": 4.41607158908657e-05, "loss": 0.1086, "num_input_tokens_seen": 79348480, "step": 36760 }, { "epoch": 5.997553017944535, "grad_norm": 0.24886994063854218, "learning_rate": 4.415842965183093e-05, "loss": 0.1459, "num_input_tokens_seen": 79360480, "step": 36765 }, { "epoch": 5.99836867862969, "grad_norm": 1.209238886833191, "learning_rate": 4.4156143024526286e-05, "loss": 0.1218, "num_input_tokens_seen": 79370912, "step": 36770 }, { "epoch": 5.999184339314845, "grad_norm": 0.46373024582862854, "learning_rate": 4.4153856008998096e-05, "loss": 0.057, "num_input_tokens_seen": 79380288, "step": 36775 }, { "epoch": 6.0, "grad_norm": 0.036125533282756805, "learning_rate": 4.415156860529272e-05, "loss": 0.0862, "num_input_tokens_seen": 79389648, "step": 36780 }, { "epoch": 6.0, "eval_loss": 0.13706479966640472, "eval_runtime": 90.5063, "eval_samples_per_second": 30.108, "eval_steps_per_second": 7.535, "num_input_tokens_seen": 79389648, "step": 36780 }, { "epoch": 6.000815660685155, "grad_norm": 0.329149454832077, "learning_rate": 4.41492808134565e-05, "loss": 0.041, "num_input_tokens_seen": 79400656, "step": 36785 }, { "epoch": 6.00163132137031, "grad_norm": 1.7601597309112549, "learning_rate": 4.414699263353582e-05, "loss": 0.1091, "num_input_tokens_seen": 79412656, "step": 36790 }, { "epoch": 6.002446982055465, "grad_norm": 0.08493593335151672, "learning_rate": 4.414470406557704e-05, "loss": 0.0572, "num_input_tokens_seen": 79424048, "step": 36795 }, { "epoch": 6.00326264274062, "grad_norm": 0.5157057642936707, "learning_rate": 4.414241510962655e-05, "loss": 0.0852, "num_input_tokens_seen": 79434896, "step": 36800 }, { "epoch": 6.004078303425775, "grad_norm": 0.0762365311384201, "learning_rate": 4.414012576573073e-05, "loss": 0.1901, "num_input_tokens_seen": 79445680, "step": 36805 }, { "epoch": 6.00489396411093, "grad_norm": 0.5869556665420532, "learning_rate": 4.413783603393598e-05, "loss": 0.0701, "num_input_tokens_seen": 79456592, "step": 36810 }, { "epoch": 6.005709624796085, "grad_norm": 1.0475667715072632, "learning_rate": 4.413554591428871e-05, "loss": 0.1417, "num_input_tokens_seen": 79466992, "step": 36815 }, { "epoch": 6.006525285481239, "grad_norm": 0.3042924702167511, "learning_rate": 4.413325540683533e-05, "loss": 0.0299, "num_input_tokens_seen": 79477168, "step": 36820 }, { "epoch": 6.007340946166395, "grad_norm": 1.601280689239502, "learning_rate": 4.413096451162225e-05, "loss": 0.1237, "num_input_tokens_seen": 79488368, "step": 36825 }, { "epoch": 6.00815660685155, "grad_norm": 0.09004460275173187, "learning_rate": 4.41286732286959e-05, "loss": 0.1099, "num_input_tokens_seen": 79498736, "step": 36830 }, { "epoch": 6.008972267536705, "grad_norm": 0.27092108130455017, "learning_rate": 4.412638155810273e-05, "loss": 0.2799, "num_input_tokens_seen": 79508656, "step": 36835 }, { "epoch": 6.00978792822186, "grad_norm": 0.8998469114303589, "learning_rate": 4.412408949988917e-05, "loss": 0.282, "num_input_tokens_seen": 79519696, "step": 36840 }, { "epoch": 6.010603588907014, "grad_norm": 0.9778851270675659, "learning_rate": 4.4121797054101675e-05, "loss": 0.3115, "num_input_tokens_seen": 79529776, "step": 36845 }, { "epoch": 6.011419249592169, "grad_norm": 1.5664938688278198, "learning_rate": 4.4119504220786715e-05, "loss": 0.1868, "num_input_tokens_seen": 79541392, "step": 36850 }, { "epoch": 6.012234910277325, "grad_norm": 0.739291250705719, "learning_rate": 4.411721099999073e-05, "loss": 0.0612, "num_input_tokens_seen": 79552720, "step": 36855 }, { "epoch": 6.01305057096248, "grad_norm": 0.9349960684776306, "learning_rate": 4.4114917391760226e-05, "loss": 0.2394, "num_input_tokens_seen": 79563888, "step": 36860 }, { "epoch": 6.013866231647635, "grad_norm": 0.08077218383550644, "learning_rate": 4.411262339614166e-05, "loss": 0.1188, "num_input_tokens_seen": 79575760, "step": 36865 }, { "epoch": 6.014681892332789, "grad_norm": 0.36117854714393616, "learning_rate": 4.411032901318154e-05, "loss": 0.0618, "num_input_tokens_seen": 79587408, "step": 36870 }, { "epoch": 6.015497553017944, "grad_norm": 1.0760738849639893, "learning_rate": 4.4108034242926364e-05, "loss": 0.0838, "num_input_tokens_seen": 79598352, "step": 36875 }, { "epoch": 6.0163132137031, "grad_norm": 2.136840343475342, "learning_rate": 4.410573908542263e-05, "loss": 0.2606, "num_input_tokens_seen": 79609520, "step": 36880 }, { "epoch": 6.017128874388255, "grad_norm": 0.2711188495159149, "learning_rate": 4.410344354071686e-05, "loss": 0.0844, "num_input_tokens_seen": 79621712, "step": 36885 }, { "epoch": 6.0179445350734095, "grad_norm": 0.4463031589984894, "learning_rate": 4.410114760885556e-05, "loss": 0.1023, "num_input_tokens_seen": 79630576, "step": 36890 }, { "epoch": 6.018760195758564, "grad_norm": 0.20084501802921295, "learning_rate": 4.409885128988529e-05, "loss": 0.0944, "num_input_tokens_seen": 79641360, "step": 36895 }, { "epoch": 6.019575856443719, "grad_norm": 0.28549715876579285, "learning_rate": 4.4096554583852555e-05, "loss": 0.0577, "num_input_tokens_seen": 79651792, "step": 36900 }, { "epoch": 6.020391517128874, "grad_norm": 0.0894651710987091, "learning_rate": 4.409425749080392e-05, "loss": 0.0907, "num_input_tokens_seen": 79661488, "step": 36905 }, { "epoch": 6.02120717781403, "grad_norm": 1.3247181177139282, "learning_rate": 4.409196001078594e-05, "loss": 0.2672, "num_input_tokens_seen": 79671856, "step": 36910 }, { "epoch": 6.0220228384991845, "grad_norm": 0.0711340680718422, "learning_rate": 4.408966214384517e-05, "loss": 0.0821, "num_input_tokens_seen": 79681232, "step": 36915 }, { "epoch": 6.022838499184339, "grad_norm": 1.9292151927947998, "learning_rate": 4.4087363890028177e-05, "loss": 0.3188, "num_input_tokens_seen": 79691728, "step": 36920 }, { "epoch": 6.023654159869494, "grad_norm": 0.08697623759508133, "learning_rate": 4.408506524938154e-05, "loss": 0.0634, "num_input_tokens_seen": 79702672, "step": 36925 }, { "epoch": 6.024469820554649, "grad_norm": 0.28709620237350464, "learning_rate": 4.408276622195184e-05, "loss": 0.0688, "num_input_tokens_seen": 79713552, "step": 36930 }, { "epoch": 6.025285481239805, "grad_norm": 0.10315871238708496, "learning_rate": 4.408046680778568e-05, "loss": 0.0497, "num_input_tokens_seen": 79725296, "step": 36935 }, { "epoch": 6.0261011419249595, "grad_norm": 1.7134833335876465, "learning_rate": 4.407816700692966e-05, "loss": 0.0868, "num_input_tokens_seen": 79735792, "step": 36940 }, { "epoch": 6.026916802610114, "grad_norm": 0.26333582401275635, "learning_rate": 4.4075866819430376e-05, "loss": 0.0486, "num_input_tokens_seen": 79746576, "step": 36945 }, { "epoch": 6.027732463295269, "grad_norm": 1.2392157316207886, "learning_rate": 4.407356624533445e-05, "loss": 0.0979, "num_input_tokens_seen": 79756496, "step": 36950 }, { "epoch": 6.028548123980424, "grad_norm": 0.7101684212684631, "learning_rate": 4.407126528468851e-05, "loss": 0.088, "num_input_tokens_seen": 79767600, "step": 36955 }, { "epoch": 6.029363784665579, "grad_norm": 0.044073790311813354, "learning_rate": 4.406896393753919e-05, "loss": 0.0936, "num_input_tokens_seen": 79779056, "step": 36960 }, { "epoch": 6.0301794453507345, "grad_norm": 0.7103587985038757, "learning_rate": 4.4066662203933115e-05, "loss": 0.2556, "num_input_tokens_seen": 79790672, "step": 36965 }, { "epoch": 6.030995106035889, "grad_norm": 1.1564964056015015, "learning_rate": 4.4064360083916945e-05, "loss": 0.1722, "num_input_tokens_seen": 79800144, "step": 36970 }, { "epoch": 6.031810766721044, "grad_norm": 1.2828623056411743, "learning_rate": 4.406205757753734e-05, "loss": 0.1701, "num_input_tokens_seen": 79811056, "step": 36975 }, { "epoch": 6.032626427406199, "grad_norm": 1.0006159543991089, "learning_rate": 4.4059754684840946e-05, "loss": 0.1755, "num_input_tokens_seen": 79821616, "step": 36980 }, { "epoch": 6.033442088091354, "grad_norm": 0.569149911403656, "learning_rate": 4.405745140587445e-05, "loss": 0.1148, "num_input_tokens_seen": 79833104, "step": 36985 }, { "epoch": 6.034257748776509, "grad_norm": 0.22883424162864685, "learning_rate": 4.405514774068453e-05, "loss": 0.0321, "num_input_tokens_seen": 79842832, "step": 36990 }, { "epoch": 6.035073409461664, "grad_norm": 0.11392736434936523, "learning_rate": 4.405284368931787e-05, "loss": 0.0474, "num_input_tokens_seen": 79853840, "step": 36995 }, { "epoch": 6.035889070146819, "grad_norm": 0.8579310178756714, "learning_rate": 4.4050539251821155e-05, "loss": 0.1571, "num_input_tokens_seen": 79865488, "step": 37000 }, { "epoch": 6.036704730831974, "grad_norm": 1.1164292097091675, "learning_rate": 4.404823442824109e-05, "loss": 0.2721, "num_input_tokens_seen": 79874640, "step": 37005 }, { "epoch": 6.037520391517129, "grad_norm": 0.4503503143787384, "learning_rate": 4.40459292186244e-05, "loss": 0.0393, "num_input_tokens_seen": 79885552, "step": 37010 }, { "epoch": 6.0383360522022835, "grad_norm": 0.0843207985162735, "learning_rate": 4.404362362301779e-05, "loss": 0.0145, "num_input_tokens_seen": 79896336, "step": 37015 }, { "epoch": 6.039151712887439, "grad_norm": 0.5953121781349182, "learning_rate": 4.4041317641467986e-05, "loss": 0.1352, "num_input_tokens_seen": 79907184, "step": 37020 }, { "epoch": 6.039967373572594, "grad_norm": 1.0327450037002563, "learning_rate": 4.4039011274021724e-05, "loss": 0.1467, "num_input_tokens_seen": 79917936, "step": 37025 }, { "epoch": 6.040783034257749, "grad_norm": 2.7219862937927246, "learning_rate": 4.4036704520725745e-05, "loss": 0.2166, "num_input_tokens_seen": 79930000, "step": 37030 }, { "epoch": 6.041598694942904, "grad_norm": 0.4641461670398712, "learning_rate": 4.40343973816268e-05, "loss": 0.0612, "num_input_tokens_seen": 79941456, "step": 37035 }, { "epoch": 6.0424143556280585, "grad_norm": 0.7913559675216675, "learning_rate": 4.403208985677165e-05, "loss": 0.0672, "num_input_tokens_seen": 79951440, "step": 37040 }, { "epoch": 6.043230016313213, "grad_norm": 0.11626636236906052, "learning_rate": 4.402978194620705e-05, "loss": 0.0385, "num_input_tokens_seen": 79962064, "step": 37045 }, { "epoch": 6.044045676998369, "grad_norm": 0.03828652203083038, "learning_rate": 4.402747364997978e-05, "loss": 0.0358, "num_input_tokens_seen": 79974160, "step": 37050 }, { "epoch": 6.044861337683524, "grad_norm": 0.07732044905424118, "learning_rate": 4.4025164968136624e-05, "loss": 0.0209, "num_input_tokens_seen": 79985072, "step": 37055 }, { "epoch": 6.045676998368679, "grad_norm": 1.0765085220336914, "learning_rate": 4.402285590072436e-05, "loss": 0.1094, "num_input_tokens_seen": 79996784, "step": 37060 }, { "epoch": 6.0464926590538335, "grad_norm": 0.20927883684635162, "learning_rate": 4.4020546447789784e-05, "loss": 0.0491, "num_input_tokens_seen": 80008496, "step": 37065 }, { "epoch": 6.047308319738988, "grad_norm": 1.5347968339920044, "learning_rate": 4.401823660937971e-05, "loss": 0.1734, "num_input_tokens_seen": 80019600, "step": 37070 }, { "epoch": 6.048123980424143, "grad_norm": 0.19583158195018768, "learning_rate": 4.401592638554094e-05, "loss": 0.0373, "num_input_tokens_seen": 80030864, "step": 37075 }, { "epoch": 6.048939641109299, "grad_norm": 0.7370429039001465, "learning_rate": 4.40136157763203e-05, "loss": 0.0466, "num_input_tokens_seen": 80041936, "step": 37080 }, { "epoch": 6.049755301794454, "grad_norm": 0.7155025005340576, "learning_rate": 4.401130478176462e-05, "loss": 0.0584, "num_input_tokens_seen": 80053552, "step": 37085 }, { "epoch": 6.0505709624796085, "grad_norm": 1.2573808431625366, "learning_rate": 4.400899340192073e-05, "loss": 0.2337, "num_input_tokens_seen": 80064688, "step": 37090 }, { "epoch": 6.051386623164763, "grad_norm": 0.9169109463691711, "learning_rate": 4.4006681636835475e-05, "loss": 0.0317, "num_input_tokens_seen": 80076176, "step": 37095 }, { "epoch": 6.052202283849918, "grad_norm": 0.060268521308898926, "learning_rate": 4.4004369486555705e-05, "loss": 0.0364, "num_input_tokens_seen": 80085968, "step": 37100 }, { "epoch": 6.053017944535074, "grad_norm": 0.776373028755188, "learning_rate": 4.400205695112828e-05, "loss": 0.2735, "num_input_tokens_seen": 80096048, "step": 37105 }, { "epoch": 6.053833605220229, "grad_norm": 0.6453686952590942, "learning_rate": 4.399974403060005e-05, "loss": 0.0769, "num_input_tokens_seen": 80106608, "step": 37110 }, { "epoch": 6.054649265905383, "grad_norm": 0.03471837565302849, "learning_rate": 4.399743072501792e-05, "loss": 0.1379, "num_input_tokens_seen": 80116400, "step": 37115 }, { "epoch": 6.055464926590538, "grad_norm": 1.2975969314575195, "learning_rate": 4.3995117034428753e-05, "loss": 0.1558, "num_input_tokens_seen": 80127024, "step": 37120 }, { "epoch": 6.056280587275693, "grad_norm": 0.20175926387310028, "learning_rate": 4.399280295887944e-05, "loss": 0.1122, "num_input_tokens_seen": 80137936, "step": 37125 }, { "epoch": 6.057096247960848, "grad_norm": 1.373273491859436, "learning_rate": 4.399048849841688e-05, "loss": 0.0755, "num_input_tokens_seen": 80148752, "step": 37130 }, { "epoch": 6.057911908646004, "grad_norm": 0.4174937605857849, "learning_rate": 4.398817365308799e-05, "loss": 0.1452, "num_input_tokens_seen": 80159856, "step": 37135 }, { "epoch": 6.058727569331158, "grad_norm": 1.466989278793335, "learning_rate": 4.3985858422939665e-05, "loss": 0.1211, "num_input_tokens_seen": 80172656, "step": 37140 }, { "epoch": 6.059543230016313, "grad_norm": 0.0883631706237793, "learning_rate": 4.398354280801883e-05, "loss": 0.1032, "num_input_tokens_seen": 80184240, "step": 37145 }, { "epoch": 6.060358890701468, "grad_norm": 0.23285627365112305, "learning_rate": 4.398122680837242e-05, "loss": 0.0452, "num_input_tokens_seen": 80195184, "step": 37150 }, { "epoch": 6.061174551386623, "grad_norm": 0.3727458417415619, "learning_rate": 4.397891042404737e-05, "loss": 0.0415, "num_input_tokens_seen": 80205296, "step": 37155 }, { "epoch": 6.061990212071779, "grad_norm": 0.03504066541790962, "learning_rate": 4.397659365509063e-05, "loss": 0.0656, "num_input_tokens_seen": 80216496, "step": 37160 }, { "epoch": 6.062805872756933, "grad_norm": 0.749671459197998, "learning_rate": 4.397427650154914e-05, "loss": 0.0616, "num_input_tokens_seen": 80227280, "step": 37165 }, { "epoch": 6.063621533442088, "grad_norm": 1.176060676574707, "learning_rate": 4.397195896346987e-05, "loss": 0.1544, "num_input_tokens_seen": 80238704, "step": 37170 }, { "epoch": 6.064437194127243, "grad_norm": 1.5154986381530762, "learning_rate": 4.396964104089978e-05, "loss": 0.1144, "num_input_tokens_seen": 80248560, "step": 37175 }, { "epoch": 6.065252854812398, "grad_norm": 0.04372335225343704, "learning_rate": 4.396732273388585e-05, "loss": 0.1002, "num_input_tokens_seen": 80258448, "step": 37180 }, { "epoch": 6.066068515497553, "grad_norm": 0.18843698501586914, "learning_rate": 4.396500404247506e-05, "loss": 0.0267, "num_input_tokens_seen": 80269168, "step": 37185 }, { "epoch": 6.066884176182708, "grad_norm": 0.39084336161613464, "learning_rate": 4.396268496671441e-05, "loss": 0.0406, "num_input_tokens_seen": 80278672, "step": 37190 }, { "epoch": 6.067699836867863, "grad_norm": 0.14043205976486206, "learning_rate": 4.3960365506650885e-05, "loss": 0.0545, "num_input_tokens_seen": 80290128, "step": 37195 }, { "epoch": 6.068515497553018, "grad_norm": 0.06238863617181778, "learning_rate": 4.39580456623315e-05, "loss": 0.1955, "num_input_tokens_seen": 80300816, "step": 37200 }, { "epoch": 6.069331158238173, "grad_norm": 0.11689548939466476, "learning_rate": 4.395572543380329e-05, "loss": 0.0179, "num_input_tokens_seen": 80311120, "step": 37205 }, { "epoch": 6.070146818923328, "grad_norm": 1.3461971282958984, "learning_rate": 4.3953404821113234e-05, "loss": 0.0696, "num_input_tokens_seen": 80321968, "step": 37210 }, { "epoch": 6.0709624796084825, "grad_norm": 0.4071694016456604, "learning_rate": 4.3951083824308395e-05, "loss": 0.1697, "num_input_tokens_seen": 80332176, "step": 37215 }, { "epoch": 6.071778140293638, "grad_norm": 1.881775975227356, "learning_rate": 4.39487624434358e-05, "loss": 0.0777, "num_input_tokens_seen": 80343376, "step": 37220 }, { "epoch": 6.072593800978793, "grad_norm": 0.039980921894311905, "learning_rate": 4.39464406785425e-05, "loss": 0.0255, "num_input_tokens_seen": 80354032, "step": 37225 }, { "epoch": 6.073409461663948, "grad_norm": 0.40504735708236694, "learning_rate": 4.394411852967554e-05, "loss": 0.0355, "num_input_tokens_seen": 80365072, "step": 37230 }, { "epoch": 6.074225122349103, "grad_norm": 0.09932366758584976, "learning_rate": 4.3941795996881986e-05, "loss": 0.0418, "num_input_tokens_seen": 80374512, "step": 37235 }, { "epoch": 6.075040783034257, "grad_norm": 0.12187108397483826, "learning_rate": 4.3939473080208905e-05, "loss": 0.075, "num_input_tokens_seen": 80384368, "step": 37240 }, { "epoch": 6.075856443719413, "grad_norm": 0.36424219608306885, "learning_rate": 4.393714977970338e-05, "loss": 0.0544, "num_input_tokens_seen": 80395152, "step": 37245 }, { "epoch": 6.076672104404568, "grad_norm": 0.7378442287445068, "learning_rate": 4.3934826095412484e-05, "loss": 0.1336, "num_input_tokens_seen": 80406288, "step": 37250 }, { "epoch": 6.077487765089723, "grad_norm": 0.04765138030052185, "learning_rate": 4.393250202738332e-05, "loss": 0.0821, "num_input_tokens_seen": 80417040, "step": 37255 }, { "epoch": 6.078303425774878, "grad_norm": 0.5107657313346863, "learning_rate": 4.393017757566299e-05, "loss": 0.1131, "num_input_tokens_seen": 80428112, "step": 37260 }, { "epoch": 6.079119086460032, "grad_norm": 0.06433217227458954, "learning_rate": 4.392785274029859e-05, "loss": 0.0947, "num_input_tokens_seen": 80438288, "step": 37265 }, { "epoch": 6.079934747145187, "grad_norm": 0.2331266552209854, "learning_rate": 4.3925527521337244e-05, "loss": 0.5197, "num_input_tokens_seen": 80449360, "step": 37270 }, { "epoch": 6.080750407830343, "grad_norm": 1.2905429601669312, "learning_rate": 4.392320191882607e-05, "loss": 0.0659, "num_input_tokens_seen": 80460752, "step": 37275 }, { "epoch": 6.081566068515498, "grad_norm": 0.7279790639877319, "learning_rate": 4.392087593281221e-05, "loss": 0.1177, "num_input_tokens_seen": 80472048, "step": 37280 }, { "epoch": 6.082381729200653, "grad_norm": 0.7909759879112244, "learning_rate": 4.39185495633428e-05, "loss": 0.0759, "num_input_tokens_seen": 80481840, "step": 37285 }, { "epoch": 6.083197389885807, "grad_norm": 0.598690927028656, "learning_rate": 4.391622281046498e-05, "loss": 0.0185, "num_input_tokens_seen": 80493648, "step": 37290 }, { "epoch": 6.084013050570962, "grad_norm": 0.16861635446548462, "learning_rate": 4.39138956742259e-05, "loss": 0.1436, "num_input_tokens_seen": 80503024, "step": 37295 }, { "epoch": 6.084828711256117, "grad_norm": 0.2550452947616577, "learning_rate": 4.3911568154672737e-05, "loss": 0.1471, "num_input_tokens_seen": 80513808, "step": 37300 }, { "epoch": 6.085644371941273, "grad_norm": 0.7019974589347839, "learning_rate": 4.3909240251852656e-05, "loss": 0.2675, "num_input_tokens_seen": 80524176, "step": 37305 }, { "epoch": 6.0864600326264275, "grad_norm": 0.09641298651695251, "learning_rate": 4.390691196581283e-05, "loss": 0.1188, "num_input_tokens_seen": 80536176, "step": 37310 }, { "epoch": 6.087275693311582, "grad_norm": 0.07615245133638382, "learning_rate": 4.390458329660045e-05, "loss": 0.1164, "num_input_tokens_seen": 80546960, "step": 37315 }, { "epoch": 6.088091353996737, "grad_norm": 1.2154618501663208, "learning_rate": 4.390225424426271e-05, "loss": 0.2283, "num_input_tokens_seen": 80557936, "step": 37320 }, { "epoch": 6.088907014681892, "grad_norm": 0.5643240213394165, "learning_rate": 4.3899924808846807e-05, "loss": 0.1471, "num_input_tokens_seen": 80567728, "step": 37325 }, { "epoch": 6.089722675367048, "grad_norm": 0.03698648512363434, "learning_rate": 4.389759499039994e-05, "loss": 0.1263, "num_input_tokens_seen": 80579184, "step": 37330 }, { "epoch": 6.0905383360522025, "grad_norm": 0.707495927810669, "learning_rate": 4.389526478896935e-05, "loss": 0.0604, "num_input_tokens_seen": 80589872, "step": 37335 }, { "epoch": 6.091353996737357, "grad_norm": 0.27563440799713135, "learning_rate": 4.3892934204602246e-05, "loss": 0.0535, "num_input_tokens_seen": 80599888, "step": 37340 }, { "epoch": 6.092169657422512, "grad_norm": 2.302121639251709, "learning_rate": 4.389060323734586e-05, "loss": 0.177, "num_input_tokens_seen": 80610384, "step": 37345 }, { "epoch": 6.092985318107667, "grad_norm": 0.2874036729335785, "learning_rate": 4.3888271887247434e-05, "loss": 0.2006, "num_input_tokens_seen": 80621456, "step": 37350 }, { "epoch": 6.093800978792822, "grad_norm": 0.21758903563022614, "learning_rate": 4.388594015435422e-05, "loss": 0.1013, "num_input_tokens_seen": 80632208, "step": 37355 }, { "epoch": 6.0946166394779775, "grad_norm": 0.9959366321563721, "learning_rate": 4.3883608038713474e-05, "loss": 0.0628, "num_input_tokens_seen": 80642736, "step": 37360 }, { "epoch": 6.095432300163132, "grad_norm": 1.3004882335662842, "learning_rate": 4.388127554037245e-05, "loss": 0.1908, "num_input_tokens_seen": 80654064, "step": 37365 }, { "epoch": 6.096247960848287, "grad_norm": 1.7534432411193848, "learning_rate": 4.387894265937843e-05, "loss": 0.1159, "num_input_tokens_seen": 80664880, "step": 37370 }, { "epoch": 6.097063621533442, "grad_norm": 2.062777519226074, "learning_rate": 4.3876609395778676e-05, "loss": 0.072, "num_input_tokens_seen": 80676720, "step": 37375 }, { "epoch": 6.097879282218597, "grad_norm": 0.961012065410614, "learning_rate": 4.38742757496205e-05, "loss": 0.1556, "num_input_tokens_seen": 80688240, "step": 37380 }, { "epoch": 6.0986949429037525, "grad_norm": 0.2863866984844208, "learning_rate": 4.387194172095117e-05, "loss": 0.097, "num_input_tokens_seen": 80699216, "step": 37385 }, { "epoch": 6.099510603588907, "grad_norm": 0.2519192099571228, "learning_rate": 4.3869607309818005e-05, "loss": 0.2217, "num_input_tokens_seen": 80710192, "step": 37390 }, { "epoch": 6.100326264274062, "grad_norm": 1.6578915119171143, "learning_rate": 4.386727251626831e-05, "loss": 0.2332, "num_input_tokens_seen": 80720880, "step": 37395 }, { "epoch": 6.101141924959217, "grad_norm": 2.3724141120910645, "learning_rate": 4.386493734034941e-05, "loss": 0.0595, "num_input_tokens_seen": 80731760, "step": 37400 }, { "epoch": 6.101957585644372, "grad_norm": 0.14383605122566223, "learning_rate": 4.3862601782108616e-05, "loss": 0.0674, "num_input_tokens_seen": 80741712, "step": 37405 }, { "epoch": 6.102773246329527, "grad_norm": 0.6431389451026917, "learning_rate": 4.386026584159326e-05, "loss": 0.2158, "num_input_tokens_seen": 80753264, "step": 37410 }, { "epoch": 6.103588907014682, "grad_norm": 0.7268275618553162, "learning_rate": 4.385792951885071e-05, "loss": 0.0895, "num_input_tokens_seen": 80764816, "step": 37415 }, { "epoch": 6.104404567699837, "grad_norm": 1.2321292161941528, "learning_rate": 4.385559281392828e-05, "loss": 0.292, "num_input_tokens_seen": 80774320, "step": 37420 }, { "epoch": 6.105220228384992, "grad_norm": 0.021307047456502914, "learning_rate": 4.385325572687334e-05, "loss": 0.0839, "num_input_tokens_seen": 80784976, "step": 37425 }, { "epoch": 6.106035889070147, "grad_norm": 0.2126116305589676, "learning_rate": 4.385091825773327e-05, "loss": 0.1128, "num_input_tokens_seen": 80794672, "step": 37430 }, { "epoch": 6.1068515497553015, "grad_norm": 0.23679716885089874, "learning_rate": 4.3848580406555416e-05, "loss": 0.0404, "num_input_tokens_seen": 80805424, "step": 37435 }, { "epoch": 6.107667210440456, "grad_norm": 0.37585848569869995, "learning_rate": 4.384624217338717e-05, "loss": 0.0473, "num_input_tokens_seen": 80816240, "step": 37440 }, { "epoch": 6.108482871125612, "grad_norm": 0.04660295322537422, "learning_rate": 4.384390355827592e-05, "loss": 0.1989, "num_input_tokens_seen": 80825744, "step": 37445 }, { "epoch": 6.109298531810767, "grad_norm": 1.2249469757080078, "learning_rate": 4.3841564561269054e-05, "loss": 0.1337, "num_input_tokens_seen": 80835984, "step": 37450 }, { "epoch": 6.110114192495922, "grad_norm": 1.088706612586975, "learning_rate": 4.383922518241399e-05, "loss": 0.0997, "num_input_tokens_seen": 80847184, "step": 37455 }, { "epoch": 6.1109298531810765, "grad_norm": 0.15242186188697815, "learning_rate": 4.383688542175812e-05, "loss": 0.0744, "num_input_tokens_seen": 80858608, "step": 37460 }, { "epoch": 6.111745513866231, "grad_norm": 0.0972684770822525, "learning_rate": 4.383454527934888e-05, "loss": 0.1478, "num_input_tokens_seen": 80869584, "step": 37465 }, { "epoch": 6.112561174551387, "grad_norm": 1.52491295337677, "learning_rate": 4.383220475523368e-05, "loss": 0.1166, "num_input_tokens_seen": 80879408, "step": 37470 }, { "epoch": 6.113376835236542, "grad_norm": 0.12265090644359589, "learning_rate": 4.3829863849459965e-05, "loss": 0.1327, "num_input_tokens_seen": 80890032, "step": 37475 }, { "epoch": 6.114192495921697, "grad_norm": 0.26803261041641235, "learning_rate": 4.382752256207516e-05, "loss": 0.1599, "num_input_tokens_seen": 80902160, "step": 37480 }, { "epoch": 6.1150081566068515, "grad_norm": 0.05825848504900932, "learning_rate": 4.382518089312674e-05, "loss": 0.277, "num_input_tokens_seen": 80914032, "step": 37485 }, { "epoch": 6.115823817292006, "grad_norm": 0.12212710082530975, "learning_rate": 4.382283884266214e-05, "loss": 0.0758, "num_input_tokens_seen": 80924912, "step": 37490 }, { "epoch": 6.116639477977161, "grad_norm": 0.08198322355747223, "learning_rate": 4.382049641072883e-05, "loss": 0.1749, "num_input_tokens_seen": 80934544, "step": 37495 }, { "epoch": 6.117455138662317, "grad_norm": 1.1487784385681152, "learning_rate": 4.381815359737429e-05, "loss": 0.1575, "num_input_tokens_seen": 80945456, "step": 37500 }, { "epoch": 6.118270799347472, "grad_norm": 0.5025508403778076, "learning_rate": 4.3815810402646e-05, "loss": 0.0821, "num_input_tokens_seen": 80956368, "step": 37505 }, { "epoch": 6.1190864600326265, "grad_norm": 0.03861101716756821, "learning_rate": 4.381346682659143e-05, "loss": 0.0368, "num_input_tokens_seen": 80967280, "step": 37510 }, { "epoch": 6.119902120717781, "grad_norm": 1.0557565689086914, "learning_rate": 4.38111228692581e-05, "loss": 0.1048, "num_input_tokens_seen": 80978320, "step": 37515 }, { "epoch": 6.120717781402936, "grad_norm": 1.391660451889038, "learning_rate": 4.38087785306935e-05, "loss": 0.1484, "num_input_tokens_seen": 80988624, "step": 37520 }, { "epoch": 6.121533442088092, "grad_norm": 0.6579517722129822, "learning_rate": 4.3806433810945134e-05, "loss": 0.2199, "num_input_tokens_seen": 81000560, "step": 37525 }, { "epoch": 6.122349102773247, "grad_norm": 0.30127447843551636, "learning_rate": 4.3804088710060534e-05, "loss": 0.0164, "num_input_tokens_seen": 81011408, "step": 37530 }, { "epoch": 6.123164763458401, "grad_norm": 0.039234139025211334, "learning_rate": 4.3801743228087225e-05, "loss": 0.1378, "num_input_tokens_seen": 81023472, "step": 37535 }, { "epoch": 6.123980424143556, "grad_norm": 2.1808884143829346, "learning_rate": 4.379939736507272e-05, "loss": 0.2474, "num_input_tokens_seen": 81033968, "step": 37540 }, { "epoch": 6.124796084828711, "grad_norm": 0.9019780158996582, "learning_rate": 4.3797051121064604e-05, "loss": 0.0864, "num_input_tokens_seen": 81044304, "step": 37545 }, { "epoch": 6.125611745513866, "grad_norm": 0.12242327630519867, "learning_rate": 4.379470449611038e-05, "loss": 0.0518, "num_input_tokens_seen": 81055056, "step": 37550 }, { "epoch": 6.126427406199022, "grad_norm": 1.2380131483078003, "learning_rate": 4.379235749025764e-05, "loss": 0.1135, "num_input_tokens_seen": 81066544, "step": 37555 }, { "epoch": 6.127243066884176, "grad_norm": 0.12439048290252686, "learning_rate": 4.379001010355394e-05, "loss": 0.0568, "num_input_tokens_seen": 81077360, "step": 37560 }, { "epoch": 6.128058727569331, "grad_norm": 1.1031484603881836, "learning_rate": 4.3787662336046834e-05, "loss": 0.0549, "num_input_tokens_seen": 81088816, "step": 37565 }, { "epoch": 6.128874388254486, "grad_norm": 0.09426368772983551, "learning_rate": 4.378531418778392e-05, "loss": 0.1418, "num_input_tokens_seen": 81101488, "step": 37570 }, { "epoch": 6.129690048939641, "grad_norm": 0.32021772861480713, "learning_rate": 4.378296565881279e-05, "loss": 0.0675, "num_input_tokens_seen": 81111568, "step": 37575 }, { "epoch": 6.130505709624796, "grad_norm": 0.19854263961315155, "learning_rate": 4.378061674918103e-05, "loss": 0.0505, "num_input_tokens_seen": 81121840, "step": 37580 }, { "epoch": 6.131321370309951, "grad_norm": 0.7656840085983276, "learning_rate": 4.3778267458936244e-05, "loss": 0.052, "num_input_tokens_seen": 81132720, "step": 37585 }, { "epoch": 6.132137030995106, "grad_norm": 1.5284117460250854, "learning_rate": 4.377591778812605e-05, "loss": 0.0946, "num_input_tokens_seen": 81143536, "step": 37590 }, { "epoch": 6.132952691680261, "grad_norm": 1.6153935194015503, "learning_rate": 4.377356773679806e-05, "loss": 0.3009, "num_input_tokens_seen": 81154096, "step": 37595 }, { "epoch": 6.133768352365416, "grad_norm": 0.6004880666732788, "learning_rate": 4.3771217304999913e-05, "loss": 0.0369, "num_input_tokens_seen": 81165104, "step": 37600 }, { "epoch": 6.134584013050571, "grad_norm": 1.4437408447265625, "learning_rate": 4.376886649277922e-05, "loss": 0.1847, "num_input_tokens_seen": 81175952, "step": 37605 }, { "epoch": 6.135399673735726, "grad_norm": 1.2760109901428223, "learning_rate": 4.376651530018364e-05, "loss": 0.1544, "num_input_tokens_seen": 81186608, "step": 37610 }, { "epoch": 6.136215334420881, "grad_norm": 1.2162548303604126, "learning_rate": 4.376416372726083e-05, "loss": 0.1072, "num_input_tokens_seen": 81198128, "step": 37615 }, { "epoch": 6.137030995106036, "grad_norm": 0.8991458415985107, "learning_rate": 4.376181177405843e-05, "loss": 0.1093, "num_input_tokens_seen": 81207792, "step": 37620 }, { "epoch": 6.137846655791191, "grad_norm": 0.5347930788993835, "learning_rate": 4.3759459440624115e-05, "loss": 0.049, "num_input_tokens_seen": 81219152, "step": 37625 }, { "epoch": 6.138662316476346, "grad_norm": 0.30216801166534424, "learning_rate": 4.375710672700556e-05, "loss": 0.0267, "num_input_tokens_seen": 81230800, "step": 37630 }, { "epoch": 6.1394779771615005, "grad_norm": 0.3074359595775604, "learning_rate": 4.3754753633250445e-05, "loss": 0.1512, "num_input_tokens_seen": 81243536, "step": 37635 }, { "epoch": 6.140293637846656, "grad_norm": 0.03548179194331169, "learning_rate": 4.375240015940645e-05, "loss": 0.027, "num_input_tokens_seen": 81255344, "step": 37640 }, { "epoch": 6.141109298531811, "grad_norm": 0.0712142065167427, "learning_rate": 4.375004630552127e-05, "loss": 0.0311, "num_input_tokens_seen": 81266544, "step": 37645 }, { "epoch": 6.141924959216966, "grad_norm": 1.881615400314331, "learning_rate": 4.374769207164263e-05, "loss": 0.1781, "num_input_tokens_seen": 81276976, "step": 37650 }, { "epoch": 6.142740619902121, "grad_norm": 0.16627094149589539, "learning_rate": 4.3745337457818215e-05, "loss": 0.075, "num_input_tokens_seen": 81288656, "step": 37655 }, { "epoch": 6.143556280587275, "grad_norm": 2.335662841796875, "learning_rate": 4.3742982464095764e-05, "loss": 0.2463, "num_input_tokens_seen": 81298928, "step": 37660 }, { "epoch": 6.14437194127243, "grad_norm": 3.1222612857818604, "learning_rate": 4.374062709052299e-05, "loss": 0.2169, "num_input_tokens_seen": 81310224, "step": 37665 }, { "epoch": 6.145187601957586, "grad_norm": 0.23401868343353271, "learning_rate": 4.373827133714763e-05, "loss": 0.0415, "num_input_tokens_seen": 81321616, "step": 37670 }, { "epoch": 6.146003262642741, "grad_norm": 0.4924565553665161, "learning_rate": 4.3735915204017443e-05, "loss": 0.1417, "num_input_tokens_seen": 81332656, "step": 37675 }, { "epoch": 6.146818923327896, "grad_norm": 0.575161874294281, "learning_rate": 4.3733558691180155e-05, "loss": 0.1238, "num_input_tokens_seen": 81343440, "step": 37680 }, { "epoch": 6.14763458401305, "grad_norm": 0.4737270176410675, "learning_rate": 4.373120179868354e-05, "loss": 0.2037, "num_input_tokens_seen": 81354672, "step": 37685 }, { "epoch": 6.148450244698205, "grad_norm": 0.905387818813324, "learning_rate": 4.3728844526575366e-05, "loss": 0.1016, "num_input_tokens_seen": 81365808, "step": 37690 }, { "epoch": 6.149265905383361, "grad_norm": 1.1313492059707642, "learning_rate": 4.3726486874903385e-05, "loss": 0.0853, "num_input_tokens_seen": 81375856, "step": 37695 }, { "epoch": 6.150081566068516, "grad_norm": 1.409772515296936, "learning_rate": 4.37241288437154e-05, "loss": 0.0529, "num_input_tokens_seen": 81387184, "step": 37700 }, { "epoch": 6.150897226753671, "grad_norm": 0.12812037765979767, "learning_rate": 4.3721770433059184e-05, "loss": 0.0967, "num_input_tokens_seen": 81397712, "step": 37705 }, { "epoch": 6.151712887438825, "grad_norm": 0.244872584939003, "learning_rate": 4.3719411642982545e-05, "loss": 0.1182, "num_input_tokens_seen": 81408592, "step": 37710 }, { "epoch": 6.15252854812398, "grad_norm": 2.4131500720977783, "learning_rate": 4.371705247353328e-05, "loss": 0.2737, "num_input_tokens_seen": 81420176, "step": 37715 }, { "epoch": 6.153344208809135, "grad_norm": 1.3878529071807861, "learning_rate": 4.3714692924759206e-05, "loss": 0.1555, "num_input_tokens_seen": 81431920, "step": 37720 }, { "epoch": 6.154159869494291, "grad_norm": 0.5156821012496948, "learning_rate": 4.371233299670813e-05, "loss": 0.2118, "num_input_tokens_seen": 81443312, "step": 37725 }, { "epoch": 6.1549755301794455, "grad_norm": 0.7231618165969849, "learning_rate": 4.370997268942789e-05, "loss": 0.0854, "num_input_tokens_seen": 81454736, "step": 37730 }, { "epoch": 6.1557911908646, "grad_norm": 1.236372470855713, "learning_rate": 4.3707612002966324e-05, "loss": 0.3897, "num_input_tokens_seen": 81465968, "step": 37735 }, { "epoch": 6.156606851549755, "grad_norm": 0.37853214144706726, "learning_rate": 4.370525093737127e-05, "loss": 0.0848, "num_input_tokens_seen": 81477264, "step": 37740 }, { "epoch": 6.15742251223491, "grad_norm": 1.2167903184890747, "learning_rate": 4.3702889492690566e-05, "loss": 0.1059, "num_input_tokens_seen": 81488080, "step": 37745 }, { "epoch": 6.158238172920065, "grad_norm": 0.5179105997085571, "learning_rate": 4.370052766897209e-05, "loss": 0.1985, "num_input_tokens_seen": 81499536, "step": 37750 }, { "epoch": 6.1590538336052205, "grad_norm": 0.1050298660993576, "learning_rate": 4.369816546626369e-05, "loss": 0.074, "num_input_tokens_seen": 81511184, "step": 37755 }, { "epoch": 6.159869494290375, "grad_norm": 0.08935956656932831, "learning_rate": 4.3695802884613246e-05, "loss": 0.3719, "num_input_tokens_seen": 81521648, "step": 37760 }, { "epoch": 6.16068515497553, "grad_norm": 0.11973344534635544, "learning_rate": 4.3693439924068646e-05, "loss": 0.0951, "num_input_tokens_seen": 81532016, "step": 37765 }, { "epoch": 6.161500815660685, "grad_norm": 1.6342576742172241, "learning_rate": 4.3691076584677774e-05, "loss": 0.1183, "num_input_tokens_seen": 81543440, "step": 37770 }, { "epoch": 6.16231647634584, "grad_norm": 1.0362645387649536, "learning_rate": 4.3688712866488515e-05, "loss": 0.1657, "num_input_tokens_seen": 81553680, "step": 37775 }, { "epoch": 6.1631321370309955, "grad_norm": 0.6823140978813171, "learning_rate": 4.3686348769548786e-05, "loss": 0.0382, "num_input_tokens_seen": 81564944, "step": 37780 }, { "epoch": 6.16394779771615, "grad_norm": 0.44017279148101807, "learning_rate": 4.368398429390649e-05, "loss": 0.1072, "num_input_tokens_seen": 81575376, "step": 37785 }, { "epoch": 6.164763458401305, "grad_norm": 1.4193603992462158, "learning_rate": 4.3681619439609555e-05, "loss": 0.1794, "num_input_tokens_seen": 81585840, "step": 37790 }, { "epoch": 6.16557911908646, "grad_norm": 1.3187611103057861, "learning_rate": 4.3679254206705897e-05, "loss": 0.1053, "num_input_tokens_seen": 81597072, "step": 37795 }, { "epoch": 6.166394779771615, "grad_norm": 0.15354788303375244, "learning_rate": 4.367688859524346e-05, "loss": 0.1979, "num_input_tokens_seen": 81608336, "step": 37800 }, { "epoch": 6.16721044045677, "grad_norm": 0.2576713263988495, "learning_rate": 4.3674522605270185e-05, "loss": 0.1249, "num_input_tokens_seen": 81619440, "step": 37805 }, { "epoch": 6.168026101141925, "grad_norm": 1.048357367515564, "learning_rate": 4.3672156236834015e-05, "loss": 0.114, "num_input_tokens_seen": 81629648, "step": 37810 }, { "epoch": 6.16884176182708, "grad_norm": 0.0879688560962677, "learning_rate": 4.3669789489982915e-05, "loss": 0.2506, "num_input_tokens_seen": 81639408, "step": 37815 }, { "epoch": 6.169657422512235, "grad_norm": 0.42736029624938965, "learning_rate": 4.366742236476484e-05, "loss": 0.1107, "num_input_tokens_seen": 81650704, "step": 37820 }, { "epoch": 6.17047308319739, "grad_norm": 0.06235251948237419, "learning_rate": 4.366505486122777e-05, "loss": 0.0358, "num_input_tokens_seen": 81661328, "step": 37825 }, { "epoch": 6.171288743882545, "grad_norm": 0.3600447177886963, "learning_rate": 4.366268697941969e-05, "loss": 0.0616, "num_input_tokens_seen": 81671728, "step": 37830 }, { "epoch": 6.1721044045677, "grad_norm": 0.6012250185012817, "learning_rate": 4.366031871938858e-05, "loss": 0.0724, "num_input_tokens_seen": 81681936, "step": 37835 }, { "epoch": 6.172920065252855, "grad_norm": 1.1398571729660034, "learning_rate": 4.365795008118245e-05, "loss": 0.1485, "num_input_tokens_seen": 81692592, "step": 37840 }, { "epoch": 6.17373572593801, "grad_norm": 0.7944700717926025, "learning_rate": 4.365558106484928e-05, "loss": 0.1142, "num_input_tokens_seen": 81703696, "step": 37845 }, { "epoch": 6.174551386623165, "grad_norm": 0.4317437410354614, "learning_rate": 4.36532116704371e-05, "loss": 0.071, "num_input_tokens_seen": 81714448, "step": 37850 }, { "epoch": 6.1753670473083195, "grad_norm": 0.07604212313890457, "learning_rate": 4.3650841897993916e-05, "loss": 0.1652, "num_input_tokens_seen": 81724560, "step": 37855 }, { "epoch": 6.176182707993474, "grad_norm": 0.05663163587450981, "learning_rate": 4.364847174756777e-05, "loss": 0.0144, "num_input_tokens_seen": 81734384, "step": 37860 }, { "epoch": 6.17699836867863, "grad_norm": 0.43369433283805847, "learning_rate": 4.364610121920667e-05, "loss": 0.0722, "num_input_tokens_seen": 81745552, "step": 37865 }, { "epoch": 6.177814029363785, "grad_norm": 0.6795276403427124, "learning_rate": 4.364373031295868e-05, "loss": 0.1323, "num_input_tokens_seen": 81756144, "step": 37870 }, { "epoch": 6.17862969004894, "grad_norm": 1.3056327104568481, "learning_rate": 4.3641359028871856e-05, "loss": 0.1669, "num_input_tokens_seen": 81767024, "step": 37875 }, { "epoch": 6.1794453507340945, "grad_norm": 2.959141492843628, "learning_rate": 4.3638987366994234e-05, "loss": 0.2482, "num_input_tokens_seen": 81777488, "step": 37880 }, { "epoch": 6.180261011419249, "grad_norm": 0.39340370893478394, "learning_rate": 4.363661532737389e-05, "loss": 0.2452, "num_input_tokens_seen": 81788624, "step": 37885 }, { "epoch": 6.181076672104404, "grad_norm": 0.22710657119750977, "learning_rate": 4.36342429100589e-05, "loss": 0.0687, "num_input_tokens_seen": 81798224, "step": 37890 }, { "epoch": 6.18189233278956, "grad_norm": 0.1966738998889923, "learning_rate": 4.3631870115097327e-05, "loss": 0.0442, "num_input_tokens_seen": 81808976, "step": 37895 }, { "epoch": 6.182707993474715, "grad_norm": 2.1089329719543457, "learning_rate": 4.3629496942537276e-05, "loss": 0.1577, "num_input_tokens_seen": 81819760, "step": 37900 }, { "epoch": 6.1835236541598695, "grad_norm": 0.09243801236152649, "learning_rate": 4.362712339242683e-05, "loss": 0.216, "num_input_tokens_seen": 81830416, "step": 37905 }, { "epoch": 6.184339314845024, "grad_norm": 1.398714542388916, "learning_rate": 4.362474946481411e-05, "loss": 0.2603, "num_input_tokens_seen": 81842416, "step": 37910 }, { "epoch": 6.185154975530179, "grad_norm": 0.20743697881698608, "learning_rate": 4.3622375159747205e-05, "loss": 0.0696, "num_input_tokens_seen": 81852080, "step": 37915 }, { "epoch": 6.185970636215335, "grad_norm": 0.6758499145507812, "learning_rate": 4.3620000477274245e-05, "loss": 0.1298, "num_input_tokens_seen": 81862096, "step": 37920 }, { "epoch": 6.18678629690049, "grad_norm": 1.4958192110061646, "learning_rate": 4.361762541744335e-05, "loss": 0.08, "num_input_tokens_seen": 81872816, "step": 37925 }, { "epoch": 6.1876019575856445, "grad_norm": 1.6399370431900024, "learning_rate": 4.361524998030267e-05, "loss": 0.1658, "num_input_tokens_seen": 81883120, "step": 37930 }, { "epoch": 6.188417618270799, "grad_norm": 0.44962775707244873, "learning_rate": 4.3612874165900326e-05, "loss": 0.0974, "num_input_tokens_seen": 81893904, "step": 37935 }, { "epoch": 6.189233278955954, "grad_norm": 0.06514357030391693, "learning_rate": 4.361049797428447e-05, "loss": 0.1388, "num_input_tokens_seen": 81904368, "step": 37940 }, { "epoch": 6.190048939641109, "grad_norm": 0.2093193382024765, "learning_rate": 4.360812140550328e-05, "loss": 0.0979, "num_input_tokens_seen": 81915664, "step": 37945 }, { "epoch": 6.190864600326265, "grad_norm": 0.16656580567359924, "learning_rate": 4.360574445960489e-05, "loss": 0.0223, "num_input_tokens_seen": 81926736, "step": 37950 }, { "epoch": 6.191680261011419, "grad_norm": 0.04117054119706154, "learning_rate": 4.3603367136637487e-05, "loss": 0.0297, "num_input_tokens_seen": 81937552, "step": 37955 }, { "epoch": 6.192495921696574, "grad_norm": 0.234584778547287, "learning_rate": 4.360098943664925e-05, "loss": 0.2749, "num_input_tokens_seen": 81947120, "step": 37960 }, { "epoch": 6.193311582381729, "grad_norm": 1.3821470737457275, "learning_rate": 4.359861135968837e-05, "loss": 0.16, "num_input_tokens_seen": 81955216, "step": 37965 }, { "epoch": 6.194127243066884, "grad_norm": 0.5480703115463257, "learning_rate": 4.359623290580303e-05, "loss": 0.1805, "num_input_tokens_seen": 81965360, "step": 37970 }, { "epoch": 6.19494290375204, "grad_norm": 1.7449854612350464, "learning_rate": 4.359385407504144e-05, "loss": 0.3198, "num_input_tokens_seen": 81976112, "step": 37975 }, { "epoch": 6.195758564437194, "grad_norm": 1.689435601234436, "learning_rate": 4.359147486745181e-05, "loss": 0.1464, "num_input_tokens_seen": 81987568, "step": 37980 }, { "epoch": 6.196574225122349, "grad_norm": 1.258748173713684, "learning_rate": 4.358909528308236e-05, "loss": 0.1658, "num_input_tokens_seen": 81998096, "step": 37985 }, { "epoch": 6.197389885807504, "grad_norm": 0.13845573365688324, "learning_rate": 4.358671532198131e-05, "loss": 0.0224, "num_input_tokens_seen": 82009872, "step": 37990 }, { "epoch": 6.198205546492659, "grad_norm": 0.1336878389120102, "learning_rate": 4.35843349841969e-05, "loss": 0.0955, "num_input_tokens_seen": 82020080, "step": 37995 }, { "epoch": 6.199021207177814, "grad_norm": 0.08544942736625671, "learning_rate": 4.3581954269777356e-05, "loss": 0.068, "num_input_tokens_seen": 82030416, "step": 38000 }, { "epoch": 6.199836867862969, "grad_norm": 0.6671580672264099, "learning_rate": 4.3579573178770937e-05, "loss": 0.0668, "num_input_tokens_seen": 82041296, "step": 38005 }, { "epoch": 6.200652528548124, "grad_norm": 0.1329992115497589, "learning_rate": 4.357719171122591e-05, "loss": 0.0905, "num_input_tokens_seen": 82051024, "step": 38010 }, { "epoch": 6.201468189233279, "grad_norm": 0.3970228135585785, "learning_rate": 4.3574809867190515e-05, "loss": 0.2646, "num_input_tokens_seen": 82062288, "step": 38015 }, { "epoch": 6.202283849918434, "grad_norm": 0.08674600720405579, "learning_rate": 4.3572427646713033e-05, "loss": 0.0902, "num_input_tokens_seen": 82073808, "step": 38020 }, { "epoch": 6.203099510603589, "grad_norm": 0.22388021647930145, "learning_rate": 4.3570045049841744e-05, "loss": 0.1146, "num_input_tokens_seen": 82083728, "step": 38025 }, { "epoch": 6.2039151712887435, "grad_norm": 2.5948657989501953, "learning_rate": 4.356766207662494e-05, "loss": 0.1575, "num_input_tokens_seen": 82095600, "step": 38030 }, { "epoch": 6.204730831973899, "grad_norm": 0.19208763539791107, "learning_rate": 4.356527872711091e-05, "loss": 0.0962, "num_input_tokens_seen": 82105136, "step": 38035 }, { "epoch": 6.205546492659054, "grad_norm": 0.07196509838104248, "learning_rate": 4.356289500134795e-05, "loss": 0.0389, "num_input_tokens_seen": 82115504, "step": 38040 }, { "epoch": 6.206362153344209, "grad_norm": 0.8392159342765808, "learning_rate": 4.356051089938438e-05, "loss": 0.2306, "num_input_tokens_seen": 82126864, "step": 38045 }, { "epoch": 6.207177814029364, "grad_norm": 0.27376413345336914, "learning_rate": 4.3558126421268506e-05, "loss": 0.1831, "num_input_tokens_seen": 82137520, "step": 38050 }, { "epoch": 6.2079934747145185, "grad_norm": 0.33327123522758484, "learning_rate": 4.355574156704866e-05, "loss": 0.1, "num_input_tokens_seen": 82148976, "step": 38055 }, { "epoch": 6.208809135399674, "grad_norm": 0.43400365114212036, "learning_rate": 4.3553356336773166e-05, "loss": 0.0831, "num_input_tokens_seen": 82160496, "step": 38060 }, { "epoch": 6.209624796084829, "grad_norm": 1.5582759380340576, "learning_rate": 4.355097073049037e-05, "loss": 0.1023, "num_input_tokens_seen": 82171440, "step": 38065 }, { "epoch": 6.210440456769984, "grad_norm": 0.5713582038879395, "learning_rate": 4.354858474824862e-05, "loss": 0.1929, "num_input_tokens_seen": 82182704, "step": 38070 }, { "epoch": 6.211256117455139, "grad_norm": 0.1656067818403244, "learning_rate": 4.3546198390096275e-05, "loss": 0.1417, "num_input_tokens_seen": 82193392, "step": 38075 }, { "epoch": 6.212071778140293, "grad_norm": 1.1712011098861694, "learning_rate": 4.354381165608169e-05, "loss": 0.1585, "num_input_tokens_seen": 82204080, "step": 38080 }, { "epoch": 6.212887438825448, "grad_norm": 1.2094365358352661, "learning_rate": 4.354142454625324e-05, "loss": 0.0706, "num_input_tokens_seen": 82215440, "step": 38085 }, { "epoch": 6.213703099510604, "grad_norm": 1.3800913095474243, "learning_rate": 4.3539037060659294e-05, "loss": 0.1165, "num_input_tokens_seen": 82226832, "step": 38090 }, { "epoch": 6.214518760195759, "grad_norm": 0.04186161607503891, "learning_rate": 4.353664919934824e-05, "loss": 0.0438, "num_input_tokens_seen": 82237296, "step": 38095 }, { "epoch": 6.215334420880914, "grad_norm": 0.8238420486450195, "learning_rate": 4.3534260962368475e-05, "loss": 0.128, "num_input_tokens_seen": 82248400, "step": 38100 }, { "epoch": 6.216150081566068, "grad_norm": 0.2638343572616577, "learning_rate": 4.353187234976841e-05, "loss": 0.1199, "num_input_tokens_seen": 82259088, "step": 38105 }, { "epoch": 6.216965742251223, "grad_norm": 0.08696474879980087, "learning_rate": 4.352948336159644e-05, "loss": 0.2229, "num_input_tokens_seen": 82269424, "step": 38110 }, { "epoch": 6.217781402936378, "grad_norm": 2.7562546730041504, "learning_rate": 4.352709399790098e-05, "loss": 0.309, "num_input_tokens_seen": 82280144, "step": 38115 }, { "epoch": 6.218597063621534, "grad_norm": 0.08082929253578186, "learning_rate": 4.3524704258730453e-05, "loss": 0.0965, "num_input_tokens_seen": 82290672, "step": 38120 }, { "epoch": 6.219412724306689, "grad_norm": 0.5216068625450134, "learning_rate": 4.3522314144133295e-05, "loss": 0.1258, "num_input_tokens_seen": 82301456, "step": 38125 }, { "epoch": 6.220228384991843, "grad_norm": 1.89000403881073, "learning_rate": 4.351992365415795e-05, "loss": 0.1471, "num_input_tokens_seen": 82311696, "step": 38130 }, { "epoch": 6.221044045676998, "grad_norm": 1.818941593170166, "learning_rate": 4.3517532788852855e-05, "loss": 0.1205, "num_input_tokens_seen": 82322320, "step": 38135 }, { "epoch": 6.221859706362153, "grad_norm": 0.9741904735565186, "learning_rate": 4.351514154826646e-05, "loss": 0.1511, "num_input_tokens_seen": 82332976, "step": 38140 }, { "epoch": 6.222675367047309, "grad_norm": 2.9631550312042236, "learning_rate": 4.351274993244724e-05, "loss": 0.2467, "num_input_tokens_seen": 82343920, "step": 38145 }, { "epoch": 6.2234910277324635, "grad_norm": 0.28695863485336304, "learning_rate": 4.351035794144366e-05, "loss": 0.0914, "num_input_tokens_seen": 82354928, "step": 38150 }, { "epoch": 6.224306688417618, "grad_norm": 0.4747202694416046, "learning_rate": 4.350796557530419e-05, "loss": 0.1013, "num_input_tokens_seen": 82366544, "step": 38155 }, { "epoch": 6.225122349102773, "grad_norm": 0.7550003528594971, "learning_rate": 4.350557283407732e-05, "loss": 0.0244, "num_input_tokens_seen": 82377520, "step": 38160 }, { "epoch": 6.225938009787928, "grad_norm": 0.11215893179178238, "learning_rate": 4.350317971781154e-05, "loss": 0.0434, "num_input_tokens_seen": 82388240, "step": 38165 }, { "epoch": 6.226753670473083, "grad_norm": 0.15193331241607666, "learning_rate": 4.350078622655536e-05, "loss": 0.1934, "num_input_tokens_seen": 82399984, "step": 38170 }, { "epoch": 6.2275693311582385, "grad_norm": 0.12964041531085968, "learning_rate": 4.349839236035727e-05, "loss": 0.1128, "num_input_tokens_seen": 82410992, "step": 38175 }, { "epoch": 6.228384991843393, "grad_norm": 0.16953343152999878, "learning_rate": 4.349599811926578e-05, "loss": 0.0523, "num_input_tokens_seen": 82422224, "step": 38180 }, { "epoch": 6.229200652528548, "grad_norm": 1.4349883794784546, "learning_rate": 4.349360350332944e-05, "loss": 0.1786, "num_input_tokens_seen": 82432624, "step": 38185 }, { "epoch": 6.230016313213703, "grad_norm": 0.4182150661945343, "learning_rate": 4.349120851259676e-05, "loss": 0.1827, "num_input_tokens_seen": 82443696, "step": 38190 }, { "epoch": 6.230831973898858, "grad_norm": 0.5661023855209351, "learning_rate": 4.348881314711627e-05, "loss": 0.1133, "num_input_tokens_seen": 82454736, "step": 38195 }, { "epoch": 6.231647634584013, "grad_norm": 1.1029636859893799, "learning_rate": 4.348641740693654e-05, "loss": 0.2722, "num_input_tokens_seen": 82465072, "step": 38200 }, { "epoch": 6.232463295269168, "grad_norm": 0.33262553811073303, "learning_rate": 4.3484021292106105e-05, "loss": 0.0906, "num_input_tokens_seen": 82475760, "step": 38205 }, { "epoch": 6.233278955954323, "grad_norm": 0.14516262710094452, "learning_rate": 4.348162480267353e-05, "loss": 0.1419, "num_input_tokens_seen": 82487056, "step": 38210 }, { "epoch": 6.234094616639478, "grad_norm": 0.0515766367316246, "learning_rate": 4.347922793868738e-05, "loss": 0.2164, "num_input_tokens_seen": 82498544, "step": 38215 }, { "epoch": 6.234910277324633, "grad_norm": 0.05211520567536354, "learning_rate": 4.347683070019624e-05, "loss": 0.177, "num_input_tokens_seen": 82510096, "step": 38220 }, { "epoch": 6.235725938009788, "grad_norm": 0.7578802704811096, "learning_rate": 4.347443308724869e-05, "loss": 0.0407, "num_input_tokens_seen": 82520752, "step": 38225 }, { "epoch": 6.236541598694943, "grad_norm": 0.6289993524551392, "learning_rate": 4.34720350998933e-05, "loss": 0.1266, "num_input_tokens_seen": 82531760, "step": 38230 }, { "epoch": 6.237357259380098, "grad_norm": 0.19210301339626312, "learning_rate": 4.34696367381787e-05, "loss": 0.0644, "num_input_tokens_seen": 82542576, "step": 38235 }, { "epoch": 6.238172920065253, "grad_norm": 0.972611665725708, "learning_rate": 4.3467238002153476e-05, "loss": 0.158, "num_input_tokens_seen": 82553168, "step": 38240 }, { "epoch": 6.238988580750408, "grad_norm": 0.12114091217517853, "learning_rate": 4.346483889186625e-05, "loss": 0.123, "num_input_tokens_seen": 82563280, "step": 38245 }, { "epoch": 6.239804241435563, "grad_norm": 0.03949350863695145, "learning_rate": 4.346243940736564e-05, "loss": 0.1118, "num_input_tokens_seen": 82575120, "step": 38250 }, { "epoch": 6.240619902120717, "grad_norm": 1.9666091203689575, "learning_rate": 4.346003954870027e-05, "loss": 0.1939, "num_input_tokens_seen": 82586032, "step": 38255 }, { "epoch": 6.241435562805873, "grad_norm": 2.792175531387329, "learning_rate": 4.345763931591878e-05, "loss": 0.1827, "num_input_tokens_seen": 82597040, "step": 38260 }, { "epoch": 6.242251223491028, "grad_norm": 0.08983150869607925, "learning_rate": 4.345523870906982e-05, "loss": 0.0325, "num_input_tokens_seen": 82608400, "step": 38265 }, { "epoch": 6.243066884176183, "grad_norm": 1.564549446105957, "learning_rate": 4.345283772820203e-05, "loss": 0.1217, "num_input_tokens_seen": 82618000, "step": 38270 }, { "epoch": 6.2438825448613375, "grad_norm": 0.7574443817138672, "learning_rate": 4.345043637336407e-05, "loss": 0.1586, "num_input_tokens_seen": 82628880, "step": 38275 }, { "epoch": 6.244698205546492, "grad_norm": 0.060338810086250305, "learning_rate": 4.344803464460462e-05, "loss": 0.0934, "num_input_tokens_seen": 82639216, "step": 38280 }, { "epoch": 6.245513866231648, "grad_norm": 0.28981518745422363, "learning_rate": 4.3445632541972334e-05, "loss": 0.0623, "num_input_tokens_seen": 82649264, "step": 38285 }, { "epoch": 6.246329526916803, "grad_norm": 0.10155577957630157, "learning_rate": 4.344323006551592e-05, "loss": 0.1546, "num_input_tokens_seen": 82660112, "step": 38290 }, { "epoch": 6.247145187601958, "grad_norm": 0.18944048881530762, "learning_rate": 4.344082721528404e-05, "loss": 0.0695, "num_input_tokens_seen": 82671408, "step": 38295 }, { "epoch": 6.2479608482871125, "grad_norm": 0.3219861388206482, "learning_rate": 4.3438423991325407e-05, "loss": 0.0762, "num_input_tokens_seen": 82682928, "step": 38300 }, { "epoch": 6.248776508972267, "grad_norm": 0.45438534021377563, "learning_rate": 4.343602039368872e-05, "loss": 0.0753, "num_input_tokens_seen": 82694864, "step": 38305 }, { "epoch": 6.249592169657422, "grad_norm": 0.4050990343093872, "learning_rate": 4.343361642242269e-05, "loss": 0.0324, "num_input_tokens_seen": 82705296, "step": 38310 }, { "epoch": 6.250407830342578, "grad_norm": 0.38540956377983093, "learning_rate": 4.3431212077576046e-05, "loss": 0.2098, "num_input_tokens_seen": 82716304, "step": 38315 }, { "epoch": 6.251223491027733, "grad_norm": 0.09573610126972198, "learning_rate": 4.34288073591975e-05, "loss": 0.082, "num_input_tokens_seen": 82728272, "step": 38320 }, { "epoch": 6.2520391517128875, "grad_norm": 1.257567048072815, "learning_rate": 4.3426402267335796e-05, "loss": 0.1536, "num_input_tokens_seen": 82738896, "step": 38325 }, { "epoch": 6.252854812398042, "grad_norm": 0.30615290999412537, "learning_rate": 4.3423996802039674e-05, "loss": 0.0433, "num_input_tokens_seen": 82749712, "step": 38330 }, { "epoch": 6.253670473083197, "grad_norm": 0.17191261053085327, "learning_rate": 4.342159096335788e-05, "loss": 0.0295, "num_input_tokens_seen": 82758608, "step": 38335 }, { "epoch": 6.254486133768353, "grad_norm": 0.5172674059867859, "learning_rate": 4.3419184751339185e-05, "loss": 0.0494, "num_input_tokens_seen": 82770288, "step": 38340 }, { "epoch": 6.255301794453508, "grad_norm": 1.6922078132629395, "learning_rate": 4.341677816603234e-05, "loss": 0.1914, "num_input_tokens_seen": 82782448, "step": 38345 }, { "epoch": 6.2561174551386625, "grad_norm": 0.4580223858356476, "learning_rate": 4.3414371207486124e-05, "loss": 0.0544, "num_input_tokens_seen": 82793264, "step": 38350 }, { "epoch": 6.256933115823817, "grad_norm": 0.08881805092096329, "learning_rate": 4.341196387574932e-05, "loss": 0.0611, "num_input_tokens_seen": 82804688, "step": 38355 }, { "epoch": 6.257748776508972, "grad_norm": 0.4163900315761566, "learning_rate": 4.34095561708707e-05, "loss": 0.0314, "num_input_tokens_seen": 82816336, "step": 38360 }, { "epoch": 6.258564437194127, "grad_norm": 0.13291069865226746, "learning_rate": 4.3407148092899075e-05, "loss": 0.0218, "num_input_tokens_seen": 82827408, "step": 38365 }, { "epoch": 6.259380097879283, "grad_norm": 2.0476768016815186, "learning_rate": 4.340473964188324e-05, "loss": 0.1784, "num_input_tokens_seen": 82838128, "step": 38370 }, { "epoch": 6.260195758564437, "grad_norm": 0.14472298324108124, "learning_rate": 4.3402330817872013e-05, "loss": 0.0689, "num_input_tokens_seen": 82849072, "step": 38375 }, { "epoch": 6.261011419249592, "grad_norm": 1.8451530933380127, "learning_rate": 4.3399921620914204e-05, "loss": 0.0918, "num_input_tokens_seen": 82859088, "step": 38380 }, { "epoch": 6.261827079934747, "grad_norm": 0.20287162065505981, "learning_rate": 4.339751205105864e-05, "loss": 0.0376, "num_input_tokens_seen": 82870480, "step": 38385 }, { "epoch": 6.262642740619902, "grad_norm": 1.486952781677246, "learning_rate": 4.3395102108354155e-05, "loss": 0.115, "num_input_tokens_seen": 82880720, "step": 38390 }, { "epoch": 6.263458401305057, "grad_norm": 0.18304328620433807, "learning_rate": 4.3392691792849596e-05, "loss": 0.0773, "num_input_tokens_seen": 82891536, "step": 38395 }, { "epoch": 6.264274061990212, "grad_norm": 0.6548401713371277, "learning_rate": 4.33902811045938e-05, "loss": 0.2321, "num_input_tokens_seen": 82903056, "step": 38400 }, { "epoch": 6.265089722675367, "grad_norm": 1.4456672668457031, "learning_rate": 4.338787004363563e-05, "loss": 0.1274, "num_input_tokens_seen": 82912912, "step": 38405 }, { "epoch": 6.265905383360522, "grad_norm": 1.1532974243164062, "learning_rate": 4.3385458610023945e-05, "loss": 0.1512, "num_input_tokens_seen": 82923664, "step": 38410 }, { "epoch": 6.266721044045677, "grad_norm": 0.16317884624004364, "learning_rate": 4.338304680380762e-05, "loss": 0.0254, "num_input_tokens_seen": 82934256, "step": 38415 }, { "epoch": 6.267536704730832, "grad_norm": 1.6058233976364136, "learning_rate": 4.3380634625035535e-05, "loss": 0.2159, "num_input_tokens_seen": 82945712, "step": 38420 }, { "epoch": 6.268352365415987, "grad_norm": 0.8205023407936096, "learning_rate": 4.337822207375656e-05, "loss": 0.0622, "num_input_tokens_seen": 82956720, "step": 38425 }, { "epoch": 6.269168026101142, "grad_norm": 0.6800962090492249, "learning_rate": 4.337580915001961e-05, "loss": 0.1242, "num_input_tokens_seen": 82967856, "step": 38430 }, { "epoch": 6.269983686786297, "grad_norm": 0.9840092062950134, "learning_rate": 4.3373395853873565e-05, "loss": 0.1262, "num_input_tokens_seen": 82979472, "step": 38435 }, { "epoch": 6.270799347471452, "grad_norm": 0.8091889023780823, "learning_rate": 4.337098218536736e-05, "loss": 0.0403, "num_input_tokens_seen": 82989264, "step": 38440 }, { "epoch": 6.271615008156607, "grad_norm": 1.6261390447616577, "learning_rate": 4.336856814454989e-05, "loss": 0.1739, "num_input_tokens_seen": 83000368, "step": 38445 }, { "epoch": 6.2724306688417615, "grad_norm": 0.04377282038331032, "learning_rate": 4.3366153731470084e-05, "loss": 0.0759, "num_input_tokens_seen": 83011376, "step": 38450 }, { "epoch": 6.273246329526917, "grad_norm": 0.17968598008155823, "learning_rate": 4.336373894617687e-05, "loss": 0.1201, "num_input_tokens_seen": 83021008, "step": 38455 }, { "epoch": 6.274061990212072, "grad_norm": 0.3043498396873474, "learning_rate": 4.3361323788719185e-05, "loss": 0.2026, "num_input_tokens_seen": 83031888, "step": 38460 }, { "epoch": 6.274877650897227, "grad_norm": 0.15148434042930603, "learning_rate": 4.3358908259145994e-05, "loss": 0.2428, "num_input_tokens_seen": 83042032, "step": 38465 }, { "epoch": 6.275693311582382, "grad_norm": 0.21266141533851624, "learning_rate": 4.335649235750623e-05, "loss": 0.2241, "num_input_tokens_seen": 83051920, "step": 38470 }, { "epoch": 6.2765089722675365, "grad_norm": 0.27873337268829346, "learning_rate": 4.335407608384886e-05, "loss": 0.2919, "num_input_tokens_seen": 83063280, "step": 38475 }, { "epoch": 6.277324632952691, "grad_norm": 0.7283298969268799, "learning_rate": 4.335165943822286e-05, "loss": 0.0912, "num_input_tokens_seen": 83072112, "step": 38480 }, { "epoch": 6.278140293637847, "grad_norm": 0.23210971057415009, "learning_rate": 4.3349242420677195e-05, "loss": 0.0646, "num_input_tokens_seen": 83083728, "step": 38485 }, { "epoch": 6.278955954323002, "grad_norm": 2.53830885887146, "learning_rate": 4.334682503126086e-05, "loss": 0.3055, "num_input_tokens_seen": 83094256, "step": 38490 }, { "epoch": 6.279771615008157, "grad_norm": 0.47778138518333435, "learning_rate": 4.334440727002284e-05, "loss": 0.1674, "num_input_tokens_seen": 83105552, "step": 38495 }, { "epoch": 6.280587275693311, "grad_norm": 0.7603237628936768, "learning_rate": 4.334198913701213e-05, "loss": 0.0612, "num_input_tokens_seen": 83117296, "step": 38500 }, { "epoch": 6.281402936378466, "grad_norm": 0.4562795162200928, "learning_rate": 4.333957063227775e-05, "loss": 0.1309, "num_input_tokens_seen": 83127824, "step": 38505 }, { "epoch": 6.282218597063622, "grad_norm": 1.0183131694793701, "learning_rate": 4.33371517558687e-05, "loss": 0.1516, "num_input_tokens_seen": 83138608, "step": 38510 }, { "epoch": 6.283034257748777, "grad_norm": 1.2620549201965332, "learning_rate": 4.333473250783401e-05, "loss": 0.1205, "num_input_tokens_seen": 83149936, "step": 38515 }, { "epoch": 6.283849918433932, "grad_norm": 2.2634499073028564, "learning_rate": 4.33323128882227e-05, "loss": 0.1578, "num_input_tokens_seen": 83160528, "step": 38520 }, { "epoch": 6.284665579119086, "grad_norm": 0.9975460171699524, "learning_rate": 4.332989289708382e-05, "loss": 0.1195, "num_input_tokens_seen": 83170640, "step": 38525 }, { "epoch": 6.285481239804241, "grad_norm": 0.1535550206899643, "learning_rate": 4.3327472534466404e-05, "loss": 0.053, "num_input_tokens_seen": 83182128, "step": 38530 }, { "epoch": 6.286296900489396, "grad_norm": 0.7898518443107605, "learning_rate": 4.332505180041951e-05, "loss": 0.2234, "num_input_tokens_seen": 83192368, "step": 38535 }, { "epoch": 6.287112561174552, "grad_norm": 0.9275499582290649, "learning_rate": 4.3322630694992186e-05, "loss": 0.1325, "num_input_tokens_seen": 83203024, "step": 38540 }, { "epoch": 6.287928221859707, "grad_norm": 1.1844278573989868, "learning_rate": 4.332020921823352e-05, "loss": 0.2374, "num_input_tokens_seen": 83214160, "step": 38545 }, { "epoch": 6.288743882544861, "grad_norm": 0.045557402074337006, "learning_rate": 4.331778737019256e-05, "loss": 0.2419, "num_input_tokens_seen": 83224784, "step": 38550 }, { "epoch": 6.289559543230016, "grad_norm": 0.290462851524353, "learning_rate": 4.3315365150918414e-05, "loss": 0.0623, "num_input_tokens_seen": 83235344, "step": 38555 }, { "epoch": 6.290375203915171, "grad_norm": 0.2893018424510956, "learning_rate": 4.331294256046015e-05, "loss": 0.1819, "num_input_tokens_seen": 83246384, "step": 38560 }, { "epoch": 6.291190864600326, "grad_norm": 1.3154431581497192, "learning_rate": 4.331051959886688e-05, "loss": 0.2158, "num_input_tokens_seen": 83257520, "step": 38565 }, { "epoch": 6.2920065252854815, "grad_norm": 0.2169274091720581, "learning_rate": 4.3308096266187695e-05, "loss": 0.1416, "num_input_tokens_seen": 83267824, "step": 38570 }, { "epoch": 6.292822185970636, "grad_norm": 0.168557807803154, "learning_rate": 4.330567256247172e-05, "loss": 0.0637, "num_input_tokens_seen": 83277904, "step": 38575 }, { "epoch": 6.293637846655791, "grad_norm": 0.5806328058242798, "learning_rate": 4.330324848776806e-05, "loss": 0.0147, "num_input_tokens_seen": 83287280, "step": 38580 }, { "epoch": 6.294453507340946, "grad_norm": 1.3296853303909302, "learning_rate": 4.330082404212585e-05, "loss": 0.257, "num_input_tokens_seen": 83297520, "step": 38585 }, { "epoch": 6.295269168026101, "grad_norm": 0.04578527808189392, "learning_rate": 4.329839922559424e-05, "loss": 0.0573, "num_input_tokens_seen": 83308400, "step": 38590 }, { "epoch": 6.2960848287112565, "grad_norm": 0.814846396446228, "learning_rate": 4.3295974038222344e-05, "loss": 0.2605, "num_input_tokens_seen": 83321008, "step": 38595 }, { "epoch": 6.296900489396411, "grad_norm": 0.19044506549835205, "learning_rate": 4.329354848005932e-05, "loss": 0.1113, "num_input_tokens_seen": 83333264, "step": 38600 }, { "epoch": 6.297716150081566, "grad_norm": 0.5263842344284058, "learning_rate": 4.329112255115434e-05, "loss": 0.1442, "num_input_tokens_seen": 83343856, "step": 38605 }, { "epoch": 6.298531810766721, "grad_norm": 0.45883914828300476, "learning_rate": 4.328869625155655e-05, "loss": 0.0843, "num_input_tokens_seen": 83354832, "step": 38610 }, { "epoch": 6.299347471451876, "grad_norm": 0.04036887735128403, "learning_rate": 4.328626958131513e-05, "loss": 0.1533, "num_input_tokens_seen": 83365488, "step": 38615 }, { "epoch": 6.300163132137031, "grad_norm": 1.4333691596984863, "learning_rate": 4.3283842540479264e-05, "loss": 0.1847, "num_input_tokens_seen": 83376688, "step": 38620 }, { "epoch": 6.300978792822186, "grad_norm": 1.7642265558242798, "learning_rate": 4.328141512909814e-05, "loss": 0.1221, "num_input_tokens_seen": 83387056, "step": 38625 }, { "epoch": 6.301794453507341, "grad_norm": 0.7745867371559143, "learning_rate": 4.327898734722093e-05, "loss": 0.1479, "num_input_tokens_seen": 83398224, "step": 38630 }, { "epoch": 6.302610114192496, "grad_norm": 0.36477455496788025, "learning_rate": 4.327655919489686e-05, "loss": 0.1135, "num_input_tokens_seen": 83408880, "step": 38635 }, { "epoch": 6.303425774877651, "grad_norm": 0.30161088705062866, "learning_rate": 4.327413067217514e-05, "loss": 0.179, "num_input_tokens_seen": 83419504, "step": 38640 }, { "epoch": 6.304241435562806, "grad_norm": 0.27773651480674744, "learning_rate": 4.327170177910497e-05, "loss": 0.0651, "num_input_tokens_seen": 83429296, "step": 38645 }, { "epoch": 6.30505709624796, "grad_norm": 0.03252704069018364, "learning_rate": 4.3269272515735586e-05, "loss": 0.0416, "num_input_tokens_seen": 83440912, "step": 38650 }, { "epoch": 6.305872756933116, "grad_norm": 0.2900769114494324, "learning_rate": 4.3266842882116224e-05, "loss": 0.1, "num_input_tokens_seen": 83451568, "step": 38655 }, { "epoch": 6.306688417618271, "grad_norm": 0.16524483263492584, "learning_rate": 4.326441287829611e-05, "loss": 0.0374, "num_input_tokens_seen": 83461200, "step": 38660 }, { "epoch": 6.307504078303426, "grad_norm": 0.11362137645483017, "learning_rate": 4.32619825043245e-05, "loss": 0.0528, "num_input_tokens_seen": 83472016, "step": 38665 }, { "epoch": 6.308319738988581, "grad_norm": 0.3489152491092682, "learning_rate": 4.325955176025065e-05, "loss": 0.106, "num_input_tokens_seen": 83482896, "step": 38670 }, { "epoch": 6.309135399673735, "grad_norm": 2.6188087463378906, "learning_rate": 4.3257120646123825e-05, "loss": 0.317, "num_input_tokens_seen": 83494384, "step": 38675 }, { "epoch": 6.309951060358891, "grad_norm": 0.40788933634757996, "learning_rate": 4.325468916199328e-05, "loss": 0.0407, "num_input_tokens_seen": 83504848, "step": 38680 }, { "epoch": 6.310766721044046, "grad_norm": 0.062415167689323425, "learning_rate": 4.325225730790831e-05, "loss": 0.1044, "num_input_tokens_seen": 83516208, "step": 38685 }, { "epoch": 6.311582381729201, "grad_norm": 3.0080199241638184, "learning_rate": 4.3249825083918186e-05, "loss": 0.1497, "num_input_tokens_seen": 83527056, "step": 38690 }, { "epoch": 6.3123980424143555, "grad_norm": 1.0182570219039917, "learning_rate": 4.32473924900722e-05, "loss": 0.1543, "num_input_tokens_seen": 83537392, "step": 38695 }, { "epoch": 6.31321370309951, "grad_norm": 0.14400827884674072, "learning_rate": 4.324495952641966e-05, "loss": 0.0994, "num_input_tokens_seen": 83548944, "step": 38700 }, { "epoch": 6.314029363784665, "grad_norm": 0.3544236123561859, "learning_rate": 4.324252619300988e-05, "loss": 0.0595, "num_input_tokens_seen": 83558832, "step": 38705 }, { "epoch": 6.314845024469821, "grad_norm": 0.5311485528945923, "learning_rate": 4.324009248989215e-05, "loss": 0.0946, "num_input_tokens_seen": 83567696, "step": 38710 }, { "epoch": 6.315660685154976, "grad_norm": 0.053179170936346054, "learning_rate": 4.323765841711581e-05, "loss": 0.0638, "num_input_tokens_seen": 83578480, "step": 38715 }, { "epoch": 6.3164763458401305, "grad_norm": 0.04240097478032112, "learning_rate": 4.3235223974730196e-05, "loss": 0.1267, "num_input_tokens_seen": 83588496, "step": 38720 }, { "epoch": 6.317292006525285, "grad_norm": 0.29735320806503296, "learning_rate": 4.323278916278463e-05, "loss": 0.0766, "num_input_tokens_seen": 83599376, "step": 38725 }, { "epoch": 6.31810766721044, "grad_norm": 2.2188782691955566, "learning_rate": 4.323035398132845e-05, "loss": 0.14, "num_input_tokens_seen": 83610032, "step": 38730 }, { "epoch": 6.318923327895595, "grad_norm": 0.5497426390647888, "learning_rate": 4.322791843041103e-05, "loss": 0.0287, "num_input_tokens_seen": 83621392, "step": 38735 }, { "epoch": 6.319738988580751, "grad_norm": 0.42787331342697144, "learning_rate": 4.322548251008173e-05, "loss": 0.0509, "num_input_tokens_seen": 83632368, "step": 38740 }, { "epoch": 6.3205546492659055, "grad_norm": 0.2728378176689148, "learning_rate": 4.322304622038989e-05, "loss": 0.0197, "num_input_tokens_seen": 83642800, "step": 38745 }, { "epoch": 6.32137030995106, "grad_norm": 1.1684383153915405, "learning_rate": 4.3220609561384905e-05, "loss": 0.1638, "num_input_tokens_seen": 83653552, "step": 38750 }, { "epoch": 6.322185970636215, "grad_norm": 0.8974997997283936, "learning_rate": 4.321817253311615e-05, "loss": 0.0789, "num_input_tokens_seen": 83664624, "step": 38755 }, { "epoch": 6.32300163132137, "grad_norm": 0.37612926959991455, "learning_rate": 4.3215735135633024e-05, "loss": 0.0702, "num_input_tokens_seen": 83675536, "step": 38760 }, { "epoch": 6.323817292006526, "grad_norm": 0.049866706132888794, "learning_rate": 4.321329736898492e-05, "loss": 0.0305, "num_input_tokens_seen": 83685712, "step": 38765 }, { "epoch": 6.3246329526916805, "grad_norm": 0.14421215653419495, "learning_rate": 4.321085923322123e-05, "loss": 0.0931, "num_input_tokens_seen": 83696112, "step": 38770 }, { "epoch": 6.325448613376835, "grad_norm": 0.4453831613063812, "learning_rate": 4.320842072839137e-05, "loss": 0.079, "num_input_tokens_seen": 83706608, "step": 38775 }, { "epoch": 6.32626427406199, "grad_norm": 0.11152726411819458, "learning_rate": 4.320598185454478e-05, "loss": 0.1595, "num_input_tokens_seen": 83718288, "step": 38780 }, { "epoch": 6.327079934747145, "grad_norm": 0.695723831653595, "learning_rate": 4.320354261173086e-05, "loss": 0.0498, "num_input_tokens_seen": 83729328, "step": 38785 }, { "epoch": 6.327895595432301, "grad_norm": 1.4521132707595825, "learning_rate": 4.320110299999907e-05, "loss": 0.2245, "num_input_tokens_seen": 83740016, "step": 38790 }, { "epoch": 6.328711256117455, "grad_norm": 1.667677640914917, "learning_rate": 4.3198663019398826e-05, "loss": 0.189, "num_input_tokens_seen": 83751120, "step": 38795 }, { "epoch": 6.32952691680261, "grad_norm": 0.26582005620002747, "learning_rate": 4.31962226699796e-05, "loss": 0.1619, "num_input_tokens_seen": 83760880, "step": 38800 }, { "epoch": 6.330342577487765, "grad_norm": 1.3921222686767578, "learning_rate": 4.3193781951790826e-05, "loss": 0.1285, "num_input_tokens_seen": 83771440, "step": 38805 }, { "epoch": 6.33115823817292, "grad_norm": 0.04812769591808319, "learning_rate": 4.319134086488199e-05, "loss": 0.1506, "num_input_tokens_seen": 83781424, "step": 38810 }, { "epoch": 6.331973898858075, "grad_norm": 0.06948009133338928, "learning_rate": 4.318889940930255e-05, "loss": 0.1, "num_input_tokens_seen": 83792848, "step": 38815 }, { "epoch": 6.33278955954323, "grad_norm": 0.6475440859794617, "learning_rate": 4.318645758510199e-05, "loss": 0.1495, "num_input_tokens_seen": 83803152, "step": 38820 }, { "epoch": 6.333605220228385, "grad_norm": 0.3237905204296112, "learning_rate": 4.318401539232979e-05, "loss": 0.0859, "num_input_tokens_seen": 83814672, "step": 38825 }, { "epoch": 6.33442088091354, "grad_norm": 0.39385583996772766, "learning_rate": 4.318157283103545e-05, "loss": 0.1276, "num_input_tokens_seen": 83825904, "step": 38830 }, { "epoch": 6.335236541598695, "grad_norm": 0.23665624856948853, "learning_rate": 4.317912990126848e-05, "loss": 0.0216, "num_input_tokens_seen": 83837936, "step": 38835 }, { "epoch": 6.33605220228385, "grad_norm": 0.8367473483085632, "learning_rate": 4.3176686603078376e-05, "loss": 0.1515, "num_input_tokens_seen": 83848944, "step": 38840 }, { "epoch": 6.3368678629690045, "grad_norm": 1.0872827768325806, "learning_rate": 4.3174242936514666e-05, "loss": 0.1254, "num_input_tokens_seen": 83861136, "step": 38845 }, { "epoch": 6.33768352365416, "grad_norm": 1.246837854385376, "learning_rate": 4.317179890162686e-05, "loss": 0.1918, "num_input_tokens_seen": 83873136, "step": 38850 }, { "epoch": 6.338499184339315, "grad_norm": 0.3481810986995697, "learning_rate": 4.31693544984645e-05, "loss": 0.1374, "num_input_tokens_seen": 83884496, "step": 38855 }, { "epoch": 6.33931484502447, "grad_norm": 0.17428918182849884, "learning_rate": 4.316690972707712e-05, "loss": 0.0189, "num_input_tokens_seen": 83895664, "step": 38860 }, { "epoch": 6.340130505709625, "grad_norm": 1.4248552322387695, "learning_rate": 4.316446458751426e-05, "loss": 0.2046, "num_input_tokens_seen": 83907664, "step": 38865 }, { "epoch": 6.3409461663947795, "grad_norm": 0.2511727511882782, "learning_rate": 4.316201907982549e-05, "loss": 0.0786, "num_input_tokens_seen": 83918896, "step": 38870 }, { "epoch": 6.341761827079935, "grad_norm": 0.7537267804145813, "learning_rate": 4.3159573204060366e-05, "loss": 0.109, "num_input_tokens_seen": 83929008, "step": 38875 }, { "epoch": 6.34257748776509, "grad_norm": 0.11141622066497803, "learning_rate": 4.315712696026845e-05, "loss": 0.0558, "num_input_tokens_seen": 83937776, "step": 38880 }, { "epoch": 6.343393148450245, "grad_norm": 0.20395693182945251, "learning_rate": 4.315468034849932e-05, "loss": 0.0269, "num_input_tokens_seen": 83948816, "step": 38885 }, { "epoch": 6.3442088091354, "grad_norm": 0.7052071690559387, "learning_rate": 4.315223336880257e-05, "loss": 0.0747, "num_input_tokens_seen": 83959440, "step": 38890 }, { "epoch": 6.3450244698205545, "grad_norm": 0.3122289478778839, "learning_rate": 4.314978602122778e-05, "loss": 0.0552, "num_input_tokens_seen": 83971248, "step": 38895 }, { "epoch": 6.345840130505709, "grad_norm": 0.08700529485940933, "learning_rate": 4.314733830582455e-05, "loss": 0.0527, "num_input_tokens_seen": 83981584, "step": 38900 }, { "epoch": 6.346655791190865, "grad_norm": 0.7627732157707214, "learning_rate": 4.314489022264248e-05, "loss": 0.1352, "num_input_tokens_seen": 83992112, "step": 38905 }, { "epoch": 6.34747145187602, "grad_norm": 0.6012852191925049, "learning_rate": 4.3142441771731204e-05, "loss": 0.1168, "num_input_tokens_seen": 84003472, "step": 38910 }, { "epoch": 6.348287112561175, "grad_norm": 0.12558774650096893, "learning_rate": 4.3139992953140326e-05, "loss": 0.0853, "num_input_tokens_seen": 84013840, "step": 38915 }, { "epoch": 6.349102773246329, "grad_norm": 0.018248235806822777, "learning_rate": 4.3137543766919476e-05, "loss": 0.0606, "num_input_tokens_seen": 84024912, "step": 38920 }, { "epoch": 6.349918433931484, "grad_norm": 0.02019411511719227, "learning_rate": 4.313509421311829e-05, "loss": 0.1235, "num_input_tokens_seen": 84035632, "step": 38925 }, { "epoch": 6.350734094616639, "grad_norm": 1.2337946891784668, "learning_rate": 4.313264429178642e-05, "loss": 0.1217, "num_input_tokens_seen": 84046448, "step": 38930 }, { "epoch": 6.351549755301795, "grad_norm": 0.30779632925987244, "learning_rate": 4.313019400297351e-05, "loss": 0.1398, "num_input_tokens_seen": 84057648, "step": 38935 }, { "epoch": 6.35236541598695, "grad_norm": 2.6218156814575195, "learning_rate": 4.312774334672921e-05, "loss": 0.098, "num_input_tokens_seen": 84068880, "step": 38940 }, { "epoch": 6.353181076672104, "grad_norm": 0.7159109115600586, "learning_rate": 4.312529232310321e-05, "loss": 0.0682, "num_input_tokens_seen": 84079696, "step": 38945 }, { "epoch": 6.353996737357259, "grad_norm": 1.8376902341842651, "learning_rate": 4.312284093214515e-05, "loss": 0.2055, "num_input_tokens_seen": 84089456, "step": 38950 }, { "epoch": 6.354812398042414, "grad_norm": 0.20053403079509735, "learning_rate": 4.312038917390474e-05, "loss": 0.0369, "num_input_tokens_seen": 84099440, "step": 38955 }, { "epoch": 6.35562805872757, "grad_norm": 0.9295114874839783, "learning_rate": 4.3117937048431654e-05, "loss": 0.2549, "num_input_tokens_seen": 84109232, "step": 38960 }, { "epoch": 6.356443719412725, "grad_norm": 0.2072593867778778, "learning_rate": 4.3115484555775587e-05, "loss": 0.1181, "num_input_tokens_seen": 84121712, "step": 38965 }, { "epoch": 6.357259380097879, "grad_norm": 0.22535960376262665, "learning_rate": 4.311303169598625e-05, "loss": 0.0283, "num_input_tokens_seen": 84132688, "step": 38970 }, { "epoch": 6.358075040783034, "grad_norm": 1.5135385990142822, "learning_rate": 4.311057846911334e-05, "loss": 0.0859, "num_input_tokens_seen": 84142544, "step": 38975 }, { "epoch": 6.358890701468189, "grad_norm": 2.254579544067383, "learning_rate": 4.3108124875206585e-05, "loss": 0.26, "num_input_tokens_seen": 84151312, "step": 38980 }, { "epoch": 6.359706362153344, "grad_norm": 0.9471266865730286, "learning_rate": 4.310567091431571e-05, "loss": 0.0943, "num_input_tokens_seen": 84162512, "step": 38985 }, { "epoch": 6.3605220228384995, "grad_norm": 1.401062250137329, "learning_rate": 4.310321658649044e-05, "loss": 0.118, "num_input_tokens_seen": 84174480, "step": 38990 }, { "epoch": 6.361337683523654, "grad_norm": 0.07463658601045609, "learning_rate": 4.3100761891780525e-05, "loss": 0.1026, "num_input_tokens_seen": 84185712, "step": 38995 }, { "epoch": 6.362153344208809, "grad_norm": 0.0967262014746666, "learning_rate": 4.3098306830235705e-05, "loss": 0.189, "num_input_tokens_seen": 84195824, "step": 39000 }, { "epoch": 6.362969004893964, "grad_norm": 1.4787036180496216, "learning_rate": 4.309585140190574e-05, "loss": 0.2224, "num_input_tokens_seen": 84207344, "step": 39005 }, { "epoch": 6.363784665579119, "grad_norm": 0.05871058255434036, "learning_rate": 4.309339560684039e-05, "loss": 0.0252, "num_input_tokens_seen": 84218512, "step": 39010 }, { "epoch": 6.364600326264274, "grad_norm": 0.7143253684043884, "learning_rate": 4.309093944508943e-05, "loss": 0.0328, "num_input_tokens_seen": 84229520, "step": 39015 }, { "epoch": 6.365415986949429, "grad_norm": 0.02585838921368122, "learning_rate": 4.3088482916702623e-05, "loss": 0.1606, "num_input_tokens_seen": 84240752, "step": 39020 }, { "epoch": 6.366231647634584, "grad_norm": 0.2306431531906128, "learning_rate": 4.308602602172976e-05, "loss": 0.1282, "num_input_tokens_seen": 84250224, "step": 39025 }, { "epoch": 6.367047308319739, "grad_norm": 0.13748221099376678, "learning_rate": 4.308356876022065e-05, "loss": 0.0205, "num_input_tokens_seen": 84260784, "step": 39030 }, { "epoch": 6.367862969004894, "grad_norm": 0.32660576701164246, "learning_rate": 4.308111113222507e-05, "loss": 0.0373, "num_input_tokens_seen": 84272560, "step": 39035 }, { "epoch": 6.368678629690049, "grad_norm": 0.2408522665500641, "learning_rate": 4.3078653137792836e-05, "loss": 0.0657, "num_input_tokens_seen": 84282640, "step": 39040 }, { "epoch": 6.369494290375204, "grad_norm": 1.0197035074234009, "learning_rate": 4.3076194776973764e-05, "loss": 0.2136, "num_input_tokens_seen": 84293936, "step": 39045 }, { "epoch": 6.370309951060359, "grad_norm": 0.9208832383155823, "learning_rate": 4.307373604981767e-05, "loss": 0.1862, "num_input_tokens_seen": 84303376, "step": 39050 }, { "epoch": 6.371125611745514, "grad_norm": 1.3574087619781494, "learning_rate": 4.307127695637439e-05, "loss": 0.1198, "num_input_tokens_seen": 84313968, "step": 39055 }, { "epoch": 6.371941272430669, "grad_norm": 0.20247633755207062, "learning_rate": 4.306881749669376e-05, "loss": 0.1383, "num_input_tokens_seen": 84325520, "step": 39060 }, { "epoch": 6.372756933115824, "grad_norm": 0.3451400399208069, "learning_rate": 4.3066357670825605e-05, "loss": 0.0379, "num_input_tokens_seen": 84336496, "step": 39065 }, { "epoch": 6.373572593800978, "grad_norm": 0.1376425176858902, "learning_rate": 4.3063897478819815e-05, "loss": 0.1183, "num_input_tokens_seen": 84346928, "step": 39070 }, { "epoch": 6.374388254486134, "grad_norm": 0.09061737358570099, "learning_rate": 4.3061436920726216e-05, "loss": 0.0594, "num_input_tokens_seen": 84358032, "step": 39075 }, { "epoch": 6.375203915171289, "grad_norm": 0.09200412780046463, "learning_rate": 4.305897599659469e-05, "loss": 0.0271, "num_input_tokens_seen": 84368816, "step": 39080 }, { "epoch": 6.376019575856444, "grad_norm": 0.07017379999160767, "learning_rate": 4.3056514706475096e-05, "loss": 0.0484, "num_input_tokens_seen": 84378800, "step": 39085 }, { "epoch": 6.376835236541599, "grad_norm": 0.051460832357406616, "learning_rate": 4.305405305041733e-05, "loss": 0.0354, "num_input_tokens_seen": 84389584, "step": 39090 }, { "epoch": 6.377650897226753, "grad_norm": 0.31021207571029663, "learning_rate": 4.3051591028471286e-05, "loss": 0.2279, "num_input_tokens_seen": 84398768, "step": 39095 }, { "epoch": 6.378466557911908, "grad_norm": 0.2292499840259552, "learning_rate": 4.304912864068683e-05, "loss": 0.0282, "num_input_tokens_seen": 84408464, "step": 39100 }, { "epoch": 6.379282218597064, "grad_norm": 0.08480731397867203, "learning_rate": 4.30466658871139e-05, "loss": 0.0302, "num_input_tokens_seen": 84419600, "step": 39105 }, { "epoch": 6.380097879282219, "grad_norm": 0.929795503616333, "learning_rate": 4.304420276780239e-05, "loss": 0.0579, "num_input_tokens_seen": 84428784, "step": 39110 }, { "epoch": 6.3809135399673735, "grad_norm": 0.07373305410146713, "learning_rate": 4.3041739282802215e-05, "loss": 0.0526, "num_input_tokens_seen": 84439696, "step": 39115 }, { "epoch": 6.381729200652528, "grad_norm": 0.5105882883071899, "learning_rate": 4.303927543216331e-05, "loss": 0.102, "num_input_tokens_seen": 84451280, "step": 39120 }, { "epoch": 6.382544861337683, "grad_norm": 1.8176530599594116, "learning_rate": 4.3036811215935605e-05, "loss": 0.2246, "num_input_tokens_seen": 84462512, "step": 39125 }, { "epoch": 6.383360522022839, "grad_norm": 0.13532865047454834, "learning_rate": 4.303434663416904e-05, "loss": 0.0433, "num_input_tokens_seen": 84473616, "step": 39130 }, { "epoch": 6.384176182707994, "grad_norm": 0.6562988758087158, "learning_rate": 4.303188168691356e-05, "loss": 0.0934, "num_input_tokens_seen": 84485776, "step": 39135 }, { "epoch": 6.3849918433931485, "grad_norm": 0.21690398454666138, "learning_rate": 4.3029416374219124e-05, "loss": 0.0656, "num_input_tokens_seen": 84498064, "step": 39140 }, { "epoch": 6.385807504078303, "grad_norm": 0.04478035122156143, "learning_rate": 4.3026950696135684e-05, "loss": 0.2507, "num_input_tokens_seen": 84509296, "step": 39145 }, { "epoch": 6.386623164763458, "grad_norm": 1.2403806447982788, "learning_rate": 4.3024484652713226e-05, "loss": 0.0509, "num_input_tokens_seen": 84519888, "step": 39150 }, { "epoch": 6.387438825448613, "grad_norm": 0.5875434279441833, "learning_rate": 4.302201824400173e-05, "loss": 0.1737, "num_input_tokens_seen": 84530352, "step": 39155 }, { "epoch": 6.388254486133769, "grad_norm": 0.06157580763101578, "learning_rate": 4.301955147005116e-05, "loss": 0.184, "num_input_tokens_seen": 84540912, "step": 39160 }, { "epoch": 6.3890701468189235, "grad_norm": 0.8514369130134583, "learning_rate": 4.301708433091153e-05, "loss": 0.0518, "num_input_tokens_seen": 84551760, "step": 39165 }, { "epoch": 6.389885807504078, "grad_norm": 0.4937780797481537, "learning_rate": 4.301461682663281e-05, "loss": 0.0809, "num_input_tokens_seen": 84562960, "step": 39170 }, { "epoch": 6.390701468189233, "grad_norm": 0.8653057217597961, "learning_rate": 4.301214895726505e-05, "loss": 0.1407, "num_input_tokens_seen": 84573264, "step": 39175 }, { "epoch": 6.391517128874388, "grad_norm": 1.8532251119613647, "learning_rate": 4.300968072285823e-05, "loss": 0.0499, "num_input_tokens_seen": 84583632, "step": 39180 }, { "epoch": 6.392332789559543, "grad_norm": 1.450795292854309, "learning_rate": 4.300721212346238e-05, "loss": 0.3932, "num_input_tokens_seen": 84594960, "step": 39185 }, { "epoch": 6.3931484502446985, "grad_norm": 1.6274287700653076, "learning_rate": 4.300474315912754e-05, "loss": 0.1299, "num_input_tokens_seen": 84604912, "step": 39190 }, { "epoch": 6.393964110929853, "grad_norm": 0.23119963705539703, "learning_rate": 4.300227382990374e-05, "loss": 0.1818, "num_input_tokens_seen": 84615344, "step": 39195 }, { "epoch": 6.394779771615008, "grad_norm": 0.6857329607009888, "learning_rate": 4.2999804135841024e-05, "loss": 0.081, "num_input_tokens_seen": 84625744, "step": 39200 }, { "epoch": 6.395595432300163, "grad_norm": 1.7416266202926636, "learning_rate": 4.299733407698943e-05, "loss": 0.0837, "num_input_tokens_seen": 84636784, "step": 39205 }, { "epoch": 6.396411092985318, "grad_norm": 1.045867681503296, "learning_rate": 4.299486365339904e-05, "loss": 0.1325, "num_input_tokens_seen": 84648656, "step": 39210 }, { "epoch": 6.397226753670473, "grad_norm": 0.0398913249373436, "learning_rate": 4.299239286511991e-05, "loss": 0.058, "num_input_tokens_seen": 84659728, "step": 39215 }, { "epoch": 6.398042414355628, "grad_norm": 2.1129379272460938, "learning_rate": 4.298992171220211e-05, "loss": 0.1069, "num_input_tokens_seen": 84670672, "step": 39220 }, { "epoch": 6.398858075040783, "grad_norm": 0.9271186590194702, "learning_rate": 4.2987450194695725e-05, "loss": 0.1817, "num_input_tokens_seen": 84679952, "step": 39225 }, { "epoch": 6.399673735725938, "grad_norm": 0.18470484018325806, "learning_rate": 4.298497831265085e-05, "loss": 0.0548, "num_input_tokens_seen": 84691024, "step": 39230 }, { "epoch": 6.400489396411093, "grad_norm": 0.47370198369026184, "learning_rate": 4.298250606611757e-05, "loss": 0.1671, "num_input_tokens_seen": 84700976, "step": 39235 }, { "epoch": 6.401305057096248, "grad_norm": 0.21253135800361633, "learning_rate": 4.298003345514599e-05, "loss": 0.0278, "num_input_tokens_seen": 84710896, "step": 39240 }, { "epoch": 6.402120717781403, "grad_norm": 0.7803694009780884, "learning_rate": 4.2977560479786225e-05, "loss": 0.1691, "num_input_tokens_seen": 84722448, "step": 39245 }, { "epoch": 6.402936378466558, "grad_norm": 2.0484273433685303, "learning_rate": 4.2975087140088386e-05, "loss": 0.2622, "num_input_tokens_seen": 84733168, "step": 39250 }, { "epoch": 6.403752039151713, "grad_norm": 1.6184850931167603, "learning_rate": 4.29726134361026e-05, "loss": 0.1273, "num_input_tokens_seen": 84744176, "step": 39255 }, { "epoch": 6.404567699836868, "grad_norm": 0.32930222153663635, "learning_rate": 4.297013936787902e-05, "loss": 0.0213, "num_input_tokens_seen": 84754704, "step": 39260 }, { "epoch": 6.4053833605220225, "grad_norm": 1.728234052658081, "learning_rate": 4.2967664935467754e-05, "loss": 0.1874, "num_input_tokens_seen": 84765584, "step": 39265 }, { "epoch": 6.406199021207178, "grad_norm": 0.8304633498191833, "learning_rate": 4.2965190138918966e-05, "loss": 0.21, "num_input_tokens_seen": 84776144, "step": 39270 }, { "epoch": 6.407014681892333, "grad_norm": 0.09245866537094116, "learning_rate": 4.2962714978282816e-05, "loss": 0.151, "num_input_tokens_seen": 84785488, "step": 39275 }, { "epoch": 6.407830342577488, "grad_norm": 1.3745512962341309, "learning_rate": 4.296023945360945e-05, "loss": 0.1927, "num_input_tokens_seen": 84796080, "step": 39280 }, { "epoch": 6.408646003262643, "grad_norm": 0.1503588855266571, "learning_rate": 4.295776356494906e-05, "loss": 0.129, "num_input_tokens_seen": 84805520, "step": 39285 }, { "epoch": 6.4094616639477975, "grad_norm": 0.09595408290624619, "learning_rate": 4.29552873123518e-05, "loss": 0.0543, "num_input_tokens_seen": 84816464, "step": 39290 }, { "epoch": 6.410277324632952, "grad_norm": 0.5858474969863892, "learning_rate": 4.295281069586787e-05, "loss": 0.1787, "num_input_tokens_seen": 84826640, "step": 39295 }, { "epoch": 6.411092985318108, "grad_norm": 1.1980241537094116, "learning_rate": 4.295033371554745e-05, "loss": 0.2389, "num_input_tokens_seen": 84836304, "step": 39300 }, { "epoch": 6.411908646003263, "grad_norm": 1.9036451578140259, "learning_rate": 4.2947856371440755e-05, "loss": 0.073, "num_input_tokens_seen": 84847280, "step": 39305 }, { "epoch": 6.412724306688418, "grad_norm": 0.14763551950454712, "learning_rate": 4.294537866359797e-05, "loss": 0.2191, "num_input_tokens_seen": 84859696, "step": 39310 }, { "epoch": 6.4135399673735725, "grad_norm": 0.278119295835495, "learning_rate": 4.2942900592069334e-05, "loss": 0.058, "num_input_tokens_seen": 84870576, "step": 39315 }, { "epoch": 6.414355628058727, "grad_norm": 0.035355955362319946, "learning_rate": 4.294042215690505e-05, "loss": 0.0667, "num_input_tokens_seen": 84882224, "step": 39320 }, { "epoch": 6.415171288743883, "grad_norm": 0.8850900530815125, "learning_rate": 4.293794335815535e-05, "loss": 0.1971, "num_input_tokens_seen": 84892496, "step": 39325 }, { "epoch": 6.415986949429038, "grad_norm": 0.671586811542511, "learning_rate": 4.293546419587048e-05, "loss": 0.2415, "num_input_tokens_seen": 84903440, "step": 39330 }, { "epoch": 6.416802610114193, "grad_norm": 0.894951343536377, "learning_rate": 4.293298467010066e-05, "loss": 0.0873, "num_input_tokens_seen": 84914608, "step": 39335 }, { "epoch": 6.417618270799347, "grad_norm": 0.42364493012428284, "learning_rate": 4.293050478089617e-05, "loss": 0.0541, "num_input_tokens_seen": 84926768, "step": 39340 }, { "epoch": 6.418433931484502, "grad_norm": 1.944488286972046, "learning_rate": 4.292802452830725e-05, "loss": 0.1313, "num_input_tokens_seen": 84937488, "step": 39345 }, { "epoch": 6.419249592169657, "grad_norm": 0.0664239153265953, "learning_rate": 4.292554391238417e-05, "loss": 0.0946, "num_input_tokens_seen": 84949840, "step": 39350 }, { "epoch": 6.420065252854813, "grad_norm": 1.1663720607757568, "learning_rate": 4.29230629331772e-05, "loss": 0.254, "num_input_tokens_seen": 84959088, "step": 39355 }, { "epoch": 6.420880913539968, "grad_norm": 0.09654213488101959, "learning_rate": 4.292058159073662e-05, "loss": 0.0789, "num_input_tokens_seen": 84969232, "step": 39360 }, { "epoch": 6.421696574225122, "grad_norm": 0.19823183119297028, "learning_rate": 4.2918099885112725e-05, "loss": 0.1519, "num_input_tokens_seen": 84981488, "step": 39365 }, { "epoch": 6.422512234910277, "grad_norm": 0.5644437670707703, "learning_rate": 4.2915617816355805e-05, "loss": 0.135, "num_input_tokens_seen": 84992688, "step": 39370 }, { "epoch": 6.423327895595432, "grad_norm": 0.11132444441318512, "learning_rate": 4.291313538451616e-05, "loss": 0.0469, "num_input_tokens_seen": 85004144, "step": 39375 }, { "epoch": 6.424143556280587, "grad_norm": 1.6744318008422852, "learning_rate": 4.291065258964411e-05, "loss": 0.1224, "num_input_tokens_seen": 85015408, "step": 39380 }, { "epoch": 6.424959216965743, "grad_norm": 0.3297322988510132, "learning_rate": 4.290816943178995e-05, "loss": 0.0472, "num_input_tokens_seen": 85026064, "step": 39385 }, { "epoch": 6.425774877650897, "grad_norm": 0.38028770685195923, "learning_rate": 4.290568591100403e-05, "loss": 0.1048, "num_input_tokens_seen": 85037232, "step": 39390 }, { "epoch": 6.426590538336052, "grad_norm": 2.3428704738616943, "learning_rate": 4.2903202027336665e-05, "loss": 0.1855, "num_input_tokens_seen": 85047888, "step": 39395 }, { "epoch": 6.427406199021207, "grad_norm": 0.326084166765213, "learning_rate": 4.29007177808382e-05, "loss": 0.1178, "num_input_tokens_seen": 85058768, "step": 39400 }, { "epoch": 6.428221859706362, "grad_norm": 1.0000559091567993, "learning_rate": 4.2898233171558974e-05, "loss": 0.0973, "num_input_tokens_seen": 85069296, "step": 39405 }, { "epoch": 6.4290375203915175, "grad_norm": 0.21378399431705475, "learning_rate": 4.2895748199549356e-05, "loss": 0.1111, "num_input_tokens_seen": 85080528, "step": 39410 }, { "epoch": 6.429853181076672, "grad_norm": 0.0646069198846817, "learning_rate": 4.28932628648597e-05, "loss": 0.1309, "num_input_tokens_seen": 85091024, "step": 39415 }, { "epoch": 6.430668841761827, "grad_norm": 0.7738844156265259, "learning_rate": 4.289077716754037e-05, "loss": 0.0849, "num_input_tokens_seen": 85100656, "step": 39420 }, { "epoch": 6.431484502446982, "grad_norm": 0.23428276181221008, "learning_rate": 4.288829110764174e-05, "loss": 0.0299, "num_input_tokens_seen": 85111856, "step": 39425 }, { "epoch": 6.432300163132137, "grad_norm": 0.5574290752410889, "learning_rate": 4.28858046852142e-05, "loss": 0.1564, "num_input_tokens_seen": 85120944, "step": 39430 }, { "epoch": 6.433115823817292, "grad_norm": 0.8975366950035095, "learning_rate": 4.288331790030814e-05, "loss": 0.0963, "num_input_tokens_seen": 85131440, "step": 39435 }, { "epoch": 6.433931484502447, "grad_norm": 0.2942159175872803, "learning_rate": 4.2880830752973963e-05, "loss": 0.1177, "num_input_tokens_seen": 85140720, "step": 39440 }, { "epoch": 6.434747145187602, "grad_norm": 0.8616366982460022, "learning_rate": 4.287834324326205e-05, "loss": 0.2251, "num_input_tokens_seen": 85151152, "step": 39445 }, { "epoch": 6.435562805872757, "grad_norm": 1.0752993822097778, "learning_rate": 4.2875855371222846e-05, "loss": 0.0677, "num_input_tokens_seen": 85161392, "step": 39450 }, { "epoch": 6.436378466557912, "grad_norm": 0.8312980532646179, "learning_rate": 4.2873367136906753e-05, "loss": 0.1189, "num_input_tokens_seen": 85172656, "step": 39455 }, { "epoch": 6.437194127243067, "grad_norm": 0.1877436488866806, "learning_rate": 4.2870878540364204e-05, "loss": 0.0557, "num_input_tokens_seen": 85184784, "step": 39460 }, { "epoch": 6.438009787928221, "grad_norm": 1.9706863164901733, "learning_rate": 4.286838958164562e-05, "loss": 0.114, "num_input_tokens_seen": 85195248, "step": 39465 }, { "epoch": 6.438825448613377, "grad_norm": 0.6651427745819092, "learning_rate": 4.286590026080146e-05, "loss": 0.0935, "num_input_tokens_seen": 85205040, "step": 39470 }, { "epoch": 6.439641109298532, "grad_norm": 0.40226179361343384, "learning_rate": 4.2863410577882164e-05, "loss": 0.0464, "num_input_tokens_seen": 85215344, "step": 39475 }, { "epoch": 6.440456769983687, "grad_norm": 3.37798810005188, "learning_rate": 4.286092053293819e-05, "loss": 0.1727, "num_input_tokens_seen": 85226352, "step": 39480 }, { "epoch": 6.441272430668842, "grad_norm": 0.1343870460987091, "learning_rate": 4.285843012602001e-05, "loss": 0.0364, "num_input_tokens_seen": 85235408, "step": 39485 }, { "epoch": 6.442088091353996, "grad_norm": 0.12777750194072723, "learning_rate": 4.2855939357178084e-05, "loss": 0.1305, "num_input_tokens_seen": 85246352, "step": 39490 }, { "epoch": 6.442903752039152, "grad_norm": 1.1776199340820312, "learning_rate": 4.2853448226462895e-05, "loss": 0.1974, "num_input_tokens_seen": 85256368, "step": 39495 }, { "epoch": 6.443719412724307, "grad_norm": 0.05635562539100647, "learning_rate": 4.285095673392493e-05, "loss": 0.0529, "num_input_tokens_seen": 85267856, "step": 39500 }, { "epoch": 6.444535073409462, "grad_norm": 0.2121029794216156, "learning_rate": 4.284846487961468e-05, "loss": 0.147, "num_input_tokens_seen": 85280112, "step": 39505 }, { "epoch": 6.445350734094617, "grad_norm": 0.5487963557243347, "learning_rate": 4.2845972663582646e-05, "loss": 0.0483, "num_input_tokens_seen": 85288688, "step": 39510 }, { "epoch": 6.446166394779771, "grad_norm": 0.3492763638496399, "learning_rate": 4.284348008587934e-05, "loss": 0.3026, "num_input_tokens_seen": 85299888, "step": 39515 }, { "epoch": 6.446982055464926, "grad_norm": 0.09885407984256744, "learning_rate": 4.284098714655527e-05, "loss": 0.0628, "num_input_tokens_seen": 85310128, "step": 39520 }, { "epoch": 6.447797716150082, "grad_norm": 1.987382411956787, "learning_rate": 4.2838493845660965e-05, "loss": 0.073, "num_input_tokens_seen": 85321072, "step": 39525 }, { "epoch": 6.448613376835237, "grad_norm": 0.6154369115829468, "learning_rate": 4.283600018324695e-05, "loss": 0.0914, "num_input_tokens_seen": 85333008, "step": 39530 }, { "epoch": 6.4494290375203915, "grad_norm": 0.7712268829345703, "learning_rate": 4.2833506159363766e-05, "loss": 0.0726, "num_input_tokens_seen": 85344048, "step": 39535 }, { "epoch": 6.450244698205546, "grad_norm": 0.18268677592277527, "learning_rate": 4.283101177406196e-05, "loss": 0.0878, "num_input_tokens_seen": 85353136, "step": 39540 }, { "epoch": 6.451060358890701, "grad_norm": 1.182808756828308, "learning_rate": 4.282851702739208e-05, "loss": 0.1331, "num_input_tokens_seen": 85364016, "step": 39545 }, { "epoch": 6.451876019575856, "grad_norm": 1.1700252294540405, "learning_rate": 4.2826021919404686e-05, "loss": 0.1386, "num_input_tokens_seen": 85374928, "step": 39550 }, { "epoch": 6.452691680261012, "grad_norm": 2.1310689449310303, "learning_rate": 4.282352645015034e-05, "loss": 0.1948, "num_input_tokens_seen": 85383888, "step": 39555 }, { "epoch": 6.4535073409461665, "grad_norm": 2.887540817260742, "learning_rate": 4.282103061967962e-05, "loss": 0.2355, "num_input_tokens_seen": 85394736, "step": 39560 }, { "epoch": 6.454323001631321, "grad_norm": 0.26535460352897644, "learning_rate": 4.281853442804311e-05, "loss": 0.1599, "num_input_tokens_seen": 85405232, "step": 39565 }, { "epoch": 6.455138662316476, "grad_norm": 0.5261968970298767, "learning_rate": 4.2816037875291395e-05, "loss": 0.1337, "num_input_tokens_seen": 85416176, "step": 39570 }, { "epoch": 6.455954323001631, "grad_norm": 0.9078853130340576, "learning_rate": 4.281354096147506e-05, "loss": 0.1085, "num_input_tokens_seen": 85426800, "step": 39575 }, { "epoch": 6.456769983686787, "grad_norm": 0.6707747578620911, "learning_rate": 4.2811043686644736e-05, "loss": 0.09, "num_input_tokens_seen": 85437520, "step": 39580 }, { "epoch": 6.4575856443719415, "grad_norm": 0.6769729852676392, "learning_rate": 4.280854605085101e-05, "loss": 0.1031, "num_input_tokens_seen": 85447888, "step": 39585 }, { "epoch": 6.458401305057096, "grad_norm": 0.03026416152715683, "learning_rate": 4.28060480541445e-05, "loss": 0.0755, "num_input_tokens_seen": 85459344, "step": 39590 }, { "epoch": 6.459216965742251, "grad_norm": 0.4135826826095581, "learning_rate": 4.2803549696575845e-05, "loss": 0.185, "num_input_tokens_seen": 85469744, "step": 39595 }, { "epoch": 6.460032626427406, "grad_norm": 0.2260814905166626, "learning_rate": 4.280105097819567e-05, "loss": 0.032, "num_input_tokens_seen": 85481168, "step": 39600 }, { "epoch": 6.460848287112561, "grad_norm": 0.11565981805324554, "learning_rate": 4.2798551899054614e-05, "loss": 0.0617, "num_input_tokens_seen": 85492976, "step": 39605 }, { "epoch": 6.4616639477977165, "grad_norm": 0.3464037775993347, "learning_rate": 4.279605245920332e-05, "loss": 0.1225, "num_input_tokens_seen": 85502896, "step": 39610 }, { "epoch": 6.462479608482871, "grad_norm": 2.6389403343200684, "learning_rate": 4.2793552658692446e-05, "loss": 0.2919, "num_input_tokens_seen": 85513680, "step": 39615 }, { "epoch": 6.463295269168026, "grad_norm": 0.11758792400360107, "learning_rate": 4.2791052497572666e-05, "loss": 0.1791, "num_input_tokens_seen": 85524784, "step": 39620 }, { "epoch": 6.464110929853181, "grad_norm": 1.3761310577392578, "learning_rate": 4.278855197589463e-05, "loss": 0.1831, "num_input_tokens_seen": 85536208, "step": 39625 }, { "epoch": 6.464926590538336, "grad_norm": 0.1887134164571762, "learning_rate": 4.278605109370902e-05, "loss": 0.0668, "num_input_tokens_seen": 85545936, "step": 39630 }, { "epoch": 6.465742251223491, "grad_norm": 0.09270903468132019, "learning_rate": 4.278354985106653e-05, "loss": 0.066, "num_input_tokens_seen": 85556080, "step": 39635 }, { "epoch": 6.466557911908646, "grad_norm": 1.9888060092926025, "learning_rate": 4.278104824801783e-05, "loss": 0.2042, "num_input_tokens_seen": 85565840, "step": 39640 }, { "epoch": 6.467373572593801, "grad_norm": 0.2874290347099304, "learning_rate": 4.2778546284613644e-05, "loss": 0.2001, "num_input_tokens_seen": 85576912, "step": 39645 }, { "epoch": 6.468189233278956, "grad_norm": 0.13380229473114014, "learning_rate": 4.277604396090466e-05, "loss": 0.0901, "num_input_tokens_seen": 85587760, "step": 39650 }, { "epoch": 6.469004893964111, "grad_norm": 0.026236291974782944, "learning_rate": 4.277354127694159e-05, "loss": 0.0512, "num_input_tokens_seen": 85597552, "step": 39655 }, { "epoch": 6.4698205546492655, "grad_norm": 1.3664212226867676, "learning_rate": 4.2771038232775155e-05, "loss": 0.1522, "num_input_tokens_seen": 85608368, "step": 39660 }, { "epoch": 6.470636215334421, "grad_norm": 1.4358186721801758, "learning_rate": 4.2768534828456094e-05, "loss": 0.0715, "num_input_tokens_seen": 85618096, "step": 39665 }, { "epoch": 6.471451876019576, "grad_norm": 0.24879442155361176, "learning_rate": 4.276603106403513e-05, "loss": 0.0493, "num_input_tokens_seen": 85628656, "step": 39670 }, { "epoch": 6.472267536704731, "grad_norm": 0.6420644521713257, "learning_rate": 4.276352693956301e-05, "loss": 0.0424, "num_input_tokens_seen": 85639632, "step": 39675 }, { "epoch": 6.473083197389886, "grad_norm": 0.10978254675865173, "learning_rate": 4.276102245509048e-05, "loss": 0.1425, "num_input_tokens_seen": 85650832, "step": 39680 }, { "epoch": 6.4738988580750405, "grad_norm": 0.2610606551170349, "learning_rate": 4.27585176106683e-05, "loss": 0.0818, "num_input_tokens_seen": 85661296, "step": 39685 }, { "epoch": 6.474714518760196, "grad_norm": 1.3490689992904663, "learning_rate": 4.2756012406347226e-05, "loss": 0.0928, "num_input_tokens_seen": 85672368, "step": 39690 }, { "epoch": 6.475530179445351, "grad_norm": 0.06368512660264969, "learning_rate": 4.2753506842178045e-05, "loss": 0.0638, "num_input_tokens_seen": 85682160, "step": 39695 }, { "epoch": 6.476345840130506, "grad_norm": 0.567844808101654, "learning_rate": 4.275100091821152e-05, "loss": 0.0488, "num_input_tokens_seen": 85692656, "step": 39700 }, { "epoch": 6.477161500815661, "grad_norm": 2.3809802532196045, "learning_rate": 4.274849463449845e-05, "loss": 0.1382, "num_input_tokens_seen": 85703600, "step": 39705 }, { "epoch": 6.4779771615008155, "grad_norm": 0.9584880471229553, "learning_rate": 4.2745987991089614e-05, "loss": 0.1198, "num_input_tokens_seen": 85714512, "step": 39710 }, { "epoch": 6.47879282218597, "grad_norm": 0.6426225304603577, "learning_rate": 4.2743480988035825e-05, "loss": 0.1972, "num_input_tokens_seen": 85723888, "step": 39715 }, { "epoch": 6.479608482871126, "grad_norm": 0.7222511172294617, "learning_rate": 4.274097362538788e-05, "loss": 0.0501, "num_input_tokens_seen": 85734992, "step": 39720 }, { "epoch": 6.480424143556281, "grad_norm": 0.4657454192638397, "learning_rate": 4.27384659031966e-05, "loss": 0.0437, "num_input_tokens_seen": 85746768, "step": 39725 }, { "epoch": 6.481239804241436, "grad_norm": 0.34518295526504517, "learning_rate": 4.273595782151281e-05, "loss": 0.0587, "num_input_tokens_seen": 85758352, "step": 39730 }, { "epoch": 6.4820554649265905, "grad_norm": 0.12400789558887482, "learning_rate": 4.273344938038732e-05, "loss": 0.0276, "num_input_tokens_seen": 85769424, "step": 39735 }, { "epoch": 6.482871125611745, "grad_norm": 0.8857760429382324, "learning_rate": 4.2730940579870994e-05, "loss": 0.1842, "num_input_tokens_seen": 85780816, "step": 39740 }, { "epoch": 6.4836867862969, "grad_norm": 0.05512961000204086, "learning_rate": 4.272843142001466e-05, "loss": 0.2103, "num_input_tokens_seen": 85792528, "step": 39745 }, { "epoch": 6.484502446982056, "grad_norm": 0.08004128932952881, "learning_rate": 4.272592190086918e-05, "loss": 0.0513, "num_input_tokens_seen": 85802448, "step": 39750 }, { "epoch": 6.485318107667211, "grad_norm": 0.47735849022865295, "learning_rate": 4.2723412022485395e-05, "loss": 0.0399, "num_input_tokens_seen": 85813104, "step": 39755 }, { "epoch": 6.486133768352365, "grad_norm": 0.2103656381368637, "learning_rate": 4.272090178491419e-05, "loss": 0.1825, "num_input_tokens_seen": 85823248, "step": 39760 }, { "epoch": 6.48694942903752, "grad_norm": 0.06453648209571838, "learning_rate": 4.271839118820642e-05, "loss": 0.3276, "num_input_tokens_seen": 85834384, "step": 39765 }, { "epoch": 6.487765089722675, "grad_norm": 0.7788646221160889, "learning_rate": 4.271588023241299e-05, "loss": 0.1187, "num_input_tokens_seen": 85844848, "step": 39770 }, { "epoch": 6.488580750407831, "grad_norm": 1.8190304040908813, "learning_rate": 4.2713368917584763e-05, "loss": 0.2444, "num_input_tokens_seen": 85856528, "step": 39775 }, { "epoch": 6.489396411092986, "grad_norm": 0.7966713905334473, "learning_rate": 4.271085724377264e-05, "loss": 0.022, "num_input_tokens_seen": 85868112, "step": 39780 }, { "epoch": 6.49021207177814, "grad_norm": 0.323266863822937, "learning_rate": 4.2708345211027534e-05, "loss": 0.1217, "num_input_tokens_seen": 85878256, "step": 39785 }, { "epoch": 6.491027732463295, "grad_norm": 0.053891316056251526, "learning_rate": 4.270583281940035e-05, "loss": 0.0387, "num_input_tokens_seen": 85887440, "step": 39790 }, { "epoch": 6.49184339314845, "grad_norm": 1.6534243822097778, "learning_rate": 4.2703320068942e-05, "loss": 0.0897, "num_input_tokens_seen": 85897872, "step": 39795 }, { "epoch": 6.492659053833605, "grad_norm": 0.10240885615348816, "learning_rate": 4.270080695970341e-05, "loss": 0.0776, "num_input_tokens_seen": 85909776, "step": 39800 }, { "epoch": 6.493474714518761, "grad_norm": 1.549275517463684, "learning_rate": 4.269829349173551e-05, "loss": 0.2659, "num_input_tokens_seen": 85919984, "step": 39805 }, { "epoch": 6.494290375203915, "grad_norm": 0.263126015663147, "learning_rate": 4.269577966508924e-05, "loss": 0.0576, "num_input_tokens_seen": 85931632, "step": 39810 }, { "epoch": 6.49510603588907, "grad_norm": 0.41262564063072205, "learning_rate": 4.2693265479815546e-05, "loss": 0.0405, "num_input_tokens_seen": 85944176, "step": 39815 }, { "epoch": 6.495921696574225, "grad_norm": 0.2561017870903015, "learning_rate": 4.269075093596539e-05, "loss": 0.0395, "num_input_tokens_seen": 85956016, "step": 39820 }, { "epoch": 6.49673735725938, "grad_norm": 0.41526156663894653, "learning_rate": 4.2688236033589716e-05, "loss": 0.0679, "num_input_tokens_seen": 85966704, "step": 39825 }, { "epoch": 6.497553017944535, "grad_norm": 0.028116773813962936, "learning_rate": 4.26857207727395e-05, "loss": 0.2341, "num_input_tokens_seen": 85976048, "step": 39830 }, { "epoch": 6.49836867862969, "grad_norm": 0.3919609785079956, "learning_rate": 4.268320515346572e-05, "loss": 0.1561, "num_input_tokens_seen": 85985424, "step": 39835 }, { "epoch": 6.499184339314845, "grad_norm": 0.09462449699640274, "learning_rate": 4.268068917581935e-05, "loss": 0.0972, "num_input_tokens_seen": 85996976, "step": 39840 }, { "epoch": 6.5, "grad_norm": 0.9212145209312439, "learning_rate": 4.2678172839851386e-05, "loss": 0.0587, "num_input_tokens_seen": 86008784, "step": 39845 }, { "epoch": 6.500815660685155, "grad_norm": 1.0564749240875244, "learning_rate": 4.2675656145612826e-05, "loss": 0.167, "num_input_tokens_seen": 86019568, "step": 39850 }, { "epoch": 6.50163132137031, "grad_norm": 0.4540676176548004, "learning_rate": 4.267313909315467e-05, "loss": 0.0501, "num_input_tokens_seen": 86030448, "step": 39855 }, { "epoch": 6.502446982055465, "grad_norm": 1.419492244720459, "learning_rate": 4.267062168252793e-05, "loss": 0.1969, "num_input_tokens_seen": 86041040, "step": 39860 }, { "epoch": 6.50326264274062, "grad_norm": 0.31846266984939575, "learning_rate": 4.2668103913783616e-05, "loss": 0.0192, "num_input_tokens_seen": 86052624, "step": 39865 }, { "epoch": 6.504078303425775, "grad_norm": 0.04585165157914162, "learning_rate": 4.266558578697278e-05, "loss": 0.1295, "num_input_tokens_seen": 86063472, "step": 39870 }, { "epoch": 6.50489396411093, "grad_norm": 0.11437567323446274, "learning_rate": 4.266306730214643e-05, "loss": 0.0489, "num_input_tokens_seen": 86074320, "step": 39875 }, { "epoch": 6.505709624796085, "grad_norm": 0.14322881400585175, "learning_rate": 4.266054845935561e-05, "loss": 0.1667, "num_input_tokens_seen": 86086320, "step": 39880 }, { "epoch": 6.506525285481239, "grad_norm": 0.039491910487413406, "learning_rate": 4.265802925865138e-05, "loss": 0.1779, "num_input_tokens_seen": 86098000, "step": 39885 }, { "epoch": 6.507340946166395, "grad_norm": 0.1090041920542717, "learning_rate": 4.2655509700084775e-05, "loss": 0.0941, "num_input_tokens_seen": 86107088, "step": 39890 }, { "epoch": 6.50815660685155, "grad_norm": 0.01346888393163681, "learning_rate": 4.2652989783706877e-05, "loss": 0.2156, "num_input_tokens_seen": 86118096, "step": 39895 }, { "epoch": 6.508972267536705, "grad_norm": 1.9566055536270142, "learning_rate": 4.265046950956874e-05, "loss": 0.1117, "num_input_tokens_seen": 86129136, "step": 39900 }, { "epoch": 6.50978792822186, "grad_norm": 1.7893949747085571, "learning_rate": 4.264794887772145e-05, "loss": 0.2186, "num_input_tokens_seen": 86140816, "step": 39905 }, { "epoch": 6.510603588907014, "grad_norm": 0.14563600718975067, "learning_rate": 4.2645427888216086e-05, "loss": 0.059, "num_input_tokens_seen": 86151664, "step": 39910 }, { "epoch": 6.511419249592169, "grad_norm": 0.30569514632225037, "learning_rate": 4.264290654110374e-05, "loss": 0.0829, "num_input_tokens_seen": 86161744, "step": 39915 }, { "epoch": 6.512234910277325, "grad_norm": 0.9758583307266235, "learning_rate": 4.2640384836435524e-05, "loss": 0.0975, "num_input_tokens_seen": 86171312, "step": 39920 }, { "epoch": 6.51305057096248, "grad_norm": 0.16598330438137054, "learning_rate": 4.2637862774262514e-05, "loss": 0.0694, "num_input_tokens_seen": 86181392, "step": 39925 }, { "epoch": 6.513866231647635, "grad_norm": 0.5714299082756042, "learning_rate": 4.263534035463585e-05, "loss": 0.0889, "num_input_tokens_seen": 86191728, "step": 39930 }, { "epoch": 6.514681892332789, "grad_norm": 0.09098022431135178, "learning_rate": 4.2632817577606637e-05, "loss": 0.0206, "num_input_tokens_seen": 86201712, "step": 39935 }, { "epoch": 6.515497553017944, "grad_norm": 0.3250993490219116, "learning_rate": 4.263029444322601e-05, "loss": 0.2508, "num_input_tokens_seen": 86212464, "step": 39940 }, { "epoch": 6.5163132137031, "grad_norm": 0.9641734957695007, "learning_rate": 4.26277709515451e-05, "loss": 0.122, "num_input_tokens_seen": 86224208, "step": 39945 }, { "epoch": 6.517128874388255, "grad_norm": 1.5052003860473633, "learning_rate": 4.262524710261505e-05, "loss": 0.1574, "num_input_tokens_seen": 86234544, "step": 39950 }, { "epoch": 6.5179445350734095, "grad_norm": 0.8441208004951477, "learning_rate": 4.262272289648701e-05, "loss": 0.1546, "num_input_tokens_seen": 86245776, "step": 39955 }, { "epoch": 6.518760195758564, "grad_norm": 0.11663489788770676, "learning_rate": 4.2620198333212126e-05, "loss": 0.1607, "num_input_tokens_seen": 86256976, "step": 39960 }, { "epoch": 6.519575856443719, "grad_norm": 1.4450113773345947, "learning_rate": 4.261767341284157e-05, "loss": 0.2664, "num_input_tokens_seen": 86267568, "step": 39965 }, { "epoch": 6.520391517128875, "grad_norm": 0.7292010188102722, "learning_rate": 4.261514813542653e-05, "loss": 0.0476, "num_input_tokens_seen": 86278768, "step": 39970 }, { "epoch": 6.52120717781403, "grad_norm": 0.5242245197296143, "learning_rate": 4.261262250101815e-05, "loss": 0.0584, "num_input_tokens_seen": 86288080, "step": 39975 }, { "epoch": 6.5220228384991845, "grad_norm": 0.31615036725997925, "learning_rate": 4.261009650966764e-05, "loss": 0.0564, "num_input_tokens_seen": 86299120, "step": 39980 }, { "epoch": 6.522838499184339, "grad_norm": 0.20611104369163513, "learning_rate": 4.2607570161426184e-05, "loss": 0.0643, "num_input_tokens_seen": 86311056, "step": 39985 }, { "epoch": 6.523654159869494, "grad_norm": 2.216777801513672, "learning_rate": 4.260504345634497e-05, "loss": 0.2213, "num_input_tokens_seen": 86321904, "step": 39990 }, { "epoch": 6.524469820554649, "grad_norm": 0.10878393799066544, "learning_rate": 4.2602516394475225e-05, "loss": 0.2091, "num_input_tokens_seen": 86332880, "step": 39995 }, { "epoch": 6.525285481239804, "grad_norm": 0.44401776790618896, "learning_rate": 4.259998897586816e-05, "loss": 0.0545, "num_input_tokens_seen": 86344080, "step": 40000 }, { "epoch": 6.5261011419249595, "grad_norm": 0.43529099225997925, "learning_rate": 4.259746120057498e-05, "loss": 0.0977, "num_input_tokens_seen": 86356240, "step": 40005 }, { "epoch": 6.526916802610114, "grad_norm": 0.6913860440254211, "learning_rate": 4.2594933068646936e-05, "loss": 0.0679, "num_input_tokens_seen": 86367824, "step": 40010 }, { "epoch": 6.527732463295269, "grad_norm": 0.6612952351570129, "learning_rate": 4.259240458013525e-05, "loss": 0.198, "num_input_tokens_seen": 86379632, "step": 40015 }, { "epoch": 6.528548123980424, "grad_norm": 0.11716829985380173, "learning_rate": 4.258987573509117e-05, "loss": 0.1243, "num_input_tokens_seen": 86391248, "step": 40020 }, { "epoch": 6.529363784665579, "grad_norm": 0.13622669875621796, "learning_rate": 4.258734653356594e-05, "loss": 0.1596, "num_input_tokens_seen": 86401104, "step": 40025 }, { "epoch": 6.5301794453507345, "grad_norm": 1.5855791568756104, "learning_rate": 4.258481697561082e-05, "loss": 0.2258, "num_input_tokens_seen": 86412528, "step": 40030 }, { "epoch": 6.530995106035889, "grad_norm": 0.1626371443271637, "learning_rate": 4.258228706127707e-05, "loss": 0.1389, "num_input_tokens_seen": 86422800, "step": 40035 }, { "epoch": 6.531810766721044, "grad_norm": 0.07920366525650024, "learning_rate": 4.2579756790615974e-05, "loss": 0.0319, "num_input_tokens_seen": 86434448, "step": 40040 }, { "epoch": 6.532626427406199, "grad_norm": 0.22585423290729523, "learning_rate": 4.2577226163678804e-05, "loss": 0.0603, "num_input_tokens_seen": 86447312, "step": 40045 }, { "epoch": 6.533442088091354, "grad_norm": 0.899922788143158, "learning_rate": 4.2574695180516853e-05, "loss": 0.0868, "num_input_tokens_seen": 86458800, "step": 40050 }, { "epoch": 6.5342577487765094, "grad_norm": 1.0320013761520386, "learning_rate": 4.257216384118141e-05, "loss": 0.0813, "num_input_tokens_seen": 86470928, "step": 40055 }, { "epoch": 6.535073409461664, "grad_norm": 0.1532883644104004, "learning_rate": 4.256963214572377e-05, "loss": 0.1231, "num_input_tokens_seen": 86481520, "step": 40060 }, { "epoch": 6.535889070146819, "grad_norm": 0.36665093898773193, "learning_rate": 4.256710009419524e-05, "loss": 0.0836, "num_input_tokens_seen": 86492080, "step": 40065 }, { "epoch": 6.536704730831974, "grad_norm": 0.05898764729499817, "learning_rate": 4.2564567686647156e-05, "loss": 0.0585, "num_input_tokens_seen": 86503280, "step": 40070 }, { "epoch": 6.537520391517129, "grad_norm": 2.2565362453460693, "learning_rate": 4.256203492313081e-05, "loss": 0.1471, "num_input_tokens_seen": 86513808, "step": 40075 }, { "epoch": 6.5383360522022835, "grad_norm": 0.12183164060115814, "learning_rate": 4.255950180369756e-05, "loss": 0.0652, "num_input_tokens_seen": 86524944, "step": 40080 }, { "epoch": 6.539151712887438, "grad_norm": 0.06250113993883133, "learning_rate": 4.2556968328398724e-05, "loss": 0.2855, "num_input_tokens_seen": 86535824, "step": 40085 }, { "epoch": 6.539967373572594, "grad_norm": 1.3708949089050293, "learning_rate": 4.255443449728566e-05, "loss": 0.0821, "num_input_tokens_seen": 86546992, "step": 40090 }, { "epoch": 6.540783034257749, "grad_norm": 0.13818585872650146, "learning_rate": 4.25519003104097e-05, "loss": 0.0512, "num_input_tokens_seen": 86558352, "step": 40095 }, { "epoch": 6.541598694942904, "grad_norm": 0.08133967965841293, "learning_rate": 4.2549365767822214e-05, "loss": 0.1082, "num_input_tokens_seen": 86569424, "step": 40100 }, { "epoch": 6.5424143556280585, "grad_norm": 0.35925576090812683, "learning_rate": 4.2546830869574574e-05, "loss": 0.1075, "num_input_tokens_seen": 86579024, "step": 40105 }, { "epoch": 6.543230016313213, "grad_norm": 0.08659115433692932, "learning_rate": 4.254429561571814e-05, "loss": 0.1408, "num_input_tokens_seen": 86590832, "step": 40110 }, { "epoch": 6.544045676998369, "grad_norm": 1.0323584079742432, "learning_rate": 4.254176000630431e-05, "loss": 0.0685, "num_input_tokens_seen": 86602736, "step": 40115 }, { "epoch": 6.544861337683524, "grad_norm": 1.156099796295166, "learning_rate": 4.253922404138445e-05, "loss": 0.0559, "num_input_tokens_seen": 86613712, "step": 40120 }, { "epoch": 6.545676998368679, "grad_norm": 0.041000254452228546, "learning_rate": 4.253668772100997e-05, "loss": 0.188, "num_input_tokens_seen": 86624112, "step": 40125 }, { "epoch": 6.5464926590538335, "grad_norm": 0.032062772661447525, "learning_rate": 4.253415104523227e-05, "loss": 0.0324, "num_input_tokens_seen": 86634000, "step": 40130 }, { "epoch": 6.547308319738988, "grad_norm": 1.9693059921264648, "learning_rate": 4.253161401410275e-05, "loss": 0.2227, "num_input_tokens_seen": 86644816, "step": 40135 }, { "epoch": 6.548123980424144, "grad_norm": 1.4463609457015991, "learning_rate": 4.252907662767283e-05, "loss": 0.2894, "num_input_tokens_seen": 86656496, "step": 40140 }, { "epoch": 6.548939641109299, "grad_norm": 0.04420240595936775, "learning_rate": 4.252653888599394e-05, "loss": 0.1128, "num_input_tokens_seen": 86668400, "step": 40145 }, { "epoch": 6.549755301794454, "grad_norm": 0.21017175912857056, "learning_rate": 4.2524000789117505e-05, "loss": 0.0192, "num_input_tokens_seen": 86678832, "step": 40150 }, { "epoch": 6.5505709624796085, "grad_norm": 1.481643795967102, "learning_rate": 4.2521462337094954e-05, "loss": 0.1363, "num_input_tokens_seen": 86689040, "step": 40155 }, { "epoch": 6.551386623164763, "grad_norm": 0.08216149359941483, "learning_rate": 4.251892352997775e-05, "loss": 0.2252, "num_input_tokens_seen": 86699952, "step": 40160 }, { "epoch": 6.552202283849918, "grad_norm": 1.36674964427948, "learning_rate": 4.251638436781733e-05, "loss": 0.1187, "num_input_tokens_seen": 86710320, "step": 40165 }, { "epoch": 6.553017944535073, "grad_norm": 0.12796515226364136, "learning_rate": 4.251384485066516e-05, "loss": 0.0572, "num_input_tokens_seen": 86721648, "step": 40170 }, { "epoch": 6.553833605220229, "grad_norm": 0.40581339597702026, "learning_rate": 4.251130497857272e-05, "loss": 0.1265, "num_input_tokens_seen": 86732560, "step": 40175 }, { "epoch": 6.554649265905383, "grad_norm": 0.060393255203962326, "learning_rate": 4.2508764751591454e-05, "loss": 0.0227, "num_input_tokens_seen": 86743408, "step": 40180 }, { "epoch": 6.555464926590538, "grad_norm": 0.49002179503440857, "learning_rate": 4.250622416977287e-05, "loss": 0.1297, "num_input_tokens_seen": 86754384, "step": 40185 }, { "epoch": 6.556280587275693, "grad_norm": 1.0807275772094727, "learning_rate": 4.2503683233168436e-05, "loss": 0.0347, "num_input_tokens_seen": 86764592, "step": 40190 }, { "epoch": 6.557096247960848, "grad_norm": 0.11089518666267395, "learning_rate": 4.250114194182967e-05, "loss": 0.0928, "num_input_tokens_seen": 86776176, "step": 40195 }, { "epoch": 6.557911908646004, "grad_norm": 1.6057310104370117, "learning_rate": 4.249860029580804e-05, "loss": 0.0969, "num_input_tokens_seen": 86786320, "step": 40200 }, { "epoch": 6.558727569331158, "grad_norm": 0.7693774104118347, "learning_rate": 4.2496058295155095e-05, "loss": 0.0305, "num_input_tokens_seen": 86798288, "step": 40205 }, { "epoch": 6.559543230016313, "grad_norm": 0.14651550352573395, "learning_rate": 4.2493515939922324e-05, "loss": 0.1318, "num_input_tokens_seen": 86809456, "step": 40210 }, { "epoch": 6.560358890701468, "grad_norm": 0.5474736094474792, "learning_rate": 4.2490973230161264e-05, "loss": 0.0993, "num_input_tokens_seen": 86819824, "step": 40215 }, { "epoch": 6.561174551386623, "grad_norm": 1.3855684995651245, "learning_rate": 4.2488430165923434e-05, "loss": 0.1817, "num_input_tokens_seen": 86829872, "step": 40220 }, { "epoch": 6.561990212071779, "grad_norm": 0.06075911596417427, "learning_rate": 4.248588674726039e-05, "loss": 0.1061, "num_input_tokens_seen": 86839472, "step": 40225 }, { "epoch": 6.562805872756933, "grad_norm": 0.3447500765323639, "learning_rate": 4.248334297422367e-05, "loss": 0.0726, "num_input_tokens_seen": 86851184, "step": 40230 }, { "epoch": 6.563621533442088, "grad_norm": 0.13720601797103882, "learning_rate": 4.2480798846864824e-05, "loss": 0.0775, "num_input_tokens_seen": 86861136, "step": 40235 }, { "epoch": 6.564437194127243, "grad_norm": 0.07651445269584656, "learning_rate": 4.2478254365235406e-05, "loss": 0.1125, "num_input_tokens_seen": 86871344, "step": 40240 }, { "epoch": 6.565252854812398, "grad_norm": 1.5029317140579224, "learning_rate": 4.2475709529386995e-05, "loss": 0.0507, "num_input_tokens_seen": 86881488, "step": 40245 }, { "epoch": 6.566068515497553, "grad_norm": 0.6412254571914673, "learning_rate": 4.2473164339371164e-05, "loss": 0.1285, "num_input_tokens_seen": 86893264, "step": 40250 }, { "epoch": 6.566884176182708, "grad_norm": 0.14250876009464264, "learning_rate": 4.247061879523949e-05, "loss": 0.0683, "num_input_tokens_seen": 86904208, "step": 40255 }, { "epoch": 6.567699836867863, "grad_norm": 0.3163307309150696, "learning_rate": 4.246807289704356e-05, "loss": 0.1471, "num_input_tokens_seen": 86916080, "step": 40260 }, { "epoch": 6.568515497553018, "grad_norm": 0.14223036170005798, "learning_rate": 4.2465526644834975e-05, "loss": 0.0354, "num_input_tokens_seen": 86926640, "step": 40265 }, { "epoch": 6.569331158238173, "grad_norm": 0.047141317278146744, "learning_rate": 4.2462980038665335e-05, "loss": 0.1017, "num_input_tokens_seen": 86936688, "step": 40270 }, { "epoch": 6.570146818923328, "grad_norm": 0.05763428658246994, "learning_rate": 4.2460433078586254e-05, "loss": 0.0574, "num_input_tokens_seen": 86948272, "step": 40275 }, { "epoch": 6.5709624796084825, "grad_norm": 6.245945453643799, "learning_rate": 4.245788576464934e-05, "loss": 0.1475, "num_input_tokens_seen": 86959184, "step": 40280 }, { "epoch": 6.571778140293638, "grad_norm": 1.3927820920944214, "learning_rate": 4.245533809690623e-05, "loss": 0.0735, "num_input_tokens_seen": 86970032, "step": 40285 }, { "epoch": 6.572593800978793, "grad_norm": 0.0984271839261055, "learning_rate": 4.245279007540855e-05, "loss": 0.1059, "num_input_tokens_seen": 86980048, "step": 40290 }, { "epoch": 6.573409461663948, "grad_norm": 0.9341477751731873, "learning_rate": 4.245024170020794e-05, "loss": 0.0811, "num_input_tokens_seen": 86991888, "step": 40295 }, { "epoch": 6.574225122349103, "grad_norm": 0.17056173086166382, "learning_rate": 4.244769297135605e-05, "loss": 0.0277, "num_input_tokens_seen": 87003056, "step": 40300 }, { "epoch": 6.575040783034257, "grad_norm": 2.373762607574463, "learning_rate": 4.244514388890451e-05, "loss": 0.0613, "num_input_tokens_seen": 87014192, "step": 40305 }, { "epoch": 6.575856443719413, "grad_norm": 0.2082851529121399, "learning_rate": 4.244259445290501e-05, "loss": 0.2382, "num_input_tokens_seen": 87024112, "step": 40310 }, { "epoch": 6.576672104404568, "grad_norm": 0.09027546644210815, "learning_rate": 4.244004466340921e-05, "loss": 0.1113, "num_input_tokens_seen": 87034896, "step": 40315 }, { "epoch": 6.577487765089723, "grad_norm": 0.13306187093257904, "learning_rate": 4.243749452046877e-05, "loss": 0.1195, "num_input_tokens_seen": 87045392, "step": 40320 }, { "epoch": 6.578303425774878, "grad_norm": 2.4725875854492188, "learning_rate": 4.243494402413539e-05, "loss": 0.1012, "num_input_tokens_seen": 87056432, "step": 40325 }, { "epoch": 6.579119086460032, "grad_norm": 0.2005823701620102, "learning_rate": 4.243239317446075e-05, "loss": 0.1098, "num_input_tokens_seen": 87067440, "step": 40330 }, { "epoch": 6.579934747145187, "grad_norm": 0.06052769720554352, "learning_rate": 4.242984197149654e-05, "loss": 0.0691, "num_input_tokens_seen": 87078064, "step": 40335 }, { "epoch": 6.580750407830343, "grad_norm": 0.15560388565063477, "learning_rate": 4.242729041529448e-05, "loss": 0.0297, "num_input_tokens_seen": 87087920, "step": 40340 }, { "epoch": 6.581566068515498, "grad_norm": 0.16460160911083221, "learning_rate": 4.242473850590627e-05, "loss": 0.0701, "num_input_tokens_seen": 87098032, "step": 40345 }, { "epoch": 6.582381729200653, "grad_norm": 0.09418448805809021, "learning_rate": 4.2422186243383625e-05, "loss": 0.1741, "num_input_tokens_seen": 87108304, "step": 40350 }, { "epoch": 6.583197389885807, "grad_norm": 0.34154918789863586, "learning_rate": 4.241963362777828e-05, "loss": 0.0631, "num_input_tokens_seen": 87119088, "step": 40355 }, { "epoch": 6.584013050570962, "grad_norm": 0.5870181918144226, "learning_rate": 4.241708065914196e-05, "loss": 0.2465, "num_input_tokens_seen": 87130032, "step": 40360 }, { "epoch": 6.584828711256117, "grad_norm": 0.2117534875869751, "learning_rate": 4.24145273375264e-05, "loss": 0.2755, "num_input_tokens_seen": 87141104, "step": 40365 }, { "epoch": 6.585644371941273, "grad_norm": 0.3414592742919922, "learning_rate": 4.241197366298336e-05, "loss": 0.1235, "num_input_tokens_seen": 87151472, "step": 40370 }, { "epoch": 6.5864600326264275, "grad_norm": 1.562785029411316, "learning_rate": 4.2409419635564585e-05, "loss": 0.0704, "num_input_tokens_seen": 87161680, "step": 40375 }, { "epoch": 6.587275693311582, "grad_norm": 0.5373609066009521, "learning_rate": 4.2406865255321825e-05, "loss": 0.0604, "num_input_tokens_seen": 87172080, "step": 40380 }, { "epoch": 6.588091353996737, "grad_norm": 0.6031992435455322, "learning_rate": 4.2404310522306865e-05, "loss": 0.0528, "num_input_tokens_seen": 87181168, "step": 40385 }, { "epoch": 6.588907014681892, "grad_norm": 0.07086414843797684, "learning_rate": 4.2401755436571475e-05, "loss": 0.1277, "num_input_tokens_seen": 87192144, "step": 40390 }, { "epoch": 6.589722675367048, "grad_norm": 0.5280255675315857, "learning_rate": 4.239919999816744e-05, "loss": 0.1512, "num_input_tokens_seen": 87203344, "step": 40395 }, { "epoch": 6.5905383360522025, "grad_norm": 0.6154842376708984, "learning_rate": 4.239664420714653e-05, "loss": 0.0591, "num_input_tokens_seen": 87214416, "step": 40400 }, { "epoch": 6.591353996737357, "grad_norm": 0.18288040161132812, "learning_rate": 4.239408806356057e-05, "loss": 0.0841, "num_input_tokens_seen": 87225168, "step": 40405 }, { "epoch": 6.592169657422512, "grad_norm": 0.4498758018016815, "learning_rate": 4.239153156746134e-05, "loss": 0.213, "num_input_tokens_seen": 87237232, "step": 40410 }, { "epoch": 6.592985318107667, "grad_norm": 2.453796863555908, "learning_rate": 4.2388974718900666e-05, "loss": 0.3128, "num_input_tokens_seen": 87247696, "step": 40415 }, { "epoch": 6.593800978792823, "grad_norm": 1.6726207733154297, "learning_rate": 4.238641751793036e-05, "loss": 0.2331, "num_input_tokens_seen": 87258448, "step": 40420 }, { "epoch": 6.5946166394779775, "grad_norm": 0.49645891785621643, "learning_rate": 4.2383859964602246e-05, "loss": 0.0328, "num_input_tokens_seen": 87268880, "step": 40425 }, { "epoch": 6.595432300163132, "grad_norm": 0.679786741733551, "learning_rate": 4.2381302058968156e-05, "loss": 0.1294, "num_input_tokens_seen": 87279568, "step": 40430 }, { "epoch": 6.596247960848287, "grad_norm": 0.14859528839588165, "learning_rate": 4.2378743801079925e-05, "loss": 0.1012, "num_input_tokens_seen": 87289776, "step": 40435 }, { "epoch": 6.597063621533442, "grad_norm": 0.01725626178085804, "learning_rate": 4.237618519098942e-05, "loss": 0.1766, "num_input_tokens_seen": 87301488, "step": 40440 }, { "epoch": 6.597879282218597, "grad_norm": 0.018132122233510017, "learning_rate": 4.237362622874846e-05, "loss": 0.0416, "num_input_tokens_seen": 87312112, "step": 40445 }, { "epoch": 6.598694942903752, "grad_norm": 1.1511421203613281, "learning_rate": 4.237106691440893e-05, "loss": 0.0839, "num_input_tokens_seen": 87322416, "step": 40450 }, { "epoch": 6.599510603588907, "grad_norm": 0.13959264755249023, "learning_rate": 4.236850724802269e-05, "loss": 0.054, "num_input_tokens_seen": 87333040, "step": 40455 }, { "epoch": 6.600326264274062, "grad_norm": 0.04031548276543617, "learning_rate": 4.2365947229641625e-05, "loss": 0.0828, "num_input_tokens_seen": 87344112, "step": 40460 }, { "epoch": 6.601141924959217, "grad_norm": 0.48131078481674194, "learning_rate": 4.2363386859317605e-05, "loss": 0.0668, "num_input_tokens_seen": 87353680, "step": 40465 }, { "epoch": 6.601957585644372, "grad_norm": 0.27070340514183044, "learning_rate": 4.236082613710253e-05, "loss": 0.1082, "num_input_tokens_seen": 87363536, "step": 40470 }, { "epoch": 6.602773246329527, "grad_norm": 0.25905701518058777, "learning_rate": 4.2358265063048276e-05, "loss": 0.0777, "num_input_tokens_seen": 87374992, "step": 40475 }, { "epoch": 6.603588907014682, "grad_norm": 0.03465333953499794, "learning_rate": 4.2355703637206765e-05, "loss": 0.0698, "num_input_tokens_seen": 87386448, "step": 40480 }, { "epoch": 6.604404567699837, "grad_norm": 0.7694011330604553, "learning_rate": 4.235314185962991e-05, "loss": 0.2236, "num_input_tokens_seen": 87395760, "step": 40485 }, { "epoch": 6.605220228384992, "grad_norm": 0.549927294254303, "learning_rate": 4.235057973036961e-05, "loss": 0.0528, "num_input_tokens_seen": 87407088, "step": 40490 }, { "epoch": 6.606035889070147, "grad_norm": 1.3963865041732788, "learning_rate": 4.234801724947781e-05, "loss": 0.0392, "num_input_tokens_seen": 87418256, "step": 40495 }, { "epoch": 6.6068515497553015, "grad_norm": 0.2940574288368225, "learning_rate": 4.234545441700643e-05, "loss": 0.0821, "num_input_tokens_seen": 87427984, "step": 40500 }, { "epoch": 6.607667210440457, "grad_norm": 0.030812609940767288, "learning_rate": 4.234289123300741e-05, "loss": 0.2398, "num_input_tokens_seen": 87437424, "step": 40505 }, { "epoch": 6.608482871125612, "grad_norm": 0.2816544473171234, "learning_rate": 4.23403276975327e-05, "loss": 0.1455, "num_input_tokens_seen": 87447504, "step": 40510 }, { "epoch": 6.609298531810767, "grad_norm": 0.332571804523468, "learning_rate": 4.233776381063426e-05, "loss": 0.1026, "num_input_tokens_seen": 87457456, "step": 40515 }, { "epoch": 6.610114192495922, "grad_norm": 0.5557423233985901, "learning_rate": 4.2335199572364027e-05, "loss": 0.0314, "num_input_tokens_seen": 87468464, "step": 40520 }, { "epoch": 6.6109298531810765, "grad_norm": 0.07249295711517334, "learning_rate": 4.233263498277399e-05, "loss": 0.0666, "num_input_tokens_seen": 87479504, "step": 40525 }, { "epoch": 6.611745513866231, "grad_norm": 0.7198641896247864, "learning_rate": 4.233007004191612e-05, "loss": 0.0649, "num_input_tokens_seen": 87489840, "step": 40530 }, { "epoch": 6.612561174551386, "grad_norm": 0.3266218602657318, "learning_rate": 4.2327504749842397e-05, "loss": 0.0332, "num_input_tokens_seen": 87501424, "step": 40535 }, { "epoch": 6.613376835236542, "grad_norm": 0.12262728810310364, "learning_rate": 4.23249391066048e-05, "loss": 0.0599, "num_input_tokens_seen": 87512688, "step": 40540 }, { "epoch": 6.614192495921697, "grad_norm": 0.30084240436553955, "learning_rate": 4.232237311225534e-05, "loss": 0.1049, "num_input_tokens_seen": 87523056, "step": 40545 }, { "epoch": 6.6150081566068515, "grad_norm": 0.5470024943351746, "learning_rate": 4.231980676684601e-05, "loss": 0.092, "num_input_tokens_seen": 87532784, "step": 40550 }, { "epoch": 6.615823817292006, "grad_norm": 0.3427332937717438, "learning_rate": 4.231724007042883e-05, "loss": 0.0612, "num_input_tokens_seen": 87542224, "step": 40555 }, { "epoch": 6.616639477977161, "grad_norm": 2.091020345687866, "learning_rate": 4.23146730230558e-05, "loss": 0.0852, "num_input_tokens_seen": 87554224, "step": 40560 }, { "epoch": 6.617455138662317, "grad_norm": 1.327631950378418, "learning_rate": 4.231210562477896e-05, "loss": 0.0296, "num_input_tokens_seen": 87564944, "step": 40565 }, { "epoch": 6.618270799347472, "grad_norm": 0.050160303711891174, "learning_rate": 4.230953787565035e-05, "loss": 0.0603, "num_input_tokens_seen": 87575696, "step": 40570 }, { "epoch": 6.6190864600326265, "grad_norm": 0.03864225745201111, "learning_rate": 4.2306969775721984e-05, "loss": 0.1137, "num_input_tokens_seen": 87587216, "step": 40575 }, { "epoch": 6.619902120717781, "grad_norm": 0.06782082468271255, "learning_rate": 4.230440132504592e-05, "loss": 0.089, "num_input_tokens_seen": 87597872, "step": 40580 }, { "epoch": 6.620717781402936, "grad_norm": 0.5003640651702881, "learning_rate": 4.230183252367422e-05, "loss": 0.0742, "num_input_tokens_seen": 87608816, "step": 40585 }, { "epoch": 6.621533442088092, "grad_norm": 0.40450921654701233, "learning_rate": 4.229926337165892e-05, "loss": 0.0826, "num_input_tokens_seen": 87619600, "step": 40590 }, { "epoch": 6.622349102773247, "grad_norm": 1.5914863348007202, "learning_rate": 4.2296693869052116e-05, "loss": 0.2419, "num_input_tokens_seen": 87629168, "step": 40595 }, { "epoch": 6.623164763458401, "grad_norm": 0.033918242901563644, "learning_rate": 4.229412401590587e-05, "loss": 0.0794, "num_input_tokens_seen": 87640144, "step": 40600 }, { "epoch": 6.623980424143556, "grad_norm": 0.5998451709747314, "learning_rate": 4.229155381227226e-05, "loss": 0.0713, "num_input_tokens_seen": 87650128, "step": 40605 }, { "epoch": 6.624796084828711, "grad_norm": 1.4012911319732666, "learning_rate": 4.2288983258203365e-05, "loss": 0.2336, "num_input_tokens_seen": 87661264, "step": 40610 }, { "epoch": 6.625611745513866, "grad_norm": 0.068662129342556, "learning_rate": 4.22864123537513e-05, "loss": 0.0537, "num_input_tokens_seen": 87672464, "step": 40615 }, { "epoch": 6.626427406199021, "grad_norm": 0.4054228961467743, "learning_rate": 4.228384109896816e-05, "loss": 0.1257, "num_input_tokens_seen": 87684400, "step": 40620 }, { "epoch": 6.627243066884176, "grad_norm": 0.4220118522644043, "learning_rate": 4.228126949390605e-05, "loss": 0.2321, "num_input_tokens_seen": 87695728, "step": 40625 }, { "epoch": 6.628058727569331, "grad_norm": 0.020854922011494637, "learning_rate": 4.227869753861709e-05, "loss": 0.026, "num_input_tokens_seen": 87705456, "step": 40630 }, { "epoch": 6.628874388254486, "grad_norm": 0.07074521481990814, "learning_rate": 4.227612523315341e-05, "loss": 0.0265, "num_input_tokens_seen": 87716816, "step": 40635 }, { "epoch": 6.629690048939641, "grad_norm": 0.12888561189174652, "learning_rate": 4.227355257756713e-05, "loss": 0.095, "num_input_tokens_seen": 87727216, "step": 40640 }, { "epoch": 6.630505709624796, "grad_norm": 0.0804353654384613, "learning_rate": 4.2270979571910396e-05, "loss": 0.0672, "num_input_tokens_seen": 87737040, "step": 40645 }, { "epoch": 6.631321370309951, "grad_norm": 1.273571252822876, "learning_rate": 4.2268406216235355e-05, "loss": 0.0903, "num_input_tokens_seen": 87746928, "step": 40650 }, { "epoch": 6.632137030995106, "grad_norm": 0.024273058399558067, "learning_rate": 4.226583251059415e-05, "loss": 0.1764, "num_input_tokens_seen": 87757424, "step": 40655 }, { "epoch": 6.632952691680261, "grad_norm": 0.7569583058357239, "learning_rate": 4.226325845503895e-05, "loss": 0.0943, "num_input_tokens_seen": 87768816, "step": 40660 }, { "epoch": 6.633768352365416, "grad_norm": 0.07061678171157837, "learning_rate": 4.226068404962192e-05, "loss": 0.1172, "num_input_tokens_seen": 87780912, "step": 40665 }, { "epoch": 6.634584013050571, "grad_norm": 1.8136012554168701, "learning_rate": 4.225810929439522e-05, "loss": 0.268, "num_input_tokens_seen": 87791920, "step": 40670 }, { "epoch": 6.635399673735726, "grad_norm": 0.2168681025505066, "learning_rate": 4.225553418941105e-05, "loss": 0.0689, "num_input_tokens_seen": 87802352, "step": 40675 }, { "epoch": 6.636215334420881, "grad_norm": 0.0995330736041069, "learning_rate": 4.225295873472159e-05, "loss": 0.0546, "num_input_tokens_seen": 87813488, "step": 40680 }, { "epoch": 6.637030995106036, "grad_norm": 0.6158335208892822, "learning_rate": 4.2250382930379026e-05, "loss": 0.2645, "num_input_tokens_seen": 87824016, "step": 40685 }, { "epoch": 6.637846655791191, "grad_norm": 0.7489133477210999, "learning_rate": 4.2247806776435586e-05, "loss": 0.3494, "num_input_tokens_seen": 87835120, "step": 40690 }, { "epoch": 6.638662316476346, "grad_norm": 0.06676590442657471, "learning_rate": 4.224523027294345e-05, "loss": 0.1509, "num_input_tokens_seen": 87846480, "step": 40695 }, { "epoch": 6.6394779771615005, "grad_norm": 0.31870099902153015, "learning_rate": 4.224265341995484e-05, "loss": 0.0898, "num_input_tokens_seen": 87856304, "step": 40700 }, { "epoch": 6.640293637846656, "grad_norm": 1.6425796747207642, "learning_rate": 4.224007621752199e-05, "loss": 0.1681, "num_input_tokens_seen": 87868080, "step": 40705 }, { "epoch": 6.641109298531811, "grad_norm": 0.13674256205558777, "learning_rate": 4.223749866569713e-05, "loss": 0.0984, "num_input_tokens_seen": 87878416, "step": 40710 }, { "epoch": 6.641924959216966, "grad_norm": 0.10440414398908615, "learning_rate": 4.223492076453248e-05, "loss": 0.1991, "num_input_tokens_seen": 87888720, "step": 40715 }, { "epoch": 6.642740619902121, "grad_norm": 0.325741708278656, "learning_rate": 4.22323425140803e-05, "loss": 0.1341, "num_input_tokens_seen": 87900528, "step": 40720 }, { "epoch": 6.643556280587275, "grad_norm": 0.7736082673072815, "learning_rate": 4.222976391439285e-05, "loss": 0.2233, "num_input_tokens_seen": 87911984, "step": 40725 }, { "epoch": 6.64437194127243, "grad_norm": 0.7411336302757263, "learning_rate": 4.222718496552237e-05, "loss": 0.15, "num_input_tokens_seen": 87922608, "step": 40730 }, { "epoch": 6.645187601957586, "grad_norm": 0.16621199250221252, "learning_rate": 4.222460566752113e-05, "loss": 0.132, "num_input_tokens_seen": 87933936, "step": 40735 }, { "epoch": 6.646003262642741, "grad_norm": 0.06056779623031616, "learning_rate": 4.2222026020441407e-05, "loss": 0.1171, "num_input_tokens_seen": 87945488, "step": 40740 }, { "epoch": 6.646818923327896, "grad_norm": 0.06185683608055115, "learning_rate": 4.221944602433548e-05, "loss": 0.2427, "num_input_tokens_seen": 87956112, "step": 40745 }, { "epoch": 6.64763458401305, "grad_norm": 1.6195652484893799, "learning_rate": 4.221686567925563e-05, "loss": 0.2088, "num_input_tokens_seen": 87965584, "step": 40750 }, { "epoch": 6.648450244698205, "grad_norm": 0.6703631281852722, "learning_rate": 4.221428498525416e-05, "loss": 0.0886, "num_input_tokens_seen": 87975056, "step": 40755 }, { "epoch": 6.649265905383361, "grad_norm": 0.0575174018740654, "learning_rate": 4.221170394238336e-05, "loss": 0.0741, "num_input_tokens_seen": 87985712, "step": 40760 }, { "epoch": 6.650081566068516, "grad_norm": 1.2232264280319214, "learning_rate": 4.220912255069556e-05, "loss": 0.089, "num_input_tokens_seen": 87996336, "step": 40765 }, { "epoch": 6.650897226753671, "grad_norm": 0.5487948060035706, "learning_rate": 4.2206540810243046e-05, "loss": 0.1026, "num_input_tokens_seen": 88007184, "step": 40770 }, { "epoch": 6.651712887438825, "grad_norm": 0.30139610171318054, "learning_rate": 4.220395872107816e-05, "loss": 0.2415, "num_input_tokens_seen": 88017008, "step": 40775 }, { "epoch": 6.65252854812398, "grad_norm": 0.6176438927650452, "learning_rate": 4.2201376283253225e-05, "loss": 0.0538, "num_input_tokens_seen": 88027760, "step": 40780 }, { "epoch": 6.653344208809135, "grad_norm": 1.6680830717086792, "learning_rate": 4.2198793496820576e-05, "loss": 0.2277, "num_input_tokens_seen": 88039696, "step": 40785 }, { "epoch": 6.654159869494291, "grad_norm": 0.07883268594741821, "learning_rate": 4.219621036183256e-05, "loss": 0.0686, "num_input_tokens_seen": 88050544, "step": 40790 }, { "epoch": 6.6549755301794455, "grad_norm": 0.04518939182162285, "learning_rate": 4.219362687834153e-05, "loss": 0.0324, "num_input_tokens_seen": 88061840, "step": 40795 }, { "epoch": 6.6557911908646, "grad_norm": 1.64307701587677, "learning_rate": 4.2191043046399835e-05, "loss": 0.1288, "num_input_tokens_seen": 88072816, "step": 40800 }, { "epoch": 6.656606851549755, "grad_norm": 0.5282414555549622, "learning_rate": 4.2188458866059844e-05, "loss": 0.1382, "num_input_tokens_seen": 88083536, "step": 40805 }, { "epoch": 6.65742251223491, "grad_norm": 2.51845383644104, "learning_rate": 4.218587433737393e-05, "loss": 0.2069, "num_input_tokens_seen": 88094704, "step": 40810 }, { "epoch": 6.658238172920065, "grad_norm": 1.5091089010238647, "learning_rate": 4.2183289460394475e-05, "loss": 0.1824, "num_input_tokens_seen": 88106288, "step": 40815 }, { "epoch": 6.6590538336052205, "grad_norm": 0.02761225961148739, "learning_rate": 4.2180704235173865e-05, "loss": 0.0617, "num_input_tokens_seen": 88114960, "step": 40820 }, { "epoch": 6.659869494290375, "grad_norm": 0.2917468249797821, "learning_rate": 4.217811866176448e-05, "loss": 0.1334, "num_input_tokens_seen": 88124944, "step": 40825 }, { "epoch": 6.66068515497553, "grad_norm": 0.16304560005664825, "learning_rate": 4.217553274021873e-05, "loss": 0.0158, "num_input_tokens_seen": 88135056, "step": 40830 }, { "epoch": 6.661500815660685, "grad_norm": 0.4664228856563568, "learning_rate": 4.217294647058901e-05, "loss": 0.0559, "num_input_tokens_seen": 88146768, "step": 40835 }, { "epoch": 6.66231647634584, "grad_norm": 0.08407339453697205, "learning_rate": 4.217035985292776e-05, "loss": 0.2672, "num_input_tokens_seen": 88157968, "step": 40840 }, { "epoch": 6.6631321370309955, "grad_norm": 0.03874783590435982, "learning_rate": 4.216777288728738e-05, "loss": 0.1285, "num_input_tokens_seen": 88168912, "step": 40845 }, { "epoch": 6.66394779771615, "grad_norm": 0.7041706442832947, "learning_rate": 4.21651855737203e-05, "loss": 0.0845, "num_input_tokens_seen": 88179632, "step": 40850 }, { "epoch": 6.664763458401305, "grad_norm": 0.3568146228790283, "learning_rate": 4.216259791227896e-05, "loss": 0.0397, "num_input_tokens_seen": 88191472, "step": 40855 }, { "epoch": 6.66557911908646, "grad_norm": 0.1288575381040573, "learning_rate": 4.216000990301581e-05, "loss": 0.0781, "num_input_tokens_seen": 88202800, "step": 40860 }, { "epoch": 6.666394779771615, "grad_norm": 0.6208162307739258, "learning_rate": 4.215742154598328e-05, "loss": 0.2187, "num_input_tokens_seen": 88212848, "step": 40865 }, { "epoch": 6.6672104404567705, "grad_norm": 0.697297990322113, "learning_rate": 4.215483284123384e-05, "loss": 0.2009, "num_input_tokens_seen": 88224112, "step": 40870 }, { "epoch": 6.668026101141925, "grad_norm": 0.1175864115357399, "learning_rate": 4.215224378881995e-05, "loss": 0.1634, "num_input_tokens_seen": 88234704, "step": 40875 }, { "epoch": 6.66884176182708, "grad_norm": 0.5160861015319824, "learning_rate": 4.214965438879407e-05, "loss": 0.0558, "num_input_tokens_seen": 88247216, "step": 40880 }, { "epoch": 6.669657422512235, "grad_norm": 0.45359498262405396, "learning_rate": 4.21470646412087e-05, "loss": 0.1424, "num_input_tokens_seen": 88259664, "step": 40885 }, { "epoch": 6.67047308319739, "grad_norm": 0.6059242486953735, "learning_rate": 4.2144474546116305e-05, "loss": 0.1171, "num_input_tokens_seen": 88270128, "step": 40890 }, { "epoch": 6.671288743882545, "grad_norm": 0.20688357949256897, "learning_rate": 4.2141884103569395e-05, "loss": 0.0327, "num_input_tokens_seen": 88280176, "step": 40895 }, { "epoch": 6.672104404567699, "grad_norm": 1.741714358329773, "learning_rate": 4.213929331362045e-05, "loss": 0.1483, "num_input_tokens_seen": 88290288, "step": 40900 }, { "epoch": 6.672920065252855, "grad_norm": 0.0694768875837326, "learning_rate": 4.213670217632199e-05, "loss": 0.1198, "num_input_tokens_seen": 88301296, "step": 40905 }, { "epoch": 6.67373572593801, "grad_norm": 1.0053796768188477, "learning_rate": 4.2134110691726517e-05, "loss": 0.0771, "num_input_tokens_seen": 88312752, "step": 40910 }, { "epoch": 6.674551386623165, "grad_norm": 0.18206050992012024, "learning_rate": 4.2131518859886554e-05, "loss": 0.0303, "num_input_tokens_seen": 88324144, "step": 40915 }, { "epoch": 6.6753670473083195, "grad_norm": 0.16729618608951569, "learning_rate": 4.212892668085463e-05, "loss": 0.1265, "num_input_tokens_seen": 88335088, "step": 40920 }, { "epoch": 6.676182707993474, "grad_norm": 2.7037789821624756, "learning_rate": 4.212633415468327e-05, "loss": 0.0791, "num_input_tokens_seen": 88345360, "step": 40925 }, { "epoch": 6.67699836867863, "grad_norm": 1.3019731044769287, "learning_rate": 4.2123741281425026e-05, "loss": 0.2095, "num_input_tokens_seen": 88355888, "step": 40930 }, { "epoch": 6.677814029363785, "grad_norm": 0.046419449150562286, "learning_rate": 4.2121148061132445e-05, "loss": 0.0897, "num_input_tokens_seen": 88366832, "step": 40935 }, { "epoch": 6.67862969004894, "grad_norm": 1.2240774631500244, "learning_rate": 4.2118554493858075e-05, "loss": 0.2448, "num_input_tokens_seen": 88378320, "step": 40940 }, { "epoch": 6.6794453507340945, "grad_norm": 0.31136152148246765, "learning_rate": 4.211596057965449e-05, "loss": 0.1373, "num_input_tokens_seen": 88388656, "step": 40945 }, { "epoch": 6.680261011419249, "grad_norm": 1.7576931715011597, "learning_rate": 4.2113366318574244e-05, "loss": 0.3454, "num_input_tokens_seen": 88399632, "step": 40950 }, { "epoch": 6.681076672104405, "grad_norm": 1.509454369544983, "learning_rate": 4.211077171066992e-05, "loss": 0.0536, "num_input_tokens_seen": 88410032, "step": 40955 }, { "epoch": 6.68189233278956, "grad_norm": 0.2517678439617157, "learning_rate": 4.21081767559941e-05, "loss": 0.0768, "num_input_tokens_seen": 88421008, "step": 40960 }, { "epoch": 6.682707993474715, "grad_norm": 0.21293582022190094, "learning_rate": 4.210558145459937e-05, "loss": 0.0937, "num_input_tokens_seen": 88432144, "step": 40965 }, { "epoch": 6.6835236541598695, "grad_norm": 0.5361841917037964, "learning_rate": 4.210298580653834e-05, "loss": 0.1214, "num_input_tokens_seen": 88442576, "step": 40970 }, { "epoch": 6.684339314845024, "grad_norm": 0.6468632817268372, "learning_rate": 4.210038981186361e-05, "loss": 0.1557, "num_input_tokens_seen": 88452048, "step": 40975 }, { "epoch": 6.685154975530179, "grad_norm": 0.051988136023283005, "learning_rate": 4.209779347062778e-05, "loss": 0.3137, "num_input_tokens_seen": 88463280, "step": 40980 }, { "epoch": 6.685970636215334, "grad_norm": 0.24009697139263153, "learning_rate": 4.2095196782883475e-05, "loss": 0.1396, "num_input_tokens_seen": 88473488, "step": 40985 }, { "epoch": 6.68678629690049, "grad_norm": 0.06835070252418518, "learning_rate": 4.209259974868332e-05, "loss": 0.0132, "num_input_tokens_seen": 88484304, "step": 40990 }, { "epoch": 6.6876019575856445, "grad_norm": 0.0848446935415268, "learning_rate": 4.2090002368079953e-05, "loss": 0.1722, "num_input_tokens_seen": 88495088, "step": 40995 }, { "epoch": 6.688417618270799, "grad_norm": 0.10754358023405075, "learning_rate": 4.2087404641126e-05, "loss": 0.1522, "num_input_tokens_seen": 88506512, "step": 41000 }, { "epoch": 6.689233278955954, "grad_norm": 1.3827989101409912, "learning_rate": 4.208480656787412e-05, "loss": 0.049, "num_input_tokens_seen": 88518192, "step": 41005 }, { "epoch": 6.690048939641109, "grad_norm": 0.17444297671318054, "learning_rate": 4.2082208148376954e-05, "loss": 0.0478, "num_input_tokens_seen": 88526928, "step": 41010 }, { "epoch": 6.690864600326265, "grad_norm": 0.07162082940340042, "learning_rate": 4.207960938268718e-05, "loss": 0.1011, "num_input_tokens_seen": 88537616, "step": 41015 }, { "epoch": 6.691680261011419, "grad_norm": 0.14428557455539703, "learning_rate": 4.207701027085745e-05, "loss": 0.145, "num_input_tokens_seen": 88547408, "step": 41020 }, { "epoch": 6.692495921696574, "grad_norm": 0.031797077506780624, "learning_rate": 4.2074410812940446e-05, "loss": 0.0298, "num_input_tokens_seen": 88557232, "step": 41025 }, { "epoch": 6.693311582381729, "grad_norm": 0.3973465859889984, "learning_rate": 4.2071811008988845e-05, "loss": 0.139, "num_input_tokens_seen": 88567632, "step": 41030 }, { "epoch": 6.694127243066884, "grad_norm": 0.4901232421398163, "learning_rate": 4.206921085905533e-05, "loss": 0.1299, "num_input_tokens_seen": 88579312, "step": 41035 }, { "epoch": 6.69494290375204, "grad_norm": 1.2145001888275146, "learning_rate": 4.206661036319262e-05, "loss": 0.0587, "num_input_tokens_seen": 88590608, "step": 41040 }, { "epoch": 6.695758564437194, "grad_norm": 0.4748845100402832, "learning_rate": 4.2064009521453385e-05, "loss": 0.0665, "num_input_tokens_seen": 88601904, "step": 41045 }, { "epoch": 6.696574225122349, "grad_norm": 0.014801940880715847, "learning_rate": 4.206140833389035e-05, "loss": 0.0192, "num_input_tokens_seen": 88611088, "step": 41050 }, { "epoch": 6.697389885807504, "grad_norm": 1.3986117839813232, "learning_rate": 4.205880680055624e-05, "loss": 0.1586, "num_input_tokens_seen": 88623120, "step": 41055 }, { "epoch": 6.698205546492659, "grad_norm": 0.813060462474823, "learning_rate": 4.205620492150376e-05, "loss": 0.1968, "num_input_tokens_seen": 88634064, "step": 41060 }, { "epoch": 6.699021207177814, "grad_norm": 1.3021975755691528, "learning_rate": 4.205360269678566e-05, "loss": 0.0396, "num_input_tokens_seen": 88645392, "step": 41065 }, { "epoch": 6.699836867862969, "grad_norm": 0.18577681481838226, "learning_rate": 4.205100012645467e-05, "loss": 0.0288, "num_input_tokens_seen": 88657808, "step": 41070 }, { "epoch": 6.700652528548124, "grad_norm": 0.25211137533187866, "learning_rate": 4.2048397210563514e-05, "loss": 0.0314, "num_input_tokens_seen": 88668048, "step": 41075 }, { "epoch": 6.701468189233279, "grad_norm": 0.12198681384325027, "learning_rate": 4.204579394916497e-05, "loss": 0.0729, "num_input_tokens_seen": 88677456, "step": 41080 }, { "epoch": 6.702283849918434, "grad_norm": 1.919451117515564, "learning_rate": 4.204319034231179e-05, "loss": 0.2794, "num_input_tokens_seen": 88688592, "step": 41085 }, { "epoch": 6.703099510603589, "grad_norm": 0.14734585583209991, "learning_rate": 4.2040586390056735e-05, "loss": 0.0709, "num_input_tokens_seen": 88699152, "step": 41090 }, { "epoch": 6.7039151712887435, "grad_norm": 0.050836268812417984, "learning_rate": 4.203798209245258e-05, "loss": 0.0857, "num_input_tokens_seen": 88709776, "step": 41095 }, { "epoch": 6.704730831973899, "grad_norm": 1.824156641960144, "learning_rate": 4.20353774495521e-05, "loss": 0.2305, "num_input_tokens_seen": 88721328, "step": 41100 }, { "epoch": 6.705546492659054, "grad_norm": 1.2591736316680908, "learning_rate": 4.2032772461408084e-05, "loss": 0.2617, "num_input_tokens_seen": 88732432, "step": 41105 }, { "epoch": 6.706362153344209, "grad_norm": 0.08766840398311615, "learning_rate": 4.203016712807334e-05, "loss": 0.0619, "num_input_tokens_seen": 88743312, "step": 41110 }, { "epoch": 6.707177814029364, "grad_norm": 1.0325775146484375, "learning_rate": 4.202756144960065e-05, "loss": 0.0687, "num_input_tokens_seen": 88754096, "step": 41115 }, { "epoch": 6.7079934747145185, "grad_norm": 0.7821809649467468, "learning_rate": 4.202495542604281e-05, "loss": 0.096, "num_input_tokens_seen": 88765200, "step": 41120 }, { "epoch": 6.708809135399674, "grad_norm": 0.10666672140359879, "learning_rate": 4.202234905745267e-05, "loss": 0.1247, "num_input_tokens_seen": 88777136, "step": 41125 }, { "epoch": 6.709624796084829, "grad_norm": 0.9167828559875488, "learning_rate": 4.2019742343883025e-05, "loss": 0.1514, "num_input_tokens_seen": 88789040, "step": 41130 }, { "epoch": 6.710440456769984, "grad_norm": 0.23848161101341248, "learning_rate": 4.201713528538671e-05, "loss": 0.2741, "num_input_tokens_seen": 88799792, "step": 41135 }, { "epoch": 6.711256117455139, "grad_norm": 1.8237488269805908, "learning_rate": 4.201452788201656e-05, "loss": 0.1506, "num_input_tokens_seen": 88810224, "step": 41140 }, { "epoch": 6.712071778140293, "grad_norm": 0.16006386280059814, "learning_rate": 4.2011920133825424e-05, "loss": 0.1689, "num_input_tokens_seen": 88820112, "step": 41145 }, { "epoch": 6.712887438825448, "grad_norm": 1.3611068725585938, "learning_rate": 4.200931204086613e-05, "loss": 0.1264, "num_input_tokens_seen": 88830864, "step": 41150 }, { "epoch": 6.713703099510604, "grad_norm": 0.7960241436958313, "learning_rate": 4.200670360319157e-05, "loss": 0.0466, "num_input_tokens_seen": 88842096, "step": 41155 }, { "epoch": 6.714518760195759, "grad_norm": 1.1111003160476685, "learning_rate": 4.2004094820854564e-05, "loss": 0.1098, "num_input_tokens_seen": 88853232, "step": 41160 }, { "epoch": 6.715334420880914, "grad_norm": 0.3291186988353729, "learning_rate": 4.2001485693908025e-05, "loss": 0.1189, "num_input_tokens_seen": 88863408, "step": 41165 }, { "epoch": 6.716150081566068, "grad_norm": 0.914075493812561, "learning_rate": 4.199887622240481e-05, "loss": 0.1352, "num_input_tokens_seen": 88873648, "step": 41170 }, { "epoch": 6.716965742251223, "grad_norm": 0.03402798995375633, "learning_rate": 4.1996266406397785e-05, "loss": 0.0611, "num_input_tokens_seen": 88884720, "step": 41175 }, { "epoch": 6.717781402936378, "grad_norm": 0.23803672194480896, "learning_rate": 4.1993656245939876e-05, "loss": 0.0416, "num_input_tokens_seen": 88895632, "step": 41180 }, { "epoch": 6.718597063621534, "grad_norm": 0.3740423321723938, "learning_rate": 4.199104574108396e-05, "loss": 0.2101, "num_input_tokens_seen": 88905456, "step": 41185 }, { "epoch": 6.719412724306689, "grad_norm": 0.5504851937294006, "learning_rate": 4.1988434891882955e-05, "loss": 0.1375, "num_input_tokens_seen": 88916848, "step": 41190 }, { "epoch": 6.720228384991843, "grad_norm": 0.5390726923942566, "learning_rate": 4.198582369838976e-05, "loss": 0.0642, "num_input_tokens_seen": 88928240, "step": 41195 }, { "epoch": 6.721044045676998, "grad_norm": 0.38774755597114563, "learning_rate": 4.1983212160657296e-05, "loss": 0.0591, "num_input_tokens_seen": 88940016, "step": 41200 }, { "epoch": 6.721859706362153, "grad_norm": 0.9192478060722351, "learning_rate": 4.198060027873849e-05, "loss": 0.229, "num_input_tokens_seen": 88950544, "step": 41205 }, { "epoch": 6.722675367047309, "grad_norm": 0.7510636448860168, "learning_rate": 4.197798805268629e-05, "loss": 0.1214, "num_input_tokens_seen": 88961520, "step": 41210 }, { "epoch": 6.7234910277324635, "grad_norm": 0.41900429129600525, "learning_rate": 4.1975375482553616e-05, "loss": 0.1732, "num_input_tokens_seen": 88971952, "step": 41215 }, { "epoch": 6.724306688417618, "grad_norm": 1.1434489488601685, "learning_rate": 4.197276256839342e-05, "loss": 0.1696, "num_input_tokens_seen": 88981840, "step": 41220 }, { "epoch": 6.725122349102773, "grad_norm": 1.2363040447235107, "learning_rate": 4.197014931025867e-05, "loss": 0.1498, "num_input_tokens_seen": 88992656, "step": 41225 }, { "epoch": 6.725938009787928, "grad_norm": 1.2639511823654175, "learning_rate": 4.196753570820231e-05, "loss": 0.2348, "num_input_tokens_seen": 89004432, "step": 41230 }, { "epoch": 6.726753670473083, "grad_norm": 0.4306623935699463, "learning_rate": 4.196492176227731e-05, "loss": 0.1129, "num_input_tokens_seen": 89014736, "step": 41235 }, { "epoch": 6.7275693311582385, "grad_norm": 0.04406651481986046, "learning_rate": 4.196230747253666e-05, "loss": 0.1501, "num_input_tokens_seen": 89024976, "step": 41240 }, { "epoch": 6.728384991843393, "grad_norm": 0.12650693953037262, "learning_rate": 4.195969283903331e-05, "loss": 0.0141, "num_input_tokens_seen": 89035440, "step": 41245 }, { "epoch": 6.729200652528548, "grad_norm": 1.9657280445098877, "learning_rate": 4.1957077861820295e-05, "loss": 0.1123, "num_input_tokens_seen": 89046608, "step": 41250 }, { "epoch": 6.730016313213703, "grad_norm": 1.1049365997314453, "learning_rate": 4.195446254095057e-05, "loss": 0.1535, "num_input_tokens_seen": 89056912, "step": 41255 }, { "epoch": 6.730831973898858, "grad_norm": 0.20710915327072144, "learning_rate": 4.1951846876477154e-05, "loss": 0.1057, "num_input_tokens_seen": 89068048, "step": 41260 }, { "epoch": 6.731647634584013, "grad_norm": 0.34747809171676636, "learning_rate": 4.194923086845306e-05, "loss": 0.1248, "num_input_tokens_seen": 89078544, "step": 41265 }, { "epoch": 6.732463295269168, "grad_norm": 1.4968583583831787, "learning_rate": 4.1946614516931307e-05, "loss": 0.1303, "num_input_tokens_seen": 89090768, "step": 41270 }, { "epoch": 6.733278955954323, "grad_norm": 1.0915868282318115, "learning_rate": 4.19439978219649e-05, "loss": 0.2133, "num_input_tokens_seen": 89101040, "step": 41275 }, { "epoch": 6.734094616639478, "grad_norm": 0.5278109312057495, "learning_rate": 4.1941380783606886e-05, "loss": 0.1027, "num_input_tokens_seen": 89112112, "step": 41280 }, { "epoch": 6.734910277324633, "grad_norm": 0.12549856305122375, "learning_rate": 4.193876340191031e-05, "loss": 0.1741, "num_input_tokens_seen": 89123056, "step": 41285 }, { "epoch": 6.735725938009788, "grad_norm": 1.2722481489181519, "learning_rate": 4.193614567692819e-05, "loss": 0.1468, "num_input_tokens_seen": 89135120, "step": 41290 }, { "epoch": 6.736541598694943, "grad_norm": 0.2293323576450348, "learning_rate": 4.1933527608713595e-05, "loss": 0.166, "num_input_tokens_seen": 89147184, "step": 41295 }, { "epoch": 6.737357259380098, "grad_norm": 0.5854178071022034, "learning_rate": 4.193090919731959e-05, "loss": 0.0816, "num_input_tokens_seen": 89156912, "step": 41300 }, { "epoch": 6.738172920065253, "grad_norm": 0.07143547385931015, "learning_rate": 4.192829044279922e-05, "loss": 0.0466, "num_input_tokens_seen": 89167184, "step": 41305 }, { "epoch": 6.738988580750408, "grad_norm": 0.529371976852417, "learning_rate": 4.192567134520558e-05, "loss": 0.0916, "num_input_tokens_seen": 89178448, "step": 41310 }, { "epoch": 6.739804241435563, "grad_norm": 0.8313102722167969, "learning_rate": 4.1923051904591734e-05, "loss": 0.1835, "num_input_tokens_seen": 89189072, "step": 41315 }, { "epoch": 6.740619902120718, "grad_norm": 0.0318274199962616, "learning_rate": 4.192043212101078e-05, "loss": 0.0753, "num_input_tokens_seen": 89200496, "step": 41320 }, { "epoch": 6.741435562805873, "grad_norm": 0.2694694399833679, "learning_rate": 4.191781199451579e-05, "loss": 0.0586, "num_input_tokens_seen": 89209840, "step": 41325 }, { "epoch": 6.742251223491028, "grad_norm": 0.8671683073043823, "learning_rate": 4.191519152515988e-05, "loss": 0.1775, "num_input_tokens_seen": 89220240, "step": 41330 }, { "epoch": 6.743066884176183, "grad_norm": 0.13820353150367737, "learning_rate": 4.191257071299617e-05, "loss": 0.0884, "num_input_tokens_seen": 89232400, "step": 41335 }, { "epoch": 6.7438825448613375, "grad_norm": 2.752288818359375, "learning_rate": 4.1909949558077744e-05, "loss": 0.2484, "num_input_tokens_seen": 89242608, "step": 41340 }, { "epoch": 6.744698205546492, "grad_norm": 0.16220630705356598, "learning_rate": 4.190732806045774e-05, "loss": 0.0493, "num_input_tokens_seen": 89253328, "step": 41345 }, { "epoch": 6.745513866231647, "grad_norm": 0.12161535769701004, "learning_rate": 4.190470622018929e-05, "loss": 0.0869, "num_input_tokens_seen": 89263984, "step": 41350 }, { "epoch": 6.746329526916803, "grad_norm": 0.7563814520835876, "learning_rate": 4.1902084037325526e-05, "loss": 0.0369, "num_input_tokens_seen": 89275856, "step": 41355 }, { "epoch": 6.747145187601958, "grad_norm": 1.1406757831573486, "learning_rate": 4.189946151191959e-05, "loss": 0.1164, "num_input_tokens_seen": 89287024, "step": 41360 }, { "epoch": 6.7479608482871125, "grad_norm": 0.17589367926120758, "learning_rate": 4.1896838644024614e-05, "loss": 0.1852, "num_input_tokens_seen": 89298032, "step": 41365 }, { "epoch": 6.748776508972267, "grad_norm": 0.5559524893760681, "learning_rate": 4.189421543369378e-05, "loss": 0.0283, "num_input_tokens_seen": 89309424, "step": 41370 }, { "epoch": 6.749592169657422, "grad_norm": 1.2604196071624756, "learning_rate": 4.189159188098023e-05, "loss": 0.0763, "num_input_tokens_seen": 89320880, "step": 41375 }, { "epoch": 6.750407830342578, "grad_norm": 0.541278600692749, "learning_rate": 4.188896798593714e-05, "loss": 0.1998, "num_input_tokens_seen": 89332624, "step": 41380 }, { "epoch": 6.751223491027733, "grad_norm": 0.5937174558639526, "learning_rate": 4.1886343748617704e-05, "loss": 0.1386, "num_input_tokens_seen": 89343344, "step": 41385 }, { "epoch": 6.7520391517128875, "grad_norm": 0.9045332074165344, "learning_rate": 4.188371916907507e-05, "loss": 0.1153, "num_input_tokens_seen": 89355344, "step": 41390 }, { "epoch": 6.752854812398042, "grad_norm": 0.6255913376808167, "learning_rate": 4.1881094247362465e-05, "loss": 0.0896, "num_input_tokens_seen": 89366160, "step": 41395 }, { "epoch": 6.753670473083197, "grad_norm": 0.3508722186088562, "learning_rate": 4.187846898353307e-05, "loss": 0.1082, "num_input_tokens_seen": 89376976, "step": 41400 }, { "epoch": 6.754486133768353, "grad_norm": 1.374485969543457, "learning_rate": 4.187584337764008e-05, "loss": 0.2271, "num_input_tokens_seen": 89388176, "step": 41405 }, { "epoch": 6.755301794453508, "grad_norm": 1.2769699096679688, "learning_rate": 4.187321742973671e-05, "loss": 0.1359, "num_input_tokens_seen": 89399152, "step": 41410 }, { "epoch": 6.7561174551386625, "grad_norm": 1.225982666015625, "learning_rate": 4.187059113987619e-05, "loss": 0.2198, "num_input_tokens_seen": 89409968, "step": 41415 }, { "epoch": 6.756933115823817, "grad_norm": 0.0678834393620491, "learning_rate": 4.186796450811174e-05, "loss": 0.0511, "num_input_tokens_seen": 89420560, "step": 41420 }, { "epoch": 6.757748776508972, "grad_norm": 1.0749846696853638, "learning_rate": 4.1865337534496596e-05, "loss": 0.1543, "num_input_tokens_seen": 89431952, "step": 41425 }, { "epoch": 6.758564437194127, "grad_norm": 0.131515234708786, "learning_rate": 4.186271021908399e-05, "loss": 0.0765, "num_input_tokens_seen": 89443152, "step": 41430 }, { "epoch": 6.759380097879282, "grad_norm": 2.5137393474578857, "learning_rate": 4.186008256192716e-05, "loss": 0.1389, "num_input_tokens_seen": 89454576, "step": 41435 }, { "epoch": 6.760195758564437, "grad_norm": 0.12951231002807617, "learning_rate": 4.185745456307938e-05, "loss": 0.0346, "num_input_tokens_seen": 89465840, "step": 41440 }, { "epoch": 6.761011419249592, "grad_norm": 0.08627951890230179, "learning_rate": 4.185482622259389e-05, "loss": 0.0352, "num_input_tokens_seen": 89476400, "step": 41445 }, { "epoch": 6.761827079934747, "grad_norm": 1.519269347190857, "learning_rate": 4.185219754052397e-05, "loss": 0.2182, "num_input_tokens_seen": 89487568, "step": 41450 }, { "epoch": 6.762642740619902, "grad_norm": 1.3547859191894531, "learning_rate": 4.184956851692288e-05, "loss": 0.0765, "num_input_tokens_seen": 89496400, "step": 41455 }, { "epoch": 6.763458401305057, "grad_norm": 0.14931537210941315, "learning_rate": 4.1846939151843925e-05, "loss": 0.0699, "num_input_tokens_seen": 89507440, "step": 41460 }, { "epoch": 6.764274061990212, "grad_norm": 0.45335012674331665, "learning_rate": 4.184430944534036e-05, "loss": 0.1364, "num_input_tokens_seen": 89516624, "step": 41465 }, { "epoch": 6.765089722675367, "grad_norm": 0.10124984383583069, "learning_rate": 4.18416793974655e-05, "loss": 0.1404, "num_input_tokens_seen": 89526352, "step": 41470 }, { "epoch": 6.765905383360522, "grad_norm": 0.10135290026664734, "learning_rate": 4.183904900827264e-05, "loss": 0.1342, "num_input_tokens_seen": 89538128, "step": 41475 }, { "epoch": 6.766721044045677, "grad_norm": 0.4762704372406006, "learning_rate": 4.183641827781509e-05, "loss": 0.1092, "num_input_tokens_seen": 89549136, "step": 41480 }, { "epoch": 6.767536704730832, "grad_norm": 0.09386231750249863, "learning_rate": 4.183378720614617e-05, "loss": 0.0614, "num_input_tokens_seen": 89559728, "step": 41485 }, { "epoch": 6.768352365415987, "grad_norm": 0.15883035957813263, "learning_rate": 4.18311557933192e-05, "loss": 0.0674, "num_input_tokens_seen": 89569904, "step": 41490 }, { "epoch": 6.769168026101142, "grad_norm": 0.03238653019070625, "learning_rate": 4.18285240393875e-05, "loss": 0.1403, "num_input_tokens_seen": 89581008, "step": 41495 }, { "epoch": 6.769983686786297, "grad_norm": 0.9884521961212158, "learning_rate": 4.182589194440442e-05, "loss": 0.3412, "num_input_tokens_seen": 89591728, "step": 41500 }, { "epoch": 6.770799347471452, "grad_norm": 0.10266720503568649, "learning_rate": 4.1823259508423284e-05, "loss": 0.0179, "num_input_tokens_seen": 89601456, "step": 41505 }, { "epoch": 6.771615008156607, "grad_norm": 0.6911317706108093, "learning_rate": 4.182062673149746e-05, "loss": 0.0733, "num_input_tokens_seen": 89612368, "step": 41510 }, { "epoch": 6.7724306688417615, "grad_norm": 0.6706032156944275, "learning_rate": 4.181799361368029e-05, "loss": 0.0689, "num_input_tokens_seen": 89623312, "step": 41515 }, { "epoch": 6.773246329526917, "grad_norm": 1.9209445714950562, "learning_rate": 4.181536015502514e-05, "loss": 0.1375, "num_input_tokens_seen": 89633936, "step": 41520 }, { "epoch": 6.774061990212072, "grad_norm": 0.16249211132526398, "learning_rate": 4.18127263555854e-05, "loss": 0.1296, "num_input_tokens_seen": 89643696, "step": 41525 }, { "epoch": 6.774877650897227, "grad_norm": 0.26299548149108887, "learning_rate": 4.1810092215414424e-05, "loss": 0.0519, "num_input_tokens_seen": 89653936, "step": 41530 }, { "epoch": 6.775693311582382, "grad_norm": 0.419727087020874, "learning_rate": 4.18074577345656e-05, "loss": 0.0581, "num_input_tokens_seen": 89664752, "step": 41535 }, { "epoch": 6.7765089722675365, "grad_norm": 0.11219917237758636, "learning_rate": 4.1804822913092325e-05, "loss": 0.0454, "num_input_tokens_seen": 89675984, "step": 41540 }, { "epoch": 6.777324632952691, "grad_norm": 0.3020099699497223, "learning_rate": 4.1802187751047996e-05, "loss": 0.0176, "num_input_tokens_seen": 89686704, "step": 41545 }, { "epoch": 6.778140293637847, "grad_norm": 0.35641491413116455, "learning_rate": 4.179955224848602e-05, "loss": 0.1642, "num_input_tokens_seen": 89697040, "step": 41550 }, { "epoch": 6.778955954323002, "grad_norm": 0.061491720378398895, "learning_rate": 4.179691640545981e-05, "loss": 0.0904, "num_input_tokens_seen": 89708208, "step": 41555 }, { "epoch": 6.779771615008157, "grad_norm": 0.3139338195323944, "learning_rate": 4.179428022202277e-05, "loss": 0.0614, "num_input_tokens_seen": 89719120, "step": 41560 }, { "epoch": 6.780587275693311, "grad_norm": 0.1055896207690239, "learning_rate": 4.1791643698228346e-05, "loss": 0.0628, "num_input_tokens_seen": 89730384, "step": 41565 }, { "epoch": 6.781402936378466, "grad_norm": 0.0986936017870903, "learning_rate": 4.178900683412995e-05, "loss": 0.014, "num_input_tokens_seen": 89742128, "step": 41570 }, { "epoch": 6.782218597063622, "grad_norm": 0.032209210097789764, "learning_rate": 4.178636962978104e-05, "loss": 0.1143, "num_input_tokens_seen": 89752944, "step": 41575 }, { "epoch": 6.783034257748777, "grad_norm": 0.04926528036594391, "learning_rate": 4.178373208523505e-05, "loss": 0.2677, "num_input_tokens_seen": 89762448, "step": 41580 }, { "epoch": 6.783849918433932, "grad_norm": 0.13379015028476715, "learning_rate": 4.178109420054545e-05, "loss": 0.1974, "num_input_tokens_seen": 89773200, "step": 41585 }, { "epoch": 6.784665579119086, "grad_norm": 0.34955981373786926, "learning_rate": 4.1778455975765675e-05, "loss": 0.0678, "num_input_tokens_seen": 89783952, "step": 41590 }, { "epoch": 6.785481239804241, "grad_norm": 0.17110615968704224, "learning_rate": 4.17758174109492e-05, "loss": 0.0643, "num_input_tokens_seen": 89796528, "step": 41595 }, { "epoch": 6.786296900489396, "grad_norm": 0.14058014750480652, "learning_rate": 4.177317850614951e-05, "loss": 0.0134, "num_input_tokens_seen": 89807664, "step": 41600 }, { "epoch": 6.787112561174552, "grad_norm": 0.6529858708381653, "learning_rate": 4.1770539261420086e-05, "loss": 0.1467, "num_input_tokens_seen": 89817552, "step": 41605 }, { "epoch": 6.787928221859707, "grad_norm": 0.1579064428806305, "learning_rate": 4.1767899676814407e-05, "loss": 0.1047, "num_input_tokens_seen": 89829008, "step": 41610 }, { "epoch": 6.788743882544861, "grad_norm": 0.06334526091814041, "learning_rate": 4.1765259752385966e-05, "loss": 0.1163, "num_input_tokens_seen": 89839856, "step": 41615 }, { "epoch": 6.789559543230016, "grad_norm": 0.14767025411128998, "learning_rate": 4.1762619488188257e-05, "loss": 0.0757, "num_input_tokens_seen": 89851312, "step": 41620 }, { "epoch": 6.790375203915171, "grad_norm": 0.25362175703048706, "learning_rate": 4.175997888427482e-05, "loss": 0.1449, "num_input_tokens_seen": 89863248, "step": 41625 }, { "epoch": 6.791190864600326, "grad_norm": 0.8892568945884705, "learning_rate": 4.175733794069914e-05, "loss": 0.1476, "num_input_tokens_seen": 89875344, "step": 41630 }, { "epoch": 6.7920065252854815, "grad_norm": 0.05714044347405434, "learning_rate": 4.1754696657514744e-05, "loss": 0.022, "num_input_tokens_seen": 89887056, "step": 41635 }, { "epoch": 6.792822185970636, "grad_norm": 0.07634834200143814, "learning_rate": 4.175205503477517e-05, "loss": 0.1405, "num_input_tokens_seen": 89898064, "step": 41640 }, { "epoch": 6.793637846655791, "grad_norm": 0.3576151430606842, "learning_rate": 4.1749413072533945e-05, "loss": 0.1444, "num_input_tokens_seen": 89908816, "step": 41645 }, { "epoch": 6.794453507340946, "grad_norm": 1.2668757438659668, "learning_rate": 4.174677077084462e-05, "loss": 0.1269, "num_input_tokens_seen": 89918896, "step": 41650 }, { "epoch": 6.795269168026101, "grad_norm": 0.6157224774360657, "learning_rate": 4.174412812976074e-05, "loss": 0.0607, "num_input_tokens_seen": 89928880, "step": 41655 }, { "epoch": 6.7960848287112565, "grad_norm": 0.5813212990760803, "learning_rate": 4.174148514933586e-05, "loss": 0.0441, "num_input_tokens_seen": 89939952, "step": 41660 }, { "epoch": 6.796900489396411, "grad_norm": 0.891524612903595, "learning_rate": 4.173884182962355e-05, "loss": 0.2998, "num_input_tokens_seen": 89949360, "step": 41665 }, { "epoch": 6.797716150081566, "grad_norm": 0.23865823447704315, "learning_rate": 4.173619817067737e-05, "loss": 0.1056, "num_input_tokens_seen": 89960144, "step": 41670 }, { "epoch": 6.798531810766721, "grad_norm": 1.7832436561584473, "learning_rate": 4.17335541725509e-05, "loss": 0.2445, "num_input_tokens_seen": 89971184, "step": 41675 }, { "epoch": 6.799347471451876, "grad_norm": 0.08785682171583176, "learning_rate": 4.1730909835297735e-05, "loss": 0.0926, "num_input_tokens_seen": 89982288, "step": 41680 }, { "epoch": 6.800163132137031, "grad_norm": 0.0490611307322979, "learning_rate": 4.172826515897146e-05, "loss": 0.0517, "num_input_tokens_seen": 89993744, "step": 41685 }, { "epoch": 6.800978792822186, "grad_norm": 0.6950364112854004, "learning_rate": 4.1725620143625665e-05, "loss": 0.0557, "num_input_tokens_seen": 90005232, "step": 41690 }, { "epoch": 6.801794453507341, "grad_norm": 0.8800179362297058, "learning_rate": 4.172297478931396e-05, "loss": 0.1179, "num_input_tokens_seen": 90016240, "step": 41695 }, { "epoch": 6.802610114192496, "grad_norm": 0.06712570041418076, "learning_rate": 4.172032909608995e-05, "loss": 0.0355, "num_input_tokens_seen": 90026736, "step": 41700 }, { "epoch": 6.803425774877651, "grad_norm": 0.8178560733795166, "learning_rate": 4.171768306400727e-05, "loss": 0.1676, "num_input_tokens_seen": 90037296, "step": 41705 }, { "epoch": 6.804241435562806, "grad_norm": 0.12669794261455536, "learning_rate": 4.171503669311953e-05, "loss": 0.0949, "num_input_tokens_seen": 90048176, "step": 41710 }, { "epoch": 6.80505709624796, "grad_norm": 0.037840090692043304, "learning_rate": 4.1712389983480374e-05, "loss": 0.0369, "num_input_tokens_seen": 90059056, "step": 41715 }, { "epoch": 6.805872756933116, "grad_norm": 0.190931499004364, "learning_rate": 4.170974293514342e-05, "loss": 0.016, "num_input_tokens_seen": 90069456, "step": 41720 }, { "epoch": 6.806688417618271, "grad_norm": 0.08208778500556946, "learning_rate": 4.170709554816233e-05, "loss": 0.0242, "num_input_tokens_seen": 90081840, "step": 41725 }, { "epoch": 6.807504078303426, "grad_norm": 1.8592219352722168, "learning_rate": 4.170444782259076e-05, "loss": 0.1596, "num_input_tokens_seen": 90091600, "step": 41730 }, { "epoch": 6.808319738988581, "grad_norm": 0.8801218271255493, "learning_rate": 4.170179975848236e-05, "loss": 0.0514, "num_input_tokens_seen": 90103344, "step": 41735 }, { "epoch": 6.809135399673735, "grad_norm": 1.8918894529342651, "learning_rate": 4.169915135589081e-05, "loss": 0.0799, "num_input_tokens_seen": 90114352, "step": 41740 }, { "epoch": 6.809951060358891, "grad_norm": 1.3210211992263794, "learning_rate": 4.169650261486976e-05, "loss": 0.0528, "num_input_tokens_seen": 90125584, "step": 41745 }, { "epoch": 6.810766721044046, "grad_norm": 0.08849066495895386, "learning_rate": 4.169385353547291e-05, "loss": 0.133, "num_input_tokens_seen": 90136304, "step": 41750 }, { "epoch": 6.811582381729201, "grad_norm": 0.26787281036376953, "learning_rate": 4.169120411775395e-05, "loss": 0.0715, "num_input_tokens_seen": 90147088, "step": 41755 }, { "epoch": 6.8123980424143555, "grad_norm": 0.14122052490711212, "learning_rate": 4.168855436176655e-05, "loss": 0.2429, "num_input_tokens_seen": 90158512, "step": 41760 }, { "epoch": 6.81321370309951, "grad_norm": 0.393181174993515, "learning_rate": 4.168590426756443e-05, "loss": 0.1118, "num_input_tokens_seen": 90168880, "step": 41765 }, { "epoch": 6.814029363784666, "grad_norm": 0.363150954246521, "learning_rate": 4.168325383520129e-05, "loss": 0.0421, "num_input_tokens_seen": 90179696, "step": 41770 }, { "epoch": 6.814845024469821, "grad_norm": 1.0749361515045166, "learning_rate": 4.168060306473085e-05, "loss": 0.1306, "num_input_tokens_seen": 90190960, "step": 41775 }, { "epoch": 6.815660685154976, "grad_norm": 1.2622416019439697, "learning_rate": 4.167795195620683e-05, "loss": 0.1418, "num_input_tokens_seen": 90202288, "step": 41780 }, { "epoch": 6.8164763458401305, "grad_norm": 0.11061426997184753, "learning_rate": 4.167530050968295e-05, "loss": 0.0243, "num_input_tokens_seen": 90214288, "step": 41785 }, { "epoch": 6.817292006525285, "grad_norm": 0.9140056371688843, "learning_rate": 4.1672648725212956e-05, "loss": 0.1154, "num_input_tokens_seen": 90225168, "step": 41790 }, { "epoch": 6.81810766721044, "grad_norm": 0.027115050703287125, "learning_rate": 4.166999660285058e-05, "loss": 0.0723, "num_input_tokens_seen": 90235760, "step": 41795 }, { "epoch": 6.818923327895595, "grad_norm": 0.02916160225868225, "learning_rate": 4.166734414264958e-05, "loss": 0.2667, "num_input_tokens_seen": 90246864, "step": 41800 }, { "epoch": 6.819738988580751, "grad_norm": 0.06372866779565811, "learning_rate": 4.1664691344663706e-05, "loss": 0.2033, "num_input_tokens_seen": 90257680, "step": 41805 }, { "epoch": 6.8205546492659055, "grad_norm": 0.08871617168188095, "learning_rate": 4.166203820894672e-05, "loss": 0.18, "num_input_tokens_seen": 90268976, "step": 41810 }, { "epoch": 6.82137030995106, "grad_norm": 0.5330986380577087, "learning_rate": 4.165938473555239e-05, "loss": 0.0925, "num_input_tokens_seen": 90279696, "step": 41815 }, { "epoch": 6.822185970636215, "grad_norm": 0.08898533135652542, "learning_rate": 4.16567309245345e-05, "loss": 0.1678, "num_input_tokens_seen": 90288624, "step": 41820 }, { "epoch": 6.82300163132137, "grad_norm": 0.04373454675078392, "learning_rate": 4.1654076775946824e-05, "loss": 0.0918, "num_input_tokens_seen": 90299984, "step": 41825 }, { "epoch": 6.823817292006526, "grad_norm": 0.413668692111969, "learning_rate": 4.1651422289843154e-05, "loss": 0.0223, "num_input_tokens_seen": 90309264, "step": 41830 }, { "epoch": 6.8246329526916805, "grad_norm": 1.1055209636688232, "learning_rate": 4.1648767466277295e-05, "loss": 0.15, "num_input_tokens_seen": 90320528, "step": 41835 }, { "epoch": 6.825448613376835, "grad_norm": 0.05803424492478371, "learning_rate": 4.164611230530303e-05, "loss": 0.0562, "num_input_tokens_seen": 90331184, "step": 41840 }, { "epoch": 6.82626427406199, "grad_norm": 0.043935373425483704, "learning_rate": 4.164345680697419e-05, "loss": 0.0372, "num_input_tokens_seen": 90341200, "step": 41845 }, { "epoch": 6.827079934747145, "grad_norm": 0.11536208540201187, "learning_rate": 4.164080097134458e-05, "loss": 0.1235, "num_input_tokens_seen": 90350992, "step": 41850 }, { "epoch": 6.827895595432301, "grad_norm": 1.028529405593872, "learning_rate": 4.163814479846803e-05, "loss": 0.1685, "num_input_tokens_seen": 90361520, "step": 41855 }, { "epoch": 6.828711256117455, "grad_norm": 0.7181715369224548, "learning_rate": 4.163548828839837e-05, "loss": 0.193, "num_input_tokens_seen": 90372112, "step": 41860 }, { "epoch": 6.82952691680261, "grad_norm": 0.6142396926879883, "learning_rate": 4.1632831441189434e-05, "loss": 0.3457, "num_input_tokens_seen": 90383856, "step": 41865 }, { "epoch": 6.830342577487765, "grad_norm": 0.3263997733592987, "learning_rate": 4.163017425689507e-05, "loss": 0.1002, "num_input_tokens_seen": 90393552, "step": 41870 }, { "epoch": 6.83115823817292, "grad_norm": 2.2203168869018555, "learning_rate": 4.162751673556913e-05, "loss": 0.234, "num_input_tokens_seen": 90403056, "step": 41875 }, { "epoch": 6.831973898858075, "grad_norm": 0.23939292132854462, "learning_rate": 4.162485887726547e-05, "loss": 0.1981, "num_input_tokens_seen": 90414160, "step": 41880 }, { "epoch": 6.8327895595432295, "grad_norm": 0.7581632733345032, "learning_rate": 4.162220068203795e-05, "loss": 0.1099, "num_input_tokens_seen": 90426160, "step": 41885 }, { "epoch": 6.833605220228385, "grad_norm": 2.3535356521606445, "learning_rate": 4.161954214994045e-05, "loss": 0.3118, "num_input_tokens_seen": 90436816, "step": 41890 }, { "epoch": 6.83442088091354, "grad_norm": 0.23873037099838257, "learning_rate": 4.161688328102685e-05, "loss": 0.0978, "num_input_tokens_seen": 90448144, "step": 41895 }, { "epoch": 6.835236541598695, "grad_norm": 0.4193212687969208, "learning_rate": 4.161422407535102e-05, "loss": 0.0369, "num_input_tokens_seen": 90458352, "step": 41900 }, { "epoch": 6.83605220228385, "grad_norm": 1.83321213722229, "learning_rate": 4.161156453296687e-05, "loss": 0.222, "num_input_tokens_seen": 90469264, "step": 41905 }, { "epoch": 6.8368678629690045, "grad_norm": 0.31722187995910645, "learning_rate": 4.1608904653928285e-05, "loss": 0.1523, "num_input_tokens_seen": 90478832, "step": 41910 }, { "epoch": 6.83768352365416, "grad_norm": 0.050130605697631836, "learning_rate": 4.1606244438289184e-05, "loss": 0.0322, "num_input_tokens_seen": 90489840, "step": 41915 }, { "epoch": 6.838499184339315, "grad_norm": 0.9294564723968506, "learning_rate": 4.160358388610347e-05, "loss": 0.0656, "num_input_tokens_seen": 90501232, "step": 41920 }, { "epoch": 6.83931484502447, "grad_norm": 0.23434652388095856, "learning_rate": 4.160092299742506e-05, "loss": 0.016, "num_input_tokens_seen": 90512560, "step": 41925 }, { "epoch": 6.840130505709625, "grad_norm": 0.08924458920955658, "learning_rate": 4.159826177230789e-05, "loss": 0.0617, "num_input_tokens_seen": 90523856, "step": 41930 }, { "epoch": 6.8409461663947795, "grad_norm": 0.05655315890908241, "learning_rate": 4.15956002108059e-05, "loss": 0.1849, "num_input_tokens_seen": 90532720, "step": 41935 }, { "epoch": 6.841761827079935, "grad_norm": 0.22172647714614868, "learning_rate": 4.1592938312973e-05, "loss": 0.2315, "num_input_tokens_seen": 90543888, "step": 41940 }, { "epoch": 6.84257748776509, "grad_norm": 1.4219825267791748, "learning_rate": 4.159027607886317e-05, "loss": 0.2438, "num_input_tokens_seen": 90553712, "step": 41945 }, { "epoch": 6.843393148450245, "grad_norm": 1.3442803621292114, "learning_rate": 4.1587613508530335e-05, "loss": 0.0992, "num_input_tokens_seen": 90563920, "step": 41950 }, { "epoch": 6.8442088091354, "grad_norm": 0.15116146206855774, "learning_rate": 4.1584950602028477e-05, "loss": 0.0323, "num_input_tokens_seen": 90574544, "step": 41955 }, { "epoch": 6.8450244698205545, "grad_norm": 0.2767028510570526, "learning_rate": 4.158228735941155e-05, "loss": 0.0682, "num_input_tokens_seen": 90586384, "step": 41960 }, { "epoch": 6.845840130505709, "grad_norm": 0.29379385709762573, "learning_rate": 4.1579623780733544e-05, "loss": 0.1476, "num_input_tokens_seen": 90597776, "step": 41965 }, { "epoch": 6.846655791190865, "grad_norm": 0.4306793212890625, "learning_rate": 4.157695986604842e-05, "loss": 0.0416, "num_input_tokens_seen": 90610128, "step": 41970 }, { "epoch": 6.84747145187602, "grad_norm": 0.1617511808872223, "learning_rate": 4.157429561541019e-05, "loss": 0.1584, "num_input_tokens_seen": 90621168, "step": 41975 }, { "epoch": 6.848287112561175, "grad_norm": 0.19025476276874542, "learning_rate": 4.157163102887282e-05, "loss": 0.0695, "num_input_tokens_seen": 90631536, "step": 41980 }, { "epoch": 6.849102773246329, "grad_norm": 1.48812735080719, "learning_rate": 4.1568966106490325e-05, "loss": 0.1519, "num_input_tokens_seen": 90641168, "step": 41985 }, { "epoch": 6.849918433931484, "grad_norm": 0.05385416001081467, "learning_rate": 4.156630084831672e-05, "loss": 0.0893, "num_input_tokens_seen": 90651632, "step": 41990 }, { "epoch": 6.850734094616639, "grad_norm": 0.09166333079338074, "learning_rate": 4.1563635254406006e-05, "loss": 0.0377, "num_input_tokens_seen": 90662128, "step": 41995 }, { "epoch": 6.851549755301795, "grad_norm": 0.1565520465373993, "learning_rate": 4.156096932481221e-05, "loss": 0.068, "num_input_tokens_seen": 90673808, "step": 42000 }, { "epoch": 6.85236541598695, "grad_norm": 0.9766297936439514, "learning_rate": 4.1558303059589364e-05, "loss": 0.0856, "num_input_tokens_seen": 90684752, "step": 42005 }, { "epoch": 6.853181076672104, "grad_norm": 0.09165964275598526, "learning_rate": 4.1555636458791504e-05, "loss": 0.0892, "num_input_tokens_seen": 90695312, "step": 42010 }, { "epoch": 6.853996737357259, "grad_norm": 1.4181140661239624, "learning_rate": 4.1552969522472665e-05, "loss": 0.0953, "num_input_tokens_seen": 90705520, "step": 42015 }, { "epoch": 6.854812398042414, "grad_norm": 0.2613372504711151, "learning_rate": 4.15503022506869e-05, "loss": 0.0668, "num_input_tokens_seen": 90717584, "step": 42020 }, { "epoch": 6.85562805872757, "grad_norm": 0.6303694248199463, "learning_rate": 4.154763464348826e-05, "loss": 0.0696, "num_input_tokens_seen": 90728496, "step": 42025 }, { "epoch": 6.856443719412725, "grad_norm": 1.553351879119873, "learning_rate": 4.154496670093082e-05, "loss": 0.1369, "num_input_tokens_seen": 90738960, "step": 42030 }, { "epoch": 6.857259380097879, "grad_norm": 0.04683544114232063, "learning_rate": 4.1542298423068635e-05, "loss": 0.0671, "num_input_tokens_seen": 90750352, "step": 42035 }, { "epoch": 6.858075040783034, "grad_norm": 0.13092325627803802, "learning_rate": 4.153962980995579e-05, "loss": 0.1394, "num_input_tokens_seen": 90761040, "step": 42040 }, { "epoch": 6.858890701468189, "grad_norm": 0.870840847492218, "learning_rate": 4.153696086164637e-05, "loss": 0.113, "num_input_tokens_seen": 90770992, "step": 42045 }, { "epoch": 6.859706362153344, "grad_norm": 0.461501806974411, "learning_rate": 4.153429157819445e-05, "loss": 0.0862, "num_input_tokens_seen": 90779088, "step": 42050 }, { "epoch": 6.8605220228384995, "grad_norm": 0.13269738852977753, "learning_rate": 4.153162195965414e-05, "loss": 0.2307, "num_input_tokens_seen": 90789616, "step": 42055 }, { "epoch": 6.861337683523654, "grad_norm": 2.540459632873535, "learning_rate": 4.152895200607954e-05, "loss": 0.1324, "num_input_tokens_seen": 90800496, "step": 42060 }, { "epoch": 6.862153344208809, "grad_norm": 1.8957561254501343, "learning_rate": 4.152628171752475e-05, "loss": 0.2353, "num_input_tokens_seen": 90809808, "step": 42065 }, { "epoch": 6.862969004893964, "grad_norm": 0.38377705216407776, "learning_rate": 4.152361109404391e-05, "loss": 0.222, "num_input_tokens_seen": 90820880, "step": 42070 }, { "epoch": 6.863784665579119, "grad_norm": 0.3481258451938629, "learning_rate": 4.152094013569112e-05, "loss": 0.0737, "num_input_tokens_seen": 90831696, "step": 42075 }, { "epoch": 6.864600326264274, "grad_norm": 0.47346898913383484, "learning_rate": 4.151826884252052e-05, "loss": 0.0297, "num_input_tokens_seen": 90842512, "step": 42080 }, { "epoch": 6.865415986949429, "grad_norm": 0.9625535607337952, "learning_rate": 4.1515597214586254e-05, "loss": 0.088, "num_input_tokens_seen": 90852784, "step": 42085 }, { "epoch": 6.866231647634584, "grad_norm": 0.5905745029449463, "learning_rate": 4.151292525194246e-05, "loss": 0.2292, "num_input_tokens_seen": 90863280, "step": 42090 }, { "epoch": 6.867047308319739, "grad_norm": 0.2581447958946228, "learning_rate": 4.151025295464328e-05, "loss": 0.2122, "num_input_tokens_seen": 90874160, "step": 42095 }, { "epoch": 6.867862969004894, "grad_norm": 0.4481731951236725, "learning_rate": 4.1507580322742884e-05, "loss": 0.205, "num_input_tokens_seen": 90884752, "step": 42100 }, { "epoch": 6.868678629690049, "grad_norm": 0.08618056774139404, "learning_rate": 4.150490735629543e-05, "loss": 0.1514, "num_input_tokens_seen": 90894704, "step": 42105 }, { "epoch": 6.869494290375204, "grad_norm": 0.06623498350381851, "learning_rate": 4.150223405535509e-05, "loss": 0.1709, "num_input_tokens_seen": 90904528, "step": 42110 }, { "epoch": 6.870309951060359, "grad_norm": 0.729194700717926, "learning_rate": 4.149956041997604e-05, "loss": 0.0486, "num_input_tokens_seen": 90915696, "step": 42115 }, { "epoch": 6.871125611745514, "grad_norm": 0.3683704733848572, "learning_rate": 4.1496886450212466e-05, "loss": 0.0598, "num_input_tokens_seen": 90925776, "step": 42120 }, { "epoch": 6.871941272430669, "grad_norm": 1.2740342617034912, "learning_rate": 4.149421214611857e-05, "loss": 0.0413, "num_input_tokens_seen": 90936144, "step": 42125 }, { "epoch": 6.872756933115824, "grad_norm": 1.0959731340408325, "learning_rate": 4.1491537507748535e-05, "loss": 0.18, "num_input_tokens_seen": 90946480, "step": 42130 }, { "epoch": 6.873572593800979, "grad_norm": 1.555145025253296, "learning_rate": 4.148886253515657e-05, "loss": 0.0758, "num_input_tokens_seen": 90956720, "step": 42135 }, { "epoch": 6.874388254486134, "grad_norm": 0.2170177698135376, "learning_rate": 4.148618722839689e-05, "loss": 0.1091, "num_input_tokens_seen": 90967312, "step": 42140 }, { "epoch": 6.875203915171289, "grad_norm": 0.6877288222312927, "learning_rate": 4.14835115875237e-05, "loss": 0.2372, "num_input_tokens_seen": 90978000, "step": 42145 }, { "epoch": 6.876019575856444, "grad_norm": 0.8265016078948975, "learning_rate": 4.1480835612591256e-05, "loss": 0.0815, "num_input_tokens_seen": 90988304, "step": 42150 }, { "epoch": 6.876835236541599, "grad_norm": 0.9833165407180786, "learning_rate": 4.147815930365376e-05, "loss": 0.1354, "num_input_tokens_seen": 90998288, "step": 42155 }, { "epoch": 6.877650897226753, "grad_norm": 0.0524848997592926, "learning_rate": 4.1475482660765466e-05, "loss": 0.0738, "num_input_tokens_seen": 91009936, "step": 42160 }, { "epoch": 6.878466557911908, "grad_norm": 0.9924958944320679, "learning_rate": 4.147280568398061e-05, "loss": 0.0824, "num_input_tokens_seen": 91021456, "step": 42165 }, { "epoch": 6.879282218597064, "grad_norm": 0.2490752935409546, "learning_rate": 4.147012837335345e-05, "loss": 0.1899, "num_input_tokens_seen": 91032784, "step": 42170 }, { "epoch": 6.880097879282219, "grad_norm": 1.1796380281448364, "learning_rate": 4.146745072893825e-05, "loss": 0.1264, "num_input_tokens_seen": 91043952, "step": 42175 }, { "epoch": 6.8809135399673735, "grad_norm": 0.9605444073677063, "learning_rate": 4.146477275078927e-05, "loss": 0.1459, "num_input_tokens_seen": 91054704, "step": 42180 }, { "epoch": 6.881729200652528, "grad_norm": 0.3076046407222748, "learning_rate": 4.146209443896077e-05, "loss": 0.1746, "num_input_tokens_seen": 91064176, "step": 42185 }, { "epoch": 6.882544861337683, "grad_norm": 1.2473737001419067, "learning_rate": 4.1459415793507064e-05, "loss": 0.1215, "num_input_tokens_seen": 91074768, "step": 42190 }, { "epoch": 6.883360522022839, "grad_norm": 1.1845712661743164, "learning_rate": 4.14567368144824e-05, "loss": 0.1999, "num_input_tokens_seen": 91085520, "step": 42195 }, { "epoch": 6.884176182707994, "grad_norm": 1.163038969039917, "learning_rate": 4.1454057501941095e-05, "loss": 0.2832, "num_input_tokens_seen": 91094800, "step": 42200 }, { "epoch": 6.8849918433931485, "grad_norm": 0.5087058544158936, "learning_rate": 4.145137785593744e-05, "loss": 0.089, "num_input_tokens_seen": 91105840, "step": 42205 }, { "epoch": 6.885807504078303, "grad_norm": 0.08242414891719818, "learning_rate": 4.144869787652575e-05, "loss": 0.2183, "num_input_tokens_seen": 91116624, "step": 42210 }, { "epoch": 6.886623164763458, "grad_norm": 1.2503161430358887, "learning_rate": 4.144601756376032e-05, "loss": 0.1837, "num_input_tokens_seen": 91128176, "step": 42215 }, { "epoch": 6.887438825448614, "grad_norm": 0.3706887364387512, "learning_rate": 4.144333691769549e-05, "loss": 0.1647, "num_input_tokens_seen": 91138544, "step": 42220 }, { "epoch": 6.888254486133769, "grad_norm": 0.20723843574523926, "learning_rate": 4.144065593838557e-05, "loss": 0.1258, "num_input_tokens_seen": 91148624, "step": 42225 }, { "epoch": 6.8890701468189235, "grad_norm": 0.5743907690048218, "learning_rate": 4.1437974625884904e-05, "loss": 0.1408, "num_input_tokens_seen": 91158672, "step": 42230 }, { "epoch": 6.889885807504078, "grad_norm": 0.7585780024528503, "learning_rate": 4.143529298024782e-05, "loss": 0.0356, "num_input_tokens_seen": 91169648, "step": 42235 }, { "epoch": 6.890701468189233, "grad_norm": 0.40979093313217163, "learning_rate": 4.143261100152869e-05, "loss": 0.2036, "num_input_tokens_seen": 91180496, "step": 42240 }, { "epoch": 6.891517128874388, "grad_norm": 1.873368263244629, "learning_rate": 4.142992868978184e-05, "loss": 0.1551, "num_input_tokens_seen": 91191440, "step": 42245 }, { "epoch": 6.892332789559543, "grad_norm": 0.5884469151496887, "learning_rate": 4.142724604506164e-05, "loss": 0.1256, "num_input_tokens_seen": 91201584, "step": 42250 }, { "epoch": 6.8931484502446985, "grad_norm": 0.2944958209991455, "learning_rate": 4.142456306742247e-05, "loss": 0.031, "num_input_tokens_seen": 91213104, "step": 42255 }, { "epoch": 6.893964110929853, "grad_norm": 0.21510636806488037, "learning_rate": 4.142187975691869e-05, "loss": 0.0657, "num_input_tokens_seen": 91224560, "step": 42260 }, { "epoch": 6.894779771615008, "grad_norm": 0.08801030367612839, "learning_rate": 4.1419196113604676e-05, "loss": 0.11, "num_input_tokens_seen": 91235824, "step": 42265 }, { "epoch": 6.895595432300163, "grad_norm": 0.5000597834587097, "learning_rate": 4.141651213753483e-05, "loss": 0.0622, "num_input_tokens_seen": 91247600, "step": 42270 }, { "epoch": 6.896411092985318, "grad_norm": 0.24708358943462372, "learning_rate": 4.141382782876354e-05, "loss": 0.1396, "num_input_tokens_seen": 91259120, "step": 42275 }, { "epoch": 6.897226753670473, "grad_norm": 0.22647185623645782, "learning_rate": 4.141114318734521e-05, "loss": 0.0399, "num_input_tokens_seen": 91269040, "step": 42280 }, { "epoch": 6.898042414355628, "grad_norm": 1.765349268913269, "learning_rate": 4.140845821333424e-05, "loss": 0.0787, "num_input_tokens_seen": 91280208, "step": 42285 }, { "epoch": 6.898858075040783, "grad_norm": 0.12945522367954254, "learning_rate": 4.140577290678505e-05, "loss": 0.0358, "num_input_tokens_seen": 91291088, "step": 42290 }, { "epoch": 6.899673735725938, "grad_norm": 0.15010610222816467, "learning_rate": 4.140308726775206e-05, "loss": 0.0555, "num_input_tokens_seen": 91302032, "step": 42295 }, { "epoch": 6.900489396411093, "grad_norm": 0.10047396272420883, "learning_rate": 4.14004012962897e-05, "loss": 0.0573, "num_input_tokens_seen": 91313552, "step": 42300 }, { "epoch": 6.901305057096248, "grad_norm": 0.4421420693397522, "learning_rate": 4.1397714992452394e-05, "loss": 0.0555, "num_input_tokens_seen": 91323728, "step": 42305 }, { "epoch": 6.902120717781403, "grad_norm": 0.25737378001213074, "learning_rate": 4.13950283562946e-05, "loss": 0.1859, "num_input_tokens_seen": 91334960, "step": 42310 }, { "epoch": 6.902936378466558, "grad_norm": 0.051108893007040024, "learning_rate": 4.139234138787076e-05, "loss": 0.0387, "num_input_tokens_seen": 91346896, "step": 42315 }, { "epoch": 6.903752039151713, "grad_norm": 0.09498728066682816, "learning_rate": 4.138965408723532e-05, "loss": 0.1305, "num_input_tokens_seen": 91357168, "step": 42320 }, { "epoch": 6.904567699836868, "grad_norm": 0.7892846465110779, "learning_rate": 4.138696645444275e-05, "loss": 0.1361, "num_input_tokens_seen": 91368688, "step": 42325 }, { "epoch": 6.9053833605220225, "grad_norm": 0.4498803913593292, "learning_rate": 4.138427848954751e-05, "loss": 0.0751, "num_input_tokens_seen": 91379600, "step": 42330 }, { "epoch": 6.906199021207177, "grad_norm": 0.6590514779090881, "learning_rate": 4.1381590192604094e-05, "loss": 0.0808, "num_input_tokens_seen": 91389872, "step": 42335 }, { "epoch": 6.907014681892333, "grad_norm": 1.3116296529769897, "learning_rate": 4.137890156366696e-05, "loss": 0.1441, "num_input_tokens_seen": 91400944, "step": 42340 }, { "epoch": 6.907830342577488, "grad_norm": 1.0804290771484375, "learning_rate": 4.137621260279062e-05, "loss": 0.2331, "num_input_tokens_seen": 91412080, "step": 42345 }, { "epoch": 6.908646003262643, "grad_norm": 0.21697993576526642, "learning_rate": 4.137352331002955e-05, "loss": 0.063, "num_input_tokens_seen": 91423952, "step": 42350 }, { "epoch": 6.9094616639477975, "grad_norm": 0.5267625451087952, "learning_rate": 4.137083368543826e-05, "loss": 0.1795, "num_input_tokens_seen": 91434480, "step": 42355 }, { "epoch": 6.910277324632952, "grad_norm": 1.9844236373901367, "learning_rate": 4.136814372907125e-05, "loss": 0.2315, "num_input_tokens_seen": 91444464, "step": 42360 }, { "epoch": 6.911092985318108, "grad_norm": 0.2541223466396332, "learning_rate": 4.136545344098305e-05, "loss": 0.1536, "num_input_tokens_seen": 91453104, "step": 42365 }, { "epoch": 6.911908646003263, "grad_norm": 0.05659923702478409, "learning_rate": 4.1362762821228176e-05, "loss": 0.0449, "num_input_tokens_seen": 91463120, "step": 42370 }, { "epoch": 6.912724306688418, "grad_norm": 1.7733416557312012, "learning_rate": 4.136007186986115e-05, "loss": 0.0611, "num_input_tokens_seen": 91473104, "step": 42375 }, { "epoch": 6.9135399673735725, "grad_norm": 0.23543068766593933, "learning_rate": 4.1357380586936516e-05, "loss": 0.2568, "num_input_tokens_seen": 91484304, "step": 42380 }, { "epoch": 6.914355628058727, "grad_norm": 0.14639192819595337, "learning_rate": 4.1354688972508816e-05, "loss": 0.0899, "num_input_tokens_seen": 91495536, "step": 42385 }, { "epoch": 6.915171288743883, "grad_norm": 0.03934487700462341, "learning_rate": 4.135199702663259e-05, "loss": 0.064, "num_input_tokens_seen": 91505392, "step": 42390 }, { "epoch": 6.915986949429038, "grad_norm": 0.06383870542049408, "learning_rate": 4.1349304749362417e-05, "loss": 0.0349, "num_input_tokens_seen": 91517200, "step": 42395 }, { "epoch": 6.916802610114193, "grad_norm": 0.09340234845876694, "learning_rate": 4.134661214075283e-05, "loss": 0.0536, "num_input_tokens_seen": 91528592, "step": 42400 }, { "epoch": 6.917618270799347, "grad_norm": 0.2677341103553772, "learning_rate": 4.134391920085841e-05, "loss": 0.0425, "num_input_tokens_seen": 91539216, "step": 42405 }, { "epoch": 6.918433931484502, "grad_norm": 0.13916786015033722, "learning_rate": 4.134122592973374e-05, "loss": 0.2379, "num_input_tokens_seen": 91549616, "step": 42410 }, { "epoch": 6.919249592169657, "grad_norm": 0.17332658171653748, "learning_rate": 4.133853232743339e-05, "loss": 0.1068, "num_input_tokens_seen": 91560496, "step": 42415 }, { "epoch": 6.920065252854813, "grad_norm": 0.25876426696777344, "learning_rate": 4.1335838394011947e-05, "loss": 0.1214, "num_input_tokens_seen": 91571408, "step": 42420 }, { "epoch": 6.920880913539968, "grad_norm": 0.9561770558357239, "learning_rate": 4.133314412952403e-05, "loss": 0.0823, "num_input_tokens_seen": 91580496, "step": 42425 }, { "epoch": 6.921696574225122, "grad_norm": 0.5275673866271973, "learning_rate": 4.133044953402423e-05, "loss": 0.1647, "num_input_tokens_seen": 91591984, "step": 42430 }, { "epoch": 6.922512234910277, "grad_norm": 1.2248244285583496, "learning_rate": 4.132775460756714e-05, "loss": 0.1042, "num_input_tokens_seen": 91602672, "step": 42435 }, { "epoch": 6.923327895595432, "grad_norm": 2.009890079498291, "learning_rate": 4.1325059350207395e-05, "loss": 0.0633, "num_input_tokens_seen": 91612752, "step": 42440 }, { "epoch": 6.924143556280587, "grad_norm": 0.16793610155582428, "learning_rate": 4.1322363761999616e-05, "loss": 0.0554, "num_input_tokens_seen": 91623184, "step": 42445 }, { "epoch": 6.924959216965743, "grad_norm": 0.12217048555612564, "learning_rate": 4.1319667842998434e-05, "loss": 0.0663, "num_input_tokens_seen": 91633488, "step": 42450 }, { "epoch": 6.925774877650897, "grad_norm": 1.343083143234253, "learning_rate": 4.1316971593258466e-05, "loss": 0.2504, "num_input_tokens_seen": 91645456, "step": 42455 }, { "epoch": 6.926590538336052, "grad_norm": 0.20023740828037262, "learning_rate": 4.131427501283438e-05, "loss": 0.2126, "num_input_tokens_seen": 91656016, "step": 42460 }, { "epoch": 6.927406199021207, "grad_norm": 0.6084496974945068, "learning_rate": 4.131157810178081e-05, "loss": 0.1447, "num_input_tokens_seen": 91667728, "step": 42465 }, { "epoch": 6.928221859706362, "grad_norm": 0.12965354323387146, "learning_rate": 4.130888086015242e-05, "loss": 0.1048, "num_input_tokens_seen": 91679280, "step": 42470 }, { "epoch": 6.9290375203915175, "grad_norm": 0.4309539198875427, "learning_rate": 4.130618328800386e-05, "loss": 0.0596, "num_input_tokens_seen": 91690032, "step": 42475 }, { "epoch": 6.929853181076672, "grad_norm": 0.6926183700561523, "learning_rate": 4.130348538538982e-05, "loss": 0.1448, "num_input_tokens_seen": 91700944, "step": 42480 }, { "epoch": 6.930668841761827, "grad_norm": 0.46568572521209717, "learning_rate": 4.1300787152364965e-05, "loss": 0.1019, "num_input_tokens_seen": 91710096, "step": 42485 }, { "epoch": 6.931484502446982, "grad_norm": 0.5720492005348206, "learning_rate": 4.129808858898397e-05, "loss": 0.1934, "num_input_tokens_seen": 91719856, "step": 42490 }, { "epoch": 6.932300163132137, "grad_norm": 2.0766944885253906, "learning_rate": 4.129538969530155e-05, "loss": 0.2252, "num_input_tokens_seen": 91730448, "step": 42495 }, { "epoch": 6.933115823817292, "grad_norm": 1.7053112983703613, "learning_rate": 4.1292690471372376e-05, "loss": 0.1613, "num_input_tokens_seen": 91742256, "step": 42500 }, { "epoch": 6.933931484502447, "grad_norm": 1.8396285772323608, "learning_rate": 4.1289990917251164e-05, "loss": 0.1592, "num_input_tokens_seen": 91752848, "step": 42505 }, { "epoch": 6.934747145187602, "grad_norm": 0.9586758613586426, "learning_rate": 4.128729103299262e-05, "loss": 0.1691, "num_input_tokens_seen": 91764240, "step": 42510 }, { "epoch": 6.935562805872757, "grad_norm": 0.5406635403633118, "learning_rate": 4.1284590818651455e-05, "loss": 0.1314, "num_input_tokens_seen": 91775696, "step": 42515 }, { "epoch": 6.936378466557912, "grad_norm": 0.1940828263759613, "learning_rate": 4.128189027428241e-05, "loss": 0.0359, "num_input_tokens_seen": 91786576, "step": 42520 }, { "epoch": 6.937194127243067, "grad_norm": 0.3058643639087677, "learning_rate": 4.12791893999402e-05, "loss": 0.0661, "num_input_tokens_seen": 91796880, "step": 42525 }, { "epoch": 6.938009787928221, "grad_norm": 0.1716931015253067, "learning_rate": 4.1276488195679563e-05, "loss": 0.2036, "num_input_tokens_seen": 91807344, "step": 42530 }, { "epoch": 6.938825448613377, "grad_norm": 0.13462913036346436, "learning_rate": 4.127378666155525e-05, "loss": 0.0294, "num_input_tokens_seen": 91818640, "step": 42535 }, { "epoch": 6.939641109298532, "grad_norm": 0.20457877218723297, "learning_rate": 4.127108479762199e-05, "loss": 0.0478, "num_input_tokens_seen": 91829616, "step": 42540 }, { "epoch": 6.940456769983687, "grad_norm": 2.1486544609069824, "learning_rate": 4.1268382603934576e-05, "loss": 0.0804, "num_input_tokens_seen": 91839376, "step": 42545 }, { "epoch": 6.941272430668842, "grad_norm": 0.07943088561296463, "learning_rate": 4.1265680080547736e-05, "loss": 0.1428, "num_input_tokens_seen": 91849776, "step": 42550 }, { "epoch": 6.942088091353996, "grad_norm": 1.3676989078521729, "learning_rate": 4.1262977227516267e-05, "loss": 0.3183, "num_input_tokens_seen": 91861520, "step": 42555 }, { "epoch": 6.942903752039152, "grad_norm": 0.23984847962856293, "learning_rate": 4.126027404489492e-05, "loss": 0.0989, "num_input_tokens_seen": 91872880, "step": 42560 }, { "epoch": 6.943719412724307, "grad_norm": 0.6806470155715942, "learning_rate": 4.125757053273849e-05, "loss": 0.2049, "num_input_tokens_seen": 91884016, "step": 42565 }, { "epoch": 6.944535073409462, "grad_norm": 2.703220844268799, "learning_rate": 4.125486669110178e-05, "loss": 0.3365, "num_input_tokens_seen": 91895504, "step": 42570 }, { "epoch": 6.945350734094617, "grad_norm": 0.28853264451026917, "learning_rate": 4.125216252003957e-05, "loss": 0.0758, "num_input_tokens_seen": 91906288, "step": 42575 }, { "epoch": 6.946166394779771, "grad_norm": 1.4090224504470825, "learning_rate": 4.1249458019606675e-05, "loss": 0.1552, "num_input_tokens_seen": 91916752, "step": 42580 }, { "epoch": 6.946982055464927, "grad_norm": 1.146161675453186, "learning_rate": 4.124675318985789e-05, "loss": 0.1843, "num_input_tokens_seen": 91926992, "step": 42585 }, { "epoch": 6.947797716150082, "grad_norm": 0.1147821843624115, "learning_rate": 4.124404803084805e-05, "loss": 0.0908, "num_input_tokens_seen": 91935504, "step": 42590 }, { "epoch": 6.948613376835237, "grad_norm": 1.9903229475021362, "learning_rate": 4.124134254263197e-05, "loss": 0.3113, "num_input_tokens_seen": 91946320, "step": 42595 }, { "epoch": 6.9494290375203915, "grad_norm": 0.6707158088684082, "learning_rate": 4.123863672526447e-05, "loss": 0.0999, "num_input_tokens_seen": 91957008, "step": 42600 }, { "epoch": 6.950244698205546, "grad_norm": 1.2429769039154053, "learning_rate": 4.12359305788004e-05, "loss": 0.0736, "num_input_tokens_seen": 91967824, "step": 42605 }, { "epoch": 6.951060358890701, "grad_norm": 1.5787328481674194, "learning_rate": 4.1233224103294607e-05, "loss": 0.0803, "num_input_tokens_seen": 91979024, "step": 42610 }, { "epoch": 6.951876019575856, "grad_norm": 0.05125688388943672, "learning_rate": 4.123051729880193e-05, "loss": 0.1793, "num_input_tokens_seen": 91990288, "step": 42615 }, { "epoch": 6.952691680261012, "grad_norm": 0.3102165758609772, "learning_rate": 4.1227810165377226e-05, "loss": 0.1055, "num_input_tokens_seen": 92000592, "step": 42620 }, { "epoch": 6.9535073409461665, "grad_norm": 0.0538819245994091, "learning_rate": 4.1225102703075366e-05, "loss": 0.0952, "num_input_tokens_seen": 92009520, "step": 42625 }, { "epoch": 6.954323001631321, "grad_norm": 0.11607751995325089, "learning_rate": 4.122239491195122e-05, "loss": 0.0599, "num_input_tokens_seen": 92020848, "step": 42630 }, { "epoch": 6.955138662316476, "grad_norm": 2.1989989280700684, "learning_rate": 4.1219686792059665e-05, "loss": 0.1349, "num_input_tokens_seen": 92030384, "step": 42635 }, { "epoch": 6.955954323001631, "grad_norm": 1.2037887573242188, "learning_rate": 4.121697834345557e-05, "loss": 0.162, "num_input_tokens_seen": 92040528, "step": 42640 }, { "epoch": 6.956769983686787, "grad_norm": 0.31980037689208984, "learning_rate": 4.121426956619384e-05, "loss": 0.087, "num_input_tokens_seen": 92052368, "step": 42645 }, { "epoch": 6.9575856443719415, "grad_norm": 0.2893484830856323, "learning_rate": 4.121156046032937e-05, "loss": 0.1166, "num_input_tokens_seen": 92063152, "step": 42650 }, { "epoch": 6.958401305057096, "grad_norm": 0.6028510332107544, "learning_rate": 4.1208851025917064e-05, "loss": 0.0315, "num_input_tokens_seen": 92073296, "step": 42655 }, { "epoch": 6.959216965742251, "grad_norm": 0.9765540957450867, "learning_rate": 4.120614126301182e-05, "loss": 0.188, "num_input_tokens_seen": 92084176, "step": 42660 }, { "epoch": 6.960032626427406, "grad_norm": 1.4571518898010254, "learning_rate": 4.120343117166857e-05, "loss": 0.1354, "num_input_tokens_seen": 92095344, "step": 42665 }, { "epoch": 6.960848287112562, "grad_norm": 0.9786317348480225, "learning_rate": 4.120072075194223e-05, "loss": 0.0587, "num_input_tokens_seen": 92106384, "step": 42670 }, { "epoch": 6.9616639477977165, "grad_norm": 0.25055640935897827, "learning_rate": 4.119801000388774e-05, "loss": 0.0485, "num_input_tokens_seen": 92117104, "step": 42675 }, { "epoch": 6.962479608482871, "grad_norm": 0.46069151163101196, "learning_rate": 4.119529892756002e-05, "loss": 0.1112, "num_input_tokens_seen": 92127600, "step": 42680 }, { "epoch": 6.963295269168026, "grad_norm": 0.3226048946380615, "learning_rate": 4.1192587523014026e-05, "loss": 0.106, "num_input_tokens_seen": 92138448, "step": 42685 }, { "epoch": 6.964110929853181, "grad_norm": 0.2714746296405792, "learning_rate": 4.11898757903047e-05, "loss": 0.0161, "num_input_tokens_seen": 92148144, "step": 42690 }, { "epoch": 6.964926590538336, "grad_norm": 1.3453484773635864, "learning_rate": 4.118716372948701e-05, "loss": 0.3551, "num_input_tokens_seen": 92158640, "step": 42695 }, { "epoch": 6.9657422512234906, "grad_norm": 0.747111976146698, "learning_rate": 4.118445134061591e-05, "loss": 0.2237, "num_input_tokens_seen": 92169008, "step": 42700 }, { "epoch": 6.966557911908646, "grad_norm": 0.45422977209091187, "learning_rate": 4.118173862374637e-05, "loss": 0.0971, "num_input_tokens_seen": 92180432, "step": 42705 }, { "epoch": 6.967373572593801, "grad_norm": 0.5325521230697632, "learning_rate": 4.117902557893336e-05, "loss": 0.1547, "num_input_tokens_seen": 92191376, "step": 42710 }, { "epoch": 6.968189233278956, "grad_norm": 0.2492131143808365, "learning_rate": 4.117631220623188e-05, "loss": 0.0719, "num_input_tokens_seen": 92202256, "step": 42715 }, { "epoch": 6.969004893964111, "grad_norm": 0.6768286824226379, "learning_rate": 4.117359850569692e-05, "loss": 0.0193, "num_input_tokens_seen": 92212400, "step": 42720 }, { "epoch": 6.9698205546492655, "grad_norm": 1.2066516876220703, "learning_rate": 4.117088447738346e-05, "loss": 0.4311, "num_input_tokens_seen": 92223216, "step": 42725 }, { "epoch": 6.970636215334421, "grad_norm": 0.5230783224105835, "learning_rate": 4.116817012134651e-05, "loss": 0.0497, "num_input_tokens_seen": 92234928, "step": 42730 }, { "epoch": 6.971451876019576, "grad_norm": 0.7987650036811829, "learning_rate": 4.116545543764109e-05, "loss": 0.1447, "num_input_tokens_seen": 92245392, "step": 42735 }, { "epoch": 6.972267536704731, "grad_norm": 1.2477896213531494, "learning_rate": 4.11627404263222e-05, "loss": 0.1479, "num_input_tokens_seen": 92255824, "step": 42740 }, { "epoch": 6.973083197389886, "grad_norm": 0.22431635856628418, "learning_rate": 4.116002508744488e-05, "loss": 0.1265, "num_input_tokens_seen": 92265904, "step": 42745 }, { "epoch": 6.9738988580750405, "grad_norm": 0.06630650907754898, "learning_rate": 4.115730942106414e-05, "loss": 0.1098, "num_input_tokens_seen": 92276176, "step": 42750 }, { "epoch": 6.974714518760196, "grad_norm": 0.13222722709178925, "learning_rate": 4.115459342723503e-05, "loss": 0.0739, "num_input_tokens_seen": 92285744, "step": 42755 }, { "epoch": 6.975530179445351, "grad_norm": 0.5072905421257019, "learning_rate": 4.1151877106012596e-05, "loss": 0.0523, "num_input_tokens_seen": 92298128, "step": 42760 }, { "epoch": 6.976345840130506, "grad_norm": 0.0474415197968483, "learning_rate": 4.1149160457451886e-05, "loss": 0.0443, "num_input_tokens_seen": 92308752, "step": 42765 }, { "epoch": 6.977161500815661, "grad_norm": 0.5258273482322693, "learning_rate": 4.1146443481607945e-05, "loss": 0.1927, "num_input_tokens_seen": 92319632, "step": 42770 }, { "epoch": 6.9779771615008155, "grad_norm": 0.257369726896286, "learning_rate": 4.1143726178535844e-05, "loss": 0.0626, "num_input_tokens_seen": 92330576, "step": 42775 }, { "epoch": 6.97879282218597, "grad_norm": 1.0851033926010132, "learning_rate": 4.1141008548290654e-05, "loss": 0.0873, "num_input_tokens_seen": 92340304, "step": 42780 }, { "epoch": 6.979608482871125, "grad_norm": 0.30738314986228943, "learning_rate": 4.113829059092745e-05, "loss": 0.1869, "num_input_tokens_seen": 92350576, "step": 42785 }, { "epoch": 6.980424143556281, "grad_norm": 1.0426673889160156, "learning_rate": 4.1135572306501315e-05, "loss": 0.0817, "num_input_tokens_seen": 92359664, "step": 42790 }, { "epoch": 6.981239804241436, "grad_norm": 0.2152426689863205, "learning_rate": 4.113285369506733e-05, "loss": 0.1036, "num_input_tokens_seen": 92370032, "step": 42795 }, { "epoch": 6.9820554649265905, "grad_norm": 1.1355314254760742, "learning_rate": 4.113013475668061e-05, "loss": 0.1701, "num_input_tokens_seen": 92380976, "step": 42800 }, { "epoch": 6.982871125611745, "grad_norm": 0.42902863025665283, "learning_rate": 4.112741549139624e-05, "loss": 0.1195, "num_input_tokens_seen": 92393136, "step": 42805 }, { "epoch": 6.9836867862969, "grad_norm": 0.0978841558098793, "learning_rate": 4.1124695899269336e-05, "loss": 0.1329, "num_input_tokens_seen": 92404208, "step": 42810 }, { "epoch": 6.984502446982056, "grad_norm": 0.08448878675699234, "learning_rate": 4.1121975980355014e-05, "loss": 0.142, "num_input_tokens_seen": 92415152, "step": 42815 }, { "epoch": 6.985318107667211, "grad_norm": 0.1425817906856537, "learning_rate": 4.11192557347084e-05, "loss": 0.0259, "num_input_tokens_seen": 92426192, "step": 42820 }, { "epoch": 6.986133768352365, "grad_norm": 0.1943490356206894, "learning_rate": 4.111653516238462e-05, "loss": 0.0878, "num_input_tokens_seen": 92438064, "step": 42825 }, { "epoch": 6.98694942903752, "grad_norm": 0.8315006494522095, "learning_rate": 4.111381426343881e-05, "loss": 0.0893, "num_input_tokens_seen": 92447280, "step": 42830 }, { "epoch": 6.987765089722675, "grad_norm": 0.20431195199489594, "learning_rate": 4.1111093037926105e-05, "loss": 0.0589, "num_input_tokens_seen": 92458416, "step": 42835 }, { "epoch": 6.988580750407831, "grad_norm": 0.1757209300994873, "learning_rate": 4.110837148590167e-05, "loss": 0.086, "num_input_tokens_seen": 92469680, "step": 42840 }, { "epoch": 6.989396411092986, "grad_norm": 0.8324209451675415, "learning_rate": 4.110564960742065e-05, "loss": 0.0753, "num_input_tokens_seen": 92480592, "step": 42845 }, { "epoch": 6.99021207177814, "grad_norm": 0.13622471690177917, "learning_rate": 4.110292740253821e-05, "loss": 0.0865, "num_input_tokens_seen": 92492336, "step": 42850 }, { "epoch": 6.991027732463295, "grad_norm": 0.5990198850631714, "learning_rate": 4.110020487130951e-05, "loss": 0.0729, "num_input_tokens_seen": 92502608, "step": 42855 }, { "epoch": 6.99184339314845, "grad_norm": 0.8744381666183472, "learning_rate": 4.109748201378974e-05, "loss": 0.0594, "num_input_tokens_seen": 92513520, "step": 42860 }, { "epoch": 6.992659053833605, "grad_norm": 1.0876704454421997, "learning_rate": 4.109475883003408e-05, "loss": 0.0945, "num_input_tokens_seen": 92524656, "step": 42865 }, { "epoch": 6.993474714518761, "grad_norm": 0.6323155760765076, "learning_rate": 4.10920353200977e-05, "loss": 0.0541, "num_input_tokens_seen": 92535408, "step": 42870 }, { "epoch": 6.994290375203915, "grad_norm": 1.2883217334747314, "learning_rate": 4.108931148403582e-05, "loss": 0.0678, "num_input_tokens_seen": 92545744, "step": 42875 }, { "epoch": 6.99510603588907, "grad_norm": 1.8703725337982178, "learning_rate": 4.1086587321903635e-05, "loss": 0.0983, "num_input_tokens_seen": 92556432, "step": 42880 }, { "epoch": 6.995921696574225, "grad_norm": 1.3680534362792969, "learning_rate": 4.108386283375635e-05, "loss": 0.1001, "num_input_tokens_seen": 92568592, "step": 42885 }, { "epoch": 6.99673735725938, "grad_norm": 0.37934502959251404, "learning_rate": 4.108113801964918e-05, "loss": 0.046, "num_input_tokens_seen": 92579632, "step": 42890 }, { "epoch": 6.997553017944535, "grad_norm": 0.056825872510671616, "learning_rate": 4.107841287963734e-05, "loss": 0.0387, "num_input_tokens_seen": 92591344, "step": 42895 }, { "epoch": 6.99836867862969, "grad_norm": 0.0528714694082737, "learning_rate": 4.107568741377608e-05, "loss": 0.1511, "num_input_tokens_seen": 92601264, "step": 42900 }, { "epoch": 6.999184339314845, "grad_norm": 0.6498802900314331, "learning_rate": 4.107296162212061e-05, "loss": 0.1817, "num_input_tokens_seen": 92612464, "step": 42905 }, { "epoch": 7.0, "grad_norm": 0.3924463987350464, "learning_rate": 4.1070235504726185e-05, "loss": 0.0952, "num_input_tokens_seen": 92621824, "step": 42910 }, { "epoch": 7.0, "eval_loss": 0.14203979074954987, "eval_runtime": 90.5919, "eval_samples_per_second": 30.08, "eval_steps_per_second": 7.528, "num_input_tokens_seen": 92621824, "step": 42910 }, { "epoch": 7.000815660685155, "grad_norm": 0.28976792097091675, "learning_rate": 4.1067509061648045e-05, "loss": 0.1116, "num_input_tokens_seen": 92633024, "step": 42915 }, { "epoch": 7.00163132137031, "grad_norm": 0.20234088599681854, "learning_rate": 4.1064782292941464e-05, "loss": 0.114, "num_input_tokens_seen": 92644768, "step": 42920 }, { "epoch": 7.002446982055465, "grad_norm": 0.04275156930088997, "learning_rate": 4.106205519866168e-05, "loss": 0.0304, "num_input_tokens_seen": 92654240, "step": 42925 }, { "epoch": 7.00326264274062, "grad_norm": 0.4341692626476288, "learning_rate": 4.1059327778863975e-05, "loss": 0.0219, "num_input_tokens_seen": 92664288, "step": 42930 }, { "epoch": 7.004078303425775, "grad_norm": 0.02550877444446087, "learning_rate": 4.105660003360362e-05, "loss": 0.2026, "num_input_tokens_seen": 92675104, "step": 42935 }, { "epoch": 7.00489396411093, "grad_norm": 0.5572079420089722, "learning_rate": 4.10538719629359e-05, "loss": 0.0326, "num_input_tokens_seen": 92685056, "step": 42940 }, { "epoch": 7.005709624796085, "grad_norm": 1.3647472858428955, "learning_rate": 4.10511435669161e-05, "loss": 0.1451, "num_input_tokens_seen": 92696000, "step": 42945 }, { "epoch": 7.006525285481239, "grad_norm": 0.06144074723124504, "learning_rate": 4.1048414845599506e-05, "loss": 0.0791, "num_input_tokens_seen": 92706272, "step": 42950 }, { "epoch": 7.007340946166395, "grad_norm": 0.022631056606769562, "learning_rate": 4.104568579904142e-05, "loss": 0.2156, "num_input_tokens_seen": 92717120, "step": 42955 }, { "epoch": 7.00815660685155, "grad_norm": 0.045662831515073776, "learning_rate": 4.104295642729717e-05, "loss": 0.0262, "num_input_tokens_seen": 92726432, "step": 42960 }, { "epoch": 7.008972267536705, "grad_norm": 0.37548574805259705, "learning_rate": 4.104022673042205e-05, "loss": 0.1505, "num_input_tokens_seen": 92736896, "step": 42965 }, { "epoch": 7.00978792822186, "grad_norm": 0.3644719421863556, "learning_rate": 4.103749670847139e-05, "loss": 0.0812, "num_input_tokens_seen": 92747904, "step": 42970 }, { "epoch": 7.010603588907014, "grad_norm": 0.12273423373699188, "learning_rate": 4.103476636150051e-05, "loss": 0.1923, "num_input_tokens_seen": 92757920, "step": 42975 }, { "epoch": 7.011419249592169, "grad_norm": 0.027071300894021988, "learning_rate": 4.103203568956475e-05, "loss": 0.0636, "num_input_tokens_seen": 92769664, "step": 42980 }, { "epoch": 7.012234910277325, "grad_norm": 0.3042035400867462, "learning_rate": 4.102930469271945e-05, "loss": 0.0535, "num_input_tokens_seen": 92780256, "step": 42985 }, { "epoch": 7.01305057096248, "grad_norm": 1.1856006383895874, "learning_rate": 4.102657337101995e-05, "loss": 0.1714, "num_input_tokens_seen": 92791744, "step": 42990 }, { "epoch": 7.013866231647635, "grad_norm": 2.274740219116211, "learning_rate": 4.1023841724521616e-05, "loss": 0.0723, "num_input_tokens_seen": 92800928, "step": 42995 }, { "epoch": 7.014681892332789, "grad_norm": 0.22771218419075012, "learning_rate": 4.1021109753279794e-05, "loss": 0.0721, "num_input_tokens_seen": 92812352, "step": 43000 }, { "epoch": 7.015497553017944, "grad_norm": 0.1600140780210495, "learning_rate": 4.101837745734987e-05, "loss": 0.0742, "num_input_tokens_seen": 92822944, "step": 43005 }, { "epoch": 7.0163132137031, "grad_norm": 0.8103610277175903, "learning_rate": 4.101564483678719e-05, "loss": 0.1426, "num_input_tokens_seen": 92832960, "step": 43010 }, { "epoch": 7.017128874388255, "grad_norm": 0.18408167362213135, "learning_rate": 4.101291189164717e-05, "loss": 0.2075, "num_input_tokens_seen": 92843584, "step": 43015 }, { "epoch": 7.0179445350734095, "grad_norm": 0.18311148881912231, "learning_rate": 4.1010178621985166e-05, "loss": 0.0361, "num_input_tokens_seen": 92853440, "step": 43020 }, { "epoch": 7.018760195758564, "grad_norm": 0.6295896768569946, "learning_rate": 4.100744502785658e-05, "loss": 0.1708, "num_input_tokens_seen": 92862112, "step": 43025 }, { "epoch": 7.019575856443719, "grad_norm": 0.03312041237950325, "learning_rate": 4.100471110931682e-05, "loss": 0.0943, "num_input_tokens_seen": 92872000, "step": 43030 }, { "epoch": 7.020391517128874, "grad_norm": 0.24428068101406097, "learning_rate": 4.100197686642128e-05, "loss": 0.0381, "num_input_tokens_seen": 92881856, "step": 43035 }, { "epoch": 7.02120717781403, "grad_norm": 0.24372313916683197, "learning_rate": 4.099924229922538e-05, "loss": 0.0381, "num_input_tokens_seen": 92892768, "step": 43040 }, { "epoch": 7.0220228384991845, "grad_norm": 0.3145699203014374, "learning_rate": 4.0996507407784536e-05, "loss": 0.1847, "num_input_tokens_seen": 92904384, "step": 43045 }, { "epoch": 7.022838499184339, "grad_norm": 0.3654852509498596, "learning_rate": 4.099377219215418e-05, "loss": 0.1003, "num_input_tokens_seen": 92916160, "step": 43050 }, { "epoch": 7.023654159869494, "grad_norm": 2.1495392322540283, "learning_rate": 4.099103665238975e-05, "loss": 0.1641, "num_input_tokens_seen": 92924640, "step": 43055 }, { "epoch": 7.024469820554649, "grad_norm": 0.03262195363640785, "learning_rate": 4.0988300788546666e-05, "loss": 0.0333, "num_input_tokens_seen": 92935264, "step": 43060 }, { "epoch": 7.025285481239805, "grad_norm": 0.06413289904594421, "learning_rate": 4.0985564600680385e-05, "loss": 0.0345, "num_input_tokens_seen": 92947232, "step": 43065 }, { "epoch": 7.0261011419249595, "grad_norm": 0.07089027762413025, "learning_rate": 4.0982828088846356e-05, "loss": 0.1598, "num_input_tokens_seen": 92958816, "step": 43070 }, { "epoch": 7.026916802610114, "grad_norm": 1.5124577283859253, "learning_rate": 4.098009125310004e-05, "loss": 0.2812, "num_input_tokens_seen": 92969824, "step": 43075 }, { "epoch": 7.027732463295269, "grad_norm": 0.20687632262706757, "learning_rate": 4.0977354093496905e-05, "loss": 0.149, "num_input_tokens_seen": 92980928, "step": 43080 }, { "epoch": 7.028548123980424, "grad_norm": 0.8326447606086731, "learning_rate": 4.097461661009242e-05, "loss": 0.1837, "num_input_tokens_seen": 92992480, "step": 43085 }, { "epoch": 7.029363784665579, "grad_norm": 0.09071119129657745, "learning_rate": 4.097187880294206e-05, "loss": 0.1235, "num_input_tokens_seen": 93003232, "step": 43090 }, { "epoch": 7.0301794453507345, "grad_norm": 1.1633018255233765, "learning_rate": 4.096914067210133e-05, "loss": 0.1239, "num_input_tokens_seen": 93013952, "step": 43095 }, { "epoch": 7.030995106035889, "grad_norm": 1.4986851215362549, "learning_rate": 4.096640221762569e-05, "loss": 0.1606, "num_input_tokens_seen": 93024064, "step": 43100 }, { "epoch": 7.031810766721044, "grad_norm": 0.9991688132286072, "learning_rate": 4.0963663439570656e-05, "loss": 0.0586, "num_input_tokens_seen": 93036128, "step": 43105 }, { "epoch": 7.032626427406199, "grad_norm": 1.2265567779541016, "learning_rate": 4.0960924337991735e-05, "loss": 0.0829, "num_input_tokens_seen": 93047904, "step": 43110 }, { "epoch": 7.033442088091354, "grad_norm": 0.767280638217926, "learning_rate": 4.0958184912944436e-05, "loss": 0.2086, "num_input_tokens_seen": 93057568, "step": 43115 }, { "epoch": 7.034257748776509, "grad_norm": 0.469243586063385, "learning_rate": 4.095544516448427e-05, "loss": 0.0746, "num_input_tokens_seen": 93068192, "step": 43120 }, { "epoch": 7.035073409461664, "grad_norm": 0.0447370670735836, "learning_rate": 4.0952705092666765e-05, "loss": 0.0474, "num_input_tokens_seen": 93080032, "step": 43125 }, { "epoch": 7.035889070146819, "grad_norm": 0.44093501567840576, "learning_rate": 4.094996469754746e-05, "loss": 0.0743, "num_input_tokens_seen": 93090784, "step": 43130 }, { "epoch": 7.036704730831974, "grad_norm": 0.1907847672700882, "learning_rate": 4.094722397918189e-05, "loss": 0.0151, "num_input_tokens_seen": 93102176, "step": 43135 }, { "epoch": 7.037520391517129, "grad_norm": 0.25666874647140503, "learning_rate": 4.0944482937625586e-05, "loss": 0.0418, "num_input_tokens_seen": 93112928, "step": 43140 }, { "epoch": 7.0383360522022835, "grad_norm": 0.1660328507423401, "learning_rate": 4.0941741572934116e-05, "loss": 0.0555, "num_input_tokens_seen": 93123872, "step": 43145 }, { "epoch": 7.039151712887439, "grad_norm": 0.07088606804609299, "learning_rate": 4.093899988516303e-05, "loss": 0.1305, "num_input_tokens_seen": 93134368, "step": 43150 }, { "epoch": 7.039967373572594, "grad_norm": 1.4149539470672607, "learning_rate": 4.0936257874367885e-05, "loss": 0.1249, "num_input_tokens_seen": 93146720, "step": 43155 }, { "epoch": 7.040783034257749, "grad_norm": 1.3317880630493164, "learning_rate": 4.093351554060426e-05, "loss": 0.0877, "num_input_tokens_seen": 93157632, "step": 43160 }, { "epoch": 7.041598694942904, "grad_norm": 0.16687142848968506, "learning_rate": 4.0930772883927725e-05, "loss": 0.0634, "num_input_tokens_seen": 93167264, "step": 43165 }, { "epoch": 7.0424143556280585, "grad_norm": 1.4526561498641968, "learning_rate": 4.092802990439388e-05, "loss": 0.1641, "num_input_tokens_seen": 93179136, "step": 43170 }, { "epoch": 7.043230016313213, "grad_norm": 0.08946225047111511, "learning_rate": 4.0925286602058294e-05, "loss": 0.1484, "num_input_tokens_seen": 93189536, "step": 43175 }, { "epoch": 7.044045676998369, "grad_norm": 0.05669783428311348, "learning_rate": 4.092254297697657e-05, "loss": 0.0087, "num_input_tokens_seen": 93201568, "step": 43180 }, { "epoch": 7.044861337683524, "grad_norm": 0.03636821359395981, "learning_rate": 4.091979902920432e-05, "loss": 0.0902, "num_input_tokens_seen": 93212128, "step": 43185 }, { "epoch": 7.045676998368679, "grad_norm": 1.2748790979385376, "learning_rate": 4.091705475879715e-05, "loss": 0.3166, "num_input_tokens_seen": 93222784, "step": 43190 }, { "epoch": 7.0464926590538335, "grad_norm": 0.09669206291437149, "learning_rate": 4.091431016581066e-05, "loss": 0.0466, "num_input_tokens_seen": 93234016, "step": 43195 }, { "epoch": 7.047308319738988, "grad_norm": 2.0791797637939453, "learning_rate": 4.091156525030049e-05, "loss": 0.1659, "num_input_tokens_seen": 93243200, "step": 43200 }, { "epoch": 7.048123980424143, "grad_norm": 0.1605607122182846, "learning_rate": 4.090882001232227e-05, "loss": 0.014, "num_input_tokens_seen": 93254432, "step": 43205 }, { "epoch": 7.048939641109299, "grad_norm": 0.11674147099256516, "learning_rate": 4.090607445193163e-05, "loss": 0.0413, "num_input_tokens_seen": 93263360, "step": 43210 }, { "epoch": 7.049755301794454, "grad_norm": 0.04317712411284447, "learning_rate": 4.0903328569184214e-05, "loss": 0.0836, "num_input_tokens_seen": 93273376, "step": 43215 }, { "epoch": 7.0505709624796085, "grad_norm": 1.2195842266082764, "learning_rate": 4.0900582364135666e-05, "loss": 0.1157, "num_input_tokens_seen": 93284416, "step": 43220 }, { "epoch": 7.051386623164763, "grad_norm": 1.1530792713165283, "learning_rate": 4.0897835836841646e-05, "loss": 0.1062, "num_input_tokens_seen": 93295776, "step": 43225 }, { "epoch": 7.052202283849918, "grad_norm": 0.6885471343994141, "learning_rate": 4.089508898735781e-05, "loss": 0.1224, "num_input_tokens_seen": 93305408, "step": 43230 }, { "epoch": 7.053017944535074, "grad_norm": 2.4601619243621826, "learning_rate": 4.0892341815739844e-05, "loss": 0.3549, "num_input_tokens_seen": 93316192, "step": 43235 }, { "epoch": 7.053833605220229, "grad_norm": 0.4484296143054962, "learning_rate": 4.08895943220434e-05, "loss": 0.1399, "num_input_tokens_seen": 93327072, "step": 43240 }, { "epoch": 7.054649265905383, "grad_norm": 0.16447055339813232, "learning_rate": 4.088684650632417e-05, "loss": 0.0258, "num_input_tokens_seen": 93338368, "step": 43245 }, { "epoch": 7.055464926590538, "grad_norm": 0.5349028706550598, "learning_rate": 4.0884098368637844e-05, "loss": 0.0718, "num_input_tokens_seen": 93349952, "step": 43250 }, { "epoch": 7.056280587275693, "grad_norm": 1.529134750366211, "learning_rate": 4.0881349909040115e-05, "loss": 0.0948, "num_input_tokens_seen": 93360480, "step": 43255 }, { "epoch": 7.057096247960848, "grad_norm": 0.928591251373291, "learning_rate": 4.0878601127586684e-05, "loss": 0.0627, "num_input_tokens_seen": 93371104, "step": 43260 }, { "epoch": 7.057911908646004, "grad_norm": 0.47314146161079407, "learning_rate": 4.087585202433326e-05, "loss": 0.2204, "num_input_tokens_seen": 93381792, "step": 43265 }, { "epoch": 7.058727569331158, "grad_norm": 0.07175254076719284, "learning_rate": 4.087310259933554e-05, "loss": 0.0163, "num_input_tokens_seen": 93393024, "step": 43270 }, { "epoch": 7.059543230016313, "grad_norm": 0.057236310094594955, "learning_rate": 4.087035285264927e-05, "loss": 0.0931, "num_input_tokens_seen": 93403968, "step": 43275 }, { "epoch": 7.060358890701468, "grad_norm": 0.09200913459062576, "learning_rate": 4.0867602784330175e-05, "loss": 0.0403, "num_input_tokens_seen": 93414336, "step": 43280 }, { "epoch": 7.061174551386623, "grad_norm": 0.19901195168495178, "learning_rate": 4.0864852394433975e-05, "loss": 0.0282, "num_input_tokens_seen": 93425312, "step": 43285 }, { "epoch": 7.061990212071779, "grad_norm": 1.4791682958602905, "learning_rate": 4.086210168301641e-05, "loss": 0.1801, "num_input_tokens_seen": 93434368, "step": 43290 }, { "epoch": 7.062805872756933, "grad_norm": 1.9175097942352295, "learning_rate": 4.0859350650133234e-05, "loss": 0.1142, "num_input_tokens_seen": 93445696, "step": 43295 }, { "epoch": 7.063621533442088, "grad_norm": 0.5518743991851807, "learning_rate": 4.08565992958402e-05, "loss": 0.0702, "num_input_tokens_seen": 93456768, "step": 43300 }, { "epoch": 7.064437194127243, "grad_norm": 0.05479925498366356, "learning_rate": 4.0853847620193066e-05, "loss": 0.1054, "num_input_tokens_seen": 93468320, "step": 43305 }, { "epoch": 7.065252854812398, "grad_norm": 0.06526640802621841, "learning_rate": 4.0851095623247594e-05, "loss": 0.131, "num_input_tokens_seen": 93480832, "step": 43310 }, { "epoch": 7.066068515497553, "grad_norm": 0.0862571969628334, "learning_rate": 4.0848343305059564e-05, "loss": 0.0277, "num_input_tokens_seen": 93490880, "step": 43315 }, { "epoch": 7.066884176182708, "grad_norm": 0.028100566938519478, "learning_rate": 4.084559066568475e-05, "loss": 0.0092, "num_input_tokens_seen": 93501600, "step": 43320 }, { "epoch": 7.067699836867863, "grad_norm": 1.354984998703003, "learning_rate": 4.084283770517894e-05, "loss": 0.1372, "num_input_tokens_seen": 93512832, "step": 43325 }, { "epoch": 7.068515497553018, "grad_norm": 0.8097389936447144, "learning_rate": 4.084008442359792e-05, "loss": 0.0573, "num_input_tokens_seen": 93524416, "step": 43330 }, { "epoch": 7.069331158238173, "grad_norm": 0.04717355594038963, "learning_rate": 4.08373308209975e-05, "loss": 0.0504, "num_input_tokens_seen": 93535744, "step": 43335 }, { "epoch": 7.070146818923328, "grad_norm": 0.1590842306613922, "learning_rate": 4.0834576897433486e-05, "loss": 0.0357, "num_input_tokens_seen": 93546016, "step": 43340 }, { "epoch": 7.0709624796084825, "grad_norm": 1.2856779098510742, "learning_rate": 4.083182265296167e-05, "loss": 0.0842, "num_input_tokens_seen": 93555040, "step": 43345 }, { "epoch": 7.071778140293638, "grad_norm": 2.0701847076416016, "learning_rate": 4.0829068087637896e-05, "loss": 0.0845, "num_input_tokens_seen": 93565184, "step": 43350 }, { "epoch": 7.072593800978793, "grad_norm": 0.059858065098524094, "learning_rate": 4.0826313201517976e-05, "loss": 0.0432, "num_input_tokens_seen": 93576352, "step": 43355 }, { "epoch": 7.073409461663948, "grad_norm": 0.05784270167350769, "learning_rate": 4.0823557994657734e-05, "loss": 0.1094, "num_input_tokens_seen": 93586880, "step": 43360 }, { "epoch": 7.074225122349103, "grad_norm": 1.3456225395202637, "learning_rate": 4.0820802467113015e-05, "loss": 0.1278, "num_input_tokens_seen": 93598560, "step": 43365 }, { "epoch": 7.075040783034257, "grad_norm": 0.9685391187667847, "learning_rate": 4.0818046618939674e-05, "loss": 0.2288, "num_input_tokens_seen": 93608928, "step": 43370 }, { "epoch": 7.075856443719413, "grad_norm": 0.16240598261356354, "learning_rate": 4.081529045019355e-05, "loss": 0.2607, "num_input_tokens_seen": 93619552, "step": 43375 }, { "epoch": 7.076672104404568, "grad_norm": 0.9147653579711914, "learning_rate": 4.081253396093049e-05, "loss": 0.0373, "num_input_tokens_seen": 93631264, "step": 43380 }, { "epoch": 7.077487765089723, "grad_norm": 1.0005106925964355, "learning_rate": 4.0809777151206386e-05, "loss": 0.17, "num_input_tokens_seen": 93641792, "step": 43385 }, { "epoch": 7.078303425774878, "grad_norm": 0.06652077287435532, "learning_rate": 4.080702002107708e-05, "loss": 0.0758, "num_input_tokens_seen": 93652992, "step": 43390 }, { "epoch": 7.079119086460032, "grad_norm": 0.2532498240470886, "learning_rate": 4.080426257059846e-05, "loss": 0.0435, "num_input_tokens_seen": 93663552, "step": 43395 }, { "epoch": 7.079934747145187, "grad_norm": 0.5863286852836609, "learning_rate": 4.0801504799826415e-05, "loss": 0.1127, "num_input_tokens_seen": 93674272, "step": 43400 }, { "epoch": 7.080750407830343, "grad_norm": 0.11831016838550568, "learning_rate": 4.079874670881684e-05, "loss": 0.0126, "num_input_tokens_seen": 93685536, "step": 43405 }, { "epoch": 7.081566068515498, "grad_norm": 0.5160470008850098, "learning_rate": 4.0795988297625606e-05, "loss": 0.0801, "num_input_tokens_seen": 93696640, "step": 43410 }, { "epoch": 7.082381729200653, "grad_norm": 1.9541958570480347, "learning_rate": 4.079322956630863e-05, "loss": 0.1034, "num_input_tokens_seen": 93707008, "step": 43415 }, { "epoch": 7.083197389885807, "grad_norm": 0.5095996260643005, "learning_rate": 4.079047051492183e-05, "loss": 0.1056, "num_input_tokens_seen": 93717376, "step": 43420 }, { "epoch": 7.084013050570962, "grad_norm": 2.4830992221832275, "learning_rate": 4.0787711143521106e-05, "loss": 0.114, "num_input_tokens_seen": 93728032, "step": 43425 }, { "epoch": 7.084828711256117, "grad_norm": 0.04471273347735405, "learning_rate": 4.078495145216239e-05, "loss": 0.121, "num_input_tokens_seen": 93737792, "step": 43430 }, { "epoch": 7.085644371941273, "grad_norm": 0.29003721475601196, "learning_rate": 4.078219144090161e-05, "loss": 0.0336, "num_input_tokens_seen": 93749088, "step": 43435 }, { "epoch": 7.0864600326264275, "grad_norm": 0.5809717178344727, "learning_rate": 4.07794311097947e-05, "loss": 0.04, "num_input_tokens_seen": 93760512, "step": 43440 }, { "epoch": 7.087275693311582, "grad_norm": 0.7763233780860901, "learning_rate": 4.077667045889759e-05, "loss": 0.1594, "num_input_tokens_seen": 93771712, "step": 43445 }, { "epoch": 7.088091353996737, "grad_norm": 0.08372645080089569, "learning_rate": 4.077390948826625e-05, "loss": 0.0499, "num_input_tokens_seen": 93783008, "step": 43450 }, { "epoch": 7.088907014681892, "grad_norm": 0.8668458461761475, "learning_rate": 4.077114819795662e-05, "loss": 0.1124, "num_input_tokens_seen": 93792928, "step": 43455 }, { "epoch": 7.089722675367048, "grad_norm": 0.11359051614999771, "learning_rate": 4.0768386588024674e-05, "loss": 0.0273, "num_input_tokens_seen": 93803872, "step": 43460 }, { "epoch": 7.0905383360522025, "grad_norm": 0.5581191182136536, "learning_rate": 4.0765624658526355e-05, "loss": 0.0991, "num_input_tokens_seen": 93815584, "step": 43465 }, { "epoch": 7.091353996737357, "grad_norm": 0.9145686626434326, "learning_rate": 4.076286240951767e-05, "loss": 0.128, "num_input_tokens_seen": 93827040, "step": 43470 }, { "epoch": 7.092169657422512, "grad_norm": 0.07418882846832275, "learning_rate": 4.076009984105457e-05, "loss": 0.0848, "num_input_tokens_seen": 93836800, "step": 43475 }, { "epoch": 7.092985318107667, "grad_norm": 1.2059417963027954, "learning_rate": 4.075733695319305e-05, "loss": 0.1063, "num_input_tokens_seen": 93848032, "step": 43480 }, { "epoch": 7.093800978792822, "grad_norm": 1.480680227279663, "learning_rate": 4.075457374598912e-05, "loss": 0.2219, "num_input_tokens_seen": 93858720, "step": 43485 }, { "epoch": 7.0946166394779775, "grad_norm": 0.07320059090852737, "learning_rate": 4.0751810219498755e-05, "loss": 0.0524, "num_input_tokens_seen": 93869696, "step": 43490 }, { "epoch": 7.095432300163132, "grad_norm": 0.1453561633825302, "learning_rate": 4.074904637377798e-05, "loss": 0.1242, "num_input_tokens_seen": 93880672, "step": 43495 }, { "epoch": 7.096247960848287, "grad_norm": 1.0922526121139526, "learning_rate": 4.0746282208882794e-05, "loss": 0.0969, "num_input_tokens_seen": 93891840, "step": 43500 }, { "epoch": 7.097063621533442, "grad_norm": 0.05975661426782608, "learning_rate": 4.074351772486923e-05, "loss": 0.1475, "num_input_tokens_seen": 93900928, "step": 43505 }, { "epoch": 7.097879282218597, "grad_norm": 1.1086764335632324, "learning_rate": 4.07407529217933e-05, "loss": 0.1078, "num_input_tokens_seen": 93912480, "step": 43510 }, { "epoch": 7.0986949429037525, "grad_norm": 0.20874953269958496, "learning_rate": 4.073798779971105e-05, "loss": 0.0752, "num_input_tokens_seen": 93924192, "step": 43515 }, { "epoch": 7.099510603588907, "grad_norm": 1.017115831375122, "learning_rate": 4.073522235867852e-05, "loss": 0.0501, "num_input_tokens_seen": 93934304, "step": 43520 }, { "epoch": 7.100326264274062, "grad_norm": 0.4023149609565735, "learning_rate": 4.073245659875174e-05, "loss": 0.2255, "num_input_tokens_seen": 93944032, "step": 43525 }, { "epoch": 7.101141924959217, "grad_norm": 0.7287765145301819, "learning_rate": 4.072969051998676e-05, "loss": 0.0492, "num_input_tokens_seen": 93954304, "step": 43530 }, { "epoch": 7.101957585644372, "grad_norm": 0.042539726942777634, "learning_rate": 4.072692412243966e-05, "loss": 0.0794, "num_input_tokens_seen": 93964864, "step": 43535 }, { "epoch": 7.102773246329527, "grad_norm": 1.6749743223190308, "learning_rate": 4.072415740616648e-05, "loss": 0.1376, "num_input_tokens_seen": 93976448, "step": 43540 }, { "epoch": 7.103588907014682, "grad_norm": 0.6013262867927551, "learning_rate": 4.0721390371223315e-05, "loss": 0.0888, "num_input_tokens_seen": 93986848, "step": 43545 }, { "epoch": 7.104404567699837, "grad_norm": 0.6158692836761475, "learning_rate": 4.071862301766622e-05, "loss": 0.1103, "num_input_tokens_seen": 93997472, "step": 43550 }, { "epoch": 7.105220228384992, "grad_norm": 0.0416206456720829, "learning_rate": 4.071585534555129e-05, "loss": 0.0205, "num_input_tokens_seen": 94007744, "step": 43555 }, { "epoch": 7.106035889070147, "grad_norm": 1.716396450996399, "learning_rate": 4.071308735493462e-05, "loss": 0.357, "num_input_tokens_seen": 94018880, "step": 43560 }, { "epoch": 7.1068515497553015, "grad_norm": 0.04483853653073311, "learning_rate": 4.0710319045872297e-05, "loss": 0.0481, "num_input_tokens_seen": 94030112, "step": 43565 }, { "epoch": 7.107667210440456, "grad_norm": 0.507081151008606, "learning_rate": 4.070755041842043e-05, "loss": 0.0919, "num_input_tokens_seen": 94041120, "step": 43570 }, { "epoch": 7.108482871125612, "grad_norm": 0.601288914680481, "learning_rate": 4.0704781472635127e-05, "loss": 0.0652, "num_input_tokens_seen": 94052928, "step": 43575 }, { "epoch": 7.109298531810767, "grad_norm": 0.06023990362882614, "learning_rate": 4.07020122085725e-05, "loss": 0.061, "num_input_tokens_seen": 94064480, "step": 43580 }, { "epoch": 7.110114192495922, "grad_norm": 0.054590027779340744, "learning_rate": 4.069924262628869e-05, "loss": 0.0906, "num_input_tokens_seen": 94076064, "step": 43585 }, { "epoch": 7.1109298531810765, "grad_norm": 2.4533348083496094, "learning_rate": 4.06964727258398e-05, "loss": 0.1217, "num_input_tokens_seen": 94085216, "step": 43590 }, { "epoch": 7.111745513866231, "grad_norm": 3.0498592853546143, "learning_rate": 4.0693702507281986e-05, "loss": 0.2143, "num_input_tokens_seen": 94095904, "step": 43595 }, { "epoch": 7.112561174551387, "grad_norm": 1.0868034362792969, "learning_rate": 4.069093197067138e-05, "loss": 0.1273, "num_input_tokens_seen": 94108288, "step": 43600 }, { "epoch": 7.113376835236542, "grad_norm": 0.0563979335129261, "learning_rate": 4.068816111606412e-05, "loss": 0.0447, "num_input_tokens_seen": 94117440, "step": 43605 }, { "epoch": 7.114192495921697, "grad_norm": 0.030413487926125526, "learning_rate": 4.068538994351638e-05, "loss": 0.0868, "num_input_tokens_seen": 94127488, "step": 43610 }, { "epoch": 7.1150081566068515, "grad_norm": 0.9358881711959839, "learning_rate": 4.068261845308432e-05, "loss": 0.0888, "num_input_tokens_seen": 94137824, "step": 43615 }, { "epoch": 7.115823817292006, "grad_norm": 0.7439291477203369, "learning_rate": 4.06798466448241e-05, "loss": 0.2683, "num_input_tokens_seen": 94149632, "step": 43620 }, { "epoch": 7.116639477977161, "grad_norm": 1.1180161237716675, "learning_rate": 4.0677074518791904e-05, "loss": 0.2111, "num_input_tokens_seen": 94161184, "step": 43625 }, { "epoch": 7.117455138662317, "grad_norm": 0.06321403384208679, "learning_rate": 4.067430207504389e-05, "loss": 0.0616, "num_input_tokens_seen": 94170656, "step": 43630 }, { "epoch": 7.118270799347472, "grad_norm": 0.03369580954313278, "learning_rate": 4.067152931363627e-05, "loss": 0.0577, "num_input_tokens_seen": 94180384, "step": 43635 }, { "epoch": 7.1190864600326265, "grad_norm": 0.03507367521524429, "learning_rate": 4.0668756234625225e-05, "loss": 0.0204, "num_input_tokens_seen": 94191456, "step": 43640 }, { "epoch": 7.119902120717781, "grad_norm": 0.12038807570934296, "learning_rate": 4.066598283806695e-05, "loss": 0.1279, "num_input_tokens_seen": 94202112, "step": 43645 }, { "epoch": 7.120717781402936, "grad_norm": 1.8033629655838013, "learning_rate": 4.066320912401767e-05, "loss": 0.2045, "num_input_tokens_seen": 94213312, "step": 43650 }, { "epoch": 7.121533442088092, "grad_norm": 0.05784651264548302, "learning_rate": 4.066043509253358e-05, "loss": 0.1052, "num_input_tokens_seen": 94225248, "step": 43655 }, { "epoch": 7.122349102773247, "grad_norm": 1.9033442735671997, "learning_rate": 4.065766074367091e-05, "loss": 0.2152, "num_input_tokens_seen": 94236416, "step": 43660 }, { "epoch": 7.123164763458401, "grad_norm": 0.13096719980239868, "learning_rate": 4.065488607748589e-05, "loss": 0.0958, "num_input_tokens_seen": 94247040, "step": 43665 }, { "epoch": 7.123980424143556, "grad_norm": 0.7741528749465942, "learning_rate": 4.065211109403473e-05, "loss": 0.1112, "num_input_tokens_seen": 94257472, "step": 43670 }, { "epoch": 7.124796084828711, "grad_norm": 1.5050849914550781, "learning_rate": 4.064933579337369e-05, "loss": 0.1525, "num_input_tokens_seen": 94267776, "step": 43675 }, { "epoch": 7.125611745513866, "grad_norm": 1.3855477571487427, "learning_rate": 4.0646560175559e-05, "loss": 0.1199, "num_input_tokens_seen": 94279552, "step": 43680 }, { "epoch": 7.126427406199022, "grad_norm": 0.05044504255056381, "learning_rate": 4.0643784240646934e-05, "loss": 0.1347, "num_input_tokens_seen": 94291424, "step": 43685 }, { "epoch": 7.127243066884176, "grad_norm": 0.5085497498512268, "learning_rate": 4.064100798869372e-05, "loss": 0.0799, "num_input_tokens_seen": 94302080, "step": 43690 }, { "epoch": 7.128058727569331, "grad_norm": 0.7823203802108765, "learning_rate": 4.0638231419755645e-05, "loss": 0.1378, "num_input_tokens_seen": 94313440, "step": 43695 }, { "epoch": 7.128874388254486, "grad_norm": 0.4663481116294861, "learning_rate": 4.063545453388897e-05, "loss": 0.0554, "num_input_tokens_seen": 94322784, "step": 43700 }, { "epoch": 7.129690048939641, "grad_norm": 1.458263635635376, "learning_rate": 4.063267733114997e-05, "loss": 0.1336, "num_input_tokens_seen": 94333280, "step": 43705 }, { "epoch": 7.130505709624796, "grad_norm": 1.4033182859420776, "learning_rate": 4.062989981159494e-05, "loss": 0.0875, "num_input_tokens_seen": 94344032, "step": 43710 }, { "epoch": 7.131321370309951, "grad_norm": 0.41722890734672546, "learning_rate": 4.062712197528015e-05, "loss": 0.0987, "num_input_tokens_seen": 94354816, "step": 43715 }, { "epoch": 7.132137030995106, "grad_norm": 0.25542232394218445, "learning_rate": 4.0624343822261924e-05, "loss": 0.2399, "num_input_tokens_seen": 94364896, "step": 43720 }, { "epoch": 7.132952691680261, "grad_norm": 0.28986623883247375, "learning_rate": 4.0621565352596534e-05, "loss": 0.2839, "num_input_tokens_seen": 94375648, "step": 43725 }, { "epoch": 7.133768352365416, "grad_norm": 0.03213072568178177, "learning_rate": 4.0618786566340316e-05, "loss": 0.1909, "num_input_tokens_seen": 94386816, "step": 43730 }, { "epoch": 7.134584013050571, "grad_norm": 0.3538646399974823, "learning_rate": 4.061600746354957e-05, "loss": 0.0356, "num_input_tokens_seen": 94397856, "step": 43735 }, { "epoch": 7.135399673735726, "grad_norm": 1.8612194061279297, "learning_rate": 4.061322804428063e-05, "loss": 0.2078, "num_input_tokens_seen": 94408128, "step": 43740 }, { "epoch": 7.136215334420881, "grad_norm": 0.04821520298719406, "learning_rate": 4.061044830858981e-05, "loss": 0.1382, "num_input_tokens_seen": 94418912, "step": 43745 }, { "epoch": 7.137030995106036, "grad_norm": 0.31581589579582214, "learning_rate": 4.060766825653345e-05, "loss": 0.0418, "num_input_tokens_seen": 94428576, "step": 43750 }, { "epoch": 7.137846655791191, "grad_norm": 0.01894073747098446, "learning_rate": 4.060488788816789e-05, "loss": 0.3186, "num_input_tokens_seen": 94439232, "step": 43755 }, { "epoch": 7.138662316476346, "grad_norm": 0.5607459545135498, "learning_rate": 4.0602107203549486e-05, "loss": 0.0555, "num_input_tokens_seen": 94450144, "step": 43760 }, { "epoch": 7.1394779771615005, "grad_norm": 0.05276909098029137, "learning_rate": 4.059932620273459e-05, "loss": 0.0733, "num_input_tokens_seen": 94461664, "step": 43765 }, { "epoch": 7.140293637846656, "grad_norm": 0.351809024810791, "learning_rate": 4.059654488577955e-05, "loss": 0.0131, "num_input_tokens_seen": 94471392, "step": 43770 }, { "epoch": 7.141109298531811, "grad_norm": 0.3862272799015045, "learning_rate": 4.059376325274076e-05, "loss": 0.2273, "num_input_tokens_seen": 94482848, "step": 43775 }, { "epoch": 7.141924959216966, "grad_norm": 0.4154002070426941, "learning_rate": 4.059098130367456e-05, "loss": 0.0905, "num_input_tokens_seen": 94493792, "step": 43780 }, { "epoch": 7.142740619902121, "grad_norm": 0.8880160450935364, "learning_rate": 4.0588199038637356e-05, "loss": 0.0435, "num_input_tokens_seen": 94504448, "step": 43785 }, { "epoch": 7.143556280587275, "grad_norm": 0.5049669146537781, "learning_rate": 4.058541645768551e-05, "loss": 0.0952, "num_input_tokens_seen": 94515296, "step": 43790 }, { "epoch": 7.14437194127243, "grad_norm": 0.8555018901824951, "learning_rate": 4.058263356087544e-05, "loss": 0.0928, "num_input_tokens_seen": 94525984, "step": 43795 }, { "epoch": 7.145187601957586, "grad_norm": 0.31087884306907654, "learning_rate": 4.057985034826352e-05, "loss": 0.1458, "num_input_tokens_seen": 94536512, "step": 43800 }, { "epoch": 7.146003262642741, "grad_norm": 0.12663114070892334, "learning_rate": 4.0577066819906184e-05, "loss": 0.057, "num_input_tokens_seen": 94546208, "step": 43805 }, { "epoch": 7.146818923327896, "grad_norm": 0.15526893734931946, "learning_rate": 4.057428297585982e-05, "loss": 0.0306, "num_input_tokens_seen": 94558656, "step": 43810 }, { "epoch": 7.14763458401305, "grad_norm": 0.37145599722862244, "learning_rate": 4.0571498816180854e-05, "loss": 0.0325, "num_input_tokens_seen": 94568832, "step": 43815 }, { "epoch": 7.148450244698205, "grad_norm": 1.0492498874664307, "learning_rate": 4.056871434092571e-05, "loss": 0.1751, "num_input_tokens_seen": 94580512, "step": 43820 }, { "epoch": 7.149265905383361, "grad_norm": 0.8339369297027588, "learning_rate": 4.0565929550150816e-05, "loss": 0.1616, "num_input_tokens_seen": 94592608, "step": 43825 }, { "epoch": 7.150081566068516, "grad_norm": 1.381603717803955, "learning_rate": 4.056314444391262e-05, "loss": 0.1726, "num_input_tokens_seen": 94603200, "step": 43830 }, { "epoch": 7.150897226753671, "grad_norm": 0.5434371829032898, "learning_rate": 4.056035902226756e-05, "loss": 0.0585, "num_input_tokens_seen": 94614656, "step": 43835 }, { "epoch": 7.151712887438825, "grad_norm": 1.1635444164276123, "learning_rate": 4.0557573285272076e-05, "loss": 0.0815, "num_input_tokens_seen": 94624512, "step": 43840 }, { "epoch": 7.15252854812398, "grad_norm": 0.10864085704088211, "learning_rate": 4.055478723298264e-05, "loss": 0.1321, "num_input_tokens_seen": 94635360, "step": 43845 }, { "epoch": 7.153344208809135, "grad_norm": 0.04988052695989609, "learning_rate": 4.055200086545571e-05, "loss": 0.0191, "num_input_tokens_seen": 94645440, "step": 43850 }, { "epoch": 7.154159869494291, "grad_norm": 0.03793323040008545, "learning_rate": 4.0549214182747744e-05, "loss": 0.1868, "num_input_tokens_seen": 94656832, "step": 43855 }, { "epoch": 7.1549755301794455, "grad_norm": 0.16121035814285278, "learning_rate": 4.054642718491524e-05, "loss": 0.0208, "num_input_tokens_seen": 94666816, "step": 43860 }, { "epoch": 7.1557911908646, "grad_norm": 0.7574861645698547, "learning_rate": 4.054363987201465e-05, "loss": 0.2026, "num_input_tokens_seen": 94677888, "step": 43865 }, { "epoch": 7.156606851549755, "grad_norm": 1.0531988143920898, "learning_rate": 4.054085224410249e-05, "loss": 0.0838, "num_input_tokens_seen": 94689088, "step": 43870 }, { "epoch": 7.15742251223491, "grad_norm": 0.10111987590789795, "learning_rate": 4.0538064301235245e-05, "loss": 0.1552, "num_input_tokens_seen": 94700992, "step": 43875 }, { "epoch": 7.158238172920065, "grad_norm": 0.16033300757408142, "learning_rate": 4.053527604346941e-05, "loss": 0.0281, "num_input_tokens_seen": 94710112, "step": 43880 }, { "epoch": 7.1590538336052205, "grad_norm": 0.11318289488554001, "learning_rate": 4.0532487470861505e-05, "loss": 0.1479, "num_input_tokens_seen": 94720544, "step": 43885 }, { "epoch": 7.159869494290375, "grad_norm": 0.09621944278478622, "learning_rate": 4.0529698583468035e-05, "loss": 0.0521, "num_input_tokens_seen": 94730496, "step": 43890 }, { "epoch": 7.16068515497553, "grad_norm": 0.26476985216140747, "learning_rate": 4.052690938134553e-05, "loss": 0.1957, "num_input_tokens_seen": 94741696, "step": 43895 }, { "epoch": 7.161500815660685, "grad_norm": 0.2033979594707489, "learning_rate": 4.052411986455049e-05, "loss": 0.074, "num_input_tokens_seen": 94752000, "step": 43900 }, { "epoch": 7.16231647634584, "grad_norm": 0.4418167173862457, "learning_rate": 4.0521330033139476e-05, "loss": 0.0882, "num_input_tokens_seen": 94763264, "step": 43905 }, { "epoch": 7.1631321370309955, "grad_norm": 2.552973508834839, "learning_rate": 4.051853988716902e-05, "loss": 0.2527, "num_input_tokens_seen": 94773504, "step": 43910 }, { "epoch": 7.16394779771615, "grad_norm": 0.3670634329319, "learning_rate": 4.051574942669567e-05, "loss": 0.2161, "num_input_tokens_seen": 94783328, "step": 43915 }, { "epoch": 7.164763458401305, "grad_norm": 0.19594289362430573, "learning_rate": 4.0512958651775964e-05, "loss": 0.0127, "num_input_tokens_seen": 94795584, "step": 43920 }, { "epoch": 7.16557911908646, "grad_norm": 0.03903470188379288, "learning_rate": 4.051016756246648e-05, "loss": 0.1275, "num_input_tokens_seen": 94805408, "step": 43925 }, { "epoch": 7.166394779771615, "grad_norm": 0.9293474555015564, "learning_rate": 4.050737615882378e-05, "loss": 0.1051, "num_input_tokens_seen": 94817024, "step": 43930 }, { "epoch": 7.16721044045677, "grad_norm": 1.317966103553772, "learning_rate": 4.050458444090442e-05, "loss": 0.2104, "num_input_tokens_seen": 94828480, "step": 43935 }, { "epoch": 7.168026101141925, "grad_norm": 1.6664178371429443, "learning_rate": 4.050179240876499e-05, "loss": 0.1889, "num_input_tokens_seen": 94839552, "step": 43940 }, { "epoch": 7.16884176182708, "grad_norm": 0.09756238013505936, "learning_rate": 4.049900006246207e-05, "loss": 0.1074, "num_input_tokens_seen": 94851200, "step": 43945 }, { "epoch": 7.169657422512235, "grad_norm": 0.08359837532043457, "learning_rate": 4.049620740205226e-05, "loss": 0.1992, "num_input_tokens_seen": 94863744, "step": 43950 }, { "epoch": 7.17047308319739, "grad_norm": 0.17240044474601746, "learning_rate": 4.049341442759214e-05, "loss": 0.0952, "num_input_tokens_seen": 94873504, "step": 43955 }, { "epoch": 7.171288743882545, "grad_norm": 0.019175484776496887, "learning_rate": 4.049062113913832e-05, "loss": 0.0912, "num_input_tokens_seen": 94883232, "step": 43960 }, { "epoch": 7.1721044045677, "grad_norm": 0.4046456813812256, "learning_rate": 4.0487827536747414e-05, "loss": 0.0934, "num_input_tokens_seen": 94893984, "step": 43965 }, { "epoch": 7.172920065252855, "grad_norm": 0.11870085448026657, "learning_rate": 4.0485033620476035e-05, "loss": 0.1514, "num_input_tokens_seen": 94904960, "step": 43970 }, { "epoch": 7.17373572593801, "grad_norm": 1.7145805358886719, "learning_rate": 4.0482239390380806e-05, "loss": 0.1673, "num_input_tokens_seen": 94914656, "step": 43975 }, { "epoch": 7.174551386623165, "grad_norm": 0.40001213550567627, "learning_rate": 4.047944484651836e-05, "loss": 0.0867, "num_input_tokens_seen": 94925632, "step": 43980 }, { "epoch": 7.1753670473083195, "grad_norm": 0.3751975893974304, "learning_rate": 4.047664998894532e-05, "loss": 0.0984, "num_input_tokens_seen": 94937376, "step": 43985 }, { "epoch": 7.176182707993474, "grad_norm": 1.95479154586792, "learning_rate": 4.0473854817718336e-05, "loss": 0.1095, "num_input_tokens_seen": 94948608, "step": 43990 }, { "epoch": 7.17699836867863, "grad_norm": 0.0759594738483429, "learning_rate": 4.047105933289406e-05, "loss": 0.0504, "num_input_tokens_seen": 94958368, "step": 43995 }, { "epoch": 7.177814029363785, "grad_norm": 0.21713481843471527, "learning_rate": 4.046826353452914e-05, "loss": 0.0228, "num_input_tokens_seen": 94968576, "step": 44000 }, { "epoch": 7.17862969004894, "grad_norm": 1.3206610679626465, "learning_rate": 4.0465467422680226e-05, "loss": 0.1649, "num_input_tokens_seen": 94979552, "step": 44005 }, { "epoch": 7.1794453507340945, "grad_norm": 1.340448021888733, "learning_rate": 4.046267099740401e-05, "loss": 0.049, "num_input_tokens_seen": 94990656, "step": 44010 }, { "epoch": 7.180261011419249, "grad_norm": 0.32534259557724, "learning_rate": 4.045987425875714e-05, "loss": 0.1159, "num_input_tokens_seen": 95000480, "step": 44015 }, { "epoch": 7.181076672104404, "grad_norm": 0.15761801600456238, "learning_rate": 4.0457077206796304e-05, "loss": 0.0538, "num_input_tokens_seen": 95011584, "step": 44020 }, { "epoch": 7.18189233278956, "grad_norm": 0.05247490853071213, "learning_rate": 4.0454279841578197e-05, "loss": 0.1305, "num_input_tokens_seen": 95021824, "step": 44025 }, { "epoch": 7.182707993474715, "grad_norm": 0.31274253129959106, "learning_rate": 4.0451482163159506e-05, "loss": 0.0516, "num_input_tokens_seen": 95033696, "step": 44030 }, { "epoch": 7.1835236541598695, "grad_norm": 0.7448315620422363, "learning_rate": 4.0448684171596916e-05, "loss": 0.103, "num_input_tokens_seen": 95043936, "step": 44035 }, { "epoch": 7.184339314845024, "grad_norm": 3.861720561981201, "learning_rate": 4.044588586694715e-05, "loss": 0.0784, "num_input_tokens_seen": 95055040, "step": 44040 }, { "epoch": 7.185154975530179, "grad_norm": 1.0174751281738281, "learning_rate": 4.0443087249266906e-05, "loss": 0.1474, "num_input_tokens_seen": 95066368, "step": 44045 }, { "epoch": 7.185970636215335, "grad_norm": 1.1601581573486328, "learning_rate": 4.044028831861292e-05, "loss": 0.226, "num_input_tokens_seen": 95077728, "step": 44050 }, { "epoch": 7.18678629690049, "grad_norm": 0.350454717874527, "learning_rate": 4.04374890750419e-05, "loss": 0.1667, "num_input_tokens_seen": 95088160, "step": 44055 }, { "epoch": 7.1876019575856445, "grad_norm": 1.3872861862182617, "learning_rate": 4.0434689518610577e-05, "loss": 0.1597, "num_input_tokens_seen": 95099424, "step": 44060 }, { "epoch": 7.188417618270799, "grad_norm": 0.7073930501937866, "learning_rate": 4.0431889649375686e-05, "loss": 0.143, "num_input_tokens_seen": 95109600, "step": 44065 }, { "epoch": 7.189233278955954, "grad_norm": 0.15972256660461426, "learning_rate": 4.0429089467393985e-05, "loss": 0.1607, "num_input_tokens_seen": 95119424, "step": 44070 }, { "epoch": 7.190048939641109, "grad_norm": 1.3372445106506348, "learning_rate": 4.0426288972722205e-05, "loss": 0.0955, "num_input_tokens_seen": 95130368, "step": 44075 }, { "epoch": 7.190864600326265, "grad_norm": 0.8836774230003357, "learning_rate": 4.042348816541711e-05, "loss": 0.0437, "num_input_tokens_seen": 95142400, "step": 44080 }, { "epoch": 7.191680261011419, "grad_norm": 0.07579772174358368, "learning_rate": 4.042068704553546e-05, "loss": 0.2907, "num_input_tokens_seen": 95153472, "step": 44085 }, { "epoch": 7.192495921696574, "grad_norm": 0.8579636812210083, "learning_rate": 4.041788561313403e-05, "loss": 0.2222, "num_input_tokens_seen": 95163072, "step": 44090 }, { "epoch": 7.193311582381729, "grad_norm": 0.21370592713356018, "learning_rate": 4.041508386826959e-05, "loss": 0.0313, "num_input_tokens_seen": 95174656, "step": 44095 }, { "epoch": 7.194127243066884, "grad_norm": 0.5912361741065979, "learning_rate": 4.041228181099891e-05, "loss": 0.0604, "num_input_tokens_seen": 95184960, "step": 44100 }, { "epoch": 7.19494290375204, "grad_norm": 1.6125164031982422, "learning_rate": 4.04094794413788e-05, "loss": 0.1324, "num_input_tokens_seen": 95194848, "step": 44105 }, { "epoch": 7.195758564437194, "grad_norm": 1.5743303298950195, "learning_rate": 4.040667675946603e-05, "loss": 0.097, "num_input_tokens_seen": 95207072, "step": 44110 }, { "epoch": 7.196574225122349, "grad_norm": 0.03853422775864601, "learning_rate": 4.040387376531742e-05, "loss": 0.1724, "num_input_tokens_seen": 95217920, "step": 44115 }, { "epoch": 7.197389885807504, "grad_norm": 2.330207347869873, "learning_rate": 4.0401070458989766e-05, "loss": 0.1398, "num_input_tokens_seen": 95228224, "step": 44120 }, { "epoch": 7.198205546492659, "grad_norm": 0.25995495915412903, "learning_rate": 4.039826684053987e-05, "loss": 0.102, "num_input_tokens_seen": 95239008, "step": 44125 }, { "epoch": 7.199021207177814, "grad_norm": 1.166869044303894, "learning_rate": 4.039546291002457e-05, "loss": 0.3392, "num_input_tokens_seen": 95248800, "step": 44130 }, { "epoch": 7.199836867862969, "grad_norm": 0.14135529100894928, "learning_rate": 4.039265866750069e-05, "loss": 0.1661, "num_input_tokens_seen": 95259840, "step": 44135 }, { "epoch": 7.200652528548124, "grad_norm": 1.7439262866973877, "learning_rate": 4.038985411302505e-05, "loss": 0.1746, "num_input_tokens_seen": 95270848, "step": 44140 }, { "epoch": 7.201468189233279, "grad_norm": 0.0644741877913475, "learning_rate": 4.038704924665449e-05, "loss": 0.0614, "num_input_tokens_seen": 95281472, "step": 44145 }, { "epoch": 7.202283849918434, "grad_norm": 0.2022203654050827, "learning_rate": 4.038424406844586e-05, "loss": 0.2014, "num_input_tokens_seen": 95292160, "step": 44150 }, { "epoch": 7.203099510603589, "grad_norm": 0.9631451368331909, "learning_rate": 4.038143857845601e-05, "loss": 0.0628, "num_input_tokens_seen": 95302208, "step": 44155 }, { "epoch": 7.2039151712887435, "grad_norm": 0.2171817272901535, "learning_rate": 4.037863277674179e-05, "loss": 0.221, "num_input_tokens_seen": 95312032, "step": 44160 }, { "epoch": 7.204730831973899, "grad_norm": 0.7289314866065979, "learning_rate": 4.0375826663360065e-05, "loss": 0.1048, "num_input_tokens_seen": 95323744, "step": 44165 }, { "epoch": 7.205546492659054, "grad_norm": 0.7414669394493103, "learning_rate": 4.037302023836771e-05, "loss": 0.1509, "num_input_tokens_seen": 95333856, "step": 44170 }, { "epoch": 7.206362153344209, "grad_norm": 0.026991529390215874, "learning_rate": 4.0370213501821604e-05, "loss": 0.0709, "num_input_tokens_seen": 95344864, "step": 44175 }, { "epoch": 7.207177814029364, "grad_norm": 0.5269249677658081, "learning_rate": 4.0367406453778615e-05, "loss": 0.1086, "num_input_tokens_seen": 95355520, "step": 44180 }, { "epoch": 7.2079934747145185, "grad_norm": 0.24518489837646484, "learning_rate": 4.036459909429564e-05, "loss": 0.0307, "num_input_tokens_seen": 95366752, "step": 44185 }, { "epoch": 7.208809135399674, "grad_norm": 0.051408927887678146, "learning_rate": 4.0361791423429574e-05, "loss": 0.0651, "num_input_tokens_seen": 95378304, "step": 44190 }, { "epoch": 7.209624796084829, "grad_norm": 1.4894981384277344, "learning_rate": 4.035898344123732e-05, "loss": 0.2173, "num_input_tokens_seen": 95389024, "step": 44195 }, { "epoch": 7.210440456769984, "grad_norm": 0.15310344099998474, "learning_rate": 4.035617514777578e-05, "loss": 0.0214, "num_input_tokens_seen": 95398656, "step": 44200 }, { "epoch": 7.211256117455139, "grad_norm": 1.164002537727356, "learning_rate": 4.0353366543101865e-05, "loss": 0.1072, "num_input_tokens_seen": 95408160, "step": 44205 }, { "epoch": 7.212071778140293, "grad_norm": 0.6080202460289001, "learning_rate": 4.035055762727251e-05, "loss": 0.0557, "num_input_tokens_seen": 95419936, "step": 44210 }, { "epoch": 7.212887438825448, "grad_norm": 1.5122613906860352, "learning_rate": 4.034774840034462e-05, "loss": 0.1657, "num_input_tokens_seen": 95429952, "step": 44215 }, { "epoch": 7.213703099510604, "grad_norm": 0.12026864290237427, "learning_rate": 4.034493886237515e-05, "loss": 0.0461, "num_input_tokens_seen": 95441024, "step": 44220 }, { "epoch": 7.214518760195759, "grad_norm": 0.10231860727071762, "learning_rate": 4.034212901342102e-05, "loss": 0.0404, "num_input_tokens_seen": 95452448, "step": 44225 }, { "epoch": 7.215334420880914, "grad_norm": 0.07901457697153091, "learning_rate": 4.033931885353919e-05, "loss": 0.038, "num_input_tokens_seen": 95462432, "step": 44230 }, { "epoch": 7.216150081566068, "grad_norm": 0.31556782126426697, "learning_rate": 4.03365083827866e-05, "loss": 0.0473, "num_input_tokens_seen": 95473312, "step": 44235 }, { "epoch": 7.216965742251223, "grad_norm": 2.039837598800659, "learning_rate": 4.033369760122021e-05, "loss": 0.1488, "num_input_tokens_seen": 95482624, "step": 44240 }, { "epoch": 7.217781402936378, "grad_norm": 0.08270122110843658, "learning_rate": 4.033088650889698e-05, "loss": 0.0178, "num_input_tokens_seen": 95494304, "step": 44245 }, { "epoch": 7.218597063621534, "grad_norm": 0.09124062955379486, "learning_rate": 4.0328075105873897e-05, "loss": 0.0318, "num_input_tokens_seen": 95505024, "step": 44250 }, { "epoch": 7.219412724306689, "grad_norm": 0.3608414828777313, "learning_rate": 4.0325263392207924e-05, "loss": 0.0315, "num_input_tokens_seen": 95516224, "step": 44255 }, { "epoch": 7.220228384991843, "grad_norm": 0.10027620196342468, "learning_rate": 4.032245136795605e-05, "loss": 0.0256, "num_input_tokens_seen": 95527168, "step": 44260 }, { "epoch": 7.221044045676998, "grad_norm": 3.2020580768585205, "learning_rate": 4.031963903317526e-05, "loss": 0.1254, "num_input_tokens_seen": 95537152, "step": 44265 }, { "epoch": 7.221859706362153, "grad_norm": 1.7396881580352783, "learning_rate": 4.031682638792254e-05, "loss": 0.1697, "num_input_tokens_seen": 95549248, "step": 44270 }, { "epoch": 7.222675367047309, "grad_norm": 0.8351021409034729, "learning_rate": 4.031401343225491e-05, "loss": 0.1773, "num_input_tokens_seen": 95560704, "step": 44275 }, { "epoch": 7.2234910277324635, "grad_norm": 0.03372248634696007, "learning_rate": 4.031120016622938e-05, "loss": 0.1162, "num_input_tokens_seen": 95572864, "step": 44280 }, { "epoch": 7.224306688417618, "grad_norm": 1.3102623224258423, "learning_rate": 4.0308386589902933e-05, "loss": 0.1301, "num_input_tokens_seen": 95582944, "step": 44285 }, { "epoch": 7.225122349102773, "grad_norm": 0.05157173424959183, "learning_rate": 4.030557270333263e-05, "loss": 0.0547, "num_input_tokens_seen": 95593472, "step": 44290 }, { "epoch": 7.225938009787928, "grad_norm": 0.543393075466156, "learning_rate": 4.030275850657547e-05, "loss": 0.1658, "num_input_tokens_seen": 95604064, "step": 44295 }, { "epoch": 7.226753670473083, "grad_norm": 0.06518827378749847, "learning_rate": 4.02999439996885e-05, "loss": 0.0319, "num_input_tokens_seen": 95616192, "step": 44300 }, { "epoch": 7.2275693311582385, "grad_norm": 0.7236268520355225, "learning_rate": 4.0297129182728754e-05, "loss": 0.0846, "num_input_tokens_seen": 95626976, "step": 44305 }, { "epoch": 7.228384991843393, "grad_norm": 0.14383181929588318, "learning_rate": 4.0294314055753284e-05, "loss": 0.0403, "num_input_tokens_seen": 95635328, "step": 44310 }, { "epoch": 7.229200652528548, "grad_norm": 1.0165363550186157, "learning_rate": 4.029149861881914e-05, "loss": 0.1685, "num_input_tokens_seen": 95645248, "step": 44315 }, { "epoch": 7.230016313213703, "grad_norm": 0.8645735383033752, "learning_rate": 4.028868287198337e-05, "loss": 0.1859, "num_input_tokens_seen": 95656928, "step": 44320 }, { "epoch": 7.230831973898858, "grad_norm": 0.11128101497888565, "learning_rate": 4.0285866815303046e-05, "loss": 0.0081, "num_input_tokens_seen": 95667200, "step": 44325 }, { "epoch": 7.231647634584013, "grad_norm": 0.4005562365055084, "learning_rate": 4.028305044883525e-05, "loss": 0.1232, "num_input_tokens_seen": 95677248, "step": 44330 }, { "epoch": 7.232463295269168, "grad_norm": 0.07440701872110367, "learning_rate": 4.028023377263703e-05, "loss": 0.1896, "num_input_tokens_seen": 95687104, "step": 44335 }, { "epoch": 7.233278955954323, "grad_norm": 0.9389535784721375, "learning_rate": 4.02774167867655e-05, "loss": 0.1538, "num_input_tokens_seen": 95697792, "step": 44340 }, { "epoch": 7.234094616639478, "grad_norm": 0.11223983019590378, "learning_rate": 4.027459949127774e-05, "loss": 0.1155, "num_input_tokens_seen": 95708800, "step": 44345 }, { "epoch": 7.234910277324633, "grad_norm": 0.04016692191362381, "learning_rate": 4.0271781886230845e-05, "loss": 0.0531, "num_input_tokens_seen": 95719296, "step": 44350 }, { "epoch": 7.235725938009788, "grad_norm": 0.49647125601768494, "learning_rate": 4.0268963971681905e-05, "loss": 0.0727, "num_input_tokens_seen": 95730336, "step": 44355 }, { "epoch": 7.236541598694943, "grad_norm": 0.15650160610675812, "learning_rate": 4.026614574768806e-05, "loss": 0.2163, "num_input_tokens_seen": 95741760, "step": 44360 }, { "epoch": 7.237357259380098, "grad_norm": 0.09235084056854248, "learning_rate": 4.026332721430639e-05, "loss": 0.1923, "num_input_tokens_seen": 95752896, "step": 44365 }, { "epoch": 7.238172920065253, "grad_norm": 1.558906078338623, "learning_rate": 4.026050837159403e-05, "loss": 0.2739, "num_input_tokens_seen": 95763968, "step": 44370 }, { "epoch": 7.238988580750408, "grad_norm": 0.3005921542644501, "learning_rate": 4.0257689219608114e-05, "loss": 0.0618, "num_input_tokens_seen": 95774528, "step": 44375 }, { "epoch": 7.239804241435563, "grad_norm": 0.9217392206192017, "learning_rate": 4.0254869758405764e-05, "loss": 0.0649, "num_input_tokens_seen": 95785600, "step": 44380 }, { "epoch": 7.240619902120717, "grad_norm": 1.0175803899765015, "learning_rate": 4.025204998804414e-05, "loss": 0.1711, "num_input_tokens_seen": 95795104, "step": 44385 }, { "epoch": 7.241435562805873, "grad_norm": 0.1277313232421875, "learning_rate": 4.024922990858036e-05, "loss": 0.0693, "num_input_tokens_seen": 95807104, "step": 44390 }, { "epoch": 7.242251223491028, "grad_norm": 0.17999213933944702, "learning_rate": 4.0246409520071595e-05, "loss": 0.0879, "num_input_tokens_seen": 95816832, "step": 44395 }, { "epoch": 7.243066884176183, "grad_norm": 0.37073853611946106, "learning_rate": 4.0243588822574995e-05, "loss": 0.0511, "num_input_tokens_seen": 95826688, "step": 44400 }, { "epoch": 7.2438825448613375, "grad_norm": 0.09674109518527985, "learning_rate": 4.024076781614774e-05, "loss": 0.0516, "num_input_tokens_seen": 95835904, "step": 44405 }, { "epoch": 7.244698205546492, "grad_norm": 1.1976195573806763, "learning_rate": 4.023794650084699e-05, "loss": 0.2184, "num_input_tokens_seen": 95847136, "step": 44410 }, { "epoch": 7.245513866231648, "grad_norm": 0.16555610299110413, "learning_rate": 4.023512487672991e-05, "loss": 0.1673, "num_input_tokens_seen": 95858368, "step": 44415 }, { "epoch": 7.246329526916803, "grad_norm": 0.06996014714241028, "learning_rate": 4.0232302943853716e-05, "loss": 0.1613, "num_input_tokens_seen": 95868544, "step": 44420 }, { "epoch": 7.247145187601958, "grad_norm": 0.09738392382860184, "learning_rate": 4.022948070227557e-05, "loss": 0.0506, "num_input_tokens_seen": 95878944, "step": 44425 }, { "epoch": 7.2479608482871125, "grad_norm": 1.562113642692566, "learning_rate": 4.022665815205268e-05, "loss": 0.079, "num_input_tokens_seen": 95889792, "step": 44430 }, { "epoch": 7.248776508972267, "grad_norm": 0.1715397834777832, "learning_rate": 4.022383529324224e-05, "loss": 0.1086, "num_input_tokens_seen": 95900896, "step": 44435 }, { "epoch": 7.249592169657422, "grad_norm": 1.4438376426696777, "learning_rate": 4.022101212590147e-05, "loss": 0.1815, "num_input_tokens_seen": 95911808, "step": 44440 }, { "epoch": 7.250407830342578, "grad_norm": 0.2560165822505951, "learning_rate": 4.0218188650087576e-05, "loss": 0.0857, "num_input_tokens_seen": 95921696, "step": 44445 }, { "epoch": 7.251223491027733, "grad_norm": 0.23987318575382233, "learning_rate": 4.021536486585778e-05, "loss": 0.055, "num_input_tokens_seen": 95931104, "step": 44450 }, { "epoch": 7.2520391517128875, "grad_norm": 0.2156541347503662, "learning_rate": 4.0212540773269325e-05, "loss": 0.2173, "num_input_tokens_seen": 95941376, "step": 44455 }, { "epoch": 7.252854812398042, "grad_norm": 0.5168347954750061, "learning_rate": 4.020971637237943e-05, "loss": 0.0509, "num_input_tokens_seen": 95950496, "step": 44460 }, { "epoch": 7.253670473083197, "grad_norm": 1.583856463432312, "learning_rate": 4.0206891663245336e-05, "loss": 0.1133, "num_input_tokens_seen": 95961824, "step": 44465 }, { "epoch": 7.254486133768353, "grad_norm": 0.6059013605117798, "learning_rate": 4.020406664592429e-05, "loss": 0.1036, "num_input_tokens_seen": 95972768, "step": 44470 }, { "epoch": 7.255301794453508, "grad_norm": 1.1557574272155762, "learning_rate": 4.020124132047355e-05, "loss": 0.0798, "num_input_tokens_seen": 95985024, "step": 44475 }, { "epoch": 7.2561174551386625, "grad_norm": 0.18098124861717224, "learning_rate": 4.019841568695037e-05, "loss": 0.0629, "num_input_tokens_seen": 95996768, "step": 44480 }, { "epoch": 7.256933115823817, "grad_norm": 1.0972263813018799, "learning_rate": 4.019558974541201e-05, "loss": 0.1184, "num_input_tokens_seen": 96007328, "step": 44485 }, { "epoch": 7.257748776508972, "grad_norm": 0.13015137612819672, "learning_rate": 4.019276349591574e-05, "loss": 0.0522, "num_input_tokens_seen": 96017472, "step": 44490 }, { "epoch": 7.258564437194127, "grad_norm": 1.674820065498352, "learning_rate": 4.018993693851886e-05, "loss": 0.0732, "num_input_tokens_seen": 96026048, "step": 44495 }, { "epoch": 7.259380097879283, "grad_norm": 0.10057791322469711, "learning_rate": 4.018711007327863e-05, "loss": 0.0814, "num_input_tokens_seen": 96037024, "step": 44500 }, { "epoch": 7.260195758564437, "grad_norm": 0.5228774547576904, "learning_rate": 4.0184282900252344e-05, "loss": 0.0731, "num_input_tokens_seen": 96048640, "step": 44505 }, { "epoch": 7.261011419249592, "grad_norm": 1.3121931552886963, "learning_rate": 4.018145541949731e-05, "loss": 0.1179, "num_input_tokens_seen": 96059968, "step": 44510 }, { "epoch": 7.261827079934747, "grad_norm": 0.2322540283203125, "learning_rate": 4.017862763107082e-05, "loss": 0.1771, "num_input_tokens_seen": 96070592, "step": 44515 }, { "epoch": 7.262642740619902, "grad_norm": 0.8603408336639404, "learning_rate": 4.0175799535030184e-05, "loss": 0.0692, "num_input_tokens_seen": 96080768, "step": 44520 }, { "epoch": 7.263458401305057, "grad_norm": 0.30722156167030334, "learning_rate": 4.0172971131432724e-05, "loss": 0.1318, "num_input_tokens_seen": 96091584, "step": 44525 }, { "epoch": 7.264274061990212, "grad_norm": 1.1669600009918213, "learning_rate": 4.017014242033575e-05, "loss": 0.0434, "num_input_tokens_seen": 96103264, "step": 44530 }, { "epoch": 7.265089722675367, "grad_norm": 0.12110728025436401, "learning_rate": 4.01673134017966e-05, "loss": 0.3089, "num_input_tokens_seen": 96112992, "step": 44535 }, { "epoch": 7.265905383360522, "grad_norm": 0.6883626580238342, "learning_rate": 4.01644840758726e-05, "loss": 0.1212, "num_input_tokens_seen": 96124480, "step": 44540 }, { "epoch": 7.266721044045677, "grad_norm": 1.2585667371749878, "learning_rate": 4.016165444262109e-05, "loss": 0.1226, "num_input_tokens_seen": 96134848, "step": 44545 }, { "epoch": 7.267536704730832, "grad_norm": 0.3260308504104614, "learning_rate": 4.0158824502099425e-05, "loss": 0.0542, "num_input_tokens_seen": 96146176, "step": 44550 }, { "epoch": 7.268352365415987, "grad_norm": 0.19283966720104218, "learning_rate": 4.0155994254364946e-05, "loss": 0.0893, "num_input_tokens_seen": 96157824, "step": 44555 }, { "epoch": 7.269168026101142, "grad_norm": 0.033184394240379333, "learning_rate": 4.015316369947502e-05, "loss": 0.0413, "num_input_tokens_seen": 96168992, "step": 44560 }, { "epoch": 7.269983686786297, "grad_norm": 1.3164883852005005, "learning_rate": 4.0150332837487e-05, "loss": 0.0779, "num_input_tokens_seen": 96179712, "step": 44565 }, { "epoch": 7.270799347471452, "grad_norm": 1.1365199089050293, "learning_rate": 4.014750166845828e-05, "loss": 0.1231, "num_input_tokens_seen": 96190656, "step": 44570 }, { "epoch": 7.271615008156607, "grad_norm": 1.4118789434432983, "learning_rate": 4.014467019244622e-05, "loss": 0.0499, "num_input_tokens_seen": 96201120, "step": 44575 }, { "epoch": 7.2724306688417615, "grad_norm": 0.14112740755081177, "learning_rate": 4.0141838409508195e-05, "loss": 0.0339, "num_input_tokens_seen": 96210304, "step": 44580 }, { "epoch": 7.273246329526917, "grad_norm": 0.19778509438037872, "learning_rate": 4.013900631970161e-05, "loss": 0.2061, "num_input_tokens_seen": 96221472, "step": 44585 }, { "epoch": 7.274061990212072, "grad_norm": 0.040645238012075424, "learning_rate": 4.0136173923083865e-05, "loss": 0.035, "num_input_tokens_seen": 96232736, "step": 44590 }, { "epoch": 7.274877650897227, "grad_norm": 0.4699402153491974, "learning_rate": 4.0133341219712354e-05, "loss": 0.1806, "num_input_tokens_seen": 96244352, "step": 44595 }, { "epoch": 7.275693311582382, "grad_norm": 0.0572596900165081, "learning_rate": 4.0130508209644476e-05, "loss": 0.083, "num_input_tokens_seen": 96254240, "step": 44600 }, { "epoch": 7.2765089722675365, "grad_norm": 0.5049307346343994, "learning_rate": 4.0127674892937655e-05, "loss": 0.0312, "num_input_tokens_seen": 96265696, "step": 44605 }, { "epoch": 7.277324632952691, "grad_norm": 2.052891492843628, "learning_rate": 4.012484126964931e-05, "loss": 0.23, "num_input_tokens_seen": 96277664, "step": 44610 }, { "epoch": 7.278140293637847, "grad_norm": 1.162994384765625, "learning_rate": 4.0122007339836875e-05, "loss": 0.0635, "num_input_tokens_seen": 96288448, "step": 44615 }, { "epoch": 7.278955954323002, "grad_norm": 0.9428924322128296, "learning_rate": 4.0119173103557774e-05, "loss": 0.063, "num_input_tokens_seen": 96298432, "step": 44620 }, { "epoch": 7.279771615008157, "grad_norm": 0.013280157931149006, "learning_rate": 4.011633856086946e-05, "loss": 0.0741, "num_input_tokens_seen": 96309568, "step": 44625 }, { "epoch": 7.280587275693311, "grad_norm": 0.10480548441410065, "learning_rate": 4.011350371182936e-05, "loss": 0.0109, "num_input_tokens_seen": 96319968, "step": 44630 }, { "epoch": 7.281402936378466, "grad_norm": 0.22526274621486664, "learning_rate": 4.0110668556494935e-05, "loss": 0.0443, "num_input_tokens_seen": 96331232, "step": 44635 }, { "epoch": 7.282218597063622, "grad_norm": 1.0440008640289307, "learning_rate": 4.010783309492364e-05, "loss": 0.1291, "num_input_tokens_seen": 96341696, "step": 44640 }, { "epoch": 7.283034257748777, "grad_norm": 0.10698949545621872, "learning_rate": 4.010499732717294e-05, "loss": 0.1331, "num_input_tokens_seen": 96352160, "step": 44645 }, { "epoch": 7.283849918433932, "grad_norm": 0.536747395992279, "learning_rate": 4.010216125330032e-05, "loss": 0.066, "num_input_tokens_seen": 96362592, "step": 44650 }, { "epoch": 7.284665579119086, "grad_norm": 0.9081218838691711, "learning_rate": 4.0099324873363234e-05, "loss": 0.0772, "num_input_tokens_seen": 96375424, "step": 44655 }, { "epoch": 7.285481239804241, "grad_norm": 0.6035945415496826, "learning_rate": 4.009648818741918e-05, "loss": 0.2048, "num_input_tokens_seen": 96387296, "step": 44660 }, { "epoch": 7.286296900489396, "grad_norm": 1.3140681982040405, "learning_rate": 4.009365119552564e-05, "loss": 0.142, "num_input_tokens_seen": 96397632, "step": 44665 }, { "epoch": 7.287112561174552, "grad_norm": 0.1015821248292923, "learning_rate": 4.0090813897740095e-05, "loss": 0.0396, "num_input_tokens_seen": 96408096, "step": 44670 }, { "epoch": 7.287928221859707, "grad_norm": 0.01935841701924801, "learning_rate": 4.008797629412008e-05, "loss": 0.0159, "num_input_tokens_seen": 96417408, "step": 44675 }, { "epoch": 7.288743882544861, "grad_norm": 0.5755485892295837, "learning_rate": 4.008513838472309e-05, "loss": 0.1462, "num_input_tokens_seen": 96427328, "step": 44680 }, { "epoch": 7.289559543230016, "grad_norm": 0.20079916715621948, "learning_rate": 4.008230016960662e-05, "loss": 0.0342, "num_input_tokens_seen": 96437920, "step": 44685 }, { "epoch": 7.290375203915171, "grad_norm": 0.39247414469718933, "learning_rate": 4.007946164882821e-05, "loss": 0.0471, "num_input_tokens_seen": 96447872, "step": 44690 }, { "epoch": 7.291190864600326, "grad_norm": 0.03261252120137215, "learning_rate": 4.007662282244538e-05, "loss": 0.0191, "num_input_tokens_seen": 96458592, "step": 44695 }, { "epoch": 7.2920065252854815, "grad_norm": 0.12857235968112946, "learning_rate": 4.007378369051566e-05, "loss": 0.0942, "num_input_tokens_seen": 96469312, "step": 44700 }, { "epoch": 7.292822185970636, "grad_norm": 0.08966502547264099, "learning_rate": 4.0070944253096585e-05, "loss": 0.0283, "num_input_tokens_seen": 96478976, "step": 44705 }, { "epoch": 7.293637846655791, "grad_norm": 0.2901844382286072, "learning_rate": 4.0068104510245716e-05, "loss": 0.0783, "num_input_tokens_seen": 96489952, "step": 44710 }, { "epoch": 7.294453507340946, "grad_norm": 0.018718799576163292, "learning_rate": 4.006526446202059e-05, "loss": 0.1274, "num_input_tokens_seen": 96500672, "step": 44715 }, { "epoch": 7.295269168026101, "grad_norm": 2.250539779663086, "learning_rate": 4.006242410847877e-05, "loss": 0.299, "num_input_tokens_seen": 96511488, "step": 44720 }, { "epoch": 7.2960848287112565, "grad_norm": 0.11576388776302338, "learning_rate": 4.005958344967782e-05, "loss": 0.1837, "num_input_tokens_seen": 96522336, "step": 44725 }, { "epoch": 7.296900489396411, "grad_norm": 0.26303574442863464, "learning_rate": 4.00567424856753e-05, "loss": 0.3268, "num_input_tokens_seen": 96533088, "step": 44730 }, { "epoch": 7.297716150081566, "grad_norm": 0.5008658170700073, "learning_rate": 4.0053901216528796e-05, "loss": 0.0294, "num_input_tokens_seen": 96543552, "step": 44735 }, { "epoch": 7.298531810766721, "grad_norm": 0.06703486293554306, "learning_rate": 4.005105964229588e-05, "loss": 0.2312, "num_input_tokens_seen": 96554432, "step": 44740 }, { "epoch": 7.299347471451876, "grad_norm": 1.758281946182251, "learning_rate": 4.0048217763034154e-05, "loss": 0.1121, "num_input_tokens_seen": 96565536, "step": 44745 }, { "epoch": 7.300163132137031, "grad_norm": 0.9294658303260803, "learning_rate": 4.0045375578801214e-05, "loss": 0.0919, "num_input_tokens_seen": 96576576, "step": 44750 }, { "epoch": 7.300978792822186, "grad_norm": 1.4359904527664185, "learning_rate": 4.0042533089654634e-05, "loss": 0.068, "num_input_tokens_seen": 96588640, "step": 44755 }, { "epoch": 7.301794453507341, "grad_norm": 0.05241534486413002, "learning_rate": 4.003969029565205e-05, "loss": 0.1118, "num_input_tokens_seen": 96600160, "step": 44760 }, { "epoch": 7.302610114192496, "grad_norm": 0.2932645082473755, "learning_rate": 4.003684719685106e-05, "loss": 0.2807, "num_input_tokens_seen": 96610944, "step": 44765 }, { "epoch": 7.303425774877651, "grad_norm": 1.616828203201294, "learning_rate": 4.0034003793309284e-05, "loss": 0.1748, "num_input_tokens_seen": 96620960, "step": 44770 }, { "epoch": 7.304241435562806, "grad_norm": 0.06565353274345398, "learning_rate": 4.003116008508435e-05, "loss": 0.215, "num_input_tokens_seen": 96630592, "step": 44775 }, { "epoch": 7.30505709624796, "grad_norm": 0.034916892647743225, "learning_rate": 4.002831607223389e-05, "loss": 0.1303, "num_input_tokens_seen": 96642368, "step": 44780 }, { "epoch": 7.305872756933116, "grad_norm": 0.4039407968521118, "learning_rate": 4.0025471754815534e-05, "loss": 0.0284, "num_input_tokens_seen": 96654464, "step": 44785 }, { "epoch": 7.306688417618271, "grad_norm": 1.1718066930770874, "learning_rate": 4.002262713288694e-05, "loss": 0.1941, "num_input_tokens_seen": 96664992, "step": 44790 }, { "epoch": 7.307504078303426, "grad_norm": 0.18337342143058777, "learning_rate": 4.001978220650575e-05, "loss": 0.1173, "num_input_tokens_seen": 96675168, "step": 44795 }, { "epoch": 7.308319738988581, "grad_norm": 2.7113826274871826, "learning_rate": 4.0016936975729614e-05, "loss": 0.1306, "num_input_tokens_seen": 96684992, "step": 44800 }, { "epoch": 7.309135399673735, "grad_norm": 0.5888152718544006, "learning_rate": 4.0014091440616194e-05, "loss": 0.0243, "num_input_tokens_seen": 96695808, "step": 44805 }, { "epoch": 7.309951060358891, "grad_norm": 0.8000777959823608, "learning_rate": 4.001124560122318e-05, "loss": 0.2439, "num_input_tokens_seen": 96707840, "step": 44810 }, { "epoch": 7.310766721044046, "grad_norm": 1.7644994258880615, "learning_rate": 4.0008399457608213e-05, "loss": 0.1357, "num_input_tokens_seen": 96718784, "step": 44815 }, { "epoch": 7.311582381729201, "grad_norm": 0.11208020895719528, "learning_rate": 4.0005553009829e-05, "loss": 0.0211, "num_input_tokens_seen": 96729376, "step": 44820 }, { "epoch": 7.3123980424143555, "grad_norm": 0.2401978075504303, "learning_rate": 4.000270625794322e-05, "loss": 0.0499, "num_input_tokens_seen": 96740640, "step": 44825 }, { "epoch": 7.31321370309951, "grad_norm": 0.1351584792137146, "learning_rate": 3.999985920200857e-05, "loss": 0.0618, "num_input_tokens_seen": 96751104, "step": 44830 }, { "epoch": 7.314029363784665, "grad_norm": 0.43766286969184875, "learning_rate": 3.999701184208274e-05, "loss": 0.2154, "num_input_tokens_seen": 96760672, "step": 44835 }, { "epoch": 7.314845024469821, "grad_norm": 0.23553282022476196, "learning_rate": 3.9994164178223436e-05, "loss": 0.0465, "num_input_tokens_seen": 96771840, "step": 44840 }, { "epoch": 7.315660685154976, "grad_norm": 0.07670659571886063, "learning_rate": 3.999131621048838e-05, "loss": 0.1487, "num_input_tokens_seen": 96782592, "step": 44845 }, { "epoch": 7.3164763458401305, "grad_norm": 0.04002559557557106, "learning_rate": 3.998846793893528e-05, "loss": 0.1745, "num_input_tokens_seen": 96794016, "step": 44850 }, { "epoch": 7.317292006525285, "grad_norm": 0.07756166905164719, "learning_rate": 3.998561936362186e-05, "loss": 0.2027, "num_input_tokens_seen": 96803968, "step": 44855 }, { "epoch": 7.31810766721044, "grad_norm": 0.5212578177452087, "learning_rate": 3.9982770484605856e-05, "loss": 0.1624, "num_input_tokens_seen": 96814304, "step": 44860 }, { "epoch": 7.318923327895595, "grad_norm": 0.07130423933267593, "learning_rate": 3.9979921301945e-05, "loss": 0.0287, "num_input_tokens_seen": 96824128, "step": 44865 }, { "epoch": 7.319738988580751, "grad_norm": 0.050062395632267, "learning_rate": 3.9977071815697046e-05, "loss": 0.1292, "num_input_tokens_seen": 96835712, "step": 44870 }, { "epoch": 7.3205546492659055, "grad_norm": 0.4159732460975647, "learning_rate": 3.9974222025919724e-05, "loss": 0.0222, "num_input_tokens_seen": 96846272, "step": 44875 }, { "epoch": 7.32137030995106, "grad_norm": 0.11116896569728851, "learning_rate": 3.997137193267079e-05, "loss": 0.0775, "num_input_tokens_seen": 96857408, "step": 44880 }, { "epoch": 7.322185970636215, "grad_norm": 0.2901705503463745, "learning_rate": 3.996852153600802e-05, "loss": 0.1574, "num_input_tokens_seen": 96868032, "step": 44885 }, { "epoch": 7.32300163132137, "grad_norm": 0.056141000241041183, "learning_rate": 3.996567083598917e-05, "loss": 0.1617, "num_input_tokens_seen": 96879136, "step": 44890 }, { "epoch": 7.323817292006526, "grad_norm": 0.06958694756031036, "learning_rate": 3.9962819832672013e-05, "loss": 0.0876, "num_input_tokens_seen": 96890016, "step": 44895 }, { "epoch": 7.3246329526916805, "grad_norm": 0.18048927187919617, "learning_rate": 3.995996852611433e-05, "loss": 0.0714, "num_input_tokens_seen": 96900160, "step": 44900 }, { "epoch": 7.325448613376835, "grad_norm": 1.048250675201416, "learning_rate": 3.995711691637391e-05, "loss": 0.1168, "num_input_tokens_seen": 96909856, "step": 44905 }, { "epoch": 7.32626427406199, "grad_norm": 0.255473256111145, "learning_rate": 3.995426500350854e-05, "loss": 0.0454, "num_input_tokens_seen": 96921280, "step": 44910 }, { "epoch": 7.327079934747145, "grad_norm": 1.8658117055892944, "learning_rate": 3.995141278757602e-05, "loss": 0.1099, "num_input_tokens_seen": 96931136, "step": 44915 }, { "epoch": 7.327895595432301, "grad_norm": 0.041980139911174774, "learning_rate": 3.994856026863415e-05, "loss": 0.0279, "num_input_tokens_seen": 96942112, "step": 44920 }, { "epoch": 7.328711256117455, "grad_norm": 0.7605543732643127, "learning_rate": 3.994570744674074e-05, "loss": 0.0785, "num_input_tokens_seen": 96953664, "step": 44925 }, { "epoch": 7.32952691680261, "grad_norm": 0.3705054521560669, "learning_rate": 3.994285432195361e-05, "loss": 0.1253, "num_input_tokens_seen": 96964800, "step": 44930 }, { "epoch": 7.330342577487765, "grad_norm": 0.25303590297698975, "learning_rate": 3.994000089433059e-05, "loss": 0.0664, "num_input_tokens_seen": 96975680, "step": 44935 }, { "epoch": 7.33115823817292, "grad_norm": 0.32479310035705566, "learning_rate": 3.993714716392949e-05, "loss": 0.1128, "num_input_tokens_seen": 96986496, "step": 44940 }, { "epoch": 7.331973898858075, "grad_norm": 0.4025759696960449, "learning_rate": 3.9934293130808154e-05, "loss": 0.0288, "num_input_tokens_seen": 96997184, "step": 44945 }, { "epoch": 7.33278955954323, "grad_norm": 0.30651503801345825, "learning_rate": 3.9931438795024426e-05, "loss": 0.0231, "num_input_tokens_seen": 97008416, "step": 44950 }, { "epoch": 7.333605220228385, "grad_norm": 0.07650361955165863, "learning_rate": 3.992858415663615e-05, "loss": 0.055, "num_input_tokens_seen": 97020000, "step": 44955 }, { "epoch": 7.33442088091354, "grad_norm": 0.3174686133861542, "learning_rate": 3.992572921570117e-05, "loss": 0.0792, "num_input_tokens_seen": 97030752, "step": 44960 }, { "epoch": 7.335236541598695, "grad_norm": 0.21605920791625977, "learning_rate": 3.992287397227736e-05, "loss": 0.3141, "num_input_tokens_seen": 97040416, "step": 44965 }, { "epoch": 7.33605220228385, "grad_norm": 0.29162314534187317, "learning_rate": 3.9920018426422576e-05, "loss": 0.1875, "num_input_tokens_seen": 97051936, "step": 44970 }, { "epoch": 7.3368678629690045, "grad_norm": 0.9711532592773438, "learning_rate": 3.991716257819469e-05, "loss": 0.1573, "num_input_tokens_seen": 97060864, "step": 44975 }, { "epoch": 7.33768352365416, "grad_norm": 1.0058388710021973, "learning_rate": 3.991430642765158e-05, "loss": 0.1036, "num_input_tokens_seen": 97073376, "step": 44980 }, { "epoch": 7.338499184339315, "grad_norm": 1.593953013420105, "learning_rate": 3.991144997485113e-05, "loss": 0.0668, "num_input_tokens_seen": 97085696, "step": 44985 }, { "epoch": 7.33931484502447, "grad_norm": 0.05650465562939644, "learning_rate": 3.990859321985123e-05, "loss": 0.0738, "num_input_tokens_seen": 97097856, "step": 44990 }, { "epoch": 7.340130505709625, "grad_norm": 0.06789512187242508, "learning_rate": 3.990573616270978e-05, "loss": 0.1021, "num_input_tokens_seen": 97109280, "step": 44995 }, { "epoch": 7.3409461663947795, "grad_norm": 0.10615452378988266, "learning_rate": 3.990287880348468e-05, "loss": 0.1732, "num_input_tokens_seen": 97121728, "step": 45000 }, { "epoch": 7.341761827079935, "grad_norm": 0.0786074846982956, "learning_rate": 3.9900021142233825e-05, "loss": 0.0357, "num_input_tokens_seen": 97132480, "step": 45005 }, { "epoch": 7.34257748776509, "grad_norm": 0.10851925611495972, "learning_rate": 3.989716317901515e-05, "loss": 0.0192, "num_input_tokens_seen": 97143904, "step": 45010 }, { "epoch": 7.343393148450245, "grad_norm": 0.1274745762348175, "learning_rate": 3.9894304913886554e-05, "loss": 0.0794, "num_input_tokens_seen": 97154816, "step": 45015 }, { "epoch": 7.3442088091354, "grad_norm": 1.5581477880477905, "learning_rate": 3.9891446346905984e-05, "loss": 0.0777, "num_input_tokens_seen": 97165664, "step": 45020 }, { "epoch": 7.3450244698205545, "grad_norm": 0.1536228209733963, "learning_rate": 3.9888587478131356e-05, "loss": 0.0972, "num_input_tokens_seen": 97174976, "step": 45025 }, { "epoch": 7.345840130505709, "grad_norm": 0.9065752625465393, "learning_rate": 3.9885728307620615e-05, "loss": 0.0758, "num_input_tokens_seen": 97186720, "step": 45030 }, { "epoch": 7.346655791190865, "grad_norm": 1.0585486888885498, "learning_rate": 3.98828688354317e-05, "loss": 0.0766, "num_input_tokens_seen": 97196608, "step": 45035 }, { "epoch": 7.34747145187602, "grad_norm": 0.6917701959609985, "learning_rate": 3.988000906162258e-05, "loss": 0.0452, "num_input_tokens_seen": 97208352, "step": 45040 }, { "epoch": 7.348287112561175, "grad_norm": 0.42190617322921753, "learning_rate": 3.98771489862512e-05, "loss": 0.0568, "num_input_tokens_seen": 97218048, "step": 45045 }, { "epoch": 7.349102773246329, "grad_norm": 0.26578086614608765, "learning_rate": 3.987428860937552e-05, "loss": 0.1102, "num_input_tokens_seen": 97229920, "step": 45050 }, { "epoch": 7.349918433931484, "grad_norm": 0.8838286995887756, "learning_rate": 3.9871427931053506e-05, "loss": 0.0473, "num_input_tokens_seen": 97242816, "step": 45055 }, { "epoch": 7.350734094616639, "grad_norm": 0.23054292798042297, "learning_rate": 3.9868566951343144e-05, "loss": 0.0525, "num_input_tokens_seen": 97254656, "step": 45060 }, { "epoch": 7.351549755301795, "grad_norm": 0.05210962891578674, "learning_rate": 3.986570567030241e-05, "loss": 0.1481, "num_input_tokens_seen": 97266432, "step": 45065 }, { "epoch": 7.35236541598695, "grad_norm": 1.5220245122909546, "learning_rate": 3.986284408798929e-05, "loss": 0.1791, "num_input_tokens_seen": 97278080, "step": 45070 }, { "epoch": 7.353181076672104, "grad_norm": 1.9209479093551636, "learning_rate": 3.9859982204461785e-05, "loss": 0.1253, "num_input_tokens_seen": 97288672, "step": 45075 }, { "epoch": 7.353996737357259, "grad_norm": 0.06812570244073868, "learning_rate": 3.9857120019777884e-05, "loss": 0.1273, "num_input_tokens_seen": 97300736, "step": 45080 }, { "epoch": 7.354812398042414, "grad_norm": 0.3959147036075592, "learning_rate": 3.9854257533995595e-05, "loss": 0.1325, "num_input_tokens_seen": 97311808, "step": 45085 }, { "epoch": 7.35562805872757, "grad_norm": 0.07580491155385971, "learning_rate": 3.9851394747172946e-05, "loss": 0.1441, "num_input_tokens_seen": 97321120, "step": 45090 }, { "epoch": 7.356443719412725, "grad_norm": 1.9516332149505615, "learning_rate": 3.9848531659367934e-05, "loss": 0.1819, "num_input_tokens_seen": 97331712, "step": 45095 }, { "epoch": 7.357259380097879, "grad_norm": 0.08640298992395401, "learning_rate": 3.984566827063859e-05, "loss": 0.0216, "num_input_tokens_seen": 97342720, "step": 45100 }, { "epoch": 7.358075040783034, "grad_norm": 0.4612880349159241, "learning_rate": 3.984280458104295e-05, "loss": 0.0585, "num_input_tokens_seen": 97353920, "step": 45105 }, { "epoch": 7.358890701468189, "grad_norm": 0.6010879874229431, "learning_rate": 3.983994059063904e-05, "loss": 0.0583, "num_input_tokens_seen": 97364320, "step": 45110 }, { "epoch": 7.359706362153344, "grad_norm": 0.05236673355102539, "learning_rate": 3.983707629948491e-05, "loss": 0.0693, "num_input_tokens_seen": 97374656, "step": 45115 }, { "epoch": 7.3605220228384995, "grad_norm": 0.578855037689209, "learning_rate": 3.9834211707638614e-05, "loss": 0.088, "num_input_tokens_seen": 97386272, "step": 45120 }, { "epoch": 7.361337683523654, "grad_norm": 1.259432077407837, "learning_rate": 3.983134681515819e-05, "loss": 0.0938, "num_input_tokens_seen": 97396992, "step": 45125 }, { "epoch": 7.362153344208809, "grad_norm": 0.15231312811374664, "learning_rate": 3.982848162210171e-05, "loss": 0.1058, "num_input_tokens_seen": 97407296, "step": 45130 }, { "epoch": 7.362969004893964, "grad_norm": 0.15293718874454498, "learning_rate": 3.982561612852724e-05, "loss": 0.249, "num_input_tokens_seen": 97417824, "step": 45135 }, { "epoch": 7.363784665579119, "grad_norm": 0.032387565821409225, "learning_rate": 3.982275033449285e-05, "loss": 0.0101, "num_input_tokens_seen": 97428096, "step": 45140 }, { "epoch": 7.364600326264274, "grad_norm": 1.1773303747177124, "learning_rate": 3.981988424005662e-05, "loss": 0.1236, "num_input_tokens_seen": 97439680, "step": 45145 }, { "epoch": 7.365415986949429, "grad_norm": 0.6259506940841675, "learning_rate": 3.9817017845276636e-05, "loss": 0.2114, "num_input_tokens_seen": 97450880, "step": 45150 }, { "epoch": 7.366231647634584, "grad_norm": 2.7054145336151123, "learning_rate": 3.981415115021099e-05, "loss": 0.4274, "num_input_tokens_seen": 97461568, "step": 45155 }, { "epoch": 7.367047308319739, "grad_norm": 0.03877636045217514, "learning_rate": 3.981128415491778e-05, "loss": 0.2401, "num_input_tokens_seen": 97472416, "step": 45160 }, { "epoch": 7.367862969004894, "grad_norm": 0.06565164774656296, "learning_rate": 3.98084168594551e-05, "loss": 0.1173, "num_input_tokens_seen": 97483744, "step": 45165 }, { "epoch": 7.368678629690049, "grad_norm": 1.2063055038452148, "learning_rate": 3.980554926388107e-05, "loss": 0.0833, "num_input_tokens_seen": 97494752, "step": 45170 }, { "epoch": 7.369494290375204, "grad_norm": 0.04876679554581642, "learning_rate": 3.980268136825381e-05, "loss": 0.0269, "num_input_tokens_seen": 97505056, "step": 45175 }, { "epoch": 7.370309951060359, "grad_norm": 2.6660070419311523, "learning_rate": 3.979981317263143e-05, "loss": 0.2007, "num_input_tokens_seen": 97516352, "step": 45180 }, { "epoch": 7.371125611745514, "grad_norm": 0.03340243920683861, "learning_rate": 3.979694467707206e-05, "loss": 0.1831, "num_input_tokens_seen": 97526784, "step": 45185 }, { "epoch": 7.371941272430669, "grad_norm": 1.5093183517456055, "learning_rate": 3.979407588163383e-05, "loss": 0.1, "num_input_tokens_seen": 97538368, "step": 45190 }, { "epoch": 7.372756933115824, "grad_norm": 3.066398859024048, "learning_rate": 3.979120678637488e-05, "loss": 0.0798, "num_input_tokens_seen": 97549440, "step": 45195 }, { "epoch": 7.373572593800978, "grad_norm": 0.17208237946033478, "learning_rate": 3.9788337391353367e-05, "loss": 0.075, "num_input_tokens_seen": 97559264, "step": 45200 }, { "epoch": 7.374388254486134, "grad_norm": 1.4123018980026245, "learning_rate": 3.9785467696627445e-05, "loss": 0.0661, "num_input_tokens_seen": 97569312, "step": 45205 }, { "epoch": 7.375203915171289, "grad_norm": 0.2069598138332367, "learning_rate": 3.9782597702255256e-05, "loss": 0.0185, "num_input_tokens_seen": 97579712, "step": 45210 }, { "epoch": 7.376019575856444, "grad_norm": 0.07394790649414062, "learning_rate": 3.977972740829496e-05, "loss": 0.0348, "num_input_tokens_seen": 97590912, "step": 45215 }, { "epoch": 7.376835236541599, "grad_norm": 0.3878190815448761, "learning_rate": 3.977685681480476e-05, "loss": 0.0731, "num_input_tokens_seen": 97602176, "step": 45220 }, { "epoch": 7.377650897226753, "grad_norm": 1.4826823472976685, "learning_rate": 3.97739859218428e-05, "loss": 0.2482, "num_input_tokens_seen": 97612128, "step": 45225 }, { "epoch": 7.378466557911908, "grad_norm": 1.069420576095581, "learning_rate": 3.9771114729467276e-05, "loss": 0.1129, "num_input_tokens_seen": 97622528, "step": 45230 }, { "epoch": 7.379282218597064, "grad_norm": 0.22493389248847961, "learning_rate": 3.976824323773637e-05, "loss": 0.1313, "num_input_tokens_seen": 97634304, "step": 45235 }, { "epoch": 7.380097879282219, "grad_norm": 0.44273513555526733, "learning_rate": 3.9765371446708274e-05, "loss": 0.1264, "num_input_tokens_seen": 97644864, "step": 45240 }, { "epoch": 7.3809135399673735, "grad_norm": 0.3474082946777344, "learning_rate": 3.976249935644121e-05, "loss": 0.0619, "num_input_tokens_seen": 97656768, "step": 45245 }, { "epoch": 7.381729200652528, "grad_norm": 3.174506902694702, "learning_rate": 3.975962696699336e-05, "loss": 0.2311, "num_input_tokens_seen": 97668416, "step": 45250 }, { "epoch": 7.382544861337683, "grad_norm": 0.5570809245109558, "learning_rate": 3.975675427842294e-05, "loss": 0.0344, "num_input_tokens_seen": 97679392, "step": 45255 }, { "epoch": 7.383360522022839, "grad_norm": 0.6204909086227417, "learning_rate": 3.9753881290788187e-05, "loss": 0.2353, "num_input_tokens_seen": 97689312, "step": 45260 }, { "epoch": 7.384176182707994, "grad_norm": 0.45900458097457886, "learning_rate": 3.9751008004147305e-05, "loss": 0.0392, "num_input_tokens_seen": 97700448, "step": 45265 }, { "epoch": 7.3849918433931485, "grad_norm": 0.9639485478401184, "learning_rate": 3.974813441855854e-05, "loss": 0.0393, "num_input_tokens_seen": 97710528, "step": 45270 }, { "epoch": 7.385807504078303, "grad_norm": 0.1354067027568817, "learning_rate": 3.974526053408011e-05, "loss": 0.0801, "num_input_tokens_seen": 97718720, "step": 45275 }, { "epoch": 7.386623164763458, "grad_norm": 0.04489562287926674, "learning_rate": 3.974238635077028e-05, "loss": 0.1177, "num_input_tokens_seen": 97727744, "step": 45280 }, { "epoch": 7.387438825448613, "grad_norm": 0.05803345516324043, "learning_rate": 3.973951186868728e-05, "loss": 0.0163, "num_input_tokens_seen": 97738112, "step": 45285 }, { "epoch": 7.388254486133769, "grad_norm": 0.07614785432815552, "learning_rate": 3.9736637087889375e-05, "loss": 0.0462, "num_input_tokens_seen": 97748320, "step": 45290 }, { "epoch": 7.3890701468189235, "grad_norm": 0.35868924856185913, "learning_rate": 3.973376200843483e-05, "loss": 0.0844, "num_input_tokens_seen": 97760480, "step": 45295 }, { "epoch": 7.389885807504078, "grad_norm": 1.0380159616470337, "learning_rate": 3.97308866303819e-05, "loss": 0.1682, "num_input_tokens_seen": 97770912, "step": 45300 }, { "epoch": 7.390701468189233, "grad_norm": 1.9907766580581665, "learning_rate": 3.972801095378888e-05, "loss": 0.3658, "num_input_tokens_seen": 97781984, "step": 45305 }, { "epoch": 7.391517128874388, "grad_norm": 1.9482812881469727, "learning_rate": 3.972513497871402e-05, "loss": 0.1468, "num_input_tokens_seen": 97793152, "step": 45310 }, { "epoch": 7.392332789559543, "grad_norm": 0.07165088504552841, "learning_rate": 3.972225870521562e-05, "loss": 0.129, "num_input_tokens_seen": 97804896, "step": 45315 }, { "epoch": 7.3931484502446985, "grad_norm": 0.8819929957389832, "learning_rate": 3.971938213335198e-05, "loss": 0.1302, "num_input_tokens_seen": 97815008, "step": 45320 }, { "epoch": 7.393964110929853, "grad_norm": 0.9911462664604187, "learning_rate": 3.971650526318138e-05, "loss": 0.1092, "num_input_tokens_seen": 97826400, "step": 45325 }, { "epoch": 7.394779771615008, "grad_norm": 0.6362459063529968, "learning_rate": 3.971362809476213e-05, "loss": 0.153, "num_input_tokens_seen": 97837024, "step": 45330 }, { "epoch": 7.395595432300163, "grad_norm": 0.07295188307762146, "learning_rate": 3.971075062815255e-05, "loss": 0.1388, "num_input_tokens_seen": 97847616, "step": 45335 }, { "epoch": 7.396411092985318, "grad_norm": 0.03175389766693115, "learning_rate": 3.9707872863410936e-05, "loss": 0.0693, "num_input_tokens_seen": 97858464, "step": 45340 }, { "epoch": 7.397226753670473, "grad_norm": 1.8053933382034302, "learning_rate": 3.970499480059563e-05, "loss": 0.0895, "num_input_tokens_seen": 97868224, "step": 45345 }, { "epoch": 7.398042414355628, "grad_norm": 0.3456277847290039, "learning_rate": 3.970211643976495e-05, "loss": 0.1302, "num_input_tokens_seen": 97879520, "step": 45350 }, { "epoch": 7.398858075040783, "grad_norm": 1.1064069271087646, "learning_rate": 3.9699237780977214e-05, "loss": 0.0752, "num_input_tokens_seen": 97889568, "step": 45355 }, { "epoch": 7.399673735725938, "grad_norm": 1.1840842962265015, "learning_rate": 3.969635882429079e-05, "loss": 0.07, "num_input_tokens_seen": 97899840, "step": 45360 }, { "epoch": 7.400489396411093, "grad_norm": 1.613537311553955, "learning_rate": 3.969347956976401e-05, "loss": 0.1582, "num_input_tokens_seen": 97910112, "step": 45365 }, { "epoch": 7.401305057096248, "grad_norm": 1.3418773412704468, "learning_rate": 3.9690600017455224e-05, "loss": 0.0545, "num_input_tokens_seen": 97921664, "step": 45370 }, { "epoch": 7.402120717781403, "grad_norm": 0.2896011769771576, "learning_rate": 3.9687720167422795e-05, "loss": 0.1003, "num_input_tokens_seen": 97934112, "step": 45375 }, { "epoch": 7.402936378466558, "grad_norm": 0.6540544033050537, "learning_rate": 3.968484001972508e-05, "loss": 0.0847, "num_input_tokens_seen": 97945760, "step": 45380 }, { "epoch": 7.403752039151713, "grad_norm": 0.2732723653316498, "learning_rate": 3.968195957442046e-05, "loss": 0.3187, "num_input_tokens_seen": 97956320, "step": 45385 }, { "epoch": 7.404567699836868, "grad_norm": 2.9644858837127686, "learning_rate": 3.9679078831567295e-05, "loss": 0.1871, "num_input_tokens_seen": 97966368, "step": 45390 }, { "epoch": 7.4053833605220225, "grad_norm": 0.2992634177207947, "learning_rate": 3.9676197791223976e-05, "loss": 0.0885, "num_input_tokens_seen": 97976736, "step": 45395 }, { "epoch": 7.406199021207178, "grad_norm": 0.1720576286315918, "learning_rate": 3.96733164534489e-05, "loss": 0.0823, "num_input_tokens_seen": 97987616, "step": 45400 }, { "epoch": 7.407014681892333, "grad_norm": 0.08918851613998413, "learning_rate": 3.967043481830045e-05, "loss": 0.0971, "num_input_tokens_seen": 97999424, "step": 45405 }, { "epoch": 7.407830342577488, "grad_norm": 0.5020250678062439, "learning_rate": 3.9667552885837026e-05, "loss": 0.1908, "num_input_tokens_seen": 98010688, "step": 45410 }, { "epoch": 7.408646003262643, "grad_norm": 0.63718181848526, "learning_rate": 3.966467065611703e-05, "loss": 0.0909, "num_input_tokens_seen": 98022304, "step": 45415 }, { "epoch": 7.4094616639477975, "grad_norm": 0.4876381456851959, "learning_rate": 3.9661788129198885e-05, "loss": 0.1061, "num_input_tokens_seen": 98032448, "step": 45420 }, { "epoch": 7.410277324632952, "grad_norm": 2.1378962993621826, "learning_rate": 3.9658905305141005e-05, "loss": 0.1429, "num_input_tokens_seen": 98043936, "step": 45425 }, { "epoch": 7.411092985318108, "grad_norm": 0.8998500108718872, "learning_rate": 3.965602218400181e-05, "loss": 0.2109, "num_input_tokens_seen": 98054880, "step": 45430 }, { "epoch": 7.411908646003263, "grad_norm": 0.3877856135368347, "learning_rate": 3.965313876583973e-05, "loss": 0.0468, "num_input_tokens_seen": 98065440, "step": 45435 }, { "epoch": 7.412724306688418, "grad_norm": 0.03625469282269478, "learning_rate": 3.96502550507132e-05, "loss": 0.0344, "num_input_tokens_seen": 98076256, "step": 45440 }, { "epoch": 7.4135399673735725, "grad_norm": 2.1541004180908203, "learning_rate": 3.964737103868068e-05, "loss": 0.1091, "num_input_tokens_seen": 98087648, "step": 45445 }, { "epoch": 7.414355628058727, "grad_norm": 0.12687824666500092, "learning_rate": 3.9644486729800604e-05, "loss": 0.0424, "num_input_tokens_seen": 98097952, "step": 45450 }, { "epoch": 7.415171288743883, "grad_norm": 0.2293287068605423, "learning_rate": 3.9641602124131413e-05, "loss": 0.0746, "num_input_tokens_seen": 98109440, "step": 45455 }, { "epoch": 7.415986949429038, "grad_norm": 1.0040326118469238, "learning_rate": 3.9638717221731586e-05, "loss": 0.0521, "num_input_tokens_seen": 98120160, "step": 45460 }, { "epoch": 7.416802610114193, "grad_norm": 1.177797794342041, "learning_rate": 3.963583202265958e-05, "loss": 0.0972, "num_input_tokens_seen": 98131488, "step": 45465 }, { "epoch": 7.417618270799347, "grad_norm": 0.023419734090566635, "learning_rate": 3.963294652697388e-05, "loss": 0.0831, "num_input_tokens_seen": 98142528, "step": 45470 }, { "epoch": 7.418433931484502, "grad_norm": 0.18246795237064362, "learning_rate": 3.9630060734732956e-05, "loss": 0.1292, "num_input_tokens_seen": 98153472, "step": 45475 }, { "epoch": 7.419249592169657, "grad_norm": 0.626147985458374, "learning_rate": 3.962717464599529e-05, "loss": 0.2225, "num_input_tokens_seen": 98162784, "step": 45480 }, { "epoch": 7.420065252854813, "grad_norm": 0.1380259394645691, "learning_rate": 3.962428826081937e-05, "loss": 0.1507, "num_input_tokens_seen": 98173376, "step": 45485 }, { "epoch": 7.420880913539968, "grad_norm": 0.9171939492225647, "learning_rate": 3.96214015792637e-05, "loss": 0.1296, "num_input_tokens_seen": 98183872, "step": 45490 }, { "epoch": 7.421696574225122, "grad_norm": 0.5757381916046143, "learning_rate": 3.961851460138678e-05, "loss": 0.117, "num_input_tokens_seen": 98195200, "step": 45495 }, { "epoch": 7.422512234910277, "grad_norm": 0.6793146729469299, "learning_rate": 3.961562732724711e-05, "loss": 0.1251, "num_input_tokens_seen": 98205344, "step": 45500 }, { "epoch": 7.423327895595432, "grad_norm": 0.19945119321346283, "learning_rate": 3.9612739756903215e-05, "loss": 0.1077, "num_input_tokens_seen": 98216896, "step": 45505 }, { "epoch": 7.424143556280587, "grad_norm": 0.5900705456733704, "learning_rate": 3.9609851890413605e-05, "loss": 0.1373, "num_input_tokens_seen": 98228896, "step": 45510 }, { "epoch": 7.424959216965743, "grad_norm": 1.4477360248565674, "learning_rate": 3.960696372783682e-05, "loss": 0.1531, "num_input_tokens_seen": 98239488, "step": 45515 }, { "epoch": 7.425774877650897, "grad_norm": 0.19115978479385376, "learning_rate": 3.960407526923139e-05, "loss": 0.0767, "num_input_tokens_seen": 98251232, "step": 45520 }, { "epoch": 7.426590538336052, "grad_norm": 0.23874254524707794, "learning_rate": 3.9601186514655834e-05, "loss": 0.0755, "num_input_tokens_seen": 98262432, "step": 45525 }, { "epoch": 7.427406199021207, "grad_norm": 0.026227694004774094, "learning_rate": 3.9598297464168724e-05, "loss": 0.1807, "num_input_tokens_seen": 98272320, "step": 45530 }, { "epoch": 7.428221859706362, "grad_norm": 0.2932020127773285, "learning_rate": 3.9595408117828584e-05, "loss": 0.1642, "num_input_tokens_seen": 98283904, "step": 45535 }, { "epoch": 7.4290375203915175, "grad_norm": 2.164597511291504, "learning_rate": 3.9592518475693995e-05, "loss": 0.1834, "num_input_tokens_seen": 98295840, "step": 45540 }, { "epoch": 7.429853181076672, "grad_norm": 0.06299233436584473, "learning_rate": 3.9589628537823496e-05, "loss": 0.0591, "num_input_tokens_seen": 98307072, "step": 45545 }, { "epoch": 7.430668841761827, "grad_norm": 1.4456589221954346, "learning_rate": 3.958673830427567e-05, "loss": 0.0661, "num_input_tokens_seen": 98318144, "step": 45550 }, { "epoch": 7.431484502446982, "grad_norm": 1.1608368158340454, "learning_rate": 3.95838477751091e-05, "loss": 0.1739, "num_input_tokens_seen": 98329728, "step": 45555 }, { "epoch": 7.432300163132137, "grad_norm": 0.32945477962493896, "learning_rate": 3.958095695038234e-05, "loss": 0.0168, "num_input_tokens_seen": 98340256, "step": 45560 }, { "epoch": 7.433115823817292, "grad_norm": 0.024681665003299713, "learning_rate": 3.957806583015399e-05, "loss": 0.0706, "num_input_tokens_seen": 98350976, "step": 45565 }, { "epoch": 7.433931484502447, "grad_norm": 2.0471115112304688, "learning_rate": 3.957517441448264e-05, "loss": 0.1773, "num_input_tokens_seen": 98361536, "step": 45570 }, { "epoch": 7.434747145187602, "grad_norm": 0.3658164143562317, "learning_rate": 3.95722827034269e-05, "loss": 0.0276, "num_input_tokens_seen": 98372064, "step": 45575 }, { "epoch": 7.435562805872757, "grad_norm": 2.103689670562744, "learning_rate": 3.956939069704536e-05, "loss": 0.2394, "num_input_tokens_seen": 98382688, "step": 45580 }, { "epoch": 7.436378466557912, "grad_norm": 0.2993679344654083, "learning_rate": 3.9566498395396636e-05, "loss": 0.0807, "num_input_tokens_seen": 98393312, "step": 45585 }, { "epoch": 7.437194127243067, "grad_norm": 0.25081324577331543, "learning_rate": 3.956360579853935e-05, "loss": 0.1825, "num_input_tokens_seen": 98403136, "step": 45590 }, { "epoch": 7.438009787928221, "grad_norm": 0.040813788771629333, "learning_rate": 3.956071290653211e-05, "loss": 0.0959, "num_input_tokens_seen": 98414752, "step": 45595 }, { "epoch": 7.438825448613377, "grad_norm": 0.013881834223866463, "learning_rate": 3.955781971943355e-05, "loss": 0.0671, "num_input_tokens_seen": 98426496, "step": 45600 }, { "epoch": 7.439641109298532, "grad_norm": 0.24192126095294952, "learning_rate": 3.9554926237302305e-05, "loss": 0.1146, "num_input_tokens_seen": 98437696, "step": 45605 }, { "epoch": 7.440456769983687, "grad_norm": 1.042880654335022, "learning_rate": 3.9552032460197016e-05, "loss": 0.2051, "num_input_tokens_seen": 98448640, "step": 45610 }, { "epoch": 7.441272430668842, "grad_norm": 0.1953016221523285, "learning_rate": 3.954913838817633e-05, "loss": 0.0319, "num_input_tokens_seen": 98459360, "step": 45615 }, { "epoch": 7.442088091353996, "grad_norm": 0.275450736284256, "learning_rate": 3.95462440212989e-05, "loss": 0.1305, "num_input_tokens_seen": 98470560, "step": 45620 }, { "epoch": 7.442903752039152, "grad_norm": 0.1275310516357422, "learning_rate": 3.954334935962338e-05, "loss": 0.0685, "num_input_tokens_seen": 98482432, "step": 45625 }, { "epoch": 7.443719412724307, "grad_norm": 1.3320436477661133, "learning_rate": 3.954045440320844e-05, "loss": 0.0909, "num_input_tokens_seen": 98493056, "step": 45630 }, { "epoch": 7.444535073409462, "grad_norm": 0.08457234501838684, "learning_rate": 3.953755915211274e-05, "loss": 0.2112, "num_input_tokens_seen": 98503904, "step": 45635 }, { "epoch": 7.445350734094617, "grad_norm": 0.0697297677397728, "learning_rate": 3.9534663606394964e-05, "loss": 0.0755, "num_input_tokens_seen": 98514848, "step": 45640 }, { "epoch": 7.446166394779771, "grad_norm": 0.37955474853515625, "learning_rate": 3.9531767766113785e-05, "loss": 0.0282, "num_input_tokens_seen": 98525696, "step": 45645 }, { "epoch": 7.446982055464926, "grad_norm": 0.03773285821080208, "learning_rate": 3.9528871631327904e-05, "loss": 0.0954, "num_input_tokens_seen": 98536672, "step": 45650 }, { "epoch": 7.447797716150082, "grad_norm": 0.2980487048625946, "learning_rate": 3.952597520209601e-05, "loss": 0.0591, "num_input_tokens_seen": 98546912, "step": 45655 }, { "epoch": 7.448613376835237, "grad_norm": 2.5341639518737793, "learning_rate": 3.95230784784768e-05, "loss": 0.4776, "num_input_tokens_seen": 98558336, "step": 45660 }, { "epoch": 7.4494290375203915, "grad_norm": 0.12469830363988876, "learning_rate": 3.952018146052897e-05, "loss": 0.0404, "num_input_tokens_seen": 98568096, "step": 45665 }, { "epoch": 7.450244698205546, "grad_norm": 0.08387323468923569, "learning_rate": 3.951728414831125e-05, "loss": 0.1089, "num_input_tokens_seen": 98579072, "step": 45670 }, { "epoch": 7.451060358890701, "grad_norm": 0.957221508026123, "learning_rate": 3.951438654188235e-05, "loss": 0.2456, "num_input_tokens_seen": 98589600, "step": 45675 }, { "epoch": 7.451876019575856, "grad_norm": 1.5492496490478516, "learning_rate": 3.9511488641301e-05, "loss": 0.1898, "num_input_tokens_seen": 98600864, "step": 45680 }, { "epoch": 7.452691680261012, "grad_norm": 0.06993179023265839, "learning_rate": 3.9508590446625913e-05, "loss": 0.0854, "num_input_tokens_seen": 98611968, "step": 45685 }, { "epoch": 7.4535073409461665, "grad_norm": 1.8168549537658691, "learning_rate": 3.950569195791585e-05, "loss": 0.1742, "num_input_tokens_seen": 98622912, "step": 45690 }, { "epoch": 7.454323001631321, "grad_norm": 1.3613325357437134, "learning_rate": 3.9502793175229526e-05, "loss": 0.3732, "num_input_tokens_seen": 98634592, "step": 45695 }, { "epoch": 7.455138662316476, "grad_norm": 0.05022747069597244, "learning_rate": 3.94998940986257e-05, "loss": 0.1351, "num_input_tokens_seen": 98644096, "step": 45700 }, { "epoch": 7.455954323001631, "grad_norm": 0.38685569167137146, "learning_rate": 3.949699472816313e-05, "loss": 0.1526, "num_input_tokens_seen": 98654400, "step": 45705 }, { "epoch": 7.456769983686787, "grad_norm": 0.06959343701601028, "learning_rate": 3.949409506390057e-05, "loss": 0.0214, "num_input_tokens_seen": 98665568, "step": 45710 }, { "epoch": 7.4575856443719415, "grad_norm": 0.22237366437911987, "learning_rate": 3.949119510589678e-05, "loss": 0.099, "num_input_tokens_seen": 98674624, "step": 45715 }, { "epoch": 7.458401305057096, "grad_norm": 1.9147441387176514, "learning_rate": 3.948829485421055e-05, "loss": 0.2162, "num_input_tokens_seen": 98685536, "step": 45720 }, { "epoch": 7.459216965742251, "grad_norm": 0.8999019265174866, "learning_rate": 3.9485394308900634e-05, "loss": 0.1607, "num_input_tokens_seen": 98696352, "step": 45725 }, { "epoch": 7.460032626427406, "grad_norm": 0.05370182916522026, "learning_rate": 3.948249347002584e-05, "loss": 0.068, "num_input_tokens_seen": 98707808, "step": 45730 }, { "epoch": 7.460848287112561, "grad_norm": 0.7846839427947998, "learning_rate": 3.9479592337644936e-05, "loss": 0.1535, "num_input_tokens_seen": 98718112, "step": 45735 }, { "epoch": 7.4616639477977165, "grad_norm": 1.3282339572906494, "learning_rate": 3.947669091181672e-05, "loss": 0.1254, "num_input_tokens_seen": 98729312, "step": 45740 }, { "epoch": 7.462479608482871, "grad_norm": 0.1672871708869934, "learning_rate": 3.9473789192599996e-05, "loss": 0.0394, "num_input_tokens_seen": 98739168, "step": 45745 }, { "epoch": 7.463295269168026, "grad_norm": 2.2029216289520264, "learning_rate": 3.947088718005358e-05, "loss": 0.4899, "num_input_tokens_seen": 98749376, "step": 45750 }, { "epoch": 7.464110929853181, "grad_norm": 0.09335856139659882, "learning_rate": 3.9467984874236276e-05, "loss": 0.0484, "num_input_tokens_seen": 98760928, "step": 45755 }, { "epoch": 7.464926590538336, "grad_norm": 0.20601731538772583, "learning_rate": 3.946508227520691e-05, "loss": 0.1556, "num_input_tokens_seen": 98770560, "step": 45760 }, { "epoch": 7.465742251223491, "grad_norm": 0.13950751721858978, "learning_rate": 3.9462179383024286e-05, "loss": 0.1081, "num_input_tokens_seen": 98780512, "step": 45765 }, { "epoch": 7.466557911908646, "grad_norm": 0.4599851071834564, "learning_rate": 3.9459276197747266e-05, "loss": 0.0708, "num_input_tokens_seen": 98792192, "step": 45770 }, { "epoch": 7.467373572593801, "grad_norm": 1.8420939445495605, "learning_rate": 3.9456372719434665e-05, "loss": 0.1571, "num_input_tokens_seen": 98802752, "step": 45775 }, { "epoch": 7.468189233278956, "grad_norm": 0.37044838070869446, "learning_rate": 3.945346894814532e-05, "loss": 0.041, "num_input_tokens_seen": 98813600, "step": 45780 }, { "epoch": 7.469004893964111, "grad_norm": 0.035492971539497375, "learning_rate": 3.9450564883938105e-05, "loss": 0.0264, "num_input_tokens_seen": 98825280, "step": 45785 }, { "epoch": 7.4698205546492655, "grad_norm": 1.4379512071609497, "learning_rate": 3.944766052687186e-05, "loss": 0.1599, "num_input_tokens_seen": 98836480, "step": 45790 }, { "epoch": 7.470636215334421, "grad_norm": 1.19994056224823, "learning_rate": 3.944475587700545e-05, "loss": 0.1168, "num_input_tokens_seen": 98847424, "step": 45795 }, { "epoch": 7.471451876019576, "grad_norm": 1.4529664516448975, "learning_rate": 3.944185093439773e-05, "loss": 0.1736, "num_input_tokens_seen": 98858016, "step": 45800 }, { "epoch": 7.472267536704731, "grad_norm": 0.1602623611688614, "learning_rate": 3.9438945699107584e-05, "loss": 0.0211, "num_input_tokens_seen": 98868864, "step": 45805 }, { "epoch": 7.473083197389886, "grad_norm": 2.0693161487579346, "learning_rate": 3.9436040171193886e-05, "loss": 0.262, "num_input_tokens_seen": 98879936, "step": 45810 }, { "epoch": 7.4738988580750405, "grad_norm": 0.1825985163450241, "learning_rate": 3.9433134350715516e-05, "loss": 0.0436, "num_input_tokens_seen": 98891328, "step": 45815 }, { "epoch": 7.474714518760196, "grad_norm": 0.66017746925354, "learning_rate": 3.943022823773137e-05, "loss": 0.0944, "num_input_tokens_seen": 98902592, "step": 45820 }, { "epoch": 7.475530179445351, "grad_norm": 0.1813019961118698, "learning_rate": 3.9427321832300346e-05, "loss": 0.0915, "num_input_tokens_seen": 98913440, "step": 45825 }, { "epoch": 7.476345840130506, "grad_norm": 0.49413347244262695, "learning_rate": 3.942441513448134e-05, "loss": 0.074, "num_input_tokens_seen": 98923808, "step": 45830 }, { "epoch": 7.477161500815661, "grad_norm": 1.4276341199874878, "learning_rate": 3.9421508144333266e-05, "loss": 0.0572, "num_input_tokens_seen": 98933856, "step": 45835 }, { "epoch": 7.4779771615008155, "grad_norm": 2.005139112472534, "learning_rate": 3.9418600861915034e-05, "loss": 0.0678, "num_input_tokens_seen": 98944960, "step": 45840 }, { "epoch": 7.47879282218597, "grad_norm": 0.8827309608459473, "learning_rate": 3.941569328728556e-05, "loss": 0.0771, "num_input_tokens_seen": 98956512, "step": 45845 }, { "epoch": 7.479608482871126, "grad_norm": 0.9265012145042419, "learning_rate": 3.9412785420503776e-05, "loss": 0.1839, "num_input_tokens_seen": 98967520, "step": 45850 }, { "epoch": 7.480424143556281, "grad_norm": 0.8216276168823242, "learning_rate": 3.940987726162862e-05, "loss": 0.2073, "num_input_tokens_seen": 98977920, "step": 45855 }, { "epoch": 7.481239804241436, "grad_norm": 0.05378274619579315, "learning_rate": 3.9406968810719e-05, "loss": 0.0456, "num_input_tokens_seen": 98987392, "step": 45860 }, { "epoch": 7.4820554649265905, "grad_norm": 0.36597055196762085, "learning_rate": 3.9404060067833903e-05, "loss": 0.0298, "num_input_tokens_seen": 98998304, "step": 45865 }, { "epoch": 7.482871125611745, "grad_norm": 0.3221963047981262, "learning_rate": 3.940115103303225e-05, "loss": 0.1455, "num_input_tokens_seen": 99008480, "step": 45870 }, { "epoch": 7.4836867862969, "grad_norm": 0.6390048265457153, "learning_rate": 3.939824170637299e-05, "loss": 0.1889, "num_input_tokens_seen": 99019584, "step": 45875 }, { "epoch": 7.484502446982056, "grad_norm": 1.2401912212371826, "learning_rate": 3.939533208791511e-05, "loss": 0.1778, "num_input_tokens_seen": 99029920, "step": 45880 }, { "epoch": 7.485318107667211, "grad_norm": 1.6718692779541016, "learning_rate": 3.9392422177717554e-05, "loss": 0.325, "num_input_tokens_seen": 99040288, "step": 45885 }, { "epoch": 7.486133768352365, "grad_norm": 0.2652661204338074, "learning_rate": 3.938951197583931e-05, "loss": 0.0766, "num_input_tokens_seen": 99053024, "step": 45890 }, { "epoch": 7.48694942903752, "grad_norm": 0.020650282502174377, "learning_rate": 3.938660148233935e-05, "loss": 0.0785, "num_input_tokens_seen": 99062720, "step": 45895 }, { "epoch": 7.487765089722675, "grad_norm": 0.4444652795791626, "learning_rate": 3.938369069727666e-05, "loss": 0.0402, "num_input_tokens_seen": 99073728, "step": 45900 }, { "epoch": 7.488580750407831, "grad_norm": 0.3780200183391571, "learning_rate": 3.9380779620710226e-05, "loss": 0.3662, "num_input_tokens_seen": 99084704, "step": 45905 }, { "epoch": 7.489396411092986, "grad_norm": 2.0110769271850586, "learning_rate": 3.937786825269906e-05, "loss": 0.4331, "num_input_tokens_seen": 99095744, "step": 45910 }, { "epoch": 7.49021207177814, "grad_norm": 0.24032920598983765, "learning_rate": 3.937495659330215e-05, "loss": 0.2009, "num_input_tokens_seen": 99107168, "step": 45915 }, { "epoch": 7.491027732463295, "grad_norm": 0.07304226607084274, "learning_rate": 3.937204464257851e-05, "loss": 0.1246, "num_input_tokens_seen": 99118208, "step": 45920 }, { "epoch": 7.49184339314845, "grad_norm": 0.16934749484062195, "learning_rate": 3.936913240058715e-05, "loss": 0.0287, "num_input_tokens_seen": 99129152, "step": 45925 }, { "epoch": 7.492659053833605, "grad_norm": 0.2645258903503418, "learning_rate": 3.9366219867387096e-05, "loss": 0.0373, "num_input_tokens_seen": 99140192, "step": 45930 }, { "epoch": 7.493474714518761, "grad_norm": 1.8430571556091309, "learning_rate": 3.936330704303737e-05, "loss": 0.2112, "num_input_tokens_seen": 99150368, "step": 45935 }, { "epoch": 7.494290375203915, "grad_norm": 0.1199047863483429, "learning_rate": 3.9360393927597006e-05, "loss": 0.127, "num_input_tokens_seen": 99161728, "step": 45940 }, { "epoch": 7.49510603588907, "grad_norm": 1.0906745195388794, "learning_rate": 3.935748052112504e-05, "loss": 0.1453, "num_input_tokens_seen": 99173024, "step": 45945 }, { "epoch": 7.495921696574225, "grad_norm": 0.5180841684341431, "learning_rate": 3.935456682368052e-05, "loss": 0.1125, "num_input_tokens_seen": 99183008, "step": 45950 }, { "epoch": 7.49673735725938, "grad_norm": 0.09949576109647751, "learning_rate": 3.935165283532249e-05, "loss": 0.0429, "num_input_tokens_seen": 99194976, "step": 45955 }, { "epoch": 7.497553017944535, "grad_norm": 1.596845030784607, "learning_rate": 3.934873855611001e-05, "loss": 0.1225, "num_input_tokens_seen": 99205760, "step": 45960 }, { "epoch": 7.49836867862969, "grad_norm": 0.18278667330741882, "learning_rate": 3.9345823986102145e-05, "loss": 0.0681, "num_input_tokens_seen": 99215808, "step": 45965 }, { "epoch": 7.499184339314845, "grad_norm": 0.3067534267902374, "learning_rate": 3.934290912535795e-05, "loss": 0.2072, "num_input_tokens_seen": 99226848, "step": 45970 }, { "epoch": 7.5, "grad_norm": 0.23681843280792236, "learning_rate": 3.9339993973936515e-05, "loss": 0.1312, "num_input_tokens_seen": 99237152, "step": 45975 }, { "epoch": 7.500815660685155, "grad_norm": 1.378263235092163, "learning_rate": 3.933707853189691e-05, "loss": 0.1837, "num_input_tokens_seen": 99247936, "step": 45980 }, { "epoch": 7.50163132137031, "grad_norm": 0.7560259699821472, "learning_rate": 3.933416279929821e-05, "loss": 0.0796, "num_input_tokens_seen": 99259456, "step": 45985 }, { "epoch": 7.502446982055465, "grad_norm": 0.1118052676320076, "learning_rate": 3.9331246776199525e-05, "loss": 0.0312, "num_input_tokens_seen": 99269760, "step": 45990 }, { "epoch": 7.50326264274062, "grad_norm": 0.6604208946228027, "learning_rate": 3.932833046265994e-05, "loss": 0.0897, "num_input_tokens_seen": 99280256, "step": 45995 }, { "epoch": 7.504078303425775, "grad_norm": 0.2774318754673004, "learning_rate": 3.932541385873857e-05, "loss": 0.08, "num_input_tokens_seen": 99289600, "step": 46000 }, { "epoch": 7.50489396411093, "grad_norm": 0.15517885982990265, "learning_rate": 3.932249696449451e-05, "loss": 0.2062, "num_input_tokens_seen": 99300128, "step": 46005 }, { "epoch": 7.505709624796085, "grad_norm": 1.0134930610656738, "learning_rate": 3.931957977998687e-05, "loss": 0.2422, "num_input_tokens_seen": 99311680, "step": 46010 }, { "epoch": 7.506525285481239, "grad_norm": 1.1450533866882324, "learning_rate": 3.9316662305274786e-05, "loss": 0.2283, "num_input_tokens_seen": 99322976, "step": 46015 }, { "epoch": 7.507340946166395, "grad_norm": 0.3120420575141907, "learning_rate": 3.9313744540417384e-05, "loss": 0.1113, "num_input_tokens_seen": 99335584, "step": 46020 }, { "epoch": 7.50815660685155, "grad_norm": 0.4269145727157593, "learning_rate": 3.931082648547379e-05, "loss": 0.0536, "num_input_tokens_seen": 99346528, "step": 46025 }, { "epoch": 7.508972267536705, "grad_norm": 0.13807544112205505, "learning_rate": 3.930790814050314e-05, "loss": 0.0553, "num_input_tokens_seen": 99357280, "step": 46030 }, { "epoch": 7.50978792822186, "grad_norm": 0.2532283365726471, "learning_rate": 3.9304989505564585e-05, "loss": 0.1149, "num_input_tokens_seen": 99368192, "step": 46035 }, { "epoch": 7.510603588907014, "grad_norm": 0.7900917530059814, "learning_rate": 3.9302070580717266e-05, "loss": 0.0732, "num_input_tokens_seen": 99379104, "step": 46040 }, { "epoch": 7.511419249592169, "grad_norm": 1.2655311822891235, "learning_rate": 3.929915136602034e-05, "loss": 0.1796, "num_input_tokens_seen": 99391200, "step": 46045 }, { "epoch": 7.512234910277325, "grad_norm": 0.1300199329853058, "learning_rate": 3.9296231861532975e-05, "loss": 0.0797, "num_input_tokens_seen": 99402272, "step": 46050 }, { "epoch": 7.51305057096248, "grad_norm": 2.0374393463134766, "learning_rate": 3.929331206731434e-05, "loss": 0.1044, "num_input_tokens_seen": 99414016, "step": 46055 }, { "epoch": 7.513866231647635, "grad_norm": 0.26226696372032166, "learning_rate": 3.92903919834236e-05, "loss": 0.1407, "num_input_tokens_seen": 99423424, "step": 46060 }, { "epoch": 7.514681892332789, "grad_norm": 1.628927230834961, "learning_rate": 3.928747160991993e-05, "loss": 0.1328, "num_input_tokens_seen": 99434592, "step": 46065 }, { "epoch": 7.515497553017944, "grad_norm": 1.2914178371429443, "learning_rate": 3.928455094686253e-05, "loss": 0.0821, "num_input_tokens_seen": 99444096, "step": 46070 }, { "epoch": 7.5163132137031, "grad_norm": 0.17777979373931885, "learning_rate": 3.928162999431059e-05, "loss": 0.1888, "num_input_tokens_seen": 99455328, "step": 46075 }, { "epoch": 7.517128874388255, "grad_norm": 1.1632227897644043, "learning_rate": 3.927870875232329e-05, "loss": 0.2216, "num_input_tokens_seen": 99465152, "step": 46080 }, { "epoch": 7.5179445350734095, "grad_norm": 0.05567224696278572, "learning_rate": 3.9275787220959846e-05, "loss": 0.0278, "num_input_tokens_seen": 99475072, "step": 46085 }, { "epoch": 7.518760195758564, "grad_norm": 1.6957029104232788, "learning_rate": 3.9272865400279465e-05, "loss": 0.1456, "num_input_tokens_seen": 99485344, "step": 46090 }, { "epoch": 7.519575856443719, "grad_norm": 0.24595479667186737, "learning_rate": 3.926994329034137e-05, "loss": 0.1213, "num_input_tokens_seen": 99496864, "step": 46095 }, { "epoch": 7.520391517128875, "grad_norm": 0.26828959584236145, "learning_rate": 3.926702089120476e-05, "loss": 0.0604, "num_input_tokens_seen": 99507520, "step": 46100 }, { "epoch": 7.52120717781403, "grad_norm": 0.8832375407218933, "learning_rate": 3.926409820292887e-05, "loss": 0.1602, "num_input_tokens_seen": 99517664, "step": 46105 }, { "epoch": 7.5220228384991845, "grad_norm": 0.2854491174221039, "learning_rate": 3.9261175225572945e-05, "loss": 0.0692, "num_input_tokens_seen": 99527392, "step": 46110 }, { "epoch": 7.522838499184339, "grad_norm": 0.16268542408943176, "learning_rate": 3.92582519591962e-05, "loss": 0.1477, "num_input_tokens_seen": 99538784, "step": 46115 }, { "epoch": 7.523654159869494, "grad_norm": 1.8475532531738281, "learning_rate": 3.92553284038579e-05, "loss": 0.1675, "num_input_tokens_seen": 99549024, "step": 46120 }, { "epoch": 7.524469820554649, "grad_norm": 1.1432790756225586, "learning_rate": 3.925240455961728e-05, "loss": 0.1506, "num_input_tokens_seen": 99560480, "step": 46125 }, { "epoch": 7.525285481239804, "grad_norm": 1.2066763639450073, "learning_rate": 3.9249480426533615e-05, "loss": 0.1301, "num_input_tokens_seen": 99571968, "step": 46130 }, { "epoch": 7.5261011419249595, "grad_norm": 0.4910949468612671, "learning_rate": 3.924655600466614e-05, "loss": 0.1833, "num_input_tokens_seen": 99582112, "step": 46135 }, { "epoch": 7.526916802610114, "grad_norm": 0.3095390498638153, "learning_rate": 3.9243631294074135e-05, "loss": 0.1533, "num_input_tokens_seen": 99591360, "step": 46140 }, { "epoch": 7.527732463295269, "grad_norm": 1.3440659046173096, "learning_rate": 3.924070629481687e-05, "loss": 0.0997, "num_input_tokens_seen": 99601696, "step": 46145 }, { "epoch": 7.528548123980424, "grad_norm": 1.169547200202942, "learning_rate": 3.923778100695364e-05, "loss": 0.2098, "num_input_tokens_seen": 99612928, "step": 46150 }, { "epoch": 7.529363784665579, "grad_norm": 0.7126346230506897, "learning_rate": 3.9234855430543706e-05, "loss": 0.0939, "num_input_tokens_seen": 99622720, "step": 46155 }, { "epoch": 7.5301794453507345, "grad_norm": 1.0744470357894897, "learning_rate": 3.9231929565646365e-05, "loss": 0.3248, "num_input_tokens_seen": 99633888, "step": 46160 }, { "epoch": 7.530995106035889, "grad_norm": 1.4718196392059326, "learning_rate": 3.922900341232092e-05, "loss": 0.1136, "num_input_tokens_seen": 99644960, "step": 46165 }, { "epoch": 7.531810766721044, "grad_norm": 0.06161430478096008, "learning_rate": 3.922607697062668e-05, "loss": 0.1052, "num_input_tokens_seen": 99656096, "step": 46170 }, { "epoch": 7.532626427406199, "grad_norm": 2.067298650741577, "learning_rate": 3.922315024062293e-05, "loss": 0.1385, "num_input_tokens_seen": 99666752, "step": 46175 }, { "epoch": 7.533442088091354, "grad_norm": 0.8723980188369751, "learning_rate": 3.9220223222369e-05, "loss": 0.1013, "num_input_tokens_seen": 99678016, "step": 46180 }, { "epoch": 7.5342577487765094, "grad_norm": 0.2555999159812927, "learning_rate": 3.921729591592421e-05, "loss": 0.0546, "num_input_tokens_seen": 99687456, "step": 46185 }, { "epoch": 7.535073409461664, "grad_norm": 0.7583758234977722, "learning_rate": 3.921436832134788e-05, "loss": 0.0653, "num_input_tokens_seen": 99699584, "step": 46190 }, { "epoch": 7.535889070146819, "grad_norm": 0.22194291651248932, "learning_rate": 3.921144043869934e-05, "loss": 0.022, "num_input_tokens_seen": 99710624, "step": 46195 }, { "epoch": 7.536704730831974, "grad_norm": 2.0132951736450195, "learning_rate": 3.920851226803795e-05, "loss": 0.1017, "num_input_tokens_seen": 99720704, "step": 46200 }, { "epoch": 7.537520391517129, "grad_norm": 0.10506951808929443, "learning_rate": 3.920558380942301e-05, "loss": 0.1412, "num_input_tokens_seen": 99729984, "step": 46205 }, { "epoch": 7.5383360522022835, "grad_norm": 0.08109574019908905, "learning_rate": 3.92026550629139e-05, "loss": 0.0821, "num_input_tokens_seen": 99741632, "step": 46210 }, { "epoch": 7.539151712887438, "grad_norm": 0.09524103254079819, "learning_rate": 3.919972602856997e-05, "loss": 0.0849, "num_input_tokens_seen": 99750688, "step": 46215 }, { "epoch": 7.539967373572594, "grad_norm": 0.3709430694580078, "learning_rate": 3.919679670645058e-05, "loss": 0.1298, "num_input_tokens_seen": 99761440, "step": 46220 }, { "epoch": 7.540783034257749, "grad_norm": 0.893372118473053, "learning_rate": 3.919386709661509e-05, "loss": 0.1318, "num_input_tokens_seen": 99770912, "step": 46225 }, { "epoch": 7.541598694942904, "grad_norm": 1.4568995237350464, "learning_rate": 3.919093719912288e-05, "loss": 0.101, "num_input_tokens_seen": 99782368, "step": 46230 }, { "epoch": 7.5424143556280585, "grad_norm": 0.9094259142875671, "learning_rate": 3.9188007014033334e-05, "loss": 0.0808, "num_input_tokens_seen": 99794560, "step": 46235 }, { "epoch": 7.543230016313213, "grad_norm": 0.26138386130332947, "learning_rate": 3.918507654140582e-05, "loss": 0.1056, "num_input_tokens_seen": 99805184, "step": 46240 }, { "epoch": 7.544045676998369, "grad_norm": 0.37710827589035034, "learning_rate": 3.918214578129973e-05, "loss": 0.1173, "num_input_tokens_seen": 99816160, "step": 46245 }, { "epoch": 7.544861337683524, "grad_norm": 0.29310905933380127, "learning_rate": 3.917921473377447e-05, "loss": 0.0581, "num_input_tokens_seen": 99826784, "step": 46250 }, { "epoch": 7.545676998368679, "grad_norm": 0.07188422977924347, "learning_rate": 3.917628339888944e-05, "loss": 0.0215, "num_input_tokens_seen": 99837920, "step": 46255 }, { "epoch": 7.5464926590538335, "grad_norm": 1.3793871402740479, "learning_rate": 3.9173351776704025e-05, "loss": 0.1927, "num_input_tokens_seen": 99848672, "step": 46260 }, { "epoch": 7.547308319738988, "grad_norm": 0.1706310212612152, "learning_rate": 3.917041986727768e-05, "loss": 0.216, "num_input_tokens_seen": 99861216, "step": 46265 }, { "epoch": 7.548123980424144, "grad_norm": 0.46885696053504944, "learning_rate": 3.9167487670669776e-05, "loss": 0.1562, "num_input_tokens_seen": 99870784, "step": 46270 }, { "epoch": 7.548939641109299, "grad_norm": 0.45549899339675903, "learning_rate": 3.9164555186939776e-05, "loss": 0.1092, "num_input_tokens_seen": 99882272, "step": 46275 }, { "epoch": 7.549755301794454, "grad_norm": 0.8108024597167969, "learning_rate": 3.91616224161471e-05, "loss": 0.1395, "num_input_tokens_seen": 99892992, "step": 46280 }, { "epoch": 7.5505709624796085, "grad_norm": 0.1285947561264038, "learning_rate": 3.915868935835118e-05, "loss": 0.0409, "num_input_tokens_seen": 99903744, "step": 46285 }, { "epoch": 7.551386623164763, "grad_norm": 0.1642843633890152, "learning_rate": 3.915575601361144e-05, "loss": 0.1371, "num_input_tokens_seen": 99914688, "step": 46290 }, { "epoch": 7.552202283849918, "grad_norm": 0.43479207158088684, "learning_rate": 3.9152822381987366e-05, "loss": 0.0711, "num_input_tokens_seen": 99925632, "step": 46295 }, { "epoch": 7.553017944535073, "grad_norm": 1.1018613576889038, "learning_rate": 3.9149888463538384e-05, "loss": 0.1619, "num_input_tokens_seen": 99935552, "step": 46300 }, { "epoch": 7.553833605220229, "grad_norm": 0.4674350619316101, "learning_rate": 3.9146954258323965e-05, "loss": 0.0237, "num_input_tokens_seen": 99946880, "step": 46305 }, { "epoch": 7.554649265905383, "grad_norm": 1.4967987537384033, "learning_rate": 3.9144019766403564e-05, "loss": 0.1633, "num_input_tokens_seen": 99957792, "step": 46310 }, { "epoch": 7.555464926590538, "grad_norm": 0.1841173619031906, "learning_rate": 3.914108498783667e-05, "loss": 0.175, "num_input_tokens_seen": 99967168, "step": 46315 }, { "epoch": 7.556280587275693, "grad_norm": 0.44141265749931335, "learning_rate": 3.9138149922682745e-05, "loss": 0.0579, "num_input_tokens_seen": 99978784, "step": 46320 }, { "epoch": 7.557096247960848, "grad_norm": 0.3475121259689331, "learning_rate": 3.913521457100127e-05, "loss": 0.1623, "num_input_tokens_seen": 99990144, "step": 46325 }, { "epoch": 7.557911908646004, "grad_norm": 0.21673616766929626, "learning_rate": 3.913227893285175e-05, "loss": 0.0155, "num_input_tokens_seen": 100001568, "step": 46330 }, { "epoch": 7.558727569331158, "grad_norm": 1.804120421409607, "learning_rate": 3.912934300829366e-05, "loss": 0.1813, "num_input_tokens_seen": 100012128, "step": 46335 }, { "epoch": 7.559543230016313, "grad_norm": 0.09261759370565414, "learning_rate": 3.912640679738652e-05, "loss": 0.1481, "num_input_tokens_seen": 100023936, "step": 46340 }, { "epoch": 7.560358890701468, "grad_norm": 1.2653428316116333, "learning_rate": 3.9123470300189815e-05, "loss": 0.0905, "num_input_tokens_seen": 100035552, "step": 46345 }, { "epoch": 7.561174551386623, "grad_norm": 0.25104308128356934, "learning_rate": 3.912053351676308e-05, "loss": 0.0187, "num_input_tokens_seen": 100045408, "step": 46350 }, { "epoch": 7.561990212071779, "grad_norm": 0.22597941756248474, "learning_rate": 3.911759644716581e-05, "loss": 0.1904, "num_input_tokens_seen": 100057184, "step": 46355 }, { "epoch": 7.562805872756933, "grad_norm": 0.28469347953796387, "learning_rate": 3.911465909145754e-05, "loss": 0.0273, "num_input_tokens_seen": 100068384, "step": 46360 }, { "epoch": 7.563621533442088, "grad_norm": 0.7793480157852173, "learning_rate": 3.9111721449697806e-05, "loss": 0.1756, "num_input_tokens_seen": 100079488, "step": 46365 }, { "epoch": 7.564437194127243, "grad_norm": 0.28165605664253235, "learning_rate": 3.910878352194612e-05, "loss": 0.1766, "num_input_tokens_seen": 100090720, "step": 46370 }, { "epoch": 7.565252854812398, "grad_norm": 0.8141551613807678, "learning_rate": 3.910584530826205e-05, "loss": 0.0938, "num_input_tokens_seen": 100101248, "step": 46375 }, { "epoch": 7.566068515497553, "grad_norm": 0.41564998030662537, "learning_rate": 3.9102906808705124e-05, "loss": 0.0492, "num_input_tokens_seen": 100112224, "step": 46380 }, { "epoch": 7.566884176182708, "grad_norm": 0.2572745680809021, "learning_rate": 3.909996802333491e-05, "loss": 0.0906, "num_input_tokens_seen": 100121632, "step": 46385 }, { "epoch": 7.567699836867863, "grad_norm": 1.6539955139160156, "learning_rate": 3.9097028952210944e-05, "loss": 0.0904, "num_input_tokens_seen": 100133024, "step": 46390 }, { "epoch": 7.568515497553018, "grad_norm": 0.44763392210006714, "learning_rate": 3.9094089595392814e-05, "loss": 0.202, "num_input_tokens_seen": 100143008, "step": 46395 }, { "epoch": 7.569331158238173, "grad_norm": 0.04475487396121025, "learning_rate": 3.909114995294007e-05, "loss": 0.1672, "num_input_tokens_seen": 100153408, "step": 46400 }, { "epoch": 7.570146818923328, "grad_norm": 1.1805700063705444, "learning_rate": 3.90882100249123e-05, "loss": 0.1927, "num_input_tokens_seen": 100163744, "step": 46405 }, { "epoch": 7.5709624796084825, "grad_norm": 0.2645778954029083, "learning_rate": 3.9085269811369084e-05, "loss": 0.0753, "num_input_tokens_seen": 100172608, "step": 46410 }, { "epoch": 7.571778140293638, "grad_norm": 0.13140533864498138, "learning_rate": 3.9082329312370006e-05, "loss": 0.044, "num_input_tokens_seen": 100182400, "step": 46415 }, { "epoch": 7.572593800978793, "grad_norm": 1.5311123132705688, "learning_rate": 3.9079388527974655e-05, "loss": 0.0751, "num_input_tokens_seen": 100191936, "step": 46420 }, { "epoch": 7.573409461663948, "grad_norm": 0.07467470318078995, "learning_rate": 3.9076447458242635e-05, "loss": 0.1026, "num_input_tokens_seen": 100202048, "step": 46425 }, { "epoch": 7.574225122349103, "grad_norm": 0.24795308709144592, "learning_rate": 3.907350610323355e-05, "loss": 0.1184, "num_input_tokens_seen": 100213632, "step": 46430 }, { "epoch": 7.575040783034257, "grad_norm": 1.2748138904571533, "learning_rate": 3.907056446300702e-05, "loss": 0.2255, "num_input_tokens_seen": 100224096, "step": 46435 }, { "epoch": 7.575856443719413, "grad_norm": 0.46071553230285645, "learning_rate": 3.906762253762264e-05, "loss": 0.1361, "num_input_tokens_seen": 100234368, "step": 46440 }, { "epoch": 7.576672104404568, "grad_norm": 0.0475834496319294, "learning_rate": 3.9064680327140046e-05, "loss": 0.0591, "num_input_tokens_seen": 100244256, "step": 46445 }, { "epoch": 7.577487765089723, "grad_norm": 1.8208330869674683, "learning_rate": 3.906173783161887e-05, "loss": 0.1027, "num_input_tokens_seen": 100255360, "step": 46450 }, { "epoch": 7.578303425774878, "grad_norm": 0.04036610201001167, "learning_rate": 3.9058795051118736e-05, "loss": 0.1149, "num_input_tokens_seen": 100265824, "step": 46455 }, { "epoch": 7.579119086460032, "grad_norm": 0.1466396152973175, "learning_rate": 3.9055851985699274e-05, "loss": 0.2221, "num_input_tokens_seen": 100276512, "step": 46460 }, { "epoch": 7.579934747145187, "grad_norm": 0.05334404855966568, "learning_rate": 3.905290863542015e-05, "loss": 0.0903, "num_input_tokens_seen": 100286560, "step": 46465 }, { "epoch": 7.580750407830343, "grad_norm": 0.8740490078926086, "learning_rate": 3.904996500034101e-05, "loss": 0.0987, "num_input_tokens_seen": 100297856, "step": 46470 }, { "epoch": 7.581566068515498, "grad_norm": 0.3425986170768738, "learning_rate": 3.90470210805215e-05, "loss": 0.1992, "num_input_tokens_seen": 100308352, "step": 46475 }, { "epoch": 7.582381729200653, "grad_norm": 0.9825703501701355, "learning_rate": 3.904407687602128e-05, "loss": 0.0828, "num_input_tokens_seen": 100317536, "step": 46480 }, { "epoch": 7.583197389885807, "grad_norm": 0.0947476327419281, "learning_rate": 3.9041132386900034e-05, "loss": 0.0259, "num_input_tokens_seen": 100327808, "step": 46485 }, { "epoch": 7.584013050570962, "grad_norm": 0.05340777337551117, "learning_rate": 3.903818761321742e-05, "loss": 0.0905, "num_input_tokens_seen": 100339712, "step": 46490 }, { "epoch": 7.584828711256117, "grad_norm": 1.0816164016723633, "learning_rate": 3.903524255503314e-05, "loss": 0.2123, "num_input_tokens_seen": 100349696, "step": 46495 }, { "epoch": 7.585644371941273, "grad_norm": 0.7336763143539429, "learning_rate": 3.903229721240686e-05, "loss": 0.2893, "num_input_tokens_seen": 100360160, "step": 46500 }, { "epoch": 7.5864600326264275, "grad_norm": 0.7071265578269958, "learning_rate": 3.902935158539827e-05, "loss": 0.0504, "num_input_tokens_seen": 100370848, "step": 46505 }, { "epoch": 7.587275693311582, "grad_norm": 0.5877469778060913, "learning_rate": 3.902640567406708e-05, "loss": 0.0309, "num_input_tokens_seen": 100381248, "step": 46510 }, { "epoch": 7.588091353996737, "grad_norm": 2.0109474658966064, "learning_rate": 3.902345947847298e-05, "loss": 0.1067, "num_input_tokens_seen": 100389888, "step": 46515 }, { "epoch": 7.588907014681892, "grad_norm": 0.2313920110464096, "learning_rate": 3.902051299867569e-05, "loss": 0.0622, "num_input_tokens_seen": 100399840, "step": 46520 }, { "epoch": 7.589722675367048, "grad_norm": 0.8867098689079285, "learning_rate": 3.9017566234734904e-05, "loss": 0.1078, "num_input_tokens_seen": 100410848, "step": 46525 }, { "epoch": 7.5905383360522025, "grad_norm": 1.1245237588882446, "learning_rate": 3.901461918671036e-05, "loss": 0.2845, "num_input_tokens_seen": 100422368, "step": 46530 }, { "epoch": 7.591353996737357, "grad_norm": 0.14400865137577057, "learning_rate": 3.9011671854661785e-05, "loss": 0.1104, "num_input_tokens_seen": 100433504, "step": 46535 }, { "epoch": 7.592169657422512, "grad_norm": 0.23571769893169403, "learning_rate": 3.90087242386489e-05, "loss": 0.0809, "num_input_tokens_seen": 100443584, "step": 46540 }, { "epoch": 7.592985318107667, "grad_norm": 1.2850818634033203, "learning_rate": 3.900577633873146e-05, "loss": 0.2279, "num_input_tokens_seen": 100453216, "step": 46545 }, { "epoch": 7.593800978792823, "grad_norm": 0.3957311809062958, "learning_rate": 3.900282815496917e-05, "loss": 0.0568, "num_input_tokens_seen": 100463296, "step": 46550 }, { "epoch": 7.5946166394779775, "grad_norm": 0.9584978818893433, "learning_rate": 3.899987968742183e-05, "loss": 0.2857, "num_input_tokens_seen": 100473728, "step": 46555 }, { "epoch": 7.595432300163132, "grad_norm": 1.8389469385147095, "learning_rate": 3.899693093614915e-05, "loss": 0.1002, "num_input_tokens_seen": 100485344, "step": 46560 }, { "epoch": 7.596247960848287, "grad_norm": 1.8543059825897217, "learning_rate": 3.899398190121092e-05, "loss": 0.146, "num_input_tokens_seen": 100495552, "step": 46565 }, { "epoch": 7.597063621533442, "grad_norm": 0.10677322745323181, "learning_rate": 3.8991032582666884e-05, "loss": 0.0937, "num_input_tokens_seen": 100506944, "step": 46570 }, { "epoch": 7.597879282218597, "grad_norm": 0.9180313944816589, "learning_rate": 3.898808298057683e-05, "loss": 0.125, "num_input_tokens_seen": 100517760, "step": 46575 }, { "epoch": 7.598694942903752, "grad_norm": 0.39005225896835327, "learning_rate": 3.898513309500052e-05, "loss": 0.0907, "num_input_tokens_seen": 100528768, "step": 46580 }, { "epoch": 7.599510603588907, "grad_norm": 0.5913889408111572, "learning_rate": 3.898218292599776e-05, "loss": 0.1002, "num_input_tokens_seen": 100539392, "step": 46585 }, { "epoch": 7.600326264274062, "grad_norm": 0.42878198623657227, "learning_rate": 3.897923247362833e-05, "loss": 0.1037, "num_input_tokens_seen": 100550176, "step": 46590 }, { "epoch": 7.601141924959217, "grad_norm": 1.5063260793685913, "learning_rate": 3.8976281737952e-05, "loss": 0.1208, "num_input_tokens_seen": 100561216, "step": 46595 }, { "epoch": 7.601957585644372, "grad_norm": 0.1869320571422577, "learning_rate": 3.89733307190286e-05, "loss": 0.0326, "num_input_tokens_seen": 100571648, "step": 46600 }, { "epoch": 7.602773246329527, "grad_norm": 0.9755046963691711, "learning_rate": 3.897037941691793e-05, "loss": 0.1922, "num_input_tokens_seen": 100583776, "step": 46605 }, { "epoch": 7.603588907014682, "grad_norm": 0.39665940403938293, "learning_rate": 3.8967427831679794e-05, "loss": 0.0506, "num_input_tokens_seen": 100592992, "step": 46610 }, { "epoch": 7.604404567699837, "grad_norm": 0.556189775466919, "learning_rate": 3.8964475963374015e-05, "loss": 0.1492, "num_input_tokens_seen": 100603680, "step": 46615 }, { "epoch": 7.605220228384992, "grad_norm": 1.8824820518493652, "learning_rate": 3.896152381206042e-05, "loss": 0.258, "num_input_tokens_seen": 100615808, "step": 46620 }, { "epoch": 7.606035889070147, "grad_norm": 1.9155240058898926, "learning_rate": 3.895857137779882e-05, "loss": 0.1811, "num_input_tokens_seen": 100625696, "step": 46625 }, { "epoch": 7.6068515497553015, "grad_norm": 0.3535859286785126, "learning_rate": 3.895561866064907e-05, "loss": 0.0726, "num_input_tokens_seen": 100635968, "step": 46630 }, { "epoch": 7.607667210440457, "grad_norm": 0.2454894334077835, "learning_rate": 3.895266566067101e-05, "loss": 0.1172, "num_input_tokens_seen": 100647104, "step": 46635 }, { "epoch": 7.608482871125612, "grad_norm": 0.1036338210105896, "learning_rate": 3.8949712377924475e-05, "loss": 0.0163, "num_input_tokens_seen": 100656992, "step": 46640 }, { "epoch": 7.609298531810767, "grad_norm": 1.2469701766967773, "learning_rate": 3.8946758812469324e-05, "loss": 0.1183, "num_input_tokens_seen": 100668288, "step": 46645 }, { "epoch": 7.610114192495922, "grad_norm": 0.5641491413116455, "learning_rate": 3.894380496436542e-05, "loss": 0.1853, "num_input_tokens_seen": 100680736, "step": 46650 }, { "epoch": 7.6109298531810765, "grad_norm": 0.29211610555648804, "learning_rate": 3.8940850833672603e-05, "loss": 0.0566, "num_input_tokens_seen": 100692000, "step": 46655 }, { "epoch": 7.611745513866231, "grad_norm": 1.3356266021728516, "learning_rate": 3.893789642045077e-05, "loss": 0.243, "num_input_tokens_seen": 100701440, "step": 46660 }, { "epoch": 7.612561174551386, "grad_norm": 0.1150565892457962, "learning_rate": 3.893494172475978e-05, "loss": 0.1193, "num_input_tokens_seen": 100712224, "step": 46665 }, { "epoch": 7.613376835236542, "grad_norm": 1.0062711238861084, "learning_rate": 3.8931986746659524e-05, "loss": 0.0744, "num_input_tokens_seen": 100722912, "step": 46670 }, { "epoch": 7.614192495921697, "grad_norm": 0.08601010590791702, "learning_rate": 3.892903148620988e-05, "loss": 0.0999, "num_input_tokens_seen": 100732384, "step": 46675 }, { "epoch": 7.6150081566068515, "grad_norm": 1.2047306299209595, "learning_rate": 3.892607594347074e-05, "loss": 0.2007, "num_input_tokens_seen": 100743328, "step": 46680 }, { "epoch": 7.615823817292006, "grad_norm": 0.660514235496521, "learning_rate": 3.892312011850201e-05, "loss": 0.0949, "num_input_tokens_seen": 100754944, "step": 46685 }, { "epoch": 7.616639477977161, "grad_norm": 0.7704527378082275, "learning_rate": 3.892016401136358e-05, "loss": 0.2683, "num_input_tokens_seen": 100767200, "step": 46690 }, { "epoch": 7.617455138662317, "grad_norm": 1.1635453701019287, "learning_rate": 3.891720762211537e-05, "loss": 0.1396, "num_input_tokens_seen": 100777408, "step": 46695 }, { "epoch": 7.618270799347472, "grad_norm": 0.1360381692647934, "learning_rate": 3.8914250950817297e-05, "loss": 0.2069, "num_input_tokens_seen": 100787584, "step": 46700 }, { "epoch": 7.6190864600326265, "grad_norm": 0.21970383822917938, "learning_rate": 3.891129399752928e-05, "loss": 0.0953, "num_input_tokens_seen": 100796800, "step": 46705 }, { "epoch": 7.619902120717781, "grad_norm": 0.24437034130096436, "learning_rate": 3.890833676231123e-05, "loss": 0.0302, "num_input_tokens_seen": 100806528, "step": 46710 }, { "epoch": 7.620717781402936, "grad_norm": 1.197222352027893, "learning_rate": 3.89053792452231e-05, "loss": 0.0903, "num_input_tokens_seen": 100817120, "step": 46715 }, { "epoch": 7.621533442088092, "grad_norm": 2.4116721153259277, "learning_rate": 3.890242144632481e-05, "loss": 0.2809, "num_input_tokens_seen": 100828000, "step": 46720 }, { "epoch": 7.622349102773247, "grad_norm": 0.10524783283472061, "learning_rate": 3.889946336567632e-05, "loss": 0.0269, "num_input_tokens_seen": 100839424, "step": 46725 }, { "epoch": 7.623164763458401, "grad_norm": 0.7084595561027527, "learning_rate": 3.889650500333758e-05, "loss": 0.0952, "num_input_tokens_seen": 100849472, "step": 46730 }, { "epoch": 7.623980424143556, "grad_norm": 1.204048991203308, "learning_rate": 3.889354635936853e-05, "loss": 0.0619, "num_input_tokens_seen": 100859232, "step": 46735 }, { "epoch": 7.624796084828711, "grad_norm": 0.46564385294914246, "learning_rate": 3.889058743382913e-05, "loss": 0.0696, "num_input_tokens_seen": 100869888, "step": 46740 }, { "epoch": 7.625611745513866, "grad_norm": 0.403065025806427, "learning_rate": 3.888762822677936e-05, "loss": 0.1954, "num_input_tokens_seen": 100881440, "step": 46745 }, { "epoch": 7.626427406199021, "grad_norm": 0.6556480526924133, "learning_rate": 3.888466873827919e-05, "loss": 0.1082, "num_input_tokens_seen": 100892160, "step": 46750 }, { "epoch": 7.627243066884176, "grad_norm": 0.27345138788223267, "learning_rate": 3.888170896838859e-05, "loss": 0.0891, "num_input_tokens_seen": 100901280, "step": 46755 }, { "epoch": 7.628058727569331, "grad_norm": 1.8373863697052002, "learning_rate": 3.887874891716755e-05, "loss": 0.1926, "num_input_tokens_seen": 100913440, "step": 46760 }, { "epoch": 7.628874388254486, "grad_norm": 0.06585919857025146, "learning_rate": 3.887578858467605e-05, "loss": 0.1975, "num_input_tokens_seen": 100923840, "step": 46765 }, { "epoch": 7.629690048939641, "grad_norm": 0.3939535915851593, "learning_rate": 3.887282797097411e-05, "loss": 0.0842, "num_input_tokens_seen": 100933824, "step": 46770 }, { "epoch": 7.630505709624796, "grad_norm": 0.31483733654022217, "learning_rate": 3.8869867076121694e-05, "loss": 0.0571, "num_input_tokens_seen": 100943424, "step": 46775 }, { "epoch": 7.631321370309951, "grad_norm": 0.4083932936191559, "learning_rate": 3.886690590017883e-05, "loss": 0.1454, "num_input_tokens_seen": 100955040, "step": 46780 }, { "epoch": 7.632137030995106, "grad_norm": 1.7674119472503662, "learning_rate": 3.886394444320554e-05, "loss": 0.1082, "num_input_tokens_seen": 100966848, "step": 46785 }, { "epoch": 7.632952691680261, "grad_norm": 0.12073198705911636, "learning_rate": 3.886098270526181e-05, "loss": 0.0923, "num_input_tokens_seen": 100977888, "step": 46790 }, { "epoch": 7.633768352365416, "grad_norm": 0.16527751088142395, "learning_rate": 3.885802068640769e-05, "loss": 0.0339, "num_input_tokens_seen": 100988736, "step": 46795 }, { "epoch": 7.634584013050571, "grad_norm": 0.5068936347961426, "learning_rate": 3.8855058386703194e-05, "loss": 0.1011, "num_input_tokens_seen": 100999360, "step": 46800 }, { "epoch": 7.635399673735726, "grad_norm": 0.31832486391067505, "learning_rate": 3.885209580620838e-05, "loss": 0.0344, "num_input_tokens_seen": 101011104, "step": 46805 }, { "epoch": 7.636215334420881, "grad_norm": 0.41865062713623047, "learning_rate": 3.8849132944983256e-05, "loss": 0.0303, "num_input_tokens_seen": 101021664, "step": 46810 }, { "epoch": 7.637030995106036, "grad_norm": 0.15827620029449463, "learning_rate": 3.8846169803087885e-05, "loss": 0.1073, "num_input_tokens_seen": 101032576, "step": 46815 }, { "epoch": 7.637846655791191, "grad_norm": 0.7083969712257385, "learning_rate": 3.884320638058232e-05, "loss": 0.1058, "num_input_tokens_seen": 101043040, "step": 46820 }, { "epoch": 7.638662316476346, "grad_norm": 0.1738099753856659, "learning_rate": 3.884024267752662e-05, "loss": 0.2497, "num_input_tokens_seen": 101054400, "step": 46825 }, { "epoch": 7.6394779771615005, "grad_norm": 0.38538408279418945, "learning_rate": 3.8837278693980844e-05, "loss": 0.0327, "num_input_tokens_seen": 101064896, "step": 46830 }, { "epoch": 7.640293637846656, "grad_norm": 0.8182572722434998, "learning_rate": 3.883431443000506e-05, "loss": 0.075, "num_input_tokens_seen": 101074848, "step": 46835 }, { "epoch": 7.641109298531811, "grad_norm": 0.13531559705734253, "learning_rate": 3.883134988565934e-05, "loss": 0.1346, "num_input_tokens_seen": 101085312, "step": 46840 }, { "epoch": 7.641924959216966, "grad_norm": 0.3100294768810272, "learning_rate": 3.882838506100377e-05, "loss": 0.0169, "num_input_tokens_seen": 101095584, "step": 46845 }, { "epoch": 7.642740619902121, "grad_norm": 0.08699710667133331, "learning_rate": 3.882541995609844e-05, "loss": 0.0106, "num_input_tokens_seen": 101105760, "step": 46850 }, { "epoch": 7.643556280587275, "grad_norm": 0.0862332284450531, "learning_rate": 3.882245457100343e-05, "loss": 0.0679, "num_input_tokens_seen": 101116576, "step": 46855 }, { "epoch": 7.64437194127243, "grad_norm": 0.5462794899940491, "learning_rate": 3.8819488905778847e-05, "loss": 0.0573, "num_input_tokens_seen": 101127840, "step": 46860 }, { "epoch": 7.645187601957586, "grad_norm": 0.31633633375167847, "learning_rate": 3.881652296048479e-05, "loss": 0.1345, "num_input_tokens_seen": 101140192, "step": 46865 }, { "epoch": 7.646003262642741, "grad_norm": 1.150389552116394, "learning_rate": 3.881355673518137e-05, "loss": 0.1856, "num_input_tokens_seen": 101150336, "step": 46870 }, { "epoch": 7.646818923327896, "grad_norm": 0.13993579149246216, "learning_rate": 3.881059022992869e-05, "loss": 0.0813, "num_input_tokens_seen": 101161088, "step": 46875 }, { "epoch": 7.64763458401305, "grad_norm": 1.010800838470459, "learning_rate": 3.880762344478688e-05, "loss": 0.0666, "num_input_tokens_seen": 101171776, "step": 46880 }, { "epoch": 7.648450244698205, "grad_norm": 0.10054805874824524, "learning_rate": 3.8804656379816076e-05, "loss": 0.0281, "num_input_tokens_seen": 101180544, "step": 46885 }, { "epoch": 7.649265905383361, "grad_norm": 0.6959864497184753, "learning_rate": 3.880168903507638e-05, "loss": 0.0419, "num_input_tokens_seen": 101192448, "step": 46890 }, { "epoch": 7.650081566068516, "grad_norm": 0.3417758345603943, "learning_rate": 3.879872141062796e-05, "loss": 0.0864, "num_input_tokens_seen": 101203168, "step": 46895 }, { "epoch": 7.650897226753671, "grad_norm": 1.5955113172531128, "learning_rate": 3.879575350653095e-05, "loss": 0.0852, "num_input_tokens_seen": 101213216, "step": 46900 }, { "epoch": 7.651712887438825, "grad_norm": 0.559394359588623, "learning_rate": 3.879278532284548e-05, "loss": 0.12, "num_input_tokens_seen": 101224960, "step": 46905 }, { "epoch": 7.65252854812398, "grad_norm": 1.464809775352478, "learning_rate": 3.878981685963173e-05, "loss": 0.1155, "num_input_tokens_seen": 101235296, "step": 46910 }, { "epoch": 7.653344208809135, "grad_norm": 1.4618879556655884, "learning_rate": 3.878684811694985e-05, "loss": 0.1631, "num_input_tokens_seen": 101246784, "step": 46915 }, { "epoch": 7.654159869494291, "grad_norm": 0.17293424904346466, "learning_rate": 3.8783879094859995e-05, "loss": 0.1332, "num_input_tokens_seen": 101257440, "step": 46920 }, { "epoch": 7.6549755301794455, "grad_norm": 0.06102149188518524, "learning_rate": 3.878090979342234e-05, "loss": 0.0514, "num_input_tokens_seen": 101268672, "step": 46925 }, { "epoch": 7.6557911908646, "grad_norm": 0.0417453870177269, "learning_rate": 3.877794021269707e-05, "loss": 0.1057, "num_input_tokens_seen": 101279168, "step": 46930 }, { "epoch": 7.656606851549755, "grad_norm": 3.0221879482269287, "learning_rate": 3.877497035274437e-05, "loss": 0.3085, "num_input_tokens_seen": 101290336, "step": 46935 }, { "epoch": 7.65742251223491, "grad_norm": 0.07115667313337326, "learning_rate": 3.877200021362441e-05, "loss": 0.2087, "num_input_tokens_seen": 101301344, "step": 46940 }, { "epoch": 7.658238172920065, "grad_norm": 0.10156070441007614, "learning_rate": 3.8769029795397406e-05, "loss": 0.2347, "num_input_tokens_seen": 101312160, "step": 46945 }, { "epoch": 7.6590538336052205, "grad_norm": 0.3696909546852112, "learning_rate": 3.876605909812353e-05, "loss": 0.1088, "num_input_tokens_seen": 101322720, "step": 46950 }, { "epoch": 7.659869494290375, "grad_norm": 0.3244011700153351, "learning_rate": 3.876308812186301e-05, "loss": 0.0712, "num_input_tokens_seen": 101334048, "step": 46955 }, { "epoch": 7.66068515497553, "grad_norm": 1.5113704204559326, "learning_rate": 3.8760116866676054e-05, "loss": 0.2363, "num_input_tokens_seen": 101345760, "step": 46960 }, { "epoch": 7.661500815660685, "grad_norm": 1.76423180103302, "learning_rate": 3.875714533262287e-05, "loss": 0.1361, "num_input_tokens_seen": 101356416, "step": 46965 }, { "epoch": 7.66231647634584, "grad_norm": 1.3825182914733887, "learning_rate": 3.875417351976369e-05, "loss": 0.2291, "num_input_tokens_seen": 101367360, "step": 46970 }, { "epoch": 7.6631321370309955, "grad_norm": 0.27052071690559387, "learning_rate": 3.875120142815873e-05, "loss": 0.0304, "num_input_tokens_seen": 101378848, "step": 46975 }, { "epoch": 7.66394779771615, "grad_norm": 0.0869910791516304, "learning_rate": 3.874822905786823e-05, "loss": 0.0997, "num_input_tokens_seen": 101389184, "step": 46980 }, { "epoch": 7.664763458401305, "grad_norm": 1.5440685749053955, "learning_rate": 3.874525640895242e-05, "loss": 0.2122, "num_input_tokens_seen": 101399840, "step": 46985 }, { "epoch": 7.66557911908646, "grad_norm": 0.49047040939331055, "learning_rate": 3.874228348147156e-05, "loss": 0.2166, "num_input_tokens_seen": 101410112, "step": 46990 }, { "epoch": 7.666394779771615, "grad_norm": 0.5207722783088684, "learning_rate": 3.8739310275485897e-05, "loss": 0.1121, "num_input_tokens_seen": 101420224, "step": 46995 }, { "epoch": 7.6672104404567705, "grad_norm": 0.6452193856239319, "learning_rate": 3.873633679105567e-05, "loss": 0.0418, "num_input_tokens_seen": 101430080, "step": 47000 }, { "epoch": 7.668026101141925, "grad_norm": 0.16912509500980377, "learning_rate": 3.873336302824116e-05, "loss": 0.0704, "num_input_tokens_seen": 101441856, "step": 47005 }, { "epoch": 7.66884176182708, "grad_norm": 1.9242092370986938, "learning_rate": 3.8730388987102617e-05, "loss": 0.2147, "num_input_tokens_seen": 101451232, "step": 47010 }, { "epoch": 7.669657422512235, "grad_norm": 1.0831177234649658, "learning_rate": 3.8727414667700334e-05, "loss": 0.1424, "num_input_tokens_seen": 101461792, "step": 47015 }, { "epoch": 7.67047308319739, "grad_norm": 1.5522561073303223, "learning_rate": 3.872444007009457e-05, "loss": 0.2524, "num_input_tokens_seen": 101472096, "step": 47020 }, { "epoch": 7.671288743882545, "grad_norm": 1.3081586360931396, "learning_rate": 3.872146519434563e-05, "loss": 0.1563, "num_input_tokens_seen": 101482784, "step": 47025 }, { "epoch": 7.672104404567699, "grad_norm": 0.5423646569252014, "learning_rate": 3.871849004051378e-05, "loss": 0.1043, "num_input_tokens_seen": 101493248, "step": 47030 }, { "epoch": 7.672920065252855, "grad_norm": 0.12390108406543732, "learning_rate": 3.871551460865932e-05, "loss": 0.0296, "num_input_tokens_seen": 101504128, "step": 47035 }, { "epoch": 7.67373572593801, "grad_norm": 0.7030653357505798, "learning_rate": 3.871253889884257e-05, "loss": 0.0671, "num_input_tokens_seen": 101515808, "step": 47040 }, { "epoch": 7.674551386623165, "grad_norm": 0.08767931163311005, "learning_rate": 3.870956291112382e-05, "loss": 0.147, "num_input_tokens_seen": 101526976, "step": 47045 }, { "epoch": 7.6753670473083195, "grad_norm": 0.4027406871318817, "learning_rate": 3.8706586645563384e-05, "loss": 0.0429, "num_input_tokens_seen": 101537440, "step": 47050 }, { "epoch": 7.676182707993474, "grad_norm": 0.31554505228996277, "learning_rate": 3.8703610102221577e-05, "loss": 0.2494, "num_input_tokens_seen": 101549280, "step": 47055 }, { "epoch": 7.67699836867863, "grad_norm": 0.1747203767299652, "learning_rate": 3.8700633281158726e-05, "loss": 0.0295, "num_input_tokens_seen": 101558720, "step": 47060 }, { "epoch": 7.677814029363785, "grad_norm": 0.08785583823919296, "learning_rate": 3.869765618243517e-05, "loss": 0.1829, "num_input_tokens_seen": 101570400, "step": 47065 }, { "epoch": 7.67862969004894, "grad_norm": 1.3817498683929443, "learning_rate": 3.8694678806111234e-05, "loss": 0.1388, "num_input_tokens_seen": 101581472, "step": 47070 }, { "epoch": 7.6794453507340945, "grad_norm": 0.04103320464491844, "learning_rate": 3.869170115224726e-05, "loss": 0.0392, "num_input_tokens_seen": 101592480, "step": 47075 }, { "epoch": 7.680261011419249, "grad_norm": 0.877912163734436, "learning_rate": 3.868872322090359e-05, "loss": 0.107, "num_input_tokens_seen": 101601952, "step": 47080 }, { "epoch": 7.681076672104405, "grad_norm": 1.389467716217041, "learning_rate": 3.8685745012140575e-05, "loss": 0.154, "num_input_tokens_seen": 101612384, "step": 47085 }, { "epoch": 7.68189233278956, "grad_norm": 0.5905671715736389, "learning_rate": 3.8682766526018575e-05, "loss": 0.0749, "num_input_tokens_seen": 101623872, "step": 47090 }, { "epoch": 7.682707993474715, "grad_norm": 0.17661626636981964, "learning_rate": 3.8679787762597965e-05, "loss": 0.1339, "num_input_tokens_seen": 101635072, "step": 47095 }, { "epoch": 7.6835236541598695, "grad_norm": 0.23967327177524567, "learning_rate": 3.867680872193909e-05, "loss": 0.1056, "num_input_tokens_seen": 101644896, "step": 47100 }, { "epoch": 7.684339314845024, "grad_norm": 0.0765729546546936, "learning_rate": 3.867382940410234e-05, "loss": 0.0432, "num_input_tokens_seen": 101655584, "step": 47105 }, { "epoch": 7.685154975530179, "grad_norm": 0.3792727589607239, "learning_rate": 3.8670849809148094e-05, "loss": 0.0536, "num_input_tokens_seen": 101666912, "step": 47110 }, { "epoch": 7.685970636215334, "grad_norm": 0.05175553262233734, "learning_rate": 3.866786993713673e-05, "loss": 0.0232, "num_input_tokens_seen": 101676768, "step": 47115 }, { "epoch": 7.68678629690049, "grad_norm": 0.22913490235805511, "learning_rate": 3.866488978812864e-05, "loss": 0.0517, "num_input_tokens_seen": 101687936, "step": 47120 }, { "epoch": 7.6876019575856445, "grad_norm": 1.728147268295288, "learning_rate": 3.8661909362184235e-05, "loss": 0.3165, "num_input_tokens_seen": 101698592, "step": 47125 }, { "epoch": 7.688417618270799, "grad_norm": 2.7966630458831787, "learning_rate": 3.8658928659363894e-05, "loss": 0.1558, "num_input_tokens_seen": 101709312, "step": 47130 }, { "epoch": 7.689233278955954, "grad_norm": 0.07662336528301239, "learning_rate": 3.865594767972804e-05, "loss": 0.1681, "num_input_tokens_seen": 101720512, "step": 47135 }, { "epoch": 7.690048939641109, "grad_norm": 0.3681540787220001, "learning_rate": 3.865296642333708e-05, "loss": 0.1477, "num_input_tokens_seen": 101732640, "step": 47140 }, { "epoch": 7.690864600326265, "grad_norm": 0.1851116269826889, "learning_rate": 3.864998489025144e-05, "loss": 0.1007, "num_input_tokens_seen": 101742400, "step": 47145 }, { "epoch": 7.691680261011419, "grad_norm": 0.1839848905801773, "learning_rate": 3.864700308053154e-05, "loss": 0.2853, "num_input_tokens_seen": 101751936, "step": 47150 }, { "epoch": 7.692495921696574, "grad_norm": 1.4921467304229736, "learning_rate": 3.8644020994237806e-05, "loss": 0.0714, "num_input_tokens_seen": 101763136, "step": 47155 }, { "epoch": 7.693311582381729, "grad_norm": 0.6511791944503784, "learning_rate": 3.864103863143068e-05, "loss": 0.0835, "num_input_tokens_seen": 101773440, "step": 47160 }, { "epoch": 7.694127243066884, "grad_norm": 0.46802154183387756, "learning_rate": 3.8638055992170595e-05, "loss": 0.1619, "num_input_tokens_seen": 101783264, "step": 47165 }, { "epoch": 7.69494290375204, "grad_norm": 0.278992623090744, "learning_rate": 3.8635073076518014e-05, "loss": 0.0585, "num_input_tokens_seen": 101793856, "step": 47170 }, { "epoch": 7.695758564437194, "grad_norm": 0.09452103078365326, "learning_rate": 3.8632089884533375e-05, "loss": 0.1407, "num_input_tokens_seen": 101804832, "step": 47175 }, { "epoch": 7.696574225122349, "grad_norm": 0.2677273452281952, "learning_rate": 3.8629106416277136e-05, "loss": 0.1223, "num_input_tokens_seen": 101815808, "step": 47180 }, { "epoch": 7.697389885807504, "grad_norm": 0.11565174907445908, "learning_rate": 3.8626122671809775e-05, "loss": 0.1246, "num_input_tokens_seen": 101825120, "step": 47185 }, { "epoch": 7.698205546492659, "grad_norm": 0.8182146549224854, "learning_rate": 3.862313865119175e-05, "loss": 0.1324, "num_input_tokens_seen": 101836416, "step": 47190 }, { "epoch": 7.699021207177814, "grad_norm": 0.40049993991851807, "learning_rate": 3.862015435448353e-05, "loss": 0.0972, "num_input_tokens_seen": 101846240, "step": 47195 }, { "epoch": 7.699836867862969, "grad_norm": 0.19330257177352905, "learning_rate": 3.8617169781745615e-05, "loss": 0.1184, "num_input_tokens_seen": 101857152, "step": 47200 }, { "epoch": 7.700652528548124, "grad_norm": 0.49246442317962646, "learning_rate": 3.861418493303848e-05, "loss": 0.0872, "num_input_tokens_seen": 101867296, "step": 47205 }, { "epoch": 7.701468189233279, "grad_norm": 0.5533923506736755, "learning_rate": 3.86111998084226e-05, "loss": 0.0908, "num_input_tokens_seen": 101876000, "step": 47210 }, { "epoch": 7.702283849918434, "grad_norm": 0.9256342649459839, "learning_rate": 3.860821440795851e-05, "loss": 0.0373, "num_input_tokens_seen": 101886080, "step": 47215 }, { "epoch": 7.703099510603589, "grad_norm": 0.6485630869865417, "learning_rate": 3.860522873170668e-05, "loss": 0.152, "num_input_tokens_seen": 101896480, "step": 47220 }, { "epoch": 7.7039151712887435, "grad_norm": 0.023657381534576416, "learning_rate": 3.860224277972764e-05, "loss": 0.0352, "num_input_tokens_seen": 101907136, "step": 47225 }, { "epoch": 7.704730831973899, "grad_norm": 0.13420630991458893, "learning_rate": 3.859925655208188e-05, "loss": 0.0984, "num_input_tokens_seen": 101918624, "step": 47230 }, { "epoch": 7.705546492659054, "grad_norm": 0.16022340953350067, "learning_rate": 3.859627004882994e-05, "loss": 0.0567, "num_input_tokens_seen": 101929408, "step": 47235 }, { "epoch": 7.706362153344209, "grad_norm": 0.058354806154966354, "learning_rate": 3.8593283270032344e-05, "loss": 0.1061, "num_input_tokens_seen": 101940736, "step": 47240 }, { "epoch": 7.707177814029364, "grad_norm": 0.8890196084976196, "learning_rate": 3.859029621574961e-05, "loss": 0.1272, "num_input_tokens_seen": 101950848, "step": 47245 }, { "epoch": 7.7079934747145185, "grad_norm": 0.11235952377319336, "learning_rate": 3.858730888604229e-05, "loss": 0.0743, "num_input_tokens_seen": 101961376, "step": 47250 }, { "epoch": 7.708809135399674, "grad_norm": 0.18781159818172455, "learning_rate": 3.858432128097091e-05, "loss": 0.0792, "num_input_tokens_seen": 101972448, "step": 47255 }, { "epoch": 7.709624796084829, "grad_norm": 0.2910650372505188, "learning_rate": 3.858133340059604e-05, "loss": 0.0265, "num_input_tokens_seen": 101983712, "step": 47260 }, { "epoch": 7.710440456769984, "grad_norm": 0.5959053635597229, "learning_rate": 3.857834524497821e-05, "loss": 0.1356, "num_input_tokens_seen": 101993792, "step": 47265 }, { "epoch": 7.711256117455139, "grad_norm": 0.3672882616519928, "learning_rate": 3.8575356814177986e-05, "loss": 0.0603, "num_input_tokens_seen": 102004800, "step": 47270 }, { "epoch": 7.712071778140293, "grad_norm": 0.8320317268371582, "learning_rate": 3.857236810825594e-05, "loss": 0.1653, "num_input_tokens_seen": 102015936, "step": 47275 }, { "epoch": 7.712887438825448, "grad_norm": 0.4325639009475708, "learning_rate": 3.856937912727263e-05, "loss": 0.225, "num_input_tokens_seen": 102026592, "step": 47280 }, { "epoch": 7.713703099510604, "grad_norm": 0.9592970609664917, "learning_rate": 3.8566389871288644e-05, "loss": 0.0512, "num_input_tokens_seen": 102036448, "step": 47285 }, { "epoch": 7.714518760195759, "grad_norm": 1.6542729139328003, "learning_rate": 3.856340034036456e-05, "loss": 0.1992, "num_input_tokens_seen": 102048800, "step": 47290 }, { "epoch": 7.715334420880914, "grad_norm": 0.4928557276725769, "learning_rate": 3.856041053456095e-05, "loss": 0.0516, "num_input_tokens_seen": 102060000, "step": 47295 }, { "epoch": 7.716150081566068, "grad_norm": 0.1096368134021759, "learning_rate": 3.8557420453938434e-05, "loss": 0.0308, "num_input_tokens_seen": 102071104, "step": 47300 }, { "epoch": 7.716965742251223, "grad_norm": 1.105696439743042, "learning_rate": 3.855443009855758e-05, "loss": 0.1757, "num_input_tokens_seen": 102082880, "step": 47305 }, { "epoch": 7.717781402936378, "grad_norm": 0.04499049857258797, "learning_rate": 3.855143946847902e-05, "loss": 0.0731, "num_input_tokens_seen": 102094816, "step": 47310 }, { "epoch": 7.718597063621534, "grad_norm": 0.34861230850219727, "learning_rate": 3.8548448563763334e-05, "loss": 0.1522, "num_input_tokens_seen": 102105632, "step": 47315 }, { "epoch": 7.719412724306689, "grad_norm": 1.8101402521133423, "learning_rate": 3.854545738447115e-05, "loss": 0.1629, "num_input_tokens_seen": 102115808, "step": 47320 }, { "epoch": 7.720228384991843, "grad_norm": 1.0227177143096924, "learning_rate": 3.8542465930663095e-05, "loss": 0.0963, "num_input_tokens_seen": 102127392, "step": 47325 }, { "epoch": 7.721044045676998, "grad_norm": 1.0482475757598877, "learning_rate": 3.853947420239979e-05, "loss": 0.0655, "num_input_tokens_seen": 102139136, "step": 47330 }, { "epoch": 7.721859706362153, "grad_norm": 0.03917352110147476, "learning_rate": 3.8536482199741865e-05, "loss": 0.2501, "num_input_tokens_seen": 102150688, "step": 47335 }, { "epoch": 7.722675367047309, "grad_norm": 1.0975412130355835, "learning_rate": 3.8533489922749944e-05, "loss": 0.4118, "num_input_tokens_seen": 102161568, "step": 47340 }, { "epoch": 7.7234910277324635, "grad_norm": 0.21146218478679657, "learning_rate": 3.8530497371484695e-05, "loss": 0.0204, "num_input_tokens_seen": 102173440, "step": 47345 }, { "epoch": 7.724306688417618, "grad_norm": 0.058959513902664185, "learning_rate": 3.8527504546006746e-05, "loss": 0.0257, "num_input_tokens_seen": 102185376, "step": 47350 }, { "epoch": 7.725122349102773, "grad_norm": 0.32624927163124084, "learning_rate": 3.852451144637675e-05, "loss": 0.0348, "num_input_tokens_seen": 102195456, "step": 47355 }, { "epoch": 7.725938009787928, "grad_norm": 1.4573309421539307, "learning_rate": 3.8521518072655386e-05, "loss": 0.1248, "num_input_tokens_seen": 102206080, "step": 47360 }, { "epoch": 7.726753670473083, "grad_norm": 2.027231216430664, "learning_rate": 3.8518524424903297e-05, "loss": 0.1335, "num_input_tokens_seen": 102216736, "step": 47365 }, { "epoch": 7.7275693311582385, "grad_norm": 0.14253559708595276, "learning_rate": 3.8515530503181165e-05, "loss": 0.0363, "num_input_tokens_seen": 102227904, "step": 47370 }, { "epoch": 7.728384991843393, "grad_norm": 1.0532923936843872, "learning_rate": 3.851253630754966e-05, "loss": 0.0513, "num_input_tokens_seen": 102239200, "step": 47375 }, { "epoch": 7.729200652528548, "grad_norm": 0.2563938498497009, "learning_rate": 3.8509541838069455e-05, "loss": 0.0223, "num_input_tokens_seen": 102251264, "step": 47380 }, { "epoch": 7.730016313213703, "grad_norm": 0.3112757205963135, "learning_rate": 3.850654709480126e-05, "loss": 0.0987, "num_input_tokens_seen": 102262432, "step": 47385 }, { "epoch": 7.730831973898858, "grad_norm": 2.2276647090911865, "learning_rate": 3.850355207780575e-05, "loss": 0.2143, "num_input_tokens_seen": 102274912, "step": 47390 }, { "epoch": 7.731647634584013, "grad_norm": 0.7110088467597961, "learning_rate": 3.850055678714362e-05, "loss": 0.1031, "num_input_tokens_seen": 102285696, "step": 47395 }, { "epoch": 7.732463295269168, "grad_norm": 0.8498117327690125, "learning_rate": 3.849756122287559e-05, "loss": 0.0581, "num_input_tokens_seen": 102296160, "step": 47400 }, { "epoch": 7.733278955954323, "grad_norm": 1.3833866119384766, "learning_rate": 3.849456538506235e-05, "loss": 0.1024, "num_input_tokens_seen": 102307712, "step": 47405 }, { "epoch": 7.734094616639478, "grad_norm": 0.16742351651191711, "learning_rate": 3.8491569273764624e-05, "loss": 0.1116, "num_input_tokens_seen": 102320032, "step": 47410 }, { "epoch": 7.734910277324633, "grad_norm": 0.11258775740861893, "learning_rate": 3.848857288904314e-05, "loss": 0.203, "num_input_tokens_seen": 102330656, "step": 47415 }, { "epoch": 7.735725938009788, "grad_norm": 0.47220051288604736, "learning_rate": 3.8485576230958614e-05, "loss": 0.117, "num_input_tokens_seen": 102340704, "step": 47420 }, { "epoch": 7.736541598694943, "grad_norm": 1.1721611022949219, "learning_rate": 3.8482579299571775e-05, "loss": 0.0567, "num_input_tokens_seen": 102350752, "step": 47425 }, { "epoch": 7.737357259380098, "grad_norm": 0.09858588874340057, "learning_rate": 3.847958209494336e-05, "loss": 0.086, "num_input_tokens_seen": 102360800, "step": 47430 }, { "epoch": 7.738172920065253, "grad_norm": 1.7631583213806152, "learning_rate": 3.847658461713412e-05, "loss": 0.1555, "num_input_tokens_seen": 102372096, "step": 47435 }, { "epoch": 7.738988580750408, "grad_norm": 0.08491463959217072, "learning_rate": 3.8473586866204785e-05, "loss": 0.1787, "num_input_tokens_seen": 102382112, "step": 47440 }, { "epoch": 7.739804241435563, "grad_norm": 1.2068063020706177, "learning_rate": 3.8470588842216126e-05, "loss": 0.0812, "num_input_tokens_seen": 102391392, "step": 47445 }, { "epoch": 7.740619902120718, "grad_norm": 0.9636883735656738, "learning_rate": 3.84675905452289e-05, "loss": 0.0726, "num_input_tokens_seen": 102402720, "step": 47450 }, { "epoch": 7.741435562805873, "grad_norm": 0.04937894642353058, "learning_rate": 3.846459197530386e-05, "loss": 0.3078, "num_input_tokens_seen": 102412992, "step": 47455 }, { "epoch": 7.742251223491028, "grad_norm": 0.7638336420059204, "learning_rate": 3.846159313250179e-05, "loss": 0.2659, "num_input_tokens_seen": 102424416, "step": 47460 }, { "epoch": 7.743066884176183, "grad_norm": 0.1519625037908554, "learning_rate": 3.8458594016883455e-05, "loss": 0.1938, "num_input_tokens_seen": 102435296, "step": 47465 }, { "epoch": 7.7438825448613375, "grad_norm": 1.169859528541565, "learning_rate": 3.8455594628509636e-05, "loss": 0.1208, "num_input_tokens_seen": 102445856, "step": 47470 }, { "epoch": 7.744698205546492, "grad_norm": 0.2292788326740265, "learning_rate": 3.845259496744112e-05, "loss": 0.0136, "num_input_tokens_seen": 102457024, "step": 47475 }, { "epoch": 7.745513866231647, "grad_norm": 1.4843575954437256, "learning_rate": 3.84495950337387e-05, "loss": 0.1347, "num_input_tokens_seen": 102467744, "step": 47480 }, { "epoch": 7.746329526916803, "grad_norm": 1.8050823211669922, "learning_rate": 3.844659482746318e-05, "loss": 0.1821, "num_input_tokens_seen": 102477408, "step": 47485 }, { "epoch": 7.747145187601958, "grad_norm": 1.0496468544006348, "learning_rate": 3.844359434867536e-05, "loss": 0.0736, "num_input_tokens_seen": 102487840, "step": 47490 }, { "epoch": 7.7479608482871125, "grad_norm": 0.2854286730289459, "learning_rate": 3.844059359743605e-05, "loss": 0.077, "num_input_tokens_seen": 102498624, "step": 47495 }, { "epoch": 7.748776508972267, "grad_norm": 0.26388490200042725, "learning_rate": 3.843759257380605e-05, "loss": 0.0462, "num_input_tokens_seen": 102509952, "step": 47500 }, { "epoch": 7.749592169657422, "grad_norm": 0.08342530578374863, "learning_rate": 3.843459127784619e-05, "loss": 0.0146, "num_input_tokens_seen": 102521120, "step": 47505 }, { "epoch": 7.750407830342578, "grad_norm": 0.25788694620132446, "learning_rate": 3.84315897096173e-05, "loss": 0.1421, "num_input_tokens_seen": 102532832, "step": 47510 }, { "epoch": 7.751223491027733, "grad_norm": 0.960186779499054, "learning_rate": 3.842858786918021e-05, "loss": 0.083, "num_input_tokens_seen": 102542752, "step": 47515 }, { "epoch": 7.7520391517128875, "grad_norm": 0.1408757120370865, "learning_rate": 3.842558575659574e-05, "loss": 0.0579, "num_input_tokens_seen": 102553920, "step": 47520 }, { "epoch": 7.752854812398042, "grad_norm": 1.7053077220916748, "learning_rate": 3.842258337192475e-05, "loss": 0.1432, "num_input_tokens_seen": 102565088, "step": 47525 }, { "epoch": 7.753670473083197, "grad_norm": 0.56120765209198, "learning_rate": 3.8419580715228074e-05, "loss": 0.0384, "num_input_tokens_seen": 102576704, "step": 47530 }, { "epoch": 7.754486133768353, "grad_norm": 0.5345782041549683, "learning_rate": 3.8416577786566577e-05, "loss": 0.2663, "num_input_tokens_seen": 102588000, "step": 47535 }, { "epoch": 7.755301794453508, "grad_norm": 1.3266825675964355, "learning_rate": 3.841357458600111e-05, "loss": 0.1951, "num_input_tokens_seen": 102598144, "step": 47540 }, { "epoch": 7.7561174551386625, "grad_norm": 0.29005253314971924, "learning_rate": 3.8410571113592534e-05, "loss": 0.1105, "num_input_tokens_seen": 102608256, "step": 47545 }, { "epoch": 7.756933115823817, "grad_norm": 1.0592870712280273, "learning_rate": 3.8407567369401725e-05, "loss": 0.082, "num_input_tokens_seen": 102618528, "step": 47550 }, { "epoch": 7.757748776508972, "grad_norm": 0.840216338634491, "learning_rate": 3.8404563353489556e-05, "loss": 0.1813, "num_input_tokens_seen": 102629632, "step": 47555 }, { "epoch": 7.758564437194127, "grad_norm": 0.35633501410484314, "learning_rate": 3.8401559065916907e-05, "loss": 0.0458, "num_input_tokens_seen": 102639712, "step": 47560 }, { "epoch": 7.759380097879282, "grad_norm": 0.9867788553237915, "learning_rate": 3.839855450674465e-05, "loss": 0.1249, "num_input_tokens_seen": 102650272, "step": 47565 }, { "epoch": 7.760195758564437, "grad_norm": 0.3335767686367035, "learning_rate": 3.83955496760337e-05, "loss": 0.1614, "num_input_tokens_seen": 102661216, "step": 47570 }, { "epoch": 7.761011419249592, "grad_norm": 1.1070374250411987, "learning_rate": 3.839254457384495e-05, "loss": 0.0644, "num_input_tokens_seen": 102671200, "step": 47575 }, { "epoch": 7.761827079934747, "grad_norm": 0.6088117361068726, "learning_rate": 3.8389539200239275e-05, "loss": 0.0938, "num_input_tokens_seen": 102682144, "step": 47580 }, { "epoch": 7.762642740619902, "grad_norm": 1.0218414068222046, "learning_rate": 3.838653355527762e-05, "loss": 0.1592, "num_input_tokens_seen": 102693376, "step": 47585 }, { "epoch": 7.763458401305057, "grad_norm": 0.09349432587623596, "learning_rate": 3.838352763902087e-05, "loss": 0.0469, "num_input_tokens_seen": 102704800, "step": 47590 }, { "epoch": 7.764274061990212, "grad_norm": 1.0597634315490723, "learning_rate": 3.838052145152995e-05, "loss": 0.1812, "num_input_tokens_seen": 102715776, "step": 47595 }, { "epoch": 7.765089722675367, "grad_norm": 2.1668574810028076, "learning_rate": 3.83775149928658e-05, "loss": 0.2043, "num_input_tokens_seen": 102727232, "step": 47600 }, { "epoch": 7.765905383360522, "grad_norm": 0.3405378460884094, "learning_rate": 3.8374508263089326e-05, "loss": 0.1649, "num_input_tokens_seen": 102737568, "step": 47605 }, { "epoch": 7.766721044045677, "grad_norm": 0.16477367281913757, "learning_rate": 3.8371501262261485e-05, "loss": 0.1794, "num_input_tokens_seen": 102749728, "step": 47610 }, { "epoch": 7.767536704730832, "grad_norm": 0.6417464017868042, "learning_rate": 3.83684939904432e-05, "loss": 0.0428, "num_input_tokens_seen": 102760256, "step": 47615 }, { "epoch": 7.768352365415987, "grad_norm": 0.21477285027503967, "learning_rate": 3.8365486447695435e-05, "loss": 0.0426, "num_input_tokens_seen": 102771648, "step": 47620 }, { "epoch": 7.769168026101142, "grad_norm": 0.22502651810646057, "learning_rate": 3.8362478634079124e-05, "loss": 0.1295, "num_input_tokens_seen": 102782912, "step": 47625 }, { "epoch": 7.769983686786297, "grad_norm": 0.14628149569034576, "learning_rate": 3.835947054965524e-05, "loss": 0.1171, "num_input_tokens_seen": 102795200, "step": 47630 }, { "epoch": 7.770799347471452, "grad_norm": 0.5223028063774109, "learning_rate": 3.8356462194484735e-05, "loss": 0.1719, "num_input_tokens_seen": 102805600, "step": 47635 }, { "epoch": 7.771615008156607, "grad_norm": 1.8050780296325684, "learning_rate": 3.8353453568628575e-05, "loss": 0.1296, "num_input_tokens_seen": 102816800, "step": 47640 }, { "epoch": 7.7724306688417615, "grad_norm": 0.18964619934558868, "learning_rate": 3.835044467214775e-05, "loss": 0.1244, "num_input_tokens_seen": 102827968, "step": 47645 }, { "epoch": 7.773246329526917, "grad_norm": 0.025013770908117294, "learning_rate": 3.8347435505103216e-05, "loss": 0.0593, "num_input_tokens_seen": 102838400, "step": 47650 }, { "epoch": 7.774061990212072, "grad_norm": 0.5167728662490845, "learning_rate": 3.834442606755597e-05, "loss": 0.0497, "num_input_tokens_seen": 102848704, "step": 47655 }, { "epoch": 7.774877650897227, "grad_norm": 0.40865427255630493, "learning_rate": 3.834141635956699e-05, "loss": 0.1154, "num_input_tokens_seen": 102859296, "step": 47660 }, { "epoch": 7.775693311582382, "grad_norm": 0.1229039654135704, "learning_rate": 3.833840638119731e-05, "loss": 0.0399, "num_input_tokens_seen": 102870592, "step": 47665 }, { "epoch": 7.7765089722675365, "grad_norm": 1.6150487661361694, "learning_rate": 3.8335396132507884e-05, "loss": 0.1203, "num_input_tokens_seen": 102881568, "step": 47670 }, { "epoch": 7.777324632952691, "grad_norm": 1.2821452617645264, "learning_rate": 3.833238561355974e-05, "loss": 0.1521, "num_input_tokens_seen": 102892320, "step": 47675 }, { "epoch": 7.778140293637847, "grad_norm": 0.5340595841407776, "learning_rate": 3.8329374824413896e-05, "loss": 0.257, "num_input_tokens_seen": 102903456, "step": 47680 }, { "epoch": 7.778955954323002, "grad_norm": 0.1300726979970932, "learning_rate": 3.832636376513136e-05, "loss": 0.1026, "num_input_tokens_seen": 102914208, "step": 47685 }, { "epoch": 7.779771615008157, "grad_norm": 1.7133119106292725, "learning_rate": 3.832335243577315e-05, "loss": 0.1388, "num_input_tokens_seen": 102925152, "step": 47690 }, { "epoch": 7.780587275693311, "grad_norm": 1.011945366859436, "learning_rate": 3.83203408364003e-05, "loss": 0.1792, "num_input_tokens_seen": 102936064, "step": 47695 }, { "epoch": 7.781402936378466, "grad_norm": 0.35619476437568665, "learning_rate": 3.831732896707385e-05, "loss": 0.1358, "num_input_tokens_seen": 102947200, "step": 47700 }, { "epoch": 7.782218597063622, "grad_norm": 1.133428931236267, "learning_rate": 3.831431682785484e-05, "loss": 0.1524, "num_input_tokens_seen": 102957216, "step": 47705 }, { "epoch": 7.783034257748777, "grad_norm": 0.4890064299106598, "learning_rate": 3.83113044188043e-05, "loss": 0.0471, "num_input_tokens_seen": 102968160, "step": 47710 }, { "epoch": 7.783849918433932, "grad_norm": 0.32268115878105164, "learning_rate": 3.830829173998329e-05, "loss": 0.0857, "num_input_tokens_seen": 102978912, "step": 47715 }, { "epoch": 7.784665579119086, "grad_norm": 1.5057659149169922, "learning_rate": 3.830527879145287e-05, "loss": 0.3058, "num_input_tokens_seen": 102989568, "step": 47720 }, { "epoch": 7.785481239804241, "grad_norm": 0.1710575819015503, "learning_rate": 3.83022655732741e-05, "loss": 0.0309, "num_input_tokens_seen": 103000512, "step": 47725 }, { "epoch": 7.786296900489396, "grad_norm": 0.10738617926836014, "learning_rate": 3.829925208550803e-05, "loss": 0.081, "num_input_tokens_seen": 103012320, "step": 47730 }, { "epoch": 7.787112561174552, "grad_norm": 0.020792128518223763, "learning_rate": 3.829623832821576e-05, "loss": 0.0265, "num_input_tokens_seen": 103023936, "step": 47735 }, { "epoch": 7.787928221859707, "grad_norm": 1.509466290473938, "learning_rate": 3.829322430145834e-05, "loss": 0.0904, "num_input_tokens_seen": 103033696, "step": 47740 }, { "epoch": 7.788743882544861, "grad_norm": 1.6766175031661987, "learning_rate": 3.8290210005296865e-05, "loss": 0.1582, "num_input_tokens_seen": 103045280, "step": 47745 }, { "epoch": 7.789559543230016, "grad_norm": 0.09769909083843231, "learning_rate": 3.828719543979244e-05, "loss": 0.0134, "num_input_tokens_seen": 103055584, "step": 47750 }, { "epoch": 7.790375203915171, "grad_norm": 0.9195945262908936, "learning_rate": 3.8284180605006124e-05, "loss": 0.1376, "num_input_tokens_seen": 103066880, "step": 47755 }, { "epoch": 7.791190864600326, "grad_norm": 0.12094347923994064, "learning_rate": 3.828116550099905e-05, "loss": 0.0475, "num_input_tokens_seen": 103077568, "step": 47760 }, { "epoch": 7.7920065252854815, "grad_norm": 0.036654211580753326, "learning_rate": 3.82781501278323e-05, "loss": 0.0715, "num_input_tokens_seen": 103088672, "step": 47765 }, { "epoch": 7.792822185970636, "grad_norm": 0.9625695943832397, "learning_rate": 3.8275134485567e-05, "loss": 0.0496, "num_input_tokens_seen": 103098496, "step": 47770 }, { "epoch": 7.793637846655791, "grad_norm": 1.501736044883728, "learning_rate": 3.827211857426426e-05, "loss": 0.23, "num_input_tokens_seen": 103109856, "step": 47775 }, { "epoch": 7.794453507340946, "grad_norm": 0.4276854395866394, "learning_rate": 3.826910239398519e-05, "loss": 0.1122, "num_input_tokens_seen": 103120032, "step": 47780 }, { "epoch": 7.795269168026101, "grad_norm": 1.4263328313827515, "learning_rate": 3.826608594479094e-05, "loss": 0.1844, "num_input_tokens_seen": 103131072, "step": 47785 }, { "epoch": 7.7960848287112565, "grad_norm": 1.3208824396133423, "learning_rate": 3.826306922674262e-05, "loss": 0.1422, "num_input_tokens_seen": 103142240, "step": 47790 }, { "epoch": 7.796900489396411, "grad_norm": 1.3410564661026, "learning_rate": 3.826005223990138e-05, "loss": 0.1595, "num_input_tokens_seen": 103152960, "step": 47795 }, { "epoch": 7.797716150081566, "grad_norm": 0.44201895594596863, "learning_rate": 3.825703498432836e-05, "loss": 0.161, "num_input_tokens_seen": 103164352, "step": 47800 }, { "epoch": 7.798531810766721, "grad_norm": 2.171236515045166, "learning_rate": 3.82540174600847e-05, "loss": 0.1764, "num_input_tokens_seen": 103175680, "step": 47805 }, { "epoch": 7.799347471451876, "grad_norm": 0.5606493949890137, "learning_rate": 3.825099966723158e-05, "loss": 0.0444, "num_input_tokens_seen": 103185952, "step": 47810 }, { "epoch": 7.800163132137031, "grad_norm": 1.6732910871505737, "learning_rate": 3.824798160583012e-05, "loss": 0.2605, "num_input_tokens_seen": 103195840, "step": 47815 }, { "epoch": 7.800978792822186, "grad_norm": 0.7569252252578735, "learning_rate": 3.8244963275941524e-05, "loss": 0.0996, "num_input_tokens_seen": 103207104, "step": 47820 }, { "epoch": 7.801794453507341, "grad_norm": 0.3339773714542389, "learning_rate": 3.824194467762694e-05, "loss": 0.1024, "num_input_tokens_seen": 103218624, "step": 47825 }, { "epoch": 7.802610114192496, "grad_norm": 0.2426123172044754, "learning_rate": 3.823892581094755e-05, "loss": 0.0608, "num_input_tokens_seen": 103230592, "step": 47830 }, { "epoch": 7.803425774877651, "grad_norm": 0.3664318323135376, "learning_rate": 3.823590667596453e-05, "loss": 0.1682, "num_input_tokens_seen": 103241984, "step": 47835 }, { "epoch": 7.804241435562806, "grad_norm": 2.123911142349243, "learning_rate": 3.823288727273907e-05, "loss": 0.2112, "num_input_tokens_seen": 103252800, "step": 47840 }, { "epoch": 7.80505709624796, "grad_norm": 0.04519585892558098, "learning_rate": 3.822986760133236e-05, "loss": 0.059, "num_input_tokens_seen": 103265440, "step": 47845 }, { "epoch": 7.805872756933116, "grad_norm": 0.5775474309921265, "learning_rate": 3.8226847661805605e-05, "loss": 0.0716, "num_input_tokens_seen": 103274976, "step": 47850 }, { "epoch": 7.806688417618271, "grad_norm": 0.33611249923706055, "learning_rate": 3.822382745422e-05, "loss": 0.0682, "num_input_tokens_seen": 103285344, "step": 47855 }, { "epoch": 7.807504078303426, "grad_norm": 0.8813247084617615, "learning_rate": 3.8220806978636756e-05, "loss": 0.0447, "num_input_tokens_seen": 103296576, "step": 47860 }, { "epoch": 7.808319738988581, "grad_norm": 0.2503659129142761, "learning_rate": 3.8217786235117095e-05, "loss": 0.1225, "num_input_tokens_seen": 103306880, "step": 47865 }, { "epoch": 7.809135399673735, "grad_norm": 0.5096401572227478, "learning_rate": 3.8214765223722214e-05, "loss": 0.139, "num_input_tokens_seen": 103317760, "step": 47870 }, { "epoch": 7.809951060358891, "grad_norm": 0.5995042324066162, "learning_rate": 3.8211743944513364e-05, "loss": 0.1207, "num_input_tokens_seen": 103328096, "step": 47875 }, { "epoch": 7.810766721044046, "grad_norm": 1.5243330001831055, "learning_rate": 3.8208722397551755e-05, "loss": 0.2791, "num_input_tokens_seen": 103338560, "step": 47880 }, { "epoch": 7.811582381729201, "grad_norm": 1.1566541194915771, "learning_rate": 3.820570058289863e-05, "loss": 0.1712, "num_input_tokens_seen": 103349824, "step": 47885 }, { "epoch": 7.8123980424143555, "grad_norm": 1.0391896963119507, "learning_rate": 3.8202678500615234e-05, "loss": 0.0342, "num_input_tokens_seen": 103361056, "step": 47890 }, { "epoch": 7.81321370309951, "grad_norm": 0.12364999949932098, "learning_rate": 3.8199656150762804e-05, "loss": 0.051, "num_input_tokens_seen": 103371520, "step": 47895 }, { "epoch": 7.814029363784666, "grad_norm": 1.1204750537872314, "learning_rate": 3.81966335334026e-05, "loss": 0.0746, "num_input_tokens_seen": 103382784, "step": 47900 }, { "epoch": 7.814845024469821, "grad_norm": 0.19221773743629456, "learning_rate": 3.8193610648595876e-05, "loss": 0.2393, "num_input_tokens_seen": 103393376, "step": 47905 }, { "epoch": 7.815660685154976, "grad_norm": 0.03744202479720116, "learning_rate": 3.8190587496403894e-05, "loss": 0.0801, "num_input_tokens_seen": 103404000, "step": 47910 }, { "epoch": 7.8164763458401305, "grad_norm": 0.012576722539961338, "learning_rate": 3.818756407688793e-05, "loss": 0.1014, "num_input_tokens_seen": 103414848, "step": 47915 }, { "epoch": 7.817292006525285, "grad_norm": 0.05214405059814453, "learning_rate": 3.818454039010924e-05, "loss": 0.1143, "num_input_tokens_seen": 103426144, "step": 47920 }, { "epoch": 7.81810766721044, "grad_norm": 0.20531420409679413, "learning_rate": 3.818151643612912e-05, "loss": 0.107, "num_input_tokens_seen": 103435584, "step": 47925 }, { "epoch": 7.818923327895595, "grad_norm": 0.24460510909557343, "learning_rate": 3.817849221500884e-05, "loss": 0.0595, "num_input_tokens_seen": 103446464, "step": 47930 }, { "epoch": 7.819738988580751, "grad_norm": 1.9439287185668945, "learning_rate": 3.817546772680971e-05, "loss": 0.1908, "num_input_tokens_seen": 103457248, "step": 47935 }, { "epoch": 7.8205546492659055, "grad_norm": 0.3448755443096161, "learning_rate": 3.8172442971593e-05, "loss": 0.1435, "num_input_tokens_seen": 103468128, "step": 47940 }, { "epoch": 7.82137030995106, "grad_norm": 0.20662319660186768, "learning_rate": 3.8169417949420034e-05, "loss": 0.1855, "num_input_tokens_seen": 103478656, "step": 47945 }, { "epoch": 7.822185970636215, "grad_norm": 0.296064555644989, "learning_rate": 3.81663926603521e-05, "loss": 0.1676, "num_input_tokens_seen": 103489472, "step": 47950 }, { "epoch": 7.82300163132137, "grad_norm": 1.4236829280853271, "learning_rate": 3.816336710445052e-05, "loss": 0.259, "num_input_tokens_seen": 103500384, "step": 47955 }, { "epoch": 7.823817292006526, "grad_norm": 1.0121957063674927, "learning_rate": 3.81603412817766e-05, "loss": 0.3403, "num_input_tokens_seen": 103510432, "step": 47960 }, { "epoch": 7.8246329526916805, "grad_norm": 0.1119810938835144, "learning_rate": 3.815731519239167e-05, "loss": 0.1304, "num_input_tokens_seen": 103520544, "step": 47965 }, { "epoch": 7.825448613376835, "grad_norm": 1.3447171449661255, "learning_rate": 3.815428883635707e-05, "loss": 0.1267, "num_input_tokens_seen": 103531264, "step": 47970 }, { "epoch": 7.82626427406199, "grad_norm": 0.6693137884140015, "learning_rate": 3.81512622137341e-05, "loss": 0.0734, "num_input_tokens_seen": 103542176, "step": 47975 }, { "epoch": 7.827079934747145, "grad_norm": 0.15854217112064362, "learning_rate": 3.814823532458413e-05, "loss": 0.0444, "num_input_tokens_seen": 103552608, "step": 47980 }, { "epoch": 7.827895595432301, "grad_norm": 0.27358734607696533, "learning_rate": 3.814520816896848e-05, "loss": 0.1088, "num_input_tokens_seen": 103563552, "step": 47985 }, { "epoch": 7.828711256117455, "grad_norm": 0.16852526366710663, "learning_rate": 3.814218074694852e-05, "loss": 0.1303, "num_input_tokens_seen": 103575296, "step": 47990 }, { "epoch": 7.82952691680261, "grad_norm": 0.459322452545166, "learning_rate": 3.81391530585856e-05, "loss": 0.0389, "num_input_tokens_seen": 103586304, "step": 47995 }, { "epoch": 7.830342577487765, "grad_norm": 1.7107402086257935, "learning_rate": 3.813612510394107e-05, "loss": 0.1266, "num_input_tokens_seen": 103596768, "step": 48000 }, { "epoch": 7.83115823817292, "grad_norm": 1.271584391593933, "learning_rate": 3.813309688307629e-05, "loss": 0.0938, "num_input_tokens_seen": 103607808, "step": 48005 }, { "epoch": 7.831973898858075, "grad_norm": 0.4470289647579193, "learning_rate": 3.813006839605265e-05, "loss": 0.0926, "num_input_tokens_seen": 103618848, "step": 48010 }, { "epoch": 7.8327895595432295, "grad_norm": 1.3665982484817505, "learning_rate": 3.812703964293152e-05, "loss": 0.0984, "num_input_tokens_seen": 103629600, "step": 48015 }, { "epoch": 7.833605220228385, "grad_norm": 0.230375275015831, "learning_rate": 3.812401062377428e-05, "loss": 0.0207, "num_input_tokens_seen": 103639744, "step": 48020 }, { "epoch": 7.83442088091354, "grad_norm": 0.41921892762184143, "learning_rate": 3.8120981338642306e-05, "loss": 0.1373, "num_input_tokens_seen": 103649472, "step": 48025 }, { "epoch": 7.835236541598695, "grad_norm": 0.8317067623138428, "learning_rate": 3.811795178759701e-05, "loss": 0.1169, "num_input_tokens_seen": 103659968, "step": 48030 }, { "epoch": 7.83605220228385, "grad_norm": 0.7167588472366333, "learning_rate": 3.811492197069977e-05, "loss": 0.0394, "num_input_tokens_seen": 103671584, "step": 48035 }, { "epoch": 7.8368678629690045, "grad_norm": 1.6177756786346436, "learning_rate": 3.8111891888012e-05, "loss": 0.1591, "num_input_tokens_seen": 103681984, "step": 48040 }, { "epoch": 7.83768352365416, "grad_norm": 0.2159140408039093, "learning_rate": 3.810886153959511e-05, "loss": 0.2036, "num_input_tokens_seen": 103692544, "step": 48045 }, { "epoch": 7.838499184339315, "grad_norm": 0.1096939668059349, "learning_rate": 3.8105830925510515e-05, "loss": 0.032, "num_input_tokens_seen": 103703488, "step": 48050 }, { "epoch": 7.83931484502447, "grad_norm": 0.03138270974159241, "learning_rate": 3.810280004581963e-05, "loss": 0.0723, "num_input_tokens_seen": 103714464, "step": 48055 }, { "epoch": 7.840130505709625, "grad_norm": 1.2034037113189697, "learning_rate": 3.809976890058388e-05, "loss": 0.0696, "num_input_tokens_seen": 103725184, "step": 48060 }, { "epoch": 7.8409461663947795, "grad_norm": 0.672785222530365, "learning_rate": 3.8096737489864685e-05, "loss": 0.0814, "num_input_tokens_seen": 103735520, "step": 48065 }, { "epoch": 7.841761827079935, "grad_norm": 2.1859512329101562, "learning_rate": 3.80937058137235e-05, "loss": 0.1517, "num_input_tokens_seen": 103747072, "step": 48070 }, { "epoch": 7.84257748776509, "grad_norm": 0.3157350718975067, "learning_rate": 3.809067387222175e-05, "loss": 0.1121, "num_input_tokens_seen": 103757632, "step": 48075 }, { "epoch": 7.843393148450245, "grad_norm": 0.5584840178489685, "learning_rate": 3.8087641665420894e-05, "loss": 0.1056, "num_input_tokens_seen": 103768576, "step": 48080 }, { "epoch": 7.8442088091354, "grad_norm": 0.056778986006975174, "learning_rate": 3.8084609193382375e-05, "loss": 0.1522, "num_input_tokens_seen": 103779840, "step": 48085 }, { "epoch": 7.8450244698205545, "grad_norm": 0.5488232970237732, "learning_rate": 3.8081576456167655e-05, "loss": 0.0885, "num_input_tokens_seen": 103789664, "step": 48090 }, { "epoch": 7.845840130505709, "grad_norm": 0.12129713594913483, "learning_rate": 3.807854345383819e-05, "loss": 0.1615, "num_input_tokens_seen": 103800832, "step": 48095 }, { "epoch": 7.846655791190865, "grad_norm": 1.2770178318023682, "learning_rate": 3.8075510186455444e-05, "loss": 0.1139, "num_input_tokens_seen": 103811616, "step": 48100 }, { "epoch": 7.84747145187602, "grad_norm": 0.7497474551200867, "learning_rate": 3.80724766540809e-05, "loss": 0.28, "num_input_tokens_seen": 103822464, "step": 48105 }, { "epoch": 7.848287112561175, "grad_norm": 0.5003208518028259, "learning_rate": 3.806944285677604e-05, "loss": 0.0559, "num_input_tokens_seen": 103833984, "step": 48110 }, { "epoch": 7.849102773246329, "grad_norm": 0.10186001658439636, "learning_rate": 3.806640879460234e-05, "loss": 0.0406, "num_input_tokens_seen": 103845664, "step": 48115 }, { "epoch": 7.849918433931484, "grad_norm": 0.16290009021759033, "learning_rate": 3.8063374467621284e-05, "loss": 0.3731, "num_input_tokens_seen": 103855936, "step": 48120 }, { "epoch": 7.850734094616639, "grad_norm": 0.7542627453804016, "learning_rate": 3.806033987589437e-05, "loss": 0.0517, "num_input_tokens_seen": 103866304, "step": 48125 }, { "epoch": 7.851549755301795, "grad_norm": 0.1625671088695526, "learning_rate": 3.8057305019483106e-05, "loss": 0.1388, "num_input_tokens_seen": 103877280, "step": 48130 }, { "epoch": 7.85236541598695, "grad_norm": 0.9256788492202759, "learning_rate": 3.805426989844899e-05, "loss": 0.1629, "num_input_tokens_seen": 103888192, "step": 48135 }, { "epoch": 7.853181076672104, "grad_norm": 1.3410139083862305, "learning_rate": 3.8051234512853525e-05, "loss": 0.1188, "num_input_tokens_seen": 103897568, "step": 48140 }, { "epoch": 7.853996737357259, "grad_norm": 0.9669333696365356, "learning_rate": 3.804819886275825e-05, "loss": 0.0594, "num_input_tokens_seen": 103908448, "step": 48145 }, { "epoch": 7.854812398042414, "grad_norm": 1.4499256610870361, "learning_rate": 3.804516294822467e-05, "loss": 0.0751, "num_input_tokens_seen": 103917856, "step": 48150 }, { "epoch": 7.85562805872757, "grad_norm": 0.12524031102657318, "learning_rate": 3.804212676931431e-05, "loss": 0.0805, "num_input_tokens_seen": 103929792, "step": 48155 }, { "epoch": 7.856443719412725, "grad_norm": 3.1539673805236816, "learning_rate": 3.8039090326088703e-05, "loss": 0.1658, "num_input_tokens_seen": 103939488, "step": 48160 }, { "epoch": 7.857259380097879, "grad_norm": 0.5800585746765137, "learning_rate": 3.8036053618609396e-05, "loss": 0.0678, "num_input_tokens_seen": 103949600, "step": 48165 }, { "epoch": 7.858075040783034, "grad_norm": 1.5202301740646362, "learning_rate": 3.8033016646937924e-05, "loss": 0.2876, "num_input_tokens_seen": 103959936, "step": 48170 }, { "epoch": 7.858890701468189, "grad_norm": 0.4957876205444336, "learning_rate": 3.802997941113583e-05, "loss": 0.0628, "num_input_tokens_seen": 103971136, "step": 48175 }, { "epoch": 7.859706362153344, "grad_norm": 0.3623082637786865, "learning_rate": 3.802694191126468e-05, "loss": 0.1369, "num_input_tokens_seen": 103981760, "step": 48180 }, { "epoch": 7.8605220228384995, "grad_norm": 0.5594725608825684, "learning_rate": 3.802390414738603e-05, "loss": 0.0855, "num_input_tokens_seen": 103992480, "step": 48185 }, { "epoch": 7.861337683523654, "grad_norm": 1.735734462738037, "learning_rate": 3.8020866119561425e-05, "loss": 0.1198, "num_input_tokens_seen": 104002720, "step": 48190 }, { "epoch": 7.862153344208809, "grad_norm": 1.8755738735198975, "learning_rate": 3.801782782785246e-05, "loss": 0.1075, "num_input_tokens_seen": 104011968, "step": 48195 }, { "epoch": 7.862969004893964, "grad_norm": 0.3442074954509735, "learning_rate": 3.8014789272320703e-05, "loss": 0.1332, "num_input_tokens_seen": 104023328, "step": 48200 }, { "epoch": 7.863784665579119, "grad_norm": 1.4950814247131348, "learning_rate": 3.801175045302773e-05, "loss": 0.1756, "num_input_tokens_seen": 104032896, "step": 48205 }, { "epoch": 7.864600326264274, "grad_norm": 2.432830572128296, "learning_rate": 3.8008711370035116e-05, "loss": 0.1081, "num_input_tokens_seen": 104044128, "step": 48210 }, { "epoch": 7.865415986949429, "grad_norm": 1.4299746751785278, "learning_rate": 3.8005672023404474e-05, "loss": 0.0851, "num_input_tokens_seen": 104055808, "step": 48215 }, { "epoch": 7.866231647634584, "grad_norm": 0.058080416172742844, "learning_rate": 3.8002632413197385e-05, "loss": 0.217, "num_input_tokens_seen": 104066816, "step": 48220 }, { "epoch": 7.867047308319739, "grad_norm": 1.56882643699646, "learning_rate": 3.799959253947546e-05, "loss": 0.1183, "num_input_tokens_seen": 104078624, "step": 48225 }, { "epoch": 7.867862969004894, "grad_norm": 0.10829909890890121, "learning_rate": 3.79965524023003e-05, "loss": 0.0253, "num_input_tokens_seen": 104090592, "step": 48230 }, { "epoch": 7.868678629690049, "grad_norm": 0.4782920181751251, "learning_rate": 3.799351200173351e-05, "loss": 0.1724, "num_input_tokens_seen": 104101376, "step": 48235 }, { "epoch": 7.869494290375204, "grad_norm": 1.7373571395874023, "learning_rate": 3.799047133783672e-05, "loss": 0.1061, "num_input_tokens_seen": 104111968, "step": 48240 }, { "epoch": 7.870309951060359, "grad_norm": 0.6274707317352295, "learning_rate": 3.7987430410671555e-05, "loss": 0.0453, "num_input_tokens_seen": 104123616, "step": 48245 }, { "epoch": 7.871125611745514, "grad_norm": 1.0530273914337158, "learning_rate": 3.7984389220299634e-05, "loss": 0.0603, "num_input_tokens_seen": 104134176, "step": 48250 }, { "epoch": 7.871941272430669, "grad_norm": 0.9104216694831848, "learning_rate": 3.7981347766782597e-05, "loss": 0.0639, "num_input_tokens_seen": 104144704, "step": 48255 }, { "epoch": 7.872756933115824, "grad_norm": 0.08346215635538101, "learning_rate": 3.797830605018207e-05, "loss": 0.0154, "num_input_tokens_seen": 104155648, "step": 48260 }, { "epoch": 7.873572593800979, "grad_norm": 1.3099164962768555, "learning_rate": 3.797526407055971e-05, "loss": 0.0935, "num_input_tokens_seen": 104166080, "step": 48265 }, { "epoch": 7.874388254486134, "grad_norm": 0.30167582631111145, "learning_rate": 3.797222182797717e-05, "loss": 0.1614, "num_input_tokens_seen": 104177152, "step": 48270 }, { "epoch": 7.875203915171289, "grad_norm": 0.708794355392456, "learning_rate": 3.79691793224961e-05, "loss": 0.1392, "num_input_tokens_seen": 104187104, "step": 48275 }, { "epoch": 7.876019575856444, "grad_norm": 0.7222641110420227, "learning_rate": 3.796613655417815e-05, "loss": 0.2387, "num_input_tokens_seen": 104198176, "step": 48280 }, { "epoch": 7.876835236541599, "grad_norm": 1.0299071073532104, "learning_rate": 3.796309352308499e-05, "loss": 0.1494, "num_input_tokens_seen": 104208896, "step": 48285 }, { "epoch": 7.877650897226753, "grad_norm": 0.5557078719139099, "learning_rate": 3.796005022927831e-05, "loss": 0.0813, "num_input_tokens_seen": 104220512, "step": 48290 }, { "epoch": 7.878466557911908, "grad_norm": 1.092395305633545, "learning_rate": 3.795700667281976e-05, "loss": 0.2133, "num_input_tokens_seen": 104230816, "step": 48295 }, { "epoch": 7.879282218597064, "grad_norm": 0.5805056691169739, "learning_rate": 3.795396285377104e-05, "loss": 0.1358, "num_input_tokens_seen": 104242240, "step": 48300 }, { "epoch": 7.880097879282219, "grad_norm": 0.10242613404989243, "learning_rate": 3.7950918772193824e-05, "loss": 0.0596, "num_input_tokens_seen": 104253376, "step": 48305 }, { "epoch": 7.8809135399673735, "grad_norm": 0.17249421775341034, "learning_rate": 3.794787442814981e-05, "loss": 0.0876, "num_input_tokens_seen": 104264864, "step": 48310 }, { "epoch": 7.881729200652528, "grad_norm": 1.1027077436447144, "learning_rate": 3.794482982170069e-05, "loss": 0.0736, "num_input_tokens_seen": 104275712, "step": 48315 }, { "epoch": 7.882544861337683, "grad_norm": 0.3475077152252197, "learning_rate": 3.7941784952908175e-05, "loss": 0.0405, "num_input_tokens_seen": 104287328, "step": 48320 }, { "epoch": 7.883360522022839, "grad_norm": 0.9551035761833191, "learning_rate": 3.793873982183398e-05, "loss": 0.0367, "num_input_tokens_seen": 104297344, "step": 48325 }, { "epoch": 7.884176182707994, "grad_norm": 0.08159037679433823, "learning_rate": 3.79356944285398e-05, "loss": 0.027, "num_input_tokens_seen": 104308992, "step": 48330 }, { "epoch": 7.8849918433931485, "grad_norm": 0.4670132100582123, "learning_rate": 3.793264877308735e-05, "loss": 0.1302, "num_input_tokens_seen": 104319616, "step": 48335 }, { "epoch": 7.885807504078303, "grad_norm": 0.1168588250875473, "learning_rate": 3.792960285553839e-05, "loss": 0.0973, "num_input_tokens_seen": 104331520, "step": 48340 }, { "epoch": 7.886623164763458, "grad_norm": 2.011251449584961, "learning_rate": 3.7926556675954616e-05, "loss": 0.1351, "num_input_tokens_seen": 104341664, "step": 48345 }, { "epoch": 7.887438825448614, "grad_norm": 0.6050944328308105, "learning_rate": 3.792351023439777e-05, "loss": 0.1341, "num_input_tokens_seen": 104352640, "step": 48350 }, { "epoch": 7.888254486133769, "grad_norm": 2.5419657230377197, "learning_rate": 3.79204635309296e-05, "loss": 0.1811, "num_input_tokens_seen": 104362496, "step": 48355 }, { "epoch": 7.8890701468189235, "grad_norm": 0.043874308466911316, "learning_rate": 3.791741656561184e-05, "loss": 0.1598, "num_input_tokens_seen": 104372192, "step": 48360 }, { "epoch": 7.889885807504078, "grad_norm": 0.06263367086648941, "learning_rate": 3.791436933850625e-05, "loss": 0.049, "num_input_tokens_seen": 104383168, "step": 48365 }, { "epoch": 7.890701468189233, "grad_norm": 0.08699174225330353, "learning_rate": 3.7911321849674573e-05, "loss": 0.0222, "num_input_tokens_seen": 104393632, "step": 48370 }, { "epoch": 7.891517128874388, "grad_norm": 1.1867144107818604, "learning_rate": 3.7908274099178586e-05, "loss": 0.2085, "num_input_tokens_seen": 104404096, "step": 48375 }, { "epoch": 7.892332789559543, "grad_norm": 0.1087498664855957, "learning_rate": 3.790522608708005e-05, "loss": 0.0765, "num_input_tokens_seen": 104414592, "step": 48380 }, { "epoch": 7.8931484502446985, "grad_norm": 0.7726771831512451, "learning_rate": 3.790217781344074e-05, "loss": 0.058, "num_input_tokens_seen": 104424736, "step": 48385 }, { "epoch": 7.893964110929853, "grad_norm": 0.7169740200042725, "learning_rate": 3.7899129278322426e-05, "loss": 0.0559, "num_input_tokens_seen": 104434976, "step": 48390 }, { "epoch": 7.894779771615008, "grad_norm": 0.1282583773136139, "learning_rate": 3.789608048178689e-05, "loss": 0.1864, "num_input_tokens_seen": 104445920, "step": 48395 }, { "epoch": 7.895595432300163, "grad_norm": 0.2650596797466278, "learning_rate": 3.7893031423895934e-05, "loss": 0.2091, "num_input_tokens_seen": 104455584, "step": 48400 }, { "epoch": 7.896411092985318, "grad_norm": 0.22559472918510437, "learning_rate": 3.788998210471133e-05, "loss": 0.0206, "num_input_tokens_seen": 104466304, "step": 48405 }, { "epoch": 7.897226753670473, "grad_norm": 0.02872028388082981, "learning_rate": 3.788693252429489e-05, "loss": 0.037, "num_input_tokens_seen": 104476096, "step": 48410 }, { "epoch": 7.898042414355628, "grad_norm": 0.742340087890625, "learning_rate": 3.788388268270841e-05, "loss": 0.1387, "num_input_tokens_seen": 104486656, "step": 48415 }, { "epoch": 7.898858075040783, "grad_norm": 2.2024364471435547, "learning_rate": 3.7880832580013704e-05, "loss": 0.1936, "num_input_tokens_seen": 104497600, "step": 48420 }, { "epoch": 7.899673735725938, "grad_norm": 0.08548078685998917, "learning_rate": 3.7877782216272586e-05, "loss": 0.1122, "num_input_tokens_seen": 104508384, "step": 48425 }, { "epoch": 7.900489396411093, "grad_norm": 0.08665855973958969, "learning_rate": 3.787473159154688e-05, "loss": 0.0894, "num_input_tokens_seen": 104519136, "step": 48430 }, { "epoch": 7.901305057096248, "grad_norm": 0.6912166476249695, "learning_rate": 3.7871680705898404e-05, "loss": 0.0297, "num_input_tokens_seen": 104529728, "step": 48435 }, { "epoch": 7.902120717781403, "grad_norm": 0.11696377396583557, "learning_rate": 3.7868629559388986e-05, "loss": 0.072, "num_input_tokens_seen": 104541888, "step": 48440 }, { "epoch": 7.902936378466558, "grad_norm": 0.1385737806558609, "learning_rate": 3.786557815208046e-05, "loss": 0.0184, "num_input_tokens_seen": 104552736, "step": 48445 }, { "epoch": 7.903752039151713, "grad_norm": 1.8134437799453735, "learning_rate": 3.786252648403468e-05, "loss": 0.1693, "num_input_tokens_seen": 104562464, "step": 48450 }, { "epoch": 7.904567699836868, "grad_norm": 0.3716672956943512, "learning_rate": 3.785947455531348e-05, "loss": 0.1156, "num_input_tokens_seen": 104573600, "step": 48455 }, { "epoch": 7.9053833605220225, "grad_norm": 0.16127994656562805, "learning_rate": 3.785642236597872e-05, "loss": 0.0626, "num_input_tokens_seen": 104584800, "step": 48460 }, { "epoch": 7.906199021207177, "grad_norm": 2.293917655944824, "learning_rate": 3.785336991609225e-05, "loss": 0.1914, "num_input_tokens_seen": 104596192, "step": 48465 }, { "epoch": 7.907014681892333, "grad_norm": 2.35912823677063, "learning_rate": 3.785031720571592e-05, "loss": 0.2402, "num_input_tokens_seen": 104606976, "step": 48470 }, { "epoch": 7.907830342577488, "grad_norm": 0.3203434944152832, "learning_rate": 3.7847264234911626e-05, "loss": 0.1396, "num_input_tokens_seen": 104618304, "step": 48475 }, { "epoch": 7.908646003262643, "grad_norm": 0.34123966097831726, "learning_rate": 3.784421100374122e-05, "loss": 0.0245, "num_input_tokens_seen": 104627840, "step": 48480 }, { "epoch": 7.9094616639477975, "grad_norm": 0.6285391449928284, "learning_rate": 3.784115751226658e-05, "loss": 0.0796, "num_input_tokens_seen": 104639296, "step": 48485 }, { "epoch": 7.910277324632952, "grad_norm": 1.006499171257019, "learning_rate": 3.78381037605496e-05, "loss": 0.11, "num_input_tokens_seen": 104648992, "step": 48490 }, { "epoch": 7.911092985318108, "grad_norm": 0.10837593674659729, "learning_rate": 3.783504974865216e-05, "loss": 0.047, "num_input_tokens_seen": 104659680, "step": 48495 }, { "epoch": 7.911908646003263, "grad_norm": 0.03239646926522255, "learning_rate": 3.783199547663615e-05, "loss": 0.1542, "num_input_tokens_seen": 104670688, "step": 48500 }, { "epoch": 7.912724306688418, "grad_norm": 0.44463056325912476, "learning_rate": 3.7828940944563474e-05, "loss": 0.1257, "num_input_tokens_seen": 104681600, "step": 48505 }, { "epoch": 7.9135399673735725, "grad_norm": 0.7592862844467163, "learning_rate": 3.782588615249603e-05, "loss": 0.1701, "num_input_tokens_seen": 104692672, "step": 48510 }, { "epoch": 7.914355628058727, "grad_norm": 0.15358413755893707, "learning_rate": 3.7822831100495734e-05, "loss": 0.0501, "num_input_tokens_seen": 104703296, "step": 48515 }, { "epoch": 7.915171288743883, "grad_norm": 0.11243858188390732, "learning_rate": 3.78197757886245e-05, "loss": 0.1038, "num_input_tokens_seen": 104715648, "step": 48520 }, { "epoch": 7.915986949429038, "grad_norm": 1.5184530019760132, "learning_rate": 3.7816720216944255e-05, "loss": 0.1169, "num_input_tokens_seen": 104724864, "step": 48525 }, { "epoch": 7.916802610114193, "grad_norm": 1.2436635494232178, "learning_rate": 3.781366438551691e-05, "loss": 0.1204, "num_input_tokens_seen": 104734592, "step": 48530 }, { "epoch": 7.917618270799347, "grad_norm": 1.5334172248840332, "learning_rate": 3.78106082944044e-05, "loss": 0.2111, "num_input_tokens_seen": 104744992, "step": 48535 }, { "epoch": 7.918433931484502, "grad_norm": 0.022694583982229233, "learning_rate": 3.780755194366866e-05, "loss": 0.1386, "num_input_tokens_seen": 104756512, "step": 48540 }, { "epoch": 7.919249592169657, "grad_norm": 2.7663259506225586, "learning_rate": 3.7804495333371636e-05, "loss": 0.1589, "num_input_tokens_seen": 104767840, "step": 48545 }, { "epoch": 7.920065252854813, "grad_norm": 1.2698746919631958, "learning_rate": 3.780143846357527e-05, "loss": 0.2184, "num_input_tokens_seen": 104778944, "step": 48550 }, { "epoch": 7.920880913539968, "grad_norm": 0.05190170183777809, "learning_rate": 3.779838133434151e-05, "loss": 0.0667, "num_input_tokens_seen": 104789088, "step": 48555 }, { "epoch": 7.921696574225122, "grad_norm": 0.18558791279792786, "learning_rate": 3.779532394573232e-05, "loss": 0.1991, "num_input_tokens_seen": 104800032, "step": 48560 }, { "epoch": 7.922512234910277, "grad_norm": 0.18677254021167755, "learning_rate": 3.779226629780965e-05, "loss": 0.1495, "num_input_tokens_seen": 104810720, "step": 48565 }, { "epoch": 7.923327895595432, "grad_norm": 0.06827356666326523, "learning_rate": 3.778920839063549e-05, "loss": 0.0243, "num_input_tokens_seen": 104820896, "step": 48570 }, { "epoch": 7.924143556280587, "grad_norm": 0.24919724464416504, "learning_rate": 3.778615022427179e-05, "loss": 0.1992, "num_input_tokens_seen": 104831808, "step": 48575 }, { "epoch": 7.924959216965743, "grad_norm": 0.05481189116835594, "learning_rate": 3.778309179878053e-05, "loss": 0.0312, "num_input_tokens_seen": 104841920, "step": 48580 }, { "epoch": 7.925774877650897, "grad_norm": 0.10898087173700333, "learning_rate": 3.7780033114223705e-05, "loss": 0.0323, "num_input_tokens_seen": 104853408, "step": 48585 }, { "epoch": 7.926590538336052, "grad_norm": 0.09869863092899323, "learning_rate": 3.77769741706633e-05, "loss": 0.1754, "num_input_tokens_seen": 104862880, "step": 48590 }, { "epoch": 7.927406199021207, "grad_norm": 1.6962385177612305, "learning_rate": 3.7773914968161294e-05, "loss": 0.3182, "num_input_tokens_seen": 104873952, "step": 48595 }, { "epoch": 7.928221859706362, "grad_norm": 0.24177810549736023, "learning_rate": 3.777085550677971e-05, "loss": 0.0198, "num_input_tokens_seen": 104884128, "step": 48600 }, { "epoch": 7.9290375203915175, "grad_norm": 1.1700793504714966, "learning_rate": 3.776779578658053e-05, "loss": 0.1264, "num_input_tokens_seen": 104895296, "step": 48605 }, { "epoch": 7.929853181076672, "grad_norm": 0.11061568558216095, "learning_rate": 3.7764735807625766e-05, "loss": 0.102, "num_input_tokens_seen": 104905760, "step": 48610 }, { "epoch": 7.930668841761827, "grad_norm": 1.135535478591919, "learning_rate": 3.776167556997744e-05, "loss": 0.1, "num_input_tokens_seen": 104916864, "step": 48615 }, { "epoch": 7.931484502446982, "grad_norm": 0.08464992046356201, "learning_rate": 3.775861507369758e-05, "loss": 0.1145, "num_input_tokens_seen": 104927104, "step": 48620 }, { "epoch": 7.932300163132137, "grad_norm": 1.805676817893982, "learning_rate": 3.775555431884819e-05, "loss": 0.1171, "num_input_tokens_seen": 104937632, "step": 48625 }, { "epoch": 7.933115823817292, "grad_norm": 1.680954933166504, "learning_rate": 3.775249330549132e-05, "loss": 0.2734, "num_input_tokens_seen": 104948448, "step": 48630 }, { "epoch": 7.933931484502447, "grad_norm": 0.17425955832004547, "learning_rate": 3.774943203368898e-05, "loss": 0.0289, "num_input_tokens_seen": 104958688, "step": 48635 }, { "epoch": 7.934747145187602, "grad_norm": 0.18714989721775055, "learning_rate": 3.774637050350324e-05, "loss": 0.0353, "num_input_tokens_seen": 104969408, "step": 48640 }, { "epoch": 7.935562805872757, "grad_norm": 0.05413559451699257, "learning_rate": 3.774330871499612e-05, "loss": 0.042, "num_input_tokens_seen": 104979968, "step": 48645 }, { "epoch": 7.936378466557912, "grad_norm": 0.5990417003631592, "learning_rate": 3.7740246668229694e-05, "loss": 0.1742, "num_input_tokens_seen": 104990688, "step": 48650 }, { "epoch": 7.937194127243067, "grad_norm": 0.901869535446167, "learning_rate": 3.7737184363266e-05, "loss": 0.1058, "num_input_tokens_seen": 105002048, "step": 48655 }, { "epoch": 7.938009787928221, "grad_norm": 0.5641941428184509, "learning_rate": 3.773412180016711e-05, "loss": 0.07, "num_input_tokens_seen": 105013056, "step": 48660 }, { "epoch": 7.938825448613377, "grad_norm": 3.167038917541504, "learning_rate": 3.7731058978995084e-05, "loss": 0.3764, "num_input_tokens_seen": 105021856, "step": 48665 }, { "epoch": 7.939641109298532, "grad_norm": 1.0700324773788452, "learning_rate": 3.7727995899812e-05, "loss": 0.2016, "num_input_tokens_seen": 105031936, "step": 48670 }, { "epoch": 7.940456769983687, "grad_norm": 1.63407301902771, "learning_rate": 3.7724932562679924e-05, "loss": 0.2661, "num_input_tokens_seen": 105044000, "step": 48675 }, { "epoch": 7.941272430668842, "grad_norm": 1.4618186950683594, "learning_rate": 3.772186896766096e-05, "loss": 0.1066, "num_input_tokens_seen": 105054272, "step": 48680 }, { "epoch": 7.942088091353996, "grad_norm": 0.13361625373363495, "learning_rate": 3.771880511481718e-05, "loss": 0.0419, "num_input_tokens_seen": 105065984, "step": 48685 }, { "epoch": 7.942903752039152, "grad_norm": 1.674031376838684, "learning_rate": 3.771574100421067e-05, "loss": 0.1165, "num_input_tokens_seen": 105077440, "step": 48690 }, { "epoch": 7.943719412724307, "grad_norm": 1.88042151927948, "learning_rate": 3.7712676635903544e-05, "loss": 0.1846, "num_input_tokens_seen": 105087104, "step": 48695 }, { "epoch": 7.944535073409462, "grad_norm": 0.946808397769928, "learning_rate": 3.77096120099579e-05, "loss": 0.0936, "num_input_tokens_seen": 105097824, "step": 48700 }, { "epoch": 7.945350734094617, "grad_norm": 0.5068144798278809, "learning_rate": 3.770654712643584e-05, "loss": 0.0408, "num_input_tokens_seen": 105108064, "step": 48705 }, { "epoch": 7.946166394779771, "grad_norm": 0.4203794300556183, "learning_rate": 3.770348198539949e-05, "loss": 0.0305, "num_input_tokens_seen": 105118784, "step": 48710 }, { "epoch": 7.946982055464927, "grad_norm": 0.14435319602489471, "learning_rate": 3.770041658691096e-05, "loss": 0.091, "num_input_tokens_seen": 105128608, "step": 48715 }, { "epoch": 7.947797716150082, "grad_norm": 0.6269714832305908, "learning_rate": 3.769735093103237e-05, "loss": 0.0545, "num_input_tokens_seen": 105138720, "step": 48720 }, { "epoch": 7.948613376835237, "grad_norm": 0.6431462168693542, "learning_rate": 3.769428501782587e-05, "loss": 0.0971, "num_input_tokens_seen": 105150368, "step": 48725 }, { "epoch": 7.9494290375203915, "grad_norm": 0.7077387571334839, "learning_rate": 3.769121884735357e-05, "loss": 0.0878, "num_input_tokens_seen": 105160544, "step": 48730 }, { "epoch": 7.950244698205546, "grad_norm": 1.54108464717865, "learning_rate": 3.768815241967762e-05, "loss": 0.1644, "num_input_tokens_seen": 105170784, "step": 48735 }, { "epoch": 7.951060358890701, "grad_norm": 0.39930960536003113, "learning_rate": 3.7685085734860166e-05, "loss": 0.1424, "num_input_tokens_seen": 105180960, "step": 48740 }, { "epoch": 7.951876019575856, "grad_norm": 0.5832653641700745, "learning_rate": 3.7682018792963357e-05, "loss": 0.1243, "num_input_tokens_seen": 105191200, "step": 48745 }, { "epoch": 7.952691680261012, "grad_norm": 1.5338565111160278, "learning_rate": 3.767895159404935e-05, "loss": 0.1509, "num_input_tokens_seen": 105202176, "step": 48750 }, { "epoch": 7.9535073409461665, "grad_norm": 1.0577102899551392, "learning_rate": 3.7675884138180306e-05, "loss": 0.2409, "num_input_tokens_seen": 105212352, "step": 48755 }, { "epoch": 7.954323001631321, "grad_norm": 0.8910152912139893, "learning_rate": 3.767281642541839e-05, "loss": 0.1584, "num_input_tokens_seen": 105222464, "step": 48760 }, { "epoch": 7.955138662316476, "grad_norm": 1.1524964570999146, "learning_rate": 3.766974845582577e-05, "loss": 0.0419, "num_input_tokens_seen": 105233984, "step": 48765 }, { "epoch": 7.955954323001631, "grad_norm": 0.1343867927789688, "learning_rate": 3.766668022946462e-05, "loss": 0.0281, "num_input_tokens_seen": 105244768, "step": 48770 }, { "epoch": 7.956769983686787, "grad_norm": 0.1493895798921585, "learning_rate": 3.766361174639713e-05, "loss": 0.2095, "num_input_tokens_seen": 105255680, "step": 48775 }, { "epoch": 7.9575856443719415, "grad_norm": 0.021501855924725533, "learning_rate": 3.766054300668549e-05, "loss": 0.0223, "num_input_tokens_seen": 105268608, "step": 48780 }, { "epoch": 7.958401305057096, "grad_norm": 1.760240912437439, "learning_rate": 3.765747401039188e-05, "loss": 0.1256, "num_input_tokens_seen": 105278464, "step": 48785 }, { "epoch": 7.959216965742251, "grad_norm": 0.4236885905265808, "learning_rate": 3.7654404757578496e-05, "loss": 0.0547, "num_input_tokens_seen": 105289856, "step": 48790 }, { "epoch": 7.960032626427406, "grad_norm": 0.06448156386613846, "learning_rate": 3.765133524830756e-05, "loss": 0.12, "num_input_tokens_seen": 105300160, "step": 48795 }, { "epoch": 7.960848287112562, "grad_norm": 0.07824885845184326, "learning_rate": 3.764826548264125e-05, "loss": 0.1368, "num_input_tokens_seen": 105312352, "step": 48800 }, { "epoch": 7.9616639477977165, "grad_norm": 0.38757237792015076, "learning_rate": 3.76451954606418e-05, "loss": 0.0489, "num_input_tokens_seen": 105324544, "step": 48805 }, { "epoch": 7.962479608482871, "grad_norm": 1.4336884021759033, "learning_rate": 3.764212518237144e-05, "loss": 0.0974, "num_input_tokens_seen": 105335072, "step": 48810 }, { "epoch": 7.963295269168026, "grad_norm": 0.33013561367988586, "learning_rate": 3.7639054647892355e-05, "loss": 0.0202, "num_input_tokens_seen": 105344960, "step": 48815 }, { "epoch": 7.964110929853181, "grad_norm": 0.8488060832023621, "learning_rate": 3.7635983857266796e-05, "loss": 0.0279, "num_input_tokens_seen": 105355264, "step": 48820 }, { "epoch": 7.964926590538336, "grad_norm": 0.040187686681747437, "learning_rate": 3.7632912810556994e-05, "loss": 0.0477, "num_input_tokens_seen": 105366624, "step": 48825 }, { "epoch": 7.9657422512234906, "grad_norm": 0.05870238319039345, "learning_rate": 3.762984150782519e-05, "loss": 0.1701, "num_input_tokens_seen": 105377888, "step": 48830 }, { "epoch": 7.966557911908646, "grad_norm": 0.12833499908447266, "learning_rate": 3.762676994913363e-05, "loss": 0.0642, "num_input_tokens_seen": 105388192, "step": 48835 }, { "epoch": 7.967373572593801, "grad_norm": 1.7222912311553955, "learning_rate": 3.762369813454455e-05, "loss": 0.3104, "num_input_tokens_seen": 105400256, "step": 48840 }, { "epoch": 7.968189233278956, "grad_norm": 0.8392804861068726, "learning_rate": 3.7620626064120226e-05, "loss": 0.1882, "num_input_tokens_seen": 105411328, "step": 48845 }, { "epoch": 7.969004893964111, "grad_norm": 0.527154266834259, "learning_rate": 3.7617553737922895e-05, "loss": 0.0849, "num_input_tokens_seen": 105421408, "step": 48850 }, { "epoch": 7.9698205546492655, "grad_norm": 0.052186910063028336, "learning_rate": 3.7614481156014834e-05, "loss": 0.0272, "num_input_tokens_seen": 105431872, "step": 48855 }, { "epoch": 7.970636215334421, "grad_norm": 0.31017911434173584, "learning_rate": 3.7611408318458305e-05, "loss": 0.1068, "num_input_tokens_seen": 105443776, "step": 48860 }, { "epoch": 7.971451876019576, "grad_norm": 4.792959690093994, "learning_rate": 3.7608335225315595e-05, "loss": 0.0926, "num_input_tokens_seen": 105453088, "step": 48865 }, { "epoch": 7.972267536704731, "grad_norm": 0.08753935992717743, "learning_rate": 3.760526187664897e-05, "loss": 0.0426, "num_input_tokens_seen": 105463744, "step": 48870 }, { "epoch": 7.973083197389886, "grad_norm": 0.06870773434638977, "learning_rate": 3.760218827252072e-05, "loss": 0.0398, "num_input_tokens_seen": 105474752, "step": 48875 }, { "epoch": 7.9738988580750405, "grad_norm": 0.1358431875705719, "learning_rate": 3.759911441299315e-05, "loss": 0.0814, "num_input_tokens_seen": 105486016, "step": 48880 }, { "epoch": 7.974714518760196, "grad_norm": 0.035859860479831696, "learning_rate": 3.759604029812853e-05, "loss": 0.0787, "num_input_tokens_seen": 105496800, "step": 48885 }, { "epoch": 7.975530179445351, "grad_norm": 0.027612468227744102, "learning_rate": 3.759296592798919e-05, "loss": 0.1461, "num_input_tokens_seen": 105506528, "step": 48890 }, { "epoch": 7.976345840130506, "grad_norm": 1.3588707447052002, "learning_rate": 3.7589891302637404e-05, "loss": 0.1079, "num_input_tokens_seen": 105517312, "step": 48895 }, { "epoch": 7.977161500815661, "grad_norm": 0.042517129331827164, "learning_rate": 3.7586816422135506e-05, "loss": 0.1697, "num_input_tokens_seen": 105526976, "step": 48900 }, { "epoch": 7.9779771615008155, "grad_norm": 0.3287971317768097, "learning_rate": 3.758374128654581e-05, "loss": 0.0288, "num_input_tokens_seen": 105537408, "step": 48905 }, { "epoch": 7.97879282218597, "grad_norm": 0.49931421875953674, "learning_rate": 3.758066589593063e-05, "loss": 0.0831, "num_input_tokens_seen": 105547712, "step": 48910 }, { "epoch": 7.979608482871125, "grad_norm": 0.34758424758911133, "learning_rate": 3.757759025035229e-05, "loss": 0.209, "num_input_tokens_seen": 105557760, "step": 48915 }, { "epoch": 7.980424143556281, "grad_norm": 1.1906344890594482, "learning_rate": 3.7574514349873135e-05, "loss": 0.2003, "num_input_tokens_seen": 105568800, "step": 48920 }, { "epoch": 7.981239804241436, "grad_norm": 0.037632908672094345, "learning_rate": 3.75714381945555e-05, "loss": 0.0183, "num_input_tokens_seen": 105578944, "step": 48925 }, { "epoch": 7.9820554649265905, "grad_norm": 1.2706423997879028, "learning_rate": 3.7568361784461706e-05, "loss": 0.1147, "num_input_tokens_seen": 105589664, "step": 48930 }, { "epoch": 7.982871125611745, "grad_norm": 0.15355940163135529, "learning_rate": 3.756528511965413e-05, "loss": 0.0325, "num_input_tokens_seen": 105601056, "step": 48935 }, { "epoch": 7.9836867862969, "grad_norm": 0.3544341027736664, "learning_rate": 3.7562208200195105e-05, "loss": 0.1087, "num_input_tokens_seen": 105611808, "step": 48940 }, { "epoch": 7.984502446982056, "grad_norm": 0.04599902033805847, "learning_rate": 3.7559131026147e-05, "loss": 0.1609, "num_input_tokens_seen": 105622944, "step": 48945 }, { "epoch": 7.985318107667211, "grad_norm": 0.7613700032234192, "learning_rate": 3.755605359757216e-05, "loss": 0.2055, "num_input_tokens_seen": 105633920, "step": 48950 }, { "epoch": 7.986133768352365, "grad_norm": 0.8221794366836548, "learning_rate": 3.755297591453298e-05, "loss": 0.1904, "num_input_tokens_seen": 105645280, "step": 48955 }, { "epoch": 7.98694942903752, "grad_norm": 0.6912385821342468, "learning_rate": 3.754989797709181e-05, "loss": 0.0414, "num_input_tokens_seen": 105656032, "step": 48960 }, { "epoch": 7.987765089722675, "grad_norm": 0.19984100759029388, "learning_rate": 3.7546819785311035e-05, "loss": 0.0301, "num_input_tokens_seen": 105666208, "step": 48965 }, { "epoch": 7.988580750407831, "grad_norm": 2.3816778659820557, "learning_rate": 3.754374133925305e-05, "loss": 0.2687, "num_input_tokens_seen": 105677600, "step": 48970 }, { "epoch": 7.989396411092986, "grad_norm": 1.3010941743850708, "learning_rate": 3.7540662638980216e-05, "loss": 0.1055, "num_input_tokens_seen": 105688256, "step": 48975 }, { "epoch": 7.99021207177814, "grad_norm": 0.11452072858810425, "learning_rate": 3.753758368455496e-05, "loss": 0.0564, "num_input_tokens_seen": 105698560, "step": 48980 }, { "epoch": 7.991027732463295, "grad_norm": 0.2337435632944107, "learning_rate": 3.753450447603966e-05, "loss": 0.1039, "num_input_tokens_seen": 105709792, "step": 48985 }, { "epoch": 7.99184339314845, "grad_norm": 0.2914660573005676, "learning_rate": 3.753142501349673e-05, "loss": 0.1658, "num_input_tokens_seen": 105720608, "step": 48990 }, { "epoch": 7.992659053833605, "grad_norm": 0.29873713850975037, "learning_rate": 3.752834529698858e-05, "loss": 0.0179, "num_input_tokens_seen": 105732192, "step": 48995 }, { "epoch": 7.993474714518761, "grad_norm": 0.4406777024269104, "learning_rate": 3.752526532657761e-05, "loss": 0.226, "num_input_tokens_seen": 105742688, "step": 49000 }, { "epoch": 7.994290375203915, "grad_norm": 0.7258381247520447, "learning_rate": 3.752218510232624e-05, "loss": 0.1203, "num_input_tokens_seen": 105754208, "step": 49005 }, { "epoch": 7.99510603588907, "grad_norm": 0.09683001786470413, "learning_rate": 3.7519104624296915e-05, "loss": 0.0337, "num_input_tokens_seen": 105765920, "step": 49010 }, { "epoch": 7.995921696574225, "grad_norm": 0.458947092294693, "learning_rate": 3.751602389255205e-05, "loss": 0.1513, "num_input_tokens_seen": 105776992, "step": 49015 }, { "epoch": 7.99673735725938, "grad_norm": 0.044104862958192825, "learning_rate": 3.7512942907154085e-05, "loss": 0.1336, "num_input_tokens_seen": 105788576, "step": 49020 }, { "epoch": 7.997553017944535, "grad_norm": 0.1044570803642273, "learning_rate": 3.750986166816546e-05, "loss": 0.0331, "num_input_tokens_seen": 105800384, "step": 49025 }, { "epoch": 7.99836867862969, "grad_norm": 1.3989938497543335, "learning_rate": 3.7506780175648616e-05, "loss": 0.1238, "num_input_tokens_seen": 105810528, "step": 49030 }, { "epoch": 7.999184339314845, "grad_norm": 0.03433897718787193, "learning_rate": 3.750369842966601e-05, "loss": 0.0382, "num_input_tokens_seen": 105821440, "step": 49035 }, { "epoch": 8.0, "grad_norm": 0.37924909591674805, "learning_rate": 3.750061643028009e-05, "loss": 0.0184, "num_input_tokens_seen": 105830544, "step": 49040 }, { "epoch": 8.0, "eval_loss": 0.13326387107372284, "eval_runtime": 90.6636, "eval_samples_per_second": 30.056, "eval_steps_per_second": 7.522, "num_input_tokens_seen": 105830544, "step": 49040 }, { "epoch": 8.000815660685156, "grad_norm": 0.3519747853279114, "learning_rate": 3.7497534177553316e-05, "loss": 0.0627, "num_input_tokens_seen": 105843312, "step": 49045 }, { "epoch": 8.00163132137031, "grad_norm": 0.4392331540584564, "learning_rate": 3.749445167154816e-05, "loss": 0.0217, "num_input_tokens_seen": 105851248, "step": 49050 }, { "epoch": 8.002446982055465, "grad_norm": 1.9463732242584229, "learning_rate": 3.7491368912327096e-05, "loss": 0.1082, "num_input_tokens_seen": 105862352, "step": 49055 }, { "epoch": 8.00326264274062, "grad_norm": 0.19472476840019226, "learning_rate": 3.748828589995259e-05, "loss": 0.0946, "num_input_tokens_seen": 105873264, "step": 49060 }, { "epoch": 8.004078303425775, "grad_norm": 0.03692524880170822, "learning_rate": 3.748520263448713e-05, "loss": 0.0341, "num_input_tokens_seen": 105884272, "step": 49065 }, { "epoch": 8.00489396411093, "grad_norm": 1.1084281206130981, "learning_rate": 3.74821191159932e-05, "loss": 0.1973, "num_input_tokens_seen": 105894832, "step": 49070 }, { "epoch": 8.005709624796085, "grad_norm": 0.03393913432955742, "learning_rate": 3.747903534453329e-05, "loss": 0.0064, "num_input_tokens_seen": 105905520, "step": 49075 }, { "epoch": 8.00652528548124, "grad_norm": 0.15202555060386658, "learning_rate": 3.747595132016989e-05, "loss": 0.1069, "num_input_tokens_seen": 105917424, "step": 49080 }, { "epoch": 8.007340946166394, "grad_norm": 0.9325438141822815, "learning_rate": 3.747286704296552e-05, "loss": 0.0486, "num_input_tokens_seen": 105928560, "step": 49085 }, { "epoch": 8.00815660685155, "grad_norm": 0.25435879826545715, "learning_rate": 3.7469782512982664e-05, "loss": 0.0734, "num_input_tokens_seen": 105939120, "step": 49090 }, { "epoch": 8.008972267536704, "grad_norm": 0.47081610560417175, "learning_rate": 3.746669773028386e-05, "loss": 0.0542, "num_input_tokens_seen": 105949424, "step": 49095 }, { "epoch": 8.00978792822186, "grad_norm": 2.496182441711426, "learning_rate": 3.7463612694931605e-05, "loss": 0.1311, "num_input_tokens_seen": 105959280, "step": 49100 }, { "epoch": 8.010603588907015, "grad_norm": 0.5026490688323975, "learning_rate": 3.7460527406988434e-05, "loss": 0.0774, "num_input_tokens_seen": 105968272, "step": 49105 }, { "epoch": 8.01141924959217, "grad_norm": 0.9132103323936462, "learning_rate": 3.7457441866516864e-05, "loss": 0.1854, "num_input_tokens_seen": 105978736, "step": 49110 }, { "epoch": 8.012234910277325, "grad_norm": 0.13710254430770874, "learning_rate": 3.7454356073579426e-05, "loss": 0.0466, "num_input_tokens_seen": 105989872, "step": 49115 }, { "epoch": 8.013050570962479, "grad_norm": 0.08939941227436066, "learning_rate": 3.745127002823867e-05, "loss": 0.0582, "num_input_tokens_seen": 105999792, "step": 49120 }, { "epoch": 8.013866231647635, "grad_norm": 0.05266597867012024, "learning_rate": 3.744818373055713e-05, "loss": 0.0846, "num_input_tokens_seen": 106010256, "step": 49125 }, { "epoch": 8.01468189233279, "grad_norm": 1.206714391708374, "learning_rate": 3.7445097180597355e-05, "loss": 0.257, "num_input_tokens_seen": 106020496, "step": 49130 }, { "epoch": 8.015497553017944, "grad_norm": 0.07978343963623047, "learning_rate": 3.7442010378421895e-05, "loss": 0.0927, "num_input_tokens_seen": 106031248, "step": 49135 }, { "epoch": 8.0163132137031, "grad_norm": 0.029915761202573776, "learning_rate": 3.743892332409331e-05, "loss": 0.0064, "num_input_tokens_seen": 106041232, "step": 49140 }, { "epoch": 8.017128874388254, "grad_norm": 0.27315905690193176, "learning_rate": 3.743583601767417e-05, "loss": 0.1198, "num_input_tokens_seen": 106051696, "step": 49145 }, { "epoch": 8.01794453507341, "grad_norm": 0.8164329528808594, "learning_rate": 3.7432748459227026e-05, "loss": 0.087, "num_input_tokens_seen": 106062672, "step": 49150 }, { "epoch": 8.018760195758565, "grad_norm": 0.35626766085624695, "learning_rate": 3.7429660648814474e-05, "loss": 0.0266, "num_input_tokens_seen": 106072784, "step": 49155 }, { "epoch": 8.01957585644372, "grad_norm": 0.03441992029547691, "learning_rate": 3.742657258649908e-05, "loss": 0.0206, "num_input_tokens_seen": 106083952, "step": 49160 }, { "epoch": 8.020391517128875, "grad_norm": 0.9590872526168823, "learning_rate": 3.742348427234342e-05, "loss": 0.1425, "num_input_tokens_seen": 106095632, "step": 49165 }, { "epoch": 8.021207177814029, "grad_norm": 0.2222815901041031, "learning_rate": 3.7420395706410094e-05, "loss": 0.0179, "num_input_tokens_seen": 106106160, "step": 49170 }, { "epoch": 8.022022838499185, "grad_norm": 1.9723519086837769, "learning_rate": 3.74173068887617e-05, "loss": 0.1921, "num_input_tokens_seen": 106116784, "step": 49175 }, { "epoch": 8.022838499184338, "grad_norm": 0.18970847129821777, "learning_rate": 3.7414217819460824e-05, "loss": 0.0941, "num_input_tokens_seen": 106127088, "step": 49180 }, { "epoch": 8.023654159869494, "grad_norm": 1.060659408569336, "learning_rate": 3.741112849857007e-05, "loss": 0.124, "num_input_tokens_seen": 106136912, "step": 49185 }, { "epoch": 8.02446982055465, "grad_norm": 0.4731878936290741, "learning_rate": 3.7408038926152054e-05, "loss": 0.0738, "num_input_tokens_seen": 106148816, "step": 49190 }, { "epoch": 8.025285481239804, "grad_norm": 0.0559539720416069, "learning_rate": 3.7404949102269395e-05, "loss": 0.0204, "num_input_tokens_seen": 106158672, "step": 49195 }, { "epoch": 8.02610114192496, "grad_norm": 1.5580743551254272, "learning_rate": 3.74018590269847e-05, "loss": 0.0874, "num_input_tokens_seen": 106169744, "step": 49200 }, { "epoch": 8.026916802610113, "grad_norm": 0.04042466729879379, "learning_rate": 3.739876870036061e-05, "loss": 0.204, "num_input_tokens_seen": 106179824, "step": 49205 }, { "epoch": 8.02773246329527, "grad_norm": 0.07107236981391907, "learning_rate": 3.739567812245973e-05, "loss": 0.1343, "num_input_tokens_seen": 106190640, "step": 49210 }, { "epoch": 8.028548123980425, "grad_norm": 0.5376845598220825, "learning_rate": 3.739258729334472e-05, "loss": 0.0378, "num_input_tokens_seen": 106200112, "step": 49215 }, { "epoch": 8.029363784665579, "grad_norm": 0.31421512365341187, "learning_rate": 3.738949621307819e-05, "loss": 0.0769, "num_input_tokens_seen": 106211440, "step": 49220 }, { "epoch": 8.030179445350734, "grad_norm": 0.1325739324092865, "learning_rate": 3.738640488172281e-05, "loss": 0.1611, "num_input_tokens_seen": 106222224, "step": 49225 }, { "epoch": 8.030995106035888, "grad_norm": 0.0640292540192604, "learning_rate": 3.7383313299341234e-05, "loss": 0.1273, "num_input_tokens_seen": 106233968, "step": 49230 }, { "epoch": 8.031810766721044, "grad_norm": 0.8226086497306824, "learning_rate": 3.738022146599609e-05, "loss": 0.1276, "num_input_tokens_seen": 106244944, "step": 49235 }, { "epoch": 8.0326264274062, "grad_norm": 1.1697971820831299, "learning_rate": 3.7377129381750056e-05, "loss": 0.1543, "num_input_tokens_seen": 106255664, "step": 49240 }, { "epoch": 8.033442088091354, "grad_norm": 0.8488547801971436, "learning_rate": 3.73740370466658e-05, "loss": 0.0756, "num_input_tokens_seen": 106266672, "step": 49245 }, { "epoch": 8.03425774877651, "grad_norm": 0.46679016947746277, "learning_rate": 3.737094446080598e-05, "loss": 0.2152, "num_input_tokens_seen": 106278288, "step": 49250 }, { "epoch": 8.035073409461663, "grad_norm": 1.3946012258529663, "learning_rate": 3.736785162423328e-05, "loss": 0.0506, "num_input_tokens_seen": 106289744, "step": 49255 }, { "epoch": 8.035889070146819, "grad_norm": 0.4805401861667633, "learning_rate": 3.736475853701037e-05, "loss": 0.075, "num_input_tokens_seen": 106298480, "step": 49260 }, { "epoch": 8.036704730831975, "grad_norm": 0.05134117603302002, "learning_rate": 3.7361665199199946e-05, "loss": 0.0538, "num_input_tokens_seen": 106310800, "step": 49265 }, { "epoch": 8.037520391517129, "grad_norm": 1.589311122894287, "learning_rate": 3.7358571610864704e-05, "loss": 0.1512, "num_input_tokens_seen": 106322000, "step": 49270 }, { "epoch": 8.038336052202284, "grad_norm": 0.04535532370209694, "learning_rate": 3.7355477772067315e-05, "loss": 0.1349, "num_input_tokens_seen": 106331632, "step": 49275 }, { "epoch": 8.039151712887438, "grad_norm": 0.967099666595459, "learning_rate": 3.735238368287051e-05, "loss": 0.0327, "num_input_tokens_seen": 106343344, "step": 49280 }, { "epoch": 8.039967373572594, "grad_norm": 0.0660645142197609, "learning_rate": 3.734928934333697e-05, "loss": 0.1328, "num_input_tokens_seen": 106354288, "step": 49285 }, { "epoch": 8.040783034257748, "grad_norm": 0.044233206659555435, "learning_rate": 3.7346194753529416e-05, "loss": 0.0391, "num_input_tokens_seen": 106365168, "step": 49290 }, { "epoch": 8.041598694942904, "grad_norm": 0.23188401758670807, "learning_rate": 3.734309991351056e-05, "loss": 0.0692, "num_input_tokens_seen": 106375440, "step": 49295 }, { "epoch": 8.04241435562806, "grad_norm": 1.933927059173584, "learning_rate": 3.7340004823343126e-05, "loss": 0.1123, "num_input_tokens_seen": 106386032, "step": 49300 }, { "epoch": 8.043230016313213, "grad_norm": 0.1843259334564209, "learning_rate": 3.733690948308984e-05, "loss": 0.0349, "num_input_tokens_seen": 106398160, "step": 49305 }, { "epoch": 8.044045676998369, "grad_norm": 0.34819093346595764, "learning_rate": 3.733381389281344e-05, "loss": 0.1176, "num_input_tokens_seen": 106408432, "step": 49310 }, { "epoch": 8.044861337683523, "grad_norm": 0.20030030608177185, "learning_rate": 3.733071805257665e-05, "loss": 0.28, "num_input_tokens_seen": 106419440, "step": 49315 }, { "epoch": 8.045676998368679, "grad_norm": 0.15307466685771942, "learning_rate": 3.732762196244222e-05, "loss": 0.0475, "num_input_tokens_seen": 106430000, "step": 49320 }, { "epoch": 8.046492659053834, "grad_norm": 0.05157376825809479, "learning_rate": 3.732452562247289e-05, "loss": 0.0253, "num_input_tokens_seen": 106441392, "step": 49325 }, { "epoch": 8.047308319738988, "grad_norm": 1.0975478887557983, "learning_rate": 3.732142903273142e-05, "loss": 0.0448, "num_input_tokens_seen": 106452688, "step": 49330 }, { "epoch": 8.048123980424144, "grad_norm": 2.822425603866577, "learning_rate": 3.7318332193280544e-05, "loss": 0.1333, "num_input_tokens_seen": 106462352, "step": 49335 }, { "epoch": 8.048939641109298, "grad_norm": 1.2710636854171753, "learning_rate": 3.731523510418305e-05, "loss": 0.2093, "num_input_tokens_seen": 106472208, "step": 49340 }, { "epoch": 8.049755301794454, "grad_norm": 1.1743566989898682, "learning_rate": 3.7312137765501695e-05, "loss": 0.3808, "num_input_tokens_seen": 106481840, "step": 49345 }, { "epoch": 8.05057096247961, "grad_norm": 0.08226776123046875, "learning_rate": 3.730904017729924e-05, "loss": 0.2099, "num_input_tokens_seen": 106492912, "step": 49350 }, { "epoch": 8.051386623164763, "grad_norm": 0.0688541978597641, "learning_rate": 3.730594233963848e-05, "loss": 0.2933, "num_input_tokens_seen": 106503440, "step": 49355 }, { "epoch": 8.052202283849919, "grad_norm": 0.3286757171154022, "learning_rate": 3.730284425258218e-05, "loss": 0.0866, "num_input_tokens_seen": 106512880, "step": 49360 }, { "epoch": 8.053017944535073, "grad_norm": 0.05726669356226921, "learning_rate": 3.729974591619314e-05, "loss": 0.0762, "num_input_tokens_seen": 106525296, "step": 49365 }, { "epoch": 8.053833605220229, "grad_norm": 2.428412914276123, "learning_rate": 3.729664733053414e-05, "loss": 0.22, "num_input_tokens_seen": 106535536, "step": 49370 }, { "epoch": 8.054649265905383, "grad_norm": 0.42171987891197205, "learning_rate": 3.729354849566799e-05, "loss": 0.1268, "num_input_tokens_seen": 106546160, "step": 49375 }, { "epoch": 8.055464926590538, "grad_norm": 0.41008463501930237, "learning_rate": 3.729044941165748e-05, "loss": 0.0954, "num_input_tokens_seen": 106558064, "step": 49380 }, { "epoch": 8.056280587275694, "grad_norm": 0.19097842276096344, "learning_rate": 3.7287350078565424e-05, "loss": 0.0743, "num_input_tokens_seen": 106568400, "step": 49385 }, { "epoch": 8.057096247960848, "grad_norm": 0.76596999168396, "learning_rate": 3.728425049645463e-05, "loss": 0.1019, "num_input_tokens_seen": 106578192, "step": 49390 }, { "epoch": 8.057911908646004, "grad_norm": 0.06594305485486984, "learning_rate": 3.728115066538791e-05, "loss": 0.0451, "num_input_tokens_seen": 106587312, "step": 49395 }, { "epoch": 8.058727569331158, "grad_norm": 0.09622905403375626, "learning_rate": 3.727805058542809e-05, "loss": 0.1364, "num_input_tokens_seen": 106598672, "step": 49400 }, { "epoch": 8.059543230016313, "grad_norm": 1.886012315750122, "learning_rate": 3.727495025663801e-05, "loss": 0.0759, "num_input_tokens_seen": 106608208, "step": 49405 }, { "epoch": 8.060358890701469, "grad_norm": 0.04060984402894974, "learning_rate": 3.727184967908049e-05, "loss": 0.0399, "num_input_tokens_seen": 106619984, "step": 49410 }, { "epoch": 8.061174551386623, "grad_norm": 1.6695386171340942, "learning_rate": 3.726874885281836e-05, "loss": 0.1272, "num_input_tokens_seen": 106631792, "step": 49415 }, { "epoch": 8.061990212071779, "grad_norm": 0.020426159724593163, "learning_rate": 3.726564777791448e-05, "loss": 0.1033, "num_input_tokens_seen": 106643600, "step": 49420 }, { "epoch": 8.062805872756933, "grad_norm": 1.6907663345336914, "learning_rate": 3.726254645443168e-05, "loss": 0.1179, "num_input_tokens_seen": 106654672, "step": 49425 }, { "epoch": 8.063621533442088, "grad_norm": 1.0665582418441772, "learning_rate": 3.7259444882432824e-05, "loss": 0.1093, "num_input_tokens_seen": 106665456, "step": 49430 }, { "epoch": 8.064437194127244, "grad_norm": 0.5380499958992004, "learning_rate": 3.725634306198077e-05, "loss": 0.1807, "num_input_tokens_seen": 106676880, "step": 49435 }, { "epoch": 8.065252854812398, "grad_norm": 0.10355117172002792, "learning_rate": 3.7253240993138366e-05, "loss": 0.123, "num_input_tokens_seen": 106688528, "step": 49440 }, { "epoch": 8.066068515497554, "grad_norm": 0.2687314450740814, "learning_rate": 3.725013867596849e-05, "loss": 0.1207, "num_input_tokens_seen": 106698992, "step": 49445 }, { "epoch": 8.066884176182707, "grad_norm": 0.23897427320480347, "learning_rate": 3.724703611053402e-05, "loss": 0.0968, "num_input_tokens_seen": 106709168, "step": 49450 }, { "epoch": 8.067699836867863, "grad_norm": 0.4275815784931183, "learning_rate": 3.724393329689783e-05, "loss": 0.1867, "num_input_tokens_seen": 106719920, "step": 49455 }, { "epoch": 8.068515497553017, "grad_norm": 0.0716458410024643, "learning_rate": 3.724083023512278e-05, "loss": 0.0744, "num_input_tokens_seen": 106729872, "step": 49460 }, { "epoch": 8.069331158238173, "grad_norm": 2.3214306831359863, "learning_rate": 3.723772692527179e-05, "loss": 0.1197, "num_input_tokens_seen": 106741168, "step": 49465 }, { "epoch": 8.070146818923329, "grad_norm": 0.3270811438560486, "learning_rate": 3.723462336740774e-05, "loss": 0.0393, "num_input_tokens_seen": 106751376, "step": 49470 }, { "epoch": 8.070962479608482, "grad_norm": 0.8062207102775574, "learning_rate": 3.7231519561593526e-05, "loss": 0.1852, "num_input_tokens_seen": 106761904, "step": 49475 }, { "epoch": 8.071778140293638, "grad_norm": 0.897616982460022, "learning_rate": 3.722841550789205e-05, "loss": 0.1754, "num_input_tokens_seen": 106772240, "step": 49480 }, { "epoch": 8.072593800978792, "grad_norm": 0.18250060081481934, "learning_rate": 3.722531120636622e-05, "loss": 0.0473, "num_input_tokens_seen": 106783216, "step": 49485 }, { "epoch": 8.073409461663948, "grad_norm": 0.4288577735424042, "learning_rate": 3.722220665707895e-05, "loss": 0.112, "num_input_tokens_seen": 106794896, "step": 49490 }, { "epoch": 8.074225122349104, "grad_norm": 0.9301254749298096, "learning_rate": 3.721910186009316e-05, "loss": 0.1831, "num_input_tokens_seen": 106805360, "step": 49495 }, { "epoch": 8.075040783034257, "grad_norm": 2.231471538543701, "learning_rate": 3.721599681547177e-05, "loss": 0.1459, "num_input_tokens_seen": 106816144, "step": 49500 }, { "epoch": 8.075856443719413, "grad_norm": 0.3077041506767273, "learning_rate": 3.72128915232777e-05, "loss": 0.0327, "num_input_tokens_seen": 106827088, "step": 49505 }, { "epoch": 8.076672104404567, "grad_norm": 0.2607018053531647, "learning_rate": 3.720978598357389e-05, "loss": 0.0505, "num_input_tokens_seen": 106837488, "step": 49510 }, { "epoch": 8.077487765089723, "grad_norm": 0.06457935273647308, "learning_rate": 3.7206680196423284e-05, "loss": 0.0239, "num_input_tokens_seen": 106849520, "step": 49515 }, { "epoch": 8.078303425774878, "grad_norm": 0.030213112011551857, "learning_rate": 3.720357416188882e-05, "loss": 0.0375, "num_input_tokens_seen": 106860528, "step": 49520 }, { "epoch": 8.079119086460032, "grad_norm": 1.3063668012619019, "learning_rate": 3.720046788003344e-05, "loss": 0.1072, "num_input_tokens_seen": 106869936, "step": 49525 }, { "epoch": 8.079934747145188, "grad_norm": 1.3258123397827148, "learning_rate": 3.71973613509201e-05, "loss": 0.0717, "num_input_tokens_seen": 106881936, "step": 49530 }, { "epoch": 8.080750407830342, "grad_norm": 0.33584675192832947, "learning_rate": 3.719425457461177e-05, "loss": 0.0285, "num_input_tokens_seen": 106891376, "step": 49535 }, { "epoch": 8.081566068515498, "grad_norm": 0.5233251452445984, "learning_rate": 3.719114755117139e-05, "loss": 0.0998, "num_input_tokens_seen": 106901168, "step": 49540 }, { "epoch": 8.082381729200652, "grad_norm": 1.308191180229187, "learning_rate": 3.718804028066194e-05, "loss": 0.2573, "num_input_tokens_seen": 106912432, "step": 49545 }, { "epoch": 8.083197389885807, "grad_norm": 1.8167341947555542, "learning_rate": 3.7184932763146393e-05, "loss": 0.0393, "num_input_tokens_seen": 106924592, "step": 49550 }, { "epoch": 8.084013050570963, "grad_norm": 0.09463085234165192, "learning_rate": 3.718182499868773e-05, "loss": 0.0972, "num_input_tokens_seen": 106934704, "step": 49555 }, { "epoch": 8.084828711256117, "grad_norm": 0.7647126913070679, "learning_rate": 3.717871698734893e-05, "loss": 0.1097, "num_input_tokens_seen": 106945936, "step": 49560 }, { "epoch": 8.085644371941273, "grad_norm": 1.0551490783691406, "learning_rate": 3.717560872919298e-05, "loss": 0.1139, "num_input_tokens_seen": 106957264, "step": 49565 }, { "epoch": 8.086460032626427, "grad_norm": 0.5061132311820984, "learning_rate": 3.717250022428287e-05, "loss": 0.2564, "num_input_tokens_seen": 106968816, "step": 49570 }, { "epoch": 8.087275693311582, "grad_norm": 0.04559715464711189, "learning_rate": 3.71693914726816e-05, "loss": 0.171, "num_input_tokens_seen": 106981680, "step": 49575 }, { "epoch": 8.088091353996738, "grad_norm": 0.4819051921367645, "learning_rate": 3.716628247445218e-05, "loss": 0.0968, "num_input_tokens_seen": 106992048, "step": 49580 }, { "epoch": 8.088907014681892, "grad_norm": 0.5014238357543945, "learning_rate": 3.71631732296576e-05, "loss": 0.0347, "num_input_tokens_seen": 107004016, "step": 49585 }, { "epoch": 8.089722675367048, "grad_norm": 1.7306557893753052, "learning_rate": 3.716006373836089e-05, "loss": 0.1088, "num_input_tokens_seen": 107015024, "step": 49590 }, { "epoch": 8.090538336052202, "grad_norm": 0.4566853940486908, "learning_rate": 3.715695400062507e-05, "loss": 0.0176, "num_input_tokens_seen": 107025008, "step": 49595 }, { "epoch": 8.091353996737357, "grad_norm": 1.002339482307434, "learning_rate": 3.715384401651315e-05, "loss": 0.0589, "num_input_tokens_seen": 107035408, "step": 49600 }, { "epoch": 8.092169657422513, "grad_norm": 0.7068689465522766, "learning_rate": 3.7150733786088154e-05, "loss": 0.0925, "num_input_tokens_seen": 107046960, "step": 49605 }, { "epoch": 8.092985318107667, "grad_norm": 1.105251669883728, "learning_rate": 3.714762330941313e-05, "loss": 0.2436, "num_input_tokens_seen": 107058736, "step": 49610 }, { "epoch": 8.093800978792823, "grad_norm": 0.21282444894313812, "learning_rate": 3.714451258655111e-05, "loss": 0.1131, "num_input_tokens_seen": 107069872, "step": 49615 }, { "epoch": 8.094616639477977, "grad_norm": 0.05616597458720207, "learning_rate": 3.714140161756514e-05, "loss": 0.0526, "num_input_tokens_seen": 107080400, "step": 49620 }, { "epoch": 8.095432300163132, "grad_norm": 0.645323634147644, "learning_rate": 3.713829040251826e-05, "loss": 0.0534, "num_input_tokens_seen": 107091696, "step": 49625 }, { "epoch": 8.096247960848286, "grad_norm": 1.188189148902893, "learning_rate": 3.713517894147352e-05, "loss": 0.1734, "num_input_tokens_seen": 107101168, "step": 49630 }, { "epoch": 8.097063621533442, "grad_norm": 0.22854280471801758, "learning_rate": 3.7132067234493997e-05, "loss": 0.0937, "num_input_tokens_seen": 107112464, "step": 49635 }, { "epoch": 8.097879282218598, "grad_norm": 2.159069776535034, "learning_rate": 3.712895528164273e-05, "loss": 0.1434, "num_input_tokens_seen": 107123280, "step": 49640 }, { "epoch": 8.098694942903752, "grad_norm": 1.9676742553710938, "learning_rate": 3.71258430829828e-05, "loss": 0.144, "num_input_tokens_seen": 107134800, "step": 49645 }, { "epoch": 8.099510603588907, "grad_norm": 0.07666834443807602, "learning_rate": 3.712273063857728e-05, "loss": 0.1029, "num_input_tokens_seen": 107145904, "step": 49650 }, { "epoch": 8.100326264274061, "grad_norm": 1.1161853075027466, "learning_rate": 3.711961794848925e-05, "loss": 0.1518, "num_input_tokens_seen": 107156464, "step": 49655 }, { "epoch": 8.101141924959217, "grad_norm": 0.46665382385253906, "learning_rate": 3.711650501278178e-05, "loss": 0.0911, "num_input_tokens_seen": 107167824, "step": 49660 }, { "epoch": 8.101957585644373, "grad_norm": 1.7840396165847778, "learning_rate": 3.7113391831517965e-05, "loss": 0.0892, "num_input_tokens_seen": 107178928, "step": 49665 }, { "epoch": 8.102773246329527, "grad_norm": 0.13944868743419647, "learning_rate": 3.7110278404760904e-05, "loss": 0.1812, "num_input_tokens_seen": 107191120, "step": 49670 }, { "epoch": 8.103588907014682, "grad_norm": 0.18051064014434814, "learning_rate": 3.710716473257368e-05, "loss": 0.0989, "num_input_tokens_seen": 107201872, "step": 49675 }, { "epoch": 8.104404567699836, "grad_norm": 1.4650537967681885, "learning_rate": 3.710405081501941e-05, "loss": 0.1696, "num_input_tokens_seen": 107214192, "step": 49680 }, { "epoch": 8.105220228384992, "grad_norm": 0.720413327217102, "learning_rate": 3.710093665216119e-05, "loss": 0.0654, "num_input_tokens_seen": 107225616, "step": 49685 }, { "epoch": 8.106035889070148, "grad_norm": 0.09833262115716934, "learning_rate": 3.709782224406214e-05, "loss": 0.1457, "num_input_tokens_seen": 107236208, "step": 49690 }, { "epoch": 8.106851549755302, "grad_norm": 0.06233258172869682, "learning_rate": 3.709470759078537e-05, "loss": 0.0316, "num_input_tokens_seen": 107247632, "step": 49695 }, { "epoch": 8.107667210440457, "grad_norm": 0.1290806233882904, "learning_rate": 3.709159269239402e-05, "loss": 0.0065, "num_input_tokens_seen": 107258288, "step": 49700 }, { "epoch": 8.108482871125611, "grad_norm": 0.04505281522870064, "learning_rate": 3.70884775489512e-05, "loss": 0.0128, "num_input_tokens_seen": 107268944, "step": 49705 }, { "epoch": 8.109298531810767, "grad_norm": 0.11153709143400192, "learning_rate": 3.708536216052004e-05, "loss": 0.1066, "num_input_tokens_seen": 107279472, "step": 49710 }, { "epoch": 8.11011419249592, "grad_norm": 1.833717942237854, "learning_rate": 3.708224652716369e-05, "loss": 0.1179, "num_input_tokens_seen": 107290064, "step": 49715 }, { "epoch": 8.110929853181077, "grad_norm": 0.043324727565050125, "learning_rate": 3.7079130648945284e-05, "loss": 0.0078, "num_input_tokens_seen": 107300560, "step": 49720 }, { "epoch": 8.111745513866232, "grad_norm": 0.22078530490398407, "learning_rate": 3.707601452592797e-05, "loss": 0.0384, "num_input_tokens_seen": 107310512, "step": 49725 }, { "epoch": 8.112561174551386, "grad_norm": 1.6794185638427734, "learning_rate": 3.707289815817491e-05, "loss": 0.0509, "num_input_tokens_seen": 107321296, "step": 49730 }, { "epoch": 8.113376835236542, "grad_norm": 0.25799861550331116, "learning_rate": 3.706978154574924e-05, "loss": 0.1026, "num_input_tokens_seen": 107331920, "step": 49735 }, { "epoch": 8.114192495921696, "grad_norm": 1.474941611289978, "learning_rate": 3.7066664688714145e-05, "loss": 0.12, "num_input_tokens_seen": 107342768, "step": 49740 }, { "epoch": 8.115008156606851, "grad_norm": 0.46307212114334106, "learning_rate": 3.7063547587132783e-05, "loss": 0.0856, "num_input_tokens_seen": 107352304, "step": 49745 }, { "epoch": 8.115823817292007, "grad_norm": 0.015098211355507374, "learning_rate": 3.7060430241068325e-05, "loss": 0.0152, "num_input_tokens_seen": 107362448, "step": 49750 }, { "epoch": 8.116639477977161, "grad_norm": 0.052428025752305984, "learning_rate": 3.705731265058395e-05, "loss": 0.0215, "num_input_tokens_seen": 107372368, "step": 49755 }, { "epoch": 8.117455138662317, "grad_norm": 0.5274379849433899, "learning_rate": 3.705419481574284e-05, "loss": 0.0707, "num_input_tokens_seen": 107383120, "step": 49760 }, { "epoch": 8.11827079934747, "grad_norm": 1.427869439125061, "learning_rate": 3.705107673660817e-05, "loss": 0.0641, "num_input_tokens_seen": 107394576, "step": 49765 }, { "epoch": 8.119086460032626, "grad_norm": 0.06818311661481857, "learning_rate": 3.704795841324315e-05, "loss": 0.2007, "num_input_tokens_seen": 107404752, "step": 49770 }, { "epoch": 8.119902120717782, "grad_norm": 0.3250872790813446, "learning_rate": 3.704483984571097e-05, "loss": 0.1114, "num_input_tokens_seen": 107414288, "step": 49775 }, { "epoch": 8.120717781402936, "grad_norm": 1.7344938516616821, "learning_rate": 3.704172103407482e-05, "loss": 0.1616, "num_input_tokens_seen": 107425648, "step": 49780 }, { "epoch": 8.121533442088092, "grad_norm": 0.4271804094314575, "learning_rate": 3.703860197839792e-05, "loss": 0.1153, "num_input_tokens_seen": 107437936, "step": 49785 }, { "epoch": 8.122349102773246, "grad_norm": 1.7368472814559937, "learning_rate": 3.703548267874349e-05, "loss": 0.1349, "num_input_tokens_seen": 107448336, "step": 49790 }, { "epoch": 8.123164763458401, "grad_norm": 0.4390563666820526, "learning_rate": 3.703236313517473e-05, "loss": 0.0837, "num_input_tokens_seen": 107458416, "step": 49795 }, { "epoch": 8.123980424143557, "grad_norm": 0.10456763207912445, "learning_rate": 3.7029243347754864e-05, "loss": 0.0687, "num_input_tokens_seen": 107469872, "step": 49800 }, { "epoch": 8.124796084828711, "grad_norm": 0.9202861785888672, "learning_rate": 3.7026123316547126e-05, "loss": 0.1734, "num_input_tokens_seen": 107479760, "step": 49805 }, { "epoch": 8.125611745513867, "grad_norm": 0.80658358335495, "learning_rate": 3.702300304161474e-05, "loss": 0.1064, "num_input_tokens_seen": 107490096, "step": 49810 }, { "epoch": 8.12642740619902, "grad_norm": 0.2910720407962799, "learning_rate": 3.701988252302094e-05, "loss": 0.1486, "num_input_tokens_seen": 107500368, "step": 49815 }, { "epoch": 8.127243066884176, "grad_norm": 0.4468544125556946, "learning_rate": 3.701676176082898e-05, "loss": 0.0665, "num_input_tokens_seen": 107511600, "step": 49820 }, { "epoch": 8.12805872756933, "grad_norm": 0.19364526867866516, "learning_rate": 3.70136407551021e-05, "loss": 0.126, "num_input_tokens_seen": 107522672, "step": 49825 }, { "epoch": 8.128874388254486, "grad_norm": 1.3700603246688843, "learning_rate": 3.701051950590354e-05, "loss": 0.0417, "num_input_tokens_seen": 107534096, "step": 49830 }, { "epoch": 8.129690048939642, "grad_norm": 2.010178804397583, "learning_rate": 3.7007398013296576e-05, "loss": 0.249, "num_input_tokens_seen": 107543472, "step": 49835 }, { "epoch": 8.130505709624796, "grad_norm": 1.3760910034179688, "learning_rate": 3.7004276277344454e-05, "loss": 0.2761, "num_input_tokens_seen": 107554224, "step": 49840 }, { "epoch": 8.131321370309951, "grad_norm": 0.1802513748407364, "learning_rate": 3.700115429811044e-05, "loss": 0.1054, "num_input_tokens_seen": 107564240, "step": 49845 }, { "epoch": 8.132137030995105, "grad_norm": 0.6458748579025269, "learning_rate": 3.699803207565782e-05, "loss": 0.082, "num_input_tokens_seen": 107574800, "step": 49850 }, { "epoch": 8.132952691680261, "grad_norm": 0.36005133390426636, "learning_rate": 3.699490961004986e-05, "loss": 0.2474, "num_input_tokens_seen": 107585520, "step": 49855 }, { "epoch": 8.133768352365417, "grad_norm": 0.1503547728061676, "learning_rate": 3.699178690134983e-05, "loss": 0.137, "num_input_tokens_seen": 107596112, "step": 49860 }, { "epoch": 8.13458401305057, "grad_norm": 0.6030675172805786, "learning_rate": 3.698866394962103e-05, "loss": 0.0258, "num_input_tokens_seen": 107607120, "step": 49865 }, { "epoch": 8.135399673735726, "grad_norm": 0.13287097215652466, "learning_rate": 3.6985540754926743e-05, "loss": 0.1813, "num_input_tokens_seen": 107618000, "step": 49870 }, { "epoch": 8.13621533442088, "grad_norm": 0.5421844720840454, "learning_rate": 3.6982417317330275e-05, "loss": 0.1337, "num_input_tokens_seen": 107629232, "step": 49875 }, { "epoch": 8.137030995106036, "grad_norm": 0.9381259083747864, "learning_rate": 3.6979293636894916e-05, "loss": 0.0647, "num_input_tokens_seen": 107640784, "step": 49880 }, { "epoch": 8.137846655791192, "grad_norm": 1.6220587491989136, "learning_rate": 3.697616971368397e-05, "loss": 0.2183, "num_input_tokens_seen": 107650576, "step": 49885 }, { "epoch": 8.138662316476346, "grad_norm": 0.15553490817546844, "learning_rate": 3.697304554776076e-05, "loss": 0.0204, "num_input_tokens_seen": 107660368, "step": 49890 }, { "epoch": 8.139477977161501, "grad_norm": 0.052507348358631134, "learning_rate": 3.696992113918859e-05, "loss": 0.09, "num_input_tokens_seen": 107670096, "step": 49895 }, { "epoch": 8.140293637846655, "grad_norm": 1.984494686126709, "learning_rate": 3.696679648803078e-05, "loss": 0.2121, "num_input_tokens_seen": 107680240, "step": 49900 }, { "epoch": 8.141109298531811, "grad_norm": 0.040503859519958496, "learning_rate": 3.696367159435066e-05, "loss": 0.051, "num_input_tokens_seen": 107691504, "step": 49905 }, { "epoch": 8.141924959216965, "grad_norm": 1.7327821254730225, "learning_rate": 3.696054645821156e-05, "loss": 0.2462, "num_input_tokens_seen": 107701968, "step": 49910 }, { "epoch": 8.14274061990212, "grad_norm": 1.2765849828720093, "learning_rate": 3.6957421079676815e-05, "loss": 0.2279, "num_input_tokens_seen": 107713136, "step": 49915 }, { "epoch": 8.143556280587276, "grad_norm": 0.9911106824874878, "learning_rate": 3.6954295458809754e-05, "loss": 0.1098, "num_input_tokens_seen": 107724080, "step": 49920 }, { "epoch": 8.14437194127243, "grad_norm": 0.6760399341583252, "learning_rate": 3.695116959567373e-05, "loss": 0.0264, "num_input_tokens_seen": 107735792, "step": 49925 }, { "epoch": 8.145187601957586, "grad_norm": 0.45278286933898926, "learning_rate": 3.694804349033211e-05, "loss": 0.108, "num_input_tokens_seen": 107747280, "step": 49930 }, { "epoch": 8.14600326264274, "grad_norm": 1.198797345161438, "learning_rate": 3.694491714284821e-05, "loss": 0.1728, "num_input_tokens_seen": 107758448, "step": 49935 }, { "epoch": 8.146818923327896, "grad_norm": 0.2771700918674469, "learning_rate": 3.694179055328543e-05, "loss": 0.0487, "num_input_tokens_seen": 107769712, "step": 49940 }, { "epoch": 8.147634584013051, "grad_norm": 0.03497885912656784, "learning_rate": 3.69386637217071e-05, "loss": 0.1195, "num_input_tokens_seen": 107780624, "step": 49945 }, { "epoch": 8.148450244698205, "grad_norm": 0.1810864806175232, "learning_rate": 3.693553664817661e-05, "loss": 0.0611, "num_input_tokens_seen": 107790800, "step": 49950 }, { "epoch": 8.149265905383361, "grad_norm": 0.11095081269741058, "learning_rate": 3.693240933275733e-05, "loss": 0.1262, "num_input_tokens_seen": 107802736, "step": 49955 }, { "epoch": 8.150081566068515, "grad_norm": 0.42581069469451904, "learning_rate": 3.6929281775512636e-05, "loss": 0.0397, "num_input_tokens_seen": 107813552, "step": 49960 }, { "epoch": 8.15089722675367, "grad_norm": 0.10139238834381104, "learning_rate": 3.6926153976505915e-05, "loss": 0.1669, "num_input_tokens_seen": 107825264, "step": 49965 }, { "epoch": 8.151712887438826, "grad_norm": 0.5673112273216248, "learning_rate": 3.692302593580055e-05, "loss": 0.037, "num_input_tokens_seen": 107836880, "step": 49970 }, { "epoch": 8.15252854812398, "grad_norm": 0.1945202499628067, "learning_rate": 3.691989765345994e-05, "loss": 0.0356, "num_input_tokens_seen": 107849392, "step": 49975 }, { "epoch": 8.153344208809136, "grad_norm": 1.3808109760284424, "learning_rate": 3.691676912954749e-05, "loss": 0.0934, "num_input_tokens_seen": 107860144, "step": 49980 }, { "epoch": 8.15415986949429, "grad_norm": 0.09474756568670273, "learning_rate": 3.69136403641266e-05, "loss": 0.1722, "num_input_tokens_seen": 107869520, "step": 49985 }, { "epoch": 8.154975530179446, "grad_norm": 0.3527454137802124, "learning_rate": 3.6910511357260664e-05, "loss": 0.1525, "num_input_tokens_seen": 107879792, "step": 49990 }, { "epoch": 8.1557911908646, "grad_norm": 1.0900264978408813, "learning_rate": 3.69073821090131e-05, "loss": 0.0595, "num_input_tokens_seen": 107890672, "step": 49995 }, { "epoch": 8.156606851549755, "grad_norm": 0.08869513124227524, "learning_rate": 3.6904252619447345e-05, "loss": 0.1284, "num_input_tokens_seen": 107900528, "step": 50000 }, { "epoch": 8.15742251223491, "grad_norm": 1.5097944736480713, "learning_rate": 3.69011228886268e-05, "loss": 0.3093, "num_input_tokens_seen": 107911248, "step": 50005 }, { "epoch": 8.158238172920065, "grad_norm": 0.24542425572872162, "learning_rate": 3.689799291661491e-05, "loss": 0.0168, "num_input_tokens_seen": 107922800, "step": 50010 }, { "epoch": 8.15905383360522, "grad_norm": 0.8559415936470032, "learning_rate": 3.68948627034751e-05, "loss": 0.0876, "num_input_tokens_seen": 107934032, "step": 50015 }, { "epoch": 8.159869494290374, "grad_norm": 0.04786519333720207, "learning_rate": 3.68917322492708e-05, "loss": 0.0722, "num_input_tokens_seen": 107946128, "step": 50020 }, { "epoch": 8.16068515497553, "grad_norm": 0.052465829998254776, "learning_rate": 3.688860155406546e-05, "loss": 0.0642, "num_input_tokens_seen": 107956912, "step": 50025 }, { "epoch": 8.161500815660686, "grad_norm": 1.4841512441635132, "learning_rate": 3.688547061792254e-05, "loss": 0.1357, "num_input_tokens_seen": 107968112, "step": 50030 }, { "epoch": 8.16231647634584, "grad_norm": 0.08950036764144897, "learning_rate": 3.688233944090547e-05, "loss": 0.0637, "num_input_tokens_seen": 107978352, "step": 50035 }, { "epoch": 8.163132137030995, "grad_norm": 1.036084771156311, "learning_rate": 3.687920802307771e-05, "loss": 0.0614, "num_input_tokens_seen": 107989264, "step": 50040 }, { "epoch": 8.16394779771615, "grad_norm": 0.4328325688838959, "learning_rate": 3.6876076364502745e-05, "loss": 0.0541, "num_input_tokens_seen": 107999888, "step": 50045 }, { "epoch": 8.164763458401305, "grad_norm": 0.05894358456134796, "learning_rate": 3.687294446524401e-05, "loss": 0.0907, "num_input_tokens_seen": 108009680, "step": 50050 }, { "epoch": 8.16557911908646, "grad_norm": 0.7734270095825195, "learning_rate": 3.686981232536501e-05, "loss": 0.1219, "num_input_tokens_seen": 108020368, "step": 50055 }, { "epoch": 8.166394779771615, "grad_norm": 0.26486918330192566, "learning_rate": 3.686667994492919e-05, "loss": 0.1457, "num_input_tokens_seen": 108030736, "step": 50060 }, { "epoch": 8.16721044045677, "grad_norm": 0.07333678752183914, "learning_rate": 3.686354732400006e-05, "loss": 0.0203, "num_input_tokens_seen": 108041232, "step": 50065 }, { "epoch": 8.168026101141924, "grad_norm": 0.09237316250801086, "learning_rate": 3.686041446264109e-05, "loss": 0.1056, "num_input_tokens_seen": 108053264, "step": 50070 }, { "epoch": 8.16884176182708, "grad_norm": 7.59705114364624, "learning_rate": 3.685728136091578e-05, "loss": 0.0857, "num_input_tokens_seen": 108063088, "step": 50075 }, { "epoch": 8.169657422512234, "grad_norm": 0.5161164402961731, "learning_rate": 3.6854148018887616e-05, "loss": 0.0431, "num_input_tokens_seen": 108072688, "step": 50080 }, { "epoch": 8.17047308319739, "grad_norm": 0.9626469612121582, "learning_rate": 3.68510144366201e-05, "loss": 0.0552, "num_input_tokens_seen": 108085296, "step": 50085 }, { "epoch": 8.171288743882545, "grad_norm": 0.8847259879112244, "learning_rate": 3.684788061417674e-05, "loss": 0.0695, "num_input_tokens_seen": 108096816, "step": 50090 }, { "epoch": 8.1721044045677, "grad_norm": 0.0831744372844696, "learning_rate": 3.6844746551621054e-05, "loss": 0.1756, "num_input_tokens_seen": 108107280, "step": 50095 }, { "epoch": 8.172920065252855, "grad_norm": 0.06224730238318443, "learning_rate": 3.684161224901656e-05, "loss": 0.1084, "num_input_tokens_seen": 108117552, "step": 50100 }, { "epoch": 8.173735725938009, "grad_norm": 0.7830578684806824, "learning_rate": 3.6838477706426766e-05, "loss": 0.0576, "num_input_tokens_seen": 108129264, "step": 50105 }, { "epoch": 8.174551386623165, "grad_norm": 1.9621553421020508, "learning_rate": 3.6835342923915205e-05, "loss": 0.082, "num_input_tokens_seen": 108139472, "step": 50110 }, { "epoch": 8.17536704730832, "grad_norm": 0.8932862877845764, "learning_rate": 3.68322079015454e-05, "loss": 0.1052, "num_input_tokens_seen": 108149712, "step": 50115 }, { "epoch": 8.176182707993474, "grad_norm": 0.10268041491508484, "learning_rate": 3.6829072639380895e-05, "loss": 0.0823, "num_input_tokens_seen": 108160464, "step": 50120 }, { "epoch": 8.17699836867863, "grad_norm": 1.8511784076690674, "learning_rate": 3.682593713748523e-05, "loss": 0.1913, "num_input_tokens_seen": 108171312, "step": 50125 }, { "epoch": 8.177814029363784, "grad_norm": 0.6542529463768005, "learning_rate": 3.682280139592194e-05, "loss": 0.1775, "num_input_tokens_seen": 108181616, "step": 50130 }, { "epoch": 8.17862969004894, "grad_norm": 0.04034741222858429, "learning_rate": 3.681966541475459e-05, "loss": 0.1885, "num_input_tokens_seen": 108191792, "step": 50135 }, { "epoch": 8.179445350734095, "grad_norm": 0.3873077630996704, "learning_rate": 3.681652919404672e-05, "loss": 0.2129, "num_input_tokens_seen": 108203216, "step": 50140 }, { "epoch": 8.18026101141925, "grad_norm": 0.13403967022895813, "learning_rate": 3.68133927338619e-05, "loss": 0.0616, "num_input_tokens_seen": 108214000, "step": 50145 }, { "epoch": 8.181076672104405, "grad_norm": 1.1661133766174316, "learning_rate": 3.681025603426368e-05, "loss": 0.1021, "num_input_tokens_seen": 108224528, "step": 50150 }, { "epoch": 8.181892332789559, "grad_norm": 0.6238021850585938, "learning_rate": 3.680711909531564e-05, "loss": 0.024, "num_input_tokens_seen": 108235088, "step": 50155 }, { "epoch": 8.182707993474715, "grad_norm": 1.117180585861206, "learning_rate": 3.680398191708136e-05, "loss": 0.0898, "num_input_tokens_seen": 108245616, "step": 50160 }, { "epoch": 8.18352365415987, "grad_norm": 1.511391043663025, "learning_rate": 3.680084449962441e-05, "loss": 0.1694, "num_input_tokens_seen": 108256144, "step": 50165 }, { "epoch": 8.184339314845024, "grad_norm": 0.2544296681880951, "learning_rate": 3.679770684300838e-05, "loss": 0.1189, "num_input_tokens_seen": 108267248, "step": 50170 }, { "epoch": 8.18515497553018, "grad_norm": 2.721825122833252, "learning_rate": 3.679456894729685e-05, "loss": 0.1797, "num_input_tokens_seen": 108278128, "step": 50175 }, { "epoch": 8.185970636215334, "grad_norm": 0.5606516599655151, "learning_rate": 3.679143081255342e-05, "loss": 0.2162, "num_input_tokens_seen": 108290352, "step": 50180 }, { "epoch": 8.18678629690049, "grad_norm": 0.48966318368911743, "learning_rate": 3.6788292438841684e-05, "loss": 0.0394, "num_input_tokens_seen": 108301776, "step": 50185 }, { "epoch": 8.187601957585644, "grad_norm": 1.900164008140564, "learning_rate": 3.678515382622525e-05, "loss": 0.0838, "num_input_tokens_seen": 108312208, "step": 50190 }, { "epoch": 8.1884176182708, "grad_norm": 1.3442363739013672, "learning_rate": 3.678201497476772e-05, "loss": 0.1679, "num_input_tokens_seen": 108322480, "step": 50195 }, { "epoch": 8.189233278955955, "grad_norm": 0.3446032404899597, "learning_rate": 3.6778875884532715e-05, "loss": 0.0434, "num_input_tokens_seen": 108333392, "step": 50200 }, { "epoch": 8.190048939641109, "grad_norm": 0.07435698062181473, "learning_rate": 3.677573655558384e-05, "loss": 0.0302, "num_input_tokens_seen": 108343856, "step": 50205 }, { "epoch": 8.190864600326265, "grad_norm": 0.08702895790338516, "learning_rate": 3.677259698798473e-05, "loss": 0.0895, "num_input_tokens_seen": 108355728, "step": 50210 }, { "epoch": 8.191680261011419, "grad_norm": 0.06019704043865204, "learning_rate": 3.6769457181799e-05, "loss": 0.2584, "num_input_tokens_seen": 108366736, "step": 50215 }, { "epoch": 8.192495921696574, "grad_norm": 0.48704952001571655, "learning_rate": 3.67663171370903e-05, "loss": 0.0355, "num_input_tokens_seen": 108376912, "step": 50220 }, { "epoch": 8.19331158238173, "grad_norm": 0.1686732918024063, "learning_rate": 3.6763176853922254e-05, "loss": 0.2239, "num_input_tokens_seen": 108387440, "step": 50225 }, { "epoch": 8.194127243066884, "grad_norm": 0.13096849620342255, "learning_rate": 3.67600363323585e-05, "loss": 0.2654, "num_input_tokens_seen": 108397232, "step": 50230 }, { "epoch": 8.19494290375204, "grad_norm": 0.8832622766494751, "learning_rate": 3.6756895572462704e-05, "loss": 0.06, "num_input_tokens_seen": 108408016, "step": 50235 }, { "epoch": 8.195758564437194, "grad_norm": 1.5004000663757324, "learning_rate": 3.675375457429849e-05, "loss": 0.1129, "num_input_tokens_seen": 108419024, "step": 50240 }, { "epoch": 8.19657422512235, "grad_norm": 0.10239819437265396, "learning_rate": 3.675061333792954e-05, "loss": 0.0307, "num_input_tokens_seen": 108430320, "step": 50245 }, { "epoch": 8.197389885807505, "grad_norm": 0.06289447098970413, "learning_rate": 3.67474718634195e-05, "loss": 0.0169, "num_input_tokens_seen": 108441104, "step": 50250 }, { "epoch": 8.198205546492659, "grad_norm": 0.16951854526996613, "learning_rate": 3.674433015083203e-05, "loss": 0.1751, "num_input_tokens_seen": 108452656, "step": 50255 }, { "epoch": 8.199021207177815, "grad_norm": 1.390104055404663, "learning_rate": 3.6741188200230823e-05, "loss": 0.1626, "num_input_tokens_seen": 108462992, "step": 50260 }, { "epoch": 8.199836867862969, "grad_norm": 0.05926079303026199, "learning_rate": 3.673804601167954e-05, "loss": 0.0842, "num_input_tokens_seen": 108473744, "step": 50265 }, { "epoch": 8.200652528548124, "grad_norm": 0.29574137926101685, "learning_rate": 3.6734903585241865e-05, "loss": 0.1059, "num_input_tokens_seen": 108485136, "step": 50270 }, { "epoch": 8.201468189233278, "grad_norm": 1.2055307626724243, "learning_rate": 3.673176092098148e-05, "loss": 0.144, "num_input_tokens_seen": 108496720, "step": 50275 }, { "epoch": 8.202283849918434, "grad_norm": 1.6551684141159058, "learning_rate": 3.6728618018962075e-05, "loss": 0.0998, "num_input_tokens_seen": 108508016, "step": 50280 }, { "epoch": 8.20309951060359, "grad_norm": 1.0741239786148071, "learning_rate": 3.672547487924735e-05, "loss": 0.1932, "num_input_tokens_seen": 108517872, "step": 50285 }, { "epoch": 8.203915171288743, "grad_norm": 0.997214138507843, "learning_rate": 3.6722331501901e-05, "loss": 0.1612, "num_input_tokens_seen": 108527280, "step": 50290 }, { "epoch": 8.2047308319739, "grad_norm": 0.1501348912715912, "learning_rate": 3.6719187886986736e-05, "loss": 0.1545, "num_input_tokens_seen": 108538192, "step": 50295 }, { "epoch": 8.205546492659053, "grad_norm": 1.3140076398849487, "learning_rate": 3.671604403456826e-05, "loss": 0.1898, "num_input_tokens_seen": 108546960, "step": 50300 }, { "epoch": 8.206362153344209, "grad_norm": 0.48490026593208313, "learning_rate": 3.6712899944709286e-05, "loss": 0.0676, "num_input_tokens_seen": 108556912, "step": 50305 }, { "epoch": 8.207177814029365, "grad_norm": 0.12691770493984222, "learning_rate": 3.670975561747354e-05, "loss": 0.0538, "num_input_tokens_seen": 108568144, "step": 50310 }, { "epoch": 8.207993474714518, "grad_norm": 0.05067085102200508, "learning_rate": 3.670661105292474e-05, "loss": 0.0101, "num_input_tokens_seen": 108578160, "step": 50315 }, { "epoch": 8.208809135399674, "grad_norm": 0.09486005455255508, "learning_rate": 3.670346625112661e-05, "loss": 0.116, "num_input_tokens_seen": 108589296, "step": 50320 }, { "epoch": 8.209624796084828, "grad_norm": 0.4080318510532379, "learning_rate": 3.6700321212142894e-05, "loss": 0.0392, "num_input_tokens_seen": 108600240, "step": 50325 }, { "epoch": 8.210440456769984, "grad_norm": 1.5865602493286133, "learning_rate": 3.669717593603733e-05, "loss": 0.1184, "num_input_tokens_seen": 108611056, "step": 50330 }, { "epoch": 8.21125611745514, "grad_norm": 0.3654673099517822, "learning_rate": 3.669403042287366e-05, "loss": 0.188, "num_input_tokens_seen": 108620400, "step": 50335 }, { "epoch": 8.212071778140293, "grad_norm": 0.41790589690208435, "learning_rate": 3.669088467271562e-05, "loss": 0.0317, "num_input_tokens_seen": 108632432, "step": 50340 }, { "epoch": 8.21288743882545, "grad_norm": 0.8099961280822754, "learning_rate": 3.668773868562697e-05, "loss": 0.2433, "num_input_tokens_seen": 108642896, "step": 50345 }, { "epoch": 8.213703099510603, "grad_norm": 1.4720426797866821, "learning_rate": 3.6684592461671475e-05, "loss": 0.148, "num_input_tokens_seen": 108652816, "step": 50350 }, { "epoch": 8.214518760195759, "grad_norm": 0.31958553194999695, "learning_rate": 3.668144600091288e-05, "loss": 0.1977, "num_input_tokens_seen": 108663792, "step": 50355 }, { "epoch": 8.215334420880913, "grad_norm": 0.12675809860229492, "learning_rate": 3.667829930341497e-05, "loss": 0.0257, "num_input_tokens_seen": 108674896, "step": 50360 }, { "epoch": 8.216150081566068, "grad_norm": 1.0486598014831543, "learning_rate": 3.667515236924151e-05, "loss": 0.1532, "num_input_tokens_seen": 108685584, "step": 50365 }, { "epoch": 8.216965742251224, "grad_norm": 1.2290148735046387, "learning_rate": 3.667200519845628e-05, "loss": 0.1514, "num_input_tokens_seen": 108696560, "step": 50370 }, { "epoch": 8.217781402936378, "grad_norm": 0.28721603751182556, "learning_rate": 3.666885779112305e-05, "loss": 0.0307, "num_input_tokens_seen": 108706512, "step": 50375 }, { "epoch": 8.218597063621534, "grad_norm": 1.0052610635757446, "learning_rate": 3.666571014730562e-05, "loss": 0.1955, "num_input_tokens_seen": 108717808, "step": 50380 }, { "epoch": 8.219412724306688, "grad_norm": 0.5047445893287659, "learning_rate": 3.6662562267067774e-05, "loss": 0.0597, "num_input_tokens_seen": 108729040, "step": 50385 }, { "epoch": 8.220228384991843, "grad_norm": 0.07414247840642929, "learning_rate": 3.6659414150473304e-05, "loss": 0.0954, "num_input_tokens_seen": 108739536, "step": 50390 }, { "epoch": 8.221044045676999, "grad_norm": 0.7429039478302002, "learning_rate": 3.665626579758602e-05, "loss": 0.0441, "num_input_tokens_seen": 108750640, "step": 50395 }, { "epoch": 8.221859706362153, "grad_norm": 0.6020564436912537, "learning_rate": 3.6653117208469726e-05, "loss": 0.0963, "num_input_tokens_seen": 108760368, "step": 50400 }, { "epoch": 8.222675367047309, "grad_norm": 0.29395824670791626, "learning_rate": 3.6649968383188214e-05, "loss": 0.065, "num_input_tokens_seen": 108772336, "step": 50405 }, { "epoch": 8.223491027732463, "grad_norm": 2.159144878387451, "learning_rate": 3.664681932180533e-05, "loss": 0.1063, "num_input_tokens_seen": 108782800, "step": 50410 }, { "epoch": 8.224306688417618, "grad_norm": 0.02784937620162964, "learning_rate": 3.6643670024384866e-05, "loss": 0.0592, "num_input_tokens_seen": 108794000, "step": 50415 }, { "epoch": 8.225122349102774, "grad_norm": 0.6062574982643127, "learning_rate": 3.664052049099066e-05, "loss": 0.0313, "num_input_tokens_seen": 108805392, "step": 50420 }, { "epoch": 8.225938009787928, "grad_norm": 1.045332670211792, "learning_rate": 3.6637370721686546e-05, "loss": 0.0605, "num_input_tokens_seen": 108817616, "step": 50425 }, { "epoch": 8.226753670473084, "grad_norm": 0.13454051315784454, "learning_rate": 3.663422071653635e-05, "loss": 0.0472, "num_input_tokens_seen": 108829360, "step": 50430 }, { "epoch": 8.227569331158238, "grad_norm": 0.9158738255500793, "learning_rate": 3.6631070475603904e-05, "loss": 0.1988, "num_input_tokens_seen": 108839696, "step": 50435 }, { "epoch": 8.228384991843393, "grad_norm": 0.12539826333522797, "learning_rate": 3.6627919998953064e-05, "loss": 0.0599, "num_input_tokens_seen": 108851184, "step": 50440 }, { "epoch": 8.229200652528547, "grad_norm": 0.38683581352233887, "learning_rate": 3.6624769286647676e-05, "loss": 0.1968, "num_input_tokens_seen": 108861744, "step": 50445 }, { "epoch": 8.230016313213703, "grad_norm": 1.6099454164505005, "learning_rate": 3.6621618338751594e-05, "loss": 0.0542, "num_input_tokens_seen": 108872816, "step": 50450 }, { "epoch": 8.230831973898859, "grad_norm": 0.11770971119403839, "learning_rate": 3.661846715532867e-05, "loss": 0.0514, "num_input_tokens_seen": 108884688, "step": 50455 }, { "epoch": 8.231647634584013, "grad_norm": 1.1209896802902222, "learning_rate": 3.661531573644277e-05, "loss": 0.1616, "num_input_tokens_seen": 108896208, "step": 50460 }, { "epoch": 8.232463295269168, "grad_norm": 0.9183205366134644, "learning_rate": 3.6612164082157754e-05, "loss": 0.1414, "num_input_tokens_seen": 108907536, "step": 50465 }, { "epoch": 8.233278955954322, "grad_norm": 0.0185911413282156, "learning_rate": 3.6609012192537515e-05, "loss": 0.0727, "num_input_tokens_seen": 108917360, "step": 50470 }, { "epoch": 8.234094616639478, "grad_norm": 0.05116751044988632, "learning_rate": 3.6605860067645906e-05, "loss": 0.1297, "num_input_tokens_seen": 108927952, "step": 50475 }, { "epoch": 8.234910277324634, "grad_norm": 0.9193283319473267, "learning_rate": 3.660270770754683e-05, "loss": 0.0382, "num_input_tokens_seen": 108939248, "step": 50480 }, { "epoch": 8.235725938009788, "grad_norm": 1.1883975267410278, "learning_rate": 3.659955511230416e-05, "loss": 0.2364, "num_input_tokens_seen": 108950832, "step": 50485 }, { "epoch": 8.236541598694943, "grad_norm": 0.9812391996383667, "learning_rate": 3.659640228198179e-05, "loss": 0.1373, "num_input_tokens_seen": 108961488, "step": 50490 }, { "epoch": 8.237357259380097, "grad_norm": 0.9483889937400818, "learning_rate": 3.659324921664361e-05, "loss": 0.1565, "num_input_tokens_seen": 108971888, "step": 50495 }, { "epoch": 8.238172920065253, "grad_norm": 1.248734712600708, "learning_rate": 3.6590095916353534e-05, "loss": 0.1066, "num_input_tokens_seen": 108983856, "step": 50500 }, { "epoch": 8.238988580750409, "grad_norm": 0.06839264929294586, "learning_rate": 3.658694238117546e-05, "loss": 0.2556, "num_input_tokens_seen": 108995408, "step": 50505 }, { "epoch": 8.239804241435563, "grad_norm": 0.02316019870340824, "learning_rate": 3.65837886111733e-05, "loss": 0.169, "num_input_tokens_seen": 109006736, "step": 50510 }, { "epoch": 8.240619902120718, "grad_norm": 0.3217988610267639, "learning_rate": 3.658063460641097e-05, "loss": 0.026, "num_input_tokens_seen": 109016528, "step": 50515 }, { "epoch": 8.241435562805872, "grad_norm": 1.020825743675232, "learning_rate": 3.657748036695239e-05, "loss": 0.1444, "num_input_tokens_seen": 109028688, "step": 50520 }, { "epoch": 8.242251223491028, "grad_norm": 0.06317421793937683, "learning_rate": 3.657432589286148e-05, "loss": 0.2001, "num_input_tokens_seen": 109039792, "step": 50525 }, { "epoch": 8.243066884176184, "grad_norm": 0.13814303278923035, "learning_rate": 3.657117118420217e-05, "loss": 0.02, "num_input_tokens_seen": 109050416, "step": 50530 }, { "epoch": 8.243882544861338, "grad_norm": 0.08932449668645859, "learning_rate": 3.65680162410384e-05, "loss": 0.1436, "num_input_tokens_seen": 109060816, "step": 50535 }, { "epoch": 8.244698205546493, "grad_norm": 0.07782863080501556, "learning_rate": 3.65648610634341e-05, "loss": 0.0195, "num_input_tokens_seen": 109071920, "step": 50540 }, { "epoch": 8.245513866231647, "grad_norm": 0.2726622223854065, "learning_rate": 3.656170565145323e-05, "loss": 0.1787, "num_input_tokens_seen": 109082512, "step": 50545 }, { "epoch": 8.246329526916803, "grad_norm": 0.1001446321606636, "learning_rate": 3.655855000515972e-05, "loss": 0.1662, "num_input_tokens_seen": 109093744, "step": 50550 }, { "epoch": 8.247145187601957, "grad_norm": 1.0486910343170166, "learning_rate": 3.655539412461753e-05, "loss": 0.0418, "num_input_tokens_seen": 109104368, "step": 50555 }, { "epoch": 8.247960848287113, "grad_norm": 1.1086108684539795, "learning_rate": 3.655223800989063e-05, "loss": 0.2103, "num_input_tokens_seen": 109115920, "step": 50560 }, { "epoch": 8.248776508972268, "grad_norm": 0.562792956829071, "learning_rate": 3.6549081661042964e-05, "loss": 0.065, "num_input_tokens_seen": 109127088, "step": 50565 }, { "epoch": 8.249592169657422, "grad_norm": 0.056805942207574844, "learning_rate": 3.6545925078138505e-05, "loss": 0.0775, "num_input_tokens_seen": 109138736, "step": 50570 }, { "epoch": 8.250407830342578, "grad_norm": 0.05263840779662132, "learning_rate": 3.654276826124122e-05, "loss": 0.1526, "num_input_tokens_seen": 109149488, "step": 50575 }, { "epoch": 8.251223491027732, "grad_norm": 0.7909536361694336, "learning_rate": 3.653961121041511e-05, "loss": 0.0451, "num_input_tokens_seen": 109160656, "step": 50580 }, { "epoch": 8.252039151712887, "grad_norm": 0.4655928611755371, "learning_rate": 3.6536453925724134e-05, "loss": 0.0259, "num_input_tokens_seen": 109171312, "step": 50585 }, { "epoch": 8.252854812398043, "grad_norm": 0.31767717003822327, "learning_rate": 3.653329640723228e-05, "loss": 0.0163, "num_input_tokens_seen": 109182384, "step": 50590 }, { "epoch": 8.253670473083197, "grad_norm": 0.03286808729171753, "learning_rate": 3.6530138655003546e-05, "loss": 0.0358, "num_input_tokens_seen": 109191792, "step": 50595 }, { "epoch": 8.254486133768353, "grad_norm": 0.3663429021835327, "learning_rate": 3.652698066910193e-05, "loss": 0.1066, "num_input_tokens_seen": 109202960, "step": 50600 }, { "epoch": 8.255301794453507, "grad_norm": 0.5793092846870422, "learning_rate": 3.652382244959142e-05, "loss": 0.0915, "num_input_tokens_seen": 109214288, "step": 50605 }, { "epoch": 8.256117455138662, "grad_norm": 0.07868511229753494, "learning_rate": 3.6520663996536034e-05, "loss": 0.0795, "num_input_tokens_seen": 109225328, "step": 50610 }, { "epoch": 8.256933115823816, "grad_norm": 0.18677887320518494, "learning_rate": 3.651750530999978e-05, "loss": 0.1268, "num_input_tokens_seen": 109236016, "step": 50615 }, { "epoch": 8.257748776508972, "grad_norm": 0.540507972240448, "learning_rate": 3.651434639004666e-05, "loss": 0.3038, "num_input_tokens_seen": 109247344, "step": 50620 }, { "epoch": 8.258564437194128, "grad_norm": 0.48391908407211304, "learning_rate": 3.651118723674071e-05, "loss": 0.1781, "num_input_tokens_seen": 109258864, "step": 50625 }, { "epoch": 8.259380097879282, "grad_norm": 0.06972142308950424, "learning_rate": 3.6508027850145946e-05, "loss": 0.0586, "num_input_tokens_seen": 109270192, "step": 50630 }, { "epoch": 8.260195758564437, "grad_norm": 0.24818803369998932, "learning_rate": 3.65048682303264e-05, "loss": 0.0196, "num_input_tokens_seen": 109280016, "step": 50635 }, { "epoch": 8.261011419249591, "grad_norm": 0.09745069593191147, "learning_rate": 3.650170837734611e-05, "loss": 0.0468, "num_input_tokens_seen": 109290256, "step": 50640 }, { "epoch": 8.261827079934747, "grad_norm": 0.4733607769012451, "learning_rate": 3.64985482912691e-05, "loss": 0.0875, "num_input_tokens_seen": 109300208, "step": 50645 }, { "epoch": 8.262642740619903, "grad_norm": 1.1014478206634521, "learning_rate": 3.649538797215942e-05, "loss": 0.1645, "num_input_tokens_seen": 109310960, "step": 50650 }, { "epoch": 8.263458401305057, "grad_norm": 0.31082189083099365, "learning_rate": 3.649222742008113e-05, "loss": 0.1752, "num_input_tokens_seen": 109321040, "step": 50655 }, { "epoch": 8.264274061990212, "grad_norm": 0.1948697566986084, "learning_rate": 3.648906663509826e-05, "loss": 0.3091, "num_input_tokens_seen": 109331056, "step": 50660 }, { "epoch": 8.265089722675366, "grad_norm": 0.09897306561470032, "learning_rate": 3.6485905617274884e-05, "loss": 0.1474, "num_input_tokens_seen": 109340816, "step": 50665 }, { "epoch": 8.265905383360522, "grad_norm": 0.23268413543701172, "learning_rate": 3.648274436667506e-05, "loss": 0.0699, "num_input_tokens_seen": 109351984, "step": 50670 }, { "epoch": 8.266721044045678, "grad_norm": 2.200930118560791, "learning_rate": 3.647958288336286e-05, "loss": 0.1123, "num_input_tokens_seen": 109361616, "step": 50675 }, { "epoch": 8.267536704730832, "grad_norm": 1.6322835683822632, "learning_rate": 3.647642116740234e-05, "loss": 0.0587, "num_input_tokens_seen": 109372240, "step": 50680 }, { "epoch": 8.268352365415987, "grad_norm": 0.2776290476322174, "learning_rate": 3.6473259218857596e-05, "loss": 0.0476, "num_input_tokens_seen": 109383120, "step": 50685 }, { "epoch": 8.269168026101141, "grad_norm": 0.4091043174266815, "learning_rate": 3.647009703779269e-05, "loss": 0.0584, "num_input_tokens_seen": 109391952, "step": 50690 }, { "epoch": 8.269983686786297, "grad_norm": 0.45401859283447266, "learning_rate": 3.646693462427173e-05, "loss": 0.0788, "num_input_tokens_seen": 109402576, "step": 50695 }, { "epoch": 8.270799347471453, "grad_norm": 1.4900327920913696, "learning_rate": 3.646377197835879e-05, "loss": 0.1867, "num_input_tokens_seen": 109412976, "step": 50700 }, { "epoch": 8.271615008156607, "grad_norm": 0.9129364490509033, "learning_rate": 3.646060910011796e-05, "loss": 0.1431, "num_input_tokens_seen": 109425040, "step": 50705 }, { "epoch": 8.272430668841762, "grad_norm": 0.22982533276081085, "learning_rate": 3.645744598961335e-05, "loss": 0.0402, "num_input_tokens_seen": 109436592, "step": 50710 }, { "epoch": 8.273246329526916, "grad_norm": 1.567118525505066, "learning_rate": 3.645428264690905e-05, "loss": 0.1311, "num_input_tokens_seen": 109447184, "step": 50715 }, { "epoch": 8.274061990212072, "grad_norm": 0.2782807946205139, "learning_rate": 3.645111907206921e-05, "loss": 0.0478, "num_input_tokens_seen": 109457616, "step": 50720 }, { "epoch": 8.274877650897226, "grad_norm": 0.10319600999355316, "learning_rate": 3.6447955265157895e-05, "loss": 0.0065, "num_input_tokens_seen": 109466896, "step": 50725 }, { "epoch": 8.275693311582382, "grad_norm": 0.6577317118644714, "learning_rate": 3.644479122623925e-05, "loss": 0.0528, "num_input_tokens_seen": 109478800, "step": 50730 }, { "epoch": 8.276508972267537, "grad_norm": 0.9945542812347412, "learning_rate": 3.644162695537739e-05, "loss": 0.0595, "num_input_tokens_seen": 109488272, "step": 50735 }, { "epoch": 8.277324632952691, "grad_norm": 0.669520378112793, "learning_rate": 3.6438462452636446e-05, "loss": 0.02, "num_input_tokens_seen": 109499248, "step": 50740 }, { "epoch": 8.278140293637847, "grad_norm": 0.6656333208084106, "learning_rate": 3.643529771808055e-05, "loss": 0.2349, "num_input_tokens_seen": 109509584, "step": 50745 }, { "epoch": 8.278955954323001, "grad_norm": 1.081526756286621, "learning_rate": 3.643213275177384e-05, "loss": 0.2084, "num_input_tokens_seen": 109520656, "step": 50750 }, { "epoch": 8.279771615008157, "grad_norm": 0.18803882598876953, "learning_rate": 3.6428967553780454e-05, "loss": 0.0664, "num_input_tokens_seen": 109531600, "step": 50755 }, { "epoch": 8.280587275693312, "grad_norm": 0.6641442179679871, "learning_rate": 3.6425802124164545e-05, "loss": 0.1667, "num_input_tokens_seen": 109541936, "step": 50760 }, { "epoch": 8.281402936378466, "grad_norm": 1.4563028812408447, "learning_rate": 3.642263646299025e-05, "loss": 0.0511, "num_input_tokens_seen": 109552336, "step": 50765 }, { "epoch": 8.282218597063622, "grad_norm": 0.549636960029602, "learning_rate": 3.6419470570321755e-05, "loss": 0.1666, "num_input_tokens_seen": 109563376, "step": 50770 }, { "epoch": 8.283034257748776, "grad_norm": 1.4057115316390991, "learning_rate": 3.6416304446223194e-05, "loss": 0.1763, "num_input_tokens_seen": 109575216, "step": 50775 }, { "epoch": 8.283849918433932, "grad_norm": 0.7515982389450073, "learning_rate": 3.6413138090758736e-05, "loss": 0.084, "num_input_tokens_seen": 109585904, "step": 50780 }, { "epoch": 8.284665579119087, "grad_norm": 0.44325292110443115, "learning_rate": 3.640997150399256e-05, "loss": 0.1673, "num_input_tokens_seen": 109597936, "step": 50785 }, { "epoch": 8.285481239804241, "grad_norm": 0.0748375803232193, "learning_rate": 3.640680468598884e-05, "loss": 0.0976, "num_input_tokens_seen": 109609424, "step": 50790 }, { "epoch": 8.286296900489397, "grad_norm": 0.06863036751747131, "learning_rate": 3.640363763681174e-05, "loss": 0.0502, "num_input_tokens_seen": 109621488, "step": 50795 }, { "epoch": 8.28711256117455, "grad_norm": 1.1856822967529297, "learning_rate": 3.6400470356525474e-05, "loss": 0.117, "num_input_tokens_seen": 109631696, "step": 50800 }, { "epoch": 8.287928221859707, "grad_norm": 1.587158441543579, "learning_rate": 3.6397302845194205e-05, "loss": 0.2745, "num_input_tokens_seen": 109642544, "step": 50805 }, { "epoch": 8.28874388254486, "grad_norm": 0.8355345129966736, "learning_rate": 3.639413510288214e-05, "loss": 0.056, "num_input_tokens_seen": 109652752, "step": 50810 }, { "epoch": 8.289559543230016, "grad_norm": 1.2041263580322266, "learning_rate": 3.6390967129653464e-05, "loss": 0.0956, "num_input_tokens_seen": 109664496, "step": 50815 }, { "epoch": 8.290375203915172, "grad_norm": 0.2914222478866577, "learning_rate": 3.638779892557239e-05, "loss": 0.1459, "num_input_tokens_seen": 109676752, "step": 50820 }, { "epoch": 8.291190864600326, "grad_norm": 0.272479772567749, "learning_rate": 3.638463049070313e-05, "loss": 0.0466, "num_input_tokens_seen": 109687792, "step": 50825 }, { "epoch": 8.292006525285482, "grad_norm": 0.16914397478103638, "learning_rate": 3.638146182510989e-05, "loss": 0.0795, "num_input_tokens_seen": 109698416, "step": 50830 }, { "epoch": 8.292822185970635, "grad_norm": 0.047991640865802765, "learning_rate": 3.637829292885689e-05, "loss": 0.1665, "num_input_tokens_seen": 109709488, "step": 50835 }, { "epoch": 8.293637846655791, "grad_norm": 0.21859456598758698, "learning_rate": 3.637512380200834e-05, "loss": 0.1517, "num_input_tokens_seen": 109720912, "step": 50840 }, { "epoch": 8.294453507340947, "grad_norm": 0.09421870857477188, "learning_rate": 3.637195444462849e-05, "loss": 0.0462, "num_input_tokens_seen": 109731600, "step": 50845 }, { "epoch": 8.2952691680261, "grad_norm": 0.2525019645690918, "learning_rate": 3.6368784856781544e-05, "loss": 0.2094, "num_input_tokens_seen": 109742064, "step": 50850 }, { "epoch": 8.296084828711257, "grad_norm": 0.4039621949195862, "learning_rate": 3.636561503853175e-05, "loss": 0.0652, "num_input_tokens_seen": 109752848, "step": 50855 }, { "epoch": 8.29690048939641, "grad_norm": 0.6163412928581238, "learning_rate": 3.636244498994335e-05, "loss": 0.0543, "num_input_tokens_seen": 109763728, "step": 50860 }, { "epoch": 8.297716150081566, "grad_norm": 1.6097831726074219, "learning_rate": 3.63592747110806e-05, "loss": 0.387, "num_input_tokens_seen": 109774864, "step": 50865 }, { "epoch": 8.298531810766722, "grad_norm": 1.2019779682159424, "learning_rate": 3.635610420200773e-05, "loss": 0.11, "num_input_tokens_seen": 109785296, "step": 50870 }, { "epoch": 8.299347471451876, "grad_norm": 0.9062266945838928, "learning_rate": 3.6352933462789e-05, "loss": 0.1946, "num_input_tokens_seen": 109795856, "step": 50875 }, { "epoch": 8.300163132137031, "grad_norm": 1.1941107511520386, "learning_rate": 3.634976249348867e-05, "loss": 0.1493, "num_input_tokens_seen": 109806480, "step": 50880 }, { "epoch": 8.300978792822185, "grad_norm": 1.6509658098220825, "learning_rate": 3.634659129417101e-05, "loss": 0.0378, "num_input_tokens_seen": 109817680, "step": 50885 }, { "epoch": 8.301794453507341, "grad_norm": 0.10043757408857346, "learning_rate": 3.634341986490028e-05, "loss": 0.0232, "num_input_tokens_seen": 109829616, "step": 50890 }, { "epoch": 8.302610114192497, "grad_norm": 0.06054379791021347, "learning_rate": 3.634024820574076e-05, "loss": 0.0747, "num_input_tokens_seen": 109839632, "step": 50895 }, { "epoch": 8.30342577487765, "grad_norm": 0.9616122245788574, "learning_rate": 3.6337076316756714e-05, "loss": 0.0533, "num_input_tokens_seen": 109851344, "step": 50900 }, { "epoch": 8.304241435562806, "grad_norm": 0.09316527098417282, "learning_rate": 3.633390419801244e-05, "loss": 0.0689, "num_input_tokens_seen": 109861072, "step": 50905 }, { "epoch": 8.30505709624796, "grad_norm": 0.37959104776382446, "learning_rate": 3.633073184957222e-05, "loss": 0.097, "num_input_tokens_seen": 109872144, "step": 50910 }, { "epoch": 8.305872756933116, "grad_norm": 0.9092586636543274, "learning_rate": 3.632755927150035e-05, "loss": 0.0863, "num_input_tokens_seen": 109883152, "step": 50915 }, { "epoch": 8.30668841761827, "grad_norm": 0.049467507749795914, "learning_rate": 3.6324386463861116e-05, "loss": 0.0799, "num_input_tokens_seen": 109894672, "step": 50920 }, { "epoch": 8.307504078303426, "grad_norm": 1.246925711631775, "learning_rate": 3.632121342671882e-05, "loss": 0.1498, "num_input_tokens_seen": 109905456, "step": 50925 }, { "epoch": 8.308319738988581, "grad_norm": 1.7853368520736694, "learning_rate": 3.631804016013779e-05, "loss": 0.2645, "num_input_tokens_seen": 109915024, "step": 50930 }, { "epoch": 8.309135399673735, "grad_norm": 0.4363076686859131, "learning_rate": 3.63148666641823e-05, "loss": 0.1595, "num_input_tokens_seen": 109923728, "step": 50935 }, { "epoch": 8.309951060358891, "grad_norm": 0.4620721936225891, "learning_rate": 3.6311692938916694e-05, "loss": 0.1262, "num_input_tokens_seen": 109934128, "step": 50940 }, { "epoch": 8.310766721044045, "grad_norm": 0.07927261292934418, "learning_rate": 3.630851898440527e-05, "loss": 0.0332, "num_input_tokens_seen": 109945072, "step": 50945 }, { "epoch": 8.3115823817292, "grad_norm": 1.2514394521713257, "learning_rate": 3.630534480071237e-05, "loss": 0.1438, "num_input_tokens_seen": 109955184, "step": 50950 }, { "epoch": 8.312398042414356, "grad_norm": 0.9219037294387817, "learning_rate": 3.630217038790232e-05, "loss": 0.1601, "num_input_tokens_seen": 109965232, "step": 50955 }, { "epoch": 8.31321370309951, "grad_norm": 0.15347807109355927, "learning_rate": 3.629899574603944e-05, "loss": 0.0274, "num_input_tokens_seen": 109976944, "step": 50960 }, { "epoch": 8.314029363784666, "grad_norm": 0.0346340648829937, "learning_rate": 3.629582087518808e-05, "loss": 0.0776, "num_input_tokens_seen": 109987504, "step": 50965 }, { "epoch": 8.31484502446982, "grad_norm": 0.0957387313246727, "learning_rate": 3.629264577541258e-05, "loss": 0.0961, "num_input_tokens_seen": 109998160, "step": 50970 }, { "epoch": 8.315660685154976, "grad_norm": 0.2584664821624756, "learning_rate": 3.628947044677729e-05, "loss": 0.0931, "num_input_tokens_seen": 110008464, "step": 50975 }, { "epoch": 8.31647634584013, "grad_norm": 1.75700044631958, "learning_rate": 3.628629488934656e-05, "loss": 0.2652, "num_input_tokens_seen": 110020080, "step": 50980 }, { "epoch": 8.317292006525285, "grad_norm": 0.13371339440345764, "learning_rate": 3.628311910318475e-05, "loss": 0.0455, "num_input_tokens_seen": 110029104, "step": 50985 }, { "epoch": 8.318107667210441, "grad_norm": 0.6007882356643677, "learning_rate": 3.6279943088356205e-05, "loss": 0.0587, "num_input_tokens_seen": 110040528, "step": 50990 }, { "epoch": 8.318923327895595, "grad_norm": 0.10668915510177612, "learning_rate": 3.627676684492531e-05, "loss": 0.1078, "num_input_tokens_seen": 110049744, "step": 50995 }, { "epoch": 8.31973898858075, "grad_norm": 0.4750482141971588, "learning_rate": 3.627359037295643e-05, "loss": 0.0517, "num_input_tokens_seen": 110061680, "step": 51000 }, { "epoch": 8.320554649265905, "grad_norm": 0.15130041539669037, "learning_rate": 3.6270413672513945e-05, "loss": 0.0876, "num_input_tokens_seen": 110072880, "step": 51005 }, { "epoch": 8.32137030995106, "grad_norm": 0.11057135462760925, "learning_rate": 3.626723674366222e-05, "loss": 0.1841, "num_input_tokens_seen": 110084464, "step": 51010 }, { "epoch": 8.322185970636216, "grad_norm": 0.2398347109556198, "learning_rate": 3.6264059586465655e-05, "loss": 0.1298, "num_input_tokens_seen": 110095536, "step": 51015 }, { "epoch": 8.32300163132137, "grad_norm": 0.9365376830101013, "learning_rate": 3.6260882200988634e-05, "loss": 0.15, "num_input_tokens_seen": 110104496, "step": 51020 }, { "epoch": 8.323817292006526, "grad_norm": 0.02837030589580536, "learning_rate": 3.625770458729555e-05, "loss": 0.0792, "num_input_tokens_seen": 110115888, "step": 51025 }, { "epoch": 8.32463295269168, "grad_norm": 0.10247194021940231, "learning_rate": 3.625452674545079e-05, "loss": 0.0919, "num_input_tokens_seen": 110125104, "step": 51030 }, { "epoch": 8.325448613376835, "grad_norm": 0.15554983913898468, "learning_rate": 3.625134867551877e-05, "loss": 0.057, "num_input_tokens_seen": 110135728, "step": 51035 }, { "epoch": 8.326264274061991, "grad_norm": 0.8381115198135376, "learning_rate": 3.624817037756391e-05, "loss": 0.1283, "num_input_tokens_seen": 110146832, "step": 51040 }, { "epoch": 8.327079934747145, "grad_norm": 0.4730567932128906, "learning_rate": 3.6244991851650596e-05, "loss": 0.1664, "num_input_tokens_seen": 110157520, "step": 51045 }, { "epoch": 8.3278955954323, "grad_norm": 0.3798120319843292, "learning_rate": 3.624181309784326e-05, "loss": 0.1126, "num_input_tokens_seen": 110168688, "step": 51050 }, { "epoch": 8.328711256117455, "grad_norm": 0.38035741448402405, "learning_rate": 3.623863411620632e-05, "loss": 0.0395, "num_input_tokens_seen": 110179056, "step": 51055 }, { "epoch": 8.32952691680261, "grad_norm": 0.5377225279808044, "learning_rate": 3.62354549068042e-05, "loss": 0.1459, "num_input_tokens_seen": 110190160, "step": 51060 }, { "epoch": 8.330342577487766, "grad_norm": 0.04135315492749214, "learning_rate": 3.6232275469701345e-05, "loss": 0.0367, "num_input_tokens_seen": 110201424, "step": 51065 }, { "epoch": 8.33115823817292, "grad_norm": 0.1395295113325119, "learning_rate": 3.622909580496217e-05, "loss": 0.1192, "num_input_tokens_seen": 110212304, "step": 51070 }, { "epoch": 8.331973898858076, "grad_norm": 0.781674861907959, "learning_rate": 3.622591591265112e-05, "loss": 0.0702, "num_input_tokens_seen": 110223216, "step": 51075 }, { "epoch": 8.33278955954323, "grad_norm": 0.351835697889328, "learning_rate": 3.6222735792832654e-05, "loss": 0.2802, "num_input_tokens_seen": 110234160, "step": 51080 }, { "epoch": 8.333605220228385, "grad_norm": 1.54045569896698, "learning_rate": 3.62195554455712e-05, "loss": 0.1196, "num_input_tokens_seen": 110245264, "step": 51085 }, { "epoch": 8.33442088091354, "grad_norm": 0.5112733244895935, "learning_rate": 3.6216374870931226e-05, "loss": 0.034, "num_input_tokens_seen": 110256560, "step": 51090 }, { "epoch": 8.335236541598695, "grad_norm": 0.8369888067245483, "learning_rate": 3.6213194068977184e-05, "loss": 0.2477, "num_input_tokens_seen": 110267920, "step": 51095 }, { "epoch": 8.33605220228385, "grad_norm": 0.21983277797698975, "learning_rate": 3.621001303977354e-05, "loss": 0.2953, "num_input_tokens_seen": 110278032, "step": 51100 }, { "epoch": 8.336867862969005, "grad_norm": 0.542553722858429, "learning_rate": 3.6206831783384756e-05, "loss": 0.0419, "num_input_tokens_seen": 110289232, "step": 51105 }, { "epoch": 8.33768352365416, "grad_norm": 0.9938862323760986, "learning_rate": 3.6203650299875315e-05, "loss": 0.1232, "num_input_tokens_seen": 110299664, "step": 51110 }, { "epoch": 8.338499184339314, "grad_norm": 0.22569669783115387, "learning_rate": 3.620046858930969e-05, "loss": 0.0972, "num_input_tokens_seen": 110310480, "step": 51115 }, { "epoch": 8.33931484502447, "grad_norm": 0.26200515031814575, "learning_rate": 3.619728665175235e-05, "loss": 0.0325, "num_input_tokens_seen": 110322640, "step": 51120 }, { "epoch": 8.340130505709626, "grad_norm": 0.2399054616689682, "learning_rate": 3.61941044872678e-05, "loss": 0.0333, "num_input_tokens_seen": 110334992, "step": 51125 }, { "epoch": 8.34094616639478, "grad_norm": 0.26225119829177856, "learning_rate": 3.619092209592052e-05, "loss": 0.0436, "num_input_tokens_seen": 110345616, "step": 51130 }, { "epoch": 8.341761827079935, "grad_norm": 0.4595571756362915, "learning_rate": 3.6187739477775005e-05, "loss": 0.1152, "num_input_tokens_seen": 110356752, "step": 51135 }, { "epoch": 8.34257748776509, "grad_norm": 0.19824740290641785, "learning_rate": 3.618455663289576e-05, "loss": 0.0856, "num_input_tokens_seen": 110367568, "step": 51140 }, { "epoch": 8.343393148450245, "grad_norm": 0.33977010846138, "learning_rate": 3.6181373561347286e-05, "loss": 0.1449, "num_input_tokens_seen": 110378576, "step": 51145 }, { "epoch": 8.3442088091354, "grad_norm": 0.7275192141532898, "learning_rate": 3.6178190263194094e-05, "loss": 0.0562, "num_input_tokens_seen": 110389072, "step": 51150 }, { "epoch": 8.345024469820554, "grad_norm": 0.5933098793029785, "learning_rate": 3.617500673850069e-05, "loss": 0.2438, "num_input_tokens_seen": 110400464, "step": 51155 }, { "epoch": 8.34584013050571, "grad_norm": 0.26048439741134644, "learning_rate": 3.617182298733161e-05, "loss": 0.1434, "num_input_tokens_seen": 110411728, "step": 51160 }, { "epoch": 8.346655791190864, "grad_norm": 0.054487794637680054, "learning_rate": 3.6168639009751346e-05, "loss": 0.0323, "num_input_tokens_seen": 110422480, "step": 51165 }, { "epoch": 8.34747145187602, "grad_norm": 0.20426127314567566, "learning_rate": 3.616545480582446e-05, "loss": 0.0401, "num_input_tokens_seen": 110433680, "step": 51170 }, { "epoch": 8.348287112561174, "grad_norm": 0.404881089925766, "learning_rate": 3.6162270375615455e-05, "loss": 0.1212, "num_input_tokens_seen": 110442512, "step": 51175 }, { "epoch": 8.34910277324633, "grad_norm": 0.5320453643798828, "learning_rate": 3.615908571918889e-05, "loss": 0.0555, "num_input_tokens_seen": 110454032, "step": 51180 }, { "epoch": 8.349918433931485, "grad_norm": 2.5656399726867676, "learning_rate": 3.615590083660929e-05, "loss": 0.1126, "num_input_tokens_seen": 110464432, "step": 51185 }, { "epoch": 8.350734094616639, "grad_norm": 0.7648941874504089, "learning_rate": 3.615271572794121e-05, "loss": 0.1708, "num_input_tokens_seen": 110476176, "step": 51190 }, { "epoch": 8.351549755301795, "grad_norm": 0.13134194910526276, "learning_rate": 3.61495303932492e-05, "loss": 0.1953, "num_input_tokens_seen": 110486800, "step": 51195 }, { "epoch": 8.352365415986949, "grad_norm": 0.1347435563802719, "learning_rate": 3.614634483259781e-05, "loss": 0.1354, "num_input_tokens_seen": 110498448, "step": 51200 }, { "epoch": 8.353181076672104, "grad_norm": 0.05609791353344917, "learning_rate": 3.614315904605161e-05, "loss": 0.0429, "num_input_tokens_seen": 110510064, "step": 51205 }, { "epoch": 8.35399673735726, "grad_norm": 2.139280319213867, "learning_rate": 3.6139973033675146e-05, "loss": 0.0744, "num_input_tokens_seen": 110522352, "step": 51210 }, { "epoch": 8.354812398042414, "grad_norm": 1.03024423122406, "learning_rate": 3.6136786795532996e-05, "loss": 0.1022, "num_input_tokens_seen": 110534640, "step": 51215 }, { "epoch": 8.35562805872757, "grad_norm": 0.296495258808136, "learning_rate": 3.613360033168973e-05, "loss": 0.0757, "num_input_tokens_seen": 110546608, "step": 51220 }, { "epoch": 8.356443719412724, "grad_norm": 0.3817918300628662, "learning_rate": 3.613041364220994e-05, "loss": 0.0694, "num_input_tokens_seen": 110556656, "step": 51225 }, { "epoch": 8.35725938009788, "grad_norm": 0.4099913537502289, "learning_rate": 3.6127226727158195e-05, "loss": 0.1222, "num_input_tokens_seen": 110567184, "step": 51230 }, { "epoch": 8.358075040783035, "grad_norm": 0.5044026374816895, "learning_rate": 3.612403958659908e-05, "loss": 0.0313, "num_input_tokens_seen": 110576848, "step": 51235 }, { "epoch": 8.358890701468189, "grad_norm": 0.2760913074016571, "learning_rate": 3.61208522205972e-05, "loss": 0.1256, "num_input_tokens_seen": 110587920, "step": 51240 }, { "epoch": 8.359706362153345, "grad_norm": 0.14489497244358063, "learning_rate": 3.6117664629217134e-05, "loss": 0.1199, "num_input_tokens_seen": 110599184, "step": 51245 }, { "epoch": 8.360522022838499, "grad_norm": 0.7691605091094971, "learning_rate": 3.611447681252349e-05, "loss": 0.0906, "num_input_tokens_seen": 110610768, "step": 51250 }, { "epoch": 8.361337683523654, "grad_norm": 0.8182364702224731, "learning_rate": 3.611128877058088e-05, "loss": 0.2436, "num_input_tokens_seen": 110621136, "step": 51255 }, { "epoch": 8.362153344208808, "grad_norm": 0.07833246141672134, "learning_rate": 3.61081005034539e-05, "loss": 0.0852, "num_input_tokens_seen": 110631088, "step": 51260 }, { "epoch": 8.362969004893964, "grad_norm": 0.8918946981430054, "learning_rate": 3.610491201120718e-05, "loss": 0.0889, "num_input_tokens_seen": 110643184, "step": 51265 }, { "epoch": 8.36378466557912, "grad_norm": 1.9982445240020752, "learning_rate": 3.6101723293905325e-05, "loss": 0.1433, "num_input_tokens_seen": 110653712, "step": 51270 }, { "epoch": 8.364600326264274, "grad_norm": 0.21356453001499176, "learning_rate": 3.6098534351612965e-05, "loss": 0.1601, "num_input_tokens_seen": 110663600, "step": 51275 }, { "epoch": 8.36541598694943, "grad_norm": 1.623137354850769, "learning_rate": 3.6095345184394725e-05, "loss": 0.117, "num_input_tokens_seen": 110674192, "step": 51280 }, { "epoch": 8.366231647634583, "grad_norm": 0.17900702357292175, "learning_rate": 3.609215579231524e-05, "loss": 0.1478, "num_input_tokens_seen": 110684880, "step": 51285 }, { "epoch": 8.367047308319739, "grad_norm": 0.20181041955947876, "learning_rate": 3.608896617543915e-05, "loss": 0.0357, "num_input_tokens_seen": 110695120, "step": 51290 }, { "epoch": 8.367862969004895, "grad_norm": 0.17991358041763306, "learning_rate": 3.608577633383109e-05, "loss": 0.0586, "num_input_tokens_seen": 110704912, "step": 51295 }, { "epoch": 8.368678629690049, "grad_norm": 0.9226688146591187, "learning_rate": 3.608258626755571e-05, "loss": 0.0531, "num_input_tokens_seen": 110715952, "step": 51300 }, { "epoch": 8.369494290375204, "grad_norm": 0.2955048084259033, "learning_rate": 3.607939597667766e-05, "loss": 0.1232, "num_input_tokens_seen": 110727312, "step": 51305 }, { "epoch": 8.370309951060358, "grad_norm": 1.2250899076461792, "learning_rate": 3.6076205461261596e-05, "loss": 0.0759, "num_input_tokens_seen": 110738448, "step": 51310 }, { "epoch": 8.371125611745514, "grad_norm": 1.340246319770813, "learning_rate": 3.607301472137218e-05, "loss": 0.0872, "num_input_tokens_seen": 110750960, "step": 51315 }, { "epoch": 8.37194127243067, "grad_norm": 0.09159938246011734, "learning_rate": 3.6069823757074065e-05, "loss": 0.059, "num_input_tokens_seen": 110761584, "step": 51320 }, { "epoch": 8.372756933115824, "grad_norm": 0.1922876536846161, "learning_rate": 3.606663256843193e-05, "loss": 0.0125, "num_input_tokens_seen": 110771792, "step": 51325 }, { "epoch": 8.37357259380098, "grad_norm": 1.1640325784683228, "learning_rate": 3.6063441155510455e-05, "loss": 0.1048, "num_input_tokens_seen": 110783472, "step": 51330 }, { "epoch": 8.374388254486133, "grad_norm": 0.3877801299095154, "learning_rate": 3.60602495183743e-05, "loss": 0.1166, "num_input_tokens_seen": 110793584, "step": 51335 }, { "epoch": 8.375203915171289, "grad_norm": 1.053789496421814, "learning_rate": 3.605705765708817e-05, "loss": 0.0975, "num_input_tokens_seen": 110803440, "step": 51340 }, { "epoch": 8.376019575856443, "grad_norm": 0.3083735406398773, "learning_rate": 3.605386557171672e-05, "loss": 0.0351, "num_input_tokens_seen": 110813808, "step": 51345 }, { "epoch": 8.376835236541599, "grad_norm": 1.9093235731124878, "learning_rate": 3.605067326232468e-05, "loss": 0.1858, "num_input_tokens_seen": 110824016, "step": 51350 }, { "epoch": 8.377650897226754, "grad_norm": 0.6548117399215698, "learning_rate": 3.6047480728976716e-05, "loss": 0.1237, "num_input_tokens_seen": 110834672, "step": 51355 }, { "epoch": 8.378466557911908, "grad_norm": 0.09323234856128693, "learning_rate": 3.604428797173755e-05, "loss": 0.0121, "num_input_tokens_seen": 110845456, "step": 51360 }, { "epoch": 8.379282218597064, "grad_norm": 0.26638245582580566, "learning_rate": 3.604109499067187e-05, "loss": 0.1839, "num_input_tokens_seen": 110856080, "step": 51365 }, { "epoch": 8.380097879282218, "grad_norm": 0.4515795111656189, "learning_rate": 3.6037901785844404e-05, "loss": 0.1532, "num_input_tokens_seen": 110867056, "step": 51370 }, { "epoch": 8.380913539967374, "grad_norm": 1.2055970430374146, "learning_rate": 3.6034708357319844e-05, "loss": 0.066, "num_input_tokens_seen": 110878352, "step": 51375 }, { "epoch": 8.38172920065253, "grad_norm": 0.8282480835914612, "learning_rate": 3.6031514705162925e-05, "loss": 0.0405, "num_input_tokens_seen": 110889584, "step": 51380 }, { "epoch": 8.382544861337683, "grad_norm": 2.0893547534942627, "learning_rate": 3.602832082943837e-05, "loss": 0.1054, "num_input_tokens_seen": 110900464, "step": 51385 }, { "epoch": 8.383360522022839, "grad_norm": 0.28737470507621765, "learning_rate": 3.60251267302109e-05, "loss": 0.0777, "num_input_tokens_seen": 110911568, "step": 51390 }, { "epoch": 8.384176182707993, "grad_norm": 0.974488377571106, "learning_rate": 3.602193240754524e-05, "loss": 0.1341, "num_input_tokens_seen": 110922640, "step": 51395 }, { "epoch": 8.384991843393149, "grad_norm": 0.13013797998428345, "learning_rate": 3.601873786150616e-05, "loss": 0.1367, "num_input_tokens_seen": 110933872, "step": 51400 }, { "epoch": 8.385807504078304, "grad_norm": 0.04176190122961998, "learning_rate": 3.601554309215836e-05, "loss": 0.0874, "num_input_tokens_seen": 110945648, "step": 51405 }, { "epoch": 8.386623164763458, "grad_norm": 0.06133057177066803, "learning_rate": 3.6012348099566614e-05, "loss": 0.0226, "num_input_tokens_seen": 110956528, "step": 51410 }, { "epoch": 8.387438825448614, "grad_norm": 1.5045959949493408, "learning_rate": 3.600915288379566e-05, "loss": 0.1024, "num_input_tokens_seen": 110966224, "step": 51415 }, { "epoch": 8.388254486133768, "grad_norm": 0.08650155365467072, "learning_rate": 3.600595744491026e-05, "loss": 0.0644, "num_input_tokens_seen": 110976912, "step": 51420 }, { "epoch": 8.389070146818923, "grad_norm": 0.18749380111694336, "learning_rate": 3.600276178297516e-05, "loss": 0.0522, "num_input_tokens_seen": 110988080, "step": 51425 }, { "epoch": 8.38988580750408, "grad_norm": 0.16047364473342896, "learning_rate": 3.599956589805514e-05, "loss": 0.0623, "num_input_tokens_seen": 110999472, "step": 51430 }, { "epoch": 8.390701468189233, "grad_norm": 0.36653128266334534, "learning_rate": 3.599636979021497e-05, "loss": 0.0786, "num_input_tokens_seen": 111011184, "step": 51435 }, { "epoch": 8.391517128874389, "grad_norm": 0.11917620897293091, "learning_rate": 3.599317345951941e-05, "loss": 0.0189, "num_input_tokens_seen": 111022480, "step": 51440 }, { "epoch": 8.392332789559543, "grad_norm": 0.9498599767684937, "learning_rate": 3.5989976906033244e-05, "loss": 0.2123, "num_input_tokens_seen": 111032912, "step": 51445 }, { "epoch": 8.393148450244698, "grad_norm": 1.1380043029785156, "learning_rate": 3.598678012982126e-05, "loss": 0.1241, "num_input_tokens_seen": 111044272, "step": 51450 }, { "epoch": 8.393964110929852, "grad_norm": 0.9135814309120178, "learning_rate": 3.5983583130948237e-05, "loss": 0.0989, "num_input_tokens_seen": 111054704, "step": 51455 }, { "epoch": 8.394779771615008, "grad_norm": 0.19155582785606384, "learning_rate": 3.5980385909478965e-05, "loss": 0.0215, "num_input_tokens_seen": 111065040, "step": 51460 }, { "epoch": 8.395595432300164, "grad_norm": 0.13676166534423828, "learning_rate": 3.597718846547824e-05, "loss": 0.0351, "num_input_tokens_seen": 111077200, "step": 51465 }, { "epoch": 8.396411092985318, "grad_norm": 0.4837947189807892, "learning_rate": 3.597399079901087e-05, "loss": 0.0974, "num_input_tokens_seen": 111088016, "step": 51470 }, { "epoch": 8.397226753670473, "grad_norm": 0.46607887744903564, "learning_rate": 3.597079291014166e-05, "loss": 0.0611, "num_input_tokens_seen": 111098320, "step": 51475 }, { "epoch": 8.398042414355627, "grad_norm": 0.03474970534443855, "learning_rate": 3.59675947989354e-05, "loss": 0.0284, "num_input_tokens_seen": 111108528, "step": 51480 }, { "epoch": 8.398858075040783, "grad_norm": 1.8403065204620361, "learning_rate": 3.596439646545694e-05, "loss": 0.1864, "num_input_tokens_seen": 111120080, "step": 51485 }, { "epoch": 8.399673735725939, "grad_norm": 0.4457859694957733, "learning_rate": 3.596119790977105e-05, "loss": 0.068, "num_input_tokens_seen": 111130512, "step": 51490 }, { "epoch": 8.400489396411093, "grad_norm": 0.15543276071548462, "learning_rate": 3.5957999131942594e-05, "loss": 0.1968, "num_input_tokens_seen": 111141520, "step": 51495 }, { "epoch": 8.401305057096248, "grad_norm": 0.11757298558950424, "learning_rate": 3.595480013203638e-05, "loss": 0.0738, "num_input_tokens_seen": 111152112, "step": 51500 }, { "epoch": 8.402120717781402, "grad_norm": 1.7338680028915405, "learning_rate": 3.595160091011725e-05, "loss": 0.1568, "num_input_tokens_seen": 111162896, "step": 51505 }, { "epoch": 8.402936378466558, "grad_norm": 0.3877275884151459, "learning_rate": 3.5948401466250023e-05, "loss": 0.2414, "num_input_tokens_seen": 111173616, "step": 51510 }, { "epoch": 8.403752039151712, "grad_norm": 0.15188772976398468, "learning_rate": 3.5945201800499565e-05, "loss": 0.125, "num_input_tokens_seen": 111183856, "step": 51515 }, { "epoch": 8.404567699836868, "grad_norm": 0.18441782891750336, "learning_rate": 3.59420019129307e-05, "loss": 0.1493, "num_input_tokens_seen": 111193936, "step": 51520 }, { "epoch": 8.405383360522023, "grad_norm": 0.060139287263154984, "learning_rate": 3.593880180360829e-05, "loss": 0.1413, "num_input_tokens_seen": 111204912, "step": 51525 }, { "epoch": 8.406199021207177, "grad_norm": 0.5280714631080627, "learning_rate": 3.593560147259717e-05, "loss": 0.0894, "num_input_tokens_seen": 111215408, "step": 51530 }, { "epoch": 8.407014681892333, "grad_norm": 0.05674228072166443, "learning_rate": 3.593240091996223e-05, "loss": 0.0809, "num_input_tokens_seen": 111227056, "step": 51535 }, { "epoch": 8.407830342577487, "grad_norm": 0.7988004684448242, "learning_rate": 3.5929200145768314e-05, "loss": 0.2429, "num_input_tokens_seen": 111238512, "step": 51540 }, { "epoch": 8.408646003262643, "grad_norm": 0.2317010760307312, "learning_rate": 3.592599915008029e-05, "loss": 0.0437, "num_input_tokens_seen": 111249392, "step": 51545 }, { "epoch": 8.409461663947798, "grad_norm": 1.980305552482605, "learning_rate": 3.592279793296303e-05, "loss": 0.1796, "num_input_tokens_seen": 111259600, "step": 51550 }, { "epoch": 8.410277324632952, "grad_norm": 0.0634550154209137, "learning_rate": 3.591959649448142e-05, "loss": 0.0609, "num_input_tokens_seen": 111270032, "step": 51555 }, { "epoch": 8.411092985318108, "grad_norm": 0.05208989605307579, "learning_rate": 3.591639483470033e-05, "loss": 0.1513, "num_input_tokens_seen": 111279504, "step": 51560 }, { "epoch": 8.411908646003262, "grad_norm": 1.0716453790664673, "learning_rate": 3.5913192953684646e-05, "loss": 0.1394, "num_input_tokens_seen": 111291504, "step": 51565 }, { "epoch": 8.412724306688418, "grad_norm": 0.1576842963695526, "learning_rate": 3.590999085149927e-05, "loss": 0.1694, "num_input_tokens_seen": 111303152, "step": 51570 }, { "epoch": 8.413539967373573, "grad_norm": 1.2595045566558838, "learning_rate": 3.590678852820909e-05, "loss": 0.0548, "num_input_tokens_seen": 111313232, "step": 51575 }, { "epoch": 8.414355628058727, "grad_norm": 0.022313129156827927, "learning_rate": 3.590358598387899e-05, "loss": 0.0438, "num_input_tokens_seen": 111324240, "step": 51580 }, { "epoch": 8.415171288743883, "grad_norm": 0.06836946308612823, "learning_rate": 3.590038321857391e-05, "loss": 0.1566, "num_input_tokens_seen": 111335056, "step": 51585 }, { "epoch": 8.415986949429037, "grad_norm": 0.6618645787239075, "learning_rate": 3.589718023235872e-05, "loss": 0.058, "num_input_tokens_seen": 111345200, "step": 51590 }, { "epoch": 8.416802610114193, "grad_norm": 0.5883658528327942, "learning_rate": 3.5893977025298356e-05, "loss": 0.1321, "num_input_tokens_seen": 111354640, "step": 51595 }, { "epoch": 8.417618270799348, "grad_norm": 0.1748906672000885, "learning_rate": 3.589077359745772e-05, "loss": 0.0335, "num_input_tokens_seen": 111365008, "step": 51600 }, { "epoch": 8.418433931484502, "grad_norm": 0.2633807063102722, "learning_rate": 3.588756994890175e-05, "loss": 0.2313, "num_input_tokens_seen": 111376080, "step": 51605 }, { "epoch": 8.419249592169658, "grad_norm": 0.7650120854377747, "learning_rate": 3.5884366079695357e-05, "loss": 0.1628, "num_input_tokens_seen": 111387376, "step": 51610 }, { "epoch": 8.420065252854812, "grad_norm": 0.28888213634490967, "learning_rate": 3.5881161989903476e-05, "loss": 0.0577, "num_input_tokens_seen": 111398224, "step": 51615 }, { "epoch": 8.420880913539968, "grad_norm": 1.150469183921814, "learning_rate": 3.5877957679591054e-05, "loss": 0.2086, "num_input_tokens_seen": 111408624, "step": 51620 }, { "epoch": 8.421696574225122, "grad_norm": 0.22290687263011932, "learning_rate": 3.587475314882301e-05, "loss": 0.0226, "num_input_tokens_seen": 111419664, "step": 51625 }, { "epoch": 8.422512234910277, "grad_norm": 1.524092197418213, "learning_rate": 3.58715483976643e-05, "loss": 0.1096, "num_input_tokens_seen": 111430480, "step": 51630 }, { "epoch": 8.423327895595433, "grad_norm": 0.01890864595770836, "learning_rate": 3.5868343426179865e-05, "loss": 0.199, "num_input_tokens_seen": 111439984, "step": 51635 }, { "epoch": 8.424143556280587, "grad_norm": 0.5067139863967896, "learning_rate": 3.586513823443467e-05, "loss": 0.0385, "num_input_tokens_seen": 111451472, "step": 51640 }, { "epoch": 8.424959216965743, "grad_norm": 0.5890741944313049, "learning_rate": 3.586193282249367e-05, "loss": 0.0733, "num_input_tokens_seen": 111462544, "step": 51645 }, { "epoch": 8.425774877650896, "grad_norm": 1.6760234832763672, "learning_rate": 3.585872719042181e-05, "loss": 0.0394, "num_input_tokens_seen": 111474800, "step": 51650 }, { "epoch": 8.426590538336052, "grad_norm": 0.3806290626525879, "learning_rate": 3.585552133828407e-05, "loss": 0.0237, "num_input_tokens_seen": 111485776, "step": 51655 }, { "epoch": 8.427406199021208, "grad_norm": 0.025258053094148636, "learning_rate": 3.5852315266145426e-05, "loss": 0.2251, "num_input_tokens_seen": 111497616, "step": 51660 }, { "epoch": 8.428221859706362, "grad_norm": 1.368477702140808, "learning_rate": 3.5849108974070844e-05, "loss": 0.195, "num_input_tokens_seen": 111508848, "step": 51665 }, { "epoch": 8.429037520391518, "grad_norm": 0.14461259543895721, "learning_rate": 3.58459024621253e-05, "loss": 0.2204, "num_input_tokens_seen": 111520816, "step": 51670 }, { "epoch": 8.429853181076671, "grad_norm": 1.8979921340942383, "learning_rate": 3.584269573037379e-05, "loss": 0.1517, "num_input_tokens_seen": 111531984, "step": 51675 }, { "epoch": 8.430668841761827, "grad_norm": 1.78390634059906, "learning_rate": 3.5839488778881295e-05, "loss": 0.151, "num_input_tokens_seen": 111542800, "step": 51680 }, { "epoch": 8.431484502446983, "grad_norm": 0.1147497221827507, "learning_rate": 3.583628160771281e-05, "loss": 0.1739, "num_input_tokens_seen": 111553040, "step": 51685 }, { "epoch": 8.432300163132137, "grad_norm": 0.0882759541273117, "learning_rate": 3.583307421693333e-05, "loss": 0.0164, "num_input_tokens_seen": 111563120, "step": 51690 }, { "epoch": 8.433115823817293, "grad_norm": 0.06852493435144424, "learning_rate": 3.582986660660785e-05, "loss": 0.0328, "num_input_tokens_seen": 111573776, "step": 51695 }, { "epoch": 8.433931484502446, "grad_norm": 0.1830441653728485, "learning_rate": 3.5826658776801406e-05, "loss": 0.1938, "num_input_tokens_seen": 111584080, "step": 51700 }, { "epoch": 8.434747145187602, "grad_norm": 1.7259764671325684, "learning_rate": 3.5823450727578964e-05, "loss": 0.1548, "num_input_tokens_seen": 111594480, "step": 51705 }, { "epoch": 8.435562805872756, "grad_norm": 1.6041245460510254, "learning_rate": 3.5820242459005577e-05, "loss": 0.1533, "num_input_tokens_seen": 111604656, "step": 51710 }, { "epoch": 8.436378466557912, "grad_norm": 0.08437437564134598, "learning_rate": 3.581703397114625e-05, "loss": 0.0822, "num_input_tokens_seen": 111616304, "step": 51715 }, { "epoch": 8.437194127243067, "grad_norm": 0.4644218385219574, "learning_rate": 3.5813825264066e-05, "loss": 0.0901, "num_input_tokens_seen": 111627920, "step": 51720 }, { "epoch": 8.438009787928221, "grad_norm": 0.3101450204849243, "learning_rate": 3.581061633782986e-05, "loss": 0.0744, "num_input_tokens_seen": 111638864, "step": 51725 }, { "epoch": 8.438825448613377, "grad_norm": 0.8364295363426208, "learning_rate": 3.580740719250287e-05, "loss": 0.0492, "num_input_tokens_seen": 111649296, "step": 51730 }, { "epoch": 8.439641109298531, "grad_norm": 1.013901948928833, "learning_rate": 3.580419782815007e-05, "loss": 0.0721, "num_input_tokens_seen": 111659152, "step": 51735 }, { "epoch": 8.440456769983687, "grad_norm": 0.6707078814506531, "learning_rate": 3.5800988244836485e-05, "loss": 0.0475, "num_input_tokens_seen": 111669744, "step": 51740 }, { "epoch": 8.441272430668842, "grad_norm": 0.14809967577457428, "learning_rate": 3.5797778442627175e-05, "loss": 0.1082, "num_input_tokens_seen": 111680304, "step": 51745 }, { "epoch": 8.442088091353996, "grad_norm": 1.12557852268219, "learning_rate": 3.579456842158719e-05, "loss": 0.1226, "num_input_tokens_seen": 111692400, "step": 51750 }, { "epoch": 8.442903752039152, "grad_norm": 0.1604951173067093, "learning_rate": 3.5791358181781574e-05, "loss": 0.0873, "num_input_tokens_seen": 111702160, "step": 51755 }, { "epoch": 8.443719412724306, "grad_norm": 0.27500465512275696, "learning_rate": 3.57881477232754e-05, "loss": 0.1358, "num_input_tokens_seen": 111713712, "step": 51760 }, { "epoch": 8.444535073409462, "grad_norm": 0.02779519371688366, "learning_rate": 3.578493704613373e-05, "loss": 0.069, "num_input_tokens_seen": 111724112, "step": 51765 }, { "epoch": 8.445350734094617, "grad_norm": 0.3786196708679199, "learning_rate": 3.578172615042162e-05, "loss": 0.0451, "num_input_tokens_seen": 111735504, "step": 51770 }, { "epoch": 8.446166394779771, "grad_norm": 0.03745320811867714, "learning_rate": 3.5778515036204164e-05, "loss": 0.0214, "num_input_tokens_seen": 111744912, "step": 51775 }, { "epoch": 8.446982055464927, "grad_norm": 0.6853382587432861, "learning_rate": 3.577530370354642e-05, "loss": 0.0521, "num_input_tokens_seen": 111756080, "step": 51780 }, { "epoch": 8.447797716150081, "grad_norm": 0.21588166058063507, "learning_rate": 3.577209215251348e-05, "loss": 0.0163, "num_input_tokens_seen": 111766928, "step": 51785 }, { "epoch": 8.448613376835237, "grad_norm": 0.13678976893424988, "learning_rate": 3.5768880383170425e-05, "loss": 0.2084, "num_input_tokens_seen": 111775760, "step": 51790 }, { "epoch": 8.449429037520392, "grad_norm": 0.5272514820098877, "learning_rate": 3.5765668395582345e-05, "loss": 0.1533, "num_input_tokens_seen": 111787184, "step": 51795 }, { "epoch": 8.450244698205546, "grad_norm": 0.24545617401599884, "learning_rate": 3.576245618981434e-05, "loss": 0.133, "num_input_tokens_seen": 111798064, "step": 51800 }, { "epoch": 8.451060358890702, "grad_norm": 1.915589690208435, "learning_rate": 3.575924376593151e-05, "loss": 0.1191, "num_input_tokens_seen": 111808016, "step": 51805 }, { "epoch": 8.451876019575856, "grad_norm": 0.41095319390296936, "learning_rate": 3.575603112399895e-05, "loss": 0.0265, "num_input_tokens_seen": 111819568, "step": 51810 }, { "epoch": 8.452691680261012, "grad_norm": 1.0429835319519043, "learning_rate": 3.5752818264081776e-05, "loss": 0.0454, "num_input_tokens_seen": 111829840, "step": 51815 }, { "epoch": 8.453507340946166, "grad_norm": 0.10371223837137222, "learning_rate": 3.5749605186245104e-05, "loss": 0.2136, "num_input_tokens_seen": 111841136, "step": 51820 }, { "epoch": 8.454323001631321, "grad_norm": 0.029012620449066162, "learning_rate": 3.5746391890554045e-05, "loss": 0.2103, "num_input_tokens_seen": 111852208, "step": 51825 }, { "epoch": 8.455138662316477, "grad_norm": 0.06732276827096939, "learning_rate": 3.574317837707372e-05, "loss": 0.0641, "num_input_tokens_seen": 111862608, "step": 51830 }, { "epoch": 8.455954323001631, "grad_norm": 0.02335296757519245, "learning_rate": 3.573996464586925e-05, "loss": 0.1042, "num_input_tokens_seen": 111873424, "step": 51835 }, { "epoch": 8.456769983686787, "grad_norm": 3.241154193878174, "learning_rate": 3.5736750697005784e-05, "loss": 0.3919, "num_input_tokens_seen": 111884016, "step": 51840 }, { "epoch": 8.45758564437194, "grad_norm": 0.553382396697998, "learning_rate": 3.5733536530548436e-05, "loss": 0.0648, "num_input_tokens_seen": 111895440, "step": 51845 }, { "epoch": 8.458401305057096, "grad_norm": 1.3532764911651611, "learning_rate": 3.573032214656235e-05, "loss": 0.0492, "num_input_tokens_seen": 111907120, "step": 51850 }, { "epoch": 8.459216965742252, "grad_norm": 0.2597179412841797, "learning_rate": 3.572710754511268e-05, "loss": 0.1624, "num_input_tokens_seen": 111916464, "step": 51855 }, { "epoch": 8.460032626427406, "grad_norm": 2.3676986694335938, "learning_rate": 3.5723892726264565e-05, "loss": 0.1332, "num_input_tokens_seen": 111927376, "step": 51860 }, { "epoch": 8.460848287112562, "grad_norm": 1.5395007133483887, "learning_rate": 3.5720677690083165e-05, "loss": 0.187, "num_input_tokens_seen": 111937392, "step": 51865 }, { "epoch": 8.461663947797716, "grad_norm": 0.35936352610588074, "learning_rate": 3.5717462436633633e-05, "loss": 0.0896, "num_input_tokens_seen": 111948848, "step": 51870 }, { "epoch": 8.462479608482871, "grad_norm": 0.03578457981348038, "learning_rate": 3.571424696598112e-05, "loss": 0.0108, "num_input_tokens_seen": 111959664, "step": 51875 }, { "epoch": 8.463295269168025, "grad_norm": 0.349495530128479, "learning_rate": 3.57110312781908e-05, "loss": 0.2165, "num_input_tokens_seen": 111969712, "step": 51880 }, { "epoch": 8.464110929853181, "grad_norm": 0.9625959396362305, "learning_rate": 3.570781537332785e-05, "loss": 0.0781, "num_input_tokens_seen": 111981168, "step": 51885 }, { "epoch": 8.464926590538337, "grad_norm": 1.1675256490707397, "learning_rate": 3.570459925145744e-05, "loss": 0.1322, "num_input_tokens_seen": 111991952, "step": 51890 }, { "epoch": 8.46574225122349, "grad_norm": 1.677173137664795, "learning_rate": 3.570138291264474e-05, "loss": 0.0721, "num_input_tokens_seen": 112003024, "step": 51895 }, { "epoch": 8.466557911908646, "grad_norm": 1.4100098609924316, "learning_rate": 3.5698166356954934e-05, "loss": 0.1597, "num_input_tokens_seen": 112013360, "step": 51900 }, { "epoch": 8.4673735725938, "grad_norm": 0.9235019087791443, "learning_rate": 3.5694949584453225e-05, "loss": 0.2038, "num_input_tokens_seen": 112023664, "step": 51905 }, { "epoch": 8.468189233278956, "grad_norm": 1.300853967666626, "learning_rate": 3.56917325952048e-05, "loss": 0.1057, "num_input_tokens_seen": 112034928, "step": 51910 }, { "epoch": 8.469004893964112, "grad_norm": 2.028724193572998, "learning_rate": 3.568851538927484e-05, "loss": 0.2318, "num_input_tokens_seen": 112046128, "step": 51915 }, { "epoch": 8.469820554649266, "grad_norm": 0.016420355066657066, "learning_rate": 3.5685297966728556e-05, "loss": 0.009, "num_input_tokens_seen": 112056944, "step": 51920 }, { "epoch": 8.470636215334421, "grad_norm": 0.04112578183412552, "learning_rate": 3.568208032763116e-05, "loss": 0.3045, "num_input_tokens_seen": 112067472, "step": 51925 }, { "epoch": 8.471451876019575, "grad_norm": 0.07695192098617554, "learning_rate": 3.567886247204785e-05, "loss": 0.0199, "num_input_tokens_seen": 112076400, "step": 51930 }, { "epoch": 8.47226753670473, "grad_norm": 1.1090939044952393, "learning_rate": 3.5675644400043844e-05, "loss": 0.2418, "num_input_tokens_seen": 112087312, "step": 51935 }, { "epoch": 8.473083197389887, "grad_norm": 0.5952257513999939, "learning_rate": 3.567242611168436e-05, "loss": 0.2443, "num_input_tokens_seen": 112098544, "step": 51940 }, { "epoch": 8.47389885807504, "grad_norm": 0.16900388896465302, "learning_rate": 3.5669207607034625e-05, "loss": 0.1319, "num_input_tokens_seen": 112108304, "step": 51945 }, { "epoch": 8.474714518760196, "grad_norm": 0.18651576340198517, "learning_rate": 3.566598888615986e-05, "loss": 0.0096, "num_input_tokens_seen": 112119600, "step": 51950 }, { "epoch": 8.47553017944535, "grad_norm": 0.21288298070430756, "learning_rate": 3.566276994912531e-05, "loss": 0.0203, "num_input_tokens_seen": 112130192, "step": 51955 }, { "epoch": 8.476345840130506, "grad_norm": 0.06136761233210564, "learning_rate": 3.5659550795996186e-05, "loss": 0.1942, "num_input_tokens_seen": 112142288, "step": 51960 }, { "epoch": 8.477161500815662, "grad_norm": 0.27444764971733093, "learning_rate": 3.5656331426837755e-05, "loss": 0.1395, "num_input_tokens_seen": 112154032, "step": 51965 }, { "epoch": 8.477977161500815, "grad_norm": 0.831919252872467, "learning_rate": 3.5653111841715236e-05, "loss": 0.0274, "num_input_tokens_seen": 112165520, "step": 51970 }, { "epoch": 8.478792822185971, "grad_norm": 1.3082411289215088, "learning_rate": 3.56498920406939e-05, "loss": 0.1548, "num_input_tokens_seen": 112176560, "step": 51975 }, { "epoch": 8.479608482871125, "grad_norm": 0.22760692238807678, "learning_rate": 3.564667202383899e-05, "loss": 0.0123, "num_input_tokens_seen": 112187248, "step": 51980 }, { "epoch": 8.48042414355628, "grad_norm": 0.18473930656909943, "learning_rate": 3.5643451791215764e-05, "loss": 0.0797, "num_input_tokens_seen": 112198128, "step": 51985 }, { "epoch": 8.481239804241435, "grad_norm": 1.6289498805999756, "learning_rate": 3.564023134288949e-05, "loss": 0.1893, "num_input_tokens_seen": 112209680, "step": 51990 }, { "epoch": 8.48205546492659, "grad_norm": 0.2438415288925171, "learning_rate": 3.5637010678925434e-05, "loss": 0.0357, "num_input_tokens_seen": 112220688, "step": 51995 }, { "epoch": 8.482871125611746, "grad_norm": 3.4588046073913574, "learning_rate": 3.563378979938885e-05, "loss": 0.2105, "num_input_tokens_seen": 112231632, "step": 52000 }, { "epoch": 8.4836867862969, "grad_norm": 0.07502371817827225, "learning_rate": 3.563056870434503e-05, "loss": 0.2138, "num_input_tokens_seen": 112240592, "step": 52005 }, { "epoch": 8.484502446982056, "grad_norm": 0.1208309456706047, "learning_rate": 3.5627347393859254e-05, "loss": 0.0266, "num_input_tokens_seen": 112251440, "step": 52010 }, { "epoch": 8.48531810766721, "grad_norm": 1.142547369003296, "learning_rate": 3.5624125867996794e-05, "loss": 0.1405, "num_input_tokens_seen": 112262192, "step": 52015 }, { "epoch": 8.486133768352365, "grad_norm": 0.1442275047302246, "learning_rate": 3.562090412682295e-05, "loss": 0.0174, "num_input_tokens_seen": 112273808, "step": 52020 }, { "epoch": 8.486949429037521, "grad_norm": 1.5779790878295898, "learning_rate": 3.561768217040301e-05, "loss": 0.0762, "num_input_tokens_seen": 112284304, "step": 52025 }, { "epoch": 8.487765089722675, "grad_norm": 1.6315484046936035, "learning_rate": 3.5614459998802275e-05, "loss": 0.2104, "num_input_tokens_seen": 112295152, "step": 52030 }, { "epoch": 8.48858075040783, "grad_norm": 0.16910041868686676, "learning_rate": 3.5611237612086035e-05, "loss": 0.0993, "num_input_tokens_seen": 112305648, "step": 52035 }, { "epoch": 8.489396411092985, "grad_norm": 1.2727233171463013, "learning_rate": 3.560801501031961e-05, "loss": 0.114, "num_input_tokens_seen": 112316144, "step": 52040 }, { "epoch": 8.49021207177814, "grad_norm": 0.1665848046541214, "learning_rate": 3.56047921935683e-05, "loss": 0.0582, "num_input_tokens_seen": 112328560, "step": 52045 }, { "epoch": 8.491027732463296, "grad_norm": 0.4663834571838379, "learning_rate": 3.560156916189742e-05, "loss": 0.0939, "num_input_tokens_seen": 112338640, "step": 52050 }, { "epoch": 8.49184339314845, "grad_norm": 0.7605535984039307, "learning_rate": 3.55983459153723e-05, "loss": 0.0834, "num_input_tokens_seen": 112349232, "step": 52055 }, { "epoch": 8.492659053833606, "grad_norm": 0.16398510336875916, "learning_rate": 3.5595122454058246e-05, "loss": 0.0741, "num_input_tokens_seen": 112360400, "step": 52060 }, { "epoch": 8.49347471451876, "grad_norm": 0.07951267063617706, "learning_rate": 3.55918987780206e-05, "loss": 0.0628, "num_input_tokens_seen": 112371408, "step": 52065 }, { "epoch": 8.494290375203915, "grad_norm": 0.7087630033493042, "learning_rate": 3.558867488732469e-05, "loss": 0.1364, "num_input_tokens_seen": 112382544, "step": 52070 }, { "epoch": 8.49510603588907, "grad_norm": 2.0785250663757324, "learning_rate": 3.5585450782035843e-05, "loss": 0.1712, "num_input_tokens_seen": 112392688, "step": 52075 }, { "epoch": 8.495921696574225, "grad_norm": 0.3274303674697876, "learning_rate": 3.558222646221941e-05, "loss": 0.1196, "num_input_tokens_seen": 112404240, "step": 52080 }, { "epoch": 8.49673735725938, "grad_norm": 0.2156830132007599, "learning_rate": 3.5579001927940744e-05, "loss": 0.1162, "num_input_tokens_seen": 112414768, "step": 52085 }, { "epoch": 8.497553017944535, "grad_norm": 0.07576823234558105, "learning_rate": 3.5575777179265175e-05, "loss": 0.1641, "num_input_tokens_seen": 112425424, "step": 52090 }, { "epoch": 8.49836867862969, "grad_norm": 0.13575178384780884, "learning_rate": 3.557255221625806e-05, "loss": 0.0862, "num_input_tokens_seen": 112436720, "step": 52095 }, { "epoch": 8.499184339314844, "grad_norm": 0.46838733553886414, "learning_rate": 3.556932703898477e-05, "loss": 0.2293, "num_input_tokens_seen": 112446896, "step": 52100 }, { "epoch": 8.5, "grad_norm": 0.04084919020533562, "learning_rate": 3.556610164751066e-05, "loss": 0.0668, "num_input_tokens_seen": 112458064, "step": 52105 }, { "epoch": 8.500815660685156, "grad_norm": 0.4751873016357422, "learning_rate": 3.556287604190109e-05, "loss": 0.0701, "num_input_tokens_seen": 112469200, "step": 52110 }, { "epoch": 8.50163132137031, "grad_norm": 3.117898941040039, "learning_rate": 3.555965022222144e-05, "loss": 0.3429, "num_input_tokens_seen": 112479888, "step": 52115 }, { "epoch": 8.502446982055465, "grad_norm": 0.47680217027664185, "learning_rate": 3.555642418853708e-05, "loss": 0.3404, "num_input_tokens_seen": 112490896, "step": 52120 }, { "epoch": 8.50326264274062, "grad_norm": 0.17491136491298676, "learning_rate": 3.55531979409134e-05, "loss": 0.1084, "num_input_tokens_seen": 112503152, "step": 52125 }, { "epoch": 8.504078303425775, "grad_norm": 0.2691986560821533, "learning_rate": 3.554997147941577e-05, "loss": 0.1666, "num_input_tokens_seen": 112513936, "step": 52130 }, { "epoch": 8.50489396411093, "grad_norm": 1.434309959411621, "learning_rate": 3.554674480410958e-05, "loss": 0.1681, "num_input_tokens_seen": 112523536, "step": 52135 }, { "epoch": 8.505709624796085, "grad_norm": 0.08212113380432129, "learning_rate": 3.554351791506024e-05, "loss": 0.1595, "num_input_tokens_seen": 112534512, "step": 52140 }, { "epoch": 8.50652528548124, "grad_norm": 0.773959755897522, "learning_rate": 3.554029081233312e-05, "loss": 0.2353, "num_input_tokens_seen": 112546032, "step": 52145 }, { "epoch": 8.507340946166394, "grad_norm": 1.2641667127609253, "learning_rate": 3.553706349599364e-05, "loss": 0.2079, "num_input_tokens_seen": 112556016, "step": 52150 }, { "epoch": 8.50815660685155, "grad_norm": 0.9897227883338928, "learning_rate": 3.553383596610721e-05, "loss": 0.0854, "num_input_tokens_seen": 112565520, "step": 52155 }, { "epoch": 8.508972267536706, "grad_norm": 0.13593991100788116, "learning_rate": 3.553060822273922e-05, "loss": 0.0355, "num_input_tokens_seen": 112575824, "step": 52160 }, { "epoch": 8.50978792822186, "grad_norm": 1.9166802167892456, "learning_rate": 3.5527380265955105e-05, "loss": 0.0898, "num_input_tokens_seen": 112587056, "step": 52165 }, { "epoch": 8.510603588907015, "grad_norm": 0.2206590622663498, "learning_rate": 3.552415209582026e-05, "loss": 0.0144, "num_input_tokens_seen": 112598192, "step": 52170 }, { "epoch": 8.51141924959217, "grad_norm": 0.4158288240432739, "learning_rate": 3.552092371240013e-05, "loss": 0.0271, "num_input_tokens_seen": 112609584, "step": 52175 }, { "epoch": 8.512234910277325, "grad_norm": 1.2309507131576538, "learning_rate": 3.551769511576013e-05, "loss": 0.1191, "num_input_tokens_seen": 112620720, "step": 52180 }, { "epoch": 8.513050570962479, "grad_norm": 0.1448352038860321, "learning_rate": 3.55144663059657e-05, "loss": 0.0629, "num_input_tokens_seen": 112631696, "step": 52185 }, { "epoch": 8.513866231647635, "grad_norm": 0.260559618473053, "learning_rate": 3.5511237283082274e-05, "loss": 0.158, "num_input_tokens_seen": 112641872, "step": 52190 }, { "epoch": 8.51468189233279, "grad_norm": 0.2262922078371048, "learning_rate": 3.550800804717529e-05, "loss": 0.033, "num_input_tokens_seen": 112652240, "step": 52195 }, { "epoch": 8.515497553017944, "grad_norm": 0.122391477227211, "learning_rate": 3.550477859831019e-05, "loss": 0.026, "num_input_tokens_seen": 112663824, "step": 52200 }, { "epoch": 8.5163132137031, "grad_norm": 0.5510165691375732, "learning_rate": 3.550154893655243e-05, "loss": 0.0549, "num_input_tokens_seen": 112675216, "step": 52205 }, { "epoch": 8.517128874388254, "grad_norm": 0.3898826539516449, "learning_rate": 3.549831906196745e-05, "loss": 0.0302, "num_input_tokens_seen": 112684464, "step": 52210 }, { "epoch": 8.51794453507341, "grad_norm": 0.14635978639125824, "learning_rate": 3.549508897462072e-05, "loss": 0.0437, "num_input_tokens_seen": 112695120, "step": 52215 }, { "epoch": 8.518760195758565, "grad_norm": 0.1197909414768219, "learning_rate": 3.54918586745777e-05, "loss": 0.0373, "num_input_tokens_seen": 112705776, "step": 52220 }, { "epoch": 8.51957585644372, "grad_norm": 0.13032737374305725, "learning_rate": 3.548862816190385e-05, "loss": 0.2131, "num_input_tokens_seen": 112716368, "step": 52225 }, { "epoch": 8.520391517128875, "grad_norm": 0.45316267013549805, "learning_rate": 3.548539743666465e-05, "loss": 0.0332, "num_input_tokens_seen": 112727440, "step": 52230 }, { "epoch": 8.521207177814029, "grad_norm": 1.9456642866134644, "learning_rate": 3.5482166498925565e-05, "loss": 0.2792, "num_input_tokens_seen": 112736336, "step": 52235 }, { "epoch": 8.522022838499185, "grad_norm": 1.022364616394043, "learning_rate": 3.5478935348752086e-05, "loss": 0.1217, "num_input_tokens_seen": 112747312, "step": 52240 }, { "epoch": 8.522838499184338, "grad_norm": 0.5901534557342529, "learning_rate": 3.547570398620968e-05, "loss": 0.289, "num_input_tokens_seen": 112758160, "step": 52245 }, { "epoch": 8.523654159869494, "grad_norm": 0.7935692667961121, "learning_rate": 3.547247241136384e-05, "loss": 0.1136, "num_input_tokens_seen": 112768080, "step": 52250 }, { "epoch": 8.52446982055465, "grad_norm": 0.14394952356815338, "learning_rate": 3.546924062428008e-05, "loss": 0.116, "num_input_tokens_seen": 112778416, "step": 52255 }, { "epoch": 8.525285481239804, "grad_norm": 1.472119927406311, "learning_rate": 3.546600862502385e-05, "loss": 0.1124, "num_input_tokens_seen": 112788944, "step": 52260 }, { "epoch": 8.52610114192496, "grad_norm": 0.05966883525252342, "learning_rate": 3.54627764136607e-05, "loss": 0.0265, "num_input_tokens_seen": 112799664, "step": 52265 }, { "epoch": 8.526916802610113, "grad_norm": 0.1918697953224182, "learning_rate": 3.545954399025611e-05, "loss": 0.2808, "num_input_tokens_seen": 112809840, "step": 52270 }, { "epoch": 8.52773246329527, "grad_norm": 0.7055855393409729, "learning_rate": 3.5456311354875585e-05, "loss": 0.1202, "num_input_tokens_seen": 112820240, "step": 52275 }, { "epoch": 8.528548123980425, "grad_norm": 0.09746824949979782, "learning_rate": 3.545307850758465e-05, "loss": 0.1178, "num_input_tokens_seen": 112830704, "step": 52280 }, { "epoch": 8.529363784665579, "grad_norm": 0.9721487760543823, "learning_rate": 3.5449845448448804e-05, "loss": 0.0801, "num_input_tokens_seen": 112842352, "step": 52285 }, { "epoch": 8.530179445350734, "grad_norm": 0.6786628365516663, "learning_rate": 3.54466121775336e-05, "loss": 0.1496, "num_input_tokens_seen": 112853904, "step": 52290 }, { "epoch": 8.530995106035888, "grad_norm": 0.9294056296348572, "learning_rate": 3.544337869490454e-05, "loss": 0.0835, "num_input_tokens_seen": 112863504, "step": 52295 }, { "epoch": 8.531810766721044, "grad_norm": 0.2239755243062973, "learning_rate": 3.544014500062717e-05, "loss": 0.0438, "num_input_tokens_seen": 112873808, "step": 52300 }, { "epoch": 8.5326264274062, "grad_norm": 1.4879475831985474, "learning_rate": 3.543691109476702e-05, "loss": 0.1582, "num_input_tokens_seen": 112883696, "step": 52305 }, { "epoch": 8.533442088091354, "grad_norm": 0.15004466474056244, "learning_rate": 3.543367697738962e-05, "loss": 0.0407, "num_input_tokens_seen": 112893424, "step": 52310 }, { "epoch": 8.53425774877651, "grad_norm": 0.18103931844234467, "learning_rate": 3.543044264856051e-05, "loss": 0.1101, "num_input_tokens_seen": 112904016, "step": 52315 }, { "epoch": 8.535073409461663, "grad_norm": 1.0650873184204102, "learning_rate": 3.5427208108345265e-05, "loss": 0.0983, "num_input_tokens_seen": 112915376, "step": 52320 }, { "epoch": 8.535889070146819, "grad_norm": 0.05544290319085121, "learning_rate": 3.5423973356809415e-05, "loss": 0.1038, "num_input_tokens_seen": 112925424, "step": 52325 }, { "epoch": 8.536704730831975, "grad_norm": 0.1162441298365593, "learning_rate": 3.542073839401851e-05, "loss": 0.1596, "num_input_tokens_seen": 112936656, "step": 52330 }, { "epoch": 8.537520391517129, "grad_norm": 0.06786376237869263, "learning_rate": 3.541750322003814e-05, "loss": 0.0695, "num_input_tokens_seen": 112947600, "step": 52335 }, { "epoch": 8.538336052202284, "grad_norm": 0.033679235726594925, "learning_rate": 3.541426783493385e-05, "loss": 0.0201, "num_input_tokens_seen": 112957840, "step": 52340 }, { "epoch": 8.539151712887438, "grad_norm": 0.9994202852249146, "learning_rate": 3.54110322387712e-05, "loss": 0.061, "num_input_tokens_seen": 112969488, "step": 52345 }, { "epoch": 8.539967373572594, "grad_norm": 0.5472522377967834, "learning_rate": 3.5407796431615775e-05, "loss": 0.0296, "num_input_tokens_seen": 112979216, "step": 52350 }, { "epoch": 8.540783034257748, "grad_norm": 0.3772125244140625, "learning_rate": 3.5404560413533154e-05, "loss": 0.0251, "num_input_tokens_seen": 112990736, "step": 52355 }, { "epoch": 8.541598694942904, "grad_norm": 1.0997424125671387, "learning_rate": 3.540132418458892e-05, "loss": 0.2335, "num_input_tokens_seen": 113001296, "step": 52360 }, { "epoch": 8.54241435562806, "grad_norm": 0.03787146136164665, "learning_rate": 3.5398087744848655e-05, "loss": 0.0966, "num_input_tokens_seen": 113011312, "step": 52365 }, { "epoch": 8.543230016313213, "grad_norm": 0.25709104537963867, "learning_rate": 3.5394851094377944e-05, "loss": 0.0595, "num_input_tokens_seen": 113020688, "step": 52370 }, { "epoch": 8.544045676998369, "grad_norm": 0.2832893133163452, "learning_rate": 3.53916142332424e-05, "loss": 0.0547, "num_input_tokens_seen": 113031888, "step": 52375 }, { "epoch": 8.544861337683523, "grad_norm": 1.142756462097168, "learning_rate": 3.5388377161507606e-05, "loss": 0.149, "num_input_tokens_seen": 113042800, "step": 52380 }, { "epoch": 8.545676998368679, "grad_norm": 0.24979156255722046, "learning_rate": 3.538513987923917e-05, "loss": 0.0222, "num_input_tokens_seen": 113052176, "step": 52385 }, { "epoch": 8.546492659053834, "grad_norm": 0.021530237048864365, "learning_rate": 3.53819023865027e-05, "loss": 0.1783, "num_input_tokens_seen": 113062608, "step": 52390 }, { "epoch": 8.547308319738988, "grad_norm": 0.9156792759895325, "learning_rate": 3.53786646833638e-05, "loss": 0.1064, "num_input_tokens_seen": 113075248, "step": 52395 }, { "epoch": 8.548123980424144, "grad_norm": 0.0671289786696434, "learning_rate": 3.537542676988811e-05, "loss": 0.1004, "num_input_tokens_seen": 113086256, "step": 52400 }, { "epoch": 8.548939641109298, "grad_norm": 1.2010853290557861, "learning_rate": 3.537218864614122e-05, "loss": 0.0585, "num_input_tokens_seen": 113096752, "step": 52405 }, { "epoch": 8.549755301794454, "grad_norm": 0.02383730374276638, "learning_rate": 3.5368950312188775e-05, "loss": 0.0677, "num_input_tokens_seen": 113106480, "step": 52410 }, { "epoch": 8.550570962479608, "grad_norm": 0.05394671857357025, "learning_rate": 3.53657117680964e-05, "loss": 0.1429, "num_input_tokens_seen": 113117424, "step": 52415 }, { "epoch": 8.551386623164763, "grad_norm": 2.031140089035034, "learning_rate": 3.5362473013929726e-05, "loss": 0.1665, "num_input_tokens_seen": 113127728, "step": 52420 }, { "epoch": 8.552202283849919, "grad_norm": 1.3766446113586426, "learning_rate": 3.5359234049754395e-05, "loss": 0.1057, "num_input_tokens_seen": 113138000, "step": 52425 }, { "epoch": 8.553017944535073, "grad_norm": 0.2356065958738327, "learning_rate": 3.5355994875636025e-05, "loss": 0.1338, "num_input_tokens_seen": 113148752, "step": 52430 }, { "epoch": 8.553833605220229, "grad_norm": 0.06690789759159088, "learning_rate": 3.53527554916403e-05, "loss": 0.058, "num_input_tokens_seen": 113159600, "step": 52435 }, { "epoch": 8.554649265905383, "grad_norm": 1.6127958297729492, "learning_rate": 3.5349515897832846e-05, "loss": 0.086, "num_input_tokens_seen": 113170416, "step": 52440 }, { "epoch": 8.555464926590538, "grad_norm": 1.2350280284881592, "learning_rate": 3.5346276094279326e-05, "loss": 0.1365, "num_input_tokens_seen": 113179984, "step": 52445 }, { "epoch": 8.556280587275694, "grad_norm": 1.0790265798568726, "learning_rate": 3.534303608104539e-05, "loss": 0.1138, "num_input_tokens_seen": 113189680, "step": 52450 }, { "epoch": 8.557096247960848, "grad_norm": 0.6921663284301758, "learning_rate": 3.533979585819671e-05, "loss": 0.0938, "num_input_tokens_seen": 113200112, "step": 52455 }, { "epoch": 8.557911908646004, "grad_norm": 0.3804577887058258, "learning_rate": 3.533655542579895e-05, "loss": 0.0813, "num_input_tokens_seen": 113210416, "step": 52460 }, { "epoch": 8.558727569331158, "grad_norm": 0.04477493092417717, "learning_rate": 3.533331478391778e-05, "loss": 0.0082, "num_input_tokens_seen": 113221392, "step": 52465 }, { "epoch": 8.559543230016313, "grad_norm": 0.22156327962875366, "learning_rate": 3.533007393261888e-05, "loss": 0.3254, "num_input_tokens_seen": 113232528, "step": 52470 }, { "epoch": 8.560358890701469, "grad_norm": 0.4401692748069763, "learning_rate": 3.5326832871967926e-05, "loss": 0.1724, "num_input_tokens_seen": 113243056, "step": 52475 }, { "epoch": 8.561174551386623, "grad_norm": 1.8889089822769165, "learning_rate": 3.53235916020306e-05, "loss": 0.2279, "num_input_tokens_seen": 113254576, "step": 52480 }, { "epoch": 8.561990212071779, "grad_norm": 0.0828947201371193, "learning_rate": 3.53203501228726e-05, "loss": 0.1226, "num_input_tokens_seen": 113263472, "step": 52485 }, { "epoch": 8.562805872756933, "grad_norm": 0.6436769962310791, "learning_rate": 3.5317108434559604e-05, "loss": 0.0354, "num_input_tokens_seen": 113276112, "step": 52490 }, { "epoch": 8.563621533442088, "grad_norm": 0.31105315685272217, "learning_rate": 3.531386653715732e-05, "loss": 0.0491, "num_input_tokens_seen": 113285520, "step": 52495 }, { "epoch": 8.564437194127244, "grad_norm": 1.3270888328552246, "learning_rate": 3.531062443073145e-05, "loss": 0.1125, "num_input_tokens_seen": 113296304, "step": 52500 }, { "epoch": 8.565252854812398, "grad_norm": 0.04772128537297249, "learning_rate": 3.53073821153477e-05, "loss": 0.0922, "num_input_tokens_seen": 113307632, "step": 52505 }, { "epoch": 8.566068515497554, "grad_norm": 1.5177644491195679, "learning_rate": 3.530413959107177e-05, "loss": 0.098, "num_input_tokens_seen": 113318352, "step": 52510 }, { "epoch": 8.566884176182707, "grad_norm": 0.05223431438207626, "learning_rate": 3.530089685796938e-05, "loss": 0.2105, "num_input_tokens_seen": 113329552, "step": 52515 }, { "epoch": 8.567699836867863, "grad_norm": 0.5368385314941406, "learning_rate": 3.5297653916106245e-05, "loss": 0.0827, "num_input_tokens_seen": 113339216, "step": 52520 }, { "epoch": 8.568515497553017, "grad_norm": 1.2105319499969482, "learning_rate": 3.529441076554809e-05, "loss": 0.2175, "num_input_tokens_seen": 113349904, "step": 52525 }, { "epoch": 8.569331158238173, "grad_norm": 0.9760951399803162, "learning_rate": 3.529116740636065e-05, "loss": 0.0871, "num_input_tokens_seen": 113358832, "step": 52530 }, { "epoch": 8.570146818923329, "grad_norm": 0.11879494786262512, "learning_rate": 3.528792383860964e-05, "loss": 0.1373, "num_input_tokens_seen": 113370480, "step": 52535 }, { "epoch": 8.570962479608482, "grad_norm": 0.05868314951658249, "learning_rate": 3.5284680062360805e-05, "loss": 0.025, "num_input_tokens_seen": 113380592, "step": 52540 }, { "epoch": 8.571778140293638, "grad_norm": 0.6603497862815857, "learning_rate": 3.528143607767988e-05, "loss": 0.1314, "num_input_tokens_seen": 113392112, "step": 52545 }, { "epoch": 8.572593800978792, "grad_norm": 0.08219785243272781, "learning_rate": 3.527819188463261e-05, "loss": 0.0942, "num_input_tokens_seen": 113403152, "step": 52550 }, { "epoch": 8.573409461663948, "grad_norm": 0.1843755692243576, "learning_rate": 3.527494748328474e-05, "loss": 0.0222, "num_input_tokens_seen": 113413424, "step": 52555 }, { "epoch": 8.574225122349104, "grad_norm": 1.9261094331741333, "learning_rate": 3.5271702873702025e-05, "loss": 0.1591, "num_input_tokens_seen": 113424848, "step": 52560 }, { "epoch": 8.575040783034257, "grad_norm": 1.5911905765533447, "learning_rate": 3.526845805595023e-05, "loss": 0.3736, "num_input_tokens_seen": 113434608, "step": 52565 }, { "epoch": 8.575856443719413, "grad_norm": 0.04965768754482269, "learning_rate": 3.526521303009509e-05, "loss": 0.1559, "num_input_tokens_seen": 113446224, "step": 52570 }, { "epoch": 8.576672104404567, "grad_norm": 0.09059443324804306, "learning_rate": 3.52619677962024e-05, "loss": 0.2573, "num_input_tokens_seen": 113457296, "step": 52575 }, { "epoch": 8.577487765089723, "grad_norm": 0.02927604503929615, "learning_rate": 3.5258722354337906e-05, "loss": 0.0369, "num_input_tokens_seen": 113468720, "step": 52580 }, { "epoch": 8.578303425774878, "grad_norm": 1.1237528324127197, "learning_rate": 3.525547670456739e-05, "loss": 0.1737, "num_input_tokens_seen": 113479152, "step": 52585 }, { "epoch": 8.579119086460032, "grad_norm": 1.5708491802215576, "learning_rate": 3.5252230846956636e-05, "loss": 0.1594, "num_input_tokens_seen": 113490608, "step": 52590 }, { "epoch": 8.579934747145188, "grad_norm": 0.7508123517036438, "learning_rate": 3.524898478157141e-05, "loss": 0.0384, "num_input_tokens_seen": 113501968, "step": 52595 }, { "epoch": 8.580750407830342, "grad_norm": 0.25022879242897034, "learning_rate": 3.524573850847751e-05, "loss": 0.0615, "num_input_tokens_seen": 113512944, "step": 52600 }, { "epoch": 8.581566068515498, "grad_norm": 0.09723370522260666, "learning_rate": 3.5242492027740713e-05, "loss": 0.0258, "num_input_tokens_seen": 113524304, "step": 52605 }, { "epoch": 8.582381729200652, "grad_norm": 1.6535488367080688, "learning_rate": 3.5239245339426827e-05, "loss": 0.1802, "num_input_tokens_seen": 113536400, "step": 52610 }, { "epoch": 8.583197389885807, "grad_norm": 1.2278910875320435, "learning_rate": 3.5235998443601645e-05, "loss": 0.2273, "num_input_tokens_seen": 113546992, "step": 52615 }, { "epoch": 8.584013050570963, "grad_norm": 0.971698522567749, "learning_rate": 3.523275134033097e-05, "loss": 0.0549, "num_input_tokens_seen": 113557808, "step": 52620 }, { "epoch": 8.584828711256117, "grad_norm": 0.19600753486156464, "learning_rate": 3.5229504029680614e-05, "loss": 0.1122, "num_input_tokens_seen": 113568336, "step": 52625 }, { "epoch": 8.585644371941273, "grad_norm": 0.49116793274879456, "learning_rate": 3.522625651171638e-05, "loss": 0.0683, "num_input_tokens_seen": 113579280, "step": 52630 }, { "epoch": 8.586460032626427, "grad_norm": 0.3957135081291199, "learning_rate": 3.5223008786504074e-05, "loss": 0.0494, "num_input_tokens_seen": 113591024, "step": 52635 }, { "epoch": 8.587275693311582, "grad_norm": 0.706154465675354, "learning_rate": 3.5219760854109537e-05, "loss": 0.1485, "num_input_tokens_seen": 113602032, "step": 52640 }, { "epoch": 8.588091353996738, "grad_norm": 0.24008111655712128, "learning_rate": 3.521651271459858e-05, "loss": 0.0402, "num_input_tokens_seen": 113612688, "step": 52645 }, { "epoch": 8.588907014681892, "grad_norm": 1.3657729625701904, "learning_rate": 3.521326436803703e-05, "loss": 0.0877, "num_input_tokens_seen": 113622224, "step": 52650 }, { "epoch": 8.589722675367048, "grad_norm": 1.6764377355575562, "learning_rate": 3.521001581449072e-05, "loss": 0.1233, "num_input_tokens_seen": 113631920, "step": 52655 }, { "epoch": 8.590538336052202, "grad_norm": 0.3410123288631439, "learning_rate": 3.520676705402549e-05, "loss": 0.0379, "num_input_tokens_seen": 113642672, "step": 52660 }, { "epoch": 8.591353996737357, "grad_norm": 0.09522531181573868, "learning_rate": 3.520351808670717e-05, "loss": 0.1676, "num_input_tokens_seen": 113653456, "step": 52665 }, { "epoch": 8.592169657422513, "grad_norm": 0.4220746159553528, "learning_rate": 3.520026891260162e-05, "loss": 0.1216, "num_input_tokens_seen": 113664304, "step": 52670 }, { "epoch": 8.592985318107667, "grad_norm": 2.889477491378784, "learning_rate": 3.519701953177468e-05, "loss": 0.1965, "num_input_tokens_seen": 113675024, "step": 52675 }, { "epoch": 8.593800978792823, "grad_norm": 0.03298897296190262, "learning_rate": 3.51937699442922e-05, "loss": 0.2052, "num_input_tokens_seen": 113685200, "step": 52680 }, { "epoch": 8.594616639477977, "grad_norm": 0.5005972981452942, "learning_rate": 3.519052015022004e-05, "loss": 0.0739, "num_input_tokens_seen": 113696528, "step": 52685 }, { "epoch": 8.595432300163132, "grad_norm": 0.04303469508886337, "learning_rate": 3.5187270149624066e-05, "loss": 0.1436, "num_input_tokens_seen": 113707568, "step": 52690 }, { "epoch": 8.596247960848288, "grad_norm": 0.08540280163288116, "learning_rate": 3.518401994257014e-05, "loss": 0.0148, "num_input_tokens_seen": 113719280, "step": 52695 }, { "epoch": 8.597063621533442, "grad_norm": 2.3951852321624756, "learning_rate": 3.5180769529124124e-05, "loss": 0.2423, "num_input_tokens_seen": 113729072, "step": 52700 }, { "epoch": 8.597879282218598, "grad_norm": 1.1749117374420166, "learning_rate": 3.5177518909351895e-05, "loss": 0.0669, "num_input_tokens_seen": 113740784, "step": 52705 }, { "epoch": 8.598694942903752, "grad_norm": 0.06564731895923615, "learning_rate": 3.517426808331934e-05, "loss": 0.1092, "num_input_tokens_seen": 113751088, "step": 52710 }, { "epoch": 8.599510603588907, "grad_norm": 1.2389968633651733, "learning_rate": 3.5171017051092335e-05, "loss": 0.3148, "num_input_tokens_seen": 113762672, "step": 52715 }, { "epoch": 8.600326264274061, "grad_norm": 0.17740988731384277, "learning_rate": 3.5167765812736776e-05, "loss": 0.0778, "num_input_tokens_seen": 113773072, "step": 52720 }, { "epoch": 8.601141924959217, "grad_norm": 0.07796266674995422, "learning_rate": 3.516451436831854e-05, "loss": 0.0344, "num_input_tokens_seen": 113784080, "step": 52725 }, { "epoch": 8.601957585644373, "grad_norm": 0.7353047132492065, "learning_rate": 3.5161262717903514e-05, "loss": 0.0541, "num_input_tokens_seen": 113794704, "step": 52730 }, { "epoch": 8.602773246329527, "grad_norm": 0.28926289081573486, "learning_rate": 3.5158010861557616e-05, "loss": 0.0346, "num_input_tokens_seen": 113806512, "step": 52735 }, { "epoch": 8.603588907014682, "grad_norm": 1.3381948471069336, "learning_rate": 3.5154758799346737e-05, "loss": 0.2258, "num_input_tokens_seen": 113817872, "step": 52740 }, { "epoch": 8.604404567699836, "grad_norm": 0.7404674291610718, "learning_rate": 3.5151506531336795e-05, "loss": 0.0755, "num_input_tokens_seen": 113829808, "step": 52745 }, { "epoch": 8.605220228384992, "grad_norm": 0.01823987066745758, "learning_rate": 3.514825405759369e-05, "loss": 0.0205, "num_input_tokens_seen": 113840656, "step": 52750 }, { "epoch": 8.606035889070148, "grad_norm": 0.22635076940059662, "learning_rate": 3.5145001378183344e-05, "loss": 0.0995, "num_input_tokens_seen": 113851664, "step": 52755 }, { "epoch": 8.606851549755302, "grad_norm": 0.5516262650489807, "learning_rate": 3.514174849317168e-05, "loss": 0.0756, "num_input_tokens_seen": 113862288, "step": 52760 }, { "epoch": 8.607667210440457, "grad_norm": 1.681876540184021, "learning_rate": 3.513849540262462e-05, "loss": 0.0756, "num_input_tokens_seen": 113873456, "step": 52765 }, { "epoch": 8.608482871125611, "grad_norm": 0.02754034660756588, "learning_rate": 3.513524210660808e-05, "loss": 0.2123, "num_input_tokens_seen": 113883536, "step": 52770 }, { "epoch": 8.609298531810767, "grad_norm": 0.2697581946849823, "learning_rate": 3.513198860518801e-05, "loss": 0.0407, "num_input_tokens_seen": 113893520, "step": 52775 }, { "epoch": 8.61011419249592, "grad_norm": 0.10773982852697372, "learning_rate": 3.5128734898430325e-05, "loss": 0.1936, "num_input_tokens_seen": 113904592, "step": 52780 }, { "epoch": 8.610929853181077, "grad_norm": 0.22267241775989532, "learning_rate": 3.512548098640098e-05, "loss": 0.0691, "num_input_tokens_seen": 113916272, "step": 52785 }, { "epoch": 8.611745513866232, "grad_norm": 0.13228316605091095, "learning_rate": 3.512222686916592e-05, "loss": 0.0353, "num_input_tokens_seen": 113928336, "step": 52790 }, { "epoch": 8.612561174551386, "grad_norm": 0.483734667301178, "learning_rate": 3.51189725467911e-05, "loss": 0.1742, "num_input_tokens_seen": 113939920, "step": 52795 }, { "epoch": 8.613376835236542, "grad_norm": 1.8148266077041626, "learning_rate": 3.511571801934246e-05, "loss": 0.1, "num_input_tokens_seen": 113949616, "step": 52800 }, { "epoch": 8.614192495921696, "grad_norm": 0.3122309744358063, "learning_rate": 3.511246328688596e-05, "loss": 0.0459, "num_input_tokens_seen": 113959408, "step": 52805 }, { "epoch": 8.615008156606851, "grad_norm": 0.16548782587051392, "learning_rate": 3.510920834948756e-05, "loss": 0.2827, "num_input_tokens_seen": 113970736, "step": 52810 }, { "epoch": 8.615823817292007, "grad_norm": 0.510585367679596, "learning_rate": 3.5105953207213224e-05, "loss": 0.1064, "num_input_tokens_seen": 113981008, "step": 52815 }, { "epoch": 8.616639477977161, "grad_norm": 0.1827545315027237, "learning_rate": 3.510269786012894e-05, "loss": 0.0817, "num_input_tokens_seen": 113992816, "step": 52820 }, { "epoch": 8.617455138662317, "grad_norm": 1.037408709526062, "learning_rate": 3.509944230830066e-05, "loss": 0.0458, "num_input_tokens_seen": 114005392, "step": 52825 }, { "epoch": 8.61827079934747, "grad_norm": 0.20402735471725464, "learning_rate": 3.5096186551794375e-05, "loss": 0.0496, "num_input_tokens_seen": 114015600, "step": 52830 }, { "epoch": 8.619086460032626, "grad_norm": 1.8530120849609375, "learning_rate": 3.5092930590676054e-05, "loss": 0.2674, "num_input_tokens_seen": 114026448, "step": 52835 }, { "epoch": 8.619902120717782, "grad_norm": 1.3726917505264282, "learning_rate": 3.5089674425011696e-05, "loss": 0.1239, "num_input_tokens_seen": 114037552, "step": 52840 }, { "epoch": 8.620717781402936, "grad_norm": 0.07043557614088058, "learning_rate": 3.5086418054867283e-05, "loss": 0.1587, "num_input_tokens_seen": 114049072, "step": 52845 }, { "epoch": 8.621533442088092, "grad_norm": 1.5763593912124634, "learning_rate": 3.508316148030881e-05, "loss": 0.1291, "num_input_tokens_seen": 114059824, "step": 52850 }, { "epoch": 8.622349102773246, "grad_norm": 0.2029757797718048, "learning_rate": 3.507990470140229e-05, "loss": 0.1524, "num_input_tokens_seen": 114070960, "step": 52855 }, { "epoch": 8.623164763458401, "grad_norm": 0.18390251696109772, "learning_rate": 3.507664771821371e-05, "loss": 0.0631, "num_input_tokens_seen": 114082224, "step": 52860 }, { "epoch": 8.623980424143557, "grad_norm": 0.09777892380952835, "learning_rate": 3.507339053080907e-05, "loss": 0.0658, "num_input_tokens_seen": 114091760, "step": 52865 }, { "epoch": 8.624796084828711, "grad_norm": 1.5169143676757812, "learning_rate": 3.5070133139254404e-05, "loss": 0.1089, "num_input_tokens_seen": 114100912, "step": 52870 }, { "epoch": 8.625611745513867, "grad_norm": 0.08067968487739563, "learning_rate": 3.506687554361571e-05, "loss": 0.1511, "num_input_tokens_seen": 114111856, "step": 52875 }, { "epoch": 8.62642740619902, "grad_norm": 1.5091379880905151, "learning_rate": 3.506361774395901e-05, "loss": 0.2353, "num_input_tokens_seen": 114121872, "step": 52880 }, { "epoch": 8.627243066884176, "grad_norm": 0.6956257224082947, "learning_rate": 3.506035974035032e-05, "loss": 0.14, "num_input_tokens_seen": 114132304, "step": 52885 }, { "epoch": 8.62805872756933, "grad_norm": 1.5911556482315063, "learning_rate": 3.50571015328557e-05, "loss": 0.2046, "num_input_tokens_seen": 114142160, "step": 52890 }, { "epoch": 8.628874388254486, "grad_norm": 0.6523218154907227, "learning_rate": 3.505384312154114e-05, "loss": 0.1534, "num_input_tokens_seen": 114153392, "step": 52895 }, { "epoch": 8.629690048939642, "grad_norm": 0.22488515079021454, "learning_rate": 3.505058450647271e-05, "loss": 0.0743, "num_input_tokens_seen": 114165648, "step": 52900 }, { "epoch": 8.630505709624796, "grad_norm": 0.4950055480003357, "learning_rate": 3.5047325687716425e-05, "loss": 0.0867, "num_input_tokens_seen": 114175248, "step": 52905 }, { "epoch": 8.631321370309951, "grad_norm": 0.9048983454704285, "learning_rate": 3.504406666533834e-05, "loss": 0.2842, "num_input_tokens_seen": 114186256, "step": 52910 }, { "epoch": 8.632137030995105, "grad_norm": 1.6640087366104126, "learning_rate": 3.50408074394045e-05, "loss": 0.0476, "num_input_tokens_seen": 114197264, "step": 52915 }, { "epoch": 8.632952691680261, "grad_norm": 0.7710539698600769, "learning_rate": 3.503754800998097e-05, "loss": 0.0269, "num_input_tokens_seen": 114208944, "step": 52920 }, { "epoch": 8.633768352365417, "grad_norm": 0.10191284865140915, "learning_rate": 3.503428837713379e-05, "loss": 0.1259, "num_input_tokens_seen": 114219504, "step": 52925 }, { "epoch": 8.63458401305057, "grad_norm": 0.1359483152627945, "learning_rate": 3.5031028540929023e-05, "loss": 0.0188, "num_input_tokens_seen": 114229072, "step": 52930 }, { "epoch": 8.635399673735726, "grad_norm": 0.6612180471420288, "learning_rate": 3.502776850143275e-05, "loss": 0.0892, "num_input_tokens_seen": 114238768, "step": 52935 }, { "epoch": 8.63621533442088, "grad_norm": 0.03401526063680649, "learning_rate": 3.5024508258711006e-05, "loss": 0.0106, "num_input_tokens_seen": 114250256, "step": 52940 }, { "epoch": 8.637030995106036, "grad_norm": 1.6418368816375732, "learning_rate": 3.5021247812829904e-05, "loss": 0.1594, "num_input_tokens_seen": 114259696, "step": 52945 }, { "epoch": 8.63784665579119, "grad_norm": 0.144802525639534, "learning_rate": 3.5017987163855495e-05, "loss": 0.2103, "num_input_tokens_seen": 114270288, "step": 52950 }, { "epoch": 8.638662316476346, "grad_norm": 0.08053570240736008, "learning_rate": 3.501472631185387e-05, "loss": 0.0709, "num_input_tokens_seen": 114280784, "step": 52955 }, { "epoch": 8.639477977161501, "grad_norm": 1.2460087537765503, "learning_rate": 3.5011465256891106e-05, "loss": 0.0901, "num_input_tokens_seen": 114290832, "step": 52960 }, { "epoch": 8.640293637846655, "grad_norm": 0.8580738306045532, "learning_rate": 3.5008203999033304e-05, "loss": 0.1201, "num_input_tokens_seen": 114301648, "step": 52965 }, { "epoch": 8.641109298531811, "grad_norm": 0.6255552172660828, "learning_rate": 3.500494253834655e-05, "loss": 0.0942, "num_input_tokens_seen": 114311728, "step": 52970 }, { "epoch": 8.641924959216965, "grad_norm": 0.38919273018836975, "learning_rate": 3.500168087489695e-05, "loss": 0.1853, "num_input_tokens_seen": 114322512, "step": 52975 }, { "epoch": 8.64274061990212, "grad_norm": 0.569433867931366, "learning_rate": 3.499841900875058e-05, "loss": 0.1483, "num_input_tokens_seen": 114332592, "step": 52980 }, { "epoch": 8.643556280587276, "grad_norm": 1.4224754571914673, "learning_rate": 3.499515693997358e-05, "loss": 0.2237, "num_input_tokens_seen": 114343696, "step": 52985 }, { "epoch": 8.64437194127243, "grad_norm": 0.11553514748811722, "learning_rate": 3.499189466863204e-05, "loss": 0.1503, "num_input_tokens_seen": 114354928, "step": 52990 }, { "epoch": 8.645187601957586, "grad_norm": 0.7507365942001343, "learning_rate": 3.4988632194792085e-05, "loss": 0.0622, "num_input_tokens_seen": 114365392, "step": 52995 }, { "epoch": 8.64600326264274, "grad_norm": 0.6708755493164062, "learning_rate": 3.4985369518519825e-05, "loss": 0.1032, "num_input_tokens_seen": 114374992, "step": 53000 }, { "epoch": 8.646818923327896, "grad_norm": 1.2432000637054443, "learning_rate": 3.498210663988138e-05, "loss": 0.032, "num_input_tokens_seen": 114385808, "step": 53005 }, { "epoch": 8.647634584013051, "grad_norm": 0.6605715751647949, "learning_rate": 3.497884355894289e-05, "loss": 0.0908, "num_input_tokens_seen": 114397072, "step": 53010 }, { "epoch": 8.648450244698205, "grad_norm": 1.1002888679504395, "learning_rate": 3.4975580275770464e-05, "loss": 0.0614, "num_input_tokens_seen": 114408624, "step": 53015 }, { "epoch": 8.649265905383361, "grad_norm": 0.1315363049507141, "learning_rate": 3.4972316790430256e-05, "loss": 0.2173, "num_input_tokens_seen": 114418512, "step": 53020 }, { "epoch": 8.650081566068515, "grad_norm": 1.533881664276123, "learning_rate": 3.4969053102988386e-05, "loss": 0.05, "num_input_tokens_seen": 114427792, "step": 53025 }, { "epoch": 8.65089722675367, "grad_norm": 0.1593366414308548, "learning_rate": 3.496578921351102e-05, "loss": 0.327, "num_input_tokens_seen": 114439888, "step": 53030 }, { "epoch": 8.651712887438826, "grad_norm": 0.31711986660957336, "learning_rate": 3.4962525122064296e-05, "loss": 0.0558, "num_input_tokens_seen": 114451280, "step": 53035 }, { "epoch": 8.65252854812398, "grad_norm": 0.12808483839035034, "learning_rate": 3.4959260828714355e-05, "loss": 0.0837, "num_input_tokens_seen": 114463376, "step": 53040 }, { "epoch": 8.653344208809136, "grad_norm": 0.15937639772891998, "learning_rate": 3.495599633352736e-05, "loss": 0.0773, "num_input_tokens_seen": 114474896, "step": 53045 }, { "epoch": 8.65415986949429, "grad_norm": 0.061936408281326294, "learning_rate": 3.495273163656947e-05, "loss": 0.1427, "num_input_tokens_seen": 114486000, "step": 53050 }, { "epoch": 8.654975530179446, "grad_norm": 0.03605318069458008, "learning_rate": 3.494946673790684e-05, "loss": 0.0805, "num_input_tokens_seen": 114497456, "step": 53055 }, { "epoch": 8.655791190864601, "grad_norm": 0.2606339454650879, "learning_rate": 3.4946201637605654e-05, "loss": 0.0236, "num_input_tokens_seen": 114508368, "step": 53060 }, { "epoch": 8.656606851549755, "grad_norm": 0.7976143956184387, "learning_rate": 3.4942936335732065e-05, "loss": 0.15, "num_input_tokens_seen": 114519888, "step": 53065 }, { "epoch": 8.65742251223491, "grad_norm": 0.09576039761304855, "learning_rate": 3.493967083235227e-05, "loss": 0.1637, "num_input_tokens_seen": 114529712, "step": 53070 }, { "epoch": 8.658238172920065, "grad_norm": 0.44670364260673523, "learning_rate": 3.493640512753243e-05, "loss": 0.1505, "num_input_tokens_seen": 114540528, "step": 53075 }, { "epoch": 8.65905383360522, "grad_norm": 0.34559211134910583, "learning_rate": 3.493313922133874e-05, "loss": 0.1821, "num_input_tokens_seen": 114552016, "step": 53080 }, { "epoch": 8.659869494290374, "grad_norm": 0.9607167840003967, "learning_rate": 3.492987311383738e-05, "loss": 0.0324, "num_input_tokens_seen": 114562672, "step": 53085 }, { "epoch": 8.66068515497553, "grad_norm": 0.055849965661764145, "learning_rate": 3.492660680509454e-05, "loss": 0.0413, "num_input_tokens_seen": 114574320, "step": 53090 }, { "epoch": 8.661500815660686, "grad_norm": 0.21651484072208405, "learning_rate": 3.492334029517642e-05, "loss": 0.1786, "num_input_tokens_seen": 114585456, "step": 53095 }, { "epoch": 8.66231647634584, "grad_norm": 0.05117970332503319, "learning_rate": 3.492007358414923e-05, "loss": 0.1111, "num_input_tokens_seen": 114596400, "step": 53100 }, { "epoch": 8.663132137030995, "grad_norm": 1.2784432172775269, "learning_rate": 3.491680667207915e-05, "loss": 0.0558, "num_input_tokens_seen": 114607120, "step": 53105 }, { "epoch": 8.66394779771615, "grad_norm": 0.9067749381065369, "learning_rate": 3.491353955903242e-05, "loss": 0.0635, "num_input_tokens_seen": 114617808, "step": 53110 }, { "epoch": 8.664763458401305, "grad_norm": 0.6989530324935913, "learning_rate": 3.491027224507522e-05, "loss": 0.0594, "num_input_tokens_seen": 114627664, "step": 53115 }, { "epoch": 8.66557911908646, "grad_norm": 0.03955959901213646, "learning_rate": 3.490700473027379e-05, "loss": 0.1005, "num_input_tokens_seen": 114638064, "step": 53120 }, { "epoch": 8.666394779771615, "grad_norm": 0.049337513744831085, "learning_rate": 3.490373701469434e-05, "loss": 0.1907, "num_input_tokens_seen": 114647760, "step": 53125 }, { "epoch": 8.66721044045677, "grad_norm": 1.22119140625, "learning_rate": 3.490046909840308e-05, "loss": 0.1322, "num_input_tokens_seen": 114658192, "step": 53130 }, { "epoch": 8.668026101141924, "grad_norm": 0.0558633990585804, "learning_rate": 3.489720098146627e-05, "loss": 0.0106, "num_input_tokens_seen": 114669392, "step": 53135 }, { "epoch": 8.66884176182708, "grad_norm": 1.4943022727966309, "learning_rate": 3.4893932663950126e-05, "loss": 0.1584, "num_input_tokens_seen": 114681168, "step": 53140 }, { "epoch": 8.669657422512234, "grad_norm": 1.940343976020813, "learning_rate": 3.489066414592088e-05, "loss": 0.0952, "num_input_tokens_seen": 114692688, "step": 53145 }, { "epoch": 8.67047308319739, "grad_norm": 2.068502426147461, "learning_rate": 3.488739542744478e-05, "loss": 0.1235, "num_input_tokens_seen": 114703472, "step": 53150 }, { "epoch": 8.671288743882545, "grad_norm": 0.05481525883078575, "learning_rate": 3.488412650858807e-05, "loss": 0.0902, "num_input_tokens_seen": 114714704, "step": 53155 }, { "epoch": 8.6721044045677, "grad_norm": 0.31999292969703674, "learning_rate": 3.4880857389416996e-05, "loss": 0.0331, "num_input_tokens_seen": 114725200, "step": 53160 }, { "epoch": 8.672920065252855, "grad_norm": 0.5142622590065002, "learning_rate": 3.4877588069997814e-05, "loss": 0.0759, "num_input_tokens_seen": 114735568, "step": 53165 }, { "epoch": 8.673735725938009, "grad_norm": 0.0526764914393425, "learning_rate": 3.487431855039678e-05, "loss": 0.0382, "num_input_tokens_seen": 114746448, "step": 53170 }, { "epoch": 8.674551386623165, "grad_norm": 0.5775672197341919, "learning_rate": 3.487104883068015e-05, "loss": 0.1358, "num_input_tokens_seen": 114758704, "step": 53175 }, { "epoch": 8.67536704730832, "grad_norm": 0.22309604287147522, "learning_rate": 3.486777891091419e-05, "loss": 0.1295, "num_input_tokens_seen": 114769744, "step": 53180 }, { "epoch": 8.676182707993474, "grad_norm": 1.557898998260498, "learning_rate": 3.486450879116518e-05, "loss": 0.078, "num_input_tokens_seen": 114780112, "step": 53185 }, { "epoch": 8.67699836867863, "grad_norm": 1.2064014673233032, "learning_rate": 3.486123847149939e-05, "loss": 0.1204, "num_input_tokens_seen": 114790384, "step": 53190 }, { "epoch": 8.677814029363784, "grad_norm": 0.6474229693412781, "learning_rate": 3.485796795198308e-05, "loss": 0.029, "num_input_tokens_seen": 114801168, "step": 53195 }, { "epoch": 8.67862969004894, "grad_norm": 1.3456910848617554, "learning_rate": 3.485469723268255e-05, "loss": 0.167, "num_input_tokens_seen": 114812560, "step": 53200 }, { "epoch": 8.679445350734095, "grad_norm": 0.602898359298706, "learning_rate": 3.4851426313664074e-05, "loss": 0.1042, "num_input_tokens_seen": 114823664, "step": 53205 }, { "epoch": 8.68026101141925, "grad_norm": 0.8314574360847473, "learning_rate": 3.484815519499395e-05, "loss": 0.0384, "num_input_tokens_seen": 114835056, "step": 53210 }, { "epoch": 8.681076672104405, "grad_norm": 0.12137751281261444, "learning_rate": 3.484488387673847e-05, "loss": 0.2335, "num_input_tokens_seen": 114845744, "step": 53215 }, { "epoch": 8.681892332789559, "grad_norm": 0.6489961743354797, "learning_rate": 3.4841612358963925e-05, "loss": 0.1506, "num_input_tokens_seen": 114857072, "step": 53220 }, { "epoch": 8.682707993474715, "grad_norm": 0.6561000943183899, "learning_rate": 3.483834064173662e-05, "loss": 0.0811, "num_input_tokens_seen": 114868304, "step": 53225 }, { "epoch": 8.68352365415987, "grad_norm": 1.7570544481277466, "learning_rate": 3.483506872512286e-05, "loss": 0.1095, "num_input_tokens_seen": 114877360, "step": 53230 }, { "epoch": 8.684339314845024, "grad_norm": 0.03677009418606758, "learning_rate": 3.483179660918896e-05, "loss": 0.0764, "num_input_tokens_seen": 114888496, "step": 53235 }, { "epoch": 8.68515497553018, "grad_norm": 1.3182131052017212, "learning_rate": 3.482852429400123e-05, "loss": 0.1796, "num_input_tokens_seen": 114900656, "step": 53240 }, { "epoch": 8.685970636215334, "grad_norm": 0.07260408252477646, "learning_rate": 3.482525177962599e-05, "loss": 0.0334, "num_input_tokens_seen": 114912208, "step": 53245 }, { "epoch": 8.68678629690049, "grad_norm": 3.209885835647583, "learning_rate": 3.482197906612955e-05, "loss": 0.2159, "num_input_tokens_seen": 114924144, "step": 53250 }, { "epoch": 8.687601957585644, "grad_norm": 0.5966078639030457, "learning_rate": 3.481870615357825e-05, "loss": 0.1325, "num_input_tokens_seen": 114936496, "step": 53255 }, { "epoch": 8.6884176182708, "grad_norm": 0.23217934370040894, "learning_rate": 3.481543304203842e-05, "loss": 0.2296, "num_input_tokens_seen": 114946928, "step": 53260 }, { "epoch": 8.689233278955955, "grad_norm": 0.9678106904029846, "learning_rate": 3.481215973157637e-05, "loss": 0.0813, "num_input_tokens_seen": 114957392, "step": 53265 }, { "epoch": 8.690048939641109, "grad_norm": 0.15667815506458282, "learning_rate": 3.480888622225847e-05, "loss": 0.0877, "num_input_tokens_seen": 114968816, "step": 53270 }, { "epoch": 8.690864600326265, "grad_norm": 0.9832010865211487, "learning_rate": 3.480561251415104e-05, "loss": 0.0725, "num_input_tokens_seen": 114979408, "step": 53275 }, { "epoch": 8.691680261011419, "grad_norm": 1.2201226949691772, "learning_rate": 3.4802338607320436e-05, "loss": 0.1128, "num_input_tokens_seen": 114989776, "step": 53280 }, { "epoch": 8.692495921696574, "grad_norm": 0.5072615146636963, "learning_rate": 3.4799064501833e-05, "loss": 0.1652, "num_input_tokens_seen": 115000976, "step": 53285 }, { "epoch": 8.69331158238173, "grad_norm": 0.20362144708633423, "learning_rate": 3.47957901977551e-05, "loss": 0.1488, "num_input_tokens_seen": 115010608, "step": 53290 }, { "epoch": 8.694127243066884, "grad_norm": 1.6933541297912598, "learning_rate": 3.479251569515308e-05, "loss": 0.1351, "num_input_tokens_seen": 115021904, "step": 53295 }, { "epoch": 8.69494290375204, "grad_norm": 1.6077899932861328, "learning_rate": 3.478924099409331e-05, "loss": 0.0936, "num_input_tokens_seen": 115032240, "step": 53300 }, { "epoch": 8.695758564437194, "grad_norm": 0.10175283998250961, "learning_rate": 3.4785966094642145e-05, "loss": 0.0872, "num_input_tokens_seen": 115043024, "step": 53305 }, { "epoch": 8.69657422512235, "grad_norm": 1.9581025838851929, "learning_rate": 3.478269099686597e-05, "loss": 0.2409, "num_input_tokens_seen": 115053584, "step": 53310 }, { "epoch": 8.697389885807503, "grad_norm": 0.12812893092632294, "learning_rate": 3.4779415700831146e-05, "loss": 0.1815, "num_input_tokens_seen": 115064368, "step": 53315 }, { "epoch": 8.698205546492659, "grad_norm": 1.142639398574829, "learning_rate": 3.477614020660406e-05, "loss": 0.0813, "num_input_tokens_seen": 115073712, "step": 53320 }, { "epoch": 8.699021207177815, "grad_norm": 1.1733347177505493, "learning_rate": 3.477286451425109e-05, "loss": 0.112, "num_input_tokens_seen": 115085616, "step": 53325 }, { "epoch": 8.699836867862969, "grad_norm": 1.5202486515045166, "learning_rate": 3.476958862383862e-05, "loss": 0.1997, "num_input_tokens_seen": 115096080, "step": 53330 }, { "epoch": 8.700652528548124, "grad_norm": 0.1843380331993103, "learning_rate": 3.476631253543305e-05, "loss": 0.0761, "num_input_tokens_seen": 115107344, "step": 53335 }, { "epoch": 8.701468189233278, "grad_norm": 0.7378100156784058, "learning_rate": 3.4763036249100766e-05, "loss": 0.1574, "num_input_tokens_seen": 115117712, "step": 53340 }, { "epoch": 8.702283849918434, "grad_norm": 1.4506255388259888, "learning_rate": 3.4759759764908156e-05, "loss": 0.2184, "num_input_tokens_seen": 115128048, "step": 53345 }, { "epoch": 8.70309951060359, "grad_norm": 0.08733662962913513, "learning_rate": 3.4756483082921644e-05, "loss": 0.0559, "num_input_tokens_seen": 115139152, "step": 53350 }, { "epoch": 8.703915171288743, "grad_norm": 1.7550034523010254, "learning_rate": 3.475320620320762e-05, "loss": 0.1663, "num_input_tokens_seen": 115150032, "step": 53355 }, { "epoch": 8.7047308319739, "grad_norm": 0.15107505023479462, "learning_rate": 3.47499291258325e-05, "loss": 0.0124, "num_input_tokens_seen": 115159536, "step": 53360 }, { "epoch": 8.705546492659053, "grad_norm": 1.0234438180923462, "learning_rate": 3.4746651850862705e-05, "loss": 0.1743, "num_input_tokens_seen": 115170576, "step": 53365 }, { "epoch": 8.706362153344209, "grad_norm": 0.032373204827308655, "learning_rate": 3.4743374378364636e-05, "loss": 0.1992, "num_input_tokens_seen": 115182160, "step": 53370 }, { "epoch": 8.707177814029365, "grad_norm": 0.13644464313983917, "learning_rate": 3.474009670840473e-05, "loss": 0.0583, "num_input_tokens_seen": 115190960, "step": 53375 }, { "epoch": 8.707993474714518, "grad_norm": 0.14046938717365265, "learning_rate": 3.473681884104941e-05, "loss": 0.032, "num_input_tokens_seen": 115201680, "step": 53380 }, { "epoch": 8.708809135399674, "grad_norm": 1.0496768951416016, "learning_rate": 3.47335407763651e-05, "loss": 0.1111, "num_input_tokens_seen": 115212496, "step": 53385 }, { "epoch": 8.709624796084828, "grad_norm": 2.6581428050994873, "learning_rate": 3.4730262514418245e-05, "loss": 0.1643, "num_input_tokens_seen": 115223888, "step": 53390 }, { "epoch": 8.710440456769984, "grad_norm": 0.09812317788600922, "learning_rate": 3.472698405527527e-05, "loss": 0.1122, "num_input_tokens_seen": 115233616, "step": 53395 }, { "epoch": 8.71125611745514, "grad_norm": 0.6143572330474854, "learning_rate": 3.472370539900262e-05, "loss": 0.0725, "num_input_tokens_seen": 115244496, "step": 53400 }, { "epoch": 8.712071778140293, "grad_norm": 1.311945915222168, "learning_rate": 3.4720426545666754e-05, "loss": 0.155, "num_input_tokens_seen": 115255216, "step": 53405 }, { "epoch": 8.71288743882545, "grad_norm": 0.2692243158817291, "learning_rate": 3.471714749533411e-05, "loss": 0.0354, "num_input_tokens_seen": 115264720, "step": 53410 }, { "epoch": 8.713703099510603, "grad_norm": 0.6298941969871521, "learning_rate": 3.471386824807114e-05, "loss": 0.2322, "num_input_tokens_seen": 115276080, "step": 53415 }, { "epoch": 8.714518760195759, "grad_norm": 0.24568429589271545, "learning_rate": 3.471058880394431e-05, "loss": 0.1146, "num_input_tokens_seen": 115286576, "step": 53420 }, { "epoch": 8.715334420880914, "grad_norm": 0.2572667598724365, "learning_rate": 3.4707309163020087e-05, "loss": 0.1092, "num_input_tokens_seen": 115297200, "step": 53425 }, { "epoch": 8.716150081566068, "grad_norm": 0.10905976593494415, "learning_rate": 3.4704029325364916e-05, "loss": 0.0299, "num_input_tokens_seen": 115308240, "step": 53430 }, { "epoch": 8.716965742251224, "grad_norm": 0.12053941190242767, "learning_rate": 3.470074929104529e-05, "loss": 0.0387, "num_input_tokens_seen": 115319600, "step": 53435 }, { "epoch": 8.717781402936378, "grad_norm": 0.6578710675239563, "learning_rate": 3.469746906012767e-05, "loss": 0.0523, "num_input_tokens_seen": 115330736, "step": 53440 }, { "epoch": 8.718597063621534, "grad_norm": 0.04175400733947754, "learning_rate": 3.469418863267854e-05, "loss": 0.325, "num_input_tokens_seen": 115342672, "step": 53445 }, { "epoch": 8.719412724306688, "grad_norm": 0.031560517847537994, "learning_rate": 3.4690908008764364e-05, "loss": 0.2917, "num_input_tokens_seen": 115354320, "step": 53450 }, { "epoch": 8.720228384991843, "grad_norm": 1.205114483833313, "learning_rate": 3.468762718845166e-05, "loss": 0.3079, "num_input_tokens_seen": 115364880, "step": 53455 }, { "epoch": 8.721044045676999, "grad_norm": 0.1440308392047882, "learning_rate": 3.46843461718069e-05, "loss": 0.133, "num_input_tokens_seen": 115375824, "step": 53460 }, { "epoch": 8.721859706362153, "grad_norm": 0.42310062050819397, "learning_rate": 3.468106495889657e-05, "loss": 0.1426, "num_input_tokens_seen": 115386992, "step": 53465 }, { "epoch": 8.722675367047309, "grad_norm": 0.12657073140144348, "learning_rate": 3.467778354978719e-05, "loss": 0.1101, "num_input_tokens_seen": 115397008, "step": 53470 }, { "epoch": 8.723491027732463, "grad_norm": 0.2029554843902588, "learning_rate": 3.467450194454524e-05, "loss": 0.1735, "num_input_tokens_seen": 115407888, "step": 53475 }, { "epoch": 8.724306688417618, "grad_norm": 0.7587800025939941, "learning_rate": 3.467122014323724e-05, "loss": 0.1448, "num_input_tokens_seen": 115417936, "step": 53480 }, { "epoch": 8.725122349102774, "grad_norm": 0.418096661567688, "learning_rate": 3.4667938145929694e-05, "loss": 0.0812, "num_input_tokens_seen": 115429200, "step": 53485 }, { "epoch": 8.725938009787928, "grad_norm": 0.1043587327003479, "learning_rate": 3.466465595268911e-05, "loss": 0.0962, "num_input_tokens_seen": 115439920, "step": 53490 }, { "epoch": 8.726753670473084, "grad_norm": 0.5376614332199097, "learning_rate": 3.4661373563582024e-05, "loss": 0.0976, "num_input_tokens_seen": 115450608, "step": 53495 }, { "epoch": 8.727569331158238, "grad_norm": 0.4887325167655945, "learning_rate": 3.465809097867494e-05, "loss": 0.1267, "num_input_tokens_seen": 115461680, "step": 53500 }, { "epoch": 8.728384991843393, "grad_norm": 0.836781919002533, "learning_rate": 3.465480819803439e-05, "loss": 0.0309, "num_input_tokens_seen": 115472752, "step": 53505 }, { "epoch": 8.729200652528547, "grad_norm": 0.931130588054657, "learning_rate": 3.465152522172691e-05, "loss": 0.2198, "num_input_tokens_seen": 115483536, "step": 53510 }, { "epoch": 8.730016313213703, "grad_norm": 0.49501925706863403, "learning_rate": 3.4648242049819015e-05, "loss": 0.0489, "num_input_tokens_seen": 115494736, "step": 53515 }, { "epoch": 8.730831973898859, "grad_norm": 0.06370478123426437, "learning_rate": 3.464495868237726e-05, "loss": 0.1208, "num_input_tokens_seen": 115505552, "step": 53520 }, { "epoch": 8.731647634584013, "grad_norm": 0.7363098859786987, "learning_rate": 3.4641675119468186e-05, "loss": 0.1964, "num_input_tokens_seen": 115515920, "step": 53525 }, { "epoch": 8.732463295269168, "grad_norm": 1.1011090278625488, "learning_rate": 3.4638391361158334e-05, "loss": 0.2117, "num_input_tokens_seen": 115526896, "step": 53530 }, { "epoch": 8.733278955954322, "grad_norm": 0.8408984541893005, "learning_rate": 3.4635107407514255e-05, "loss": 0.181, "num_input_tokens_seen": 115536912, "step": 53535 }, { "epoch": 8.734094616639478, "grad_norm": 0.636412501335144, "learning_rate": 3.46318232586025e-05, "loss": 0.0687, "num_input_tokens_seen": 115547728, "step": 53540 }, { "epoch": 8.734910277324634, "grad_norm": 0.5498171448707581, "learning_rate": 3.462853891448963e-05, "loss": 0.0463, "num_input_tokens_seen": 115559184, "step": 53545 }, { "epoch": 8.735725938009788, "grad_norm": 0.06292671710252762, "learning_rate": 3.46252543752422e-05, "loss": 0.1102, "num_input_tokens_seen": 115569552, "step": 53550 }, { "epoch": 8.736541598694943, "grad_norm": 1.7497806549072266, "learning_rate": 3.462196964092677e-05, "loss": 0.1643, "num_input_tokens_seen": 115578928, "step": 53555 }, { "epoch": 8.737357259380097, "grad_norm": 0.5802031755447388, "learning_rate": 3.4618684711609934e-05, "loss": 0.0964, "num_input_tokens_seen": 115589904, "step": 53560 }, { "epoch": 8.738172920065253, "grad_norm": 0.45212042331695557, "learning_rate": 3.4615399587358244e-05, "loss": 0.0882, "num_input_tokens_seen": 115599440, "step": 53565 }, { "epoch": 8.738988580750409, "grad_norm": 0.27094176411628723, "learning_rate": 3.4612114268238284e-05, "loss": 0.1136, "num_input_tokens_seen": 115610064, "step": 53570 }, { "epoch": 8.739804241435563, "grad_norm": 1.7294111251831055, "learning_rate": 3.4608828754316636e-05, "loss": 0.2169, "num_input_tokens_seen": 115621360, "step": 53575 }, { "epoch": 8.740619902120718, "grad_norm": 0.4610711634159088, "learning_rate": 3.4605543045659884e-05, "loss": 0.1821, "num_input_tokens_seen": 115632272, "step": 53580 }, { "epoch": 8.741435562805872, "grad_norm": 0.46715906262397766, "learning_rate": 3.460225714233462e-05, "loss": 0.0526, "num_input_tokens_seen": 115643280, "step": 53585 }, { "epoch": 8.742251223491028, "grad_norm": 0.08039989322423935, "learning_rate": 3.459897104440743e-05, "loss": 0.096, "num_input_tokens_seen": 115654128, "step": 53590 }, { "epoch": 8.743066884176184, "grad_norm": 0.3030228018760681, "learning_rate": 3.4595684751944914e-05, "loss": 0.0709, "num_input_tokens_seen": 115664528, "step": 53595 }, { "epoch": 8.743882544861338, "grad_norm": 1.4216951131820679, "learning_rate": 3.459239826501367e-05, "loss": 0.1821, "num_input_tokens_seen": 115675024, "step": 53600 }, { "epoch": 8.744698205546493, "grad_norm": 0.3492230176925659, "learning_rate": 3.4589111583680315e-05, "loss": 0.0866, "num_input_tokens_seen": 115687056, "step": 53605 }, { "epoch": 8.745513866231647, "grad_norm": 0.24281342327594757, "learning_rate": 3.4585824708011445e-05, "loss": 0.139, "num_input_tokens_seen": 115697840, "step": 53610 }, { "epoch": 8.746329526916803, "grad_norm": 0.5512803196907043, "learning_rate": 3.458253763807368e-05, "loss": 0.0266, "num_input_tokens_seen": 115708848, "step": 53615 }, { "epoch": 8.747145187601957, "grad_norm": 0.6862422227859497, "learning_rate": 3.4579250373933624e-05, "loss": 0.2913, "num_input_tokens_seen": 115719568, "step": 53620 }, { "epoch": 8.747960848287113, "grad_norm": 0.24391111731529236, "learning_rate": 3.457596291565791e-05, "loss": 0.0949, "num_input_tokens_seen": 115730768, "step": 53625 }, { "epoch": 8.748776508972268, "grad_norm": 0.6368191838264465, "learning_rate": 3.457267526331316e-05, "loss": 0.0326, "num_input_tokens_seen": 115741840, "step": 53630 }, { "epoch": 8.749592169657422, "grad_norm": 0.03685532510280609, "learning_rate": 3.456938741696601e-05, "loss": 0.1066, "num_input_tokens_seen": 115752208, "step": 53635 }, { "epoch": 8.750407830342578, "grad_norm": 0.5313101410865784, "learning_rate": 3.456609937668308e-05, "loss": 0.0406, "num_input_tokens_seen": 115762992, "step": 53640 }, { "epoch": 8.751223491027732, "grad_norm": 0.7754632830619812, "learning_rate": 3.456281114253101e-05, "loss": 0.1482, "num_input_tokens_seen": 115774224, "step": 53645 }, { "epoch": 8.752039151712887, "grad_norm": 0.9505372643470764, "learning_rate": 3.455952271457644e-05, "loss": 0.0491, "num_input_tokens_seen": 115785776, "step": 53650 }, { "epoch": 8.752854812398043, "grad_norm": 2.498756170272827, "learning_rate": 3.455623409288601e-05, "loss": 0.2017, "num_input_tokens_seen": 115796656, "step": 53655 }, { "epoch": 8.753670473083197, "grad_norm": 1.7778997421264648, "learning_rate": 3.4552945277526377e-05, "loss": 0.1345, "num_input_tokens_seen": 115808272, "step": 53660 }, { "epoch": 8.754486133768353, "grad_norm": 0.5078904628753662, "learning_rate": 3.454965626856419e-05, "loss": 0.071, "num_input_tokens_seen": 115820144, "step": 53665 }, { "epoch": 8.755301794453507, "grad_norm": 0.29899927973747253, "learning_rate": 3.454636706606611e-05, "loss": 0.2565, "num_input_tokens_seen": 115831888, "step": 53670 }, { "epoch": 8.756117455138662, "grad_norm": 1.0105931758880615, "learning_rate": 3.4543077670098785e-05, "loss": 0.0837, "num_input_tokens_seen": 115843024, "step": 53675 }, { "epoch": 8.756933115823816, "grad_norm": 0.9679227471351624, "learning_rate": 3.4539788080728885e-05, "loss": 0.1461, "num_input_tokens_seen": 115854288, "step": 53680 }, { "epoch": 8.757748776508972, "grad_norm": 0.8311558365821838, "learning_rate": 3.4536498298023066e-05, "loss": 0.0856, "num_input_tokens_seen": 115865296, "step": 53685 }, { "epoch": 8.758564437194128, "grad_norm": 0.6704322099685669, "learning_rate": 3.453320832204803e-05, "loss": 0.0672, "num_input_tokens_seen": 115876304, "step": 53690 }, { "epoch": 8.759380097879282, "grad_norm": 1.217667818069458, "learning_rate": 3.4529918152870425e-05, "loss": 0.0844, "num_input_tokens_seen": 115887632, "step": 53695 }, { "epoch": 8.760195758564437, "grad_norm": 0.6618497967720032, "learning_rate": 3.452662779055694e-05, "loss": 0.1449, "num_input_tokens_seen": 115898640, "step": 53700 }, { "epoch": 8.761011419249591, "grad_norm": 0.6344866156578064, "learning_rate": 3.4523337235174256e-05, "loss": 0.0492, "num_input_tokens_seen": 115909392, "step": 53705 }, { "epoch": 8.761827079934747, "grad_norm": 0.10207716375589371, "learning_rate": 3.452004648678907e-05, "loss": 0.0603, "num_input_tokens_seen": 115920080, "step": 53710 }, { "epoch": 8.762642740619903, "grad_norm": 0.8280075788497925, "learning_rate": 3.4516755545468054e-05, "loss": 0.0428, "num_input_tokens_seen": 115930640, "step": 53715 }, { "epoch": 8.763458401305057, "grad_norm": 0.8650482892990112, "learning_rate": 3.451346441127792e-05, "loss": 0.0633, "num_input_tokens_seen": 115941392, "step": 53720 }, { "epoch": 8.764274061990212, "grad_norm": 0.3431732952594757, "learning_rate": 3.451017308428536e-05, "loss": 0.0703, "num_input_tokens_seen": 115952176, "step": 53725 }, { "epoch": 8.765089722675366, "grad_norm": 0.8964838981628418, "learning_rate": 3.450688156455708e-05, "loss": 0.1209, "num_input_tokens_seen": 115962896, "step": 53730 }, { "epoch": 8.765905383360522, "grad_norm": 1.4433621168136597, "learning_rate": 3.4503589852159776e-05, "loss": 0.0712, "num_input_tokens_seen": 115972944, "step": 53735 }, { "epoch": 8.766721044045678, "grad_norm": 0.14572395384311676, "learning_rate": 3.450029794716018e-05, "loss": 0.1161, "num_input_tokens_seen": 115983600, "step": 53740 }, { "epoch": 8.767536704730832, "grad_norm": 0.10273425281047821, "learning_rate": 3.449700584962499e-05, "loss": 0.1853, "num_input_tokens_seen": 115994832, "step": 53745 }, { "epoch": 8.768352365415987, "grad_norm": 0.04157509282231331, "learning_rate": 3.4493713559620926e-05, "loss": 0.03, "num_input_tokens_seen": 116004848, "step": 53750 }, { "epoch": 8.769168026101141, "grad_norm": 0.6760563254356384, "learning_rate": 3.449042107721471e-05, "loss": 0.06, "num_input_tokens_seen": 116016016, "step": 53755 }, { "epoch": 8.769983686786297, "grad_norm": 0.12030309438705444, "learning_rate": 3.448712840247308e-05, "loss": 0.072, "num_input_tokens_seen": 116027088, "step": 53760 }, { "epoch": 8.770799347471453, "grad_norm": 0.530924916267395, "learning_rate": 3.448383553546275e-05, "loss": 0.1687, "num_input_tokens_seen": 116038192, "step": 53765 }, { "epoch": 8.771615008156607, "grad_norm": 0.11781347543001175, "learning_rate": 3.448054247625046e-05, "loss": 0.1627, "num_input_tokens_seen": 116048272, "step": 53770 }, { "epoch": 8.772430668841762, "grad_norm": 0.7181287407875061, "learning_rate": 3.447724922490296e-05, "loss": 0.0969, "num_input_tokens_seen": 116060112, "step": 53775 }, { "epoch": 8.773246329526916, "grad_norm": 0.1159982681274414, "learning_rate": 3.447395578148697e-05, "loss": 0.0758, "num_input_tokens_seen": 116071216, "step": 53780 }, { "epoch": 8.774061990212072, "grad_norm": 0.3438650965690613, "learning_rate": 3.447066214606926e-05, "loss": 0.1782, "num_input_tokens_seen": 116081360, "step": 53785 }, { "epoch": 8.774877650897226, "grad_norm": 0.4781661033630371, "learning_rate": 3.446736831871656e-05, "loss": 0.0972, "num_input_tokens_seen": 116091216, "step": 53790 }, { "epoch": 8.775693311582382, "grad_norm": 0.09902874380350113, "learning_rate": 3.446407429949563e-05, "loss": 0.0528, "num_input_tokens_seen": 116101168, "step": 53795 }, { "epoch": 8.776508972267537, "grad_norm": 0.4472412168979645, "learning_rate": 3.4460780088473234e-05, "loss": 0.0613, "num_input_tokens_seen": 116112272, "step": 53800 }, { "epoch": 8.777324632952691, "grad_norm": 1.710715413093567, "learning_rate": 3.445748568571612e-05, "loss": 0.2278, "num_input_tokens_seen": 116123184, "step": 53805 }, { "epoch": 8.778140293637847, "grad_norm": 0.3497745394706726, "learning_rate": 3.4454191091291065e-05, "loss": 0.0378, "num_input_tokens_seen": 116133776, "step": 53810 }, { "epoch": 8.778955954323001, "grad_norm": 1.8717206716537476, "learning_rate": 3.4450896305264835e-05, "loss": 0.098, "num_input_tokens_seen": 116144688, "step": 53815 }, { "epoch": 8.779771615008157, "grad_norm": 0.1997430920600891, "learning_rate": 3.44476013277042e-05, "loss": 0.115, "num_input_tokens_seen": 116154288, "step": 53820 }, { "epoch": 8.780587275693312, "grad_norm": 0.4003358483314514, "learning_rate": 3.444430615867593e-05, "loss": 0.0588, "num_input_tokens_seen": 116165584, "step": 53825 }, { "epoch": 8.781402936378466, "grad_norm": 0.17017154395580292, "learning_rate": 3.444101079824683e-05, "loss": 0.0475, "num_input_tokens_seen": 116176848, "step": 53830 }, { "epoch": 8.782218597063622, "grad_norm": 0.07249663770198822, "learning_rate": 3.4437715246483665e-05, "loss": 0.0821, "num_input_tokens_seen": 116186800, "step": 53835 }, { "epoch": 8.783034257748776, "grad_norm": 0.06559067964553833, "learning_rate": 3.4434419503453225e-05, "loss": 0.1163, "num_input_tokens_seen": 116196656, "step": 53840 }, { "epoch": 8.783849918433932, "grad_norm": 0.8899879455566406, "learning_rate": 3.443112356922231e-05, "loss": 0.0636, "num_input_tokens_seen": 116207760, "step": 53845 }, { "epoch": 8.784665579119086, "grad_norm": 1.8563474416732788, "learning_rate": 3.442782744385771e-05, "loss": 0.1173, "num_input_tokens_seen": 116216752, "step": 53850 }, { "epoch": 8.785481239804241, "grad_norm": 2.275294780731201, "learning_rate": 3.442453112742622e-05, "loss": 0.1644, "num_input_tokens_seen": 116227568, "step": 53855 }, { "epoch": 8.786296900489397, "grad_norm": 0.8585104942321777, "learning_rate": 3.442123461999466e-05, "loss": 0.0396, "num_input_tokens_seen": 116238736, "step": 53860 }, { "epoch": 8.78711256117455, "grad_norm": 2.053318977355957, "learning_rate": 3.441793792162982e-05, "loss": 0.3069, "num_input_tokens_seen": 116249584, "step": 53865 }, { "epoch": 8.787928221859707, "grad_norm": 0.09388406574726105, "learning_rate": 3.441464103239853e-05, "loss": 0.0623, "num_input_tokens_seen": 116259664, "step": 53870 }, { "epoch": 8.78874388254486, "grad_norm": 1.6244971752166748, "learning_rate": 3.4411343952367584e-05, "loss": 0.1479, "num_input_tokens_seen": 116270352, "step": 53875 }, { "epoch": 8.789559543230016, "grad_norm": 1.3126401901245117, "learning_rate": 3.440804668160382e-05, "loss": 0.0741, "num_input_tokens_seen": 116281232, "step": 53880 }, { "epoch": 8.790375203915172, "grad_norm": 0.2679999768733978, "learning_rate": 3.440474922017406e-05, "loss": 0.0296, "num_input_tokens_seen": 116291728, "step": 53885 }, { "epoch": 8.791190864600326, "grad_norm": 0.7202944755554199, "learning_rate": 3.4401451568145125e-05, "loss": 0.1628, "num_input_tokens_seen": 116302480, "step": 53890 }, { "epoch": 8.792006525285482, "grad_norm": 1.750565528869629, "learning_rate": 3.439815372558384e-05, "loss": 0.2154, "num_input_tokens_seen": 116314192, "step": 53895 }, { "epoch": 8.792822185970635, "grad_norm": 0.0842670276761055, "learning_rate": 3.439485569255706e-05, "loss": 0.1945, "num_input_tokens_seen": 116325616, "step": 53900 }, { "epoch": 8.793637846655791, "grad_norm": 0.04587702080607414, "learning_rate": 3.4391557469131604e-05, "loss": 0.0132, "num_input_tokens_seen": 116335344, "step": 53905 }, { "epoch": 8.794453507340947, "grad_norm": 0.18276245892047882, "learning_rate": 3.438825905537432e-05, "loss": 0.0294, "num_input_tokens_seen": 116346480, "step": 53910 }, { "epoch": 8.7952691680261, "grad_norm": 0.09991907328367233, "learning_rate": 3.438496045135206e-05, "loss": 0.2065, "num_input_tokens_seen": 116358576, "step": 53915 }, { "epoch": 8.796084828711257, "grad_norm": 0.7723411321640015, "learning_rate": 3.438166165713167e-05, "loss": 0.1438, "num_input_tokens_seen": 116369872, "step": 53920 }, { "epoch": 8.79690048939641, "grad_norm": 0.4512215852737427, "learning_rate": 3.437836267278e-05, "loss": 0.0297, "num_input_tokens_seen": 116380432, "step": 53925 }, { "epoch": 8.797716150081566, "grad_norm": 0.11331134289503098, "learning_rate": 3.437506349836392e-05, "loss": 0.0865, "num_input_tokens_seen": 116390544, "step": 53930 }, { "epoch": 8.798531810766722, "grad_norm": 0.09418024867773056, "learning_rate": 3.437176413395028e-05, "loss": 0.051, "num_input_tokens_seen": 116402512, "step": 53935 }, { "epoch": 8.799347471451876, "grad_norm": 0.22354675829410553, "learning_rate": 3.436846457960595e-05, "loss": 0.1201, "num_input_tokens_seen": 116413488, "step": 53940 }, { "epoch": 8.800163132137031, "grad_norm": 0.12552878260612488, "learning_rate": 3.436516483539781e-05, "loss": 0.038, "num_input_tokens_seen": 116424976, "step": 53945 }, { "epoch": 8.800978792822185, "grad_norm": 0.2985049784183502, "learning_rate": 3.436186490139272e-05, "loss": 0.1534, "num_input_tokens_seen": 116435856, "step": 53950 }, { "epoch": 8.801794453507341, "grad_norm": 0.37806040048599243, "learning_rate": 3.4358564777657556e-05, "loss": 0.2591, "num_input_tokens_seen": 116447472, "step": 53955 }, { "epoch": 8.802610114192497, "grad_norm": 0.8309801816940308, "learning_rate": 3.435526446425921e-05, "loss": 0.1986, "num_input_tokens_seen": 116458064, "step": 53960 }, { "epoch": 8.80342577487765, "grad_norm": 0.4249274730682373, "learning_rate": 3.4351963961264554e-05, "loss": 0.0549, "num_input_tokens_seen": 116468464, "step": 53965 }, { "epoch": 8.804241435562806, "grad_norm": 0.019961806014180183, "learning_rate": 3.4348663268740485e-05, "loss": 0.1578, "num_input_tokens_seen": 116477488, "step": 53970 }, { "epoch": 8.80505709624796, "grad_norm": 0.6162349581718445, "learning_rate": 3.4345362386753896e-05, "loss": 0.1118, "num_input_tokens_seen": 116488368, "step": 53975 }, { "epoch": 8.805872756933116, "grad_norm": 0.6028348803520203, "learning_rate": 3.434206131537169e-05, "loss": 0.1241, "num_input_tokens_seen": 116498704, "step": 53980 }, { "epoch": 8.80668841761827, "grad_norm": 0.26062753796577454, "learning_rate": 3.433876005466076e-05, "loss": 0.0556, "num_input_tokens_seen": 116510000, "step": 53985 }, { "epoch": 8.807504078303426, "grad_norm": 0.23696812987327576, "learning_rate": 3.4335458604688e-05, "loss": 0.1719, "num_input_tokens_seen": 116521552, "step": 53990 }, { "epoch": 8.808319738988581, "grad_norm": 0.4588795006275177, "learning_rate": 3.433215696552034e-05, "loss": 0.0508, "num_input_tokens_seen": 116532208, "step": 53995 }, { "epoch": 8.809135399673735, "grad_norm": 0.200884610414505, "learning_rate": 3.432885513722467e-05, "loss": 0.085, "num_input_tokens_seen": 116542608, "step": 54000 }, { "epoch": 8.809951060358891, "grad_norm": 0.2220146358013153, "learning_rate": 3.432555311986793e-05, "loss": 0.2421, "num_input_tokens_seen": 116552656, "step": 54005 }, { "epoch": 8.810766721044045, "grad_norm": 0.11485530436038971, "learning_rate": 3.4322250913517016e-05, "loss": 0.0881, "num_input_tokens_seen": 116565008, "step": 54010 }, { "epoch": 8.8115823817292, "grad_norm": 0.14516374468803406, "learning_rate": 3.431894851823886e-05, "loss": 0.1015, "num_input_tokens_seen": 116574576, "step": 54015 }, { "epoch": 8.812398042414356, "grad_norm": 0.043232012540102005, "learning_rate": 3.4315645934100396e-05, "loss": 0.0467, "num_input_tokens_seen": 116584944, "step": 54020 }, { "epoch": 8.81321370309951, "grad_norm": 0.6430656909942627, "learning_rate": 3.431234316116855e-05, "loss": 0.1041, "num_input_tokens_seen": 116595920, "step": 54025 }, { "epoch": 8.814029363784666, "grad_norm": 0.6252656579017639, "learning_rate": 3.430904019951025e-05, "loss": 0.0474, "num_input_tokens_seen": 116606704, "step": 54030 }, { "epoch": 8.81484502446982, "grad_norm": 0.22598619759082794, "learning_rate": 3.430573704919244e-05, "loss": 0.0485, "num_input_tokens_seen": 116618032, "step": 54035 }, { "epoch": 8.815660685154976, "grad_norm": 0.12566550076007843, "learning_rate": 3.4302433710282076e-05, "loss": 0.1121, "num_input_tokens_seen": 116628464, "step": 54040 }, { "epoch": 8.81647634584013, "grad_norm": 1.9877161979675293, "learning_rate": 3.4299130182846075e-05, "loss": 0.0634, "num_input_tokens_seen": 116639408, "step": 54045 }, { "epoch": 8.817292006525285, "grad_norm": 0.04754677787423134, "learning_rate": 3.429582646695142e-05, "loss": 0.3368, "num_input_tokens_seen": 116649808, "step": 54050 }, { "epoch": 8.818107667210441, "grad_norm": 0.18693295121192932, "learning_rate": 3.4292522562665045e-05, "loss": 0.1601, "num_input_tokens_seen": 116661584, "step": 54055 }, { "epoch": 8.818923327895595, "grad_norm": 0.04867332801222801, "learning_rate": 3.42892184700539e-05, "loss": 0.0367, "num_input_tokens_seen": 116671664, "step": 54060 }, { "epoch": 8.81973898858075, "grad_norm": 0.17292089760303497, "learning_rate": 3.428591418918497e-05, "loss": 0.1007, "num_input_tokens_seen": 116683248, "step": 54065 }, { "epoch": 8.820554649265905, "grad_norm": 1.0096813440322876, "learning_rate": 3.4282609720125207e-05, "loss": 0.1865, "num_input_tokens_seen": 116693936, "step": 54070 }, { "epoch": 8.82137030995106, "grad_norm": 0.06009567156434059, "learning_rate": 3.427930506294158e-05, "loss": 0.0825, "num_input_tokens_seen": 116703856, "step": 54075 }, { "epoch": 8.822185970636216, "grad_norm": 0.038908880203962326, "learning_rate": 3.427600021770106e-05, "loss": 0.0357, "num_input_tokens_seen": 116713136, "step": 54080 }, { "epoch": 8.82300163132137, "grad_norm": 0.0752197653055191, "learning_rate": 3.4272695184470645e-05, "loss": 0.0413, "num_input_tokens_seen": 116723696, "step": 54085 }, { "epoch": 8.823817292006526, "grad_norm": 0.8850474953651428, "learning_rate": 3.426938996331728e-05, "loss": 0.2125, "num_input_tokens_seen": 116735280, "step": 54090 }, { "epoch": 8.82463295269168, "grad_norm": 1.7151517868041992, "learning_rate": 3.426608455430797e-05, "loss": 0.1241, "num_input_tokens_seen": 116746416, "step": 54095 }, { "epoch": 8.825448613376835, "grad_norm": 0.4062533378601074, "learning_rate": 3.426277895750971e-05, "loss": 0.1478, "num_input_tokens_seen": 116757232, "step": 54100 }, { "epoch": 8.826264274061991, "grad_norm": 0.062092941254377365, "learning_rate": 3.4259473172989484e-05, "loss": 0.1846, "num_input_tokens_seen": 116768304, "step": 54105 }, { "epoch": 8.827079934747145, "grad_norm": 0.7436076402664185, "learning_rate": 3.425616720081429e-05, "loss": 0.16, "num_input_tokens_seen": 116779216, "step": 54110 }, { "epoch": 8.8278955954323, "grad_norm": 1.5941979885101318, "learning_rate": 3.425286104105112e-05, "loss": 0.0407, "num_input_tokens_seen": 116790160, "step": 54115 }, { "epoch": 8.828711256117455, "grad_norm": 0.18929028511047363, "learning_rate": 3.424955469376698e-05, "loss": 0.0294, "num_input_tokens_seen": 116801936, "step": 54120 }, { "epoch": 8.82952691680261, "grad_norm": 0.30490046739578247, "learning_rate": 3.424624815902888e-05, "loss": 0.1686, "num_input_tokens_seen": 116812816, "step": 54125 }, { "epoch": 8.830342577487766, "grad_norm": 0.6968782544136047, "learning_rate": 3.424294143690384e-05, "loss": 0.1506, "num_input_tokens_seen": 116821840, "step": 54130 }, { "epoch": 8.83115823817292, "grad_norm": 0.11558196693658829, "learning_rate": 3.423963452745886e-05, "loss": 0.0803, "num_input_tokens_seen": 116832912, "step": 54135 }, { "epoch": 8.831973898858076, "grad_norm": 0.4007997214794159, "learning_rate": 3.423632743076096e-05, "loss": 0.0212, "num_input_tokens_seen": 116842896, "step": 54140 }, { "epoch": 8.83278955954323, "grad_norm": 1.4936789274215698, "learning_rate": 3.423302014687718e-05, "loss": 0.142, "num_input_tokens_seen": 116853744, "step": 54145 }, { "epoch": 8.833605220228385, "grad_norm": 1.1878392696380615, "learning_rate": 3.4229712675874526e-05, "loss": 0.0972, "num_input_tokens_seen": 116863824, "step": 54150 }, { "epoch": 8.83442088091354, "grad_norm": 0.015650462359189987, "learning_rate": 3.422640501782004e-05, "loss": 0.0401, "num_input_tokens_seen": 116875056, "step": 54155 }, { "epoch": 8.835236541598695, "grad_norm": 0.0941283106803894, "learning_rate": 3.422309717278074e-05, "loss": 0.148, "num_input_tokens_seen": 116886768, "step": 54160 }, { "epoch": 8.83605220228385, "grad_norm": 0.9078091979026794, "learning_rate": 3.421978914082369e-05, "loss": 0.0848, "num_input_tokens_seen": 116898032, "step": 54165 }, { "epoch": 8.836867862969005, "grad_norm": 0.3207542896270752, "learning_rate": 3.421648092201591e-05, "loss": 0.0357, "num_input_tokens_seen": 116909008, "step": 54170 }, { "epoch": 8.83768352365416, "grad_norm": 0.5151383876800537, "learning_rate": 3.421317251642445e-05, "loss": 0.0295, "num_input_tokens_seen": 116920464, "step": 54175 }, { "epoch": 8.838499184339314, "grad_norm": 0.26590320467948914, "learning_rate": 3.420986392411637e-05, "loss": 0.0783, "num_input_tokens_seen": 116931568, "step": 54180 }, { "epoch": 8.83931484502447, "grad_norm": 1.396988034248352, "learning_rate": 3.420655514515871e-05, "loss": 0.079, "num_input_tokens_seen": 116943312, "step": 54185 }, { "epoch": 8.840130505709626, "grad_norm": 0.09727350622415543, "learning_rate": 3.420324617961853e-05, "loss": 0.0316, "num_input_tokens_seen": 116954480, "step": 54190 }, { "epoch": 8.84094616639478, "grad_norm": 0.4853753447532654, "learning_rate": 3.4199937027562885e-05, "loss": 0.1045, "num_input_tokens_seen": 116964880, "step": 54195 }, { "epoch": 8.841761827079935, "grad_norm": 0.4889524579048157, "learning_rate": 3.4196627689058854e-05, "loss": 0.1469, "num_input_tokens_seen": 116974896, "step": 54200 }, { "epoch": 8.84257748776509, "grad_norm": 0.4666008949279785, "learning_rate": 3.419331816417349e-05, "loss": 0.0453, "num_input_tokens_seen": 116985072, "step": 54205 }, { "epoch": 8.843393148450245, "grad_norm": 2.382402181625366, "learning_rate": 3.419000845297388e-05, "loss": 0.0776, "num_input_tokens_seen": 116995856, "step": 54210 }, { "epoch": 8.844208809135399, "grad_norm": 1.337551236152649, "learning_rate": 3.418669855552708e-05, "loss": 0.256, "num_input_tokens_seen": 117007152, "step": 54215 }, { "epoch": 8.845024469820554, "grad_norm": 1.4559677839279175, "learning_rate": 3.418338847190018e-05, "loss": 0.1002, "num_input_tokens_seen": 117017040, "step": 54220 }, { "epoch": 8.84584013050571, "grad_norm": 0.9080584049224854, "learning_rate": 3.418007820216027e-05, "loss": 0.2293, "num_input_tokens_seen": 117028304, "step": 54225 }, { "epoch": 8.846655791190864, "grad_norm": 0.05631916597485542, "learning_rate": 3.417676774637443e-05, "loss": 0.1642, "num_input_tokens_seen": 117039120, "step": 54230 }, { "epoch": 8.84747145187602, "grad_norm": 0.0316501148045063, "learning_rate": 3.417345710460974e-05, "loss": 0.0461, "num_input_tokens_seen": 117050032, "step": 54235 }, { "epoch": 8.848287112561174, "grad_norm": 1.6487336158752441, "learning_rate": 3.4170146276933315e-05, "loss": 0.2707, "num_input_tokens_seen": 117058448, "step": 54240 }, { "epoch": 8.84910277324633, "grad_norm": 0.5369938015937805, "learning_rate": 3.416683526341223e-05, "loss": 0.0455, "num_input_tokens_seen": 117068880, "step": 54245 }, { "epoch": 8.849918433931485, "grad_norm": 1.2747021913528442, "learning_rate": 3.416352406411361e-05, "loss": 0.2241, "num_input_tokens_seen": 117079312, "step": 54250 }, { "epoch": 8.850734094616639, "grad_norm": 4.186721324920654, "learning_rate": 3.416021267910455e-05, "loss": 0.2767, "num_input_tokens_seen": 117089776, "step": 54255 }, { "epoch": 8.851549755301795, "grad_norm": 0.042322367429733276, "learning_rate": 3.4156901108452155e-05, "loss": 0.233, "num_input_tokens_seen": 117100304, "step": 54260 }, { "epoch": 8.852365415986949, "grad_norm": 0.08277716487646103, "learning_rate": 3.415358935222355e-05, "loss": 0.1361, "num_input_tokens_seen": 117110064, "step": 54265 }, { "epoch": 8.853181076672104, "grad_norm": 1.1884137392044067, "learning_rate": 3.4150277410485835e-05, "loss": 0.1529, "num_input_tokens_seen": 117121872, "step": 54270 }, { "epoch": 8.85399673735726, "grad_norm": 0.12140405923128128, "learning_rate": 3.414696528330615e-05, "loss": 0.0368, "num_input_tokens_seen": 117133040, "step": 54275 }, { "epoch": 8.854812398042414, "grad_norm": 0.9001656770706177, "learning_rate": 3.41436529707516e-05, "loss": 0.1504, "num_input_tokens_seen": 117145488, "step": 54280 }, { "epoch": 8.85562805872757, "grad_norm": 0.10572702437639236, "learning_rate": 3.414034047288932e-05, "loss": 0.1766, "num_input_tokens_seen": 117156464, "step": 54285 }, { "epoch": 8.856443719412724, "grad_norm": 0.29015639424324036, "learning_rate": 3.413702778978646e-05, "loss": 0.0915, "num_input_tokens_seen": 117166128, "step": 54290 }, { "epoch": 8.85725938009788, "grad_norm": 2.0044655799865723, "learning_rate": 3.413371492151013e-05, "loss": 0.0887, "num_input_tokens_seen": 117176272, "step": 54295 }, { "epoch": 8.858075040783035, "grad_norm": 0.9786196351051331, "learning_rate": 3.413040186812748e-05, "loss": 0.2606, "num_input_tokens_seen": 117187440, "step": 54300 }, { "epoch": 8.858890701468189, "grad_norm": 0.09341907501220703, "learning_rate": 3.412708862970566e-05, "loss": 0.0455, "num_input_tokens_seen": 117198416, "step": 54305 }, { "epoch": 8.859706362153345, "grad_norm": 0.025598160922527313, "learning_rate": 3.4123775206311807e-05, "loss": 0.1812, "num_input_tokens_seen": 117209104, "step": 54310 }, { "epoch": 8.860522022838499, "grad_norm": 0.07940279692411423, "learning_rate": 3.4120461598013073e-05, "loss": 0.0202, "num_input_tokens_seen": 117220592, "step": 54315 }, { "epoch": 8.861337683523654, "grad_norm": 0.13871316611766815, "learning_rate": 3.411714780487661e-05, "loss": 0.0541, "num_input_tokens_seen": 117231280, "step": 54320 }, { "epoch": 8.86215334420881, "grad_norm": 1.1943215131759644, "learning_rate": 3.411383382696959e-05, "loss": 0.1575, "num_input_tokens_seen": 117243216, "step": 54325 }, { "epoch": 8.862969004893964, "grad_norm": 0.2301040142774582, "learning_rate": 3.411051966435916e-05, "loss": 0.0381, "num_input_tokens_seen": 117252976, "step": 54330 }, { "epoch": 8.86378466557912, "grad_norm": 0.3147326707839966, "learning_rate": 3.41072053171125e-05, "loss": 0.1571, "num_input_tokens_seen": 117264016, "step": 54335 }, { "epoch": 8.864600326264274, "grad_norm": 0.9436716437339783, "learning_rate": 3.4103890785296764e-05, "loss": 0.2727, "num_input_tokens_seen": 117274864, "step": 54340 }, { "epoch": 8.86541598694943, "grad_norm": 0.7242986559867859, "learning_rate": 3.4100576068979135e-05, "loss": 0.0257, "num_input_tokens_seen": 117285552, "step": 54345 }, { "epoch": 8.866231647634583, "grad_norm": 0.9139003157615662, "learning_rate": 3.4097261168226784e-05, "loss": 0.1011, "num_input_tokens_seen": 117295728, "step": 54350 }, { "epoch": 8.867047308319739, "grad_norm": 0.3226946294307709, "learning_rate": 3.4093946083106897e-05, "loss": 0.2053, "num_input_tokens_seen": 117307408, "step": 54355 }, { "epoch": 8.867862969004895, "grad_norm": 1.3990896940231323, "learning_rate": 3.409063081368666e-05, "loss": 0.2099, "num_input_tokens_seen": 117318832, "step": 54360 }, { "epoch": 8.868678629690049, "grad_norm": 0.7099462747573853, "learning_rate": 3.408731536003325e-05, "loss": 0.1268, "num_input_tokens_seen": 117329712, "step": 54365 }, { "epoch": 8.869494290375204, "grad_norm": 0.8042317628860474, "learning_rate": 3.408399972221387e-05, "loss": 0.1498, "num_input_tokens_seen": 117339728, "step": 54370 }, { "epoch": 8.870309951060358, "grad_norm": 0.28684887290000916, "learning_rate": 3.4080683900295715e-05, "loss": 0.0794, "num_input_tokens_seen": 117351408, "step": 54375 }, { "epoch": 8.871125611745514, "grad_norm": 1.7032421827316284, "learning_rate": 3.4077367894345975e-05, "loss": 0.3478, "num_input_tokens_seen": 117361904, "step": 54380 }, { "epoch": 8.87194127243067, "grad_norm": 0.07295506447553635, "learning_rate": 3.4074051704431866e-05, "loss": 0.0816, "num_input_tokens_seen": 117371536, "step": 54385 }, { "epoch": 8.872756933115824, "grad_norm": 0.16007885336875916, "learning_rate": 3.4070735330620576e-05, "loss": 0.2635, "num_input_tokens_seen": 117379984, "step": 54390 }, { "epoch": 8.87357259380098, "grad_norm": 0.025105133652687073, "learning_rate": 3.406741877297934e-05, "loss": 0.0436, "num_input_tokens_seen": 117390416, "step": 54395 }, { "epoch": 8.874388254486133, "grad_norm": 1.5429617166519165, "learning_rate": 3.406410203157535e-05, "loss": 0.1333, "num_input_tokens_seen": 117401136, "step": 54400 }, { "epoch": 8.875203915171289, "grad_norm": 0.7588427662849426, "learning_rate": 3.406078510647584e-05, "loss": 0.1406, "num_input_tokens_seen": 117411344, "step": 54405 }, { "epoch": 8.876019575856443, "grad_norm": 1.1037631034851074, "learning_rate": 3.4057467997748013e-05, "loss": 0.0621, "num_input_tokens_seen": 117421680, "step": 54410 }, { "epoch": 8.876835236541599, "grad_norm": 0.19371314346790314, "learning_rate": 3.405415070545913e-05, "loss": 0.0306, "num_input_tokens_seen": 117432048, "step": 54415 }, { "epoch": 8.877650897226754, "grad_norm": 0.19165350496768951, "learning_rate": 3.405083322967638e-05, "loss": 0.1048, "num_input_tokens_seen": 117444080, "step": 54420 }, { "epoch": 8.878466557911908, "grad_norm": 0.7736664414405823, "learning_rate": 3.404751557046702e-05, "loss": 0.1427, "num_input_tokens_seen": 117454864, "step": 54425 }, { "epoch": 8.879282218597064, "grad_norm": 0.15386489033699036, "learning_rate": 3.404419772789827e-05, "loss": 0.1413, "num_input_tokens_seen": 117464944, "step": 54430 }, { "epoch": 8.880097879282218, "grad_norm": 0.6109556555747986, "learning_rate": 3.4040879702037384e-05, "loss": 0.0493, "num_input_tokens_seen": 117475344, "step": 54435 }, { "epoch": 8.880913539967374, "grad_norm": 2.018376350402832, "learning_rate": 3.403756149295161e-05, "loss": 0.1983, "num_input_tokens_seen": 117486288, "step": 54440 }, { "epoch": 8.88172920065253, "grad_norm": 0.4025244116783142, "learning_rate": 3.4034243100708177e-05, "loss": 0.0503, "num_input_tokens_seen": 117496432, "step": 54445 }, { "epoch": 8.882544861337683, "grad_norm": 0.3888257145881653, "learning_rate": 3.403092452537435e-05, "loss": 0.0888, "num_input_tokens_seen": 117506960, "step": 54450 }, { "epoch": 8.883360522022839, "grad_norm": 0.5948699712753296, "learning_rate": 3.4027605767017376e-05, "loss": 0.2053, "num_input_tokens_seen": 117517360, "step": 54455 }, { "epoch": 8.884176182707993, "grad_norm": 0.46650001406669617, "learning_rate": 3.402428682570453e-05, "loss": 0.154, "num_input_tokens_seen": 117527376, "step": 54460 }, { "epoch": 8.884991843393149, "grad_norm": 0.09303148090839386, "learning_rate": 3.402096770150306e-05, "loss": 0.0471, "num_input_tokens_seen": 117537104, "step": 54465 }, { "epoch": 8.885807504078304, "grad_norm": 1.4512388706207275, "learning_rate": 3.4017648394480234e-05, "loss": 0.1493, "num_input_tokens_seen": 117546896, "step": 54470 }, { "epoch": 8.886623164763458, "grad_norm": 0.43144190311431885, "learning_rate": 3.401432890470332e-05, "loss": 0.0931, "num_input_tokens_seen": 117557328, "step": 54475 }, { "epoch": 8.887438825448614, "grad_norm": 0.19312380254268646, "learning_rate": 3.4011009232239597e-05, "loss": 0.084, "num_input_tokens_seen": 117567568, "step": 54480 }, { "epoch": 8.888254486133768, "grad_norm": 0.3253687918186188, "learning_rate": 3.400768937715635e-05, "loss": 0.0384, "num_input_tokens_seen": 117578032, "step": 54485 }, { "epoch": 8.889070146818923, "grad_norm": 0.28332391381263733, "learning_rate": 3.4004369339520835e-05, "loss": 0.1422, "num_input_tokens_seen": 117589456, "step": 54490 }, { "epoch": 8.88988580750408, "grad_norm": 1.255337119102478, "learning_rate": 3.400104911940036e-05, "loss": 0.0833, "num_input_tokens_seen": 117599952, "step": 54495 }, { "epoch": 8.890701468189233, "grad_norm": 0.10047538578510284, "learning_rate": 3.399772871686221e-05, "loss": 0.1516, "num_input_tokens_seen": 117610288, "step": 54500 }, { "epoch": 8.891517128874389, "grad_norm": 0.12597717344760895, "learning_rate": 3.399440813197367e-05, "loss": 0.0231, "num_input_tokens_seen": 117620528, "step": 54505 }, { "epoch": 8.892332789559543, "grad_norm": 0.09715010970830917, "learning_rate": 3.399108736480204e-05, "loss": 0.0314, "num_input_tokens_seen": 117631984, "step": 54510 }, { "epoch": 8.893148450244698, "grad_norm": 0.06087972968816757, "learning_rate": 3.398776641541461e-05, "loss": 0.0357, "num_input_tokens_seen": 117642800, "step": 54515 }, { "epoch": 8.893964110929852, "grad_norm": 0.5327943563461304, "learning_rate": 3.3984445283878704e-05, "loss": 0.0462, "num_input_tokens_seen": 117654544, "step": 54520 }, { "epoch": 8.894779771615008, "grad_norm": 0.3728635609149933, "learning_rate": 3.3981123970261606e-05, "loss": 0.0541, "num_input_tokens_seen": 117664688, "step": 54525 }, { "epoch": 8.895595432300164, "grad_norm": 0.045825064182281494, "learning_rate": 3.397780247463065e-05, "loss": 0.0237, "num_input_tokens_seen": 117675664, "step": 54530 }, { "epoch": 8.896411092985318, "grad_norm": 0.2956750690937042, "learning_rate": 3.397448079705313e-05, "loss": 0.0304, "num_input_tokens_seen": 117685744, "step": 54535 }, { "epoch": 8.897226753670473, "grad_norm": 1.265141248703003, "learning_rate": 3.397115893759637e-05, "loss": 0.0975, "num_input_tokens_seen": 117697104, "step": 54540 }, { "epoch": 8.898042414355627, "grad_norm": 1.8016185760498047, "learning_rate": 3.39678368963277e-05, "loss": 0.118, "num_input_tokens_seen": 117708272, "step": 54545 }, { "epoch": 8.898858075040783, "grad_norm": 1.567153811454773, "learning_rate": 3.3964514673314426e-05, "loss": 0.056, "num_input_tokens_seen": 117718832, "step": 54550 }, { "epoch": 8.899673735725939, "grad_norm": 0.34423139691352844, "learning_rate": 3.39611922686239e-05, "loss": 0.1231, "num_input_tokens_seen": 117729744, "step": 54555 }, { "epoch": 8.900489396411093, "grad_norm": 0.8741215467453003, "learning_rate": 3.3957869682323444e-05, "loss": 0.1096, "num_input_tokens_seen": 117741104, "step": 54560 }, { "epoch": 8.901305057096248, "grad_norm": 0.158351868391037, "learning_rate": 3.395454691448039e-05, "loss": 0.0211, "num_input_tokens_seen": 117751280, "step": 54565 }, { "epoch": 8.902120717781402, "grad_norm": 0.11860797554254532, "learning_rate": 3.395122396516209e-05, "loss": 0.0982, "num_input_tokens_seen": 117761200, "step": 54570 }, { "epoch": 8.902936378466558, "grad_norm": 0.4322584867477417, "learning_rate": 3.394790083443588e-05, "loss": 0.0809, "num_input_tokens_seen": 117771536, "step": 54575 }, { "epoch": 8.903752039151712, "grad_norm": 1.7495473623275757, "learning_rate": 3.3944577522369105e-05, "loss": 0.0686, "num_input_tokens_seen": 117781808, "step": 54580 }, { "epoch": 8.904567699836868, "grad_norm": 0.032353147864341736, "learning_rate": 3.394125402902912e-05, "loss": 0.088, "num_input_tokens_seen": 117793296, "step": 54585 }, { "epoch": 8.905383360522023, "grad_norm": 0.08324003219604492, "learning_rate": 3.393793035448328e-05, "loss": 0.2225, "num_input_tokens_seen": 117803568, "step": 54590 }, { "epoch": 8.906199021207177, "grad_norm": 0.7857070565223694, "learning_rate": 3.393460649879895e-05, "loss": 0.1113, "num_input_tokens_seen": 117813648, "step": 54595 }, { "epoch": 8.907014681892333, "grad_norm": 0.3809477984905243, "learning_rate": 3.3931282462043465e-05, "loss": 0.0329, "num_input_tokens_seen": 117824816, "step": 54600 }, { "epoch": 8.907830342577487, "grad_norm": 0.0437287837266922, "learning_rate": 3.392795824428423e-05, "loss": 0.1703, "num_input_tokens_seen": 117835312, "step": 54605 }, { "epoch": 8.908646003262643, "grad_norm": 0.062693290412426, "learning_rate": 3.392463384558859e-05, "loss": 0.0925, "num_input_tokens_seen": 117845616, "step": 54610 }, { "epoch": 8.909461663947798, "grad_norm": 0.3856740891933441, "learning_rate": 3.392130926602392e-05, "loss": 0.0863, "num_input_tokens_seen": 117856656, "step": 54615 }, { "epoch": 8.910277324632952, "grad_norm": 2.251166820526123, "learning_rate": 3.391798450565761e-05, "loss": 0.4045, "num_input_tokens_seen": 117867056, "step": 54620 }, { "epoch": 8.911092985318108, "grad_norm": 1.6779932975769043, "learning_rate": 3.391465956455701e-05, "loss": 0.2324, "num_input_tokens_seen": 117877552, "step": 54625 }, { "epoch": 8.911908646003262, "grad_norm": 0.19281820952892303, "learning_rate": 3.3911334442789544e-05, "loss": 0.1394, "num_input_tokens_seen": 117888624, "step": 54630 }, { "epoch": 8.912724306688418, "grad_norm": 1.7763255834579468, "learning_rate": 3.390800914042257e-05, "loss": 0.1986, "num_input_tokens_seen": 117897520, "step": 54635 }, { "epoch": 8.913539967373573, "grad_norm": 0.7712929248809814, "learning_rate": 3.3904683657523495e-05, "loss": 0.0767, "num_input_tokens_seen": 117907920, "step": 54640 }, { "epoch": 8.914355628058727, "grad_norm": 0.8420498371124268, "learning_rate": 3.3901357994159713e-05, "loss": 0.1528, "num_input_tokens_seen": 117919088, "step": 54645 }, { "epoch": 8.915171288743883, "grad_norm": 1.6522166728973389, "learning_rate": 3.3898032150398615e-05, "loss": 0.059, "num_input_tokens_seen": 117928848, "step": 54650 }, { "epoch": 8.915986949429037, "grad_norm": 0.08081876486539841, "learning_rate": 3.3894706126307604e-05, "loss": 0.1243, "num_input_tokens_seen": 117939984, "step": 54655 }, { "epoch": 8.916802610114193, "grad_norm": 0.10246030986309052, "learning_rate": 3.38913799219541e-05, "loss": 0.1002, "num_input_tokens_seen": 117952784, "step": 54660 }, { "epoch": 8.917618270799348, "grad_norm": 0.08282265812158585, "learning_rate": 3.38880535374055e-05, "loss": 0.1916, "num_input_tokens_seen": 117963504, "step": 54665 }, { "epoch": 8.918433931484502, "grad_norm": 0.3701862692832947, "learning_rate": 3.388472697272921e-05, "loss": 0.0955, "num_input_tokens_seen": 117973808, "step": 54670 }, { "epoch": 8.919249592169658, "grad_norm": 0.03647751733660698, "learning_rate": 3.3881400227992664e-05, "loss": 0.0886, "num_input_tokens_seen": 117984464, "step": 54675 }, { "epoch": 8.920065252854812, "grad_norm": 0.2774692475795746, "learning_rate": 3.387807330326328e-05, "loss": 0.0271, "num_input_tokens_seen": 117994928, "step": 54680 }, { "epoch": 8.920880913539968, "grad_norm": 0.5610756278038025, "learning_rate": 3.3874746198608466e-05, "loss": 0.0813, "num_input_tokens_seen": 118004304, "step": 54685 }, { "epoch": 8.921696574225122, "grad_norm": 1.4381000995635986, "learning_rate": 3.387141891409567e-05, "loss": 0.1018, "num_input_tokens_seen": 118015152, "step": 54690 }, { "epoch": 8.922512234910277, "grad_norm": 0.7576891779899597, "learning_rate": 3.3868091449792314e-05, "loss": 0.0616, "num_input_tokens_seen": 118027344, "step": 54695 }, { "epoch": 8.923327895595433, "grad_norm": 1.5873781442642212, "learning_rate": 3.3864763805765834e-05, "loss": 0.2039, "num_input_tokens_seen": 118038672, "step": 54700 }, { "epoch": 8.924143556280587, "grad_norm": 1.083382248878479, "learning_rate": 3.386143598208367e-05, "loss": 0.082, "num_input_tokens_seen": 118050576, "step": 54705 }, { "epoch": 8.924959216965743, "grad_norm": 0.07846518605947495, "learning_rate": 3.385810797881326e-05, "loss": 0.1659, "num_input_tokens_seen": 118061520, "step": 54710 }, { "epoch": 8.925774877650896, "grad_norm": 0.11374888569116592, "learning_rate": 3.3854779796022064e-05, "loss": 0.1974, "num_input_tokens_seen": 118072080, "step": 54715 }, { "epoch": 8.926590538336052, "grad_norm": 0.9299497008323669, "learning_rate": 3.385145143377752e-05, "loss": 0.0832, "num_input_tokens_seen": 118082416, "step": 54720 }, { "epoch": 8.927406199021208, "grad_norm": 0.27781185507774353, "learning_rate": 3.384812289214707e-05, "loss": 0.1853, "num_input_tokens_seen": 118092880, "step": 54725 }, { "epoch": 8.928221859706362, "grad_norm": 0.5854212641716003, "learning_rate": 3.3844794171198195e-05, "loss": 0.0531, "num_input_tokens_seen": 118105136, "step": 54730 }, { "epoch": 8.929037520391518, "grad_norm": 0.1663471758365631, "learning_rate": 3.384146527099835e-05, "loss": 0.143, "num_input_tokens_seen": 118114736, "step": 54735 }, { "epoch": 8.929853181076671, "grad_norm": 1.3094654083251953, "learning_rate": 3.383813619161499e-05, "loss": 0.0552, "num_input_tokens_seen": 118126064, "step": 54740 }, { "epoch": 8.930668841761827, "grad_norm": 0.0672982856631279, "learning_rate": 3.383480693311559e-05, "loss": 0.0968, "num_input_tokens_seen": 118137456, "step": 54745 }, { "epoch": 8.931484502446983, "grad_norm": 0.1560061275959015, "learning_rate": 3.3831477495567624e-05, "loss": 0.1096, "num_input_tokens_seen": 118147376, "step": 54750 }, { "epoch": 8.932300163132137, "grad_norm": 0.03745612874627113, "learning_rate": 3.382814787903855e-05, "loss": 0.0534, "num_input_tokens_seen": 118158096, "step": 54755 }, { "epoch": 8.933115823817293, "grad_norm": 0.11795037984848022, "learning_rate": 3.3824818083595875e-05, "loss": 0.2357, "num_input_tokens_seen": 118168688, "step": 54760 }, { "epoch": 8.933931484502446, "grad_norm": 0.14013437926769257, "learning_rate": 3.3821488109307056e-05, "loss": 0.2762, "num_input_tokens_seen": 118179792, "step": 54765 }, { "epoch": 8.934747145187602, "grad_norm": 0.18060407042503357, "learning_rate": 3.381815795623959e-05, "loss": 0.0297, "num_input_tokens_seen": 118190736, "step": 54770 }, { "epoch": 8.935562805872756, "grad_norm": 0.2975825369358063, "learning_rate": 3.3814827624460974e-05, "loss": 0.1349, "num_input_tokens_seen": 118201584, "step": 54775 }, { "epoch": 8.936378466557912, "grad_norm": 0.17402401566505432, "learning_rate": 3.381149711403869e-05, "loss": 0.0713, "num_input_tokens_seen": 118212624, "step": 54780 }, { "epoch": 8.937194127243067, "grad_norm": 0.364996075630188, "learning_rate": 3.380816642504023e-05, "loss": 0.1772, "num_input_tokens_seen": 118222800, "step": 54785 }, { "epoch": 8.938009787928221, "grad_norm": 0.2277614176273346, "learning_rate": 3.380483555753311e-05, "loss": 0.0845, "num_input_tokens_seen": 118232592, "step": 54790 }, { "epoch": 8.938825448613377, "grad_norm": 1.0466256141662598, "learning_rate": 3.3801504511584836e-05, "loss": 0.0468, "num_input_tokens_seen": 118244464, "step": 54795 }, { "epoch": 8.939641109298531, "grad_norm": 0.7288828492164612, "learning_rate": 3.37981732872629e-05, "loss": 0.0617, "num_input_tokens_seen": 118253552, "step": 54800 }, { "epoch": 8.940456769983687, "grad_norm": 0.05954848229885101, "learning_rate": 3.379484188463482e-05, "loss": 0.1379, "num_input_tokens_seen": 118263824, "step": 54805 }, { "epoch": 8.941272430668842, "grad_norm": 2.255336046218872, "learning_rate": 3.379151030376812e-05, "loss": 0.2128, "num_input_tokens_seen": 118275216, "step": 54810 }, { "epoch": 8.942088091353996, "grad_norm": 1.2823987007141113, "learning_rate": 3.3788178544730306e-05, "loss": 0.0649, "num_input_tokens_seen": 118286224, "step": 54815 }, { "epoch": 8.942903752039152, "grad_norm": 0.18912722170352936, "learning_rate": 3.3784846607588905e-05, "loss": 0.1958, "num_input_tokens_seen": 118296240, "step": 54820 }, { "epoch": 8.943719412724306, "grad_norm": 0.0748048722743988, "learning_rate": 3.3781514492411443e-05, "loss": 0.0483, "num_input_tokens_seen": 118305680, "step": 54825 }, { "epoch": 8.944535073409462, "grad_norm": 0.0831337720155716, "learning_rate": 3.377818219926545e-05, "loss": 0.1381, "num_input_tokens_seen": 118316880, "step": 54830 }, { "epoch": 8.945350734094617, "grad_norm": 0.37314093112945557, "learning_rate": 3.377484972821845e-05, "loss": 0.0988, "num_input_tokens_seen": 118328304, "step": 54835 }, { "epoch": 8.946166394779771, "grad_norm": 0.16668634116649628, "learning_rate": 3.3771517079338e-05, "loss": 0.0564, "num_input_tokens_seen": 118339728, "step": 54840 }, { "epoch": 8.946982055464927, "grad_norm": 0.4608229398727417, "learning_rate": 3.376818425269163e-05, "loss": 0.1037, "num_input_tokens_seen": 118349872, "step": 54845 }, { "epoch": 8.947797716150081, "grad_norm": 0.1932167261838913, "learning_rate": 3.3764851248346874e-05, "loss": 0.2347, "num_input_tokens_seen": 118359888, "step": 54850 }, { "epoch": 8.948613376835237, "grad_norm": 0.06272583454847336, "learning_rate": 3.376151806637129e-05, "loss": 0.0534, "num_input_tokens_seen": 118370544, "step": 54855 }, { "epoch": 8.949429037520392, "grad_norm": 0.18092337250709534, "learning_rate": 3.375818470683243e-05, "loss": 0.0397, "num_input_tokens_seen": 118381584, "step": 54860 }, { "epoch": 8.950244698205546, "grad_norm": 0.3867020010948181, "learning_rate": 3.3754851169797855e-05, "loss": 0.0668, "num_input_tokens_seen": 118392624, "step": 54865 }, { "epoch": 8.951060358890702, "grad_norm": 0.07290148735046387, "learning_rate": 3.37515174553351e-05, "loss": 0.0742, "num_input_tokens_seen": 118402768, "step": 54870 }, { "epoch": 8.951876019575856, "grad_norm": 2.2576606273651123, "learning_rate": 3.374818356351175e-05, "loss": 0.0808, "num_input_tokens_seen": 118413584, "step": 54875 }, { "epoch": 8.952691680261012, "grad_norm": 0.243523970246315, "learning_rate": 3.374484949439536e-05, "loss": 0.1543, "num_input_tokens_seen": 118424400, "step": 54880 }, { "epoch": 8.953507340946166, "grad_norm": 0.0671665221452713, "learning_rate": 3.37415152480535e-05, "loss": 0.0362, "num_input_tokens_seen": 118435696, "step": 54885 }, { "epoch": 8.954323001631321, "grad_norm": 0.5935929417610168, "learning_rate": 3.373818082455375e-05, "loss": 0.2031, "num_input_tokens_seen": 118446544, "step": 54890 }, { "epoch": 8.955138662316477, "grad_norm": 1.627990484237671, "learning_rate": 3.373484622396367e-05, "loss": 0.2087, "num_input_tokens_seen": 118457616, "step": 54895 }, { "epoch": 8.955954323001631, "grad_norm": 0.3796806037425995, "learning_rate": 3.373151144635086e-05, "loss": 0.0906, "num_input_tokens_seen": 118468816, "step": 54900 }, { "epoch": 8.956769983686787, "grad_norm": 0.022545045241713524, "learning_rate": 3.372817649178289e-05, "loss": 0.1092, "num_input_tokens_seen": 118480176, "step": 54905 }, { "epoch": 8.95758564437194, "grad_norm": 0.10307960957288742, "learning_rate": 3.3724841360327346e-05, "loss": 0.0255, "num_input_tokens_seen": 118491024, "step": 54910 }, { "epoch": 8.958401305057096, "grad_norm": 1.0334508419036865, "learning_rate": 3.372150605205183e-05, "loss": 0.0351, "num_input_tokens_seen": 118502096, "step": 54915 }, { "epoch": 8.959216965742252, "grad_norm": 0.7029260993003845, "learning_rate": 3.371817056702393e-05, "loss": 0.0599, "num_input_tokens_seen": 118512464, "step": 54920 }, { "epoch": 8.960032626427406, "grad_norm": 0.07979778200387955, "learning_rate": 3.3714834905311244e-05, "loss": 0.0716, "num_input_tokens_seen": 118523728, "step": 54925 }, { "epoch": 8.960848287112562, "grad_norm": 0.28887951374053955, "learning_rate": 3.3711499066981375e-05, "loss": 0.1625, "num_input_tokens_seen": 118533680, "step": 54930 }, { "epoch": 8.961663947797716, "grad_norm": 0.17012132704257965, "learning_rate": 3.370816305210192e-05, "loss": 0.0995, "num_input_tokens_seen": 118544176, "step": 54935 }, { "epoch": 8.962479608482871, "grad_norm": 0.14945822954177856, "learning_rate": 3.370482686074049e-05, "loss": 0.0428, "num_input_tokens_seen": 118554384, "step": 54940 }, { "epoch": 8.963295269168025, "grad_norm": 0.9488592743873596, "learning_rate": 3.3701490492964714e-05, "loss": 0.1421, "num_input_tokens_seen": 118566064, "step": 54945 }, { "epoch": 8.964110929853181, "grad_norm": 0.1723267287015915, "learning_rate": 3.369815394884218e-05, "loss": 0.182, "num_input_tokens_seen": 118575952, "step": 54950 }, { "epoch": 8.964926590538337, "grad_norm": 0.30039182305336, "learning_rate": 3.369481722844053e-05, "loss": 0.0623, "num_input_tokens_seen": 118587408, "step": 54955 }, { "epoch": 8.96574225122349, "grad_norm": 0.06006140261888504, "learning_rate": 3.369148033182738e-05, "loss": 0.1465, "num_input_tokens_seen": 118598320, "step": 54960 }, { "epoch": 8.966557911908646, "grad_norm": 0.8139910101890564, "learning_rate": 3.368814325907035e-05, "loss": 0.1692, "num_input_tokens_seen": 118609488, "step": 54965 }, { "epoch": 8.9673735725938, "grad_norm": 0.23262767493724823, "learning_rate": 3.368480601023709e-05, "loss": 0.2136, "num_input_tokens_seen": 118621040, "step": 54970 }, { "epoch": 8.968189233278956, "grad_norm": 0.5727052092552185, "learning_rate": 3.36814685853952e-05, "loss": 0.188, "num_input_tokens_seen": 118632720, "step": 54975 }, { "epoch": 8.969004893964112, "grad_norm": 0.05185478925704956, "learning_rate": 3.367813098461234e-05, "loss": 0.0163, "num_input_tokens_seen": 118643024, "step": 54980 }, { "epoch": 8.969820554649266, "grad_norm": 2.4938018321990967, "learning_rate": 3.3674793207956156e-05, "loss": 0.283, "num_input_tokens_seen": 118653712, "step": 54985 }, { "epoch": 8.970636215334421, "grad_norm": 1.7473134994506836, "learning_rate": 3.3671455255494276e-05, "loss": 0.2401, "num_input_tokens_seen": 118664400, "step": 54990 }, { "epoch": 8.971451876019575, "grad_norm": 0.1152496486902237, "learning_rate": 3.366811712729436e-05, "loss": 0.088, "num_input_tokens_seen": 118675504, "step": 54995 }, { "epoch": 8.97226753670473, "grad_norm": 0.5217407941818237, "learning_rate": 3.366477882342405e-05, "loss": 0.0722, "num_input_tokens_seen": 118686192, "step": 55000 }, { "epoch": 8.973083197389887, "grad_norm": 0.10121320933103561, "learning_rate": 3.3661440343951e-05, "loss": 0.0436, "num_input_tokens_seen": 118697776, "step": 55005 }, { "epoch": 8.97389885807504, "grad_norm": 0.832232654094696, "learning_rate": 3.365810168894288e-05, "loss": 0.0541, "num_input_tokens_seen": 118708720, "step": 55010 }, { "epoch": 8.974714518760196, "grad_norm": 1.9379137754440308, "learning_rate": 3.365476285846734e-05, "loss": 0.0484, "num_input_tokens_seen": 118720112, "step": 55015 }, { "epoch": 8.97553017944535, "grad_norm": 0.4416922926902771, "learning_rate": 3.3651423852592056e-05, "loss": 0.3371, "num_input_tokens_seen": 118731184, "step": 55020 }, { "epoch": 8.976345840130506, "grad_norm": 1.0308737754821777, "learning_rate": 3.36480846713847e-05, "loss": 0.2244, "num_input_tokens_seen": 118741392, "step": 55025 }, { "epoch": 8.977161500815662, "grad_norm": 1.1582157611846924, "learning_rate": 3.364474531491292e-05, "loss": 0.134, "num_input_tokens_seen": 118751984, "step": 55030 }, { "epoch": 8.977977161500815, "grad_norm": 1.0585860013961792, "learning_rate": 3.364140578324442e-05, "loss": 0.1072, "num_input_tokens_seen": 118763600, "step": 55035 }, { "epoch": 8.978792822185971, "grad_norm": 0.1621156632900238, "learning_rate": 3.363806607644687e-05, "loss": 0.0765, "num_input_tokens_seen": 118774512, "step": 55040 }, { "epoch": 8.979608482871125, "grad_norm": 0.1326158344745636, "learning_rate": 3.363472619458795e-05, "loss": 0.0271, "num_input_tokens_seen": 118785072, "step": 55045 }, { "epoch": 8.98042414355628, "grad_norm": 0.16593262553215027, "learning_rate": 3.363138613773535e-05, "loss": 0.0449, "num_input_tokens_seen": 118795984, "step": 55050 }, { "epoch": 8.981239804241435, "grad_norm": 0.14153510332107544, "learning_rate": 3.362804590595676e-05, "loss": 0.0624, "num_input_tokens_seen": 118804656, "step": 55055 }, { "epoch": 8.98205546492659, "grad_norm": 1.4537192583084106, "learning_rate": 3.3624705499319875e-05, "loss": 0.0778, "num_input_tokens_seen": 118814224, "step": 55060 }, { "epoch": 8.982871125611746, "grad_norm": 0.0826830118894577, "learning_rate": 3.3621364917892394e-05, "loss": 0.06, "num_input_tokens_seen": 118825552, "step": 55065 }, { "epoch": 8.9836867862969, "grad_norm": 0.07833445817232132, "learning_rate": 3.361802416174201e-05, "loss": 0.1432, "num_input_tokens_seen": 118836752, "step": 55070 }, { "epoch": 8.984502446982056, "grad_norm": 0.08847296237945557, "learning_rate": 3.361468323093644e-05, "loss": 0.1106, "num_input_tokens_seen": 118846160, "step": 55075 }, { "epoch": 8.98531810766721, "grad_norm": 0.09186594933271408, "learning_rate": 3.361134212554338e-05, "loss": 0.0147, "num_input_tokens_seen": 118857168, "step": 55080 }, { "epoch": 8.986133768352365, "grad_norm": 0.1502893716096878, "learning_rate": 3.360800084563055e-05, "loss": 0.0631, "num_input_tokens_seen": 118868432, "step": 55085 }, { "epoch": 8.986949429037521, "grad_norm": 0.03672964870929718, "learning_rate": 3.360465939126566e-05, "loss": 0.1808, "num_input_tokens_seen": 118879120, "step": 55090 }, { "epoch": 8.987765089722675, "grad_norm": 0.030154559761285782, "learning_rate": 3.360131776251644e-05, "loss": 0.099, "num_input_tokens_seen": 118889040, "step": 55095 }, { "epoch": 8.98858075040783, "grad_norm": 0.021606463938951492, "learning_rate": 3.35979759594506e-05, "loss": 0.0188, "num_input_tokens_seen": 118898608, "step": 55100 }, { "epoch": 8.989396411092985, "grad_norm": 1.4056029319763184, "learning_rate": 3.359463398213587e-05, "loss": 0.0528, "num_input_tokens_seen": 118909264, "step": 55105 }, { "epoch": 8.99021207177814, "grad_norm": 0.20304667949676514, "learning_rate": 3.359129183063997e-05, "loss": 0.1708, "num_input_tokens_seen": 118920048, "step": 55110 }, { "epoch": 8.991027732463294, "grad_norm": 0.2445741444826126, "learning_rate": 3.358794950503066e-05, "loss": 0.0696, "num_input_tokens_seen": 118931120, "step": 55115 }, { "epoch": 8.99184339314845, "grad_norm": 0.876416027545929, "learning_rate": 3.3584607005375647e-05, "loss": 0.0951, "num_input_tokens_seen": 118940464, "step": 55120 }, { "epoch": 8.992659053833606, "grad_norm": 0.5924700498580933, "learning_rate": 3.358126433174268e-05, "loss": 0.1817, "num_input_tokens_seen": 118951120, "step": 55125 }, { "epoch": 8.99347471451876, "grad_norm": 1.1084094047546387, "learning_rate": 3.357792148419951e-05, "loss": 0.2033, "num_input_tokens_seen": 118961616, "step": 55130 }, { "epoch": 8.994290375203915, "grad_norm": 0.14605796337127686, "learning_rate": 3.357457846281389e-05, "loss": 0.1386, "num_input_tokens_seen": 118973296, "step": 55135 }, { "epoch": 8.99510603588907, "grad_norm": 2.499316453933716, "learning_rate": 3.3571235267653545e-05, "loss": 0.2307, "num_input_tokens_seen": 118983344, "step": 55140 }, { "epoch": 8.995921696574225, "grad_norm": 0.06030241772532463, "learning_rate": 3.356789189878625e-05, "loss": 0.0755, "num_input_tokens_seen": 118994544, "step": 55145 }, { "epoch": 8.99673735725938, "grad_norm": 0.8263081908226013, "learning_rate": 3.3564548356279755e-05, "loss": 0.1126, "num_input_tokens_seen": 119005264, "step": 55150 }, { "epoch": 8.997553017944535, "grad_norm": 1.6292468309402466, "learning_rate": 3.356120464020182e-05, "loss": 0.2635, "num_input_tokens_seen": 119016688, "step": 55155 }, { "epoch": 8.99836867862969, "grad_norm": 1.4252746105194092, "learning_rate": 3.355786075062021e-05, "loss": 0.1386, "num_input_tokens_seen": 119027856, "step": 55160 }, { "epoch": 8.999184339314844, "grad_norm": 1.320676565170288, "learning_rate": 3.35545166876027e-05, "loss": 0.1735, "num_input_tokens_seen": 119038000, "step": 55165 }, { "epoch": 9.0, "grad_norm": 0.02646855264902115, "learning_rate": 3.355117245121706e-05, "loss": 0.0353, "num_input_tokens_seen": 119047920, "step": 55170 }, { "epoch": 9.0, "eval_loss": 0.1360049545764923, "eval_runtime": 90.8051, "eval_samples_per_second": 30.009, "eval_steps_per_second": 7.511, "num_input_tokens_seen": 119047920, "step": 55170 }, { "epoch": 9.000815660685156, "grad_norm": 0.9937355518341064, "learning_rate": 3.354782804153106e-05, "loss": 0.1597, "num_input_tokens_seen": 119058096, "step": 55175 }, { "epoch": 9.00163132137031, "grad_norm": 1.6433994770050049, "learning_rate": 3.354448345861248e-05, "loss": 0.1434, "num_input_tokens_seen": 119068080, "step": 55180 }, { "epoch": 9.002446982055465, "grad_norm": 2.0651748180389404, "learning_rate": 3.3541138702529107e-05, "loss": 0.1909, "num_input_tokens_seen": 119078576, "step": 55185 }, { "epoch": 9.00326264274062, "grad_norm": 1.4629515409469604, "learning_rate": 3.353779377334872e-05, "loss": 0.2095, "num_input_tokens_seen": 119089808, "step": 55190 }, { "epoch": 9.004078303425775, "grad_norm": 0.903633713722229, "learning_rate": 3.3534448671139115e-05, "loss": 0.1397, "num_input_tokens_seen": 119100208, "step": 55195 }, { "epoch": 9.00489396411093, "grad_norm": 0.14800812304019928, "learning_rate": 3.353110339596807e-05, "loss": 0.0885, "num_input_tokens_seen": 119111632, "step": 55200 }, { "epoch": 9.005709624796085, "grad_norm": 0.45837363600730896, "learning_rate": 3.352775794790341e-05, "loss": 0.0567, "num_input_tokens_seen": 119122608, "step": 55205 }, { "epoch": 9.00652528548124, "grad_norm": 0.13885033130645752, "learning_rate": 3.35244123270129e-05, "loss": 0.0164, "num_input_tokens_seen": 119132976, "step": 55210 }, { "epoch": 9.007340946166394, "grad_norm": 1.8250070810317993, "learning_rate": 3.352106653336436e-05, "loss": 0.1166, "num_input_tokens_seen": 119143984, "step": 55215 }, { "epoch": 9.00815660685155, "grad_norm": 2.449272394180298, "learning_rate": 3.3517720567025614e-05, "loss": 0.0815, "num_input_tokens_seen": 119154256, "step": 55220 }, { "epoch": 9.008972267536704, "grad_norm": 0.29644331336021423, "learning_rate": 3.351437442806444e-05, "loss": 0.1559, "num_input_tokens_seen": 119165712, "step": 55225 }, { "epoch": 9.00978792822186, "grad_norm": 0.2132609635591507, "learning_rate": 3.351102811654867e-05, "loss": 0.0998, "num_input_tokens_seen": 119176496, "step": 55230 }, { "epoch": 9.010603588907015, "grad_norm": 0.3777677118778229, "learning_rate": 3.350768163254612e-05, "loss": 0.1025, "num_input_tokens_seen": 119186640, "step": 55235 }, { "epoch": 9.01141924959217, "grad_norm": 1.3094329833984375, "learning_rate": 3.350433497612461e-05, "loss": 0.194, "num_input_tokens_seen": 119197680, "step": 55240 }, { "epoch": 9.012234910277325, "grad_norm": 0.31408441066741943, "learning_rate": 3.350098814735196e-05, "loss": 0.0504, "num_input_tokens_seen": 119208080, "step": 55245 }, { "epoch": 9.013050570962479, "grad_norm": 1.7648072242736816, "learning_rate": 3.3497641146296e-05, "loss": 0.1398, "num_input_tokens_seen": 119218896, "step": 55250 }, { "epoch": 9.013866231647635, "grad_norm": 1.3759593963623047, "learning_rate": 3.349429397302457e-05, "loss": 0.2432, "num_input_tokens_seen": 119230032, "step": 55255 }, { "epoch": 9.01468189233279, "grad_norm": 0.07050401717424393, "learning_rate": 3.3490946627605485e-05, "loss": 0.0379, "num_input_tokens_seen": 119240112, "step": 55260 }, { "epoch": 9.015497553017944, "grad_norm": 0.09282228350639343, "learning_rate": 3.3487599110106604e-05, "loss": 0.3134, "num_input_tokens_seen": 119251184, "step": 55265 }, { "epoch": 9.0163132137031, "grad_norm": 0.22859664261341095, "learning_rate": 3.3484251420595755e-05, "loss": 0.0309, "num_input_tokens_seen": 119263216, "step": 55270 }, { "epoch": 9.017128874388254, "grad_norm": 0.3909575641155243, "learning_rate": 3.3480903559140795e-05, "loss": 0.1067, "num_input_tokens_seen": 119272112, "step": 55275 }, { "epoch": 9.01794453507341, "grad_norm": 0.6518594622612, "learning_rate": 3.347755552580956e-05, "loss": 0.1797, "num_input_tokens_seen": 119282352, "step": 55280 }, { "epoch": 9.018760195758565, "grad_norm": 1.2103434801101685, "learning_rate": 3.347420732066991e-05, "loss": 0.1454, "num_input_tokens_seen": 119293488, "step": 55285 }, { "epoch": 9.01957585644372, "grad_norm": 2.5715456008911133, "learning_rate": 3.3470858943789694e-05, "loss": 0.1012, "num_input_tokens_seen": 119304592, "step": 55290 }, { "epoch": 9.020391517128875, "grad_norm": 0.8577227592468262, "learning_rate": 3.3467510395236786e-05, "loss": 0.1896, "num_input_tokens_seen": 119315088, "step": 55295 }, { "epoch": 9.021207177814029, "grad_norm": 1.0467658042907715, "learning_rate": 3.346416167507903e-05, "loss": 0.1455, "num_input_tokens_seen": 119326416, "step": 55300 }, { "epoch": 9.022022838499185, "grad_norm": 0.6549656987190247, "learning_rate": 3.34608127833843e-05, "loss": 0.0846, "num_input_tokens_seen": 119337296, "step": 55305 }, { "epoch": 9.022838499184338, "grad_norm": 0.9644857048988342, "learning_rate": 3.345746372022047e-05, "loss": 0.1515, "num_input_tokens_seen": 119347824, "step": 55310 }, { "epoch": 9.023654159869494, "grad_norm": 0.22920475900173187, "learning_rate": 3.34541144856554e-05, "loss": 0.0408, "num_input_tokens_seen": 119359216, "step": 55315 }, { "epoch": 9.02446982055465, "grad_norm": 0.0963352769613266, "learning_rate": 3.345076507975699e-05, "loss": 0.0953, "num_input_tokens_seen": 119370800, "step": 55320 }, { "epoch": 9.025285481239804, "grad_norm": 1.508742332458496, "learning_rate": 3.3447415502593086e-05, "loss": 0.1715, "num_input_tokens_seen": 119380944, "step": 55325 }, { "epoch": 9.02610114192496, "grad_norm": 0.08546284586191177, "learning_rate": 3.34440657542316e-05, "loss": 0.0931, "num_input_tokens_seen": 119390608, "step": 55330 }, { "epoch": 9.026916802610113, "grad_norm": 0.09984074532985687, "learning_rate": 3.344071583474041e-05, "loss": 0.0721, "num_input_tokens_seen": 119401328, "step": 55335 }, { "epoch": 9.02773246329527, "grad_norm": 1.2422547340393066, "learning_rate": 3.343736574418741e-05, "loss": 0.0411, "num_input_tokens_seen": 119411472, "step": 55340 }, { "epoch": 9.028548123980425, "grad_norm": 0.06977944821119308, "learning_rate": 3.3434015482640484e-05, "loss": 0.0424, "num_input_tokens_seen": 119423152, "step": 55345 }, { "epoch": 9.029363784665579, "grad_norm": 1.2654237747192383, "learning_rate": 3.3430665050167535e-05, "loss": 0.1763, "num_input_tokens_seen": 119434992, "step": 55350 }, { "epoch": 9.030179445350734, "grad_norm": 0.179976224899292, "learning_rate": 3.3427314446836466e-05, "loss": 0.2043, "num_input_tokens_seen": 119445520, "step": 55355 }, { "epoch": 9.030995106035888, "grad_norm": 0.7265439629554749, "learning_rate": 3.342396367271518e-05, "loss": 0.1253, "num_input_tokens_seen": 119456080, "step": 55360 }, { "epoch": 9.031810766721044, "grad_norm": 0.24601638317108154, "learning_rate": 3.3420612727871576e-05, "loss": 0.0617, "num_input_tokens_seen": 119467504, "step": 55365 }, { "epoch": 9.0326264274062, "grad_norm": 0.7949322462081909, "learning_rate": 3.341726161237357e-05, "loss": 0.1907, "num_input_tokens_seen": 119478896, "step": 55370 }, { "epoch": 9.033442088091354, "grad_norm": 0.40516531467437744, "learning_rate": 3.3413910326289095e-05, "loss": 0.1085, "num_input_tokens_seen": 119489840, "step": 55375 }, { "epoch": 9.03425774877651, "grad_norm": 0.11142010986804962, "learning_rate": 3.341055886968605e-05, "loss": 0.0737, "num_input_tokens_seen": 119499472, "step": 55380 }, { "epoch": 9.035073409461663, "grad_norm": 0.8121809959411621, "learning_rate": 3.340720724263236e-05, "loss": 0.1563, "num_input_tokens_seen": 119510768, "step": 55385 }, { "epoch": 9.035889070146819, "grad_norm": 0.2610316276550293, "learning_rate": 3.3403855445195934e-05, "loss": 0.2074, "num_input_tokens_seen": 119521968, "step": 55390 }, { "epoch": 9.036704730831975, "grad_norm": 0.014718465507030487, "learning_rate": 3.3400503477444725e-05, "loss": 0.1268, "num_input_tokens_seen": 119532912, "step": 55395 }, { "epoch": 9.037520391517129, "grad_norm": 0.3228744864463806, "learning_rate": 3.339715133944666e-05, "loss": 0.0335, "num_input_tokens_seen": 119544208, "step": 55400 }, { "epoch": 9.038336052202284, "grad_norm": 0.1300881803035736, "learning_rate": 3.339379903126967e-05, "loss": 0.0659, "num_input_tokens_seen": 119554960, "step": 55405 }, { "epoch": 9.039151712887438, "grad_norm": 0.06614574790000916, "learning_rate": 3.3390446552981696e-05, "loss": 0.1383, "num_input_tokens_seen": 119566224, "step": 55410 }, { "epoch": 9.039967373572594, "grad_norm": 0.2088375836610794, "learning_rate": 3.338709390465068e-05, "loss": 0.1179, "num_input_tokens_seen": 119577424, "step": 55415 }, { "epoch": 9.040783034257748, "grad_norm": 0.1173444390296936, "learning_rate": 3.338374108634456e-05, "loss": 0.1606, "num_input_tokens_seen": 119588752, "step": 55420 }, { "epoch": 9.041598694942904, "grad_norm": 0.14047090709209442, "learning_rate": 3.338038809813129e-05, "loss": 0.0407, "num_input_tokens_seen": 119600048, "step": 55425 }, { "epoch": 9.04241435562806, "grad_norm": 1.5800702571868896, "learning_rate": 3.3377034940078834e-05, "loss": 0.2143, "num_input_tokens_seen": 119608944, "step": 55430 }, { "epoch": 9.043230016313213, "grad_norm": 0.07487630099058151, "learning_rate": 3.337368161225513e-05, "loss": 0.1116, "num_input_tokens_seen": 119618512, "step": 55435 }, { "epoch": 9.044045676998369, "grad_norm": 0.16008509695529938, "learning_rate": 3.337032811472815e-05, "loss": 0.0346, "num_input_tokens_seen": 119629040, "step": 55440 }, { "epoch": 9.044861337683523, "grad_norm": 0.14476609230041504, "learning_rate": 3.336697444756585e-05, "loss": 0.0503, "num_input_tokens_seen": 119639248, "step": 55445 }, { "epoch": 9.045676998368679, "grad_norm": 0.22982630133628845, "learning_rate": 3.33636206108362e-05, "loss": 0.134, "num_input_tokens_seen": 119648848, "step": 55450 }, { "epoch": 9.046492659053834, "grad_norm": 1.2709848880767822, "learning_rate": 3.3360266604607164e-05, "loss": 0.0623, "num_input_tokens_seen": 119660624, "step": 55455 }, { "epoch": 9.047308319738988, "grad_norm": 0.5767439603805542, "learning_rate": 3.3356912428946726e-05, "loss": 0.0746, "num_input_tokens_seen": 119671120, "step": 55460 }, { "epoch": 9.048123980424144, "grad_norm": 0.06065903604030609, "learning_rate": 3.3353558083922856e-05, "loss": 0.2489, "num_input_tokens_seen": 119680720, "step": 55465 }, { "epoch": 9.048939641109298, "grad_norm": 0.3344850540161133, "learning_rate": 3.335020356960352e-05, "loss": 0.173, "num_input_tokens_seen": 119690896, "step": 55470 }, { "epoch": 9.049755301794454, "grad_norm": 0.3106737732887268, "learning_rate": 3.3346848886056726e-05, "loss": 0.063, "num_input_tokens_seen": 119699824, "step": 55475 }, { "epoch": 9.05057096247961, "grad_norm": 1.3670947551727295, "learning_rate": 3.334349403335045e-05, "loss": 0.2168, "num_input_tokens_seen": 119709552, "step": 55480 }, { "epoch": 9.051386623164763, "grad_norm": 0.3795602023601532, "learning_rate": 3.3340139011552685e-05, "loss": 0.0846, "num_input_tokens_seen": 119721136, "step": 55485 }, { "epoch": 9.052202283849919, "grad_norm": 0.18450938165187836, "learning_rate": 3.333678382073142e-05, "loss": 0.1166, "num_input_tokens_seen": 119731408, "step": 55490 }, { "epoch": 9.053017944535073, "grad_norm": 0.03656255826354027, "learning_rate": 3.333342846095466e-05, "loss": 0.1896, "num_input_tokens_seen": 119743728, "step": 55495 }, { "epoch": 9.053833605220229, "grad_norm": 1.8870028257369995, "learning_rate": 3.3330072932290396e-05, "loss": 0.1853, "num_input_tokens_seen": 119754992, "step": 55500 }, { "epoch": 9.054649265905383, "grad_norm": 0.7792218327522278, "learning_rate": 3.3326717234806634e-05, "loss": 0.0739, "num_input_tokens_seen": 119765936, "step": 55505 }, { "epoch": 9.055464926590538, "grad_norm": 0.02033388242125511, "learning_rate": 3.332336136857139e-05, "loss": 0.0053, "num_input_tokens_seen": 119776688, "step": 55510 }, { "epoch": 9.056280587275694, "grad_norm": 0.9553652405738831, "learning_rate": 3.332000533365267e-05, "loss": 0.0748, "num_input_tokens_seen": 119787088, "step": 55515 }, { "epoch": 9.057096247960848, "grad_norm": 0.0467890165746212, "learning_rate": 3.3316649130118484e-05, "loss": 0.0726, "num_input_tokens_seen": 119798576, "step": 55520 }, { "epoch": 9.057911908646004, "grad_norm": 1.0114936828613281, "learning_rate": 3.331329275803685e-05, "loss": 0.1742, "num_input_tokens_seen": 119809424, "step": 55525 }, { "epoch": 9.058727569331158, "grad_norm": 0.5894865393638611, "learning_rate": 3.3309936217475793e-05, "loss": 0.0976, "num_input_tokens_seen": 119821232, "step": 55530 }, { "epoch": 9.059543230016313, "grad_norm": 0.04463489353656769, "learning_rate": 3.330657950850334e-05, "loss": 0.208, "num_input_tokens_seen": 119832688, "step": 55535 }, { "epoch": 9.060358890701469, "grad_norm": 2.093216896057129, "learning_rate": 3.330322263118751e-05, "loss": 0.2752, "num_input_tokens_seen": 119842736, "step": 55540 }, { "epoch": 9.061174551386623, "grad_norm": 1.3549140691757202, "learning_rate": 3.3299865585596344e-05, "loss": 0.0505, "num_input_tokens_seen": 119853808, "step": 55545 }, { "epoch": 9.061990212071779, "grad_norm": 0.8555570840835571, "learning_rate": 3.329650837179787e-05, "loss": 0.2491, "num_input_tokens_seen": 119864976, "step": 55550 }, { "epoch": 9.062805872756933, "grad_norm": 0.08720274269580841, "learning_rate": 3.3293150989860136e-05, "loss": 0.0797, "num_input_tokens_seen": 119875312, "step": 55555 }, { "epoch": 9.063621533442088, "grad_norm": 0.3669755756855011, "learning_rate": 3.328979343985117e-05, "loss": 0.1231, "num_input_tokens_seen": 119886704, "step": 55560 }, { "epoch": 9.064437194127244, "grad_norm": 0.5504811406135559, "learning_rate": 3.328643572183903e-05, "loss": 0.0294, "num_input_tokens_seen": 119897424, "step": 55565 }, { "epoch": 9.065252854812398, "grad_norm": 0.8077211976051331, "learning_rate": 3.328307783589175e-05, "loss": 0.0966, "num_input_tokens_seen": 119909520, "step": 55570 }, { "epoch": 9.066068515497554, "grad_norm": 0.11276384443044662, "learning_rate": 3.32797197820774e-05, "loss": 0.1151, "num_input_tokens_seen": 119919056, "step": 55575 }, { "epoch": 9.066884176182707, "grad_norm": 1.3125030994415283, "learning_rate": 3.327636156046401e-05, "loss": 0.1725, "num_input_tokens_seen": 119930448, "step": 55580 }, { "epoch": 9.067699836867863, "grad_norm": 0.3193964660167694, "learning_rate": 3.327300317111966e-05, "loss": 0.0361, "num_input_tokens_seen": 119939824, "step": 55585 }, { "epoch": 9.068515497553017, "grad_norm": 1.379907250404358, "learning_rate": 3.326964461411241e-05, "loss": 0.1416, "num_input_tokens_seen": 119951152, "step": 55590 }, { "epoch": 9.069331158238173, "grad_norm": 1.0513278245925903, "learning_rate": 3.326628588951032e-05, "loss": 0.0886, "num_input_tokens_seen": 119961104, "step": 55595 }, { "epoch": 9.070146818923329, "grad_norm": 0.842991828918457, "learning_rate": 3.326292699738146e-05, "loss": 0.1197, "num_input_tokens_seen": 119971920, "step": 55600 }, { "epoch": 9.070962479608482, "grad_norm": 0.06140881031751633, "learning_rate": 3.32595679377939e-05, "loss": 0.1424, "num_input_tokens_seen": 119980944, "step": 55605 }, { "epoch": 9.071778140293638, "grad_norm": 0.5706720352172852, "learning_rate": 3.3256208710815715e-05, "loss": 0.2159, "num_input_tokens_seen": 119991792, "step": 55610 }, { "epoch": 9.072593800978792, "grad_norm": 0.8368108868598938, "learning_rate": 3.3252849316515e-05, "loss": 0.0613, "num_input_tokens_seen": 120002064, "step": 55615 }, { "epoch": 9.073409461663948, "grad_norm": 0.6899718642234802, "learning_rate": 3.324948975495981e-05, "loss": 0.1988, "num_input_tokens_seen": 120012560, "step": 55620 }, { "epoch": 9.074225122349104, "grad_norm": 0.10747227817773819, "learning_rate": 3.3246130026218254e-05, "loss": 0.1341, "num_input_tokens_seen": 120023568, "step": 55625 }, { "epoch": 9.075040783034257, "grad_norm": 0.15021789073944092, "learning_rate": 3.32427701303584e-05, "loss": 0.2198, "num_input_tokens_seen": 120034448, "step": 55630 }, { "epoch": 9.075856443719413, "grad_norm": 0.39191916584968567, "learning_rate": 3.3239410067448366e-05, "loss": 0.1719, "num_input_tokens_seen": 120046096, "step": 55635 }, { "epoch": 9.076672104404567, "grad_norm": 0.4574274718761444, "learning_rate": 3.3236049837556226e-05, "loss": 0.0419, "num_input_tokens_seen": 120055344, "step": 55640 }, { "epoch": 9.077487765089723, "grad_norm": 0.20702895522117615, "learning_rate": 3.32326894407501e-05, "loss": 0.0893, "num_input_tokens_seen": 120065808, "step": 55645 }, { "epoch": 9.078303425774878, "grad_norm": 0.10205117613077164, "learning_rate": 3.3229328877098064e-05, "loss": 0.0566, "num_input_tokens_seen": 120075472, "step": 55650 }, { "epoch": 9.079119086460032, "grad_norm": 0.11952735483646393, "learning_rate": 3.322596814666824e-05, "loss": 0.1112, "num_input_tokens_seen": 120085072, "step": 55655 }, { "epoch": 9.079934747145188, "grad_norm": 0.1970566362142563, "learning_rate": 3.3222607249528746e-05, "loss": 0.1141, "num_input_tokens_seen": 120096304, "step": 55660 }, { "epoch": 9.080750407830342, "grad_norm": 0.16820785403251648, "learning_rate": 3.321924618574768e-05, "loss": 0.0195, "num_input_tokens_seen": 120105968, "step": 55665 }, { "epoch": 9.081566068515498, "grad_norm": 0.04818108677864075, "learning_rate": 3.321588495539316e-05, "loss": 0.0959, "num_input_tokens_seen": 120117360, "step": 55670 }, { "epoch": 9.082381729200652, "grad_norm": 0.8781956434249878, "learning_rate": 3.321252355853331e-05, "loss": 0.1094, "num_input_tokens_seen": 120128464, "step": 55675 }, { "epoch": 9.083197389885807, "grad_norm": 0.22449566423892975, "learning_rate": 3.3209161995236256e-05, "loss": 0.0675, "num_input_tokens_seen": 120139440, "step": 55680 }, { "epoch": 9.084013050570963, "grad_norm": 0.7085325121879578, "learning_rate": 3.320580026557011e-05, "loss": 0.0986, "num_input_tokens_seen": 120149392, "step": 55685 }, { "epoch": 9.084828711256117, "grad_norm": 1.9459774494171143, "learning_rate": 3.3202438369603026e-05, "loss": 0.2191, "num_input_tokens_seen": 120160848, "step": 55690 }, { "epoch": 9.085644371941273, "grad_norm": 0.21416564285755157, "learning_rate": 3.319907630740311e-05, "loss": 0.0187, "num_input_tokens_seen": 120172368, "step": 55695 }, { "epoch": 9.086460032626427, "grad_norm": 0.10219063609838486, "learning_rate": 3.319571407903852e-05, "loss": 0.0854, "num_input_tokens_seen": 120183184, "step": 55700 }, { "epoch": 9.087275693311582, "grad_norm": 0.25086426734924316, "learning_rate": 3.3192351684577385e-05, "loss": 0.0968, "num_input_tokens_seen": 120194480, "step": 55705 }, { "epoch": 9.088091353996738, "grad_norm": 0.11156244575977325, "learning_rate": 3.318898912408785e-05, "loss": 0.1624, "num_input_tokens_seen": 120205616, "step": 55710 }, { "epoch": 9.088907014681892, "grad_norm": 0.4955519735813141, "learning_rate": 3.318562639763805e-05, "loss": 0.1454, "num_input_tokens_seen": 120216496, "step": 55715 }, { "epoch": 9.089722675367048, "grad_norm": 2.003920793533325, "learning_rate": 3.3182263505296164e-05, "loss": 0.2631, "num_input_tokens_seen": 120226896, "step": 55720 }, { "epoch": 9.090538336052202, "grad_norm": 0.5837907791137695, "learning_rate": 3.3178900447130324e-05, "loss": 0.0932, "num_input_tokens_seen": 120236400, "step": 55725 }, { "epoch": 9.091353996737357, "grad_norm": 0.4552067816257477, "learning_rate": 3.317553722320869e-05, "loss": 0.0336, "num_input_tokens_seen": 120247920, "step": 55730 }, { "epoch": 9.092169657422513, "grad_norm": 0.06568125635385513, "learning_rate": 3.317217383359942e-05, "loss": 0.0571, "num_input_tokens_seen": 120258512, "step": 55735 }, { "epoch": 9.092985318107667, "grad_norm": 1.7361172437667847, "learning_rate": 3.3168810278370684e-05, "loss": 0.0678, "num_input_tokens_seen": 120270032, "step": 55740 }, { "epoch": 9.093800978792823, "grad_norm": 0.08322347700595856, "learning_rate": 3.316544655759064e-05, "loss": 0.0175, "num_input_tokens_seen": 120280816, "step": 55745 }, { "epoch": 9.094616639477977, "grad_norm": 0.22181957960128784, "learning_rate": 3.3162082671327465e-05, "loss": 0.1339, "num_input_tokens_seen": 120291664, "step": 55750 }, { "epoch": 9.095432300163132, "grad_norm": 1.1285758018493652, "learning_rate": 3.315871861964933e-05, "loss": 0.1065, "num_input_tokens_seen": 120302320, "step": 55755 }, { "epoch": 9.096247960848286, "grad_norm": 0.13503572344779968, "learning_rate": 3.315535440262442e-05, "loss": 0.0738, "num_input_tokens_seen": 120312688, "step": 55760 }, { "epoch": 9.097063621533442, "grad_norm": 0.6403428316116333, "learning_rate": 3.31519900203209e-05, "loss": 0.0769, "num_input_tokens_seen": 120322928, "step": 55765 }, { "epoch": 9.097879282218598, "grad_norm": 0.17461571097373962, "learning_rate": 3.314862547280696e-05, "loss": 0.0959, "num_input_tokens_seen": 120332080, "step": 55770 }, { "epoch": 9.098694942903752, "grad_norm": 0.10411927849054337, "learning_rate": 3.3145260760150785e-05, "loss": 0.159, "num_input_tokens_seen": 120344400, "step": 55775 }, { "epoch": 9.099510603588907, "grad_norm": 1.1721047163009644, "learning_rate": 3.3141895882420576e-05, "loss": 0.1366, "num_input_tokens_seen": 120355344, "step": 55780 }, { "epoch": 9.100326264274061, "grad_norm": 0.20494787395000458, "learning_rate": 3.313853083968451e-05, "loss": 0.1104, "num_input_tokens_seen": 120366640, "step": 55785 }, { "epoch": 9.101141924959217, "grad_norm": 0.07962337881326675, "learning_rate": 3.313516563201079e-05, "loss": 0.1091, "num_input_tokens_seen": 120377584, "step": 55790 }, { "epoch": 9.101957585644373, "grad_norm": 0.04399466887116432, "learning_rate": 3.313180025946763e-05, "loss": 0.0198, "num_input_tokens_seen": 120388368, "step": 55795 }, { "epoch": 9.102773246329527, "grad_norm": 0.0867219790816307, "learning_rate": 3.3128434722123214e-05, "loss": 0.0384, "num_input_tokens_seen": 120399312, "step": 55800 }, { "epoch": 9.103588907014682, "grad_norm": 1.7354813814163208, "learning_rate": 3.312506902004576e-05, "loss": 0.2294, "num_input_tokens_seen": 120409616, "step": 55805 }, { "epoch": 9.104404567699836, "grad_norm": 0.7096068263053894, "learning_rate": 3.3121703153303477e-05, "loss": 0.2202, "num_input_tokens_seen": 120421424, "step": 55810 }, { "epoch": 9.105220228384992, "grad_norm": 1.045236349105835, "learning_rate": 3.311833712196457e-05, "loss": 0.1024, "num_input_tokens_seen": 120431088, "step": 55815 }, { "epoch": 9.106035889070148, "grad_norm": 0.1486205756664276, "learning_rate": 3.311497092609727e-05, "loss": 0.0126, "num_input_tokens_seen": 120442320, "step": 55820 }, { "epoch": 9.106851549755302, "grad_norm": 1.4170244932174683, "learning_rate": 3.311160456576978e-05, "loss": 0.1235, "num_input_tokens_seen": 120453104, "step": 55825 }, { "epoch": 9.107667210440457, "grad_norm": 0.0437014102935791, "learning_rate": 3.310823804105034e-05, "loss": 0.0601, "num_input_tokens_seen": 120463536, "step": 55830 }, { "epoch": 9.108482871125611, "grad_norm": 0.6444684267044067, "learning_rate": 3.310487135200717e-05, "loss": 0.1121, "num_input_tokens_seen": 120474672, "step": 55835 }, { "epoch": 9.109298531810767, "grad_norm": 0.9880913496017456, "learning_rate": 3.31015044987085e-05, "loss": 0.0495, "num_input_tokens_seen": 120484208, "step": 55840 }, { "epoch": 9.11011419249592, "grad_norm": 1.1773916482925415, "learning_rate": 3.309813748122256e-05, "loss": 0.1371, "num_input_tokens_seen": 120494896, "step": 55845 }, { "epoch": 9.110929853181077, "grad_norm": 1.6060909032821655, "learning_rate": 3.309477029961759e-05, "loss": 0.0708, "num_input_tokens_seen": 120505936, "step": 55850 }, { "epoch": 9.111745513866232, "grad_norm": 0.1794084757566452, "learning_rate": 3.3091402953961836e-05, "loss": 0.0134, "num_input_tokens_seen": 120516944, "step": 55855 }, { "epoch": 9.112561174551386, "grad_norm": 2.557543992996216, "learning_rate": 3.3088035444323534e-05, "loss": 0.0648, "num_input_tokens_seen": 120527344, "step": 55860 }, { "epoch": 9.113376835236542, "grad_norm": 0.29075533151626587, "learning_rate": 3.3084667770770934e-05, "loss": 0.0389, "num_input_tokens_seen": 120537520, "step": 55865 }, { "epoch": 9.114192495921696, "grad_norm": 0.16239969432353973, "learning_rate": 3.3081299933372285e-05, "loss": 0.104, "num_input_tokens_seen": 120549072, "step": 55870 }, { "epoch": 9.115008156606851, "grad_norm": 2.1543679237365723, "learning_rate": 3.3077931932195844e-05, "loss": 0.1298, "num_input_tokens_seen": 120559888, "step": 55875 }, { "epoch": 9.115823817292007, "grad_norm": 0.11480921506881714, "learning_rate": 3.307456376730986e-05, "loss": 0.117, "num_input_tokens_seen": 120571504, "step": 55880 }, { "epoch": 9.116639477977161, "grad_norm": 1.1951342821121216, "learning_rate": 3.30711954387826e-05, "loss": 0.1743, "num_input_tokens_seen": 120581456, "step": 55885 }, { "epoch": 9.117455138662317, "grad_norm": 0.3735257387161255, "learning_rate": 3.306782694668233e-05, "loss": 0.1235, "num_input_tokens_seen": 120592080, "step": 55890 }, { "epoch": 9.11827079934747, "grad_norm": 0.13866913318634033, "learning_rate": 3.30644582910773e-05, "loss": 0.0439, "num_input_tokens_seen": 120602128, "step": 55895 }, { "epoch": 9.119086460032626, "grad_norm": 0.7178953886032104, "learning_rate": 3.30610894720358e-05, "loss": 0.0515, "num_input_tokens_seen": 120614352, "step": 55900 }, { "epoch": 9.119902120717782, "grad_norm": 0.36642447113990784, "learning_rate": 3.3057720489626096e-05, "loss": 0.027, "num_input_tokens_seen": 120625424, "step": 55905 }, { "epoch": 9.120717781402936, "grad_norm": 1.471447229385376, "learning_rate": 3.3054351343916463e-05, "loss": 0.1165, "num_input_tokens_seen": 120635536, "step": 55910 }, { "epoch": 9.121533442088092, "grad_norm": 0.05878964811563492, "learning_rate": 3.3050982034975186e-05, "loss": 0.1646, "num_input_tokens_seen": 120646064, "step": 55915 }, { "epoch": 9.122349102773246, "grad_norm": 0.1144709512591362, "learning_rate": 3.304761256287054e-05, "loss": 0.013, "num_input_tokens_seen": 120657072, "step": 55920 }, { "epoch": 9.123164763458401, "grad_norm": 0.9718388319015503, "learning_rate": 3.304424292767082e-05, "loss": 0.1358, "num_input_tokens_seen": 120668816, "step": 55925 }, { "epoch": 9.123980424143557, "grad_norm": 0.3017953932285309, "learning_rate": 3.304087312944431e-05, "loss": 0.088, "num_input_tokens_seen": 120679248, "step": 55930 }, { "epoch": 9.124796084828711, "grad_norm": 0.298222154378891, "learning_rate": 3.3037503168259305e-05, "loss": 0.0347, "num_input_tokens_seen": 120690096, "step": 55935 }, { "epoch": 9.125611745513867, "grad_norm": 1.038281798362732, "learning_rate": 3.3034133044184105e-05, "loss": 0.0584, "num_input_tokens_seen": 120702352, "step": 55940 }, { "epoch": 9.12642740619902, "grad_norm": 0.1479915976524353, "learning_rate": 3.3030762757287006e-05, "loss": 0.1317, "num_input_tokens_seen": 120713072, "step": 55945 }, { "epoch": 9.127243066884176, "grad_norm": 0.06145327538251877, "learning_rate": 3.302739230763631e-05, "loss": 0.0765, "num_input_tokens_seen": 120725200, "step": 55950 }, { "epoch": 9.12805872756933, "grad_norm": 2.454322099685669, "learning_rate": 3.302402169530032e-05, "loss": 0.1411, "num_input_tokens_seen": 120735696, "step": 55955 }, { "epoch": 9.128874388254486, "grad_norm": 1.2543485164642334, "learning_rate": 3.302065092034736e-05, "loss": 0.2419, "num_input_tokens_seen": 120747344, "step": 55960 }, { "epoch": 9.129690048939642, "grad_norm": 2.0800349712371826, "learning_rate": 3.301727998284573e-05, "loss": 0.1814, "num_input_tokens_seen": 120758416, "step": 55965 }, { "epoch": 9.130505709624796, "grad_norm": 2.1457161903381348, "learning_rate": 3.3013908882863757e-05, "loss": 0.2107, "num_input_tokens_seen": 120769552, "step": 55970 }, { "epoch": 9.131321370309951, "grad_norm": 0.3937300443649292, "learning_rate": 3.301053762046975e-05, "loss": 0.1987, "num_input_tokens_seen": 120780656, "step": 55975 }, { "epoch": 9.132137030995105, "grad_norm": 0.5181900858879089, "learning_rate": 3.300716619573204e-05, "loss": 0.09, "num_input_tokens_seen": 120792048, "step": 55980 }, { "epoch": 9.132952691680261, "grad_norm": 0.1987432837486267, "learning_rate": 3.300379460871894e-05, "loss": 0.1769, "num_input_tokens_seen": 120802224, "step": 55985 }, { "epoch": 9.133768352365417, "grad_norm": 0.36461442708969116, "learning_rate": 3.300042285949879e-05, "loss": 0.0852, "num_input_tokens_seen": 120812720, "step": 55990 }, { "epoch": 9.13458401305057, "grad_norm": 1.4296377897262573, "learning_rate": 3.299705094813992e-05, "loss": 0.1633, "num_input_tokens_seen": 120824880, "step": 55995 }, { "epoch": 9.135399673735726, "grad_norm": 0.07952619343996048, "learning_rate": 3.299367887471068e-05, "loss": 0.0859, "num_input_tokens_seen": 120835632, "step": 56000 }, { "epoch": 9.13621533442088, "grad_norm": 0.11043451726436615, "learning_rate": 3.2990306639279385e-05, "loss": 0.0314, "num_input_tokens_seen": 120847984, "step": 56005 }, { "epoch": 9.137030995106036, "grad_norm": 0.3928413391113281, "learning_rate": 3.298693424191439e-05, "loss": 0.2651, "num_input_tokens_seen": 120858832, "step": 56010 }, { "epoch": 9.137846655791192, "grad_norm": 0.6058120727539062, "learning_rate": 3.298356168268405e-05, "loss": 0.0994, "num_input_tokens_seen": 120869872, "step": 56015 }, { "epoch": 9.138662316476346, "grad_norm": 1.3131543397903442, "learning_rate": 3.2980188961656685e-05, "loss": 0.158, "num_input_tokens_seen": 120879920, "step": 56020 }, { "epoch": 9.139477977161501, "grad_norm": 0.03416287899017334, "learning_rate": 3.297681607890069e-05, "loss": 0.1551, "num_input_tokens_seen": 120891088, "step": 56025 }, { "epoch": 9.140293637846655, "grad_norm": 0.6741237044334412, "learning_rate": 3.297344303448439e-05, "loss": 0.0691, "num_input_tokens_seen": 120901968, "step": 56030 }, { "epoch": 9.141109298531811, "grad_norm": 2.7777953147888184, "learning_rate": 3.297006982847615e-05, "loss": 0.1488, "num_input_tokens_seen": 120912176, "step": 56035 }, { "epoch": 9.141924959216965, "grad_norm": 0.4439253509044647, "learning_rate": 3.296669646094433e-05, "loss": 0.0301, "num_input_tokens_seen": 120923216, "step": 56040 }, { "epoch": 9.14274061990212, "grad_norm": 0.7824525833129883, "learning_rate": 3.296332293195731e-05, "loss": 0.2, "num_input_tokens_seen": 120934448, "step": 56045 }, { "epoch": 9.143556280587276, "grad_norm": 0.5925036668777466, "learning_rate": 3.2959949241583436e-05, "loss": 0.0527, "num_input_tokens_seen": 120944880, "step": 56050 }, { "epoch": 9.14437194127243, "grad_norm": 1.7279399633407593, "learning_rate": 3.29565753898911e-05, "loss": 0.177, "num_input_tokens_seen": 120956304, "step": 56055 }, { "epoch": 9.145187601957586, "grad_norm": 0.9831217527389526, "learning_rate": 3.295320137694867e-05, "loss": 0.0784, "num_input_tokens_seen": 120966320, "step": 56060 }, { "epoch": 9.14600326264274, "grad_norm": 0.8716552257537842, "learning_rate": 3.294982720282453e-05, "loss": 0.0606, "num_input_tokens_seen": 120977648, "step": 56065 }, { "epoch": 9.146818923327896, "grad_norm": 1.376104712486267, "learning_rate": 3.294645286758705e-05, "loss": 0.1058, "num_input_tokens_seen": 120987728, "step": 56070 }, { "epoch": 9.147634584013051, "grad_norm": 1.1057698726654053, "learning_rate": 3.2943078371304624e-05, "loss": 0.0933, "num_input_tokens_seen": 120998640, "step": 56075 }, { "epoch": 9.148450244698205, "grad_norm": 0.06242941692471504, "learning_rate": 3.293970371404563e-05, "loss": 0.0947, "num_input_tokens_seen": 121008912, "step": 56080 }, { "epoch": 9.149265905383361, "grad_norm": 1.8161500692367554, "learning_rate": 3.293632889587848e-05, "loss": 0.1937, "num_input_tokens_seen": 121019184, "step": 56085 }, { "epoch": 9.150081566068515, "grad_norm": 1.326737880706787, "learning_rate": 3.2932953916871555e-05, "loss": 0.1421, "num_input_tokens_seen": 121031184, "step": 56090 }, { "epoch": 9.15089722675367, "grad_norm": 2.421316146850586, "learning_rate": 3.292957877709325e-05, "loss": 0.3195, "num_input_tokens_seen": 121042672, "step": 56095 }, { "epoch": 9.151712887438826, "grad_norm": 1.0152004957199097, "learning_rate": 3.292620347661198e-05, "loss": 0.2714, "num_input_tokens_seen": 121053520, "step": 56100 }, { "epoch": 9.15252854812398, "grad_norm": 0.1549215465784073, "learning_rate": 3.2922828015496135e-05, "loss": 0.0711, "num_input_tokens_seen": 121064688, "step": 56105 }, { "epoch": 9.153344208809136, "grad_norm": 0.4093683660030365, "learning_rate": 3.291945239381412e-05, "loss": 0.271, "num_input_tokens_seen": 121075088, "step": 56110 }, { "epoch": 9.15415986949429, "grad_norm": 0.1125531867146492, "learning_rate": 3.291607661163437e-05, "loss": 0.1255, "num_input_tokens_seen": 121085424, "step": 56115 }, { "epoch": 9.154975530179446, "grad_norm": 0.4408663511276245, "learning_rate": 3.291270066902528e-05, "loss": 0.0803, "num_input_tokens_seen": 121097072, "step": 56120 }, { "epoch": 9.1557911908646, "grad_norm": 0.8426562547683716, "learning_rate": 3.290932456605528e-05, "loss": 0.0749, "num_input_tokens_seen": 121108880, "step": 56125 }, { "epoch": 9.156606851549755, "grad_norm": 0.49615931510925293, "learning_rate": 3.2905948302792775e-05, "loss": 0.1073, "num_input_tokens_seen": 121120432, "step": 56130 }, { "epoch": 9.15742251223491, "grad_norm": 2.2897891998291016, "learning_rate": 3.2902571879306197e-05, "loss": 0.2964, "num_input_tokens_seen": 121131952, "step": 56135 }, { "epoch": 9.158238172920065, "grad_norm": 0.08132919669151306, "learning_rate": 3.2899195295663984e-05, "loss": 0.0533, "num_input_tokens_seen": 121140304, "step": 56140 }, { "epoch": 9.15905383360522, "grad_norm": 0.09535525739192963, "learning_rate": 3.2895818551934556e-05, "loss": 0.1172, "num_input_tokens_seen": 121150608, "step": 56145 }, { "epoch": 9.159869494290374, "grad_norm": 0.07328379154205322, "learning_rate": 3.289244164818634e-05, "loss": 0.0809, "num_input_tokens_seen": 121161296, "step": 56150 }, { "epoch": 9.16068515497553, "grad_norm": 0.1162865161895752, "learning_rate": 3.288906458448779e-05, "loss": 0.1296, "num_input_tokens_seen": 121172976, "step": 56155 }, { "epoch": 9.161500815660686, "grad_norm": 0.16069604456424713, "learning_rate": 3.288568736090733e-05, "loss": 0.0386, "num_input_tokens_seen": 121183024, "step": 56160 }, { "epoch": 9.16231647634584, "grad_norm": 1.666226863861084, "learning_rate": 3.288230997751341e-05, "loss": 0.1734, "num_input_tokens_seen": 121194032, "step": 56165 }, { "epoch": 9.163132137030995, "grad_norm": 0.5723041892051697, "learning_rate": 3.287893243437449e-05, "loss": 0.0462, "num_input_tokens_seen": 121205296, "step": 56170 }, { "epoch": 9.16394779771615, "grad_norm": 0.10655837506055832, "learning_rate": 3.287555473155901e-05, "loss": 0.0423, "num_input_tokens_seen": 121217168, "step": 56175 }, { "epoch": 9.164763458401305, "grad_norm": 1.9718843698501587, "learning_rate": 3.287217686913541e-05, "loss": 0.3592, "num_input_tokens_seen": 121228176, "step": 56180 }, { "epoch": 9.16557911908646, "grad_norm": 0.03423153981566429, "learning_rate": 3.2868798847172166e-05, "loss": 0.0307, "num_input_tokens_seen": 121237968, "step": 56185 }, { "epoch": 9.166394779771615, "grad_norm": 0.125861257314682, "learning_rate": 3.2865420665737736e-05, "loss": 0.0901, "num_input_tokens_seen": 121248048, "step": 56190 }, { "epoch": 9.16721044045677, "grad_norm": 0.22129938006401062, "learning_rate": 3.2862042324900574e-05, "loss": 0.0662, "num_input_tokens_seen": 121259088, "step": 56195 }, { "epoch": 9.168026101141924, "grad_norm": 0.07871201634407043, "learning_rate": 3.285866382472915e-05, "loss": 0.2442, "num_input_tokens_seen": 121270416, "step": 56200 }, { "epoch": 9.16884176182708, "grad_norm": 0.30046790838241577, "learning_rate": 3.285528516529193e-05, "loss": 0.0928, "num_input_tokens_seen": 121281424, "step": 56205 }, { "epoch": 9.169657422512234, "grad_norm": 1.3019740581512451, "learning_rate": 3.28519063466574e-05, "loss": 0.0999, "num_input_tokens_seen": 121292496, "step": 56210 }, { "epoch": 9.17047308319739, "grad_norm": 0.9653511643409729, "learning_rate": 3.284852736889402e-05, "loss": 0.048, "num_input_tokens_seen": 121303280, "step": 56215 }, { "epoch": 9.171288743882545, "grad_norm": 0.3487928509712219, "learning_rate": 3.2845148232070276e-05, "loss": 0.0551, "num_input_tokens_seen": 121315248, "step": 56220 }, { "epoch": 9.1721044045677, "grad_norm": 0.6552265882492065, "learning_rate": 3.2841768936254655e-05, "loss": 0.1233, "num_input_tokens_seen": 121326256, "step": 56225 }, { "epoch": 9.172920065252855, "grad_norm": 0.160771444439888, "learning_rate": 3.2838389481515636e-05, "loss": 0.2191, "num_input_tokens_seen": 121337264, "step": 56230 }, { "epoch": 9.173735725938009, "grad_norm": 0.3046984374523163, "learning_rate": 3.283500986792171e-05, "loss": 0.1315, "num_input_tokens_seen": 121347600, "step": 56235 }, { "epoch": 9.174551386623165, "grad_norm": 0.14217524230480194, "learning_rate": 3.2831630095541374e-05, "loss": 0.186, "num_input_tokens_seen": 121358800, "step": 56240 }, { "epoch": 9.17536704730832, "grad_norm": 0.2961280941963196, "learning_rate": 3.282825016444312e-05, "loss": 0.0311, "num_input_tokens_seen": 121369712, "step": 56245 }, { "epoch": 9.176182707993474, "grad_norm": 0.2338673174381256, "learning_rate": 3.282487007469544e-05, "loss": 0.1112, "num_input_tokens_seen": 121380784, "step": 56250 }, { "epoch": 9.17699836867863, "grad_norm": 0.24237555265426636, "learning_rate": 3.282148982636684e-05, "loss": 0.0582, "num_input_tokens_seen": 121392752, "step": 56255 }, { "epoch": 9.177814029363784, "grad_norm": 0.07405974715948105, "learning_rate": 3.281810941952583e-05, "loss": 0.1351, "num_input_tokens_seen": 121402928, "step": 56260 }, { "epoch": 9.17862969004894, "grad_norm": 0.05381287261843681, "learning_rate": 3.281472885424092e-05, "loss": 0.031, "num_input_tokens_seen": 121413008, "step": 56265 }, { "epoch": 9.179445350734095, "grad_norm": 0.15933585166931152, "learning_rate": 3.281134813058061e-05, "loss": 0.0997, "num_input_tokens_seen": 121422896, "step": 56270 }, { "epoch": 9.18026101141925, "grad_norm": 0.32339751720428467, "learning_rate": 3.280796724861342e-05, "loss": 0.1609, "num_input_tokens_seen": 121434512, "step": 56275 }, { "epoch": 9.181076672104405, "grad_norm": 0.4496016204357147, "learning_rate": 3.2804586208407865e-05, "loss": 0.2318, "num_input_tokens_seen": 121445840, "step": 56280 }, { "epoch": 9.181892332789559, "grad_norm": 0.19620941579341888, "learning_rate": 3.2801205010032476e-05, "loss": 0.1118, "num_input_tokens_seen": 121457264, "step": 56285 }, { "epoch": 9.182707993474715, "grad_norm": 0.20355428755283356, "learning_rate": 3.279782365355577e-05, "loss": 0.0134, "num_input_tokens_seen": 121467728, "step": 56290 }, { "epoch": 9.18352365415987, "grad_norm": 0.05427641049027443, "learning_rate": 3.279444213904628e-05, "loss": 0.1848, "num_input_tokens_seen": 121478608, "step": 56295 }, { "epoch": 9.184339314845024, "grad_norm": 0.7520946264266968, "learning_rate": 3.279106046657252e-05, "loss": 0.0645, "num_input_tokens_seen": 121489488, "step": 56300 }, { "epoch": 9.18515497553018, "grad_norm": 0.13647854328155518, "learning_rate": 3.278767863620304e-05, "loss": 0.1503, "num_input_tokens_seen": 121500400, "step": 56305 }, { "epoch": 9.185970636215334, "grad_norm": 0.16512931883335114, "learning_rate": 3.2784296648006374e-05, "loss": 0.1103, "num_input_tokens_seen": 121511088, "step": 56310 }, { "epoch": 9.18678629690049, "grad_norm": 0.321087121963501, "learning_rate": 3.2780914502051066e-05, "loss": 0.1435, "num_input_tokens_seen": 121523088, "step": 56315 }, { "epoch": 9.187601957585644, "grad_norm": 0.05254685878753662, "learning_rate": 3.2777532198405654e-05, "loss": 0.1041, "num_input_tokens_seen": 121533808, "step": 56320 }, { "epoch": 9.1884176182708, "grad_norm": 0.8439064025878906, "learning_rate": 3.2774149737138685e-05, "loss": 0.1289, "num_input_tokens_seen": 121544240, "step": 56325 }, { "epoch": 9.189233278955955, "grad_norm": 1.4545472860336304, "learning_rate": 3.277076711831871e-05, "loss": 0.2825, "num_input_tokens_seen": 121554992, "step": 56330 }, { "epoch": 9.190048939641109, "grad_norm": 0.1018960028886795, "learning_rate": 3.2767384342014276e-05, "loss": 0.1532, "num_input_tokens_seen": 121565424, "step": 56335 }, { "epoch": 9.190864600326265, "grad_norm": 0.3384646475315094, "learning_rate": 3.2764001408293946e-05, "loss": 0.1632, "num_input_tokens_seen": 121575504, "step": 56340 }, { "epoch": 9.191680261011419, "grad_norm": 0.5258664488792419, "learning_rate": 3.2760618317226275e-05, "loss": 0.0241, "num_input_tokens_seen": 121586320, "step": 56345 }, { "epoch": 9.192495921696574, "grad_norm": 0.17811541259288788, "learning_rate": 3.275723506887984e-05, "loss": 0.185, "num_input_tokens_seen": 121596976, "step": 56350 }, { "epoch": 9.19331158238173, "grad_norm": 0.12403548508882523, "learning_rate": 3.275385166332319e-05, "loss": 0.0847, "num_input_tokens_seen": 121607344, "step": 56355 }, { "epoch": 9.194127243066884, "grad_norm": 0.18279241025447845, "learning_rate": 3.2750468100624895e-05, "loss": 0.1127, "num_input_tokens_seen": 121617424, "step": 56360 }, { "epoch": 9.19494290375204, "grad_norm": 1.6205097436904907, "learning_rate": 3.274708438085354e-05, "loss": 0.1044, "num_input_tokens_seen": 121629744, "step": 56365 }, { "epoch": 9.195758564437194, "grad_norm": 0.752716064453125, "learning_rate": 3.2743700504077685e-05, "loss": 0.2006, "num_input_tokens_seen": 121641008, "step": 56370 }, { "epoch": 9.19657422512235, "grad_norm": 0.37465426325798035, "learning_rate": 3.274031647036592e-05, "loss": 0.0704, "num_input_tokens_seen": 121652528, "step": 56375 }, { "epoch": 9.197389885807505, "grad_norm": 1.3847506046295166, "learning_rate": 3.2736932279786816e-05, "loss": 0.1146, "num_input_tokens_seen": 121663856, "step": 56380 }, { "epoch": 9.198205546492659, "grad_norm": 1.8987882137298584, "learning_rate": 3.273354793240897e-05, "loss": 0.1076, "num_input_tokens_seen": 121676112, "step": 56385 }, { "epoch": 9.199021207177815, "grad_norm": 0.12102043628692627, "learning_rate": 3.273016342830096e-05, "loss": 0.1417, "num_input_tokens_seen": 121688400, "step": 56390 }, { "epoch": 9.199836867862969, "grad_norm": 1.8574997186660767, "learning_rate": 3.27267787675314e-05, "loss": 0.126, "num_input_tokens_seen": 121698928, "step": 56395 }, { "epoch": 9.200652528548124, "grad_norm": 1.5735622644424438, "learning_rate": 3.2723393950168844e-05, "loss": 0.2257, "num_input_tokens_seen": 121708496, "step": 56400 }, { "epoch": 9.201468189233278, "grad_norm": 1.0195099115371704, "learning_rate": 3.272000897628192e-05, "loss": 0.0763, "num_input_tokens_seen": 121720240, "step": 56405 }, { "epoch": 9.202283849918434, "grad_norm": 0.3926168382167816, "learning_rate": 3.271662384593922e-05, "loss": 0.0743, "num_input_tokens_seen": 121732080, "step": 56410 }, { "epoch": 9.20309951060359, "grad_norm": 0.5194308161735535, "learning_rate": 3.271323855920935e-05, "loss": 0.1301, "num_input_tokens_seen": 121742096, "step": 56415 }, { "epoch": 9.203915171288743, "grad_norm": 0.1964164823293686, "learning_rate": 3.2709853116160914e-05, "loss": 0.0374, "num_input_tokens_seen": 121752496, "step": 56420 }, { "epoch": 9.2047308319739, "grad_norm": 0.542054295539856, "learning_rate": 3.2706467516862526e-05, "loss": 0.1749, "num_input_tokens_seen": 121763024, "step": 56425 }, { "epoch": 9.205546492659053, "grad_norm": 0.07136756926774979, "learning_rate": 3.2703081761382795e-05, "loss": 0.1149, "num_input_tokens_seen": 121774352, "step": 56430 }, { "epoch": 9.206362153344209, "grad_norm": 0.03528943657875061, "learning_rate": 3.269969584979033e-05, "loss": 0.0637, "num_input_tokens_seen": 121785296, "step": 56435 }, { "epoch": 9.207177814029365, "grad_norm": 0.4168407618999481, "learning_rate": 3.269630978215378e-05, "loss": 0.1909, "num_input_tokens_seen": 121796752, "step": 56440 }, { "epoch": 9.207993474714518, "grad_norm": 1.3250657320022583, "learning_rate": 3.269292355854174e-05, "loss": 0.0964, "num_input_tokens_seen": 121807440, "step": 56445 }, { "epoch": 9.208809135399674, "grad_norm": 0.05168815702199936, "learning_rate": 3.268953717902285e-05, "loss": 0.0286, "num_input_tokens_seen": 121818640, "step": 56450 }, { "epoch": 9.209624796084828, "grad_norm": 0.026960410177707672, "learning_rate": 3.2686150643665726e-05, "loss": 0.0831, "num_input_tokens_seen": 121828560, "step": 56455 }, { "epoch": 9.210440456769984, "grad_norm": 0.9565767645835876, "learning_rate": 3.268276395253901e-05, "loss": 0.1187, "num_input_tokens_seen": 121839088, "step": 56460 }, { "epoch": 9.21125611745514, "grad_norm": 0.456496924161911, "learning_rate": 3.267937710571134e-05, "loss": 0.0198, "num_input_tokens_seen": 121849744, "step": 56465 }, { "epoch": 9.212071778140293, "grad_norm": 0.04420676827430725, "learning_rate": 3.267599010325135e-05, "loss": 0.2087, "num_input_tokens_seen": 121859696, "step": 56470 }, { "epoch": 9.21288743882545, "grad_norm": 0.9906243681907654, "learning_rate": 3.267260294522768e-05, "loss": 0.0406, "num_input_tokens_seen": 121869840, "step": 56475 }, { "epoch": 9.213703099510603, "grad_norm": 0.9472156167030334, "learning_rate": 3.2669215631708976e-05, "loss": 0.0377, "num_input_tokens_seen": 121880432, "step": 56480 }, { "epoch": 9.214518760195759, "grad_norm": 0.45641908049583435, "learning_rate": 3.266582816276389e-05, "loss": 0.0859, "num_input_tokens_seen": 121889264, "step": 56485 }, { "epoch": 9.215334420880913, "grad_norm": 1.2131414413452148, "learning_rate": 3.266244053846108e-05, "loss": 0.075, "num_input_tokens_seen": 121900016, "step": 56490 }, { "epoch": 9.216150081566068, "grad_norm": 0.23531164228916168, "learning_rate": 3.265905275886918e-05, "loss": 0.0605, "num_input_tokens_seen": 121911312, "step": 56495 }, { "epoch": 9.216965742251224, "grad_norm": 0.1523832529783249, "learning_rate": 3.265566482405687e-05, "loss": 0.0292, "num_input_tokens_seen": 121923088, "step": 56500 }, { "epoch": 9.217781402936378, "grad_norm": 1.5216038227081299, "learning_rate": 3.265227673409279e-05, "loss": 0.1665, "num_input_tokens_seen": 121932720, "step": 56505 }, { "epoch": 9.218597063621534, "grad_norm": 1.333785891532898, "learning_rate": 3.2648888489045625e-05, "loss": 0.0781, "num_input_tokens_seen": 121943248, "step": 56510 }, { "epoch": 9.219412724306688, "grad_norm": 1.018165946006775, "learning_rate": 3.264550008898403e-05, "loss": 0.1833, "num_input_tokens_seen": 121953712, "step": 56515 }, { "epoch": 9.220228384991843, "grad_norm": 0.38709378242492676, "learning_rate": 3.264211153397667e-05, "loss": 0.0252, "num_input_tokens_seen": 121965456, "step": 56520 }, { "epoch": 9.221044045676999, "grad_norm": 0.13664470613002777, "learning_rate": 3.263872282409223e-05, "loss": 0.0447, "num_input_tokens_seen": 121975120, "step": 56525 }, { "epoch": 9.221859706362153, "grad_norm": 1.2342332601547241, "learning_rate": 3.2635333959399375e-05, "loss": 0.1481, "num_input_tokens_seen": 121987344, "step": 56530 }, { "epoch": 9.222675367047309, "grad_norm": 0.6969313025474548, "learning_rate": 3.26319449399668e-05, "loss": 0.06, "num_input_tokens_seen": 121998448, "step": 56535 }, { "epoch": 9.223491027732463, "grad_norm": 0.05996856838464737, "learning_rate": 3.2628555765863174e-05, "loss": 0.2231, "num_input_tokens_seen": 122009104, "step": 56540 }, { "epoch": 9.224306688417618, "grad_norm": 1.7525697946548462, "learning_rate": 3.262516643715718e-05, "loss": 0.169, "num_input_tokens_seen": 122019952, "step": 56545 }, { "epoch": 9.225122349102774, "grad_norm": 0.15650948882102966, "learning_rate": 3.2621776953917535e-05, "loss": 0.0971, "num_input_tokens_seen": 122031280, "step": 56550 }, { "epoch": 9.225938009787928, "grad_norm": 0.13138525187969208, "learning_rate": 3.2618387316212894e-05, "loss": 0.0152, "num_input_tokens_seen": 122041584, "step": 56555 }, { "epoch": 9.226753670473084, "grad_norm": 1.1746917963027954, "learning_rate": 3.261499752411198e-05, "loss": 0.0783, "num_input_tokens_seen": 122051312, "step": 56560 }, { "epoch": 9.227569331158238, "grad_norm": 1.5830081701278687, "learning_rate": 3.2611607577683474e-05, "loss": 0.2235, "num_input_tokens_seen": 122061552, "step": 56565 }, { "epoch": 9.228384991843393, "grad_norm": 0.11935564875602722, "learning_rate": 3.260821747699609e-05, "loss": 0.1401, "num_input_tokens_seen": 122071184, "step": 56570 }, { "epoch": 9.229200652528547, "grad_norm": 0.8755420446395874, "learning_rate": 3.2604827222118524e-05, "loss": 0.0931, "num_input_tokens_seen": 122083344, "step": 56575 }, { "epoch": 9.230016313213703, "grad_norm": 0.3278210759162903, "learning_rate": 3.2601436813119484e-05, "loss": 0.0137, "num_input_tokens_seen": 122094832, "step": 56580 }, { "epoch": 9.230831973898859, "grad_norm": 0.08601387590169907, "learning_rate": 3.259804625006769e-05, "loss": 0.1054, "num_input_tokens_seen": 122106032, "step": 56585 }, { "epoch": 9.231647634584013, "grad_norm": 1.7705743312835693, "learning_rate": 3.259465553303185e-05, "loss": 0.1418, "num_input_tokens_seen": 122116304, "step": 56590 }, { "epoch": 9.232463295269168, "grad_norm": 0.20623497664928436, "learning_rate": 3.259126466208067e-05, "loss": 0.0705, "num_input_tokens_seen": 122128176, "step": 56595 }, { "epoch": 9.233278955954322, "grad_norm": 0.032014910131692886, "learning_rate": 3.2587873637282896e-05, "loss": 0.1246, "num_input_tokens_seen": 122139696, "step": 56600 }, { "epoch": 9.234094616639478, "grad_norm": 1.1661996841430664, "learning_rate": 3.258448245870723e-05, "loss": 0.3207, "num_input_tokens_seen": 122151024, "step": 56605 }, { "epoch": 9.234910277324634, "grad_norm": 0.04090098664164543, "learning_rate": 3.2581091126422406e-05, "loss": 0.1027, "num_input_tokens_seen": 122161648, "step": 56610 }, { "epoch": 9.235725938009788, "grad_norm": 0.11047308146953583, "learning_rate": 3.257769964049715e-05, "loss": 0.086, "num_input_tokens_seen": 122172656, "step": 56615 }, { "epoch": 9.236541598694943, "grad_norm": 0.10580724477767944, "learning_rate": 3.2574308001000196e-05, "loss": 0.1576, "num_input_tokens_seen": 122183056, "step": 56620 }, { "epoch": 9.237357259380097, "grad_norm": 0.6362166404724121, "learning_rate": 3.2570916208000284e-05, "loss": 0.0544, "num_input_tokens_seen": 122194224, "step": 56625 }, { "epoch": 9.238172920065253, "grad_norm": 0.04750592261552811, "learning_rate": 3.2567524261566154e-05, "loss": 0.0818, "num_input_tokens_seen": 122204784, "step": 56630 }, { "epoch": 9.238988580750409, "grad_norm": 0.053832244127988815, "learning_rate": 3.2564132161766545e-05, "loss": 0.0137, "num_input_tokens_seen": 122215280, "step": 56635 }, { "epoch": 9.239804241435563, "grad_norm": 1.3599514961242676, "learning_rate": 3.25607399086702e-05, "loss": 0.1605, "num_input_tokens_seen": 122226128, "step": 56640 }, { "epoch": 9.240619902120718, "grad_norm": 1.9251784086227417, "learning_rate": 3.255734750234587e-05, "loss": 0.1646, "num_input_tokens_seen": 122237552, "step": 56645 }, { "epoch": 9.241435562805872, "grad_norm": 0.5709407925605774, "learning_rate": 3.25539549428623e-05, "loss": 0.2049, "num_input_tokens_seen": 122248880, "step": 56650 }, { "epoch": 9.242251223491028, "grad_norm": 1.4833379983901978, "learning_rate": 3.255056223028826e-05, "loss": 0.1784, "num_input_tokens_seen": 122259760, "step": 56655 }, { "epoch": 9.243066884176184, "grad_norm": 1.3695124387741089, "learning_rate": 3.254716936469249e-05, "loss": 0.1227, "num_input_tokens_seen": 122271120, "step": 56660 }, { "epoch": 9.243882544861338, "grad_norm": 0.23310406506061554, "learning_rate": 3.254377634614376e-05, "loss": 0.0929, "num_input_tokens_seen": 122281424, "step": 56665 }, { "epoch": 9.244698205546493, "grad_norm": 0.7338806390762329, "learning_rate": 3.2540383174710836e-05, "loss": 0.0342, "num_input_tokens_seen": 122292784, "step": 56670 }, { "epoch": 9.245513866231647, "grad_norm": 0.12634995579719543, "learning_rate": 3.253698985046248e-05, "loss": 0.0721, "num_input_tokens_seen": 122302992, "step": 56675 }, { "epoch": 9.246329526916803, "grad_norm": 1.2912042140960693, "learning_rate": 3.2533596373467464e-05, "loss": 0.0607, "num_input_tokens_seen": 122314320, "step": 56680 }, { "epoch": 9.247145187601957, "grad_norm": 0.0801251009106636, "learning_rate": 3.253020274379456e-05, "loss": 0.0085, "num_input_tokens_seen": 122324848, "step": 56685 }, { "epoch": 9.247960848287113, "grad_norm": 0.059131868183612823, "learning_rate": 3.2526808961512544e-05, "loss": 0.0438, "num_input_tokens_seen": 122335760, "step": 56690 }, { "epoch": 9.248776508972268, "grad_norm": 0.27345263957977295, "learning_rate": 3.252341502669019e-05, "loss": 0.1575, "num_input_tokens_seen": 122346416, "step": 56695 }, { "epoch": 9.249592169657422, "grad_norm": 0.15503764152526855, "learning_rate": 3.2520020939396295e-05, "loss": 0.1303, "num_input_tokens_seen": 122358608, "step": 56700 }, { "epoch": 9.250407830342578, "grad_norm": 0.7014792561531067, "learning_rate": 3.2516626699699634e-05, "loss": 0.042, "num_input_tokens_seen": 122369264, "step": 56705 }, { "epoch": 9.251223491027732, "grad_norm": 2.3544795513153076, "learning_rate": 3.2513232307669e-05, "loss": 0.3432, "num_input_tokens_seen": 122380528, "step": 56710 }, { "epoch": 9.252039151712887, "grad_norm": 0.2400657683610916, "learning_rate": 3.2509837763373176e-05, "loss": 0.0241, "num_input_tokens_seen": 122391856, "step": 56715 }, { "epoch": 9.252854812398043, "grad_norm": 0.2326628565788269, "learning_rate": 3.2506443066880965e-05, "loss": 0.0324, "num_input_tokens_seen": 122402672, "step": 56720 }, { "epoch": 9.253670473083197, "grad_norm": 2.4974701404571533, "learning_rate": 3.250304821826117e-05, "loss": 0.1955, "num_input_tokens_seen": 122412368, "step": 56725 }, { "epoch": 9.254486133768353, "grad_norm": 0.9025511145591736, "learning_rate": 3.249965321758257e-05, "loss": 0.0859, "num_input_tokens_seen": 122423248, "step": 56730 }, { "epoch": 9.255301794453507, "grad_norm": 0.32504209876060486, "learning_rate": 3.249625806491399e-05, "loss": 0.0665, "num_input_tokens_seen": 122433040, "step": 56735 }, { "epoch": 9.256117455138662, "grad_norm": 0.06260944902896881, "learning_rate": 3.2492862760324236e-05, "loss": 0.0361, "num_input_tokens_seen": 122442928, "step": 56740 }, { "epoch": 9.256933115823816, "grad_norm": 0.3175083100795746, "learning_rate": 3.248946730388211e-05, "loss": 0.0632, "num_input_tokens_seen": 122453488, "step": 56745 }, { "epoch": 9.257748776508972, "grad_norm": 0.05143755301833153, "learning_rate": 3.2486071695656424e-05, "loss": 0.0152, "num_input_tokens_seen": 122463632, "step": 56750 }, { "epoch": 9.258564437194128, "grad_norm": 0.4935646951198578, "learning_rate": 3.2482675935716e-05, "loss": 0.0242, "num_input_tokens_seen": 122474736, "step": 56755 }, { "epoch": 9.259380097879282, "grad_norm": 0.05573234707117081, "learning_rate": 3.2479280024129656e-05, "loss": 0.0367, "num_input_tokens_seen": 122485776, "step": 56760 }, { "epoch": 9.260195758564437, "grad_norm": 0.06272690743207932, "learning_rate": 3.2475883960966216e-05, "loss": 0.018, "num_input_tokens_seen": 122496176, "step": 56765 }, { "epoch": 9.261011419249591, "grad_norm": 0.019306141883134842, "learning_rate": 3.24724877462945e-05, "loss": 0.0195, "num_input_tokens_seen": 122507760, "step": 56770 }, { "epoch": 9.261827079934747, "grad_norm": 1.6236945390701294, "learning_rate": 3.2469091380183345e-05, "loss": 0.1201, "num_input_tokens_seen": 122518768, "step": 56775 }, { "epoch": 9.262642740619903, "grad_norm": 2.2391672134399414, "learning_rate": 3.246569486270158e-05, "loss": 0.0552, "num_input_tokens_seen": 122531216, "step": 56780 }, { "epoch": 9.263458401305057, "grad_norm": 1.1195038557052612, "learning_rate": 3.2462298193918026e-05, "loss": 0.0973, "num_input_tokens_seen": 122541552, "step": 56785 }, { "epoch": 9.264274061990212, "grad_norm": 0.5447977781295776, "learning_rate": 3.2458901373901544e-05, "loss": 0.1058, "num_input_tokens_seen": 122553712, "step": 56790 }, { "epoch": 9.265089722675366, "grad_norm": 0.4922010004520416, "learning_rate": 3.245550440272095e-05, "loss": 0.1147, "num_input_tokens_seen": 122564976, "step": 56795 }, { "epoch": 9.265905383360522, "grad_norm": 0.27328601479530334, "learning_rate": 3.2452107280445114e-05, "loss": 0.2169, "num_input_tokens_seen": 122576272, "step": 56800 }, { "epoch": 9.266721044045678, "grad_norm": 1.5867213010787964, "learning_rate": 3.244871000714287e-05, "loss": 0.1686, "num_input_tokens_seen": 122586576, "step": 56805 }, { "epoch": 9.267536704730832, "grad_norm": 0.3434564769268036, "learning_rate": 3.2445312582883065e-05, "loss": 0.0363, "num_input_tokens_seen": 122598000, "step": 56810 }, { "epoch": 9.268352365415987, "grad_norm": 0.6582960486412048, "learning_rate": 3.2441915007734556e-05, "loss": 0.1099, "num_input_tokens_seen": 122606896, "step": 56815 }, { "epoch": 9.269168026101141, "grad_norm": 1.4916068315505981, "learning_rate": 3.24385172817662e-05, "loss": 0.0411, "num_input_tokens_seen": 122617200, "step": 56820 }, { "epoch": 9.269983686786297, "grad_norm": 1.6646080017089844, "learning_rate": 3.2435119405046855e-05, "loss": 0.2973, "num_input_tokens_seen": 122626512, "step": 56825 }, { "epoch": 9.270799347471453, "grad_norm": 0.22659868001937866, "learning_rate": 3.243172137764538e-05, "loss": 0.1644, "num_input_tokens_seen": 122636848, "step": 56830 }, { "epoch": 9.271615008156607, "grad_norm": 1.1426162719726562, "learning_rate": 3.242832319963064e-05, "loss": 0.133, "num_input_tokens_seen": 122646928, "step": 56835 }, { "epoch": 9.272430668841762, "grad_norm": 1.2471851110458374, "learning_rate": 3.242492487107151e-05, "loss": 0.1242, "num_input_tokens_seen": 122656752, "step": 56840 }, { "epoch": 9.273246329526916, "grad_norm": 1.491571068763733, "learning_rate": 3.2421526392036866e-05, "loss": 0.1447, "num_input_tokens_seen": 122668656, "step": 56845 }, { "epoch": 9.274061990212072, "grad_norm": 0.34979915618896484, "learning_rate": 3.241812776259557e-05, "loss": 0.0924, "num_input_tokens_seen": 122679728, "step": 56850 }, { "epoch": 9.274877650897226, "grad_norm": 1.236465573310852, "learning_rate": 3.24147289828165e-05, "loss": 0.1563, "num_input_tokens_seen": 122691536, "step": 56855 }, { "epoch": 9.275693311582382, "grad_norm": 0.17513945698738098, "learning_rate": 3.241133005276854e-05, "loss": 0.0862, "num_input_tokens_seen": 122701488, "step": 56860 }, { "epoch": 9.276508972267537, "grad_norm": 1.7705386877059937, "learning_rate": 3.240793097252058e-05, "loss": 0.0394, "num_input_tokens_seen": 122712784, "step": 56865 }, { "epoch": 9.277324632952691, "grad_norm": 0.12784357368946075, "learning_rate": 3.2404531742141506e-05, "loss": 0.1971, "num_input_tokens_seen": 122723184, "step": 56870 }, { "epoch": 9.278140293637847, "grad_norm": 0.6201958656311035, "learning_rate": 3.240113236170019e-05, "loss": 0.1236, "num_input_tokens_seen": 122733808, "step": 56875 }, { "epoch": 9.278955954323001, "grad_norm": 0.8606315851211548, "learning_rate": 3.239773283126555e-05, "loss": 0.0931, "num_input_tokens_seen": 122745552, "step": 56880 }, { "epoch": 9.279771615008157, "grad_norm": 0.5782193541526794, "learning_rate": 3.239433315090646e-05, "loss": 0.044, "num_input_tokens_seen": 122755632, "step": 56885 }, { "epoch": 9.280587275693312, "grad_norm": 0.6191297173500061, "learning_rate": 3.2390933320691836e-05, "loss": 0.2255, "num_input_tokens_seen": 122767696, "step": 56890 }, { "epoch": 9.281402936378466, "grad_norm": 1.2863959074020386, "learning_rate": 3.238753334069057e-05, "loss": 0.0727, "num_input_tokens_seen": 122779376, "step": 56895 }, { "epoch": 9.282218597063622, "grad_norm": 0.29855379462242126, "learning_rate": 3.238413321097157e-05, "loss": 0.0913, "num_input_tokens_seen": 122790832, "step": 56900 }, { "epoch": 9.283034257748776, "grad_norm": 0.19635169208049774, "learning_rate": 3.238073293160373e-05, "loss": 0.2309, "num_input_tokens_seen": 122801456, "step": 56905 }, { "epoch": 9.283849918433932, "grad_norm": 0.6647099256515503, "learning_rate": 3.237733250265599e-05, "loss": 0.0723, "num_input_tokens_seen": 122813520, "step": 56910 }, { "epoch": 9.284665579119087, "grad_norm": 0.7223971486091614, "learning_rate": 3.2373931924197246e-05, "loss": 0.0705, "num_input_tokens_seen": 122823344, "step": 56915 }, { "epoch": 9.285481239804241, "grad_norm": 0.2460835576057434, "learning_rate": 3.237053119629641e-05, "loss": 0.2116, "num_input_tokens_seen": 122834128, "step": 56920 }, { "epoch": 9.286296900489397, "grad_norm": 0.04429909214377403, "learning_rate": 3.2367130319022416e-05, "loss": 0.0197, "num_input_tokens_seen": 122845456, "step": 56925 }, { "epoch": 9.28711256117455, "grad_norm": 0.46134254336357117, "learning_rate": 3.236372929244418e-05, "loss": 0.0842, "num_input_tokens_seen": 122856272, "step": 56930 }, { "epoch": 9.287928221859707, "grad_norm": 0.018667912110686302, "learning_rate": 3.2360328116630615e-05, "loss": 0.1276, "num_input_tokens_seen": 122866544, "step": 56935 }, { "epoch": 9.28874388254486, "grad_norm": 0.6495105028152466, "learning_rate": 3.235692679165068e-05, "loss": 0.1184, "num_input_tokens_seen": 122876432, "step": 56940 }, { "epoch": 9.289559543230016, "grad_norm": 1.1300832033157349, "learning_rate": 3.235352531757328e-05, "loss": 0.0778, "num_input_tokens_seen": 122888432, "step": 56945 }, { "epoch": 9.290375203915172, "grad_norm": 0.8641207218170166, "learning_rate": 3.235012369446737e-05, "loss": 0.1811, "num_input_tokens_seen": 122900016, "step": 56950 }, { "epoch": 9.291190864600326, "grad_norm": 0.10296676307916641, "learning_rate": 3.234672192240187e-05, "loss": 0.1036, "num_input_tokens_seen": 122910096, "step": 56955 }, { "epoch": 9.292006525285482, "grad_norm": 0.07463021576404572, "learning_rate": 3.234332000144574e-05, "loss": 0.0186, "num_input_tokens_seen": 122920336, "step": 56960 }, { "epoch": 9.292822185970635, "grad_norm": 0.4546389579772949, "learning_rate": 3.23399179316679e-05, "loss": 0.221, "num_input_tokens_seen": 122931632, "step": 56965 }, { "epoch": 9.293637846655791, "grad_norm": 0.03805546090006828, "learning_rate": 3.2336515713137325e-05, "loss": 0.079, "num_input_tokens_seen": 122943728, "step": 56970 }, { "epoch": 9.294453507340947, "grad_norm": 0.6338427066802979, "learning_rate": 3.233311334592294e-05, "loss": 0.0623, "num_input_tokens_seen": 122953264, "step": 56975 }, { "epoch": 9.2952691680261, "grad_norm": 1.5611629486083984, "learning_rate": 3.232971083009372e-05, "loss": 0.1001, "num_input_tokens_seen": 122965296, "step": 56980 }, { "epoch": 9.296084828711257, "grad_norm": 2.4653968811035156, "learning_rate": 3.232630816571861e-05, "loss": 0.2516, "num_input_tokens_seen": 122974384, "step": 56985 }, { "epoch": 9.29690048939641, "grad_norm": 1.633493185043335, "learning_rate": 3.2322905352866576e-05, "loss": 0.1064, "num_input_tokens_seen": 122985296, "step": 56990 }, { "epoch": 9.297716150081566, "grad_norm": 0.2106853723526001, "learning_rate": 3.2319502391606574e-05, "loss": 0.0223, "num_input_tokens_seen": 122996272, "step": 56995 }, { "epoch": 9.298531810766722, "grad_norm": 0.14711709320545197, "learning_rate": 3.231609928200756e-05, "loss": 0.098, "num_input_tokens_seen": 123007440, "step": 57000 }, { "epoch": 9.299347471451876, "grad_norm": 1.6847093105316162, "learning_rate": 3.2312696024138524e-05, "loss": 0.3172, "num_input_tokens_seen": 123018096, "step": 57005 }, { "epoch": 9.300163132137031, "grad_norm": 1.2628881931304932, "learning_rate": 3.230929261806842e-05, "loss": 0.2625, "num_input_tokens_seen": 123029072, "step": 57010 }, { "epoch": 9.300978792822185, "grad_norm": 0.5893460512161255, "learning_rate": 3.230588906386623e-05, "loss": 0.1798, "num_input_tokens_seen": 123040816, "step": 57015 }, { "epoch": 9.301794453507341, "grad_norm": 0.04123875871300697, "learning_rate": 3.230248536160093e-05, "loss": 0.0516, "num_input_tokens_seen": 123051376, "step": 57020 }, { "epoch": 9.302610114192497, "grad_norm": 1.354828119277954, "learning_rate": 3.22990815113415e-05, "loss": 0.1154, "num_input_tokens_seen": 123061232, "step": 57025 }, { "epoch": 9.30342577487765, "grad_norm": 0.2507532238960266, "learning_rate": 3.229567751315693e-05, "loss": 0.2956, "num_input_tokens_seen": 123071632, "step": 57030 }, { "epoch": 9.304241435562806, "grad_norm": 1.270537257194519, "learning_rate": 3.229227336711619e-05, "loss": 0.1225, "num_input_tokens_seen": 123083600, "step": 57035 }, { "epoch": 9.30505709624796, "grad_norm": 0.17282640933990479, "learning_rate": 3.228886907328828e-05, "loss": 0.1402, "num_input_tokens_seen": 123093392, "step": 57040 }, { "epoch": 9.305872756933116, "grad_norm": 2.9195356369018555, "learning_rate": 3.22854646317422e-05, "loss": 0.0728, "num_input_tokens_seen": 123104784, "step": 57045 }, { "epoch": 9.30668841761827, "grad_norm": 0.9933068156242371, "learning_rate": 3.228206004254694e-05, "loss": 0.2372, "num_input_tokens_seen": 123115344, "step": 57050 }, { "epoch": 9.307504078303426, "grad_norm": 2.4801511764526367, "learning_rate": 3.2278655305771484e-05, "loss": 0.1483, "num_input_tokens_seen": 123125520, "step": 57055 }, { "epoch": 9.308319738988581, "grad_norm": 1.1944994926452637, "learning_rate": 3.227525042148485e-05, "loss": 0.0442, "num_input_tokens_seen": 123136016, "step": 57060 }, { "epoch": 9.309135399673735, "grad_norm": 0.5420856475830078, "learning_rate": 3.227184538975603e-05, "loss": 0.1657, "num_input_tokens_seen": 123147536, "step": 57065 }, { "epoch": 9.309951060358891, "grad_norm": 0.07282459735870361, "learning_rate": 3.226844021065404e-05, "loss": 0.0505, "num_input_tokens_seen": 123157904, "step": 57070 }, { "epoch": 9.310766721044045, "grad_norm": 0.12441812455654144, "learning_rate": 3.226503488424789e-05, "loss": 0.0516, "num_input_tokens_seen": 123168720, "step": 57075 }, { "epoch": 9.3115823817292, "grad_norm": 0.5382671356201172, "learning_rate": 3.2261629410606596e-05, "loss": 0.1006, "num_input_tokens_seen": 123180144, "step": 57080 }, { "epoch": 9.312398042414356, "grad_norm": 0.29242920875549316, "learning_rate": 3.225822378979917e-05, "loss": 0.0614, "num_input_tokens_seen": 123190896, "step": 57085 }, { "epoch": 9.31321370309951, "grad_norm": 0.17562374472618103, "learning_rate": 3.225481802189463e-05, "loss": 0.0567, "num_input_tokens_seen": 123201424, "step": 57090 }, { "epoch": 9.314029363784666, "grad_norm": 0.6000133156776428, "learning_rate": 3.2251412106962e-05, "loss": 0.1307, "num_input_tokens_seen": 123211536, "step": 57095 }, { "epoch": 9.31484502446982, "grad_norm": 0.04192288592457771, "learning_rate": 3.22480060450703e-05, "loss": 0.0761, "num_input_tokens_seen": 123222896, "step": 57100 }, { "epoch": 9.315660685154976, "grad_norm": 1.291662573814392, "learning_rate": 3.224459983628856e-05, "loss": 0.1221, "num_input_tokens_seen": 123235280, "step": 57105 }, { "epoch": 9.31647634584013, "grad_norm": 0.2749282717704773, "learning_rate": 3.224119348068582e-05, "loss": 0.1546, "num_input_tokens_seen": 123246416, "step": 57110 }, { "epoch": 9.317292006525285, "grad_norm": 0.8486137986183167, "learning_rate": 3.223778697833111e-05, "loss": 0.0458, "num_input_tokens_seen": 123257200, "step": 57115 }, { "epoch": 9.318107667210441, "grad_norm": 0.32587525248527527, "learning_rate": 3.223438032929346e-05, "loss": 0.2217, "num_input_tokens_seen": 123268816, "step": 57120 }, { "epoch": 9.318923327895595, "grad_norm": 0.4007803201675415, "learning_rate": 3.223097353364192e-05, "loss": 0.1417, "num_input_tokens_seen": 123279408, "step": 57125 }, { "epoch": 9.31973898858075, "grad_norm": 1.0838655233383179, "learning_rate": 3.222756659144552e-05, "loss": 0.1584, "num_input_tokens_seen": 123289232, "step": 57130 }, { "epoch": 9.320554649265905, "grad_norm": 0.9426287412643433, "learning_rate": 3.222415950277332e-05, "loss": 0.1547, "num_input_tokens_seen": 123299984, "step": 57135 }, { "epoch": 9.32137030995106, "grad_norm": 0.42092186212539673, "learning_rate": 3.2220752267694365e-05, "loss": 0.1466, "num_input_tokens_seen": 123310768, "step": 57140 }, { "epoch": 9.322185970636216, "grad_norm": 1.3599016666412354, "learning_rate": 3.22173448862777e-05, "loss": 0.0429, "num_input_tokens_seen": 123321232, "step": 57145 }, { "epoch": 9.32300163132137, "grad_norm": 0.07451926171779633, "learning_rate": 3.2213937358592396e-05, "loss": 0.0898, "num_input_tokens_seen": 123332432, "step": 57150 }, { "epoch": 9.323817292006526, "grad_norm": 0.2300940304994583, "learning_rate": 3.221052968470749e-05, "loss": 0.1075, "num_input_tokens_seen": 123344240, "step": 57155 }, { "epoch": 9.32463295269168, "grad_norm": 1.856168270111084, "learning_rate": 3.2207121864692056e-05, "loss": 0.2638, "num_input_tokens_seen": 123354672, "step": 57160 }, { "epoch": 9.325448613376835, "grad_norm": 0.05073513463139534, "learning_rate": 3.220371389861515e-05, "loss": 0.2174, "num_input_tokens_seen": 123365168, "step": 57165 }, { "epoch": 9.326264274061991, "grad_norm": 0.251430481672287, "learning_rate": 3.220030578654585e-05, "loss": 0.0502, "num_input_tokens_seen": 123375696, "step": 57170 }, { "epoch": 9.327079934747145, "grad_norm": 0.22524233162403107, "learning_rate": 3.21968975285532e-05, "loss": 0.2369, "num_input_tokens_seen": 123387344, "step": 57175 }, { "epoch": 9.3278955954323, "grad_norm": 0.2860410511493683, "learning_rate": 3.2193489124706315e-05, "loss": 0.0538, "num_input_tokens_seen": 123398992, "step": 57180 }, { "epoch": 9.328711256117455, "grad_norm": 1.107856035232544, "learning_rate": 3.219008057507424e-05, "loss": 0.2048, "num_input_tokens_seen": 123409168, "step": 57185 }, { "epoch": 9.32952691680261, "grad_norm": 1.4224152565002441, "learning_rate": 3.218667187972606e-05, "loss": 0.0664, "num_input_tokens_seen": 123421488, "step": 57190 }, { "epoch": 9.330342577487766, "grad_norm": 0.418652206659317, "learning_rate": 3.218326303873086e-05, "loss": 0.0683, "num_input_tokens_seen": 123432816, "step": 57195 }, { "epoch": 9.33115823817292, "grad_norm": 0.4308347702026367, "learning_rate": 3.217985405215772e-05, "loss": 0.0981, "num_input_tokens_seen": 123444048, "step": 57200 }, { "epoch": 9.331973898858076, "grad_norm": 0.04153786599636078, "learning_rate": 3.2176444920075735e-05, "loss": 0.123, "num_input_tokens_seen": 123454320, "step": 57205 }, { "epoch": 9.33278955954323, "grad_norm": 0.06132509931921959, "learning_rate": 3.217303564255398e-05, "loss": 0.0128, "num_input_tokens_seen": 123465424, "step": 57210 }, { "epoch": 9.333605220228385, "grad_norm": 0.3621059060096741, "learning_rate": 3.2169626219661565e-05, "loss": 0.092, "num_input_tokens_seen": 123475856, "step": 57215 }, { "epoch": 9.33442088091354, "grad_norm": 0.1647091805934906, "learning_rate": 3.2166216651467576e-05, "loss": 0.3552, "num_input_tokens_seen": 123484816, "step": 57220 }, { "epoch": 9.335236541598695, "grad_norm": 0.053915444761514664, "learning_rate": 3.2162806938041114e-05, "loss": 0.0987, "num_input_tokens_seen": 123495920, "step": 57225 }, { "epoch": 9.33605220228385, "grad_norm": 0.07460243254899979, "learning_rate": 3.215939707945128e-05, "loss": 0.2281, "num_input_tokens_seen": 123507344, "step": 57230 }, { "epoch": 9.336867862969005, "grad_norm": 0.5180610418319702, "learning_rate": 3.215598707576719e-05, "loss": 0.1239, "num_input_tokens_seen": 123517808, "step": 57235 }, { "epoch": 9.33768352365416, "grad_norm": 0.0836428552865982, "learning_rate": 3.2152576927057946e-05, "loss": 0.2634, "num_input_tokens_seen": 123529456, "step": 57240 }, { "epoch": 9.338499184339314, "grad_norm": 1.1800168752670288, "learning_rate": 3.2149166633392645e-05, "loss": 0.0762, "num_input_tokens_seen": 123538960, "step": 57245 }, { "epoch": 9.33931484502447, "grad_norm": 1.6273350715637207, "learning_rate": 3.214575619484041e-05, "loss": 0.2273, "num_input_tokens_seen": 123549200, "step": 57250 }, { "epoch": 9.340130505709626, "grad_norm": 1.0181171894073486, "learning_rate": 3.2142345611470376e-05, "loss": 0.1291, "num_input_tokens_seen": 123560176, "step": 57255 }, { "epoch": 9.34094616639478, "grad_norm": 0.09096119552850723, "learning_rate": 3.213893488335164e-05, "loss": 0.1606, "num_input_tokens_seen": 123571280, "step": 57260 }, { "epoch": 9.341761827079935, "grad_norm": 0.2345035970211029, "learning_rate": 3.2135524010553336e-05, "loss": 0.2323, "num_input_tokens_seen": 123581296, "step": 57265 }, { "epoch": 9.34257748776509, "grad_norm": 0.14810767769813538, "learning_rate": 3.213211299314458e-05, "loss": 0.1679, "num_input_tokens_seen": 123592016, "step": 57270 }, { "epoch": 9.343393148450245, "grad_norm": 0.06848577409982681, "learning_rate": 3.2128701831194514e-05, "loss": 0.0981, "num_input_tokens_seen": 123602992, "step": 57275 }, { "epoch": 9.3442088091354, "grad_norm": 0.7655584216117859, "learning_rate": 3.212529052477226e-05, "loss": 0.0796, "num_input_tokens_seen": 123614384, "step": 57280 }, { "epoch": 9.345024469820554, "grad_norm": 0.6856145262718201, "learning_rate": 3.212187907394694e-05, "loss": 0.0537, "num_input_tokens_seen": 123625744, "step": 57285 }, { "epoch": 9.34584013050571, "grad_norm": 2.416163444519043, "learning_rate": 3.211846747878772e-05, "loss": 0.2663, "num_input_tokens_seen": 123636816, "step": 57290 }, { "epoch": 9.346655791190864, "grad_norm": 1.4430413246154785, "learning_rate": 3.211505573936373e-05, "loss": 0.1807, "num_input_tokens_seen": 123648656, "step": 57295 }, { "epoch": 9.34747145187602, "grad_norm": 1.569131851196289, "learning_rate": 3.21116438557441e-05, "loss": 0.0444, "num_input_tokens_seen": 123658672, "step": 57300 }, { "epoch": 9.348287112561174, "grad_norm": 1.6472804546356201, "learning_rate": 3.2108231827997986e-05, "loss": 0.2162, "num_input_tokens_seen": 123669968, "step": 57305 }, { "epoch": 9.34910277324633, "grad_norm": 1.7652074098587036, "learning_rate": 3.210481965619454e-05, "loss": 0.0864, "num_input_tokens_seen": 123680880, "step": 57310 }, { "epoch": 9.349918433931485, "grad_norm": 1.828427791595459, "learning_rate": 3.2101407340402914e-05, "loss": 0.1507, "num_input_tokens_seen": 123691536, "step": 57315 }, { "epoch": 9.350734094616639, "grad_norm": 0.6469227075576782, "learning_rate": 3.2097994880692255e-05, "loss": 0.165, "num_input_tokens_seen": 123701904, "step": 57320 }, { "epoch": 9.351549755301795, "grad_norm": 0.07503308355808258, "learning_rate": 3.2094582277131725e-05, "loss": 0.0464, "num_input_tokens_seen": 123712528, "step": 57325 }, { "epoch": 9.352365415986949, "grad_norm": 0.9905797839164734, "learning_rate": 3.209116952979049e-05, "loss": 0.1168, "num_input_tokens_seen": 123723184, "step": 57330 }, { "epoch": 9.353181076672104, "grad_norm": 0.17982643842697144, "learning_rate": 3.20877566387377e-05, "loss": 0.0376, "num_input_tokens_seen": 123733968, "step": 57335 }, { "epoch": 9.35399673735726, "grad_norm": 0.35196980834007263, "learning_rate": 3.208434360404254e-05, "loss": 0.0578, "num_input_tokens_seen": 123743120, "step": 57340 }, { "epoch": 9.354812398042414, "grad_norm": 0.18246223032474518, "learning_rate": 3.2080930425774164e-05, "loss": 0.056, "num_input_tokens_seen": 123754992, "step": 57345 }, { "epoch": 9.35562805872757, "grad_norm": 0.051056619733572006, "learning_rate": 3.207751710400175e-05, "loss": 0.1359, "num_input_tokens_seen": 123765488, "step": 57350 }, { "epoch": 9.356443719412724, "grad_norm": 1.5001940727233887, "learning_rate": 3.207410363879447e-05, "loss": 0.1262, "num_input_tokens_seen": 123776944, "step": 57355 }, { "epoch": 9.35725938009788, "grad_norm": 0.8291954398155212, "learning_rate": 3.207069003022151e-05, "loss": 0.1591, "num_input_tokens_seen": 123789168, "step": 57360 }, { "epoch": 9.358075040783035, "grad_norm": 0.08621425181627274, "learning_rate": 3.2067276278352047e-05, "loss": 0.1657, "num_input_tokens_seen": 123799536, "step": 57365 }, { "epoch": 9.358890701468189, "grad_norm": 1.4762568473815918, "learning_rate": 3.206386238325527e-05, "loss": 0.1573, "num_input_tokens_seen": 123811536, "step": 57370 }, { "epoch": 9.359706362153345, "grad_norm": 1.7767634391784668, "learning_rate": 3.2060448345000355e-05, "loss": 0.1181, "num_input_tokens_seen": 123820848, "step": 57375 }, { "epoch": 9.360522022838499, "grad_norm": 0.26438385248184204, "learning_rate": 3.20570341636565e-05, "loss": 0.0747, "num_input_tokens_seen": 123832112, "step": 57380 }, { "epoch": 9.361337683523654, "grad_norm": 0.047203533351421356, "learning_rate": 3.205361983929289e-05, "loss": 0.1325, "num_input_tokens_seen": 123842544, "step": 57385 }, { "epoch": 9.362153344208808, "grad_norm": 1.1972616910934448, "learning_rate": 3.205020537197872e-05, "loss": 0.1845, "num_input_tokens_seen": 123852816, "step": 57390 }, { "epoch": 9.362969004893964, "grad_norm": 0.39785921573638916, "learning_rate": 3.204679076178321e-05, "loss": 0.2444, "num_input_tokens_seen": 123863312, "step": 57395 }, { "epoch": 9.36378466557912, "grad_norm": 0.13960939645767212, "learning_rate": 3.204337600877553e-05, "loss": 0.0453, "num_input_tokens_seen": 123873936, "step": 57400 }, { "epoch": 9.364600326264274, "grad_norm": 2.536147117614746, "learning_rate": 3.203996111302491e-05, "loss": 0.2007, "num_input_tokens_seen": 123885648, "step": 57405 }, { "epoch": 9.36541598694943, "grad_norm": 0.2324008345603943, "learning_rate": 3.2036546074600546e-05, "loss": 0.0893, "num_input_tokens_seen": 123896528, "step": 57410 }, { "epoch": 9.366231647634583, "grad_norm": 0.15534555912017822, "learning_rate": 3.203313089357165e-05, "loss": 0.0929, "num_input_tokens_seen": 123907728, "step": 57415 }, { "epoch": 9.367047308319739, "grad_norm": 0.17651373147964478, "learning_rate": 3.202971557000743e-05, "loss": 0.0347, "num_input_tokens_seen": 123918128, "step": 57420 }, { "epoch": 9.367862969004895, "grad_norm": 0.7410059571266174, "learning_rate": 3.202630010397711e-05, "loss": 0.037, "num_input_tokens_seen": 123929392, "step": 57425 }, { "epoch": 9.368678629690049, "grad_norm": 0.7354090213775635, "learning_rate": 3.20228844955499e-05, "loss": 0.1828, "num_input_tokens_seen": 123940688, "step": 57430 }, { "epoch": 9.369494290375204, "grad_norm": 0.05377647653222084, "learning_rate": 3.2019468744795034e-05, "loss": 0.0954, "num_input_tokens_seen": 123950704, "step": 57435 }, { "epoch": 9.370309951060358, "grad_norm": 0.6378769874572754, "learning_rate": 3.201605285178173e-05, "loss": 0.0755, "num_input_tokens_seen": 123961552, "step": 57440 }, { "epoch": 9.371125611745514, "grad_norm": 0.05233173072338104, "learning_rate": 3.201263681657921e-05, "loss": 0.0664, "num_input_tokens_seen": 123971408, "step": 57445 }, { "epoch": 9.37194127243067, "grad_norm": 0.18934661149978638, "learning_rate": 3.200922063925671e-05, "loss": 0.0443, "num_input_tokens_seen": 123981264, "step": 57450 }, { "epoch": 9.372756933115824, "grad_norm": 0.394631564617157, "learning_rate": 3.200580431988346e-05, "loss": 0.1712, "num_input_tokens_seen": 123992944, "step": 57455 }, { "epoch": 9.37357259380098, "grad_norm": 0.3746657967567444, "learning_rate": 3.20023878585287e-05, "loss": 0.1153, "num_input_tokens_seen": 124004464, "step": 57460 }, { "epoch": 9.374388254486133, "grad_norm": 1.0905053615570068, "learning_rate": 3.199897125526167e-05, "loss": 0.1265, "num_input_tokens_seen": 124017072, "step": 57465 }, { "epoch": 9.375203915171289, "grad_norm": 0.13233424723148346, "learning_rate": 3.1995554510151596e-05, "loss": 0.1375, "num_input_tokens_seen": 124027792, "step": 57470 }, { "epoch": 9.376019575856443, "grad_norm": 1.5768909454345703, "learning_rate": 3.199213762326775e-05, "loss": 0.1833, "num_input_tokens_seen": 124039440, "step": 57475 }, { "epoch": 9.376835236541599, "grad_norm": 1.0457690954208374, "learning_rate": 3.198872059467936e-05, "loss": 0.1297, "num_input_tokens_seen": 124050608, "step": 57480 }, { "epoch": 9.377650897226754, "grad_norm": 0.26272615790367126, "learning_rate": 3.198530342445568e-05, "loss": 0.0809, "num_input_tokens_seen": 124061296, "step": 57485 }, { "epoch": 9.378466557911908, "grad_norm": 0.684099018573761, "learning_rate": 3.198188611266596e-05, "loss": 0.2418, "num_input_tokens_seen": 124071920, "step": 57490 }, { "epoch": 9.379282218597064, "grad_norm": 0.43276578187942505, "learning_rate": 3.1978468659379464e-05, "loss": 0.1322, "num_input_tokens_seen": 124082608, "step": 57495 }, { "epoch": 9.380097879282218, "grad_norm": 0.5100680589675903, "learning_rate": 3.1975051064665454e-05, "loss": 0.1816, "num_input_tokens_seen": 124094416, "step": 57500 }, { "epoch": 9.380913539967374, "grad_norm": 0.9505792260169983, "learning_rate": 3.197163332859318e-05, "loss": 0.0767, "num_input_tokens_seen": 124105168, "step": 57505 }, { "epoch": 9.38172920065253, "grad_norm": 0.033184684813022614, "learning_rate": 3.196821545123191e-05, "loss": 0.0938, "num_input_tokens_seen": 124114992, "step": 57510 }, { "epoch": 9.382544861337683, "grad_norm": 1.3761247396469116, "learning_rate": 3.196479743265092e-05, "loss": 0.0459, "num_input_tokens_seen": 124125936, "step": 57515 }, { "epoch": 9.383360522022839, "grad_norm": 0.15403734147548676, "learning_rate": 3.196137927291947e-05, "loss": 0.1248, "num_input_tokens_seen": 124137616, "step": 57520 }, { "epoch": 9.384176182707993, "grad_norm": 1.6110366582870483, "learning_rate": 3.195796097210684e-05, "loss": 0.1161, "num_input_tokens_seen": 124147856, "step": 57525 }, { "epoch": 9.384991843393149, "grad_norm": 0.8534193634986877, "learning_rate": 3.19545425302823e-05, "loss": 0.0999, "num_input_tokens_seen": 124159824, "step": 57530 }, { "epoch": 9.385807504078304, "grad_norm": 0.1406291127204895, "learning_rate": 3.195112394751514e-05, "loss": 0.0267, "num_input_tokens_seen": 124169168, "step": 57535 }, { "epoch": 9.386623164763458, "grad_norm": 0.6477407217025757, "learning_rate": 3.1947705223874636e-05, "loss": 0.0654, "num_input_tokens_seen": 124179952, "step": 57540 }, { "epoch": 9.387438825448614, "grad_norm": 0.755301833152771, "learning_rate": 3.194428635943006e-05, "loss": 0.1716, "num_input_tokens_seen": 124189648, "step": 57545 }, { "epoch": 9.388254486133768, "grad_norm": 0.282600075006485, "learning_rate": 3.194086735425073e-05, "loss": 0.0985, "num_input_tokens_seen": 124200336, "step": 57550 }, { "epoch": 9.389070146818923, "grad_norm": 1.4142062664031982, "learning_rate": 3.1937448208405906e-05, "loss": 0.2097, "num_input_tokens_seen": 124209264, "step": 57555 }, { "epoch": 9.38988580750408, "grad_norm": 0.03292163833975792, "learning_rate": 3.193402892196489e-05, "loss": 0.0249, "num_input_tokens_seen": 124219184, "step": 57560 }, { "epoch": 9.390701468189233, "grad_norm": 0.14939233660697937, "learning_rate": 3.193060949499699e-05, "loss": 0.0333, "num_input_tokens_seen": 124230800, "step": 57565 }, { "epoch": 9.391517128874389, "grad_norm": 0.11001837998628616, "learning_rate": 3.192718992757149e-05, "loss": 0.0239, "num_input_tokens_seen": 124242224, "step": 57570 }, { "epoch": 9.392332789559543, "grad_norm": 1.678247332572937, "learning_rate": 3.1923770219757696e-05, "loss": 0.1281, "num_input_tokens_seen": 124252592, "step": 57575 }, { "epoch": 9.393148450244698, "grad_norm": 0.44785937666893005, "learning_rate": 3.192035037162492e-05, "loss": 0.0733, "num_input_tokens_seen": 124263760, "step": 57580 }, { "epoch": 9.393964110929852, "grad_norm": 0.5850504040718079, "learning_rate": 3.191693038324247e-05, "loss": 0.0338, "num_input_tokens_seen": 124274704, "step": 57585 }, { "epoch": 9.394779771615008, "grad_norm": 0.1885998398065567, "learning_rate": 3.1913510254679645e-05, "loss": 0.0257, "num_input_tokens_seen": 124284784, "step": 57590 }, { "epoch": 9.395595432300164, "grad_norm": 0.36006706953048706, "learning_rate": 3.1910089986005766e-05, "loss": 0.0687, "num_input_tokens_seen": 124294512, "step": 57595 }, { "epoch": 9.396411092985318, "grad_norm": 0.38179638981819153, "learning_rate": 3.190666957729015e-05, "loss": 0.0681, "num_input_tokens_seen": 124306096, "step": 57600 }, { "epoch": 9.397226753670473, "grad_norm": 1.3171473741531372, "learning_rate": 3.19032490286021e-05, "loss": 0.148, "num_input_tokens_seen": 124317104, "step": 57605 }, { "epoch": 9.398042414355627, "grad_norm": 1.5455718040466309, "learning_rate": 3.1899828340010964e-05, "loss": 0.1533, "num_input_tokens_seen": 124327792, "step": 57610 }, { "epoch": 9.398858075040783, "grad_norm": 0.06382492184638977, "learning_rate": 3.189640751158605e-05, "loss": 0.1277, "num_input_tokens_seen": 124340112, "step": 57615 }, { "epoch": 9.399673735725939, "grad_norm": 1.5444148778915405, "learning_rate": 3.189298654339669e-05, "loss": 0.1836, "num_input_tokens_seen": 124351184, "step": 57620 }, { "epoch": 9.400489396411093, "grad_norm": 0.06486373394727707, "learning_rate": 3.188956543551221e-05, "loss": 0.1269, "num_input_tokens_seen": 124361456, "step": 57625 }, { "epoch": 9.401305057096248, "grad_norm": 1.9717812538146973, "learning_rate": 3.188614418800195e-05, "loss": 0.1483, "num_input_tokens_seen": 124372848, "step": 57630 }, { "epoch": 9.402120717781402, "grad_norm": 1.0898768901824951, "learning_rate": 3.1882722800935245e-05, "loss": 0.0873, "num_input_tokens_seen": 124383664, "step": 57635 }, { "epoch": 9.402936378466558, "grad_norm": 0.29796329140663147, "learning_rate": 3.187930127438143e-05, "loss": 0.1138, "num_input_tokens_seen": 124394160, "step": 57640 }, { "epoch": 9.403752039151712, "grad_norm": 0.16951005160808563, "learning_rate": 3.187587960840984e-05, "loss": 0.1029, "num_input_tokens_seen": 124405584, "step": 57645 }, { "epoch": 9.404567699836868, "grad_norm": 1.046751856803894, "learning_rate": 3.187245780308984e-05, "loss": 0.0589, "num_input_tokens_seen": 124416560, "step": 57650 }, { "epoch": 9.405383360522023, "grad_norm": 0.19135750830173492, "learning_rate": 3.1869035858490754e-05, "loss": 0.0419, "num_input_tokens_seen": 124426864, "step": 57655 }, { "epoch": 9.406199021207177, "grad_norm": 1.1077619791030884, "learning_rate": 3.1865613774681945e-05, "loss": 0.0718, "num_input_tokens_seen": 124438384, "step": 57660 }, { "epoch": 9.407014681892333, "grad_norm": 1.0209766626358032, "learning_rate": 3.186219155173276e-05, "loss": 0.1002, "num_input_tokens_seen": 124449936, "step": 57665 }, { "epoch": 9.407830342577487, "grad_norm": 0.04259273782372475, "learning_rate": 3.185876918971257e-05, "loss": 0.2105, "num_input_tokens_seen": 124461040, "step": 57670 }, { "epoch": 9.408646003262643, "grad_norm": 1.9181625843048096, "learning_rate": 3.185534668869071e-05, "loss": 0.1714, "num_input_tokens_seen": 124472144, "step": 57675 }, { "epoch": 9.409461663947798, "grad_norm": 0.17454387247562408, "learning_rate": 3.185192404873655e-05, "loss": 0.0821, "num_input_tokens_seen": 124483792, "step": 57680 }, { "epoch": 9.410277324632952, "grad_norm": 0.16981889307498932, "learning_rate": 3.184850126991947e-05, "loss": 0.2267, "num_input_tokens_seen": 124495152, "step": 57685 }, { "epoch": 9.411092985318108, "grad_norm": 1.3218698501586914, "learning_rate": 3.184507835230881e-05, "loss": 0.0734, "num_input_tokens_seen": 124505360, "step": 57690 }, { "epoch": 9.411908646003262, "grad_norm": 1.2657114267349243, "learning_rate": 3.184165529597396e-05, "loss": 0.2585, "num_input_tokens_seen": 124515184, "step": 57695 }, { "epoch": 9.412724306688418, "grad_norm": 0.32783040404319763, "learning_rate": 3.183823210098429e-05, "loss": 0.1682, "num_input_tokens_seen": 124526416, "step": 57700 }, { "epoch": 9.413539967373573, "grad_norm": 1.120460033416748, "learning_rate": 3.1834808767409164e-05, "loss": 0.1745, "num_input_tokens_seen": 124537648, "step": 57705 }, { "epoch": 9.414355628058727, "grad_norm": 0.08342591673135757, "learning_rate": 3.183138529531797e-05, "loss": 0.2228, "num_input_tokens_seen": 124547920, "step": 57710 }, { "epoch": 9.415171288743883, "grad_norm": 1.0390325784683228, "learning_rate": 3.182796168478008e-05, "loss": 0.0614, "num_input_tokens_seen": 124556624, "step": 57715 }, { "epoch": 9.415986949429037, "grad_norm": 1.3775887489318848, "learning_rate": 3.18245379358649e-05, "loss": 0.2302, "num_input_tokens_seen": 124566448, "step": 57720 }, { "epoch": 9.416802610114193, "grad_norm": 0.11363940685987473, "learning_rate": 3.182111404864179e-05, "loss": 0.0776, "num_input_tokens_seen": 124576624, "step": 57725 }, { "epoch": 9.417618270799348, "grad_norm": 0.20457936823368073, "learning_rate": 3.181769002318016e-05, "loss": 0.1084, "num_input_tokens_seen": 124587184, "step": 57730 }, { "epoch": 9.418433931484502, "grad_norm": 1.4719294309616089, "learning_rate": 3.181426585954938e-05, "loss": 0.1092, "num_input_tokens_seen": 124598448, "step": 57735 }, { "epoch": 9.419249592169658, "grad_norm": 0.4384433329105377, "learning_rate": 3.181084155781886e-05, "loss": 0.1759, "num_input_tokens_seen": 124610576, "step": 57740 }, { "epoch": 9.420065252854812, "grad_norm": 0.28177663683891296, "learning_rate": 3.1807417118058005e-05, "loss": 0.0715, "num_input_tokens_seen": 124620656, "step": 57745 }, { "epoch": 9.420880913539968, "grad_norm": 0.12457732856273651, "learning_rate": 3.18039925403362e-05, "loss": 0.0705, "num_input_tokens_seen": 124631344, "step": 57750 }, { "epoch": 9.421696574225122, "grad_norm": 0.40392962098121643, "learning_rate": 3.180056782472286e-05, "loss": 0.0211, "num_input_tokens_seen": 124641872, "step": 57755 }, { "epoch": 9.422512234910277, "grad_norm": 0.09968820214271545, "learning_rate": 3.179714297128738e-05, "loss": 0.1636, "num_input_tokens_seen": 124652464, "step": 57760 }, { "epoch": 9.423327895595433, "grad_norm": 0.06600308418273926, "learning_rate": 3.179371798009918e-05, "loss": 0.043, "num_input_tokens_seen": 124662896, "step": 57765 }, { "epoch": 9.424143556280587, "grad_norm": 0.143720343708992, "learning_rate": 3.179029285122766e-05, "loss": 0.0569, "num_input_tokens_seen": 124674480, "step": 57770 }, { "epoch": 9.424959216965743, "grad_norm": 0.34496158361434937, "learning_rate": 3.178686758474225e-05, "loss": 0.1233, "num_input_tokens_seen": 124684240, "step": 57775 }, { "epoch": 9.425774877650896, "grad_norm": 1.2418266534805298, "learning_rate": 3.178344218071235e-05, "loss": 0.0407, "num_input_tokens_seen": 124696336, "step": 57780 }, { "epoch": 9.426590538336052, "grad_norm": 0.80563884973526, "learning_rate": 3.17800166392074e-05, "loss": 0.2047, "num_input_tokens_seen": 124708432, "step": 57785 }, { "epoch": 9.427406199021208, "grad_norm": 0.5960645079612732, "learning_rate": 3.1776590960296806e-05, "loss": 0.0478, "num_input_tokens_seen": 124721072, "step": 57790 }, { "epoch": 9.428221859706362, "grad_norm": 0.23106831312179565, "learning_rate": 3.1773165144049994e-05, "loss": 0.1206, "num_input_tokens_seen": 124731696, "step": 57795 }, { "epoch": 9.429037520391518, "grad_norm": 0.18968619406223297, "learning_rate": 3.1769739190536404e-05, "loss": 0.192, "num_input_tokens_seen": 124742416, "step": 57800 }, { "epoch": 9.429853181076671, "grad_norm": 0.5960783958435059, "learning_rate": 3.176631309982546e-05, "loss": 0.0919, "num_input_tokens_seen": 124752784, "step": 57805 }, { "epoch": 9.430668841761827, "grad_norm": 1.5576863288879395, "learning_rate": 3.1762886871986594e-05, "loss": 0.2524, "num_input_tokens_seen": 124763248, "step": 57810 }, { "epoch": 9.431484502446983, "grad_norm": 0.15953096747398376, "learning_rate": 3.175946050708925e-05, "loss": 0.2956, "num_input_tokens_seen": 124774576, "step": 57815 }, { "epoch": 9.432300163132137, "grad_norm": 1.6398661136627197, "learning_rate": 3.1756034005202854e-05, "loss": 0.1332, "num_input_tokens_seen": 124784368, "step": 57820 }, { "epoch": 9.433115823817293, "grad_norm": 0.04161077365279198, "learning_rate": 3.175260736639687e-05, "loss": 0.1307, "num_input_tokens_seen": 124793904, "step": 57825 }, { "epoch": 9.433931484502446, "grad_norm": 0.6658881902694702, "learning_rate": 3.174918059074073e-05, "loss": 0.0275, "num_input_tokens_seen": 124804848, "step": 57830 }, { "epoch": 9.434747145187602, "grad_norm": 1.2117184400558472, "learning_rate": 3.174575367830388e-05, "loss": 0.1561, "num_input_tokens_seen": 124817840, "step": 57835 }, { "epoch": 9.435562805872756, "grad_norm": 0.45210957527160645, "learning_rate": 3.174232662915578e-05, "loss": 0.058, "num_input_tokens_seen": 124828080, "step": 57840 }, { "epoch": 9.436378466557912, "grad_norm": 0.25115764141082764, "learning_rate": 3.173889944336587e-05, "loss": 0.0317, "num_input_tokens_seen": 124839632, "step": 57845 }, { "epoch": 9.437194127243067, "grad_norm": 0.7889304757118225, "learning_rate": 3.1735472121003615e-05, "loss": 0.1015, "num_input_tokens_seen": 124852176, "step": 57850 }, { "epoch": 9.438009787928221, "grad_norm": 0.7714184522628784, "learning_rate": 3.173204466213848e-05, "loss": 0.1311, "num_input_tokens_seen": 124862768, "step": 57855 }, { "epoch": 9.438825448613377, "grad_norm": 0.8196341395378113, "learning_rate": 3.172861706683992e-05, "loss": 0.1417, "num_input_tokens_seen": 124873936, "step": 57860 }, { "epoch": 9.439641109298531, "grad_norm": 0.20708827674388885, "learning_rate": 3.172518933517739e-05, "loss": 0.1217, "num_input_tokens_seen": 124884912, "step": 57865 }, { "epoch": 9.440456769983687, "grad_norm": 0.16513708233833313, "learning_rate": 3.172176146722037e-05, "loss": 0.0221, "num_input_tokens_seen": 124895632, "step": 57870 }, { "epoch": 9.441272430668842, "grad_norm": 0.44072070717811584, "learning_rate": 3.171833346303833e-05, "loss": 0.0265, "num_input_tokens_seen": 124906640, "step": 57875 }, { "epoch": 9.442088091353996, "grad_norm": 0.05764523893594742, "learning_rate": 3.1714905322700726e-05, "loss": 0.0243, "num_input_tokens_seen": 124917392, "step": 57880 }, { "epoch": 9.442903752039152, "grad_norm": 0.2586611807346344, "learning_rate": 3.171147704627706e-05, "loss": 0.0664, "num_input_tokens_seen": 124927408, "step": 57885 }, { "epoch": 9.443719412724306, "grad_norm": 0.3330066502094269, "learning_rate": 3.17080486338368e-05, "loss": 0.1021, "num_input_tokens_seen": 124938160, "step": 57890 }, { "epoch": 9.444535073409462, "grad_norm": 0.053035322576761246, "learning_rate": 3.170462008544942e-05, "loss": 0.0554, "num_input_tokens_seen": 124947376, "step": 57895 }, { "epoch": 9.445350734094617, "grad_norm": 0.16929389536380768, "learning_rate": 3.170119140118441e-05, "loss": 0.1128, "num_input_tokens_seen": 124958576, "step": 57900 }, { "epoch": 9.446166394779771, "grad_norm": 0.17914645373821259, "learning_rate": 3.169776258111126e-05, "loss": 0.2637, "num_input_tokens_seen": 124968944, "step": 57905 }, { "epoch": 9.446982055464927, "grad_norm": 2.087057590484619, "learning_rate": 3.169433362529944e-05, "loss": 0.0989, "num_input_tokens_seen": 124980496, "step": 57910 }, { "epoch": 9.447797716150081, "grad_norm": 3.369265079498291, "learning_rate": 3.169090453381847e-05, "loss": 0.2176, "num_input_tokens_seen": 124990256, "step": 57915 }, { "epoch": 9.448613376835237, "grad_norm": 0.05702650919556618, "learning_rate": 3.1687475306737826e-05, "loss": 0.112, "num_input_tokens_seen": 125001840, "step": 57920 }, { "epoch": 9.449429037520392, "grad_norm": 0.035702940076589584, "learning_rate": 3.1684045944127006e-05, "loss": 0.1088, "num_input_tokens_seen": 125012272, "step": 57925 }, { "epoch": 9.450244698205546, "grad_norm": 1.4787081480026245, "learning_rate": 3.168061644605552e-05, "loss": 0.1567, "num_input_tokens_seen": 125023024, "step": 57930 }, { "epoch": 9.451060358890702, "grad_norm": 0.8216298818588257, "learning_rate": 3.1677186812592876e-05, "loss": 0.0969, "num_input_tokens_seen": 125035184, "step": 57935 }, { "epoch": 9.451876019575856, "grad_norm": 0.6328655481338501, "learning_rate": 3.167375704380856e-05, "loss": 0.0353, "num_input_tokens_seen": 125045424, "step": 57940 }, { "epoch": 9.452691680261012, "grad_norm": 1.817566156387329, "learning_rate": 3.167032713977209e-05, "loss": 0.2663, "num_input_tokens_seen": 125055504, "step": 57945 }, { "epoch": 9.453507340946166, "grad_norm": 0.1433526873588562, "learning_rate": 3.166689710055299e-05, "loss": 0.1482, "num_input_tokens_seen": 125065744, "step": 57950 }, { "epoch": 9.454323001631321, "grad_norm": 1.3249770402908325, "learning_rate": 3.1663466926220756e-05, "loss": 0.1945, "num_input_tokens_seen": 125076720, "step": 57955 }, { "epoch": 9.455138662316477, "grad_norm": 0.596173882484436, "learning_rate": 3.166003661684491e-05, "loss": 0.2942, "num_input_tokens_seen": 125087504, "step": 57960 }, { "epoch": 9.455954323001631, "grad_norm": 0.8802763223648071, "learning_rate": 3.1656606172494974e-05, "loss": 0.0536, "num_input_tokens_seen": 125099408, "step": 57965 }, { "epoch": 9.456769983686787, "grad_norm": 1.181046724319458, "learning_rate": 3.165317559324047e-05, "loss": 0.1464, "num_input_tokens_seen": 125109904, "step": 57970 }, { "epoch": 9.45758564437194, "grad_norm": 0.21545550227165222, "learning_rate": 3.164974487915092e-05, "loss": 0.0484, "num_input_tokens_seen": 125120144, "step": 57975 }, { "epoch": 9.458401305057096, "grad_norm": 1.172001838684082, "learning_rate": 3.164631403029586e-05, "loss": 0.1839, "num_input_tokens_seen": 125131056, "step": 57980 }, { "epoch": 9.459216965742252, "grad_norm": 0.6798404455184937, "learning_rate": 3.1642883046744816e-05, "loss": 0.138, "num_input_tokens_seen": 125141072, "step": 57985 }, { "epoch": 9.460032626427406, "grad_norm": 0.053156450390815735, "learning_rate": 3.1639451928567316e-05, "loss": 0.0488, "num_input_tokens_seen": 125152656, "step": 57990 }, { "epoch": 9.460848287112562, "grad_norm": 0.11347946524620056, "learning_rate": 3.16360206758329e-05, "loss": 0.1003, "num_input_tokens_seen": 125164880, "step": 57995 }, { "epoch": 9.461663947797716, "grad_norm": 0.22087685763835907, "learning_rate": 3.1632589288611104e-05, "loss": 0.0542, "num_input_tokens_seen": 125175792, "step": 58000 }, { "epoch": 9.462479608482871, "grad_norm": 0.7761229276657104, "learning_rate": 3.1629157766971475e-05, "loss": 0.0941, "num_input_tokens_seen": 125187152, "step": 58005 }, { "epoch": 9.463295269168025, "grad_norm": 0.12991470098495483, "learning_rate": 3.162572611098355e-05, "loss": 0.0292, "num_input_tokens_seen": 125197072, "step": 58010 }, { "epoch": 9.464110929853181, "grad_norm": 1.6661555767059326, "learning_rate": 3.162229432071688e-05, "loss": 0.12, "num_input_tokens_seen": 125208144, "step": 58015 }, { "epoch": 9.464926590538337, "grad_norm": 0.6290974020957947, "learning_rate": 3.1618862396241016e-05, "loss": 0.0884, "num_input_tokens_seen": 125218448, "step": 58020 }, { "epoch": 9.46574225122349, "grad_norm": 1.4987034797668457, "learning_rate": 3.1615430337625514e-05, "loss": 0.2197, "num_input_tokens_seen": 125229168, "step": 58025 }, { "epoch": 9.466557911908646, "grad_norm": 0.693274199962616, "learning_rate": 3.161199814493992e-05, "loss": 0.1993, "num_input_tokens_seen": 125240496, "step": 58030 }, { "epoch": 9.4673735725938, "grad_norm": 0.4821571409702301, "learning_rate": 3.16085658182538e-05, "loss": 0.1028, "num_input_tokens_seen": 125251408, "step": 58035 }, { "epoch": 9.468189233278956, "grad_norm": 0.09512275457382202, "learning_rate": 3.16051333576367e-05, "loss": 0.1113, "num_input_tokens_seen": 125262128, "step": 58040 }, { "epoch": 9.469004893964112, "grad_norm": 0.4442083239555359, "learning_rate": 3.16017007631582e-05, "loss": 0.0322, "num_input_tokens_seen": 125272816, "step": 58045 }, { "epoch": 9.469820554649266, "grad_norm": 0.46986252069473267, "learning_rate": 3.1598268034887855e-05, "loss": 0.1342, "num_input_tokens_seen": 125283792, "step": 58050 }, { "epoch": 9.470636215334421, "grad_norm": 2.0773417949676514, "learning_rate": 3.159483517289524e-05, "loss": 0.1709, "num_input_tokens_seen": 125294960, "step": 58055 }, { "epoch": 9.471451876019575, "grad_norm": 0.03484518826007843, "learning_rate": 3.159140217724993e-05, "loss": 0.0403, "num_input_tokens_seen": 125304816, "step": 58060 }, { "epoch": 9.47226753670473, "grad_norm": 0.18281644582748413, "learning_rate": 3.1587969048021494e-05, "loss": 0.0667, "num_input_tokens_seen": 125314448, "step": 58065 }, { "epoch": 9.473083197389887, "grad_norm": 0.24457751214504242, "learning_rate": 3.15845357852795e-05, "loss": 0.0245, "num_input_tokens_seen": 125325584, "step": 58070 }, { "epoch": 9.47389885807504, "grad_norm": 0.407062828540802, "learning_rate": 3.1581102389093536e-05, "loss": 0.0212, "num_input_tokens_seen": 125337328, "step": 58075 }, { "epoch": 9.474714518760196, "grad_norm": 1.223972201347351, "learning_rate": 3.157766885953319e-05, "loss": 0.0735, "num_input_tokens_seen": 125348336, "step": 58080 }, { "epoch": 9.47553017944535, "grad_norm": 0.3314876854419708, "learning_rate": 3.1574235196668036e-05, "loss": 0.1619, "num_input_tokens_seen": 125359152, "step": 58085 }, { "epoch": 9.476345840130506, "grad_norm": 0.07102689892053604, "learning_rate": 3.157080140056766e-05, "loss": 0.0292, "num_input_tokens_seen": 125371792, "step": 58090 }, { "epoch": 9.477161500815662, "grad_norm": 2.2860219478607178, "learning_rate": 3.156736747130167e-05, "loss": 0.1484, "num_input_tokens_seen": 125382960, "step": 58095 }, { "epoch": 9.477977161500815, "grad_norm": 1.845947265625, "learning_rate": 3.156393340893964e-05, "loss": 0.3633, "num_input_tokens_seen": 125392752, "step": 58100 }, { "epoch": 9.478792822185971, "grad_norm": 0.32863834500312805, "learning_rate": 3.156049921355117e-05, "loss": 0.1397, "num_input_tokens_seen": 125404080, "step": 58105 }, { "epoch": 9.479608482871125, "grad_norm": 1.35430908203125, "learning_rate": 3.155706488520586e-05, "loss": 0.1476, "num_input_tokens_seen": 125415568, "step": 58110 }, { "epoch": 9.48042414355628, "grad_norm": 1.5116186141967773, "learning_rate": 3.155363042397332e-05, "loss": 0.0763, "num_input_tokens_seen": 125426832, "step": 58115 }, { "epoch": 9.481239804241435, "grad_norm": 1.7517775297164917, "learning_rate": 3.155019582992314e-05, "loss": 0.1819, "num_input_tokens_seen": 125436656, "step": 58120 }, { "epoch": 9.48205546492659, "grad_norm": 0.7062222361564636, "learning_rate": 3.1546761103124934e-05, "loss": 0.0786, "num_input_tokens_seen": 125447440, "step": 58125 }, { "epoch": 9.482871125611746, "grad_norm": 1.5646147727966309, "learning_rate": 3.154332624364831e-05, "loss": 0.1114, "num_input_tokens_seen": 125458928, "step": 58130 }, { "epoch": 9.4836867862969, "grad_norm": 0.09696425497531891, "learning_rate": 3.153989125156287e-05, "loss": 0.0848, "num_input_tokens_seen": 125469520, "step": 58135 }, { "epoch": 9.484502446982056, "grad_norm": 0.10735008865594864, "learning_rate": 3.153645612693824e-05, "loss": 0.0272, "num_input_tokens_seen": 125480016, "step": 58140 }, { "epoch": 9.48531810766721, "grad_norm": 0.1369839310646057, "learning_rate": 3.1533020869844045e-05, "loss": 0.0209, "num_input_tokens_seen": 125490608, "step": 58145 }, { "epoch": 9.486133768352365, "grad_norm": 0.32702189683914185, "learning_rate": 3.152958548034989e-05, "loss": 0.1133, "num_input_tokens_seen": 125501392, "step": 58150 }, { "epoch": 9.486949429037521, "grad_norm": 0.15861336886882782, "learning_rate": 3.15261499585254e-05, "loss": 0.0794, "num_input_tokens_seen": 125510448, "step": 58155 }, { "epoch": 9.487765089722675, "grad_norm": 0.5592633485794067, "learning_rate": 3.1522714304440196e-05, "loss": 0.1913, "num_input_tokens_seen": 125521968, "step": 58160 }, { "epoch": 9.48858075040783, "grad_norm": 0.14775915443897247, "learning_rate": 3.1519278518163916e-05, "loss": 0.1784, "num_input_tokens_seen": 125533360, "step": 58165 }, { "epoch": 9.489396411092985, "grad_norm": 0.2750858664512634, "learning_rate": 3.151584259976619e-05, "loss": 0.0251, "num_input_tokens_seen": 125544400, "step": 58170 }, { "epoch": 9.49021207177814, "grad_norm": 0.49943608045578003, "learning_rate": 3.1512406549316645e-05, "loss": 0.037, "num_input_tokens_seen": 125555568, "step": 58175 }, { "epoch": 9.491027732463296, "grad_norm": 0.31128454208374023, "learning_rate": 3.1508970366884925e-05, "loss": 0.1704, "num_input_tokens_seen": 125566224, "step": 58180 }, { "epoch": 9.49184339314845, "grad_norm": 0.12218741327524185, "learning_rate": 3.150553405254066e-05, "loss": 0.1173, "num_input_tokens_seen": 125576976, "step": 58185 }, { "epoch": 9.492659053833606, "grad_norm": 1.2741634845733643, "learning_rate": 3.1502097606353485e-05, "loss": 0.0815, "num_input_tokens_seen": 125587856, "step": 58190 }, { "epoch": 9.49347471451876, "grad_norm": 2.323277711868286, "learning_rate": 3.149866102839307e-05, "loss": 0.1846, "num_input_tokens_seen": 125598832, "step": 58195 }, { "epoch": 9.494290375203915, "grad_norm": 1.0562834739685059, "learning_rate": 3.1495224318729034e-05, "loss": 0.1028, "num_input_tokens_seen": 125610832, "step": 58200 }, { "epoch": 9.49510603588907, "grad_norm": 1.109941840171814, "learning_rate": 3.149178747743104e-05, "loss": 0.0687, "num_input_tokens_seen": 125621104, "step": 58205 }, { "epoch": 9.495921696574225, "grad_norm": 0.04417581856250763, "learning_rate": 3.148835050456874e-05, "loss": 0.0858, "num_input_tokens_seen": 125632752, "step": 58210 }, { "epoch": 9.49673735725938, "grad_norm": 0.20134620368480682, "learning_rate": 3.148491340021179e-05, "loss": 0.0199, "num_input_tokens_seen": 125642832, "step": 58215 }, { "epoch": 9.497553017944535, "grad_norm": 0.06739895045757294, "learning_rate": 3.148147616442984e-05, "loss": 0.0499, "num_input_tokens_seen": 125653840, "step": 58220 }, { "epoch": 9.49836867862969, "grad_norm": 1.6250969171524048, "learning_rate": 3.147803879729254e-05, "loss": 0.1203, "num_input_tokens_seen": 125665104, "step": 58225 }, { "epoch": 9.499184339314844, "grad_norm": 0.19041378796100616, "learning_rate": 3.147460129886958e-05, "loss": 0.0173, "num_input_tokens_seen": 125675824, "step": 58230 }, { "epoch": 9.5, "grad_norm": 1.073873519897461, "learning_rate": 3.1471163669230604e-05, "loss": 0.1223, "num_input_tokens_seen": 125686064, "step": 58235 }, { "epoch": 9.500815660685156, "grad_norm": 1.1935343742370605, "learning_rate": 3.146772590844529e-05, "loss": 0.2128, "num_input_tokens_seen": 125697296, "step": 58240 }, { "epoch": 9.50163132137031, "grad_norm": 0.590164303779602, "learning_rate": 3.14642880165833e-05, "loss": 0.0678, "num_input_tokens_seen": 125708688, "step": 58245 }, { "epoch": 9.502446982055465, "grad_norm": 0.05898075923323631, "learning_rate": 3.146084999371432e-05, "loss": 0.052, "num_input_tokens_seen": 125719792, "step": 58250 }, { "epoch": 9.50326264274062, "grad_norm": 0.15871576964855194, "learning_rate": 3.145741183990802e-05, "loss": 0.0247, "num_input_tokens_seen": 125729680, "step": 58255 }, { "epoch": 9.504078303425775, "grad_norm": 0.15834940969944, "learning_rate": 3.145397355523408e-05, "loss": 0.2155, "num_input_tokens_seen": 125739472, "step": 58260 }, { "epoch": 9.50489396411093, "grad_norm": 1.6090832948684692, "learning_rate": 3.145053513976217e-05, "loss": 0.0609, "num_input_tokens_seen": 125751952, "step": 58265 }, { "epoch": 9.505709624796085, "grad_norm": 0.6964912414550781, "learning_rate": 3.1447096593561985e-05, "loss": 0.0694, "num_input_tokens_seen": 125762992, "step": 58270 }, { "epoch": 9.50652528548124, "grad_norm": 0.09170998632907867, "learning_rate": 3.144365791670321e-05, "loss": 0.2138, "num_input_tokens_seen": 125774480, "step": 58275 }, { "epoch": 9.507340946166394, "grad_norm": 1.5901917219161987, "learning_rate": 3.144021910925554e-05, "loss": 0.1174, "num_input_tokens_seen": 125784944, "step": 58280 }, { "epoch": 9.50815660685155, "grad_norm": 2.263777017593384, "learning_rate": 3.143678017128865e-05, "loss": 0.1394, "num_input_tokens_seen": 125795120, "step": 58285 }, { "epoch": 9.508972267536706, "grad_norm": 0.6682602167129517, "learning_rate": 3.143334110287224e-05, "loss": 0.1435, "num_input_tokens_seen": 125806640, "step": 58290 }, { "epoch": 9.50978792822186, "grad_norm": 0.5936106443405151, "learning_rate": 3.142990190407602e-05, "loss": 0.1188, "num_input_tokens_seen": 125817712, "step": 58295 }, { "epoch": 9.510603588907015, "grad_norm": 1.0750778913497925, "learning_rate": 3.142646257496968e-05, "loss": 0.2591, "num_input_tokens_seen": 125829008, "step": 58300 }, { "epoch": 9.51141924959217, "grad_norm": 1.1161350011825562, "learning_rate": 3.142302311562292e-05, "loss": 0.1347, "num_input_tokens_seen": 125840016, "step": 58305 }, { "epoch": 9.512234910277325, "grad_norm": 0.05516873300075531, "learning_rate": 3.141958352610546e-05, "loss": 0.0337, "num_input_tokens_seen": 125850608, "step": 58310 }, { "epoch": 9.513050570962479, "grad_norm": 1.2980417013168335, "learning_rate": 3.141614380648698e-05, "loss": 0.0637, "num_input_tokens_seen": 125863312, "step": 58315 }, { "epoch": 9.513866231647635, "grad_norm": 0.21660518646240234, "learning_rate": 3.1412703956837216e-05, "loss": 0.0289, "num_input_tokens_seen": 125874960, "step": 58320 }, { "epoch": 9.51468189233279, "grad_norm": 1.0363565683364868, "learning_rate": 3.1409263977225865e-05, "loss": 0.0614, "num_input_tokens_seen": 125885264, "step": 58325 }, { "epoch": 9.515497553017944, "grad_norm": 0.06354836374521255, "learning_rate": 3.140582386772265e-05, "loss": 0.1628, "num_input_tokens_seen": 125896624, "step": 58330 }, { "epoch": 9.5163132137031, "grad_norm": 1.7248313426971436, "learning_rate": 3.140238362839729e-05, "loss": 0.1922, "num_input_tokens_seen": 125907056, "step": 58335 }, { "epoch": 9.517128874388254, "grad_norm": 0.3029329180717468, "learning_rate": 3.13989432593195e-05, "loss": 0.3157, "num_input_tokens_seen": 125917616, "step": 58340 }, { "epoch": 9.51794453507341, "grad_norm": 0.6456358432769775, "learning_rate": 3.1395502760559015e-05, "loss": 0.1462, "num_input_tokens_seen": 125927984, "step": 58345 }, { "epoch": 9.518760195758565, "grad_norm": 1.8928231000900269, "learning_rate": 3.1392062132185546e-05, "loss": 0.1229, "num_input_tokens_seen": 125938736, "step": 58350 }, { "epoch": 9.51957585644372, "grad_norm": 0.9324221611022949, "learning_rate": 3.138862137426884e-05, "loss": 0.2131, "num_input_tokens_seen": 125948816, "step": 58355 }, { "epoch": 9.520391517128875, "grad_norm": 0.11688309162855148, "learning_rate": 3.1385180486878595e-05, "loss": 0.0455, "num_input_tokens_seen": 125959504, "step": 58360 }, { "epoch": 9.521207177814029, "grad_norm": 0.27625948190689087, "learning_rate": 3.138173947008458e-05, "loss": 0.0967, "num_input_tokens_seen": 125971504, "step": 58365 }, { "epoch": 9.522022838499185, "grad_norm": 0.18612340092658997, "learning_rate": 3.1378298323956525e-05, "loss": 0.1019, "num_input_tokens_seen": 125981840, "step": 58370 }, { "epoch": 9.522838499184338, "grad_norm": 0.05891917273402214, "learning_rate": 3.137485704856415e-05, "loss": 0.1317, "num_input_tokens_seen": 125992432, "step": 58375 }, { "epoch": 9.523654159869494, "grad_norm": 1.7916145324707031, "learning_rate": 3.1371415643977225e-05, "loss": 0.2237, "num_input_tokens_seen": 126002576, "step": 58380 }, { "epoch": 9.52446982055465, "grad_norm": 0.1530412882566452, "learning_rate": 3.136797411026547e-05, "loss": 0.0662, "num_input_tokens_seen": 126011920, "step": 58385 }, { "epoch": 9.525285481239804, "grad_norm": 0.12662674486637115, "learning_rate": 3.136453244749865e-05, "loss": 0.0359, "num_input_tokens_seen": 126022864, "step": 58390 }, { "epoch": 9.52610114192496, "grad_norm": 0.20487773418426514, "learning_rate": 3.13610906557465e-05, "loss": 0.0148, "num_input_tokens_seen": 126034960, "step": 58395 }, { "epoch": 9.526916802610113, "grad_norm": 0.3937739431858063, "learning_rate": 3.135764873507878e-05, "loss": 0.0523, "num_input_tokens_seen": 126046672, "step": 58400 }, { "epoch": 9.52773246329527, "grad_norm": 0.15317033231258392, "learning_rate": 3.1354206685565234e-05, "loss": 0.0827, "num_input_tokens_seen": 126058000, "step": 58405 }, { "epoch": 9.528548123980425, "grad_norm": 0.17647485435009003, "learning_rate": 3.1350764507275645e-05, "loss": 0.0188, "num_input_tokens_seen": 126068496, "step": 58410 }, { "epoch": 9.529363784665579, "grad_norm": 0.3768817186355591, "learning_rate": 3.1347322200279745e-05, "loss": 0.0571, "num_input_tokens_seen": 126078544, "step": 58415 }, { "epoch": 9.530179445350734, "grad_norm": 1.483519434928894, "learning_rate": 3.134387976464731e-05, "loss": 0.288, "num_input_tokens_seen": 126089712, "step": 58420 }, { "epoch": 9.530995106035888, "grad_norm": 0.05669299140572548, "learning_rate": 3.13404372004481e-05, "loss": 0.1178, "num_input_tokens_seen": 126100656, "step": 58425 }, { "epoch": 9.531810766721044, "grad_norm": 0.2710403501987457, "learning_rate": 3.133699450775189e-05, "loss": 0.1795, "num_input_tokens_seen": 126111376, "step": 58430 }, { "epoch": 9.5326264274062, "grad_norm": 0.3683517277240753, "learning_rate": 3.1333551686628455e-05, "loss": 0.1118, "num_input_tokens_seen": 126122640, "step": 58435 }, { "epoch": 9.533442088091354, "grad_norm": 0.12965193390846252, "learning_rate": 3.1330108737147555e-05, "loss": 0.0855, "num_input_tokens_seen": 126132432, "step": 58440 }, { "epoch": 9.53425774877651, "grad_norm": 0.21872887015342712, "learning_rate": 3.132666565937897e-05, "loss": 0.0281, "num_input_tokens_seen": 126142576, "step": 58445 }, { "epoch": 9.535073409461663, "grad_norm": 0.09787391871213913, "learning_rate": 3.132322245339248e-05, "loss": 0.0257, "num_input_tokens_seen": 126153552, "step": 58450 }, { "epoch": 9.535889070146819, "grad_norm": 0.1288377046585083, "learning_rate": 3.1319779119257855e-05, "loss": 0.1551, "num_input_tokens_seen": 126165072, "step": 58455 }, { "epoch": 9.536704730831975, "grad_norm": 0.22559727728366852, "learning_rate": 3.131633565704491e-05, "loss": 0.0977, "num_input_tokens_seen": 126175696, "step": 58460 }, { "epoch": 9.537520391517129, "grad_norm": 0.14778955280780792, "learning_rate": 3.13128920668234e-05, "loss": 0.0989, "num_input_tokens_seen": 126187088, "step": 58465 }, { "epoch": 9.538336052202284, "grad_norm": 1.35696280002594, "learning_rate": 3.1309448348663115e-05, "loss": 0.1579, "num_input_tokens_seen": 126198480, "step": 58470 }, { "epoch": 9.539151712887438, "grad_norm": 2.1001927852630615, "learning_rate": 3.130600450263387e-05, "loss": 0.1175, "num_input_tokens_seen": 126209904, "step": 58475 }, { "epoch": 9.539967373572594, "grad_norm": 0.044075753539800644, "learning_rate": 3.130256052880543e-05, "loss": 0.0229, "num_input_tokens_seen": 126219792, "step": 58480 }, { "epoch": 9.540783034257748, "grad_norm": 0.14709511399269104, "learning_rate": 3.129911642724762e-05, "loss": 0.08, "num_input_tokens_seen": 126230256, "step": 58485 }, { "epoch": 9.541598694942904, "grad_norm": 0.10409373790025711, "learning_rate": 3.1295672198030214e-05, "loss": 0.1593, "num_input_tokens_seen": 126241136, "step": 58490 }, { "epoch": 9.54241435562806, "grad_norm": 0.9480788111686707, "learning_rate": 3.1292227841223025e-05, "loss": 0.0564, "num_input_tokens_seen": 126252048, "step": 58495 }, { "epoch": 9.543230016313213, "grad_norm": 0.2804504930973053, "learning_rate": 3.128878335689586e-05, "loss": 0.0991, "num_input_tokens_seen": 126262320, "step": 58500 }, { "epoch": 9.544045676998369, "grad_norm": 1.8194321393966675, "learning_rate": 3.128533874511852e-05, "loss": 0.2873, "num_input_tokens_seen": 126272464, "step": 58505 }, { "epoch": 9.544861337683523, "grad_norm": 0.2505822479724884, "learning_rate": 3.1281894005960815e-05, "loss": 0.0319, "num_input_tokens_seen": 126283984, "step": 58510 }, { "epoch": 9.545676998368679, "grad_norm": 0.44009050726890564, "learning_rate": 3.127844913949256e-05, "loss": 0.0584, "num_input_tokens_seen": 126295024, "step": 58515 }, { "epoch": 9.546492659053834, "grad_norm": 0.0648125484585762, "learning_rate": 3.127500414578357e-05, "loss": 0.1606, "num_input_tokens_seen": 126305296, "step": 58520 }, { "epoch": 9.547308319738988, "grad_norm": 0.07803694903850555, "learning_rate": 3.1271559024903665e-05, "loss": 0.1242, "num_input_tokens_seen": 126317264, "step": 58525 }, { "epoch": 9.548123980424144, "grad_norm": 0.36242109537124634, "learning_rate": 3.1268113776922657e-05, "loss": 0.1242, "num_input_tokens_seen": 126326640, "step": 58530 }, { "epoch": 9.548939641109298, "grad_norm": 1.2372605800628662, "learning_rate": 3.126466840191037e-05, "loss": 0.0756, "num_input_tokens_seen": 126337840, "step": 58535 }, { "epoch": 9.549755301794454, "grad_norm": 0.7804908156394958, "learning_rate": 3.126122289993663e-05, "loss": 0.1896, "num_input_tokens_seen": 126347792, "step": 58540 }, { "epoch": 9.550570962479608, "grad_norm": 0.3688790202140808, "learning_rate": 3.125777727107127e-05, "loss": 0.0972, "num_input_tokens_seen": 126359344, "step": 58545 }, { "epoch": 9.551386623164763, "grad_norm": 0.9884105920791626, "learning_rate": 3.125433151538411e-05, "loss": 0.1942, "num_input_tokens_seen": 126370064, "step": 58550 }, { "epoch": 9.552202283849919, "grad_norm": 0.2012493908405304, "learning_rate": 3.125088563294499e-05, "loss": 0.0164, "num_input_tokens_seen": 126380080, "step": 58555 }, { "epoch": 9.553017944535073, "grad_norm": 1.066035270690918, "learning_rate": 3.1247439623823735e-05, "loss": 0.0988, "num_input_tokens_seen": 126390864, "step": 58560 }, { "epoch": 9.553833605220229, "grad_norm": 0.029432717710733414, "learning_rate": 3.12439934880902e-05, "loss": 0.1167, "num_input_tokens_seen": 126402224, "step": 58565 }, { "epoch": 9.554649265905383, "grad_norm": 0.09845539927482605, "learning_rate": 3.1240547225814215e-05, "loss": 0.0137, "num_input_tokens_seen": 126412464, "step": 58570 }, { "epoch": 9.555464926590538, "grad_norm": 0.6455143094062805, "learning_rate": 3.123710083706562e-05, "loss": 0.1061, "num_input_tokens_seen": 126423600, "step": 58575 }, { "epoch": 9.556280587275694, "grad_norm": 0.5273136496543884, "learning_rate": 3.123365432191427e-05, "loss": 0.1074, "num_input_tokens_seen": 126434928, "step": 58580 }, { "epoch": 9.557096247960848, "grad_norm": 0.8689786791801453, "learning_rate": 3.123020768043e-05, "loss": 0.0839, "num_input_tokens_seen": 126444944, "step": 58585 }, { "epoch": 9.557911908646004, "grad_norm": 0.04033276438713074, "learning_rate": 3.122676091268267e-05, "loss": 0.1166, "num_input_tokens_seen": 126456208, "step": 58590 }, { "epoch": 9.558727569331158, "grad_norm": 0.08476927876472473, "learning_rate": 3.122331401874214e-05, "loss": 0.0325, "num_input_tokens_seen": 126467312, "step": 58595 }, { "epoch": 9.559543230016313, "grad_norm": 0.7193312644958496, "learning_rate": 3.121986699867824e-05, "loss": 0.1315, "num_input_tokens_seen": 126476656, "step": 58600 }, { "epoch": 9.560358890701469, "grad_norm": 0.44639357924461365, "learning_rate": 3.121641985256086e-05, "loss": 0.2307, "num_input_tokens_seen": 126487920, "step": 58605 }, { "epoch": 9.561174551386623, "grad_norm": 0.042523473501205444, "learning_rate": 3.121297258045984e-05, "loss": 0.0277, "num_input_tokens_seen": 126499216, "step": 58610 }, { "epoch": 9.561990212071779, "grad_norm": 0.08790655434131622, "learning_rate": 3.120952518244505e-05, "loss": 0.0191, "num_input_tokens_seen": 126509936, "step": 58615 }, { "epoch": 9.562805872756933, "grad_norm": 2.0301525592803955, "learning_rate": 3.1206077658586354e-05, "loss": 0.1606, "num_input_tokens_seen": 126521776, "step": 58620 }, { "epoch": 9.563621533442088, "grad_norm": 1.7546119689941406, "learning_rate": 3.120263000895361e-05, "loss": 0.0828, "num_input_tokens_seen": 126532464, "step": 58625 }, { "epoch": 9.564437194127244, "grad_norm": 0.08722571283578873, "learning_rate": 3.119918223361672e-05, "loss": 0.0656, "num_input_tokens_seen": 126542992, "step": 58630 }, { "epoch": 9.565252854812398, "grad_norm": 1.845009207725525, "learning_rate": 3.119573433264553e-05, "loss": 0.2022, "num_input_tokens_seen": 126553680, "step": 58635 }, { "epoch": 9.566068515497554, "grad_norm": 0.2662495970726013, "learning_rate": 3.119228630610992e-05, "loss": 0.0276, "num_input_tokens_seen": 126563440, "step": 58640 }, { "epoch": 9.566884176182707, "grad_norm": 0.2755752205848694, "learning_rate": 3.118883815407977e-05, "loss": 0.1936, "num_input_tokens_seen": 126573904, "step": 58645 }, { "epoch": 9.567699836867863, "grad_norm": 2.17132830619812, "learning_rate": 3.118538987662497e-05, "loss": 0.251, "num_input_tokens_seen": 126584560, "step": 58650 }, { "epoch": 9.568515497553017, "grad_norm": 0.13856908679008484, "learning_rate": 3.11819414738154e-05, "loss": 0.0313, "num_input_tokens_seen": 126595632, "step": 58655 }, { "epoch": 9.569331158238173, "grad_norm": 0.058041248470544815, "learning_rate": 3.117849294572094e-05, "loss": 0.0455, "num_input_tokens_seen": 126606800, "step": 58660 }, { "epoch": 9.570146818923329, "grad_norm": 1.1570706367492676, "learning_rate": 3.117504429241147e-05, "loss": 0.1174, "num_input_tokens_seen": 126617808, "step": 58665 }, { "epoch": 9.570962479608482, "grad_norm": 0.2265351116657257, "learning_rate": 3.117159551395692e-05, "loss": 0.216, "num_input_tokens_seen": 126628720, "step": 58670 }, { "epoch": 9.571778140293638, "grad_norm": 0.18714818358421326, "learning_rate": 3.116814661042714e-05, "loss": 0.0439, "num_input_tokens_seen": 126639664, "step": 58675 }, { "epoch": 9.572593800978792, "grad_norm": 0.0469852052628994, "learning_rate": 3.1164697581892046e-05, "loss": 0.0153, "num_input_tokens_seen": 126650736, "step": 58680 }, { "epoch": 9.573409461663948, "grad_norm": 1.995047926902771, "learning_rate": 3.1161248428421533e-05, "loss": 0.1249, "num_input_tokens_seen": 126662224, "step": 58685 }, { "epoch": 9.574225122349104, "grad_norm": 0.5961662530899048, "learning_rate": 3.115779915008551e-05, "loss": 0.0333, "num_input_tokens_seen": 126673040, "step": 58690 }, { "epoch": 9.575040783034257, "grad_norm": 0.17159362137317657, "learning_rate": 3.115434974695387e-05, "loss": 0.1083, "num_input_tokens_seen": 126683824, "step": 58695 }, { "epoch": 9.575856443719413, "grad_norm": 0.13540421426296234, "learning_rate": 3.115090021909653e-05, "loss": 0.0646, "num_input_tokens_seen": 126693488, "step": 58700 }, { "epoch": 9.576672104404567, "grad_norm": 1.619397759437561, "learning_rate": 3.114745056658339e-05, "loss": 0.2313, "num_input_tokens_seen": 126703888, "step": 58705 }, { "epoch": 9.577487765089723, "grad_norm": 0.052453018724918365, "learning_rate": 3.1144000789484375e-05, "loss": 0.2076, "num_input_tokens_seen": 126715440, "step": 58710 }, { "epoch": 9.578303425774878, "grad_norm": 0.8961605429649353, "learning_rate": 3.114055088786938e-05, "loss": 0.2611, "num_input_tokens_seen": 126725680, "step": 58715 }, { "epoch": 9.579119086460032, "grad_norm": 1.1490761041641235, "learning_rate": 3.1137100861808336e-05, "loss": 0.0419, "num_input_tokens_seen": 126736144, "step": 58720 }, { "epoch": 9.579934747145188, "grad_norm": 0.49755874276161194, "learning_rate": 3.113365071137116e-05, "loss": 0.1538, "num_input_tokens_seen": 126747504, "step": 58725 }, { "epoch": 9.580750407830342, "grad_norm": 0.03456197679042816, "learning_rate": 3.113020043662777e-05, "loss": 0.1073, "num_input_tokens_seen": 126757904, "step": 58730 }, { "epoch": 9.581566068515498, "grad_norm": 0.9279085993766785, "learning_rate": 3.1126750037648085e-05, "loss": 0.0567, "num_input_tokens_seen": 126768400, "step": 58735 }, { "epoch": 9.582381729200652, "grad_norm": 1.3848828077316284, "learning_rate": 3.112329951450204e-05, "loss": 0.1937, "num_input_tokens_seen": 126779440, "step": 58740 }, { "epoch": 9.583197389885807, "grad_norm": 1.4713363647460938, "learning_rate": 3.111984886725956e-05, "loss": 0.1308, "num_input_tokens_seen": 126790672, "step": 58745 }, { "epoch": 9.584013050570963, "grad_norm": 0.6076797246932983, "learning_rate": 3.111639809599059e-05, "loss": 0.0737, "num_input_tokens_seen": 126799824, "step": 58750 }, { "epoch": 9.584828711256117, "grad_norm": 0.4179360568523407, "learning_rate": 3.111294720076505e-05, "loss": 0.0352, "num_input_tokens_seen": 126811248, "step": 58755 }, { "epoch": 9.585644371941273, "grad_norm": 1.3553516864776611, "learning_rate": 3.1109496181652877e-05, "loss": 0.2992, "num_input_tokens_seen": 126822256, "step": 58760 }, { "epoch": 9.586460032626427, "grad_norm": 0.7180034518241882, "learning_rate": 3.1106045038724015e-05, "loss": 0.1921, "num_input_tokens_seen": 126833840, "step": 58765 }, { "epoch": 9.587275693311582, "grad_norm": 0.2418527454137802, "learning_rate": 3.1102593772048404e-05, "loss": 0.0144, "num_input_tokens_seen": 126846160, "step": 58770 }, { "epoch": 9.588091353996738, "grad_norm": 2.1489272117614746, "learning_rate": 3.109914238169598e-05, "loss": 0.0755, "num_input_tokens_seen": 126857904, "step": 58775 }, { "epoch": 9.588907014681892, "grad_norm": 0.42355650663375854, "learning_rate": 3.10956908677367e-05, "loss": 0.1346, "num_input_tokens_seen": 126869232, "step": 58780 }, { "epoch": 9.589722675367048, "grad_norm": 0.03436494618654251, "learning_rate": 3.109223923024053e-05, "loss": 0.0552, "num_input_tokens_seen": 126880784, "step": 58785 }, { "epoch": 9.590538336052202, "grad_norm": 1.0746830701828003, "learning_rate": 3.1088787469277385e-05, "loss": 0.0791, "num_input_tokens_seen": 126891600, "step": 58790 }, { "epoch": 9.591353996737357, "grad_norm": 1.174362063407898, "learning_rate": 3.108533558491725e-05, "loss": 0.138, "num_input_tokens_seen": 126903056, "step": 58795 }, { "epoch": 9.592169657422513, "grad_norm": 0.28653669357299805, "learning_rate": 3.108188357723006e-05, "loss": 0.1014, "num_input_tokens_seen": 126914032, "step": 58800 }, { "epoch": 9.592985318107667, "grad_norm": 0.04440833255648613, "learning_rate": 3.1078431446285785e-05, "loss": 0.2922, "num_input_tokens_seen": 126923920, "step": 58805 }, { "epoch": 9.593800978792823, "grad_norm": 0.14817869663238525, "learning_rate": 3.107497919215439e-05, "loss": 0.0358, "num_input_tokens_seen": 126935216, "step": 58810 }, { "epoch": 9.594616639477977, "grad_norm": 0.1171821653842926, "learning_rate": 3.1071526814905835e-05, "loss": 0.0616, "num_input_tokens_seen": 126944752, "step": 58815 }, { "epoch": 9.595432300163132, "grad_norm": 1.5535016059875488, "learning_rate": 3.106807431461008e-05, "loss": 0.1935, "num_input_tokens_seen": 126955856, "step": 58820 }, { "epoch": 9.596247960848288, "grad_norm": 1.0049325227737427, "learning_rate": 3.10646216913371e-05, "loss": 0.0828, "num_input_tokens_seen": 126966064, "step": 58825 }, { "epoch": 9.597063621533442, "grad_norm": 0.028906404972076416, "learning_rate": 3.1061168945156875e-05, "loss": 0.097, "num_input_tokens_seen": 126977424, "step": 58830 }, { "epoch": 9.597879282218598, "grad_norm": 1.8156431913375854, "learning_rate": 3.105771607613937e-05, "loss": 0.1383, "num_input_tokens_seen": 126987728, "step": 58835 }, { "epoch": 9.598694942903752, "grad_norm": 0.09521552920341492, "learning_rate": 3.1054263084354565e-05, "loss": 0.191, "num_input_tokens_seen": 126999536, "step": 58840 }, { "epoch": 9.599510603588907, "grad_norm": 0.23220306634902954, "learning_rate": 3.105080996987244e-05, "loss": 0.1705, "num_input_tokens_seen": 127010384, "step": 58845 }, { "epoch": 9.600326264274061, "grad_norm": 0.2896626889705658, "learning_rate": 3.1047356732762975e-05, "loss": 0.0196, "num_input_tokens_seen": 127020176, "step": 58850 }, { "epoch": 9.601141924959217, "grad_norm": 2.2791271209716797, "learning_rate": 3.104390337309615e-05, "loss": 0.1897, "num_input_tokens_seen": 127030192, "step": 58855 }, { "epoch": 9.601957585644373, "grad_norm": 0.8069466352462769, "learning_rate": 3.104044989094196e-05, "loss": 0.061, "num_input_tokens_seen": 127041808, "step": 58860 }, { "epoch": 9.602773246329527, "grad_norm": 0.052256517112255096, "learning_rate": 3.103699628637039e-05, "loss": 0.0905, "num_input_tokens_seen": 127052400, "step": 58865 }, { "epoch": 9.603588907014682, "grad_norm": 0.03733768314123154, "learning_rate": 3.1033542559451426e-05, "loss": 0.1289, "num_input_tokens_seen": 127063056, "step": 58870 }, { "epoch": 9.604404567699836, "grad_norm": 0.08756415545940399, "learning_rate": 3.103008871025507e-05, "loss": 0.122, "num_input_tokens_seen": 127073840, "step": 58875 }, { "epoch": 9.605220228384992, "grad_norm": 0.03416288271546364, "learning_rate": 3.102663473885132e-05, "loss": 0.0732, "num_input_tokens_seen": 127084656, "step": 58880 }, { "epoch": 9.606035889070148, "grad_norm": 0.11042583733797073, "learning_rate": 3.102318064531017e-05, "loss": 0.0495, "num_input_tokens_seen": 127095984, "step": 58885 }, { "epoch": 9.606851549755302, "grad_norm": 0.05257454141974449, "learning_rate": 3.1019726429701625e-05, "loss": 0.0652, "num_input_tokens_seen": 127105776, "step": 58890 }, { "epoch": 9.607667210440457, "grad_norm": 0.5389982461929321, "learning_rate": 3.101627209209568e-05, "loss": 0.1513, "num_input_tokens_seen": 127117424, "step": 58895 }, { "epoch": 9.608482871125611, "grad_norm": 1.2107951641082764, "learning_rate": 3.101281763256236e-05, "loss": 0.2082, "num_input_tokens_seen": 127128240, "step": 58900 }, { "epoch": 9.609298531810767, "grad_norm": 0.47585543990135193, "learning_rate": 3.100936305117166e-05, "loss": 0.0419, "num_input_tokens_seen": 127140176, "step": 58905 }, { "epoch": 9.61011419249592, "grad_norm": 0.3028699457645416, "learning_rate": 3.100590834799359e-05, "loss": 0.0853, "num_input_tokens_seen": 127149392, "step": 58910 }, { "epoch": 9.610929853181077, "grad_norm": 0.10683539509773254, "learning_rate": 3.100245352309817e-05, "loss": 0.0535, "num_input_tokens_seen": 127160496, "step": 58915 }, { "epoch": 9.611745513866232, "grad_norm": 1.9741506576538086, "learning_rate": 3.099899857655542e-05, "loss": 0.0664, "num_input_tokens_seen": 127170544, "step": 58920 }, { "epoch": 9.612561174551386, "grad_norm": 0.06345655769109726, "learning_rate": 3.0995543508435344e-05, "loss": 0.0642, "num_input_tokens_seen": 127182992, "step": 58925 }, { "epoch": 9.613376835236542, "grad_norm": 0.4007052779197693, "learning_rate": 3.099208831880798e-05, "loss": 0.0761, "num_input_tokens_seen": 127194320, "step": 58930 }, { "epoch": 9.614192495921696, "grad_norm": 0.0959867313504219, "learning_rate": 3.098863300774334e-05, "loss": 0.1072, "num_input_tokens_seen": 127205776, "step": 58935 }, { "epoch": 9.615008156606851, "grad_norm": 1.2945691347122192, "learning_rate": 3.098517757531146e-05, "loss": 0.155, "num_input_tokens_seen": 127217264, "step": 58940 }, { "epoch": 9.615823817292007, "grad_norm": 0.5589258670806885, "learning_rate": 3.098172202158236e-05, "loss": 0.1205, "num_input_tokens_seen": 127229584, "step": 58945 }, { "epoch": 9.616639477977161, "grad_norm": 0.22680383920669556, "learning_rate": 3.097826634662607e-05, "loss": 0.1172, "num_input_tokens_seen": 127239824, "step": 58950 }, { "epoch": 9.617455138662317, "grad_norm": 0.1371764838695526, "learning_rate": 3.097481055051264e-05, "loss": 0.0868, "num_input_tokens_seen": 127250672, "step": 58955 }, { "epoch": 9.61827079934747, "grad_norm": 0.5792772173881531, "learning_rate": 3.097135463331208e-05, "loss": 0.1979, "num_input_tokens_seen": 127260496, "step": 58960 }, { "epoch": 9.619086460032626, "grad_norm": 0.39044588804244995, "learning_rate": 3.096789859509445e-05, "loss": 0.1742, "num_input_tokens_seen": 127272336, "step": 58965 }, { "epoch": 9.619902120717782, "grad_norm": 1.0235285758972168, "learning_rate": 3.096444243592979e-05, "loss": 0.0774, "num_input_tokens_seen": 127284368, "step": 58970 }, { "epoch": 9.620717781402936, "grad_norm": 0.037047214806079865, "learning_rate": 3.096098615588813e-05, "loss": 0.128, "num_input_tokens_seen": 127294032, "step": 58975 }, { "epoch": 9.621533442088092, "grad_norm": 0.043973155319690704, "learning_rate": 3.0957529755039534e-05, "loss": 0.1616, "num_input_tokens_seen": 127304080, "step": 58980 }, { "epoch": 9.622349102773246, "grad_norm": 0.6297473311424255, "learning_rate": 3.0954073233454025e-05, "loss": 0.05, "num_input_tokens_seen": 127315888, "step": 58985 }, { "epoch": 9.623164763458401, "grad_norm": 1.5161482095718384, "learning_rate": 3.0950616591201674e-05, "loss": 0.0793, "num_input_tokens_seen": 127326864, "step": 58990 }, { "epoch": 9.623980424143557, "grad_norm": 0.16429191827774048, "learning_rate": 3.094715982835253e-05, "loss": 0.0519, "num_input_tokens_seen": 127336368, "step": 58995 }, { "epoch": 9.624796084828711, "grad_norm": 0.2387678325176239, "learning_rate": 3.094370294497665e-05, "loss": 0.0557, "num_input_tokens_seen": 127347824, "step": 59000 }, { "epoch": 9.625611745513867, "grad_norm": 1.6480281352996826, "learning_rate": 3.09402459411441e-05, "loss": 0.1549, "num_input_tokens_seen": 127356656, "step": 59005 }, { "epoch": 9.62642740619902, "grad_norm": 0.11868660151958466, "learning_rate": 3.093678881692491e-05, "loss": 0.2225, "num_input_tokens_seen": 127367120, "step": 59010 }, { "epoch": 9.627243066884176, "grad_norm": 1.0225486755371094, "learning_rate": 3.093333157238918e-05, "loss": 0.1413, "num_input_tokens_seen": 127378800, "step": 59015 }, { "epoch": 9.62805872756933, "grad_norm": 1.1237956285476685, "learning_rate": 3.092987420760695e-05, "loss": 0.1294, "num_input_tokens_seen": 127390000, "step": 59020 }, { "epoch": 9.628874388254486, "grad_norm": 0.046521447598934174, "learning_rate": 3.092641672264829e-05, "loss": 0.0437, "num_input_tokens_seen": 127400144, "step": 59025 }, { "epoch": 9.629690048939642, "grad_norm": 0.05439640209078789, "learning_rate": 3.092295911758329e-05, "loss": 0.0989, "num_input_tokens_seen": 127411568, "step": 59030 }, { "epoch": 9.630505709624796, "grad_norm": 0.2253159135580063, "learning_rate": 3.0919501392482003e-05, "loss": 0.1174, "num_input_tokens_seen": 127421520, "step": 59035 }, { "epoch": 9.631321370309951, "grad_norm": 0.14611424505710602, "learning_rate": 3.0916043547414515e-05, "loss": 0.0983, "num_input_tokens_seen": 127433424, "step": 59040 }, { "epoch": 9.632137030995105, "grad_norm": 0.5572043061256409, "learning_rate": 3.0912585582450895e-05, "loss": 0.0505, "num_input_tokens_seen": 127443408, "step": 59045 }, { "epoch": 9.632952691680261, "grad_norm": 1.5050040483474731, "learning_rate": 3.090912749766123e-05, "loss": 0.0879, "num_input_tokens_seen": 127454256, "step": 59050 }, { "epoch": 9.633768352365417, "grad_norm": 0.050992418080568314, "learning_rate": 3.09056692931156e-05, "loss": 0.1541, "num_input_tokens_seen": 127464624, "step": 59055 }, { "epoch": 9.63458401305057, "grad_norm": 0.20140644907951355, "learning_rate": 3.0902210968884096e-05, "loss": 0.0772, "num_input_tokens_seen": 127474192, "step": 59060 }, { "epoch": 9.635399673735726, "grad_norm": 2.3968567848205566, "learning_rate": 3.0898752525036786e-05, "loss": 0.1026, "num_input_tokens_seen": 127483056, "step": 59065 }, { "epoch": 9.63621533442088, "grad_norm": 2.1571218967437744, "learning_rate": 3.089529396164378e-05, "loss": 0.1285, "num_input_tokens_seen": 127493360, "step": 59070 }, { "epoch": 9.637030995106036, "grad_norm": 0.749186098575592, "learning_rate": 3.089183527877516e-05, "loss": 0.1299, "num_input_tokens_seen": 127503728, "step": 59075 }, { "epoch": 9.63784665579119, "grad_norm": 0.04522798955440521, "learning_rate": 3.088837647650103e-05, "loss": 0.0311, "num_input_tokens_seen": 127514672, "step": 59080 }, { "epoch": 9.638662316476346, "grad_norm": 0.106670081615448, "learning_rate": 3.088491755489148e-05, "loss": 0.047, "num_input_tokens_seen": 127525200, "step": 59085 }, { "epoch": 9.639477977161501, "grad_norm": 0.18631495535373688, "learning_rate": 3.08814585140166e-05, "loss": 0.1028, "num_input_tokens_seen": 127535824, "step": 59090 }, { "epoch": 9.640293637846655, "grad_norm": 0.05042402073740959, "learning_rate": 3.087799935394651e-05, "loss": 0.0929, "num_input_tokens_seen": 127546960, "step": 59095 }, { "epoch": 9.641109298531811, "grad_norm": 2.029496192932129, "learning_rate": 3.08745400747513e-05, "loss": 0.1927, "num_input_tokens_seen": 127557680, "step": 59100 }, { "epoch": 9.641924959216965, "grad_norm": 0.2979355454444885, "learning_rate": 3.087108067650109e-05, "loss": 0.0524, "num_input_tokens_seen": 127567696, "step": 59105 }, { "epoch": 9.64274061990212, "grad_norm": 0.14064864814281464, "learning_rate": 3.0867621159265985e-05, "loss": 0.2539, "num_input_tokens_seen": 127578736, "step": 59110 }, { "epoch": 9.643556280587276, "grad_norm": 1.102749228477478, "learning_rate": 3.0864161523116085e-05, "loss": 0.0468, "num_input_tokens_seen": 127590224, "step": 59115 }, { "epoch": 9.64437194127243, "grad_norm": 0.14381448924541473, "learning_rate": 3.086070176812152e-05, "loss": 0.0502, "num_input_tokens_seen": 127601328, "step": 59120 }, { "epoch": 9.645187601957586, "grad_norm": 0.6599516868591309, "learning_rate": 3.0857241894352394e-05, "loss": 0.0403, "num_input_tokens_seen": 127612464, "step": 59125 }, { "epoch": 9.64600326264274, "grad_norm": 1.5291526317596436, "learning_rate": 3.085378190187883e-05, "loss": 0.0679, "num_input_tokens_seen": 127622864, "step": 59130 }, { "epoch": 9.646818923327896, "grad_norm": 1.0751097202301025, "learning_rate": 3.0850321790770955e-05, "loss": 0.0602, "num_input_tokens_seen": 127634000, "step": 59135 }, { "epoch": 9.647634584013051, "grad_norm": 1.6365602016448975, "learning_rate": 3.084686156109888e-05, "loss": 0.0551, "num_input_tokens_seen": 127645264, "step": 59140 }, { "epoch": 9.648450244698205, "grad_norm": 1.055525541305542, "learning_rate": 3.0843401212932746e-05, "loss": 0.0216, "num_input_tokens_seen": 127655376, "step": 59145 }, { "epoch": 9.649265905383361, "grad_norm": 0.1302061229944229, "learning_rate": 3.083994074634266e-05, "loss": 0.2472, "num_input_tokens_seen": 127665584, "step": 59150 }, { "epoch": 9.650081566068515, "grad_norm": 0.41628000140190125, "learning_rate": 3.083648016139878e-05, "loss": 0.1368, "num_input_tokens_seen": 127675920, "step": 59155 }, { "epoch": 9.65089722675367, "grad_norm": 0.6258101463317871, "learning_rate": 3.083301945817123e-05, "loss": 0.0394, "num_input_tokens_seen": 127686928, "step": 59160 }, { "epoch": 9.651712887438826, "grad_norm": 0.22003209590911865, "learning_rate": 3.082955863673013e-05, "loss": 0.0187, "num_input_tokens_seen": 127697296, "step": 59165 }, { "epoch": 9.65252854812398, "grad_norm": 0.08491350710391998, "learning_rate": 3.082609769714563e-05, "loss": 0.1582, "num_input_tokens_seen": 127707312, "step": 59170 }, { "epoch": 9.653344208809136, "grad_norm": 0.1588771790266037, "learning_rate": 3.082263663948787e-05, "loss": 0.0588, "num_input_tokens_seen": 127719056, "step": 59175 }, { "epoch": 9.65415986949429, "grad_norm": 0.21589887142181396, "learning_rate": 3.081917546382699e-05, "loss": 0.1045, "num_input_tokens_seen": 127730864, "step": 59180 }, { "epoch": 9.654975530179446, "grad_norm": 1.2377804517745972, "learning_rate": 3.0815714170233147e-05, "loss": 0.0788, "num_input_tokens_seen": 127740944, "step": 59185 }, { "epoch": 9.655791190864601, "grad_norm": 0.023748282343149185, "learning_rate": 3.081225275877647e-05, "loss": 0.0444, "num_input_tokens_seen": 127752016, "step": 59190 }, { "epoch": 9.656606851549755, "grad_norm": 0.14540915191173553, "learning_rate": 3.0808791229527115e-05, "loss": 0.0161, "num_input_tokens_seen": 127762480, "step": 59195 }, { "epoch": 9.65742251223491, "grad_norm": 0.15392164885997772, "learning_rate": 3.080532958255524e-05, "loss": 0.0817, "num_input_tokens_seen": 127773232, "step": 59200 }, { "epoch": 9.658238172920065, "grad_norm": 0.1813262701034546, "learning_rate": 3.080186781793099e-05, "loss": 0.0538, "num_input_tokens_seen": 127784112, "step": 59205 }, { "epoch": 9.65905383360522, "grad_norm": 0.09104301035404205, "learning_rate": 3.079840593572455e-05, "loss": 0.1272, "num_input_tokens_seen": 127795600, "step": 59210 }, { "epoch": 9.659869494290374, "grad_norm": 1.311643362045288, "learning_rate": 3.079494393600604e-05, "loss": 0.148, "num_input_tokens_seen": 127805648, "step": 59215 }, { "epoch": 9.66068515497553, "grad_norm": 1.03512704372406, "learning_rate": 3.079148181884564e-05, "loss": 0.0567, "num_input_tokens_seen": 127816560, "step": 59220 }, { "epoch": 9.661500815660686, "grad_norm": 0.37761345505714417, "learning_rate": 3.078801958431352e-05, "loss": 0.1979, "num_input_tokens_seen": 127828784, "step": 59225 }, { "epoch": 9.66231647634584, "grad_norm": 1.975152850151062, "learning_rate": 3.078455723247983e-05, "loss": 0.1152, "num_input_tokens_seen": 127839088, "step": 59230 }, { "epoch": 9.663132137030995, "grad_norm": 0.7968887686729431, "learning_rate": 3.078109476341476e-05, "loss": 0.1455, "num_input_tokens_seen": 127848016, "step": 59235 }, { "epoch": 9.66394779771615, "grad_norm": 0.046066854149103165, "learning_rate": 3.077763217718847e-05, "loss": 0.0878, "num_input_tokens_seen": 127858320, "step": 59240 }, { "epoch": 9.664763458401305, "grad_norm": 0.6352246999740601, "learning_rate": 3.0774169473871135e-05, "loss": 0.2129, "num_input_tokens_seen": 127869936, "step": 59245 }, { "epoch": 9.66557911908646, "grad_norm": 0.4743174612522125, "learning_rate": 3.077070665353293e-05, "loss": 0.0809, "num_input_tokens_seen": 127881040, "step": 59250 }, { "epoch": 9.666394779771615, "grad_norm": 0.36358433961868286, "learning_rate": 3.076724371624402e-05, "loss": 0.182, "num_input_tokens_seen": 127891792, "step": 59255 }, { "epoch": 9.66721044045677, "grad_norm": 3.0209600925445557, "learning_rate": 3.076378066207462e-05, "loss": 0.3519, "num_input_tokens_seen": 127903376, "step": 59260 }, { "epoch": 9.668026101141924, "grad_norm": 0.4670465588569641, "learning_rate": 3.076031749109489e-05, "loss": 0.048, "num_input_tokens_seen": 127914576, "step": 59265 }, { "epoch": 9.66884176182708, "grad_norm": 0.13665662705898285, "learning_rate": 3.075685420337501e-05, "loss": 0.036, "num_input_tokens_seen": 127924784, "step": 59270 }, { "epoch": 9.669657422512234, "grad_norm": 0.6138381958007812, "learning_rate": 3.075339079898517e-05, "loss": 0.1109, "num_input_tokens_seen": 127934800, "step": 59275 }, { "epoch": 9.67047308319739, "grad_norm": 0.902169942855835, "learning_rate": 3.074992727799558e-05, "loss": 0.136, "num_input_tokens_seen": 127945584, "step": 59280 }, { "epoch": 9.671288743882545, "grad_norm": 0.9446676969528198, "learning_rate": 3.0746463640476414e-05, "loss": 0.1841, "num_input_tokens_seen": 127956272, "step": 59285 }, { "epoch": 9.6721044045677, "grad_norm": 0.49906304478645325, "learning_rate": 3.074299988649788e-05, "loss": 0.0669, "num_input_tokens_seen": 127966672, "step": 59290 }, { "epoch": 9.672920065252855, "grad_norm": 2.937812328338623, "learning_rate": 3.073953601613016e-05, "loss": 0.3541, "num_input_tokens_seen": 127978032, "step": 59295 }, { "epoch": 9.673735725938009, "grad_norm": 0.3416043817996979, "learning_rate": 3.0736072029443464e-05, "loss": 0.0845, "num_input_tokens_seen": 127989072, "step": 59300 }, { "epoch": 9.674551386623165, "grad_norm": 0.05236706882715225, "learning_rate": 3.0732607926507986e-05, "loss": 0.0093, "num_input_tokens_seen": 127999568, "step": 59305 }, { "epoch": 9.67536704730832, "grad_norm": 0.6217676997184753, "learning_rate": 3.0729143707393936e-05, "loss": 0.0644, "num_input_tokens_seen": 128010256, "step": 59310 }, { "epoch": 9.676182707993474, "grad_norm": 0.06276656687259674, "learning_rate": 3.072567937217153e-05, "loss": 0.0322, "num_input_tokens_seen": 128020176, "step": 59315 }, { "epoch": 9.67699836867863, "grad_norm": 0.13871130347251892, "learning_rate": 3.0722214920910965e-05, "loss": 0.0944, "num_input_tokens_seen": 128032496, "step": 59320 }, { "epoch": 9.677814029363784, "grad_norm": 0.06435954570770264, "learning_rate": 3.0718750353682454e-05, "loss": 0.0193, "num_input_tokens_seen": 128043632, "step": 59325 }, { "epoch": 9.67862969004894, "grad_norm": 0.03655462712049484, "learning_rate": 3.0715285670556214e-05, "loss": 0.1619, "num_input_tokens_seen": 128054608, "step": 59330 }, { "epoch": 9.679445350734095, "grad_norm": 0.1442849189043045, "learning_rate": 3.071182087160246e-05, "loss": 0.0377, "num_input_tokens_seen": 128065360, "step": 59335 }, { "epoch": 9.68026101141925, "grad_norm": 0.10637737065553665, "learning_rate": 3.07083559568914e-05, "loss": 0.066, "num_input_tokens_seen": 128076464, "step": 59340 }, { "epoch": 9.681076672104405, "grad_norm": 0.4605633318424225, "learning_rate": 3.070489092649328e-05, "loss": 0.0318, "num_input_tokens_seen": 128086320, "step": 59345 }, { "epoch": 9.681892332789559, "grad_norm": 0.15266914665699005, "learning_rate": 3.07014257804783e-05, "loss": 0.0377, "num_input_tokens_seen": 128096208, "step": 59350 }, { "epoch": 9.682707993474715, "grad_norm": 0.6863635778427124, "learning_rate": 3.06979605189167e-05, "loss": 0.1511, "num_input_tokens_seen": 128107376, "step": 59355 }, { "epoch": 9.68352365415987, "grad_norm": 0.0942421555519104, "learning_rate": 3.06944951418787e-05, "loss": 0.0376, "num_input_tokens_seen": 128118800, "step": 59360 }, { "epoch": 9.684339314845024, "grad_norm": 1.5573742389678955, "learning_rate": 3.069102964943453e-05, "loss": 0.2406, "num_input_tokens_seen": 128131152, "step": 59365 }, { "epoch": 9.68515497553018, "grad_norm": 0.20473292469978333, "learning_rate": 3.068756404165442e-05, "loss": 0.0523, "num_input_tokens_seen": 128142480, "step": 59370 }, { "epoch": 9.685970636215334, "grad_norm": 0.7098509669303894, "learning_rate": 3.068409831860861e-05, "loss": 0.0426, "num_input_tokens_seen": 128152816, "step": 59375 }, { "epoch": 9.68678629690049, "grad_norm": 1.8067643642425537, "learning_rate": 3.0680632480367346e-05, "loss": 0.0459, "num_input_tokens_seen": 128163632, "step": 59380 }, { "epoch": 9.687601957585644, "grad_norm": 0.06642849743366241, "learning_rate": 3.067716652700085e-05, "loss": 0.0119, "num_input_tokens_seen": 128174640, "step": 59385 }, { "epoch": 9.6884176182708, "grad_norm": 0.8378840088844299, "learning_rate": 3.067370045857937e-05, "loss": 0.1728, "num_input_tokens_seen": 128184144, "step": 59390 }, { "epoch": 9.689233278955955, "grad_norm": 0.03852517530322075, "learning_rate": 3.067023427517316e-05, "loss": 0.0699, "num_input_tokens_seen": 128192976, "step": 59395 }, { "epoch": 9.690048939641109, "grad_norm": 0.026498539373278618, "learning_rate": 3.0666767976852455e-05, "loss": 0.1104, "num_input_tokens_seen": 128204144, "step": 59400 }, { "epoch": 9.690864600326265, "grad_norm": 1.2470811605453491, "learning_rate": 3.0663301563687515e-05, "loss": 0.0891, "num_input_tokens_seen": 128215600, "step": 59405 }, { "epoch": 9.691680261011419, "grad_norm": 0.2583214342594147, "learning_rate": 3.065983503574858e-05, "loss": 0.1409, "num_input_tokens_seen": 128226672, "step": 59410 }, { "epoch": 9.692495921696574, "grad_norm": 1.8314781188964844, "learning_rate": 3.06563683931059e-05, "loss": 0.157, "num_input_tokens_seen": 128237936, "step": 59415 }, { "epoch": 9.69331158238173, "grad_norm": 0.92140793800354, "learning_rate": 3.065290163582974e-05, "loss": 0.0587, "num_input_tokens_seen": 128248688, "step": 59420 }, { "epoch": 9.694127243066884, "grad_norm": 0.959821879863739, "learning_rate": 3.064943476399037e-05, "loss": 0.0847, "num_input_tokens_seen": 128259440, "step": 59425 }, { "epoch": 9.69494290375204, "grad_norm": 0.08012363314628601, "learning_rate": 3.064596777765803e-05, "loss": 0.0895, "num_input_tokens_seen": 128268880, "step": 59430 }, { "epoch": 9.695758564437194, "grad_norm": 0.04222622141242027, "learning_rate": 3.064250067690299e-05, "loss": 0.0204, "num_input_tokens_seen": 128277744, "step": 59435 }, { "epoch": 9.69657422512235, "grad_norm": 0.1589697152376175, "learning_rate": 3.063903346179552e-05, "loss": 0.0974, "num_input_tokens_seen": 128288176, "step": 59440 }, { "epoch": 9.697389885807503, "grad_norm": 0.8813496828079224, "learning_rate": 3.0635566132405875e-05, "loss": 0.0294, "num_input_tokens_seen": 128298672, "step": 59445 }, { "epoch": 9.698205546492659, "grad_norm": 0.05688010901212692, "learning_rate": 3.063209868880434e-05, "loss": 0.0304, "num_input_tokens_seen": 128309264, "step": 59450 }, { "epoch": 9.699021207177815, "grad_norm": 2.536278009414673, "learning_rate": 3.062863113106118e-05, "loss": 0.1919, "num_input_tokens_seen": 128320656, "step": 59455 }, { "epoch": 9.699836867862969, "grad_norm": 0.4103579521179199, "learning_rate": 3.0625163459246666e-05, "loss": 0.0487, "num_input_tokens_seen": 128332016, "step": 59460 }, { "epoch": 9.700652528548124, "grad_norm": 2.284395217895508, "learning_rate": 3.062169567343108e-05, "loss": 0.0667, "num_input_tokens_seen": 128342448, "step": 59465 }, { "epoch": 9.701468189233278, "grad_norm": 1.429802656173706, "learning_rate": 3.06182277736847e-05, "loss": 0.0835, "num_input_tokens_seen": 128352368, "step": 59470 }, { "epoch": 9.702283849918434, "grad_norm": 1.4704720973968506, "learning_rate": 3.061475976007781e-05, "loss": 0.3501, "num_input_tokens_seen": 128361488, "step": 59475 }, { "epoch": 9.70309951060359, "grad_norm": 0.07690198719501495, "learning_rate": 3.061129163268069e-05, "loss": 0.1709, "num_input_tokens_seen": 128371632, "step": 59480 }, { "epoch": 9.703915171288743, "grad_norm": 0.22091251611709595, "learning_rate": 3.0607823391563624e-05, "loss": 0.2602, "num_input_tokens_seen": 128380688, "step": 59485 }, { "epoch": 9.7047308319739, "grad_norm": 0.14289748668670654, "learning_rate": 3.0604355036796905e-05, "loss": 0.1719, "num_input_tokens_seen": 128390480, "step": 59490 }, { "epoch": 9.705546492659053, "grad_norm": 0.5532777905464172, "learning_rate": 3.060088656845082e-05, "loss": 0.2426, "num_input_tokens_seen": 128401712, "step": 59495 }, { "epoch": 9.706362153344209, "grad_norm": 0.03010253980755806, "learning_rate": 3.059741798659566e-05, "loss": 0.0398, "num_input_tokens_seen": 128413040, "step": 59500 }, { "epoch": 9.707177814029365, "grad_norm": 0.09366779774427414, "learning_rate": 3.059394929130173e-05, "loss": 0.147, "num_input_tokens_seen": 128421744, "step": 59505 }, { "epoch": 9.707993474714518, "grad_norm": 0.48921898007392883, "learning_rate": 3.0590480482639316e-05, "loss": 0.175, "num_input_tokens_seen": 128432528, "step": 59510 }, { "epoch": 9.708809135399674, "grad_norm": 0.03764723986387253, "learning_rate": 3.058701156067873e-05, "loss": 0.0343, "num_input_tokens_seen": 128443664, "step": 59515 }, { "epoch": 9.709624796084828, "grad_norm": 0.019049031659960747, "learning_rate": 3.058354252549025e-05, "loss": 0.0345, "num_input_tokens_seen": 128454256, "step": 59520 }, { "epoch": 9.710440456769984, "grad_norm": 0.05077332258224487, "learning_rate": 3.058007337714421e-05, "loss": 0.0374, "num_input_tokens_seen": 128466768, "step": 59525 }, { "epoch": 9.71125611745514, "grad_norm": 0.6232810020446777, "learning_rate": 3.057660411571091e-05, "loss": 0.1237, "num_input_tokens_seen": 128477264, "step": 59530 }, { "epoch": 9.712071778140293, "grad_norm": 1.4366434812545776, "learning_rate": 3.057313474126065e-05, "loss": 0.1877, "num_input_tokens_seen": 128489040, "step": 59535 }, { "epoch": 9.71288743882545, "grad_norm": 0.12148507684469223, "learning_rate": 3.056966525386375e-05, "loss": 0.0242, "num_input_tokens_seen": 128500176, "step": 59540 }, { "epoch": 9.713703099510603, "grad_norm": 0.29552432894706726, "learning_rate": 3.05661956535905e-05, "loss": 0.1761, "num_input_tokens_seen": 128510096, "step": 59545 }, { "epoch": 9.714518760195759, "grad_norm": 0.41154736280441284, "learning_rate": 3.0562725940511245e-05, "loss": 0.0713, "num_input_tokens_seen": 128520976, "step": 59550 }, { "epoch": 9.715334420880914, "grad_norm": 0.09111075848340988, "learning_rate": 3.055925611469629e-05, "loss": 0.1358, "num_input_tokens_seen": 128532080, "step": 59555 }, { "epoch": 9.716150081566068, "grad_norm": 1.3458720445632935, "learning_rate": 3.055578617621596e-05, "loss": 0.1871, "num_input_tokens_seen": 128544176, "step": 59560 }, { "epoch": 9.716965742251224, "grad_norm": 0.21162739396095276, "learning_rate": 3.055231612514057e-05, "loss": 0.1079, "num_input_tokens_seen": 128555472, "step": 59565 }, { "epoch": 9.717781402936378, "grad_norm": 0.09962113946676254, "learning_rate": 3.0548845961540456e-05, "loss": 0.0515, "num_input_tokens_seen": 128565968, "step": 59570 }, { "epoch": 9.718597063621534, "grad_norm": 1.2625328302383423, "learning_rate": 3.054537568548594e-05, "loss": 0.315, "num_input_tokens_seen": 128576528, "step": 59575 }, { "epoch": 9.719412724306688, "grad_norm": 0.8449644446372986, "learning_rate": 3.0541905297047346e-05, "loss": 0.0494, "num_input_tokens_seen": 128586928, "step": 59580 }, { "epoch": 9.720228384991843, "grad_norm": 2.1174979209899902, "learning_rate": 3.0538434796295015e-05, "loss": 0.1186, "num_input_tokens_seen": 128599280, "step": 59585 }, { "epoch": 9.721044045676999, "grad_norm": 0.09380583465099335, "learning_rate": 3.0534964183299264e-05, "loss": 0.0291, "num_input_tokens_seen": 128609424, "step": 59590 }, { "epoch": 9.721859706362153, "grad_norm": 0.15157407522201538, "learning_rate": 3.053149345813045e-05, "loss": 0.012, "num_input_tokens_seen": 128620368, "step": 59595 }, { "epoch": 9.722675367047309, "grad_norm": 0.24208088219165802, "learning_rate": 3.052802262085891e-05, "loss": 0.086, "num_input_tokens_seen": 128631792, "step": 59600 }, { "epoch": 9.723491027732463, "grad_norm": 0.3012649416923523, "learning_rate": 3.0524551671554976e-05, "loss": 0.0856, "num_input_tokens_seen": 128641584, "step": 59605 }, { "epoch": 9.724306688417618, "grad_norm": 0.03947323560714722, "learning_rate": 3.0521080610288994e-05, "loss": 0.2859, "num_input_tokens_seen": 128651824, "step": 59610 }, { "epoch": 9.725122349102774, "grad_norm": 0.2614590525627136, "learning_rate": 3.0517609437131314e-05, "loss": 0.2092, "num_input_tokens_seen": 128661616, "step": 59615 }, { "epoch": 9.725938009787928, "grad_norm": 0.6021141409873962, "learning_rate": 3.051413815215227e-05, "loss": 0.0337, "num_input_tokens_seen": 128671952, "step": 59620 }, { "epoch": 9.726753670473084, "grad_norm": 2.506671190261841, "learning_rate": 3.0510666755422217e-05, "loss": 0.094, "num_input_tokens_seen": 128682928, "step": 59625 }, { "epoch": 9.727569331158238, "grad_norm": 1.1171890497207642, "learning_rate": 3.0507195247011522e-05, "loss": 0.2604, "num_input_tokens_seen": 128692592, "step": 59630 }, { "epoch": 9.728384991843393, "grad_norm": 0.7751289010047913, "learning_rate": 3.0503723626990525e-05, "loss": 0.0291, "num_input_tokens_seen": 128703280, "step": 59635 }, { "epoch": 9.729200652528547, "grad_norm": 0.1913212686777115, "learning_rate": 3.0500251895429587e-05, "loss": 0.0098, "num_input_tokens_seen": 128713904, "step": 59640 }, { "epoch": 9.730016313213703, "grad_norm": 0.5927703976631165, "learning_rate": 3.0496780052399072e-05, "loss": 0.054, "num_input_tokens_seen": 128723696, "step": 59645 }, { "epoch": 9.730831973898859, "grad_norm": 1.681679129600525, "learning_rate": 3.0493308097969332e-05, "loss": 0.1028, "num_input_tokens_seen": 128735120, "step": 59650 }, { "epoch": 9.731647634584013, "grad_norm": 0.030634764581918716, "learning_rate": 3.0489836032210734e-05, "loss": 0.1203, "num_input_tokens_seen": 128745232, "step": 59655 }, { "epoch": 9.732463295269168, "grad_norm": 0.4911704659461975, "learning_rate": 3.0486363855193644e-05, "loss": 0.0651, "num_input_tokens_seen": 128755856, "step": 59660 }, { "epoch": 9.733278955954322, "grad_norm": 0.44278720021247864, "learning_rate": 3.0482891566988437e-05, "loss": 0.0293, "num_input_tokens_seen": 128766768, "step": 59665 }, { "epoch": 9.734094616639478, "grad_norm": 2.3992364406585693, "learning_rate": 3.047941916766547e-05, "loss": 0.0858, "num_input_tokens_seen": 128778448, "step": 59670 }, { "epoch": 9.734910277324634, "grad_norm": 0.4707297384738922, "learning_rate": 3.0475946657295124e-05, "loss": 0.1151, "num_input_tokens_seen": 128788176, "step": 59675 }, { "epoch": 9.735725938009788, "grad_norm": 0.8976297974586487, "learning_rate": 3.0472474035947772e-05, "loss": 0.0462, "num_input_tokens_seen": 128799280, "step": 59680 }, { "epoch": 9.736541598694943, "grad_norm": 0.05544804781675339, "learning_rate": 3.046900130369379e-05, "loss": 0.018, "num_input_tokens_seen": 128808752, "step": 59685 }, { "epoch": 9.737357259380097, "grad_norm": 0.39121559262275696, "learning_rate": 3.046552846060356e-05, "loss": 0.0247, "num_input_tokens_seen": 128819728, "step": 59690 }, { "epoch": 9.738172920065253, "grad_norm": 1.2636762857437134, "learning_rate": 3.046205550674746e-05, "loss": 0.1014, "num_input_tokens_seen": 128831440, "step": 59695 }, { "epoch": 9.738988580750409, "grad_norm": 1.0203708410263062, "learning_rate": 3.0458582442195882e-05, "loss": 0.0506, "num_input_tokens_seen": 128841200, "step": 59700 }, { "epoch": 9.739804241435563, "grad_norm": 0.025082025676965714, "learning_rate": 3.0455109267019205e-05, "loss": 0.0281, "num_input_tokens_seen": 128852112, "step": 59705 }, { "epoch": 9.740619902120718, "grad_norm": 0.5312585234642029, "learning_rate": 3.0451635981287812e-05, "loss": 0.0285, "num_input_tokens_seen": 128864592, "step": 59710 }, { "epoch": 9.741435562805872, "grad_norm": 0.017694812268018723, "learning_rate": 3.0448162585072105e-05, "loss": 0.0256, "num_input_tokens_seen": 128876272, "step": 59715 }, { "epoch": 9.742251223491028, "grad_norm": 1.1365703344345093, "learning_rate": 3.0444689078442474e-05, "loss": 0.0438, "num_input_tokens_seen": 128887056, "step": 59720 }, { "epoch": 9.743066884176184, "grad_norm": 1.8132301568984985, "learning_rate": 3.0441215461469306e-05, "loss": 0.2114, "num_input_tokens_seen": 128898128, "step": 59725 }, { "epoch": 9.743882544861338, "grad_norm": 0.27688655257225037, "learning_rate": 3.0437741734223006e-05, "loss": 0.0353, "num_input_tokens_seen": 128909040, "step": 59730 }, { "epoch": 9.744698205546493, "grad_norm": 0.029533332213759422, "learning_rate": 3.0434267896773965e-05, "loss": 0.1501, "num_input_tokens_seen": 128918288, "step": 59735 }, { "epoch": 9.745513866231647, "grad_norm": 2.4605515003204346, "learning_rate": 3.04307939491926e-05, "loss": 0.1794, "num_input_tokens_seen": 128929584, "step": 59740 }, { "epoch": 9.746329526916803, "grad_norm": 0.14813442528247833, "learning_rate": 3.0427319891549304e-05, "loss": 0.0363, "num_input_tokens_seen": 128940208, "step": 59745 }, { "epoch": 9.747145187601957, "grad_norm": 1.0312693119049072, "learning_rate": 3.0423845723914484e-05, "loss": 0.2103, "num_input_tokens_seen": 128950224, "step": 59750 }, { "epoch": 9.747960848287113, "grad_norm": 0.5052252411842346, "learning_rate": 3.0420371446358552e-05, "loss": 0.0265, "num_input_tokens_seen": 128961072, "step": 59755 }, { "epoch": 9.748776508972268, "grad_norm": 0.2828357517719269, "learning_rate": 3.0416897058951916e-05, "loss": 0.0337, "num_input_tokens_seen": 128972528, "step": 59760 }, { "epoch": 9.749592169657422, "grad_norm": 0.036927446722984314, "learning_rate": 3.0413422561764986e-05, "loss": 0.2821, "num_input_tokens_seen": 128982480, "step": 59765 }, { "epoch": 9.750407830342578, "grad_norm": 2.75443434715271, "learning_rate": 3.0409947954868183e-05, "loss": 0.1193, "num_input_tokens_seen": 128992304, "step": 59770 }, { "epoch": 9.751223491027732, "grad_norm": 2.136528253555298, "learning_rate": 3.0406473238331924e-05, "loss": 0.2087, "num_input_tokens_seen": 129003344, "step": 59775 }, { "epoch": 9.752039151712887, "grad_norm": 1.4226765632629395, "learning_rate": 3.0402998412226624e-05, "loss": 0.0832, "num_input_tokens_seen": 129014224, "step": 59780 }, { "epoch": 9.752854812398043, "grad_norm": 0.18167081475257874, "learning_rate": 3.0399523476622704e-05, "loss": 0.1165, "num_input_tokens_seen": 129024016, "step": 59785 }, { "epoch": 9.753670473083197, "grad_norm": 0.5744462609291077, "learning_rate": 3.0396048431590595e-05, "loss": 0.0336, "num_input_tokens_seen": 129033840, "step": 59790 }, { "epoch": 9.754486133768353, "grad_norm": 0.3933996856212616, "learning_rate": 3.039257327720072e-05, "loss": 0.0595, "num_input_tokens_seen": 129044848, "step": 59795 }, { "epoch": 9.755301794453507, "grad_norm": 0.4052698612213135, "learning_rate": 3.038909801352351e-05, "loss": 0.0633, "num_input_tokens_seen": 129055664, "step": 59800 }, { "epoch": 9.756117455138662, "grad_norm": 0.05738910287618637, "learning_rate": 3.0385622640629384e-05, "loss": 0.2791, "num_input_tokens_seen": 129066288, "step": 59805 }, { "epoch": 9.756933115823816, "grad_norm": 0.13810135424137115, "learning_rate": 3.0382147158588786e-05, "loss": 0.0886, "num_input_tokens_seen": 129076016, "step": 59810 }, { "epoch": 9.757748776508972, "grad_norm": 0.06567531079053879, "learning_rate": 3.037867156747215e-05, "loss": 0.009, "num_input_tokens_seen": 129086704, "step": 59815 }, { "epoch": 9.758564437194128, "grad_norm": 0.07819091528654099, "learning_rate": 3.037519586734991e-05, "loss": 0.0879, "num_input_tokens_seen": 129096560, "step": 59820 }, { "epoch": 9.759380097879282, "grad_norm": 1.4649089574813843, "learning_rate": 3.0371720058292507e-05, "loss": 0.0817, "num_input_tokens_seen": 129107984, "step": 59825 }, { "epoch": 9.760195758564437, "grad_norm": 0.04617980122566223, "learning_rate": 3.0368244140370383e-05, "loss": 0.112, "num_input_tokens_seen": 129118352, "step": 59830 }, { "epoch": 9.761011419249591, "grad_norm": 0.5395300388336182, "learning_rate": 3.036476811365398e-05, "loss": 0.1628, "num_input_tokens_seen": 129129072, "step": 59835 }, { "epoch": 9.761827079934747, "grad_norm": 0.3359117805957794, "learning_rate": 3.036129197821374e-05, "loss": 0.102, "num_input_tokens_seen": 129140496, "step": 59840 }, { "epoch": 9.762642740619903, "grad_norm": 0.32261359691619873, "learning_rate": 3.0357815734120122e-05, "loss": 0.0279, "num_input_tokens_seen": 129151952, "step": 59845 }, { "epoch": 9.763458401305057, "grad_norm": 3.3061249256134033, "learning_rate": 3.0354339381443576e-05, "loss": 0.2445, "num_input_tokens_seen": 129165008, "step": 59850 }, { "epoch": 9.764274061990212, "grad_norm": 1.753862738609314, "learning_rate": 3.035086292025454e-05, "loss": 0.2697, "num_input_tokens_seen": 129176272, "step": 59855 }, { "epoch": 9.765089722675366, "grad_norm": 0.20090551674365997, "learning_rate": 3.0347386350623487e-05, "loss": 0.0755, "num_input_tokens_seen": 129187504, "step": 59860 }, { "epoch": 9.765905383360522, "grad_norm": 0.05506078526377678, "learning_rate": 3.034390967262086e-05, "loss": 0.0758, "num_input_tokens_seen": 129198928, "step": 59865 }, { "epoch": 9.766721044045678, "grad_norm": 0.6010008454322815, "learning_rate": 3.0340432886317132e-05, "loss": 0.0161, "num_input_tokens_seen": 129211024, "step": 59870 }, { "epoch": 9.767536704730832, "grad_norm": 0.9398571848869324, "learning_rate": 3.0336955991782755e-05, "loss": 0.0739, "num_input_tokens_seen": 129222096, "step": 59875 }, { "epoch": 9.768352365415987, "grad_norm": 1.653100609779358, "learning_rate": 3.0333478989088192e-05, "loss": 0.2187, "num_input_tokens_seen": 129232880, "step": 59880 }, { "epoch": 9.769168026101141, "grad_norm": 0.331159770488739, "learning_rate": 3.0330001878303908e-05, "loss": 0.0284, "num_input_tokens_seen": 129244560, "step": 59885 }, { "epoch": 9.769983686786297, "grad_norm": 0.9750331044197083, "learning_rate": 3.0326524659500382e-05, "loss": 0.1021, "num_input_tokens_seen": 129255824, "step": 59890 }, { "epoch": 9.770799347471453, "grad_norm": 0.6148929595947266, "learning_rate": 3.0323047332748073e-05, "loss": 0.1503, "num_input_tokens_seen": 129267248, "step": 59895 }, { "epoch": 9.771615008156607, "grad_norm": 1.2400686740875244, "learning_rate": 3.0319569898117456e-05, "loss": 0.0512, "num_input_tokens_seen": 129277616, "step": 59900 }, { "epoch": 9.772430668841762, "grad_norm": 1.1577051877975464, "learning_rate": 3.0316092355679005e-05, "loss": 0.057, "num_input_tokens_seen": 129288112, "step": 59905 }, { "epoch": 9.773246329526916, "grad_norm": 0.3810366988182068, "learning_rate": 3.0312614705503195e-05, "loss": 0.2501, "num_input_tokens_seen": 129298064, "step": 59910 }, { "epoch": 9.774061990212072, "grad_norm": 0.1498691290616989, "learning_rate": 3.0309136947660515e-05, "loss": 0.0074, "num_input_tokens_seen": 129307408, "step": 59915 }, { "epoch": 9.774877650897226, "grad_norm": 1.4249377250671387, "learning_rate": 3.030565908222144e-05, "loss": 0.1375, "num_input_tokens_seen": 129318128, "step": 59920 }, { "epoch": 9.775693311582382, "grad_norm": 0.23349978029727936, "learning_rate": 3.030218110925645e-05, "loss": 0.0455, "num_input_tokens_seen": 129328080, "step": 59925 }, { "epoch": 9.776508972267537, "grad_norm": 0.9213045835494995, "learning_rate": 3.029870302883604e-05, "loss": 0.1131, "num_input_tokens_seen": 129339120, "step": 59930 }, { "epoch": 9.777324632952691, "grad_norm": 1.3249189853668213, "learning_rate": 3.0295224841030685e-05, "loss": 0.2085, "num_input_tokens_seen": 129349648, "step": 59935 }, { "epoch": 9.778140293637847, "grad_norm": 1.040073275566101, "learning_rate": 3.029174654591088e-05, "loss": 0.1283, "num_input_tokens_seen": 129360400, "step": 59940 }, { "epoch": 9.778955954323001, "grad_norm": 1.8092948198318481, "learning_rate": 3.0288268143547116e-05, "loss": 0.1886, "num_input_tokens_seen": 129373168, "step": 59945 }, { "epoch": 9.779771615008157, "grad_norm": 0.3107238709926605, "learning_rate": 3.0284789634009895e-05, "loss": 0.0364, "num_input_tokens_seen": 129382736, "step": 59950 }, { "epoch": 9.780587275693312, "grad_norm": 0.11686274409294128, "learning_rate": 3.0281311017369706e-05, "loss": 0.1642, "num_input_tokens_seen": 129394160, "step": 59955 }, { "epoch": 9.781402936378466, "grad_norm": 0.11921320855617523, "learning_rate": 3.027783229369705e-05, "loss": 0.0583, "num_input_tokens_seen": 129405744, "step": 59960 }, { "epoch": 9.782218597063622, "grad_norm": 0.6772052049636841, "learning_rate": 3.0274353463062434e-05, "loss": 0.039, "num_input_tokens_seen": 129416368, "step": 59965 }, { "epoch": 9.783034257748776, "grad_norm": 0.13554425537586212, "learning_rate": 3.0270874525536348e-05, "loss": 0.0235, "num_input_tokens_seen": 129426928, "step": 59970 }, { "epoch": 9.783849918433932, "grad_norm": 1.940393328666687, "learning_rate": 3.0267395481189298e-05, "loss": 0.1966, "num_input_tokens_seen": 129438896, "step": 59975 }, { "epoch": 9.784665579119086, "grad_norm": 0.02147758938372135, "learning_rate": 3.0263916330091803e-05, "loss": 0.1019, "num_input_tokens_seen": 129449808, "step": 59980 }, { "epoch": 9.785481239804241, "grad_norm": 0.13596254587173462, "learning_rate": 3.0260437072314364e-05, "loss": 0.0944, "num_input_tokens_seen": 129461232, "step": 59985 }, { "epoch": 9.786296900489397, "grad_norm": 1.1899518966674805, "learning_rate": 3.0256957707927495e-05, "loss": 0.2257, "num_input_tokens_seen": 129471984, "step": 59990 }, { "epoch": 9.78711256117455, "grad_norm": 0.3892301023006439, "learning_rate": 3.025347823700171e-05, "loss": 0.0294, "num_input_tokens_seen": 129483088, "step": 59995 }, { "epoch": 9.787928221859707, "grad_norm": 0.3148396909236908, "learning_rate": 3.024999865960752e-05, "loss": 0.0531, "num_input_tokens_seen": 129492144, "step": 60000 }, { "epoch": 9.78874388254486, "grad_norm": 0.029636424034833908, "learning_rate": 3.0246518975815452e-05, "loss": 0.0774, "num_input_tokens_seen": 129502128, "step": 60005 }, { "epoch": 9.789559543230016, "grad_norm": 0.32943734526634216, "learning_rate": 3.024303918569602e-05, "loss": 0.1545, "num_input_tokens_seen": 129512848, "step": 60010 }, { "epoch": 9.790375203915172, "grad_norm": 1.3332926034927368, "learning_rate": 3.0239559289319745e-05, "loss": 0.0589, "num_input_tokens_seen": 129522992, "step": 60015 }, { "epoch": 9.791190864600326, "grad_norm": 0.18669810891151428, "learning_rate": 3.023607928675716e-05, "loss": 0.0612, "num_input_tokens_seen": 129533328, "step": 60020 }, { "epoch": 9.792006525285482, "grad_norm": 0.14402198791503906, "learning_rate": 3.023259917807878e-05, "loss": 0.0082, "num_input_tokens_seen": 129542768, "step": 60025 }, { "epoch": 9.792822185970635, "grad_norm": 1.0307245254516602, "learning_rate": 3.022911896335514e-05, "loss": 0.26, "num_input_tokens_seen": 129554448, "step": 60030 }, { "epoch": 9.793637846655791, "grad_norm": 0.5739598870277405, "learning_rate": 3.0225638642656773e-05, "loss": 0.0574, "num_input_tokens_seen": 129566736, "step": 60035 }, { "epoch": 9.794453507340947, "grad_norm": 0.42104482650756836, "learning_rate": 3.0222158216054207e-05, "loss": 0.0821, "num_input_tokens_seen": 129577392, "step": 60040 }, { "epoch": 9.7952691680261, "grad_norm": 2.058994770050049, "learning_rate": 3.021867768361798e-05, "loss": 0.0777, "num_input_tokens_seen": 129588080, "step": 60045 }, { "epoch": 9.796084828711257, "grad_norm": 0.11351980268955231, "learning_rate": 3.0215197045418632e-05, "loss": 0.0684, "num_input_tokens_seen": 129599312, "step": 60050 }, { "epoch": 9.79690048939641, "grad_norm": 1.8542333841323853, "learning_rate": 3.0211716301526695e-05, "loss": 0.1757, "num_input_tokens_seen": 129609488, "step": 60055 }, { "epoch": 9.797716150081566, "grad_norm": 1.3407922983169556, "learning_rate": 3.0208235452012718e-05, "loss": 0.1292, "num_input_tokens_seen": 129620944, "step": 60060 }, { "epoch": 9.798531810766722, "grad_norm": 0.2651269733905792, "learning_rate": 3.0204754496947247e-05, "loss": 0.063, "num_input_tokens_seen": 129631920, "step": 60065 }, { "epoch": 9.799347471451876, "grad_norm": 0.050888530910015106, "learning_rate": 3.020127343640081e-05, "loss": 0.0105, "num_input_tokens_seen": 129643696, "step": 60070 }, { "epoch": 9.800163132137031, "grad_norm": 1.1237926483154297, "learning_rate": 3.0197792270443982e-05, "loss": 0.0828, "num_input_tokens_seen": 129655184, "step": 60075 }, { "epoch": 9.800978792822185, "grad_norm": 0.12615379691123962, "learning_rate": 3.0194310999147295e-05, "loss": 0.0076, "num_input_tokens_seen": 129665008, "step": 60080 }, { "epoch": 9.801794453507341, "grad_norm": 0.03465782850980759, "learning_rate": 3.0190829622581314e-05, "loss": 0.0393, "num_input_tokens_seen": 129675344, "step": 60085 }, { "epoch": 9.802610114192497, "grad_norm": 1.517115592956543, "learning_rate": 3.0187348140816574e-05, "loss": 0.2039, "num_input_tokens_seen": 129687120, "step": 60090 }, { "epoch": 9.80342577487765, "grad_norm": 0.15959015488624573, "learning_rate": 3.018386655392365e-05, "loss": 0.0626, "num_input_tokens_seen": 129697936, "step": 60095 }, { "epoch": 9.804241435562806, "grad_norm": 0.6349523067474365, "learning_rate": 3.0180384861973093e-05, "loss": 0.0513, "num_input_tokens_seen": 129707312, "step": 60100 }, { "epoch": 9.80505709624796, "grad_norm": 0.749952495098114, "learning_rate": 3.0176903065035468e-05, "loss": 0.0745, "num_input_tokens_seen": 129718256, "step": 60105 }, { "epoch": 9.805872756933116, "grad_norm": 1.3134095668792725, "learning_rate": 3.017342116318133e-05, "loss": 0.0517, "num_input_tokens_seen": 129728336, "step": 60110 }, { "epoch": 9.80668841761827, "grad_norm": 1.052598237991333, "learning_rate": 3.0169939156481254e-05, "loss": 0.1111, "num_input_tokens_seen": 129738960, "step": 60115 }, { "epoch": 9.807504078303426, "grad_norm": 1.5185984373092651, "learning_rate": 3.0166457045005797e-05, "loss": 0.2001, "num_input_tokens_seen": 129748752, "step": 60120 }, { "epoch": 9.808319738988581, "grad_norm": 0.14952856302261353, "learning_rate": 3.0162974828825535e-05, "loss": 0.0704, "num_input_tokens_seen": 129758864, "step": 60125 }, { "epoch": 9.809135399673735, "grad_norm": 1.721084713935852, "learning_rate": 3.015949250801104e-05, "loss": 0.2356, "num_input_tokens_seen": 129768112, "step": 60130 }, { "epoch": 9.809951060358891, "grad_norm": 0.38727429509162903, "learning_rate": 3.0156010082632887e-05, "loss": 0.0266, "num_input_tokens_seen": 129779440, "step": 60135 }, { "epoch": 9.810766721044045, "grad_norm": 0.25083836913108826, "learning_rate": 3.0152527552761643e-05, "loss": 0.1108, "num_input_tokens_seen": 129790320, "step": 60140 }, { "epoch": 9.8115823817292, "grad_norm": 0.10611919313669205, "learning_rate": 3.0149044918467894e-05, "loss": 0.1116, "num_input_tokens_seen": 129799792, "step": 60145 }, { "epoch": 9.812398042414356, "grad_norm": 0.8952147364616394, "learning_rate": 3.014556217982222e-05, "loss": 0.2129, "num_input_tokens_seen": 129811472, "step": 60150 }, { "epoch": 9.81321370309951, "grad_norm": 0.06266672164201736, "learning_rate": 3.0142079336895195e-05, "loss": 0.0433, "num_input_tokens_seen": 129822512, "step": 60155 }, { "epoch": 9.814029363784666, "grad_norm": 0.5305111408233643, "learning_rate": 3.0138596389757412e-05, "loss": 0.0403, "num_input_tokens_seen": 129834608, "step": 60160 }, { "epoch": 9.81484502446982, "grad_norm": 1.0488375425338745, "learning_rate": 3.0135113338479452e-05, "loss": 0.1616, "num_input_tokens_seen": 129845008, "step": 60165 }, { "epoch": 9.815660685154976, "grad_norm": 1.8006457090377808, "learning_rate": 3.0131630183131908e-05, "loss": 0.1393, "num_input_tokens_seen": 129854928, "step": 60170 }, { "epoch": 9.81647634584013, "grad_norm": 0.5493976473808289, "learning_rate": 3.012814692378537e-05, "loss": 0.0731, "num_input_tokens_seen": 129865904, "step": 60175 }, { "epoch": 9.817292006525285, "grad_norm": 0.0905269905924797, "learning_rate": 3.012466356051043e-05, "loss": 0.1587, "num_input_tokens_seen": 129878288, "step": 60180 }, { "epoch": 9.818107667210441, "grad_norm": 0.034026894718408585, "learning_rate": 3.0121180093377682e-05, "loss": 0.0876, "num_input_tokens_seen": 129890512, "step": 60185 }, { "epoch": 9.818923327895595, "grad_norm": 0.3903413414955139, "learning_rate": 3.0117696522457722e-05, "loss": 0.1428, "num_input_tokens_seen": 129900560, "step": 60190 }, { "epoch": 9.81973898858075, "grad_norm": 0.046805400401353836, "learning_rate": 3.011421284782115e-05, "loss": 0.0129, "num_input_tokens_seen": 129912272, "step": 60195 }, { "epoch": 9.820554649265905, "grad_norm": 0.06415168941020966, "learning_rate": 3.011072906953856e-05, "loss": 0.1359, "num_input_tokens_seen": 129923120, "step": 60200 }, { "epoch": 9.82137030995106, "grad_norm": 0.08244102448225021, "learning_rate": 3.010724518768057e-05, "loss": 0.0669, "num_input_tokens_seen": 129935280, "step": 60205 }, { "epoch": 9.822185970636216, "grad_norm": 0.14824573695659637, "learning_rate": 3.0103761202317775e-05, "loss": 0.0516, "num_input_tokens_seen": 129946032, "step": 60210 }, { "epoch": 9.82300163132137, "grad_norm": 0.6348875761032104, "learning_rate": 3.010027711352078e-05, "loss": 0.0485, "num_input_tokens_seen": 129956784, "step": 60215 }, { "epoch": 9.823817292006526, "grad_norm": 0.5382069945335388, "learning_rate": 3.0096792921360205e-05, "loss": 0.0881, "num_input_tokens_seen": 129967504, "step": 60220 }, { "epoch": 9.82463295269168, "grad_norm": 0.5141259431838989, "learning_rate": 3.009330862590666e-05, "loss": 0.1107, "num_input_tokens_seen": 129977840, "step": 60225 }, { "epoch": 9.825448613376835, "grad_norm": 0.2305804193019867, "learning_rate": 3.0089824227230744e-05, "loss": 0.0624, "num_input_tokens_seen": 129988080, "step": 60230 }, { "epoch": 9.826264274061991, "grad_norm": 0.3073344826698303, "learning_rate": 3.008633972540309e-05, "loss": 0.0224, "num_input_tokens_seen": 129999728, "step": 60235 }, { "epoch": 9.827079934747145, "grad_norm": 0.5436553359031677, "learning_rate": 3.0082855120494302e-05, "loss": 0.1977, "num_input_tokens_seen": 130010352, "step": 60240 }, { "epoch": 9.8278955954323, "grad_norm": 0.4084247648715973, "learning_rate": 3.0079370412575014e-05, "loss": 0.1599, "num_input_tokens_seen": 130021776, "step": 60245 }, { "epoch": 9.828711256117455, "grad_norm": 0.14070282876491547, "learning_rate": 3.007588560171584e-05, "loss": 0.0304, "num_input_tokens_seen": 130032304, "step": 60250 }, { "epoch": 9.82952691680261, "grad_norm": 0.42393165826797485, "learning_rate": 3.00724006879874e-05, "loss": 0.1635, "num_input_tokens_seen": 130041840, "step": 60255 }, { "epoch": 9.830342577487766, "grad_norm": 0.12981180846691132, "learning_rate": 3.0068915671460334e-05, "loss": 0.017, "num_input_tokens_seen": 130051216, "step": 60260 }, { "epoch": 9.83115823817292, "grad_norm": 2.1854279041290283, "learning_rate": 3.0065430552205248e-05, "loss": 0.1936, "num_input_tokens_seen": 130061872, "step": 60265 }, { "epoch": 9.831973898858076, "grad_norm": 0.10664068162441254, "learning_rate": 3.0061945330292794e-05, "loss": 0.0197, "num_input_tokens_seen": 130072944, "step": 60270 }, { "epoch": 9.83278955954323, "grad_norm": 1.3863009214401245, "learning_rate": 3.0058460005793598e-05, "loss": 0.1253, "num_input_tokens_seen": 130082704, "step": 60275 }, { "epoch": 9.833605220228385, "grad_norm": 0.7514935731887817, "learning_rate": 3.0054974578778296e-05, "loss": 0.0753, "num_input_tokens_seen": 130093936, "step": 60280 }, { "epoch": 9.83442088091354, "grad_norm": 0.396573930978775, "learning_rate": 3.0051489049317516e-05, "loss": 0.129, "num_input_tokens_seen": 130103888, "step": 60285 }, { "epoch": 9.835236541598695, "grad_norm": 0.058727480471134186, "learning_rate": 3.0048003417481902e-05, "loss": 0.0195, "num_input_tokens_seen": 130114864, "step": 60290 }, { "epoch": 9.83605220228385, "grad_norm": 0.5963637828826904, "learning_rate": 3.00445176833421e-05, "loss": 0.1446, "num_input_tokens_seen": 130125136, "step": 60295 }, { "epoch": 9.836867862969005, "grad_norm": 0.6294246315956116, "learning_rate": 3.0041031846968743e-05, "loss": 0.0619, "num_input_tokens_seen": 130136240, "step": 60300 }, { "epoch": 9.83768352365416, "grad_norm": 0.11424869298934937, "learning_rate": 3.003754590843248e-05, "loss": 0.1779, "num_input_tokens_seen": 130146864, "step": 60305 }, { "epoch": 9.838499184339314, "grad_norm": 0.1887795478105545, "learning_rate": 3.003405986780396e-05, "loss": 0.0298, "num_input_tokens_seen": 130156560, "step": 60310 }, { "epoch": 9.83931484502447, "grad_norm": 0.05720695108175278, "learning_rate": 3.0030573725153833e-05, "loss": 0.2215, "num_input_tokens_seen": 130167664, "step": 60315 }, { "epoch": 9.840130505709626, "grad_norm": 0.054427921772003174, "learning_rate": 3.0027087480552745e-05, "loss": 0.1452, "num_input_tokens_seen": 130178992, "step": 60320 }, { "epoch": 9.84094616639478, "grad_norm": 0.03915392979979515, "learning_rate": 3.0023601134071354e-05, "loss": 0.1161, "num_input_tokens_seen": 130188944, "step": 60325 }, { "epoch": 9.841761827079935, "grad_norm": 0.46037915349006653, "learning_rate": 3.002011468578031e-05, "loss": 0.1178, "num_input_tokens_seen": 130199408, "step": 60330 }, { "epoch": 9.84257748776509, "grad_norm": 0.8380091190338135, "learning_rate": 3.0016628135750274e-05, "loss": 0.0593, "num_input_tokens_seen": 130211152, "step": 60335 }, { "epoch": 9.843393148450245, "grad_norm": 0.2544274628162384, "learning_rate": 3.001314148405191e-05, "loss": 0.0855, "num_input_tokens_seen": 130222192, "step": 60340 }, { "epoch": 9.844208809135399, "grad_norm": 0.49057450890541077, "learning_rate": 3.0009654730755865e-05, "loss": 0.0616, "num_input_tokens_seen": 130232912, "step": 60345 }, { "epoch": 9.845024469820554, "grad_norm": 1.0422734022140503, "learning_rate": 3.0006167875932817e-05, "loss": 0.0559, "num_input_tokens_seen": 130242704, "step": 60350 }, { "epoch": 9.84584013050571, "grad_norm": 0.6630984544754028, "learning_rate": 3.0002680919653424e-05, "loss": 0.0589, "num_input_tokens_seen": 130254224, "step": 60355 }, { "epoch": 9.846655791190864, "grad_norm": 1.8470293283462524, "learning_rate": 2.9999193861988357e-05, "loss": 0.1937, "num_input_tokens_seen": 130265392, "step": 60360 }, { "epoch": 9.84747145187602, "grad_norm": 0.23542512953281403, "learning_rate": 2.9995706703008287e-05, "loss": 0.0228, "num_input_tokens_seen": 130276048, "step": 60365 }, { "epoch": 9.848287112561174, "grad_norm": 0.06097601354122162, "learning_rate": 2.999221944278388e-05, "loss": 0.0331, "num_input_tokens_seen": 130287472, "step": 60370 }, { "epoch": 9.84910277324633, "grad_norm": 1.0528311729431152, "learning_rate": 2.9988732081385813e-05, "loss": 0.0353, "num_input_tokens_seen": 130298832, "step": 60375 }, { "epoch": 9.849918433931485, "grad_norm": 0.3681977093219757, "learning_rate": 2.9985244618884768e-05, "loss": 0.0122, "num_input_tokens_seen": 130308432, "step": 60380 }, { "epoch": 9.850734094616639, "grad_norm": 2.4242472648620605, "learning_rate": 2.99817570553514e-05, "loss": 0.1401, "num_input_tokens_seen": 130318544, "step": 60385 }, { "epoch": 9.851549755301795, "grad_norm": 0.20507262647151947, "learning_rate": 2.9978269390856417e-05, "loss": 0.2566, "num_input_tokens_seen": 130328176, "step": 60390 }, { "epoch": 9.852365415986949, "grad_norm": 2.270460844039917, "learning_rate": 2.9974781625470495e-05, "loss": 0.258, "num_input_tokens_seen": 130338896, "step": 60395 }, { "epoch": 9.853181076672104, "grad_norm": 0.023635461926460266, "learning_rate": 2.99712937592643e-05, "loss": 0.0642, "num_input_tokens_seen": 130350160, "step": 60400 }, { "epoch": 9.85399673735726, "grad_norm": 0.12621398270130157, "learning_rate": 2.996780579230854e-05, "loss": 0.0491, "num_input_tokens_seen": 130360976, "step": 60405 }, { "epoch": 9.854812398042414, "grad_norm": 0.999758780002594, "learning_rate": 2.996431772467389e-05, "loss": 0.0243, "num_input_tokens_seen": 130371184, "step": 60410 }, { "epoch": 9.85562805872757, "grad_norm": 0.5795203447341919, "learning_rate": 2.9960829556431046e-05, "loss": 0.027, "num_input_tokens_seen": 130381168, "step": 60415 }, { "epoch": 9.856443719412724, "grad_norm": 1.6753698587417603, "learning_rate": 2.99573412876507e-05, "loss": 0.0996, "num_input_tokens_seen": 130390544, "step": 60420 }, { "epoch": 9.85725938009788, "grad_norm": 0.061211131513118744, "learning_rate": 2.9953852918403537e-05, "loss": 0.0193, "num_input_tokens_seen": 130401232, "step": 60425 }, { "epoch": 9.858075040783035, "grad_norm": 0.05102735385298729, "learning_rate": 2.9950364448760266e-05, "loss": 0.0181, "num_input_tokens_seen": 130412560, "step": 60430 }, { "epoch": 9.858890701468189, "grad_norm": 0.053870681673288345, "learning_rate": 2.9946875878791575e-05, "loss": 0.1127, "num_input_tokens_seen": 130423472, "step": 60435 }, { "epoch": 9.859706362153345, "grad_norm": 1.9579795598983765, "learning_rate": 2.9943387208568168e-05, "loss": 0.0609, "num_input_tokens_seen": 130433008, "step": 60440 }, { "epoch": 9.860522022838499, "grad_norm": 1.3706754446029663, "learning_rate": 2.993989843816075e-05, "loss": 0.1365, "num_input_tokens_seen": 130445776, "step": 60445 }, { "epoch": 9.861337683523654, "grad_norm": 1.77435302734375, "learning_rate": 2.993640956764003e-05, "loss": 0.1745, "num_input_tokens_seen": 130456720, "step": 60450 }, { "epoch": 9.86215334420881, "grad_norm": 0.0989580899477005, "learning_rate": 2.9932920597076697e-05, "loss": 0.0495, "num_input_tokens_seen": 130468176, "step": 60455 }, { "epoch": 9.862969004893964, "grad_norm": 0.04298507422208786, "learning_rate": 2.992943152654148e-05, "loss": 0.0402, "num_input_tokens_seen": 130479344, "step": 60460 }, { "epoch": 9.86378466557912, "grad_norm": 0.100272998213768, "learning_rate": 2.9925942356105074e-05, "loss": 0.0405, "num_input_tokens_seen": 130490672, "step": 60465 }, { "epoch": 9.864600326264274, "grad_norm": 0.058114998042583466, "learning_rate": 2.99224530858382e-05, "loss": 0.0235, "num_input_tokens_seen": 130502160, "step": 60470 }, { "epoch": 9.86541598694943, "grad_norm": 0.04581095278263092, "learning_rate": 2.9918963715811567e-05, "loss": 0.0104, "num_input_tokens_seen": 130514160, "step": 60475 }, { "epoch": 9.866231647634583, "grad_norm": 0.029855381697416306, "learning_rate": 2.9915474246095886e-05, "loss": 0.0951, "num_input_tokens_seen": 130524336, "step": 60480 }, { "epoch": 9.867047308319739, "grad_norm": 0.04858102276921272, "learning_rate": 2.9911984676761893e-05, "loss": 0.1284, "num_input_tokens_seen": 130535440, "step": 60485 }, { "epoch": 9.867862969004895, "grad_norm": 0.050899624824523926, "learning_rate": 2.99084950078803e-05, "loss": 0.1225, "num_input_tokens_seen": 130546832, "step": 60490 }, { "epoch": 9.868678629690049, "grad_norm": 0.4578625559806824, "learning_rate": 2.9905005239521828e-05, "loss": 0.0368, "num_input_tokens_seen": 130557168, "step": 60495 }, { "epoch": 9.869494290375204, "grad_norm": 0.9933855533599854, "learning_rate": 2.99015153717572e-05, "loss": 0.0627, "num_input_tokens_seen": 130567664, "step": 60500 }, { "epoch": 9.870309951060358, "grad_norm": 1.453950047492981, "learning_rate": 2.989802540465715e-05, "loss": 0.0854, "num_input_tokens_seen": 130577520, "step": 60505 }, { "epoch": 9.871125611745514, "grad_norm": 0.1943298578262329, "learning_rate": 2.9894535338292395e-05, "loss": 0.0414, "num_input_tokens_seen": 130589328, "step": 60510 }, { "epoch": 9.87194127243067, "grad_norm": 3.0246024131774902, "learning_rate": 2.989104517273368e-05, "loss": 0.0495, "num_input_tokens_seen": 130599760, "step": 60515 }, { "epoch": 9.872756933115824, "grad_norm": 0.7955594062805176, "learning_rate": 2.9887554908051723e-05, "loss": 0.0555, "num_input_tokens_seen": 130610576, "step": 60520 }, { "epoch": 9.87357259380098, "grad_norm": 1.453038215637207, "learning_rate": 2.988406454431727e-05, "loss": 0.1211, "num_input_tokens_seen": 130621168, "step": 60525 }, { "epoch": 9.874388254486133, "grad_norm": 0.013636329211294651, "learning_rate": 2.9880574081601042e-05, "loss": 0.0348, "num_input_tokens_seen": 130630736, "step": 60530 }, { "epoch": 9.875203915171289, "grad_norm": 0.7725317478179932, "learning_rate": 2.9877083519973803e-05, "loss": 0.1007, "num_input_tokens_seen": 130641840, "step": 60535 }, { "epoch": 9.876019575856443, "grad_norm": 0.025486325845122337, "learning_rate": 2.9873592859506273e-05, "loss": 0.0331, "num_input_tokens_seen": 130652816, "step": 60540 }, { "epoch": 9.876835236541599, "grad_norm": 0.13727064430713654, "learning_rate": 2.9870102100269205e-05, "loss": 0.0727, "num_input_tokens_seen": 130663248, "step": 60545 }, { "epoch": 9.877650897226754, "grad_norm": 0.06278403848409653, "learning_rate": 2.986661124233333e-05, "loss": 0.0459, "num_input_tokens_seen": 130675184, "step": 60550 }, { "epoch": 9.878466557911908, "grad_norm": 1.6159592866897583, "learning_rate": 2.9863120285769414e-05, "loss": 0.0594, "num_input_tokens_seen": 130686576, "step": 60555 }, { "epoch": 9.879282218597064, "grad_norm": 0.21248702704906464, "learning_rate": 2.9859629230648194e-05, "loss": 0.0641, "num_input_tokens_seen": 130696240, "step": 60560 }, { "epoch": 9.880097879282218, "grad_norm": 1.123465657234192, "learning_rate": 2.9856138077040426e-05, "loss": 0.0987, "num_input_tokens_seen": 130706064, "step": 60565 }, { "epoch": 9.880913539967374, "grad_norm": 0.27670541405677795, "learning_rate": 2.9852646825016855e-05, "loss": 0.0287, "num_input_tokens_seen": 130716720, "step": 60570 }, { "epoch": 9.88172920065253, "grad_norm": 0.33948516845703125, "learning_rate": 2.984915547464824e-05, "loss": 0.1267, "num_input_tokens_seen": 130726288, "step": 60575 }, { "epoch": 9.882544861337683, "grad_norm": 0.06810726970434189, "learning_rate": 2.9845664026005337e-05, "loss": 0.1171, "num_input_tokens_seen": 130736368, "step": 60580 }, { "epoch": 9.883360522022839, "grad_norm": 0.1077975258231163, "learning_rate": 2.9842172479158902e-05, "loss": 0.202, "num_input_tokens_seen": 130746160, "step": 60585 }, { "epoch": 9.884176182707993, "grad_norm": 0.09114788472652435, "learning_rate": 2.9838680834179705e-05, "loss": 0.151, "num_input_tokens_seen": 130755728, "step": 60590 }, { "epoch": 9.884991843393149, "grad_norm": 0.08429092168807983, "learning_rate": 2.9835189091138504e-05, "loss": 0.0272, "num_input_tokens_seen": 130767088, "step": 60595 }, { "epoch": 9.885807504078304, "grad_norm": 0.06469304114580154, "learning_rate": 2.983169725010606e-05, "loss": 0.0541, "num_input_tokens_seen": 130776944, "step": 60600 }, { "epoch": 9.886623164763458, "grad_norm": 0.6927967667579651, "learning_rate": 2.982820531115314e-05, "loss": 0.0539, "num_input_tokens_seen": 130786480, "step": 60605 }, { "epoch": 9.887438825448614, "grad_norm": 0.01592182368040085, "learning_rate": 2.9824713274350517e-05, "loss": 0.0202, "num_input_tokens_seen": 130797968, "step": 60610 }, { "epoch": 9.888254486133768, "grad_norm": 3.8268520832061768, "learning_rate": 2.9821221139768957e-05, "loss": 0.2075, "num_input_tokens_seen": 130808272, "step": 60615 }, { "epoch": 9.889070146818923, "grad_norm": 0.05033247917890549, "learning_rate": 2.9817728907479235e-05, "loss": 0.1775, "num_input_tokens_seen": 130817488, "step": 60620 }, { "epoch": 9.88988580750408, "grad_norm": 0.032330628484487534, "learning_rate": 2.981423657755213e-05, "loss": 0.0085, "num_input_tokens_seen": 130828432, "step": 60625 }, { "epoch": 9.890701468189233, "grad_norm": 0.0906505212187767, "learning_rate": 2.9810744150058405e-05, "loss": 0.0884, "num_input_tokens_seen": 130838416, "step": 60630 }, { "epoch": 9.891517128874389, "grad_norm": 1.57076096534729, "learning_rate": 2.9807251625068856e-05, "loss": 0.0362, "num_input_tokens_seen": 130847920, "step": 60635 }, { "epoch": 9.892332789559543, "grad_norm": 1.4880905151367188, "learning_rate": 2.9803759002654248e-05, "loss": 0.2471, "num_input_tokens_seen": 130859184, "step": 60640 }, { "epoch": 9.893148450244698, "grad_norm": 0.04133743420243263, "learning_rate": 2.980026628288537e-05, "loss": 0.0737, "num_input_tokens_seen": 130869808, "step": 60645 }, { "epoch": 9.893964110929852, "grad_norm": 0.39152225852012634, "learning_rate": 2.9796773465833005e-05, "loss": 0.064, "num_input_tokens_seen": 130880912, "step": 60650 }, { "epoch": 9.894779771615008, "grad_norm": 3.299602508544922, "learning_rate": 2.9793280551567943e-05, "loss": 0.088, "num_input_tokens_seen": 130892336, "step": 60655 }, { "epoch": 9.895595432300164, "grad_norm": 0.28297945857048035, "learning_rate": 2.978978754016097e-05, "loss": 0.0835, "num_input_tokens_seen": 130900752, "step": 60660 }, { "epoch": 9.896411092985318, "grad_norm": 0.07432521134614944, "learning_rate": 2.9786294431682878e-05, "loss": 0.1616, "num_input_tokens_seen": 130911600, "step": 60665 }, { "epoch": 9.897226753670473, "grad_norm": 0.1112067699432373, "learning_rate": 2.9782801226204453e-05, "loss": 0.028, "num_input_tokens_seen": 130922640, "step": 60670 }, { "epoch": 9.898042414355627, "grad_norm": 0.8522239327430725, "learning_rate": 2.977930792379649e-05, "loss": 0.0856, "num_input_tokens_seen": 130933136, "step": 60675 }, { "epoch": 9.898858075040783, "grad_norm": 0.18558253347873688, "learning_rate": 2.97758145245298e-05, "loss": 0.0877, "num_input_tokens_seen": 130944336, "step": 60680 }, { "epoch": 9.899673735725939, "grad_norm": 0.2246699184179306, "learning_rate": 2.9772321028475164e-05, "loss": 0.1031, "num_input_tokens_seen": 130952848, "step": 60685 }, { "epoch": 9.900489396411093, "grad_norm": 0.07298734784126282, "learning_rate": 2.976882743570339e-05, "loss": 0.1067, "num_input_tokens_seen": 130963536, "step": 60690 }, { "epoch": 9.901305057096248, "grad_norm": 0.3242265582084656, "learning_rate": 2.9765333746285274e-05, "loss": 0.2367, "num_input_tokens_seen": 130973840, "step": 60695 }, { "epoch": 9.902120717781402, "grad_norm": 0.054470911622047424, "learning_rate": 2.9761839960291627e-05, "loss": 0.0517, "num_input_tokens_seen": 130984304, "step": 60700 }, { "epoch": 9.902936378466558, "grad_norm": 0.2910788953304291, "learning_rate": 2.9758346077793253e-05, "loss": 0.0954, "num_input_tokens_seen": 130995504, "step": 60705 }, { "epoch": 9.903752039151712, "grad_norm": 3.014040231704712, "learning_rate": 2.9754852098860956e-05, "loss": 0.156, "num_input_tokens_seen": 131005968, "step": 60710 }, { "epoch": 9.904567699836868, "grad_norm": 0.09345629811286926, "learning_rate": 2.9751358023565557e-05, "loss": 0.0209, "num_input_tokens_seen": 131018160, "step": 60715 }, { "epoch": 9.905383360522023, "grad_norm": 0.1875731647014618, "learning_rate": 2.9747863851977853e-05, "loss": 0.1116, "num_input_tokens_seen": 131029008, "step": 60720 }, { "epoch": 9.906199021207177, "grad_norm": 1.977130651473999, "learning_rate": 2.9744369584168667e-05, "loss": 0.1637, "num_input_tokens_seen": 131038960, "step": 60725 }, { "epoch": 9.907014681892333, "grad_norm": 0.0594903789460659, "learning_rate": 2.974087522020882e-05, "loss": 0.0383, "num_input_tokens_seen": 131050160, "step": 60730 }, { "epoch": 9.907830342577487, "grad_norm": 0.9823998808860779, "learning_rate": 2.9737380760169115e-05, "loss": 0.0539, "num_input_tokens_seen": 131060048, "step": 60735 }, { "epoch": 9.908646003262643, "grad_norm": 2.46594500541687, "learning_rate": 2.973388620412038e-05, "loss": 0.1628, "num_input_tokens_seen": 131069360, "step": 60740 }, { "epoch": 9.909461663947798, "grad_norm": 0.31959107518196106, "learning_rate": 2.9730391552133434e-05, "loss": 0.1783, "num_input_tokens_seen": 131079888, "step": 60745 }, { "epoch": 9.910277324632952, "grad_norm": 0.7865742444992065, "learning_rate": 2.9726896804279102e-05, "loss": 0.0374, "num_input_tokens_seen": 131090512, "step": 60750 }, { "epoch": 9.911092985318108, "grad_norm": 0.13632045686244965, "learning_rate": 2.9723401960628217e-05, "loss": 0.1581, "num_input_tokens_seen": 131101680, "step": 60755 }, { "epoch": 9.911908646003262, "grad_norm": 0.34421589970588684, "learning_rate": 2.971990702125159e-05, "loss": 0.1266, "num_input_tokens_seen": 131112336, "step": 60760 }, { "epoch": 9.912724306688418, "grad_norm": 0.326416015625, "learning_rate": 2.9716411986220067e-05, "loss": 0.0441, "num_input_tokens_seen": 131123088, "step": 60765 }, { "epoch": 9.913539967373573, "grad_norm": 1.5278165340423584, "learning_rate": 2.9712916855604465e-05, "loss": 0.1179, "num_input_tokens_seen": 131132368, "step": 60770 }, { "epoch": 9.914355628058727, "grad_norm": 0.15668608248233795, "learning_rate": 2.970942162947563e-05, "loss": 0.065, "num_input_tokens_seen": 131143600, "step": 60775 }, { "epoch": 9.915171288743883, "grad_norm": 0.12191300094127655, "learning_rate": 2.9705926307904392e-05, "loss": 0.0169, "num_input_tokens_seen": 131153872, "step": 60780 }, { "epoch": 9.915986949429037, "grad_norm": 0.1475633680820465, "learning_rate": 2.9702430890961584e-05, "loss": 0.0845, "num_input_tokens_seen": 131164848, "step": 60785 }, { "epoch": 9.916802610114193, "grad_norm": 1.1376827955245972, "learning_rate": 2.969893537871805e-05, "loss": 0.2199, "num_input_tokens_seen": 131175280, "step": 60790 }, { "epoch": 9.917618270799348, "grad_norm": 1.2322769165039062, "learning_rate": 2.969543977124463e-05, "loss": 0.074, "num_input_tokens_seen": 131185776, "step": 60795 }, { "epoch": 9.918433931484502, "grad_norm": 2.592498302459717, "learning_rate": 2.969194406861216e-05, "loss": 0.1252, "num_input_tokens_seen": 131196944, "step": 60800 }, { "epoch": 9.919249592169658, "grad_norm": 0.03166444972157478, "learning_rate": 2.9688448270891494e-05, "loss": 0.0274, "num_input_tokens_seen": 131208752, "step": 60805 }, { "epoch": 9.920065252854812, "grad_norm": 0.24825498461723328, "learning_rate": 2.968495237815348e-05, "loss": 0.0938, "num_input_tokens_seen": 131217360, "step": 60810 }, { "epoch": 9.920880913539968, "grad_norm": 0.06941919773817062, "learning_rate": 2.9681456390468965e-05, "loss": 0.0177, "num_input_tokens_seen": 131228304, "step": 60815 }, { "epoch": 9.921696574225122, "grad_norm": 0.03179261460900307, "learning_rate": 2.9677960307908792e-05, "loss": 0.0637, "num_input_tokens_seen": 131239568, "step": 60820 }, { "epoch": 9.922512234910277, "grad_norm": 1.654510259628296, "learning_rate": 2.9674464130543823e-05, "loss": 0.2077, "num_input_tokens_seen": 131250576, "step": 60825 }, { "epoch": 9.923327895595433, "grad_norm": 0.22366878390312195, "learning_rate": 2.9670967858444904e-05, "loss": 0.0837, "num_input_tokens_seen": 131261616, "step": 60830 }, { "epoch": 9.924143556280587, "grad_norm": 0.11311566829681396, "learning_rate": 2.9667471491682898e-05, "loss": 0.1597, "num_input_tokens_seen": 131272400, "step": 60835 }, { "epoch": 9.924959216965743, "grad_norm": 0.6695348024368286, "learning_rate": 2.9663975030328663e-05, "loss": 0.214, "num_input_tokens_seen": 131283280, "step": 60840 }, { "epoch": 9.925774877650896, "grad_norm": 0.23522666096687317, "learning_rate": 2.9660478474453052e-05, "loss": 0.0412, "num_input_tokens_seen": 131293808, "step": 60845 }, { "epoch": 9.926590538336052, "grad_norm": 0.04276357218623161, "learning_rate": 2.9656981824126932e-05, "loss": 0.0231, "num_input_tokens_seen": 131304016, "step": 60850 }, { "epoch": 9.927406199021208, "grad_norm": 0.1759343296289444, "learning_rate": 2.9653485079421172e-05, "loss": 0.167, "num_input_tokens_seen": 131314032, "step": 60855 }, { "epoch": 9.928221859706362, "grad_norm": 1.1589469909667969, "learning_rate": 2.9649988240406636e-05, "loss": 0.1347, "num_input_tokens_seen": 131324112, "step": 60860 }, { "epoch": 9.929037520391518, "grad_norm": 0.2302282750606537, "learning_rate": 2.9646491307154184e-05, "loss": 0.1298, "num_input_tokens_seen": 131334352, "step": 60865 }, { "epoch": 9.929853181076671, "grad_norm": 0.6725650429725647, "learning_rate": 2.964299427973469e-05, "loss": 0.3111, "num_input_tokens_seen": 131344656, "step": 60870 }, { "epoch": 9.930668841761827, "grad_norm": 0.3016737103462219, "learning_rate": 2.9639497158219037e-05, "loss": 0.0604, "num_input_tokens_seen": 131355984, "step": 60875 }, { "epoch": 9.931484502446983, "grad_norm": 0.8914392590522766, "learning_rate": 2.9635999942678084e-05, "loss": 0.0412, "num_input_tokens_seen": 131367056, "step": 60880 }, { "epoch": 9.932300163132137, "grad_norm": 1.1436786651611328, "learning_rate": 2.963250263318271e-05, "loss": 0.1015, "num_input_tokens_seen": 131378192, "step": 60885 }, { "epoch": 9.933115823817293, "grad_norm": 0.0438789464533329, "learning_rate": 2.9629005229803787e-05, "loss": 0.2196, "num_input_tokens_seen": 131389040, "step": 60890 }, { "epoch": 9.933931484502446, "grad_norm": 0.6389370560646057, "learning_rate": 2.9625507732612205e-05, "loss": 0.0471, "num_input_tokens_seen": 131399664, "step": 60895 }, { "epoch": 9.934747145187602, "grad_norm": 0.8489208221435547, "learning_rate": 2.9622010141678847e-05, "loss": 0.0951, "num_input_tokens_seen": 131409680, "step": 60900 }, { "epoch": 9.935562805872756, "grad_norm": 1.5798951387405396, "learning_rate": 2.9618512457074578e-05, "loss": 0.0981, "num_input_tokens_seen": 131420240, "step": 60905 }, { "epoch": 9.936378466557912, "grad_norm": 1.577789545059204, "learning_rate": 2.9615014678870302e-05, "loss": 0.0716, "num_input_tokens_seen": 131429488, "step": 60910 }, { "epoch": 9.937194127243067, "grad_norm": 0.12106257677078247, "learning_rate": 2.9611516807136896e-05, "loss": 0.1217, "num_input_tokens_seen": 131439792, "step": 60915 }, { "epoch": 9.938009787928221, "grad_norm": 0.029495086520910263, "learning_rate": 2.9608018841945255e-05, "loss": 0.0835, "num_input_tokens_seen": 131448976, "step": 60920 }, { "epoch": 9.938825448613377, "grad_norm": 0.0932675376534462, "learning_rate": 2.9604520783366267e-05, "loss": 0.1149, "num_input_tokens_seen": 131459088, "step": 60925 }, { "epoch": 9.939641109298531, "grad_norm": 0.30725598335266113, "learning_rate": 2.9601022631470825e-05, "loss": 0.0706, "num_input_tokens_seen": 131470096, "step": 60930 }, { "epoch": 9.940456769983687, "grad_norm": 1.6540718078613281, "learning_rate": 2.959752438632982e-05, "loss": 0.1735, "num_input_tokens_seen": 131479376, "step": 60935 }, { "epoch": 9.941272430668842, "grad_norm": 0.36876991391181946, "learning_rate": 2.959402604801415e-05, "loss": 0.098, "num_input_tokens_seen": 131491792, "step": 60940 }, { "epoch": 9.942088091353996, "grad_norm": 0.04926802217960358, "learning_rate": 2.959052761659472e-05, "loss": 0.0059, "num_input_tokens_seen": 131502512, "step": 60945 }, { "epoch": 9.942903752039152, "grad_norm": 1.1399816274642944, "learning_rate": 2.958702909214242e-05, "loss": 0.2122, "num_input_tokens_seen": 131514224, "step": 60950 }, { "epoch": 9.943719412724306, "grad_norm": 0.627850353717804, "learning_rate": 2.9583530474728154e-05, "loss": 0.103, "num_input_tokens_seen": 131525840, "step": 60955 }, { "epoch": 9.944535073409462, "grad_norm": 0.0640321895480156, "learning_rate": 2.9580031764422833e-05, "loss": 0.0281, "num_input_tokens_seen": 131536208, "step": 60960 }, { "epoch": 9.945350734094617, "grad_norm": 2.5661680698394775, "learning_rate": 2.9576532961297354e-05, "loss": 0.1017, "num_input_tokens_seen": 131547344, "step": 60965 }, { "epoch": 9.946166394779771, "grad_norm": 3.065364122390747, "learning_rate": 2.9573034065422632e-05, "loss": 0.1583, "num_input_tokens_seen": 131558544, "step": 60970 }, { "epoch": 9.946982055464927, "grad_norm": 0.12737613916397095, "learning_rate": 2.9569535076869576e-05, "loss": 0.1497, "num_input_tokens_seen": 131569136, "step": 60975 }, { "epoch": 9.947797716150081, "grad_norm": 0.3452402353286743, "learning_rate": 2.956603599570909e-05, "loss": 0.1234, "num_input_tokens_seen": 131580848, "step": 60980 }, { "epoch": 9.948613376835237, "grad_norm": 0.04192456603050232, "learning_rate": 2.9562536822012094e-05, "loss": 0.0107, "num_input_tokens_seen": 131591664, "step": 60985 }, { "epoch": 9.949429037520392, "grad_norm": 0.26321175694465637, "learning_rate": 2.9559037555849495e-05, "loss": 0.0913, "num_input_tokens_seen": 131602032, "step": 60990 }, { "epoch": 9.950244698205546, "grad_norm": 1.6223965883255005, "learning_rate": 2.955553819729222e-05, "loss": 0.1385, "num_input_tokens_seen": 131613520, "step": 60995 }, { "epoch": 9.951060358890702, "grad_norm": 0.33561745285987854, "learning_rate": 2.9552038746411188e-05, "loss": 0.0576, "num_input_tokens_seen": 131624624, "step": 61000 }, { "epoch": 9.951876019575856, "grad_norm": 0.15405374765396118, "learning_rate": 2.9548539203277316e-05, "loss": 0.0586, "num_input_tokens_seen": 131634416, "step": 61005 }, { "epoch": 9.952691680261012, "grad_norm": 1.2168450355529785, "learning_rate": 2.9545039567961523e-05, "loss": 0.0302, "num_input_tokens_seen": 131645232, "step": 61010 }, { "epoch": 9.953507340946166, "grad_norm": 0.8843408823013306, "learning_rate": 2.954153984053474e-05, "loss": 0.2101, "num_input_tokens_seen": 131655824, "step": 61015 }, { "epoch": 9.954323001631321, "grad_norm": 1.2077999114990234, "learning_rate": 2.9538040021067888e-05, "loss": 0.0488, "num_input_tokens_seen": 131666640, "step": 61020 }, { "epoch": 9.955138662316477, "grad_norm": 0.0990581139922142, "learning_rate": 2.9534540109631897e-05, "loss": 0.0532, "num_input_tokens_seen": 131675120, "step": 61025 }, { "epoch": 9.955954323001631, "grad_norm": 0.12572148442268372, "learning_rate": 2.9531040106297704e-05, "loss": 0.1013, "num_input_tokens_seen": 131686096, "step": 61030 }, { "epoch": 9.956769983686787, "grad_norm": 0.14047643542289734, "learning_rate": 2.9527540011136234e-05, "loss": 0.025, "num_input_tokens_seen": 131697968, "step": 61035 }, { "epoch": 9.95758564437194, "grad_norm": 0.07506589591503143, "learning_rate": 2.952403982421842e-05, "loss": 0.0947, "num_input_tokens_seen": 131709168, "step": 61040 }, { "epoch": 9.958401305057096, "grad_norm": 0.13633227348327637, "learning_rate": 2.95205395456152e-05, "loss": 0.2081, "num_input_tokens_seen": 131720656, "step": 61045 }, { "epoch": 9.959216965742252, "grad_norm": 0.887191653251648, "learning_rate": 2.951703917539751e-05, "loss": 0.0458, "num_input_tokens_seen": 131731536, "step": 61050 }, { "epoch": 9.960032626427406, "grad_norm": 0.686028242111206, "learning_rate": 2.9513538713636295e-05, "loss": 0.0313, "num_input_tokens_seen": 131742928, "step": 61055 }, { "epoch": 9.960848287112562, "grad_norm": 1.2948780059814453, "learning_rate": 2.951003816040249e-05, "loss": 0.1484, "num_input_tokens_seen": 131752496, "step": 61060 }, { "epoch": 9.961663947797716, "grad_norm": 0.030570518225431442, "learning_rate": 2.950653751576704e-05, "loss": 0.0677, "num_input_tokens_seen": 131764048, "step": 61065 }, { "epoch": 9.962479608482871, "grad_norm": 1.2407166957855225, "learning_rate": 2.950303677980089e-05, "loss": 0.1085, "num_input_tokens_seen": 131774352, "step": 61070 }, { "epoch": 9.963295269168025, "grad_norm": 0.9998961687088013, "learning_rate": 2.949953595257499e-05, "loss": 0.0491, "num_input_tokens_seen": 131783696, "step": 61075 }, { "epoch": 9.964110929853181, "grad_norm": 0.050343047827482224, "learning_rate": 2.9496035034160286e-05, "loss": 0.0557, "num_input_tokens_seen": 131795696, "step": 61080 }, { "epoch": 9.964926590538337, "grad_norm": 1.0202317237854004, "learning_rate": 2.9492534024627726e-05, "loss": 0.0328, "num_input_tokens_seen": 131805616, "step": 61085 }, { "epoch": 9.96574225122349, "grad_norm": 0.8418501019477844, "learning_rate": 2.9489032924048265e-05, "loss": 0.187, "num_input_tokens_seen": 131815056, "step": 61090 }, { "epoch": 9.966557911908646, "grad_norm": 0.3414064943790436, "learning_rate": 2.9485531732492855e-05, "loss": 0.0266, "num_input_tokens_seen": 131825392, "step": 61095 }, { "epoch": 9.9673735725938, "grad_norm": 1.8587251901626587, "learning_rate": 2.9482030450032456e-05, "loss": 0.099, "num_input_tokens_seen": 131836144, "step": 61100 }, { "epoch": 9.968189233278956, "grad_norm": 0.3697127401828766, "learning_rate": 2.9478529076738026e-05, "loss": 0.0614, "num_input_tokens_seen": 131846672, "step": 61105 }, { "epoch": 9.969004893964112, "grad_norm": 1.584471344947815, "learning_rate": 2.9475027612680522e-05, "loss": 0.2988, "num_input_tokens_seen": 131858544, "step": 61110 }, { "epoch": 9.969820554649266, "grad_norm": 0.0444958359003067, "learning_rate": 2.9471526057930903e-05, "loss": 0.0979, "num_input_tokens_seen": 131869296, "step": 61115 }, { "epoch": 9.970636215334421, "grad_norm": 0.12549719214439392, "learning_rate": 2.9468024412560135e-05, "loss": 0.0534, "num_input_tokens_seen": 131879376, "step": 61120 }, { "epoch": 9.971451876019575, "grad_norm": 0.16902868449687958, "learning_rate": 2.946452267663919e-05, "loss": 0.035, "num_input_tokens_seen": 131890768, "step": 61125 }, { "epoch": 9.97226753670473, "grad_norm": 0.42217013239860535, "learning_rate": 2.9461020850239024e-05, "loss": 0.0447, "num_input_tokens_seen": 131901104, "step": 61130 }, { "epoch": 9.973083197389887, "grad_norm": 0.025150643661618233, "learning_rate": 2.9457518933430616e-05, "loss": 0.2144, "num_input_tokens_seen": 131912208, "step": 61135 }, { "epoch": 9.97389885807504, "grad_norm": 2.588229179382324, "learning_rate": 2.9454016926284928e-05, "loss": 0.2345, "num_input_tokens_seen": 131923472, "step": 61140 }, { "epoch": 9.974714518760196, "grad_norm": 2.164046049118042, "learning_rate": 2.9450514828872937e-05, "loss": 0.3054, "num_input_tokens_seen": 131934384, "step": 61145 }, { "epoch": 9.97553017944535, "grad_norm": 0.22427669167518616, "learning_rate": 2.944701264126562e-05, "loss": 0.1375, "num_input_tokens_seen": 131944784, "step": 61150 }, { "epoch": 9.976345840130506, "grad_norm": 1.0693938732147217, "learning_rate": 2.9443510363533945e-05, "loss": 0.1153, "num_input_tokens_seen": 131955088, "step": 61155 }, { "epoch": 9.977161500815662, "grad_norm": 0.043001946061849594, "learning_rate": 2.9440007995748902e-05, "loss": 0.129, "num_input_tokens_seen": 131966384, "step": 61160 }, { "epoch": 9.977977161500815, "grad_norm": 0.1470509022474289, "learning_rate": 2.9436505537981462e-05, "loss": 0.1522, "num_input_tokens_seen": 131977200, "step": 61165 }, { "epoch": 9.978792822185971, "grad_norm": 0.26246199011802673, "learning_rate": 2.943300299030261e-05, "loss": 0.1186, "num_input_tokens_seen": 131988848, "step": 61170 }, { "epoch": 9.979608482871125, "grad_norm": 1.7477426528930664, "learning_rate": 2.942950035278333e-05, "loss": 0.1108, "num_input_tokens_seen": 132000752, "step": 61175 }, { "epoch": 9.98042414355628, "grad_norm": 0.7843849062919617, "learning_rate": 2.94259976254946e-05, "loss": 0.0307, "num_input_tokens_seen": 132010544, "step": 61180 }, { "epoch": 9.981239804241435, "grad_norm": 0.2330109030008316, "learning_rate": 2.9422494808507417e-05, "loss": 0.1153, "num_input_tokens_seen": 132021552, "step": 61185 }, { "epoch": 9.98205546492659, "grad_norm": 0.08168958127498627, "learning_rate": 2.9418991901892767e-05, "loss": 0.1996, "num_input_tokens_seen": 132032752, "step": 61190 }, { "epoch": 9.982871125611746, "grad_norm": 0.06837616115808487, "learning_rate": 2.9415488905721645e-05, "loss": 0.0123, "num_input_tokens_seen": 132043536, "step": 61195 }, { "epoch": 9.9836867862969, "grad_norm": 0.9043221473693848, "learning_rate": 2.941198582006503e-05, "loss": 0.0773, "num_input_tokens_seen": 132053840, "step": 61200 }, { "epoch": 9.984502446982056, "grad_norm": 0.1481304168701172, "learning_rate": 2.9408482644993928e-05, "loss": 0.047, "num_input_tokens_seen": 132064400, "step": 61205 }, { "epoch": 9.98531810766721, "grad_norm": 0.07287105917930603, "learning_rate": 2.940497938057934e-05, "loss": 0.0958, "num_input_tokens_seen": 132073552, "step": 61210 }, { "epoch": 9.986133768352365, "grad_norm": 0.33521953225135803, "learning_rate": 2.940147602689225e-05, "loss": 0.0574, "num_input_tokens_seen": 132084400, "step": 61215 }, { "epoch": 9.986949429037521, "grad_norm": 0.5743001699447632, "learning_rate": 2.9397972584003663e-05, "loss": 0.0897, "num_input_tokens_seen": 132095856, "step": 61220 }, { "epoch": 9.987765089722675, "grad_norm": 0.15302208065986633, "learning_rate": 2.939446905198458e-05, "loss": 0.1037, "num_input_tokens_seen": 132108368, "step": 61225 }, { "epoch": 9.98858075040783, "grad_norm": 0.12350867688655853, "learning_rate": 2.9390965430906014e-05, "loss": 0.1755, "num_input_tokens_seen": 132120432, "step": 61230 }, { "epoch": 9.989396411092985, "grad_norm": 0.03491656109690666, "learning_rate": 2.938746172083896e-05, "loss": 0.1855, "num_input_tokens_seen": 132131568, "step": 61235 }, { "epoch": 9.99021207177814, "grad_norm": 0.2950908839702606, "learning_rate": 2.9383957921854428e-05, "loss": 0.1193, "num_input_tokens_seen": 132142992, "step": 61240 }, { "epoch": 9.991027732463294, "grad_norm": 0.019792506471276283, "learning_rate": 2.938045403402343e-05, "loss": 0.0196, "num_input_tokens_seen": 132153520, "step": 61245 }, { "epoch": 9.99184339314845, "grad_norm": 0.9346626996994019, "learning_rate": 2.9376950057416975e-05, "loss": 0.0625, "num_input_tokens_seen": 132164368, "step": 61250 }, { "epoch": 9.992659053833606, "grad_norm": 0.05961432307958603, "learning_rate": 2.9373445992106068e-05, "loss": 0.0108, "num_input_tokens_seen": 132174992, "step": 61255 }, { "epoch": 9.99347471451876, "grad_norm": 1.8310309648513794, "learning_rate": 2.9369941838161734e-05, "loss": 0.0708, "num_input_tokens_seen": 132186320, "step": 61260 }, { "epoch": 9.994290375203915, "grad_norm": 1.6424485445022583, "learning_rate": 2.936643759565499e-05, "loss": 0.0766, "num_input_tokens_seen": 132198000, "step": 61265 }, { "epoch": 9.99510603588907, "grad_norm": 0.06705611944198608, "learning_rate": 2.936293326465684e-05, "loss": 0.1437, "num_input_tokens_seen": 132209840, "step": 61270 }, { "epoch": 9.995921696574225, "grad_norm": 0.06083279103040695, "learning_rate": 2.9359428845238317e-05, "loss": 0.0307, "num_input_tokens_seen": 132220848, "step": 61275 }, { "epoch": 9.99673735725938, "grad_norm": 1.9709573984146118, "learning_rate": 2.9355924337470435e-05, "loss": 0.1505, "num_input_tokens_seen": 132230384, "step": 61280 }, { "epoch": 9.997553017944535, "grad_norm": 0.038385696709156036, "learning_rate": 2.9352419741424224e-05, "loss": 0.1611, "num_input_tokens_seen": 132241200, "step": 61285 }, { "epoch": 9.99836867862969, "grad_norm": 2.359654188156128, "learning_rate": 2.9348915057170706e-05, "loss": 0.2621, "num_input_tokens_seen": 132251664, "step": 61290 }, { "epoch": 9.999184339314844, "grad_norm": 0.04212046414613724, "learning_rate": 2.9345410284780906e-05, "loss": 0.0854, "num_input_tokens_seen": 132262576, "step": 61295 }, { "epoch": 10.0, "grad_norm": 0.8471315503120422, "learning_rate": 2.934190542432585e-05, "loss": 0.1154, "num_input_tokens_seen": 132272272, "step": 61300 }, { "epoch": 10.0, "eval_loss": 0.13873758912086487, "eval_runtime": 90.7647, "eval_samples_per_second": 30.023, "eval_steps_per_second": 7.514, "num_input_tokens_seen": 132272272, "step": 61300 }, { "epoch": 10.000815660685156, "grad_norm": 0.5329083204269409, "learning_rate": 2.9338400475876578e-05, "loss": 0.1454, "num_input_tokens_seen": 132282928, "step": 61305 }, { "epoch": 10.00163132137031, "grad_norm": 0.01934979483485222, "learning_rate": 2.933489543950411e-05, "loss": 0.0165, "num_input_tokens_seen": 132294672, "step": 61310 }, { "epoch": 10.002446982055465, "grad_norm": 1.1318914890289307, "learning_rate": 2.9331390315279494e-05, "loss": 0.0727, "num_input_tokens_seen": 132305232, "step": 61315 }, { "epoch": 10.00326264274062, "grad_norm": 0.15689386427402496, "learning_rate": 2.9327885103273757e-05, "loss": 0.144, "num_input_tokens_seen": 132317168, "step": 61320 }, { "epoch": 10.004078303425775, "grad_norm": 0.10814119130373001, "learning_rate": 2.932437980355794e-05, "loss": 0.1706, "num_input_tokens_seen": 132328688, "step": 61325 }, { "epoch": 10.00489396411093, "grad_norm": 0.03103673830628395, "learning_rate": 2.932087441620307e-05, "loss": 0.0189, "num_input_tokens_seen": 132339472, "step": 61330 }, { "epoch": 10.005709624796085, "grad_norm": 0.12355925887823105, "learning_rate": 2.931736894128021e-05, "loss": 0.0637, "num_input_tokens_seen": 132348624, "step": 61335 }, { "epoch": 10.00652528548124, "grad_norm": 0.08027877658605576, "learning_rate": 2.9313863378860384e-05, "loss": 0.0359, "num_input_tokens_seen": 132357968, "step": 61340 }, { "epoch": 10.007340946166394, "grad_norm": 0.16986480355262756, "learning_rate": 2.931035772901465e-05, "loss": 0.0935, "num_input_tokens_seen": 132369040, "step": 61345 }, { "epoch": 10.00815660685155, "grad_norm": 0.17095650732517242, "learning_rate": 2.9306851991814044e-05, "loss": 0.1277, "num_input_tokens_seen": 132380304, "step": 61350 }, { "epoch": 10.008972267536704, "grad_norm": 1.1224709749221802, "learning_rate": 2.9303346167329628e-05, "loss": 0.0279, "num_input_tokens_seen": 132391280, "step": 61355 }, { "epoch": 10.00978792822186, "grad_norm": 0.8650208115577698, "learning_rate": 2.9299840255632432e-05, "loss": 0.2147, "num_input_tokens_seen": 132402544, "step": 61360 }, { "epoch": 10.010603588907015, "grad_norm": 1.4845328330993652, "learning_rate": 2.9296334256793524e-05, "loss": 0.2685, "num_input_tokens_seen": 132414064, "step": 61365 }, { "epoch": 10.01141924959217, "grad_norm": 2.2702882289886475, "learning_rate": 2.9292828170883945e-05, "loss": 0.0855, "num_input_tokens_seen": 132425712, "step": 61370 }, { "epoch": 10.012234910277325, "grad_norm": 1.1922725439071655, "learning_rate": 2.9289321997974766e-05, "loss": 0.154, "num_input_tokens_seen": 132436656, "step": 61375 }, { "epoch": 10.013050570962479, "grad_norm": 0.8518201112747192, "learning_rate": 2.9285815738137034e-05, "loss": 0.0934, "num_input_tokens_seen": 132447440, "step": 61380 }, { "epoch": 10.013866231647635, "grad_norm": 0.09521865844726562, "learning_rate": 2.9282309391441803e-05, "loss": 0.2224, "num_input_tokens_seen": 132459376, "step": 61385 }, { "epoch": 10.01468189233279, "grad_norm": 0.6018702983856201, "learning_rate": 2.927880295796015e-05, "loss": 0.0846, "num_input_tokens_seen": 132471440, "step": 61390 }, { "epoch": 10.015497553017944, "grad_norm": 0.16052910685539246, "learning_rate": 2.9275296437763117e-05, "loss": 0.0493, "num_input_tokens_seen": 132482192, "step": 61395 }, { "epoch": 10.0163132137031, "grad_norm": 0.2862590551376343, "learning_rate": 2.927178983092178e-05, "loss": 0.0206, "num_input_tokens_seen": 132492528, "step": 61400 }, { "epoch": 10.017128874388254, "grad_norm": 1.2048214673995972, "learning_rate": 2.926828313750721e-05, "loss": 0.2032, "num_input_tokens_seen": 132503440, "step": 61405 }, { "epoch": 10.01794453507341, "grad_norm": 2.4263386726379395, "learning_rate": 2.9264776357590462e-05, "loss": 0.2033, "num_input_tokens_seen": 132513616, "step": 61410 }, { "epoch": 10.018760195758565, "grad_norm": 0.9788798689842224, "learning_rate": 2.926126949124261e-05, "loss": 0.101, "num_input_tokens_seen": 132524560, "step": 61415 }, { "epoch": 10.01957585644372, "grad_norm": 0.3524099290370941, "learning_rate": 2.9257762538534728e-05, "loss": 0.0197, "num_input_tokens_seen": 132534192, "step": 61420 }, { "epoch": 10.020391517128875, "grad_norm": 1.3228797912597656, "learning_rate": 2.9254255499537886e-05, "loss": 0.344, "num_input_tokens_seen": 132545104, "step": 61425 }, { "epoch": 10.021207177814029, "grad_norm": 0.030521640554070473, "learning_rate": 2.9250748374323157e-05, "loss": 0.0746, "num_input_tokens_seen": 132556816, "step": 61430 }, { "epoch": 10.022022838499185, "grad_norm": 0.4257247745990753, "learning_rate": 2.924724116296162e-05, "loss": 0.0286, "num_input_tokens_seen": 132567888, "step": 61435 }, { "epoch": 10.022838499184338, "grad_norm": 0.616176962852478, "learning_rate": 2.9243733865524354e-05, "loss": 0.0399, "num_input_tokens_seen": 132580912, "step": 61440 }, { "epoch": 10.023654159869494, "grad_norm": 0.44461217522621155, "learning_rate": 2.924022648208244e-05, "loss": 0.19, "num_input_tokens_seen": 132593520, "step": 61445 }, { "epoch": 10.02446982055465, "grad_norm": 2.62442684173584, "learning_rate": 2.9236719012706954e-05, "loss": 0.0882, "num_input_tokens_seen": 132603504, "step": 61450 }, { "epoch": 10.025285481239804, "grad_norm": 0.3335587680339813, "learning_rate": 2.9233211457468983e-05, "loss": 0.0455, "num_input_tokens_seen": 132614576, "step": 61455 }, { "epoch": 10.02610114192496, "grad_norm": 0.056237947195768356, "learning_rate": 2.9229703816439612e-05, "loss": 0.1034, "num_input_tokens_seen": 132624944, "step": 61460 }, { "epoch": 10.026916802610113, "grad_norm": 0.30391982197761536, "learning_rate": 2.9226196089689934e-05, "loss": 0.1432, "num_input_tokens_seen": 132635248, "step": 61465 }, { "epoch": 10.02773246329527, "grad_norm": 0.6919045448303223, "learning_rate": 2.9222688277291015e-05, "loss": 0.0377, "num_input_tokens_seen": 132646288, "step": 61470 }, { "epoch": 10.028548123980425, "grad_norm": 0.8408374190330505, "learning_rate": 2.921918037931397e-05, "loss": 0.0386, "num_input_tokens_seen": 132655888, "step": 61475 }, { "epoch": 10.029363784665579, "grad_norm": 1.7178236246109009, "learning_rate": 2.9215672395829886e-05, "loss": 0.1573, "num_input_tokens_seen": 132666256, "step": 61480 }, { "epoch": 10.030179445350734, "grad_norm": 0.04531531408429146, "learning_rate": 2.921216432690985e-05, "loss": 0.1964, "num_input_tokens_seen": 132677104, "step": 61485 }, { "epoch": 10.030995106035888, "grad_norm": 0.2484845072031021, "learning_rate": 2.9208656172624966e-05, "loss": 0.0337, "num_input_tokens_seen": 132688656, "step": 61490 }, { "epoch": 10.031810766721044, "grad_norm": 1.3883360624313354, "learning_rate": 2.920514793304632e-05, "loss": 0.0279, "num_input_tokens_seen": 132698896, "step": 61495 }, { "epoch": 10.0326264274062, "grad_norm": 2.078361988067627, "learning_rate": 2.9201639608245013e-05, "loss": 0.1224, "num_input_tokens_seen": 132709392, "step": 61500 }, { "epoch": 10.033442088091354, "grad_norm": 1.4216177463531494, "learning_rate": 2.9198131198292157e-05, "loss": 0.0669, "num_input_tokens_seen": 132719728, "step": 61505 }, { "epoch": 10.03425774877651, "grad_norm": 1.3246084451675415, "learning_rate": 2.9194622703258845e-05, "loss": 0.1414, "num_input_tokens_seen": 132730192, "step": 61510 }, { "epoch": 10.035073409461663, "grad_norm": 0.874946653842926, "learning_rate": 2.919111412321618e-05, "loss": 0.141, "num_input_tokens_seen": 132740208, "step": 61515 }, { "epoch": 10.035889070146819, "grad_norm": 0.06784351170063019, "learning_rate": 2.918760545823527e-05, "loss": 0.0933, "num_input_tokens_seen": 132751184, "step": 61520 }, { "epoch": 10.036704730831975, "grad_norm": 0.5229700803756714, "learning_rate": 2.918409670838722e-05, "loss": 0.0358, "num_input_tokens_seen": 132762672, "step": 61525 }, { "epoch": 10.037520391517129, "grad_norm": 1.2792836427688599, "learning_rate": 2.918058787374315e-05, "loss": 0.2023, "num_input_tokens_seen": 132774448, "step": 61530 }, { "epoch": 10.038336052202284, "grad_norm": 2.0056071281433105, "learning_rate": 2.917707895437416e-05, "loss": 0.081, "num_input_tokens_seen": 132785040, "step": 61535 }, { "epoch": 10.039151712887438, "grad_norm": 0.05510967597365379, "learning_rate": 2.9173569950351366e-05, "loss": 0.0606, "num_input_tokens_seen": 132795856, "step": 61540 }, { "epoch": 10.039967373572594, "grad_norm": 0.36622029542922974, "learning_rate": 2.9170060861745873e-05, "loss": 0.0188, "num_input_tokens_seen": 132808080, "step": 61545 }, { "epoch": 10.040783034257748, "grad_norm": 0.2898099422454834, "learning_rate": 2.9166551688628817e-05, "loss": 0.1404, "num_input_tokens_seen": 132818768, "step": 61550 }, { "epoch": 10.041598694942904, "grad_norm": 0.4731270670890808, "learning_rate": 2.916304243107131e-05, "loss": 0.0594, "num_input_tokens_seen": 132830928, "step": 61555 }, { "epoch": 10.04241435562806, "grad_norm": 0.14525841176509857, "learning_rate": 2.915953308914446e-05, "loss": 0.2118, "num_input_tokens_seen": 132842416, "step": 61560 }, { "epoch": 10.043230016313213, "grad_norm": 0.04356616735458374, "learning_rate": 2.9156023662919395e-05, "loss": 0.0566, "num_input_tokens_seen": 132853904, "step": 61565 }, { "epoch": 10.044045676998369, "grad_norm": 1.2885000705718994, "learning_rate": 2.9152514152467236e-05, "loss": 0.1861, "num_input_tokens_seen": 132863664, "step": 61570 }, { "epoch": 10.044861337683523, "grad_norm": 1.2070746421813965, "learning_rate": 2.9149004557859117e-05, "loss": 0.1671, "num_input_tokens_seen": 132873456, "step": 61575 }, { "epoch": 10.045676998368679, "grad_norm": 2.348705768585205, "learning_rate": 2.9145494879166153e-05, "loss": 0.1022, "num_input_tokens_seen": 132884560, "step": 61580 }, { "epoch": 10.046492659053834, "grad_norm": 0.5743587613105774, "learning_rate": 2.9141985116459473e-05, "loss": 0.0173, "num_input_tokens_seen": 132895664, "step": 61585 }, { "epoch": 10.047308319738988, "grad_norm": 1.6255788803100586, "learning_rate": 2.9138475269810216e-05, "loss": 0.1513, "num_input_tokens_seen": 132907824, "step": 61590 }, { "epoch": 10.048123980424144, "grad_norm": 0.15778057277202606, "learning_rate": 2.9134965339289505e-05, "loss": 0.0357, "num_input_tokens_seen": 132919376, "step": 61595 }, { "epoch": 10.048939641109298, "grad_norm": 0.8833630084991455, "learning_rate": 2.9131455324968477e-05, "loss": 0.1019, "num_input_tokens_seen": 132930992, "step": 61600 }, { "epoch": 10.049755301794454, "grad_norm": 0.696837842464447, "learning_rate": 2.912794522691827e-05, "loss": 0.0408, "num_input_tokens_seen": 132941040, "step": 61605 }, { "epoch": 10.05057096247961, "grad_norm": 0.047140318900346756, "learning_rate": 2.912443504521001e-05, "loss": 0.0098, "num_input_tokens_seen": 132952272, "step": 61610 }, { "epoch": 10.051386623164763, "grad_norm": 1.123530387878418, "learning_rate": 2.912092477991484e-05, "loss": 0.0448, "num_input_tokens_seen": 132963024, "step": 61615 }, { "epoch": 10.052202283849919, "grad_norm": 0.10182782262563705, "learning_rate": 2.9117414431103902e-05, "loss": 0.2152, "num_input_tokens_seen": 132974512, "step": 61620 }, { "epoch": 10.053017944535073, "grad_norm": 0.04410833865404129, "learning_rate": 2.9113903998848337e-05, "loss": 0.031, "num_input_tokens_seen": 132985392, "step": 61625 }, { "epoch": 10.053833605220229, "grad_norm": 0.04025594890117645, "learning_rate": 2.9110393483219295e-05, "loss": 0.0186, "num_input_tokens_seen": 132996144, "step": 61630 }, { "epoch": 10.054649265905383, "grad_norm": 0.023175502195954323, "learning_rate": 2.910688288428791e-05, "loss": 0.0987, "num_input_tokens_seen": 133006576, "step": 61635 }, { "epoch": 10.055464926590538, "grad_norm": 2.271411180496216, "learning_rate": 2.9103372202125334e-05, "loss": 0.0793, "num_input_tokens_seen": 133017296, "step": 61640 }, { "epoch": 10.056280587275694, "grad_norm": 1.2983447313308716, "learning_rate": 2.9099861436802716e-05, "loss": 0.1666, "num_input_tokens_seen": 133027280, "step": 61645 }, { "epoch": 10.057096247960848, "grad_norm": 0.3487899899482727, "learning_rate": 2.9096350588391198e-05, "loss": 0.0481, "num_input_tokens_seen": 133038832, "step": 61650 }, { "epoch": 10.057911908646004, "grad_norm": 0.3270595371723175, "learning_rate": 2.9092839656961946e-05, "loss": 0.1316, "num_input_tokens_seen": 133049296, "step": 61655 }, { "epoch": 10.058727569331158, "grad_norm": 1.3289387226104736, "learning_rate": 2.9089328642586105e-05, "loss": 0.1488, "num_input_tokens_seen": 133060304, "step": 61660 }, { "epoch": 10.059543230016313, "grad_norm": 1.9725311994552612, "learning_rate": 2.9085817545334833e-05, "loss": 0.107, "num_input_tokens_seen": 133071216, "step": 61665 }, { "epoch": 10.060358890701469, "grad_norm": 0.09416933357715607, "learning_rate": 2.9082306365279283e-05, "loss": 0.0536, "num_input_tokens_seen": 133083056, "step": 61670 }, { "epoch": 10.061174551386623, "grad_norm": 0.28285834193229675, "learning_rate": 2.9078795102490614e-05, "loss": 0.1343, "num_input_tokens_seen": 133093872, "step": 61675 }, { "epoch": 10.061990212071779, "grad_norm": 0.24354983866214752, "learning_rate": 2.9075283757039996e-05, "loss": 0.1424, "num_input_tokens_seen": 133104784, "step": 61680 }, { "epoch": 10.062805872756933, "grad_norm": 0.020881034433841705, "learning_rate": 2.9071772328998576e-05, "loss": 0.2047, "num_input_tokens_seen": 133114896, "step": 61685 }, { "epoch": 10.063621533442088, "grad_norm": 0.7180618047714233, "learning_rate": 2.906826081843752e-05, "loss": 0.0601, "num_input_tokens_seen": 133126256, "step": 61690 }, { "epoch": 10.064437194127244, "grad_norm": 0.029201718047261238, "learning_rate": 2.9064749225428007e-05, "loss": 0.2217, "num_input_tokens_seen": 133137200, "step": 61695 }, { "epoch": 10.065252854812398, "grad_norm": 1.27017080783844, "learning_rate": 2.9061237550041193e-05, "loss": 0.1411, "num_input_tokens_seen": 133148016, "step": 61700 }, { "epoch": 10.066068515497554, "grad_norm": 0.1104903370141983, "learning_rate": 2.9057725792348246e-05, "loss": 0.174, "num_input_tokens_seen": 133159344, "step": 61705 }, { "epoch": 10.066884176182707, "grad_norm": 0.10462109744548798, "learning_rate": 2.9054213952420335e-05, "loss": 0.0687, "num_input_tokens_seen": 133168560, "step": 61710 }, { "epoch": 10.067699836867863, "grad_norm": 0.08255377411842346, "learning_rate": 2.9050702030328636e-05, "loss": 0.1371, "num_input_tokens_seen": 133178800, "step": 61715 }, { "epoch": 10.068515497553017, "grad_norm": 0.3600453734397888, "learning_rate": 2.9047190026144327e-05, "loss": 0.2248, "num_input_tokens_seen": 133190032, "step": 61720 }, { "epoch": 10.069331158238173, "grad_norm": 0.5258923768997192, "learning_rate": 2.9043677939938575e-05, "loss": 0.0769, "num_input_tokens_seen": 133200624, "step": 61725 }, { "epoch": 10.070146818923329, "grad_norm": 0.3928627371788025, "learning_rate": 2.904016577178256e-05, "loss": 0.1288, "num_input_tokens_seen": 133212400, "step": 61730 }, { "epoch": 10.070962479608482, "grad_norm": 0.14765621721744537, "learning_rate": 2.9036653521747458e-05, "loss": 0.1167, "num_input_tokens_seen": 133223536, "step": 61735 }, { "epoch": 10.071778140293638, "grad_norm": 0.02600460685789585, "learning_rate": 2.9033141189904455e-05, "loss": 0.2194, "num_input_tokens_seen": 133234640, "step": 61740 }, { "epoch": 10.072593800978792, "grad_norm": 0.07096666842699051, "learning_rate": 2.9029628776324725e-05, "loss": 0.0473, "num_input_tokens_seen": 133244720, "step": 61745 }, { "epoch": 10.073409461663948, "grad_norm": 0.3637857437133789, "learning_rate": 2.9026116281079458e-05, "loss": 0.111, "num_input_tokens_seen": 133255856, "step": 61750 }, { "epoch": 10.074225122349104, "grad_norm": 0.27288585901260376, "learning_rate": 2.9022603704239832e-05, "loss": 0.1165, "num_input_tokens_seen": 133266704, "step": 61755 }, { "epoch": 10.075040783034257, "grad_norm": 0.09326554834842682, "learning_rate": 2.9019091045877046e-05, "loss": 0.058, "num_input_tokens_seen": 133277488, "step": 61760 }, { "epoch": 10.075856443719413, "grad_norm": 1.932496428489685, "learning_rate": 2.901557830606228e-05, "loss": 0.1811, "num_input_tokens_seen": 133288272, "step": 61765 }, { "epoch": 10.076672104404567, "grad_norm": 0.0627392828464508, "learning_rate": 2.9012065484866725e-05, "loss": 0.0502, "num_input_tokens_seen": 133299632, "step": 61770 }, { "epoch": 10.077487765089723, "grad_norm": 1.8343884944915771, "learning_rate": 2.9008552582361576e-05, "loss": 0.0656, "num_input_tokens_seen": 133309296, "step": 61775 }, { "epoch": 10.078303425774878, "grad_norm": 0.7593383193016052, "learning_rate": 2.9005039598618022e-05, "loss": 0.0311, "num_input_tokens_seen": 133319760, "step": 61780 }, { "epoch": 10.079119086460032, "grad_norm": 0.6933031678199768, "learning_rate": 2.9001526533707252e-05, "loss": 0.1931, "num_input_tokens_seen": 133330960, "step": 61785 }, { "epoch": 10.079934747145188, "grad_norm": 0.06330668181180954, "learning_rate": 2.899801338770048e-05, "loss": 0.0984, "num_input_tokens_seen": 133342256, "step": 61790 }, { "epoch": 10.080750407830342, "grad_norm": 1.142472505569458, "learning_rate": 2.899450016066889e-05, "loss": 0.1032, "num_input_tokens_seen": 133352496, "step": 61795 }, { "epoch": 10.081566068515498, "grad_norm": 0.6167538166046143, "learning_rate": 2.899098685268369e-05, "loss": 0.0371, "num_input_tokens_seen": 133362096, "step": 61800 }, { "epoch": 10.082381729200652, "grad_norm": 1.7961801290512085, "learning_rate": 2.8987473463816078e-05, "loss": 0.0785, "num_input_tokens_seen": 133373200, "step": 61805 }, { "epoch": 10.083197389885807, "grad_norm": 0.5680626630783081, "learning_rate": 2.8983959994137254e-05, "loss": 0.082, "num_input_tokens_seen": 133384944, "step": 61810 }, { "epoch": 10.084013050570963, "grad_norm": 0.6787883639335632, "learning_rate": 2.8980446443718433e-05, "loss": 0.0931, "num_input_tokens_seen": 133396144, "step": 61815 }, { "epoch": 10.084828711256117, "grad_norm": 2.025432586669922, "learning_rate": 2.8976932812630814e-05, "loss": 0.1901, "num_input_tokens_seen": 133407312, "step": 61820 }, { "epoch": 10.085644371941273, "grad_norm": 0.08107434213161469, "learning_rate": 2.8973419100945604e-05, "loss": 0.0129, "num_input_tokens_seen": 133418640, "step": 61825 }, { "epoch": 10.086460032626427, "grad_norm": 1.2066102027893066, "learning_rate": 2.8969905308734015e-05, "loss": 0.1137, "num_input_tokens_seen": 133430160, "step": 61830 }, { "epoch": 10.087275693311582, "grad_norm": 0.055856093764305115, "learning_rate": 2.8966391436067265e-05, "loss": 0.0362, "num_input_tokens_seen": 133441424, "step": 61835 }, { "epoch": 10.088091353996738, "grad_norm": 0.3013201653957367, "learning_rate": 2.8962877483016554e-05, "loss": 0.0992, "num_input_tokens_seen": 133452560, "step": 61840 }, { "epoch": 10.088907014681892, "grad_norm": 0.06232026219367981, "learning_rate": 2.8959363449653103e-05, "loss": 0.1387, "num_input_tokens_seen": 133464432, "step": 61845 }, { "epoch": 10.089722675367048, "grad_norm": 1.5212359428405762, "learning_rate": 2.895584933604813e-05, "loss": 0.0393, "num_input_tokens_seen": 133475920, "step": 61850 }, { "epoch": 10.090538336052202, "grad_norm": 1.3034842014312744, "learning_rate": 2.8952335142272858e-05, "loss": 0.2043, "num_input_tokens_seen": 133486992, "step": 61855 }, { "epoch": 10.091353996737357, "grad_norm": 0.2541564404964447, "learning_rate": 2.894882086839849e-05, "loss": 0.0136, "num_input_tokens_seen": 133498768, "step": 61860 }, { "epoch": 10.092169657422513, "grad_norm": 0.07279860973358154, "learning_rate": 2.894530651449626e-05, "loss": 0.0537, "num_input_tokens_seen": 133509872, "step": 61865 }, { "epoch": 10.092985318107667, "grad_norm": 2.2718517780303955, "learning_rate": 2.8941792080637394e-05, "loss": 0.0415, "num_input_tokens_seen": 133521264, "step": 61870 }, { "epoch": 10.093800978792823, "grad_norm": 1.0787216424942017, "learning_rate": 2.8938277566893108e-05, "loss": 0.058, "num_input_tokens_seen": 133531824, "step": 61875 }, { "epoch": 10.094616639477977, "grad_norm": 0.08776584267616272, "learning_rate": 2.893476297333463e-05, "loss": 0.1143, "num_input_tokens_seen": 133543504, "step": 61880 }, { "epoch": 10.095432300163132, "grad_norm": 0.10041382163763046, "learning_rate": 2.893124830003319e-05, "loss": 0.0327, "num_input_tokens_seen": 133553968, "step": 61885 }, { "epoch": 10.096247960848286, "grad_norm": 0.039500899612903595, "learning_rate": 2.892773354706002e-05, "loss": 0.0649, "num_input_tokens_seen": 133563184, "step": 61890 }, { "epoch": 10.097063621533442, "grad_norm": 0.3885713517665863, "learning_rate": 2.892421871448634e-05, "loss": 0.0426, "num_input_tokens_seen": 133573552, "step": 61895 }, { "epoch": 10.097879282218598, "grad_norm": 0.049952827394008636, "learning_rate": 2.89207038023834e-05, "loss": 0.0131, "num_input_tokens_seen": 133584432, "step": 61900 }, { "epoch": 10.098694942903752, "grad_norm": 0.8188916444778442, "learning_rate": 2.8917188810822416e-05, "loss": 0.0856, "num_input_tokens_seen": 133595344, "step": 61905 }, { "epoch": 10.099510603588907, "grad_norm": 0.2779969274997711, "learning_rate": 2.891367373987463e-05, "loss": 0.1123, "num_input_tokens_seen": 133606640, "step": 61910 }, { "epoch": 10.100326264274061, "grad_norm": 0.6546698212623596, "learning_rate": 2.891015858961128e-05, "loss": 0.1024, "num_input_tokens_seen": 133617264, "step": 61915 }, { "epoch": 10.101141924959217, "grad_norm": 0.5955508351325989, "learning_rate": 2.8906643360103613e-05, "loss": 0.2717, "num_input_tokens_seen": 133628912, "step": 61920 }, { "epoch": 10.101957585644373, "grad_norm": 0.43337729573249817, "learning_rate": 2.8903128051422856e-05, "loss": 0.1548, "num_input_tokens_seen": 133638512, "step": 61925 }, { "epoch": 10.102773246329527, "grad_norm": 0.08849340677261353, "learning_rate": 2.889961266364025e-05, "loss": 0.0394, "num_input_tokens_seen": 133648880, "step": 61930 }, { "epoch": 10.103588907014682, "grad_norm": 1.9987754821777344, "learning_rate": 2.889609719682706e-05, "loss": 0.1182, "num_input_tokens_seen": 133658480, "step": 61935 }, { "epoch": 10.104404567699836, "grad_norm": 0.34516847133636475, "learning_rate": 2.889258165105451e-05, "loss": 0.0717, "num_input_tokens_seen": 133669552, "step": 61940 }, { "epoch": 10.105220228384992, "grad_norm": 0.11774769425392151, "learning_rate": 2.8889066026393863e-05, "loss": 0.1565, "num_input_tokens_seen": 133680240, "step": 61945 }, { "epoch": 10.106035889070148, "grad_norm": 0.026410270482301712, "learning_rate": 2.8885550322916356e-05, "loss": 0.1089, "num_input_tokens_seen": 133690480, "step": 61950 }, { "epoch": 10.106851549755302, "grad_norm": 1.4821220636367798, "learning_rate": 2.888203454069324e-05, "loss": 0.1587, "num_input_tokens_seen": 133701104, "step": 61955 }, { "epoch": 10.107667210440457, "grad_norm": 0.921593427658081, "learning_rate": 2.8878518679795772e-05, "loss": 0.2421, "num_input_tokens_seen": 133712016, "step": 61960 }, { "epoch": 10.108482871125611, "grad_norm": 0.02480495162308216, "learning_rate": 2.8875002740295198e-05, "loss": 0.1094, "num_input_tokens_seen": 133723152, "step": 61965 }, { "epoch": 10.109298531810767, "grad_norm": 1.2044458389282227, "learning_rate": 2.887148672226278e-05, "loss": 0.0676, "num_input_tokens_seen": 133735088, "step": 61970 }, { "epoch": 10.11011419249592, "grad_norm": 0.04008066654205322, "learning_rate": 2.8867970625769774e-05, "loss": 0.0174, "num_input_tokens_seen": 133744624, "step": 61975 }, { "epoch": 10.110929853181077, "grad_norm": 1.6324518918991089, "learning_rate": 2.8864454450887434e-05, "loss": 0.1049, "num_input_tokens_seen": 133755344, "step": 61980 }, { "epoch": 10.111745513866232, "grad_norm": 0.5721380114555359, "learning_rate": 2.8860938197687025e-05, "loss": 0.0256, "num_input_tokens_seen": 133766640, "step": 61985 }, { "epoch": 10.112561174551386, "grad_norm": 0.6903706789016724, "learning_rate": 2.8857421866239802e-05, "loss": 0.053, "num_input_tokens_seen": 133776368, "step": 61990 }, { "epoch": 10.113376835236542, "grad_norm": 0.8827333450317383, "learning_rate": 2.8853905456617036e-05, "loss": 0.0589, "num_input_tokens_seen": 133787120, "step": 61995 }, { "epoch": 10.114192495921696, "grad_norm": 3.169434070587158, "learning_rate": 2.8850388968889985e-05, "loss": 0.157, "num_input_tokens_seen": 133798448, "step": 62000 }, { "epoch": 10.115008156606851, "grad_norm": 0.6388412714004517, "learning_rate": 2.8846872403129915e-05, "loss": 0.1215, "num_input_tokens_seen": 133808112, "step": 62005 }, { "epoch": 10.115823817292007, "grad_norm": 0.02988351136445999, "learning_rate": 2.8843355759408096e-05, "loss": 0.0201, "num_input_tokens_seen": 133819856, "step": 62010 }, { "epoch": 10.116639477977161, "grad_norm": 0.041236840188503265, "learning_rate": 2.88398390377958e-05, "loss": 0.1936, "num_input_tokens_seen": 133831792, "step": 62015 }, { "epoch": 10.117455138662317, "grad_norm": 2.7669758796691895, "learning_rate": 2.8836322238364294e-05, "loss": 0.3254, "num_input_tokens_seen": 133841552, "step": 62020 }, { "epoch": 10.11827079934747, "grad_norm": 0.06312458217144012, "learning_rate": 2.883280536118485e-05, "loss": 0.0983, "num_input_tokens_seen": 133852688, "step": 62025 }, { "epoch": 10.119086460032626, "grad_norm": 0.5255514979362488, "learning_rate": 2.8829288406328748e-05, "loss": 0.0937, "num_input_tokens_seen": 133863696, "step": 62030 }, { "epoch": 10.119902120717782, "grad_norm": 1.1941866874694824, "learning_rate": 2.882577137386725e-05, "loss": 0.0943, "num_input_tokens_seen": 133875536, "step": 62035 }, { "epoch": 10.120717781402936, "grad_norm": 0.2932925820350647, "learning_rate": 2.8822254263871645e-05, "loss": 0.1126, "num_input_tokens_seen": 133885872, "step": 62040 }, { "epoch": 10.121533442088092, "grad_norm": 0.014011815190315247, "learning_rate": 2.881873707641321e-05, "loss": 0.0217, "num_input_tokens_seen": 133897776, "step": 62045 }, { "epoch": 10.122349102773246, "grad_norm": 0.6089235544204712, "learning_rate": 2.8815219811563223e-05, "loss": 0.0559, "num_input_tokens_seen": 133909040, "step": 62050 }, { "epoch": 10.123164763458401, "grad_norm": 0.5520412921905518, "learning_rate": 2.8811702469392963e-05, "loss": 0.0335, "num_input_tokens_seen": 133919536, "step": 62055 }, { "epoch": 10.123980424143557, "grad_norm": 0.16180795431137085, "learning_rate": 2.8808185049973723e-05, "loss": 0.0387, "num_input_tokens_seen": 133928912, "step": 62060 }, { "epoch": 10.124796084828711, "grad_norm": 0.8385259509086609, "learning_rate": 2.880466755337678e-05, "loss": 0.0407, "num_input_tokens_seen": 133940784, "step": 62065 }, { "epoch": 10.125611745513867, "grad_norm": 0.16630250215530396, "learning_rate": 2.880114997967342e-05, "loss": 0.1517, "num_input_tokens_seen": 133950000, "step": 62070 }, { "epoch": 10.12642740619902, "grad_norm": 0.05381828919053078, "learning_rate": 2.8797632328934933e-05, "loss": 0.1149, "num_input_tokens_seen": 133960464, "step": 62075 }, { "epoch": 10.127243066884176, "grad_norm": 0.14917676150798798, "learning_rate": 2.8794114601232613e-05, "loss": 0.0665, "num_input_tokens_seen": 133971728, "step": 62080 }, { "epoch": 10.12805872756933, "grad_norm": 0.4208558201789856, "learning_rate": 2.8790596796637747e-05, "loss": 0.1533, "num_input_tokens_seen": 133983696, "step": 62085 }, { "epoch": 10.128874388254486, "grad_norm": 0.18147096037864685, "learning_rate": 2.8787078915221628e-05, "loss": 0.1083, "num_input_tokens_seen": 133994224, "step": 62090 }, { "epoch": 10.129690048939642, "grad_norm": 0.1997164487838745, "learning_rate": 2.8783560957055545e-05, "loss": 0.131, "num_input_tokens_seen": 134005264, "step": 62095 }, { "epoch": 10.130505709624796, "grad_norm": 1.3881251811981201, "learning_rate": 2.8780042922210805e-05, "loss": 0.2874, "num_input_tokens_seen": 134017136, "step": 62100 }, { "epoch": 10.131321370309951, "grad_norm": 1.3027598857879639, "learning_rate": 2.8776524810758694e-05, "loss": 0.1139, "num_input_tokens_seen": 134027920, "step": 62105 }, { "epoch": 10.132137030995105, "grad_norm": 1.8557826280593872, "learning_rate": 2.8773006622770522e-05, "loss": 0.1569, "num_input_tokens_seen": 134039152, "step": 62110 }, { "epoch": 10.132952691680261, "grad_norm": 1.600738525390625, "learning_rate": 2.8769488358317586e-05, "loss": 0.3066, "num_input_tokens_seen": 134050192, "step": 62115 }, { "epoch": 10.133768352365417, "grad_norm": 0.7243179678916931, "learning_rate": 2.8765970017471183e-05, "loss": 0.1336, "num_input_tokens_seen": 134061712, "step": 62120 }, { "epoch": 10.13458401305057, "grad_norm": 1.4289782047271729, "learning_rate": 2.876245160030262e-05, "loss": 0.1286, "num_input_tokens_seen": 134072208, "step": 62125 }, { "epoch": 10.135399673735726, "grad_norm": 0.0762549340724945, "learning_rate": 2.875893310688321e-05, "loss": 0.0624, "num_input_tokens_seen": 134083504, "step": 62130 }, { "epoch": 10.13621533442088, "grad_norm": 2.1692123413085938, "learning_rate": 2.875541453728424e-05, "loss": 0.0797, "num_input_tokens_seen": 134093936, "step": 62135 }, { "epoch": 10.137030995106036, "grad_norm": 1.862510085105896, "learning_rate": 2.8751895891577042e-05, "loss": 0.0342, "num_input_tokens_seen": 134104240, "step": 62140 }, { "epoch": 10.137846655791192, "grad_norm": 0.22160419821739197, "learning_rate": 2.874837716983291e-05, "loss": 0.272, "num_input_tokens_seen": 134116176, "step": 62145 }, { "epoch": 10.138662316476346, "grad_norm": 1.5920593738555908, "learning_rate": 2.8744858372123158e-05, "loss": 0.2876, "num_input_tokens_seen": 134127440, "step": 62150 }, { "epoch": 10.139477977161501, "grad_norm": 1.1993650197982788, "learning_rate": 2.87413394985191e-05, "loss": 0.1556, "num_input_tokens_seen": 134139120, "step": 62155 }, { "epoch": 10.140293637846655, "grad_norm": 0.046042922884225845, "learning_rate": 2.8737820549092048e-05, "loss": 0.0184, "num_input_tokens_seen": 134150192, "step": 62160 }, { "epoch": 10.141109298531811, "grad_norm": 0.783281147480011, "learning_rate": 2.8734301523913327e-05, "loss": 0.1233, "num_input_tokens_seen": 134161712, "step": 62165 }, { "epoch": 10.141924959216965, "grad_norm": 0.8274651765823364, "learning_rate": 2.873078242305424e-05, "loss": 0.062, "num_input_tokens_seen": 134171728, "step": 62170 }, { "epoch": 10.14274061990212, "grad_norm": 0.16917644441127777, "learning_rate": 2.872726324658612e-05, "loss": 0.1586, "num_input_tokens_seen": 134181328, "step": 62175 }, { "epoch": 10.143556280587276, "grad_norm": 0.42829665541648865, "learning_rate": 2.8723743994580288e-05, "loss": 0.1047, "num_input_tokens_seen": 134192976, "step": 62180 }, { "epoch": 10.14437194127243, "grad_norm": 0.8245329260826111, "learning_rate": 2.8720224667108047e-05, "loss": 0.0281, "num_input_tokens_seen": 134205168, "step": 62185 }, { "epoch": 10.145187601957586, "grad_norm": 0.17591917514801025, "learning_rate": 2.871670526424074e-05, "loss": 0.0503, "num_input_tokens_seen": 134216144, "step": 62190 }, { "epoch": 10.14600326264274, "grad_norm": 0.035262808203697205, "learning_rate": 2.8713185786049686e-05, "loss": 0.0437, "num_input_tokens_seen": 134226384, "step": 62195 }, { "epoch": 10.146818923327896, "grad_norm": 1.6073575019836426, "learning_rate": 2.8709666232606212e-05, "loss": 0.1125, "num_input_tokens_seen": 134236432, "step": 62200 }, { "epoch": 10.147634584013051, "grad_norm": 2.148244619369507, "learning_rate": 2.870614660398165e-05, "loss": 0.1807, "num_input_tokens_seen": 134248080, "step": 62205 }, { "epoch": 10.148450244698205, "grad_norm": 0.4061604142189026, "learning_rate": 2.870262690024732e-05, "loss": 0.0363, "num_input_tokens_seen": 134259184, "step": 62210 }, { "epoch": 10.149265905383361, "grad_norm": 1.0441536903381348, "learning_rate": 2.8699107121474557e-05, "loss": 0.0727, "num_input_tokens_seen": 134270384, "step": 62215 }, { "epoch": 10.150081566068515, "grad_norm": 0.3688580393791199, "learning_rate": 2.86955872677347e-05, "loss": 0.0168, "num_input_tokens_seen": 134280336, "step": 62220 }, { "epoch": 10.15089722675367, "grad_norm": 0.4115111231803894, "learning_rate": 2.8692067339099075e-05, "loss": 0.1059, "num_input_tokens_seen": 134290800, "step": 62225 }, { "epoch": 10.151712887438826, "grad_norm": 1.0898293256759644, "learning_rate": 2.8688547335639028e-05, "loss": 0.0718, "num_input_tokens_seen": 134301040, "step": 62230 }, { "epoch": 10.15252854812398, "grad_norm": 1.3303477764129639, "learning_rate": 2.8685027257425884e-05, "loss": 0.1473, "num_input_tokens_seen": 134312240, "step": 62235 }, { "epoch": 10.153344208809136, "grad_norm": 1.6782785654067993, "learning_rate": 2.8681507104530986e-05, "loss": 0.142, "num_input_tokens_seen": 134323312, "step": 62240 }, { "epoch": 10.15415986949429, "grad_norm": 0.5914893746376038, "learning_rate": 2.8677986877025676e-05, "loss": 0.058, "num_input_tokens_seen": 134334672, "step": 62245 }, { "epoch": 10.154975530179446, "grad_norm": 0.47648289799690247, "learning_rate": 2.86744665749813e-05, "loss": 0.1345, "num_input_tokens_seen": 134344912, "step": 62250 }, { "epoch": 10.1557911908646, "grad_norm": 0.8556034564971924, "learning_rate": 2.86709461984692e-05, "loss": 0.1475, "num_input_tokens_seen": 134355728, "step": 62255 }, { "epoch": 10.156606851549755, "grad_norm": 1.4863475561141968, "learning_rate": 2.8667425747560716e-05, "loss": 0.0865, "num_input_tokens_seen": 134366736, "step": 62260 }, { "epoch": 10.15742251223491, "grad_norm": 0.053969599306583405, "learning_rate": 2.8663905222327198e-05, "loss": 0.1139, "num_input_tokens_seen": 134378608, "step": 62265 }, { "epoch": 10.158238172920065, "grad_norm": 0.22572055459022522, "learning_rate": 2.8660384622839993e-05, "loss": 0.194, "num_input_tokens_seen": 134388624, "step": 62270 }, { "epoch": 10.15905383360522, "grad_norm": 1.8904772996902466, "learning_rate": 2.865686394917045e-05, "loss": 0.1428, "num_input_tokens_seen": 134399376, "step": 62275 }, { "epoch": 10.159869494290374, "grad_norm": 0.03571440279483795, "learning_rate": 2.8653343201389916e-05, "loss": 0.0552, "num_input_tokens_seen": 134410512, "step": 62280 }, { "epoch": 10.16068515497553, "grad_norm": 0.428387314081192, "learning_rate": 2.8649822379569746e-05, "loss": 0.133, "num_input_tokens_seen": 134419984, "step": 62285 }, { "epoch": 10.161500815660686, "grad_norm": 0.46151161193847656, "learning_rate": 2.8646301483781296e-05, "loss": 0.2564, "num_input_tokens_seen": 134432432, "step": 62290 }, { "epoch": 10.16231647634584, "grad_norm": 0.1040668860077858, "learning_rate": 2.8642780514095922e-05, "loss": 0.0424, "num_input_tokens_seen": 134442992, "step": 62295 }, { "epoch": 10.163132137030995, "grad_norm": 1.5159307718276978, "learning_rate": 2.8639259470584983e-05, "loss": 0.0991, "num_input_tokens_seen": 134453488, "step": 62300 }, { "epoch": 10.16394779771615, "grad_norm": 2.1155636310577393, "learning_rate": 2.8635738353319836e-05, "loss": 0.1984, "num_input_tokens_seen": 134463312, "step": 62305 }, { "epoch": 10.164763458401305, "grad_norm": 0.13493819534778595, "learning_rate": 2.863221716237183e-05, "loss": 0.0442, "num_input_tokens_seen": 134474064, "step": 62310 }, { "epoch": 10.16557911908646, "grad_norm": 0.9845694899559021, "learning_rate": 2.8628695897812348e-05, "loss": 0.0582, "num_input_tokens_seen": 134486512, "step": 62315 }, { "epoch": 10.166394779771615, "grad_norm": 0.21059252321720123, "learning_rate": 2.862517455971273e-05, "loss": 0.1447, "num_input_tokens_seen": 134497776, "step": 62320 }, { "epoch": 10.16721044045677, "grad_norm": 0.5850360989570618, "learning_rate": 2.8621653148144357e-05, "loss": 0.1871, "num_input_tokens_seen": 134508528, "step": 62325 }, { "epoch": 10.168026101141924, "grad_norm": 1.8508232831954956, "learning_rate": 2.8618131663178582e-05, "loss": 0.2197, "num_input_tokens_seen": 134519920, "step": 62330 }, { "epoch": 10.16884176182708, "grad_norm": 1.9656901359558105, "learning_rate": 2.8614610104886785e-05, "loss": 0.1467, "num_input_tokens_seen": 134531152, "step": 62335 }, { "epoch": 10.169657422512234, "grad_norm": 0.606306791305542, "learning_rate": 2.8611088473340326e-05, "loss": 0.0575, "num_input_tokens_seen": 134542288, "step": 62340 }, { "epoch": 10.17047308319739, "grad_norm": 1.165642261505127, "learning_rate": 2.860756676861058e-05, "loss": 0.236, "num_input_tokens_seen": 134552848, "step": 62345 }, { "epoch": 10.171288743882545, "grad_norm": 0.1841876357793808, "learning_rate": 2.8604044990768912e-05, "loss": 0.1509, "num_input_tokens_seen": 134563824, "step": 62350 }, { "epoch": 10.1721044045677, "grad_norm": 0.3892049789428711, "learning_rate": 2.8600523139886703e-05, "loss": 0.0271, "num_input_tokens_seen": 134573744, "step": 62355 }, { "epoch": 10.172920065252855, "grad_norm": 2.587616443634033, "learning_rate": 2.859700121603533e-05, "loss": 0.2039, "num_input_tokens_seen": 134584752, "step": 62360 }, { "epoch": 10.173735725938009, "grad_norm": 0.8781842589378357, "learning_rate": 2.8593479219286157e-05, "loss": 0.3004, "num_input_tokens_seen": 134597168, "step": 62365 }, { "epoch": 10.174551386623165, "grad_norm": 0.6738335490226746, "learning_rate": 2.858995714971057e-05, "loss": 0.119, "num_input_tokens_seen": 134607408, "step": 62370 }, { "epoch": 10.17536704730832, "grad_norm": 0.17467300593852997, "learning_rate": 2.858643500737995e-05, "loss": 0.0217, "num_input_tokens_seen": 134617456, "step": 62375 }, { "epoch": 10.176182707993474, "grad_norm": 0.24051068723201752, "learning_rate": 2.8582912792365678e-05, "loss": 0.1221, "num_input_tokens_seen": 134628112, "step": 62380 }, { "epoch": 10.17699836867863, "grad_norm": 0.45607948303222656, "learning_rate": 2.8579390504739122e-05, "loss": 0.1244, "num_input_tokens_seen": 134639888, "step": 62385 }, { "epoch": 10.177814029363784, "grad_norm": 0.20214515924453735, "learning_rate": 2.8575868144571683e-05, "loss": 0.0623, "num_input_tokens_seen": 134650704, "step": 62390 }, { "epoch": 10.17862969004894, "grad_norm": 0.16877155005931854, "learning_rate": 2.857234571193474e-05, "loss": 0.1262, "num_input_tokens_seen": 134660752, "step": 62395 }, { "epoch": 10.179445350734095, "grad_norm": 0.34175387024879456, "learning_rate": 2.8568823206899682e-05, "loss": 0.0154, "num_input_tokens_seen": 134670224, "step": 62400 }, { "epoch": 10.18026101141925, "grad_norm": 0.562810480594635, "learning_rate": 2.8565300629537887e-05, "loss": 0.135, "num_input_tokens_seen": 134681072, "step": 62405 }, { "epoch": 10.181076672104405, "grad_norm": 3.1524031162261963, "learning_rate": 2.8561777979920757e-05, "loss": 0.175, "num_input_tokens_seen": 134693104, "step": 62410 }, { "epoch": 10.181892332789559, "grad_norm": 0.09475196152925491, "learning_rate": 2.8558255258119675e-05, "loss": 0.065, "num_input_tokens_seen": 134705104, "step": 62415 }, { "epoch": 10.182707993474715, "grad_norm": 0.585456371307373, "learning_rate": 2.855473246420604e-05, "loss": 0.1369, "num_input_tokens_seen": 134715952, "step": 62420 }, { "epoch": 10.18352365415987, "grad_norm": 1.3770250082015991, "learning_rate": 2.855120959825124e-05, "loss": 0.2318, "num_input_tokens_seen": 134725744, "step": 62425 }, { "epoch": 10.184339314845024, "grad_norm": 0.06386580318212509, "learning_rate": 2.854768666032667e-05, "loss": 0.0424, "num_input_tokens_seen": 134736944, "step": 62430 }, { "epoch": 10.18515497553018, "grad_norm": 2.4343812465667725, "learning_rate": 2.8544163650503735e-05, "loss": 0.2152, "num_input_tokens_seen": 134748240, "step": 62435 }, { "epoch": 10.185970636215334, "grad_norm": 0.48555493354797363, "learning_rate": 2.8540640568853828e-05, "loss": 0.0354, "num_input_tokens_seen": 134759888, "step": 62440 }, { "epoch": 10.18678629690049, "grad_norm": 2.240295648574829, "learning_rate": 2.8537117415448344e-05, "loss": 0.239, "num_input_tokens_seen": 134769808, "step": 62445 }, { "epoch": 10.187601957585644, "grad_norm": 1.2953978776931763, "learning_rate": 2.8533594190358692e-05, "loss": 0.1021, "num_input_tokens_seen": 134779888, "step": 62450 }, { "epoch": 10.1884176182708, "grad_norm": 0.15592309832572937, "learning_rate": 2.8530070893656268e-05, "loss": 0.037, "num_input_tokens_seen": 134791120, "step": 62455 }, { "epoch": 10.189233278955955, "grad_norm": 0.44006192684173584, "learning_rate": 2.8526547525412478e-05, "loss": 0.1401, "num_input_tokens_seen": 134802448, "step": 62460 }, { "epoch": 10.190048939641109, "grad_norm": 0.32530295848846436, "learning_rate": 2.8523024085698735e-05, "loss": 0.3437, "num_input_tokens_seen": 134814000, "step": 62465 }, { "epoch": 10.190864600326265, "grad_norm": 0.04592028632760048, "learning_rate": 2.8519500574586433e-05, "loss": 0.0924, "num_input_tokens_seen": 134824080, "step": 62470 }, { "epoch": 10.191680261011419, "grad_norm": 1.5434376001358032, "learning_rate": 2.8515976992146988e-05, "loss": 0.1308, "num_input_tokens_seen": 134834480, "step": 62475 }, { "epoch": 10.192495921696574, "grad_norm": 1.576985478401184, "learning_rate": 2.851245333845181e-05, "loss": 0.1234, "num_input_tokens_seen": 134844496, "step": 62480 }, { "epoch": 10.19331158238173, "grad_norm": 0.018586475402116776, "learning_rate": 2.850892961357231e-05, "loss": 0.0304, "num_input_tokens_seen": 134854448, "step": 62485 }, { "epoch": 10.194127243066884, "grad_norm": 0.5568224787712097, "learning_rate": 2.85054058175799e-05, "loss": 0.1816, "num_input_tokens_seen": 134865360, "step": 62490 }, { "epoch": 10.19494290375204, "grad_norm": 0.4284113347530365, "learning_rate": 2.8501881950545995e-05, "loss": 0.1616, "num_input_tokens_seen": 134876112, "step": 62495 }, { "epoch": 10.195758564437194, "grad_norm": 0.285891592502594, "learning_rate": 2.8498358012542003e-05, "loss": 0.0402, "num_input_tokens_seen": 134887504, "step": 62500 }, { "epoch": 10.19657422512235, "grad_norm": 0.12587015330791473, "learning_rate": 2.8494834003639355e-05, "loss": 0.2215, "num_input_tokens_seen": 134897936, "step": 62505 }, { "epoch": 10.197389885807505, "grad_norm": 0.6261183023452759, "learning_rate": 2.849130992390946e-05, "loss": 0.0616, "num_input_tokens_seen": 134909520, "step": 62510 }, { "epoch": 10.198205546492659, "grad_norm": 0.21648362278938293, "learning_rate": 2.8487785773423742e-05, "loss": 0.1336, "num_input_tokens_seen": 134919888, "step": 62515 }, { "epoch": 10.199021207177815, "grad_norm": 0.7173413038253784, "learning_rate": 2.8484261552253617e-05, "loss": 0.1011, "num_input_tokens_seen": 134930896, "step": 62520 }, { "epoch": 10.199836867862969, "grad_norm": 0.5948838591575623, "learning_rate": 2.8480737260470513e-05, "loss": 0.0671, "num_input_tokens_seen": 134941456, "step": 62525 }, { "epoch": 10.200652528548124, "grad_norm": 0.047162193804979324, "learning_rate": 2.8477212898145855e-05, "loss": 0.0802, "num_input_tokens_seen": 134950448, "step": 62530 }, { "epoch": 10.201468189233278, "grad_norm": 0.07854106277227402, "learning_rate": 2.8473688465351067e-05, "loss": 0.0897, "num_input_tokens_seen": 134960816, "step": 62535 }, { "epoch": 10.202283849918434, "grad_norm": 1.7536048889160156, "learning_rate": 2.8470163962157575e-05, "loss": 0.1134, "num_input_tokens_seen": 134972240, "step": 62540 }, { "epoch": 10.20309951060359, "grad_norm": 0.08382519334554672, "learning_rate": 2.8466639388636805e-05, "loss": 0.0266, "num_input_tokens_seen": 134982992, "step": 62545 }, { "epoch": 10.203915171288743, "grad_norm": 0.1894584447145462, "learning_rate": 2.8463114744860198e-05, "loss": 0.1148, "num_input_tokens_seen": 134993072, "step": 62550 }, { "epoch": 10.2047308319739, "grad_norm": 1.3594681024551392, "learning_rate": 2.8459590030899163e-05, "loss": 0.1417, "num_input_tokens_seen": 135003824, "step": 62555 }, { "epoch": 10.205546492659053, "grad_norm": 0.2253648042678833, "learning_rate": 2.8456065246825158e-05, "loss": 0.1196, "num_input_tokens_seen": 135013424, "step": 62560 }, { "epoch": 10.206362153344209, "grad_norm": 0.07350586354732513, "learning_rate": 2.8452540392709603e-05, "loss": 0.0325, "num_input_tokens_seen": 135024432, "step": 62565 }, { "epoch": 10.207177814029365, "grad_norm": 0.9754027128219604, "learning_rate": 2.8449015468623942e-05, "loss": 0.2315, "num_input_tokens_seen": 135035440, "step": 62570 }, { "epoch": 10.207993474714518, "grad_norm": 0.6226997971534729, "learning_rate": 2.84454904746396e-05, "loss": 0.03, "num_input_tokens_seen": 135047056, "step": 62575 }, { "epoch": 10.208809135399674, "grad_norm": 0.04125750809907913, "learning_rate": 2.8441965410828026e-05, "loss": 0.2369, "num_input_tokens_seen": 135058000, "step": 62580 }, { "epoch": 10.209624796084828, "grad_norm": 0.9289878010749817, "learning_rate": 2.8438440277260658e-05, "loss": 0.061, "num_input_tokens_seen": 135068944, "step": 62585 }, { "epoch": 10.210440456769984, "grad_norm": 0.21161693334579468, "learning_rate": 2.8434915074008933e-05, "loss": 0.1564, "num_input_tokens_seen": 135078064, "step": 62590 }, { "epoch": 10.21125611745514, "grad_norm": 0.3647748529911041, "learning_rate": 2.8431389801144297e-05, "loss": 0.0351, "num_input_tokens_seen": 135089168, "step": 62595 }, { "epoch": 10.212071778140293, "grad_norm": 0.229709655046463, "learning_rate": 2.8427864458738195e-05, "loss": 0.0488, "num_input_tokens_seen": 135100112, "step": 62600 }, { "epoch": 10.21288743882545, "grad_norm": 0.6680052280426025, "learning_rate": 2.8424339046862076e-05, "loss": 0.0864, "num_input_tokens_seen": 135111088, "step": 62605 }, { "epoch": 10.213703099510603, "grad_norm": 0.7156850099563599, "learning_rate": 2.8420813565587368e-05, "loss": 0.0954, "num_input_tokens_seen": 135121040, "step": 62610 }, { "epoch": 10.214518760195759, "grad_norm": 0.9249472618103027, "learning_rate": 2.8417288014985545e-05, "loss": 0.1199, "num_input_tokens_seen": 135132528, "step": 62615 }, { "epoch": 10.215334420880913, "grad_norm": 0.2185947149991989, "learning_rate": 2.841376239512804e-05, "loss": 0.0167, "num_input_tokens_seen": 135142960, "step": 62620 }, { "epoch": 10.216150081566068, "grad_norm": 2.675691604614258, "learning_rate": 2.8410236706086312e-05, "loss": 0.0956, "num_input_tokens_seen": 135153328, "step": 62625 }, { "epoch": 10.216965742251224, "grad_norm": 0.1791418194770813, "learning_rate": 2.8406710947931814e-05, "loss": 0.1571, "num_input_tokens_seen": 135163504, "step": 62630 }, { "epoch": 10.217781402936378, "grad_norm": 0.2447957843542099, "learning_rate": 2.8403185120735993e-05, "loss": 0.0957, "num_input_tokens_seen": 135174672, "step": 62635 }, { "epoch": 10.218597063621534, "grad_norm": 0.07455362379550934, "learning_rate": 2.8399659224570308e-05, "loss": 0.1791, "num_input_tokens_seen": 135186512, "step": 62640 }, { "epoch": 10.219412724306688, "grad_norm": 0.4359457790851593, "learning_rate": 2.839613325950622e-05, "loss": 0.1215, "num_input_tokens_seen": 135195952, "step": 62645 }, { "epoch": 10.220228384991843, "grad_norm": 1.799918532371521, "learning_rate": 2.839260722561518e-05, "loss": 0.1004, "num_input_tokens_seen": 135206928, "step": 62650 }, { "epoch": 10.221044045676999, "grad_norm": 0.8056613802909851, "learning_rate": 2.8389081122968652e-05, "loss": 0.1903, "num_input_tokens_seen": 135217936, "step": 62655 }, { "epoch": 10.221859706362153, "grad_norm": 0.5210303068161011, "learning_rate": 2.8385554951638093e-05, "loss": 0.0447, "num_input_tokens_seen": 135228656, "step": 62660 }, { "epoch": 10.222675367047309, "grad_norm": 2.1133458614349365, "learning_rate": 2.8382028711694975e-05, "loss": 0.115, "num_input_tokens_seen": 135239024, "step": 62665 }, { "epoch": 10.223491027732463, "grad_norm": 0.7835373282432556, "learning_rate": 2.837850240321075e-05, "loss": 0.046, "num_input_tokens_seen": 135250864, "step": 62670 }, { "epoch": 10.224306688417618, "grad_norm": 0.11709008365869522, "learning_rate": 2.8374976026256883e-05, "loss": 0.0139, "num_input_tokens_seen": 135261840, "step": 62675 }, { "epoch": 10.225122349102774, "grad_norm": 0.05231776088476181, "learning_rate": 2.8371449580904853e-05, "loss": 0.0102, "num_input_tokens_seen": 135272912, "step": 62680 }, { "epoch": 10.225938009787928, "grad_norm": 0.10129852592945099, "learning_rate": 2.8367923067226114e-05, "loss": 0.0847, "num_input_tokens_seen": 135283120, "step": 62685 }, { "epoch": 10.226753670473084, "grad_norm": 2.891920328140259, "learning_rate": 2.836439648529215e-05, "loss": 0.1964, "num_input_tokens_seen": 135294032, "step": 62690 }, { "epoch": 10.227569331158238, "grad_norm": 0.19186048209667206, "learning_rate": 2.8360869835174415e-05, "loss": 0.0423, "num_input_tokens_seen": 135304528, "step": 62695 }, { "epoch": 10.228384991843393, "grad_norm": 0.5088971853256226, "learning_rate": 2.835734311694439e-05, "loss": 0.0226, "num_input_tokens_seen": 135315408, "step": 62700 }, { "epoch": 10.229200652528547, "grad_norm": 1.4306397438049316, "learning_rate": 2.8353816330673548e-05, "loss": 0.1561, "num_input_tokens_seen": 135326992, "step": 62705 }, { "epoch": 10.230016313213703, "grad_norm": 1.7875031232833862, "learning_rate": 2.8350289476433368e-05, "loss": 0.1934, "num_input_tokens_seen": 135337296, "step": 62710 }, { "epoch": 10.230831973898859, "grad_norm": 0.41515690088272095, "learning_rate": 2.8346762554295315e-05, "loss": 0.0524, "num_input_tokens_seen": 135348400, "step": 62715 }, { "epoch": 10.231647634584013, "grad_norm": 1.1667581796646118, "learning_rate": 2.834323556433088e-05, "loss": 0.0517, "num_input_tokens_seen": 135359248, "step": 62720 }, { "epoch": 10.232463295269168, "grad_norm": 0.8811501264572144, "learning_rate": 2.833970850661152e-05, "loss": 0.046, "num_input_tokens_seen": 135370352, "step": 62725 }, { "epoch": 10.233278955954322, "grad_norm": 0.24590076506137848, "learning_rate": 2.833618138120875e-05, "loss": 0.0127, "num_input_tokens_seen": 135381392, "step": 62730 }, { "epoch": 10.234094616639478, "grad_norm": 0.17191562056541443, "learning_rate": 2.8332654188194026e-05, "loss": 0.0497, "num_input_tokens_seen": 135392016, "step": 62735 }, { "epoch": 10.234910277324634, "grad_norm": 0.656214714050293, "learning_rate": 2.8329126927638833e-05, "loss": 0.1543, "num_input_tokens_seen": 135404688, "step": 62740 }, { "epoch": 10.235725938009788, "grad_norm": 0.0971250981092453, "learning_rate": 2.832559959961466e-05, "loss": 0.1642, "num_input_tokens_seen": 135416240, "step": 62745 }, { "epoch": 10.236541598694943, "grad_norm": 1.8259592056274414, "learning_rate": 2.8322072204192994e-05, "loss": 0.1548, "num_input_tokens_seen": 135427824, "step": 62750 }, { "epoch": 10.237357259380097, "grad_norm": 1.1188851594924927, "learning_rate": 2.8318544741445324e-05, "loss": 0.0994, "num_input_tokens_seen": 135438960, "step": 62755 }, { "epoch": 10.238172920065253, "grad_norm": 0.08510475605726242, "learning_rate": 2.831501721144313e-05, "loss": 0.0699, "num_input_tokens_seen": 135449360, "step": 62760 }, { "epoch": 10.238988580750409, "grad_norm": 0.057741519063711166, "learning_rate": 2.8311489614257914e-05, "loss": 0.1307, "num_input_tokens_seen": 135461136, "step": 62765 }, { "epoch": 10.239804241435563, "grad_norm": 0.1689615547657013, "learning_rate": 2.8307961949961153e-05, "loss": 0.0437, "num_input_tokens_seen": 135472752, "step": 62770 }, { "epoch": 10.240619902120718, "grad_norm": 0.09923780709505081, "learning_rate": 2.830443421862435e-05, "loss": 0.0629, "num_input_tokens_seen": 135482704, "step": 62775 }, { "epoch": 10.241435562805872, "grad_norm": 0.8418463468551636, "learning_rate": 2.830090642031899e-05, "loss": 0.0559, "num_input_tokens_seen": 135493904, "step": 62780 }, { "epoch": 10.242251223491028, "grad_norm": 1.502518892288208, "learning_rate": 2.8297378555116582e-05, "loss": 0.0527, "num_input_tokens_seen": 135504496, "step": 62785 }, { "epoch": 10.243066884176184, "grad_norm": 0.04131314903497696, "learning_rate": 2.8293850623088607e-05, "loss": 0.0319, "num_input_tokens_seen": 135516144, "step": 62790 }, { "epoch": 10.243882544861338, "grad_norm": 0.11347679793834686, "learning_rate": 2.8290322624306575e-05, "loss": 0.0797, "num_input_tokens_seen": 135526800, "step": 62795 }, { "epoch": 10.244698205546493, "grad_norm": 0.14239123463630676, "learning_rate": 2.828679455884198e-05, "loss": 0.0524, "num_input_tokens_seen": 135538320, "step": 62800 }, { "epoch": 10.245513866231647, "grad_norm": 0.41203147172927856, "learning_rate": 2.8283266426766327e-05, "loss": 0.0189, "num_input_tokens_seen": 135549552, "step": 62805 }, { "epoch": 10.246329526916803, "grad_norm": 0.04041684418916702, "learning_rate": 2.827973822815111e-05, "loss": 0.1985, "num_input_tokens_seen": 135559504, "step": 62810 }, { "epoch": 10.247145187601957, "grad_norm": 0.13667955994606018, "learning_rate": 2.8276209963067835e-05, "loss": 0.1595, "num_input_tokens_seen": 135570096, "step": 62815 }, { "epoch": 10.247960848287113, "grad_norm": 1.6738642454147339, "learning_rate": 2.827268163158801e-05, "loss": 0.1977, "num_input_tokens_seen": 135581360, "step": 62820 }, { "epoch": 10.248776508972268, "grad_norm": 0.05571011081337929, "learning_rate": 2.826915323378314e-05, "loss": 0.0669, "num_input_tokens_seen": 135592656, "step": 62825 }, { "epoch": 10.249592169657422, "grad_norm": 1.475470781326294, "learning_rate": 2.8265624769724736e-05, "loss": 0.3276, "num_input_tokens_seen": 135603312, "step": 62830 }, { "epoch": 10.250407830342578, "grad_norm": 0.0541076697409153, "learning_rate": 2.8262096239484303e-05, "loss": 0.0059, "num_input_tokens_seen": 135614416, "step": 62835 }, { "epoch": 10.251223491027732, "grad_norm": 1.579238772392273, "learning_rate": 2.825856764313335e-05, "loss": 0.2264, "num_input_tokens_seen": 135626384, "step": 62840 }, { "epoch": 10.252039151712887, "grad_norm": 1.4081025123596191, "learning_rate": 2.8255038980743393e-05, "loss": 0.1055, "num_input_tokens_seen": 135636848, "step": 62845 }, { "epoch": 10.252854812398043, "grad_norm": 0.05745512619614601, "learning_rate": 2.8251510252385938e-05, "loss": 0.0124, "num_input_tokens_seen": 135646320, "step": 62850 }, { "epoch": 10.253670473083197, "grad_norm": 0.1911686360836029, "learning_rate": 2.8247981458132504e-05, "loss": 0.1275, "num_input_tokens_seen": 135659056, "step": 62855 }, { "epoch": 10.254486133768353, "grad_norm": 0.9253178238868713, "learning_rate": 2.8244452598054605e-05, "loss": 0.2662, "num_input_tokens_seen": 135670416, "step": 62860 }, { "epoch": 10.255301794453507, "grad_norm": 0.026572315022349358, "learning_rate": 2.8240923672223764e-05, "loss": 0.0278, "num_input_tokens_seen": 135680592, "step": 62865 }, { "epoch": 10.256117455138662, "grad_norm": 0.3141268789768219, "learning_rate": 2.823739468071148e-05, "loss": 0.1327, "num_input_tokens_seen": 135691408, "step": 62870 }, { "epoch": 10.256933115823816, "grad_norm": 0.23826342821121216, "learning_rate": 2.8233865623589296e-05, "loss": 0.1888, "num_input_tokens_seen": 135703600, "step": 62875 }, { "epoch": 10.257748776508972, "grad_norm": 0.5059893727302551, "learning_rate": 2.8230336500928724e-05, "loss": 0.1133, "num_input_tokens_seen": 135713168, "step": 62880 }, { "epoch": 10.258564437194128, "grad_norm": 0.07666651904582977, "learning_rate": 2.822680731280128e-05, "loss": 0.0873, "num_input_tokens_seen": 135724688, "step": 62885 }, { "epoch": 10.259380097879282, "grad_norm": 0.4649641215801239, "learning_rate": 2.822327805927849e-05, "loss": 0.1315, "num_input_tokens_seen": 135734736, "step": 62890 }, { "epoch": 10.260195758564437, "grad_norm": 0.06628760695457458, "learning_rate": 2.821974874043189e-05, "loss": 0.0515, "num_input_tokens_seen": 135745328, "step": 62895 }, { "epoch": 10.261011419249591, "grad_norm": 0.833749532699585, "learning_rate": 2.821621935633299e-05, "loss": 0.0681, "num_input_tokens_seen": 135756560, "step": 62900 }, { "epoch": 10.261827079934747, "grad_norm": 1.8822298049926758, "learning_rate": 2.8212689907053324e-05, "loss": 0.2361, "num_input_tokens_seen": 135767216, "step": 62905 }, { "epoch": 10.262642740619903, "grad_norm": 0.04732232913374901, "learning_rate": 2.8209160392664428e-05, "loss": 0.0791, "num_input_tokens_seen": 135777744, "step": 62910 }, { "epoch": 10.263458401305057, "grad_norm": 0.0428818054497242, "learning_rate": 2.8205630813237822e-05, "loss": 0.0297, "num_input_tokens_seen": 135788592, "step": 62915 }, { "epoch": 10.264274061990212, "grad_norm": 0.12032200396060944, "learning_rate": 2.8202101168845042e-05, "loss": 0.0947, "num_input_tokens_seen": 135800272, "step": 62920 }, { "epoch": 10.265089722675366, "grad_norm": 0.03804730623960495, "learning_rate": 2.8198571459557623e-05, "loss": 0.0527, "num_input_tokens_seen": 135811472, "step": 62925 }, { "epoch": 10.265905383360522, "grad_norm": 0.049756694585084915, "learning_rate": 2.8195041685447083e-05, "loss": 0.174, "num_input_tokens_seen": 135821424, "step": 62930 }, { "epoch": 10.266721044045678, "grad_norm": 0.3085690140724182, "learning_rate": 2.8191511846584978e-05, "loss": 0.1843, "num_input_tokens_seen": 135830768, "step": 62935 }, { "epoch": 10.267536704730832, "grad_norm": 1.915797472000122, "learning_rate": 2.8187981943042834e-05, "loss": 0.094, "num_input_tokens_seen": 135841008, "step": 62940 }, { "epoch": 10.268352365415987, "grad_norm": 0.0780099406838417, "learning_rate": 2.8184451974892195e-05, "loss": 0.0239, "num_input_tokens_seen": 135851024, "step": 62945 }, { "epoch": 10.269168026101141, "grad_norm": 0.3430262804031372, "learning_rate": 2.8180921942204597e-05, "loss": 0.0233, "num_input_tokens_seen": 135862096, "step": 62950 }, { "epoch": 10.269983686786297, "grad_norm": 0.2036145180463791, "learning_rate": 2.8177391845051582e-05, "loss": 0.112, "num_input_tokens_seen": 135873360, "step": 62955 }, { "epoch": 10.270799347471453, "grad_norm": 0.6197050213813782, "learning_rate": 2.8173861683504687e-05, "loss": 0.1183, "num_input_tokens_seen": 135884112, "step": 62960 }, { "epoch": 10.271615008156607, "grad_norm": 0.0517629012465477, "learning_rate": 2.8170331457635464e-05, "loss": 0.0284, "num_input_tokens_seen": 135895152, "step": 62965 }, { "epoch": 10.272430668841762, "grad_norm": 0.0489802286028862, "learning_rate": 2.8166801167515448e-05, "loss": 0.0247, "num_input_tokens_seen": 135906832, "step": 62970 }, { "epoch": 10.273246329526916, "grad_norm": 0.6945403814315796, "learning_rate": 2.816327081321619e-05, "loss": 0.0199, "num_input_tokens_seen": 135917488, "step": 62975 }, { "epoch": 10.274061990212072, "grad_norm": 0.057651419192552567, "learning_rate": 2.8159740394809236e-05, "loss": 0.0842, "num_input_tokens_seen": 135928080, "step": 62980 }, { "epoch": 10.274877650897226, "grad_norm": 1.5553373098373413, "learning_rate": 2.815620991236613e-05, "loss": 0.178, "num_input_tokens_seen": 135937936, "step": 62985 }, { "epoch": 10.275693311582382, "grad_norm": 0.541330873966217, "learning_rate": 2.8152679365958435e-05, "loss": 0.1128, "num_input_tokens_seen": 135948272, "step": 62990 }, { "epoch": 10.276508972267537, "grad_norm": 1.4238742589950562, "learning_rate": 2.814914875565769e-05, "loss": 0.1303, "num_input_tokens_seen": 135958384, "step": 62995 }, { "epoch": 10.277324632952691, "grad_norm": 2.5719997882843018, "learning_rate": 2.8145618081535445e-05, "loss": 0.1718, "num_input_tokens_seen": 135968464, "step": 63000 }, { "epoch": 10.278140293637847, "grad_norm": 0.44209784269332886, "learning_rate": 2.8142087343663266e-05, "loss": 0.0282, "num_input_tokens_seen": 135979216, "step": 63005 }, { "epoch": 10.278955954323001, "grad_norm": 0.2877276837825775, "learning_rate": 2.81385565421127e-05, "loss": 0.0903, "num_input_tokens_seen": 135989584, "step": 63010 }, { "epoch": 10.279771615008157, "grad_norm": 0.6210121512413025, "learning_rate": 2.8135025676955297e-05, "loss": 0.2621, "num_input_tokens_seen": 136002384, "step": 63015 }, { "epoch": 10.280587275693312, "grad_norm": 1.4048937559127808, "learning_rate": 2.813149474826262e-05, "loss": 0.1639, "num_input_tokens_seen": 136012912, "step": 63020 }, { "epoch": 10.281402936378466, "grad_norm": 0.30874329805374146, "learning_rate": 2.8127963756106242e-05, "loss": 0.0351, "num_input_tokens_seen": 136023120, "step": 63025 }, { "epoch": 10.282218597063622, "grad_norm": 0.3137783408164978, "learning_rate": 2.81244327005577e-05, "loss": 0.0422, "num_input_tokens_seen": 136034384, "step": 63030 }, { "epoch": 10.283034257748776, "grad_norm": 1.8718615770339966, "learning_rate": 2.8120901581688574e-05, "loss": 0.1058, "num_input_tokens_seen": 136045136, "step": 63035 }, { "epoch": 10.283849918433932, "grad_norm": 0.05569479241967201, "learning_rate": 2.811737039957041e-05, "loss": 0.1841, "num_input_tokens_seen": 136057744, "step": 63040 }, { "epoch": 10.284665579119087, "grad_norm": 0.031308613717556, "learning_rate": 2.8113839154274785e-05, "loss": 0.1126, "num_input_tokens_seen": 136068624, "step": 63045 }, { "epoch": 10.285481239804241, "grad_norm": 0.25623947381973267, "learning_rate": 2.8110307845873258e-05, "loss": 0.1736, "num_input_tokens_seen": 136080048, "step": 63050 }, { "epoch": 10.286296900489397, "grad_norm": 0.937122106552124, "learning_rate": 2.8106776474437395e-05, "loss": 0.0561, "num_input_tokens_seen": 136090928, "step": 63055 }, { "epoch": 10.28711256117455, "grad_norm": 0.15082822740077972, "learning_rate": 2.8103245040038772e-05, "loss": 0.1855, "num_input_tokens_seen": 136101168, "step": 63060 }, { "epoch": 10.287928221859707, "grad_norm": 1.4731899499893188, "learning_rate": 2.8099713542748945e-05, "loss": 0.2774, "num_input_tokens_seen": 136111824, "step": 63065 }, { "epoch": 10.28874388254486, "grad_norm": 1.8754608631134033, "learning_rate": 2.8096181982639495e-05, "loss": 0.0839, "num_input_tokens_seen": 136122000, "step": 63070 }, { "epoch": 10.289559543230016, "grad_norm": 0.43900537490844727, "learning_rate": 2.8092650359781987e-05, "loss": 0.0899, "num_input_tokens_seen": 136132016, "step": 63075 }, { "epoch": 10.290375203915172, "grad_norm": 0.33327946066856384, "learning_rate": 2.8089118674248e-05, "loss": 0.3201, "num_input_tokens_seen": 136142832, "step": 63080 }, { "epoch": 10.291190864600326, "grad_norm": 0.05540071800351143, "learning_rate": 2.8085586926109096e-05, "loss": 0.1355, "num_input_tokens_seen": 136152752, "step": 63085 }, { "epoch": 10.292006525285482, "grad_norm": 0.3535749316215515, "learning_rate": 2.8082055115436866e-05, "loss": 0.0572, "num_input_tokens_seen": 136163248, "step": 63090 }, { "epoch": 10.292822185970635, "grad_norm": 0.3877638280391693, "learning_rate": 2.807852324230288e-05, "loss": 0.1528, "num_input_tokens_seen": 136173744, "step": 63095 }, { "epoch": 10.293637846655791, "grad_norm": 1.1417895555496216, "learning_rate": 2.8074991306778708e-05, "loss": 0.2174, "num_input_tokens_seen": 136183440, "step": 63100 }, { "epoch": 10.294453507340947, "grad_norm": 0.21495456993579865, "learning_rate": 2.8071459308935943e-05, "loss": 0.0305, "num_input_tokens_seen": 136195056, "step": 63105 }, { "epoch": 10.2952691680261, "grad_norm": 1.6011767387390137, "learning_rate": 2.806792724884616e-05, "loss": 0.0409, "num_input_tokens_seen": 136204880, "step": 63110 }, { "epoch": 10.296084828711257, "grad_norm": 0.057811833918094635, "learning_rate": 2.8064395126580934e-05, "loss": 0.0861, "num_input_tokens_seen": 136215568, "step": 63115 }, { "epoch": 10.29690048939641, "grad_norm": 0.5005304217338562, "learning_rate": 2.806086294221185e-05, "loss": 0.0238, "num_input_tokens_seen": 136226896, "step": 63120 }, { "epoch": 10.297716150081566, "grad_norm": 0.03077128157019615, "learning_rate": 2.80573306958105e-05, "loss": 0.0076, "num_input_tokens_seen": 136239024, "step": 63125 }, { "epoch": 10.298531810766722, "grad_norm": 0.8513948321342468, "learning_rate": 2.8053798387448473e-05, "loss": 0.0812, "num_input_tokens_seen": 136249520, "step": 63130 }, { "epoch": 10.299347471451876, "grad_norm": 0.3776242733001709, "learning_rate": 2.8050266017197335e-05, "loss": 0.0662, "num_input_tokens_seen": 136260688, "step": 63135 }, { "epoch": 10.300163132137031, "grad_norm": 2.0177292823791504, "learning_rate": 2.8046733585128687e-05, "loss": 0.1023, "num_input_tokens_seen": 136270608, "step": 63140 }, { "epoch": 10.300978792822185, "grad_norm": 0.1760854721069336, "learning_rate": 2.8043201091314125e-05, "loss": 0.0765, "num_input_tokens_seen": 136280656, "step": 63145 }, { "epoch": 10.301794453507341, "grad_norm": 0.07264995574951172, "learning_rate": 2.803966853582523e-05, "loss": 0.0668, "num_input_tokens_seen": 136289904, "step": 63150 }, { "epoch": 10.302610114192497, "grad_norm": 1.546987533569336, "learning_rate": 2.80361359187336e-05, "loss": 0.1665, "num_input_tokens_seen": 136300752, "step": 63155 }, { "epoch": 10.30342577487765, "grad_norm": 0.038732387125492096, "learning_rate": 2.8032603240110816e-05, "loss": 0.0592, "num_input_tokens_seen": 136311600, "step": 63160 }, { "epoch": 10.304241435562806, "grad_norm": 0.29763853549957275, "learning_rate": 2.8029070500028482e-05, "loss": 0.0577, "num_input_tokens_seen": 136323280, "step": 63165 }, { "epoch": 10.30505709624796, "grad_norm": 0.14257515966892242, "learning_rate": 2.8025537698558198e-05, "loss": 0.0406, "num_input_tokens_seen": 136335056, "step": 63170 }, { "epoch": 10.305872756933116, "grad_norm": 0.6686140298843384, "learning_rate": 2.8022004835771547e-05, "loss": 0.0544, "num_input_tokens_seen": 136346256, "step": 63175 }, { "epoch": 10.30668841761827, "grad_norm": 0.062158264219760895, "learning_rate": 2.8018471911740135e-05, "loss": 0.1162, "num_input_tokens_seen": 136357424, "step": 63180 }, { "epoch": 10.307504078303426, "grad_norm": 0.03822684660553932, "learning_rate": 2.801493892653556e-05, "loss": 0.0341, "num_input_tokens_seen": 136367664, "step": 63185 }, { "epoch": 10.308319738988581, "grad_norm": 0.3964165449142456, "learning_rate": 2.8011405880229423e-05, "loss": 0.2158, "num_input_tokens_seen": 136378352, "step": 63190 }, { "epoch": 10.309135399673735, "grad_norm": 1.5017658472061157, "learning_rate": 2.800787277289333e-05, "loss": 0.2525, "num_input_tokens_seen": 136389200, "step": 63195 }, { "epoch": 10.309951060358891, "grad_norm": 0.26827698945999146, "learning_rate": 2.800433960459888e-05, "loss": 0.15, "num_input_tokens_seen": 136399504, "step": 63200 }, { "epoch": 10.310766721044045, "grad_norm": 0.9643372893333435, "learning_rate": 2.8000806375417675e-05, "loss": 0.0223, "num_input_tokens_seen": 136411536, "step": 63205 }, { "epoch": 10.3115823817292, "grad_norm": 1.5141806602478027, "learning_rate": 2.799727308542132e-05, "loss": 0.1483, "num_input_tokens_seen": 136422224, "step": 63210 }, { "epoch": 10.312398042414356, "grad_norm": 0.5103439688682556, "learning_rate": 2.799373973468143e-05, "loss": 0.0299, "num_input_tokens_seen": 136434000, "step": 63215 }, { "epoch": 10.31321370309951, "grad_norm": 0.1140860840678215, "learning_rate": 2.7990206323269597e-05, "loss": 0.0585, "num_input_tokens_seen": 136445264, "step": 63220 }, { "epoch": 10.314029363784666, "grad_norm": 0.1691335141658783, "learning_rate": 2.7986672851257444e-05, "loss": 0.1284, "num_input_tokens_seen": 136455824, "step": 63225 }, { "epoch": 10.31484502446982, "grad_norm": 1.2552192211151123, "learning_rate": 2.7983139318716577e-05, "loss": 0.2144, "num_input_tokens_seen": 136466384, "step": 63230 }, { "epoch": 10.315660685154976, "grad_norm": 0.1443212330341339, "learning_rate": 2.797960572571861e-05, "loss": 0.0772, "num_input_tokens_seen": 136477104, "step": 63235 }, { "epoch": 10.31647634584013, "grad_norm": 0.06584783643484116, "learning_rate": 2.7976072072335148e-05, "loss": 0.1567, "num_input_tokens_seen": 136486768, "step": 63240 }, { "epoch": 10.317292006525285, "grad_norm": 5.279452800750732, "learning_rate": 2.7972538358637813e-05, "loss": 0.205, "num_input_tokens_seen": 136496816, "step": 63245 }, { "epoch": 10.318107667210441, "grad_norm": 0.6693039536476135, "learning_rate": 2.796900458469821e-05, "loss": 0.0317, "num_input_tokens_seen": 136505904, "step": 63250 }, { "epoch": 10.318923327895595, "grad_norm": 0.8998551368713379, "learning_rate": 2.7965470750587973e-05, "loss": 0.0706, "num_input_tokens_seen": 136515696, "step": 63255 }, { "epoch": 10.31973898858075, "grad_norm": 0.23797936737537384, "learning_rate": 2.7961936856378708e-05, "loss": 0.0127, "num_input_tokens_seen": 136526256, "step": 63260 }, { "epoch": 10.320554649265905, "grad_norm": 1.5920865535736084, "learning_rate": 2.7958402902142028e-05, "loss": 0.0773, "num_input_tokens_seen": 136538352, "step": 63265 }, { "epoch": 10.32137030995106, "grad_norm": 0.449808806180954, "learning_rate": 2.7954868887949563e-05, "loss": 0.1466, "num_input_tokens_seen": 136549616, "step": 63270 }, { "epoch": 10.322185970636216, "grad_norm": 0.02041589841246605, "learning_rate": 2.7951334813872936e-05, "loss": 0.1102, "num_input_tokens_seen": 136560272, "step": 63275 }, { "epoch": 10.32300163132137, "grad_norm": 2.4857261180877686, "learning_rate": 2.794780067998376e-05, "loss": 0.1867, "num_input_tokens_seen": 136571248, "step": 63280 }, { "epoch": 10.323817292006526, "grad_norm": 5.407399654388428, "learning_rate": 2.794426648635367e-05, "loss": 0.1616, "num_input_tokens_seen": 136580624, "step": 63285 }, { "epoch": 10.32463295269168, "grad_norm": 0.19962340593338013, "learning_rate": 2.7940732233054277e-05, "loss": 0.1259, "num_input_tokens_seen": 136591696, "step": 63290 }, { "epoch": 10.325448613376835, "grad_norm": 0.3584398031234741, "learning_rate": 2.793719792015722e-05, "loss": 0.0574, "num_input_tokens_seen": 136602800, "step": 63295 }, { "epoch": 10.326264274061991, "grad_norm": 1.0066214799880981, "learning_rate": 2.7933663547734118e-05, "loss": 0.4636, "num_input_tokens_seen": 136613808, "step": 63300 }, { "epoch": 10.327079934747145, "grad_norm": 1.2264569997787476, "learning_rate": 2.7930129115856606e-05, "loss": 0.1422, "num_input_tokens_seen": 136623600, "step": 63305 }, { "epoch": 10.3278955954323, "grad_norm": 0.054742034524679184, "learning_rate": 2.792659462459631e-05, "loss": 0.0418, "num_input_tokens_seen": 136635088, "step": 63310 }, { "epoch": 10.328711256117455, "grad_norm": 0.04102171212434769, "learning_rate": 2.792306007402486e-05, "loss": 0.0525, "num_input_tokens_seen": 136644656, "step": 63315 }, { "epoch": 10.32952691680261, "grad_norm": 0.8904616832733154, "learning_rate": 2.7919525464213887e-05, "loss": 0.0882, "num_input_tokens_seen": 136656336, "step": 63320 }, { "epoch": 10.330342577487766, "grad_norm": 0.5594757795333862, "learning_rate": 2.7915990795235027e-05, "loss": 0.028, "num_input_tokens_seen": 136667536, "step": 63325 }, { "epoch": 10.33115823817292, "grad_norm": 0.9036750793457031, "learning_rate": 2.7912456067159915e-05, "loss": 0.1195, "num_input_tokens_seen": 136678384, "step": 63330 }, { "epoch": 10.331973898858076, "grad_norm": 0.45936548709869385, "learning_rate": 2.7908921280060186e-05, "loss": 0.231, "num_input_tokens_seen": 136687920, "step": 63335 }, { "epoch": 10.33278955954323, "grad_norm": 0.03646567091345787, "learning_rate": 2.7905386434007475e-05, "loss": 0.1057, "num_input_tokens_seen": 136698224, "step": 63340 }, { "epoch": 10.333605220228385, "grad_norm": 0.12636442482471466, "learning_rate": 2.7901851529073432e-05, "loss": 0.145, "num_input_tokens_seen": 136707984, "step": 63345 }, { "epoch": 10.33442088091354, "grad_norm": 0.48287254571914673, "learning_rate": 2.7898316565329675e-05, "loss": 0.0308, "num_input_tokens_seen": 136719984, "step": 63350 }, { "epoch": 10.335236541598695, "grad_norm": 0.3334784507751465, "learning_rate": 2.7894781542847858e-05, "loss": 0.2299, "num_input_tokens_seen": 136730928, "step": 63355 }, { "epoch": 10.33605220228385, "grad_norm": 0.018958941102027893, "learning_rate": 2.7891246461699622e-05, "loss": 0.1398, "num_input_tokens_seen": 136743504, "step": 63360 }, { "epoch": 10.336867862969005, "grad_norm": 0.28603237867355347, "learning_rate": 2.788771132195661e-05, "loss": 0.0478, "num_input_tokens_seen": 136754096, "step": 63365 }, { "epoch": 10.33768352365416, "grad_norm": 0.5727444291114807, "learning_rate": 2.7884176123690464e-05, "loss": 0.0341, "num_input_tokens_seen": 136764304, "step": 63370 }, { "epoch": 10.338499184339314, "grad_norm": 0.2244875580072403, "learning_rate": 2.7880640866972836e-05, "loss": 0.1126, "num_input_tokens_seen": 136776368, "step": 63375 }, { "epoch": 10.33931484502447, "grad_norm": 0.4746112823486328, "learning_rate": 2.787710555187536e-05, "loss": 0.3179, "num_input_tokens_seen": 136786896, "step": 63380 }, { "epoch": 10.340130505709626, "grad_norm": 0.29020261764526367, "learning_rate": 2.7873570178469688e-05, "loss": 0.0315, "num_input_tokens_seen": 136797136, "step": 63385 }, { "epoch": 10.34094616639478, "grad_norm": 0.19023963809013367, "learning_rate": 2.7870034746827478e-05, "loss": 0.1065, "num_input_tokens_seen": 136806320, "step": 63390 }, { "epoch": 10.341761827079935, "grad_norm": 0.6282076835632324, "learning_rate": 2.7866499257020366e-05, "loss": 0.0358, "num_input_tokens_seen": 136815536, "step": 63395 }, { "epoch": 10.34257748776509, "grad_norm": 0.5177266597747803, "learning_rate": 2.7862963709120015e-05, "loss": 0.0503, "num_input_tokens_seen": 136826000, "step": 63400 }, { "epoch": 10.343393148450245, "grad_norm": 0.12869571149349213, "learning_rate": 2.7859428103198072e-05, "loss": 0.129, "num_input_tokens_seen": 136837360, "step": 63405 }, { "epoch": 10.3442088091354, "grad_norm": 1.399314284324646, "learning_rate": 2.785589243932619e-05, "loss": 0.0739, "num_input_tokens_seen": 136847152, "step": 63410 }, { "epoch": 10.345024469820554, "grad_norm": 2.173861026763916, "learning_rate": 2.7852356717576027e-05, "loss": 0.102, "num_input_tokens_seen": 136857040, "step": 63415 }, { "epoch": 10.34584013050571, "grad_norm": 1.2797385454177856, "learning_rate": 2.7848820938019232e-05, "loss": 0.3301, "num_input_tokens_seen": 136868272, "step": 63420 }, { "epoch": 10.346655791190864, "grad_norm": 0.6556531190872192, "learning_rate": 2.784528510072747e-05, "loss": 0.1441, "num_input_tokens_seen": 136879920, "step": 63425 }, { "epoch": 10.34747145187602, "grad_norm": 0.07182838022708893, "learning_rate": 2.7841749205772393e-05, "loss": 0.0687, "num_input_tokens_seen": 136890576, "step": 63430 }, { "epoch": 10.348287112561174, "grad_norm": 0.06417360156774521, "learning_rate": 2.783821325322567e-05, "loss": 0.1211, "num_input_tokens_seen": 136900656, "step": 63435 }, { "epoch": 10.34910277324633, "grad_norm": 0.12210450321435928, "learning_rate": 2.783467724315895e-05, "loss": 0.0273, "num_input_tokens_seen": 136910928, "step": 63440 }, { "epoch": 10.349918433931485, "grad_norm": 0.06413044780492783, "learning_rate": 2.78311411756439e-05, "loss": 0.0278, "num_input_tokens_seen": 136921648, "step": 63445 }, { "epoch": 10.350734094616639, "grad_norm": 0.8441827297210693, "learning_rate": 2.7827605050752183e-05, "loss": 0.0625, "num_input_tokens_seen": 136932592, "step": 63450 }, { "epoch": 10.351549755301795, "grad_norm": 0.20686963200569153, "learning_rate": 2.7824068868555464e-05, "loss": 0.115, "num_input_tokens_seen": 136942576, "step": 63455 }, { "epoch": 10.352365415986949, "grad_norm": 0.11032546311616898, "learning_rate": 2.7820532629125397e-05, "loss": 0.2346, "num_input_tokens_seen": 136954032, "step": 63460 }, { "epoch": 10.353181076672104, "grad_norm": 0.09296128153800964, "learning_rate": 2.7816996332533667e-05, "loss": 0.1919, "num_input_tokens_seen": 136965616, "step": 63465 }, { "epoch": 10.35399673735726, "grad_norm": 0.10763566941022873, "learning_rate": 2.781345997885193e-05, "loss": 0.1067, "num_input_tokens_seen": 136975088, "step": 63470 }, { "epoch": 10.354812398042414, "grad_norm": 0.23632998764514923, "learning_rate": 2.7809923568151865e-05, "loss": 0.0813, "num_input_tokens_seen": 136984592, "step": 63475 }, { "epoch": 10.35562805872757, "grad_norm": 0.15587754547595978, "learning_rate": 2.7806387100505128e-05, "loss": 0.0779, "num_input_tokens_seen": 136995344, "step": 63480 }, { "epoch": 10.356443719412724, "grad_norm": 0.6286761164665222, "learning_rate": 2.7802850575983398e-05, "loss": 0.0941, "num_input_tokens_seen": 137005264, "step": 63485 }, { "epoch": 10.35725938009788, "grad_norm": 0.4729039967060089, "learning_rate": 2.7799313994658337e-05, "loss": 0.1429, "num_input_tokens_seen": 137015376, "step": 63490 }, { "epoch": 10.358075040783035, "grad_norm": 0.0374348908662796, "learning_rate": 2.7795777356601633e-05, "loss": 0.1046, "num_input_tokens_seen": 137025744, "step": 63495 }, { "epoch": 10.358890701468189, "grad_norm": 0.191573828458786, "learning_rate": 2.7792240661884956e-05, "loss": 0.1378, "num_input_tokens_seen": 137037616, "step": 63500 }, { "epoch": 10.359706362153345, "grad_norm": 0.06129446625709534, "learning_rate": 2.7788703910579977e-05, "loss": 0.1436, "num_input_tokens_seen": 137046608, "step": 63505 }, { "epoch": 10.360522022838499, "grad_norm": 0.16392415761947632, "learning_rate": 2.778516710275838e-05, "loss": 0.0178, "num_input_tokens_seen": 137057712, "step": 63510 }, { "epoch": 10.361337683523654, "grad_norm": 2.1573705673217773, "learning_rate": 2.7781630238491834e-05, "loss": 0.2543, "num_input_tokens_seen": 137069232, "step": 63515 }, { "epoch": 10.362153344208808, "grad_norm": 1.5211403369903564, "learning_rate": 2.7778093317852022e-05, "loss": 0.1619, "num_input_tokens_seen": 137080240, "step": 63520 }, { "epoch": 10.362969004893964, "grad_norm": 0.7632777094841003, "learning_rate": 2.777455634091063e-05, "loss": 0.1794, "num_input_tokens_seen": 137090224, "step": 63525 }, { "epoch": 10.36378466557912, "grad_norm": 0.15429799258708954, "learning_rate": 2.7771019307739332e-05, "loss": 0.1345, "num_input_tokens_seen": 137101840, "step": 63530 }, { "epoch": 10.364600326264274, "grad_norm": 1.0153385400772095, "learning_rate": 2.7767482218409806e-05, "loss": 0.1114, "num_input_tokens_seen": 137112048, "step": 63535 }, { "epoch": 10.36541598694943, "grad_norm": 0.25800007581710815, "learning_rate": 2.7763945072993752e-05, "loss": 0.0369, "num_input_tokens_seen": 137122864, "step": 63540 }, { "epoch": 10.366231647634583, "grad_norm": 1.327656626701355, "learning_rate": 2.7760407871562837e-05, "loss": 0.0655, "num_input_tokens_seen": 137133424, "step": 63545 }, { "epoch": 10.367047308319739, "grad_norm": 0.7393860220909119, "learning_rate": 2.7756870614188763e-05, "loss": 0.0472, "num_input_tokens_seen": 137144848, "step": 63550 }, { "epoch": 10.367862969004895, "grad_norm": 0.34777987003326416, "learning_rate": 2.77533333009432e-05, "loss": 0.0929, "num_input_tokens_seen": 137154416, "step": 63555 }, { "epoch": 10.368678629690049, "grad_norm": 0.33973678946495056, "learning_rate": 2.774979593189785e-05, "loss": 0.1116, "num_input_tokens_seen": 137164400, "step": 63560 }, { "epoch": 10.369494290375204, "grad_norm": 0.5832465887069702, "learning_rate": 2.774625850712439e-05, "loss": 0.0837, "num_input_tokens_seen": 137174192, "step": 63565 }, { "epoch": 10.370309951060358, "grad_norm": 0.11797841638326645, "learning_rate": 2.7742721026694523e-05, "loss": 0.0343, "num_input_tokens_seen": 137184944, "step": 63570 }, { "epoch": 10.371125611745514, "grad_norm": 0.7692051529884338, "learning_rate": 2.7739183490679932e-05, "loss": 0.173, "num_input_tokens_seen": 137194640, "step": 63575 }, { "epoch": 10.37194127243067, "grad_norm": 0.05593869835138321, "learning_rate": 2.7735645899152317e-05, "loss": 0.0106, "num_input_tokens_seen": 137205552, "step": 63580 }, { "epoch": 10.372756933115824, "grad_norm": 1.5550775527954102, "learning_rate": 2.773210825218337e-05, "loss": 0.2911, "num_input_tokens_seen": 137217424, "step": 63585 }, { "epoch": 10.37357259380098, "grad_norm": 0.46926572918891907, "learning_rate": 2.772857054984478e-05, "loss": 0.0365, "num_input_tokens_seen": 137228528, "step": 63590 }, { "epoch": 10.374388254486133, "grad_norm": 0.07637907564640045, "learning_rate": 2.7725032792208246e-05, "loss": 0.0651, "num_input_tokens_seen": 137238640, "step": 63595 }, { "epoch": 10.375203915171289, "grad_norm": 0.5811027884483337, "learning_rate": 2.7721494979345463e-05, "loss": 0.0462, "num_input_tokens_seen": 137248624, "step": 63600 }, { "epoch": 10.376019575856443, "grad_norm": 0.14834749698638916, "learning_rate": 2.7717957111328136e-05, "loss": 0.0243, "num_input_tokens_seen": 137258960, "step": 63605 }, { "epoch": 10.376835236541599, "grad_norm": 0.5471009612083435, "learning_rate": 2.771441918822796e-05, "loss": 0.0776, "num_input_tokens_seen": 137269776, "step": 63610 }, { "epoch": 10.377650897226754, "grad_norm": 0.650677502155304, "learning_rate": 2.771088121011663e-05, "loss": 0.114, "num_input_tokens_seen": 137280720, "step": 63615 }, { "epoch": 10.378466557911908, "grad_norm": 0.5103994607925415, "learning_rate": 2.770734317706586e-05, "loss": 0.0632, "num_input_tokens_seen": 137291536, "step": 63620 }, { "epoch": 10.379282218597064, "grad_norm": 0.3430260717868805, "learning_rate": 2.770380508914734e-05, "loss": 0.0361, "num_input_tokens_seen": 137302288, "step": 63625 }, { "epoch": 10.380097879282218, "grad_norm": 1.4786148071289062, "learning_rate": 2.7700266946432785e-05, "loss": 0.0858, "num_input_tokens_seen": 137312688, "step": 63630 }, { "epoch": 10.380913539967374, "grad_norm": 1.6866123676300049, "learning_rate": 2.769672874899389e-05, "loss": 0.228, "num_input_tokens_seen": 137323120, "step": 63635 }, { "epoch": 10.38172920065253, "grad_norm": 0.6867523193359375, "learning_rate": 2.7693190496902365e-05, "loss": 0.0765, "num_input_tokens_seen": 137332912, "step": 63640 }, { "epoch": 10.382544861337683, "grad_norm": 0.05234695225954056, "learning_rate": 2.768965219022992e-05, "loss": 0.0945, "num_input_tokens_seen": 137343888, "step": 63645 }, { "epoch": 10.383360522022839, "grad_norm": 0.602100133895874, "learning_rate": 2.7686113829048267e-05, "loss": 0.1369, "num_input_tokens_seen": 137355824, "step": 63650 }, { "epoch": 10.384176182707993, "grad_norm": 0.067231684923172, "learning_rate": 2.76825754134291e-05, "loss": 0.0126, "num_input_tokens_seen": 137365008, "step": 63655 }, { "epoch": 10.384991843393149, "grad_norm": 0.06184197589755058, "learning_rate": 2.7679036943444147e-05, "loss": 0.064, "num_input_tokens_seen": 137376080, "step": 63660 }, { "epoch": 10.385807504078304, "grad_norm": 0.07439038902521133, "learning_rate": 2.7675498419165107e-05, "loss": 0.1276, "num_input_tokens_seen": 137386608, "step": 63665 }, { "epoch": 10.386623164763458, "grad_norm": 1.803285837173462, "learning_rate": 2.7671959840663703e-05, "loss": 0.2405, "num_input_tokens_seen": 137398640, "step": 63670 }, { "epoch": 10.387438825448614, "grad_norm": 0.6536682844161987, "learning_rate": 2.7668421208011636e-05, "loss": 0.2011, "num_input_tokens_seen": 137408016, "step": 63675 }, { "epoch": 10.388254486133768, "grad_norm": 0.06592869013547897, "learning_rate": 2.7664882521280634e-05, "loss": 0.1072, "num_input_tokens_seen": 137419696, "step": 63680 }, { "epoch": 10.389070146818923, "grad_norm": 1.32839834690094, "learning_rate": 2.7661343780542402e-05, "loss": 0.2472, "num_input_tokens_seen": 137431056, "step": 63685 }, { "epoch": 10.38988580750408, "grad_norm": 0.09280616044998169, "learning_rate": 2.765780498586867e-05, "loss": 0.0508, "num_input_tokens_seen": 137442096, "step": 63690 }, { "epoch": 10.390701468189233, "grad_norm": 0.09259229153394699, "learning_rate": 2.7654266137331143e-05, "loss": 0.0635, "num_input_tokens_seen": 137453616, "step": 63695 }, { "epoch": 10.391517128874389, "grad_norm": 0.13870497047901154, "learning_rate": 2.7650727235001546e-05, "loss": 0.1208, "num_input_tokens_seen": 137463952, "step": 63700 }, { "epoch": 10.392332789559543, "grad_norm": 0.17694422602653503, "learning_rate": 2.7647188278951598e-05, "loss": 0.0584, "num_input_tokens_seen": 137475056, "step": 63705 }, { "epoch": 10.393148450244698, "grad_norm": 0.015222077257931232, "learning_rate": 2.764364926925302e-05, "loss": 0.1633, "num_input_tokens_seen": 137484272, "step": 63710 }, { "epoch": 10.393964110929852, "grad_norm": 0.16566121578216553, "learning_rate": 2.764011020597754e-05, "loss": 0.0317, "num_input_tokens_seen": 137494224, "step": 63715 }, { "epoch": 10.394779771615008, "grad_norm": 0.2660936117172241, "learning_rate": 2.763657108919688e-05, "loss": 0.0847, "num_input_tokens_seen": 137505296, "step": 63720 }, { "epoch": 10.395595432300164, "grad_norm": 0.11800455302000046, "learning_rate": 2.7633031918982754e-05, "loss": 0.1654, "num_input_tokens_seen": 137515760, "step": 63725 }, { "epoch": 10.396411092985318, "grad_norm": 0.16757439076900482, "learning_rate": 2.76294926954069e-05, "loss": 0.1085, "num_input_tokens_seen": 137527280, "step": 63730 }, { "epoch": 10.397226753670473, "grad_norm": 0.0784962996840477, "learning_rate": 2.762595341854104e-05, "loss": 0.089, "num_input_tokens_seen": 137537520, "step": 63735 }, { "epoch": 10.398042414355627, "grad_norm": 0.09694980084896088, "learning_rate": 2.7622414088456904e-05, "loss": 0.1331, "num_input_tokens_seen": 137548432, "step": 63740 }, { "epoch": 10.398858075040783, "grad_norm": 0.576914370059967, "learning_rate": 2.761887470522622e-05, "loss": 0.0214, "num_input_tokens_seen": 137558736, "step": 63745 }, { "epoch": 10.399673735725939, "grad_norm": 0.06435941159725189, "learning_rate": 2.7615335268920718e-05, "loss": 0.053, "num_input_tokens_seen": 137569712, "step": 63750 }, { "epoch": 10.400489396411093, "grad_norm": 0.0520649291574955, "learning_rate": 2.7611795779612125e-05, "loss": 0.0708, "num_input_tokens_seen": 137580432, "step": 63755 }, { "epoch": 10.401305057096248, "grad_norm": 0.11230912059545517, "learning_rate": 2.7608256237372182e-05, "loss": 0.0143, "num_input_tokens_seen": 137591248, "step": 63760 }, { "epoch": 10.402120717781402, "grad_norm": 0.0836416706442833, "learning_rate": 2.7604716642272616e-05, "loss": 0.1026, "num_input_tokens_seen": 137602800, "step": 63765 }, { "epoch": 10.402936378466558, "grad_norm": 1.093072533607483, "learning_rate": 2.7601176994385164e-05, "loss": 0.28, "num_input_tokens_seen": 137614512, "step": 63770 }, { "epoch": 10.403752039151712, "grad_norm": 0.18050941824913025, "learning_rate": 2.759763729378156e-05, "loss": 0.1311, "num_input_tokens_seen": 137625008, "step": 63775 }, { "epoch": 10.404567699836868, "grad_norm": 0.6703826189041138, "learning_rate": 2.7594097540533532e-05, "loss": 0.2159, "num_input_tokens_seen": 137635792, "step": 63780 }, { "epoch": 10.405383360522023, "grad_norm": 0.5733332633972168, "learning_rate": 2.7590557734712837e-05, "loss": 0.0688, "num_input_tokens_seen": 137647120, "step": 63785 }, { "epoch": 10.406199021207177, "grad_norm": 0.025420991703867912, "learning_rate": 2.7587017876391196e-05, "loss": 0.1269, "num_input_tokens_seen": 137658416, "step": 63790 }, { "epoch": 10.407014681892333, "grad_norm": 0.3308872580528259, "learning_rate": 2.7583477965640364e-05, "loss": 0.2766, "num_input_tokens_seen": 137669232, "step": 63795 }, { "epoch": 10.407830342577487, "grad_norm": 0.08839782327413559, "learning_rate": 2.757993800253207e-05, "loss": 0.1127, "num_input_tokens_seen": 137681296, "step": 63800 }, { "epoch": 10.408646003262643, "grad_norm": 0.7300841808319092, "learning_rate": 2.757639798713806e-05, "loss": 0.0706, "num_input_tokens_seen": 137690864, "step": 63805 }, { "epoch": 10.409461663947798, "grad_norm": 1.2086281776428223, "learning_rate": 2.757285791953007e-05, "loss": 0.1507, "num_input_tokens_seen": 137701712, "step": 63810 }, { "epoch": 10.410277324632952, "grad_norm": 0.3314095735549927, "learning_rate": 2.7569317799779853e-05, "loss": 0.0991, "num_input_tokens_seen": 137713072, "step": 63815 }, { "epoch": 10.411092985318108, "grad_norm": 1.9656065702438354, "learning_rate": 2.7565777627959155e-05, "loss": 0.092, "num_input_tokens_seen": 137724368, "step": 63820 }, { "epoch": 10.411908646003262, "grad_norm": 0.17539648711681366, "learning_rate": 2.7562237404139724e-05, "loss": 0.0347, "num_input_tokens_seen": 137734864, "step": 63825 }, { "epoch": 10.412724306688418, "grad_norm": 0.48962298035621643, "learning_rate": 2.755869712839329e-05, "loss": 0.0274, "num_input_tokens_seen": 137747312, "step": 63830 }, { "epoch": 10.413539967373573, "grad_norm": 1.1284222602844238, "learning_rate": 2.755515680079162e-05, "loss": 0.0363, "num_input_tokens_seen": 137758192, "step": 63835 }, { "epoch": 10.414355628058727, "grad_norm": 0.04405913129448891, "learning_rate": 2.7551616421406456e-05, "loss": 0.0432, "num_input_tokens_seen": 137768880, "step": 63840 }, { "epoch": 10.415171288743883, "grad_norm": 0.7694706916809082, "learning_rate": 2.7548075990309546e-05, "loss": 0.0379, "num_input_tokens_seen": 137779728, "step": 63845 }, { "epoch": 10.415986949429037, "grad_norm": 0.06978357583284378, "learning_rate": 2.7544535507572645e-05, "loss": 0.0257, "num_input_tokens_seen": 137790800, "step": 63850 }, { "epoch": 10.416802610114193, "grad_norm": 2.6158761978149414, "learning_rate": 2.7540994973267504e-05, "loss": 0.225, "num_input_tokens_seen": 137802064, "step": 63855 }, { "epoch": 10.417618270799348, "grad_norm": 0.05079402029514313, "learning_rate": 2.7537454387465877e-05, "loss": 0.0241, "num_input_tokens_seen": 137811632, "step": 63860 }, { "epoch": 10.418433931484502, "grad_norm": 0.35666051506996155, "learning_rate": 2.7533913750239514e-05, "loss": 0.0775, "num_input_tokens_seen": 137822896, "step": 63865 }, { "epoch": 10.419249592169658, "grad_norm": 0.4942382276058197, "learning_rate": 2.7530373061660176e-05, "loss": 0.0447, "num_input_tokens_seen": 137834032, "step": 63870 }, { "epoch": 10.420065252854812, "grad_norm": 1.2278345823287964, "learning_rate": 2.7526832321799624e-05, "loss": 0.1015, "num_input_tokens_seen": 137844784, "step": 63875 }, { "epoch": 10.420880913539968, "grad_norm": 1.4638582468032837, "learning_rate": 2.75232915307296e-05, "loss": 0.0982, "num_input_tokens_seen": 137854768, "step": 63880 }, { "epoch": 10.421696574225122, "grad_norm": 0.05563002824783325, "learning_rate": 2.751975068852187e-05, "loss": 0.1203, "num_input_tokens_seen": 137864496, "step": 63885 }, { "epoch": 10.422512234910277, "grad_norm": 0.03119554929435253, "learning_rate": 2.7516209795248206e-05, "loss": 0.3163, "num_input_tokens_seen": 137874352, "step": 63890 }, { "epoch": 10.423327895595433, "grad_norm": 0.35194501280784607, "learning_rate": 2.7512668850980355e-05, "loss": 0.0379, "num_input_tokens_seen": 137885296, "step": 63895 }, { "epoch": 10.424143556280587, "grad_norm": 0.6819128394126892, "learning_rate": 2.750912785579008e-05, "loss": 0.0547, "num_input_tokens_seen": 137896848, "step": 63900 }, { "epoch": 10.424959216965743, "grad_norm": 0.07191616296768188, "learning_rate": 2.750558680974915e-05, "loss": 0.0864, "num_input_tokens_seen": 137907664, "step": 63905 }, { "epoch": 10.425774877650896, "grad_norm": 0.6023111343383789, "learning_rate": 2.750204571292932e-05, "loss": 0.2481, "num_input_tokens_seen": 137918800, "step": 63910 }, { "epoch": 10.426590538336052, "grad_norm": 0.33295443654060364, "learning_rate": 2.749850456540236e-05, "loss": 0.0929, "num_input_tokens_seen": 137929424, "step": 63915 }, { "epoch": 10.427406199021208, "grad_norm": 1.902515172958374, "learning_rate": 2.749496336724004e-05, "loss": 0.1713, "num_input_tokens_seen": 137941232, "step": 63920 }, { "epoch": 10.428221859706362, "grad_norm": 0.07150755822658539, "learning_rate": 2.7491422118514122e-05, "loss": 0.1545, "num_input_tokens_seen": 137951696, "step": 63925 }, { "epoch": 10.429037520391518, "grad_norm": 0.5458016991615295, "learning_rate": 2.748788081929637e-05, "loss": 0.0228, "num_input_tokens_seen": 137962768, "step": 63930 }, { "epoch": 10.429853181076671, "grad_norm": 0.13294453918933868, "learning_rate": 2.748433946965856e-05, "loss": 0.2074, "num_input_tokens_seen": 137974480, "step": 63935 }, { "epoch": 10.430668841761827, "grad_norm": 0.05798257887363434, "learning_rate": 2.748079806967246e-05, "loss": 0.1371, "num_input_tokens_seen": 137985680, "step": 63940 }, { "epoch": 10.431484502446983, "grad_norm": 1.6735857725143433, "learning_rate": 2.7477256619409836e-05, "loss": 0.17, "num_input_tokens_seen": 137997264, "step": 63945 }, { "epoch": 10.432300163132137, "grad_norm": 0.2880840301513672, "learning_rate": 2.7473715118942466e-05, "loss": 0.1277, "num_input_tokens_seen": 138007696, "step": 63950 }, { "epoch": 10.433115823817293, "grad_norm": 0.15454651415348053, "learning_rate": 2.7470173568342116e-05, "loss": 0.019, "num_input_tokens_seen": 138018384, "step": 63955 }, { "epoch": 10.433931484502446, "grad_norm": 0.04580603167414665, "learning_rate": 2.7466631967680567e-05, "loss": 0.1266, "num_input_tokens_seen": 138029968, "step": 63960 }, { "epoch": 10.434747145187602, "grad_norm": 0.06313978880643845, "learning_rate": 2.74630903170296e-05, "loss": 0.1571, "num_input_tokens_seen": 138039664, "step": 63965 }, { "epoch": 10.435562805872756, "grad_norm": 0.10739803314208984, "learning_rate": 2.745954861646098e-05, "loss": 0.1132, "num_input_tokens_seen": 138051024, "step": 63970 }, { "epoch": 10.436378466557912, "grad_norm": 1.0546849966049194, "learning_rate": 2.745600686604648e-05, "loss": 0.0498, "num_input_tokens_seen": 138061648, "step": 63975 }, { "epoch": 10.437194127243067, "grad_norm": 0.4460122883319855, "learning_rate": 2.7452465065857892e-05, "loss": 0.1172, "num_input_tokens_seen": 138071536, "step": 63980 }, { "epoch": 10.438009787928221, "grad_norm": 0.5446131825447083, "learning_rate": 2.7448923215966988e-05, "loss": 0.0412, "num_input_tokens_seen": 138082160, "step": 63985 }, { "epoch": 10.438825448613377, "grad_norm": 0.14122477173805237, "learning_rate": 2.7445381316445544e-05, "loss": 0.1287, "num_input_tokens_seen": 138092048, "step": 63990 }, { "epoch": 10.439641109298531, "grad_norm": 0.08528142422437668, "learning_rate": 2.7441839367365347e-05, "loss": 0.1684, "num_input_tokens_seen": 138103792, "step": 63995 }, { "epoch": 10.440456769983687, "grad_norm": 0.06306731700897217, "learning_rate": 2.7438297368798175e-05, "loss": 0.149, "num_input_tokens_seen": 138114352, "step": 64000 }, { "epoch": 10.441272430668842, "grad_norm": 0.06914136558771133, "learning_rate": 2.743475532081582e-05, "loss": 0.0514, "num_input_tokens_seen": 138124400, "step": 64005 }, { "epoch": 10.442088091353996, "grad_norm": 0.18051248788833618, "learning_rate": 2.743121322349006e-05, "loss": 0.0281, "num_input_tokens_seen": 138134800, "step": 64010 }, { "epoch": 10.442903752039152, "grad_norm": 0.41801440715789795, "learning_rate": 2.7427671076892674e-05, "loss": 0.1048, "num_input_tokens_seen": 138145552, "step": 64015 }, { "epoch": 10.443719412724306, "grad_norm": 2.8256571292877197, "learning_rate": 2.7424128881095456e-05, "loss": 0.1858, "num_input_tokens_seen": 138156464, "step": 64020 }, { "epoch": 10.444535073409462, "grad_norm": 0.1958477646112442, "learning_rate": 2.742058663617019e-05, "loss": 0.2042, "num_input_tokens_seen": 138167984, "step": 64025 }, { "epoch": 10.445350734094617, "grad_norm": 0.7864746451377869, "learning_rate": 2.741704434218867e-05, "loss": 0.0627, "num_input_tokens_seen": 138177840, "step": 64030 }, { "epoch": 10.446166394779771, "grad_norm": 0.03394737094640732, "learning_rate": 2.7413501999222678e-05, "loss": 0.0388, "num_input_tokens_seen": 138188208, "step": 64035 }, { "epoch": 10.446982055464927, "grad_norm": 1.662645697593689, "learning_rate": 2.7409959607344004e-05, "loss": 0.3784, "num_input_tokens_seen": 138198544, "step": 64040 }, { "epoch": 10.447797716150081, "grad_norm": 1.7536611557006836, "learning_rate": 2.7406417166624444e-05, "loss": 0.0626, "num_input_tokens_seen": 138210288, "step": 64045 }, { "epoch": 10.448613376835237, "grad_norm": 0.04884868487715721, "learning_rate": 2.7402874677135787e-05, "loss": 0.0386, "num_input_tokens_seen": 138221488, "step": 64050 }, { "epoch": 10.449429037520392, "grad_norm": 0.3982398808002472, "learning_rate": 2.7399332138949824e-05, "loss": 0.1565, "num_input_tokens_seen": 138232272, "step": 64055 }, { "epoch": 10.450244698205546, "grad_norm": 0.24547293782234192, "learning_rate": 2.739578955213836e-05, "loss": 0.0207, "num_input_tokens_seen": 138243184, "step": 64060 }, { "epoch": 10.451060358890702, "grad_norm": 0.07196619361639023, "learning_rate": 2.739224691677317e-05, "loss": 0.2443, "num_input_tokens_seen": 138254960, "step": 64065 }, { "epoch": 10.451876019575856, "grad_norm": 0.2175823301076889, "learning_rate": 2.7388704232926066e-05, "loss": 0.1153, "num_input_tokens_seen": 138265520, "step": 64070 }, { "epoch": 10.452691680261012, "grad_norm": 0.058623526245355606, "learning_rate": 2.7385161500668842e-05, "loss": 0.1931, "num_input_tokens_seen": 138276112, "step": 64075 }, { "epoch": 10.453507340946166, "grad_norm": 0.726145327091217, "learning_rate": 2.7381618720073294e-05, "loss": 0.039, "num_input_tokens_seen": 138286128, "step": 64080 }, { "epoch": 10.454323001631321, "grad_norm": 1.9937907457351685, "learning_rate": 2.7378075891211217e-05, "loss": 0.252, "num_input_tokens_seen": 138296848, "step": 64085 }, { "epoch": 10.455138662316477, "grad_norm": 0.23838579654693604, "learning_rate": 2.737453301415442e-05, "loss": 0.1216, "num_input_tokens_seen": 138307440, "step": 64090 }, { "epoch": 10.455954323001631, "grad_norm": 3.3132126331329346, "learning_rate": 2.7370990088974698e-05, "loss": 0.2701, "num_input_tokens_seen": 138318128, "step": 64095 }, { "epoch": 10.456769983686787, "grad_norm": 0.07219322770833969, "learning_rate": 2.736744711574385e-05, "loss": 0.0935, "num_input_tokens_seen": 138328176, "step": 64100 }, { "epoch": 10.45758564437194, "grad_norm": 0.05185389518737793, "learning_rate": 2.7363904094533683e-05, "loss": 0.0316, "num_input_tokens_seen": 138337840, "step": 64105 }, { "epoch": 10.458401305057096, "grad_norm": 0.16958920657634735, "learning_rate": 2.7360361025416003e-05, "loss": 0.0411, "num_input_tokens_seen": 138348912, "step": 64110 }, { "epoch": 10.459216965742252, "grad_norm": 0.0642099529504776, "learning_rate": 2.7356817908462618e-05, "loss": 0.1365, "num_input_tokens_seen": 138359440, "step": 64115 }, { "epoch": 10.460032626427406, "grad_norm": 0.07480717450380325, "learning_rate": 2.7353274743745323e-05, "loss": 0.0104, "num_input_tokens_seen": 138368944, "step": 64120 }, { "epoch": 10.460848287112562, "grad_norm": 0.3468113839626312, "learning_rate": 2.7349731531335926e-05, "loss": 0.0156, "num_input_tokens_seen": 138379664, "step": 64125 }, { "epoch": 10.461663947797716, "grad_norm": 2.7125957012176514, "learning_rate": 2.7346188271306245e-05, "loss": 0.1185, "num_input_tokens_seen": 138392496, "step": 64130 }, { "epoch": 10.462479608482871, "grad_norm": 1.551347255706787, "learning_rate": 2.7342644963728075e-05, "loss": 0.1655, "num_input_tokens_seen": 138402800, "step": 64135 }, { "epoch": 10.463295269168025, "grad_norm": 0.07998666167259216, "learning_rate": 2.733910160867324e-05, "loss": 0.0278, "num_input_tokens_seen": 138412912, "step": 64140 }, { "epoch": 10.464110929853181, "grad_norm": 0.8560836315155029, "learning_rate": 2.7335558206213547e-05, "loss": 0.2464, "num_input_tokens_seen": 138424688, "step": 64145 }, { "epoch": 10.464926590538337, "grad_norm": 0.06951258331537247, "learning_rate": 2.7332014756420798e-05, "loss": 0.0224, "num_input_tokens_seen": 138435120, "step": 64150 }, { "epoch": 10.46574225122349, "grad_norm": 0.6860154271125793, "learning_rate": 2.7328471259366813e-05, "loss": 0.0579, "num_input_tokens_seen": 138446736, "step": 64155 }, { "epoch": 10.466557911908646, "grad_norm": 0.06943836808204651, "learning_rate": 2.7324927715123406e-05, "loss": 0.0376, "num_input_tokens_seen": 138456464, "step": 64160 }, { "epoch": 10.4673735725938, "grad_norm": 0.32006654143333435, "learning_rate": 2.7321384123762385e-05, "loss": 0.1633, "num_input_tokens_seen": 138466160, "step": 64165 }, { "epoch": 10.468189233278956, "grad_norm": 0.20165175199508667, "learning_rate": 2.731784048535557e-05, "loss": 0.1467, "num_input_tokens_seen": 138476016, "step": 64170 }, { "epoch": 10.469004893964112, "grad_norm": 0.8487505316734314, "learning_rate": 2.731429679997478e-05, "loss": 0.1404, "num_input_tokens_seen": 138487184, "step": 64175 }, { "epoch": 10.469820554649266, "grad_norm": 0.3532564342021942, "learning_rate": 2.731075306769183e-05, "loss": 0.0188, "num_input_tokens_seen": 138497648, "step": 64180 }, { "epoch": 10.470636215334421, "grad_norm": 1.6019734144210815, "learning_rate": 2.7307209288578538e-05, "loss": 0.1469, "num_input_tokens_seen": 138508720, "step": 64185 }, { "epoch": 10.471451876019575, "grad_norm": 0.05491947755217552, "learning_rate": 2.7303665462706725e-05, "loss": 0.0976, "num_input_tokens_seen": 138518224, "step": 64190 }, { "epoch": 10.47226753670473, "grad_norm": 0.686579704284668, "learning_rate": 2.7300121590148204e-05, "loss": 0.1216, "num_input_tokens_seen": 138530064, "step": 64195 }, { "epoch": 10.473083197389887, "grad_norm": 0.3307395577430725, "learning_rate": 2.72965776709748e-05, "loss": 0.0838, "num_input_tokens_seen": 138540560, "step": 64200 }, { "epoch": 10.47389885807504, "grad_norm": 0.06293752044439316, "learning_rate": 2.7293033705258336e-05, "loss": 0.0468, "num_input_tokens_seen": 138551824, "step": 64205 }, { "epoch": 10.474714518760196, "grad_norm": 1.3790850639343262, "learning_rate": 2.7289489693070635e-05, "loss": 0.2349, "num_input_tokens_seen": 138561072, "step": 64210 }, { "epoch": 10.47553017944535, "grad_norm": 0.029492657631635666, "learning_rate": 2.7285945634483523e-05, "loss": 0.02, "num_input_tokens_seen": 138571760, "step": 64215 }, { "epoch": 10.476345840130506, "grad_norm": 2.77539324760437, "learning_rate": 2.7282401529568825e-05, "loss": 0.0795, "num_input_tokens_seen": 138582352, "step": 64220 }, { "epoch": 10.477161500815662, "grad_norm": 0.9917101860046387, "learning_rate": 2.7278857378398366e-05, "loss": 0.0443, "num_input_tokens_seen": 138593488, "step": 64225 }, { "epoch": 10.477977161500815, "grad_norm": 0.28503870964050293, "learning_rate": 2.7275313181043967e-05, "loss": 0.2015, "num_input_tokens_seen": 138604784, "step": 64230 }, { "epoch": 10.478792822185971, "grad_norm": 1.6665080785751343, "learning_rate": 2.7271768937577464e-05, "loss": 0.0802, "num_input_tokens_seen": 138616144, "step": 64235 }, { "epoch": 10.479608482871125, "grad_norm": 0.08733072131872177, "learning_rate": 2.7268224648070678e-05, "loss": 0.0952, "num_input_tokens_seen": 138626992, "step": 64240 }, { "epoch": 10.48042414355628, "grad_norm": 1.171539545059204, "learning_rate": 2.726468031259544e-05, "loss": 0.1747, "num_input_tokens_seen": 138638032, "step": 64245 }, { "epoch": 10.481239804241435, "grad_norm": 0.03054540790617466, "learning_rate": 2.7261135931223585e-05, "loss": 0.0281, "num_input_tokens_seen": 138648272, "step": 64250 }, { "epoch": 10.48205546492659, "grad_norm": 1.433629035949707, "learning_rate": 2.725759150402695e-05, "loss": 0.1229, "num_input_tokens_seen": 138658288, "step": 64255 }, { "epoch": 10.482871125611746, "grad_norm": 0.45834648609161377, "learning_rate": 2.7254047031077346e-05, "loss": 0.1, "num_input_tokens_seen": 138669840, "step": 64260 }, { "epoch": 10.4836867862969, "grad_norm": 0.34600672125816345, "learning_rate": 2.7250502512446617e-05, "loss": 0.0693, "num_input_tokens_seen": 138680816, "step": 64265 }, { "epoch": 10.484502446982056, "grad_norm": 0.7216743230819702, "learning_rate": 2.724695794820661e-05, "loss": 0.2176, "num_input_tokens_seen": 138691280, "step": 64270 }, { "epoch": 10.48531810766721, "grad_norm": 0.019158119335770607, "learning_rate": 2.7243413338429147e-05, "loss": 0.043, "num_input_tokens_seen": 138701840, "step": 64275 }, { "epoch": 10.486133768352365, "grad_norm": 0.05864100530743599, "learning_rate": 2.723986868318607e-05, "loss": 0.0976, "num_input_tokens_seen": 138712624, "step": 64280 }, { "epoch": 10.486949429037521, "grad_norm": 0.593052327632904, "learning_rate": 2.7236323982549205e-05, "loss": 0.1052, "num_input_tokens_seen": 138723696, "step": 64285 }, { "epoch": 10.487765089722675, "grad_norm": 0.33756023645401, "learning_rate": 2.7232779236590404e-05, "loss": 0.173, "num_input_tokens_seen": 138734000, "step": 64290 }, { "epoch": 10.48858075040783, "grad_norm": 0.5478140115737915, "learning_rate": 2.72292344453815e-05, "loss": 0.1451, "num_input_tokens_seen": 138744304, "step": 64295 }, { "epoch": 10.489396411092985, "grad_norm": 1.3735259771347046, "learning_rate": 2.722568960899433e-05, "loss": 0.1111, "num_input_tokens_seen": 138755056, "step": 64300 }, { "epoch": 10.49021207177814, "grad_norm": 0.11942385882139206, "learning_rate": 2.7222144727500737e-05, "loss": 0.0467, "num_input_tokens_seen": 138766992, "step": 64305 }, { "epoch": 10.491027732463296, "grad_norm": 2.599118947982788, "learning_rate": 2.721859980097256e-05, "loss": 0.1049, "num_input_tokens_seen": 138777712, "step": 64310 }, { "epoch": 10.49184339314845, "grad_norm": 1.7458988428115845, "learning_rate": 2.721505482948164e-05, "loss": 0.2054, "num_input_tokens_seen": 138788912, "step": 64315 }, { "epoch": 10.492659053833606, "grad_norm": 0.09128143638372421, "learning_rate": 2.7211509813099828e-05, "loss": 0.0186, "num_input_tokens_seen": 138799376, "step": 64320 }, { "epoch": 10.49347471451876, "grad_norm": 0.9756708741188049, "learning_rate": 2.7207964751898963e-05, "loss": 0.103, "num_input_tokens_seen": 138809968, "step": 64325 }, { "epoch": 10.494290375203915, "grad_norm": 1.802524209022522, "learning_rate": 2.7204419645950895e-05, "loss": 0.0895, "num_input_tokens_seen": 138820016, "step": 64330 }, { "epoch": 10.49510603588907, "grad_norm": 0.9530027508735657, "learning_rate": 2.7200874495327467e-05, "loss": 0.0684, "num_input_tokens_seen": 138831056, "step": 64335 }, { "epoch": 10.495921696574225, "grad_norm": 0.044302649796009064, "learning_rate": 2.719732930010052e-05, "loss": 0.1708, "num_input_tokens_seen": 138842608, "step": 64340 }, { "epoch": 10.49673735725938, "grad_norm": 1.5745909214019775, "learning_rate": 2.719378406034191e-05, "loss": 0.175, "num_input_tokens_seen": 138853584, "step": 64345 }, { "epoch": 10.497553017944535, "grad_norm": 0.4855268895626068, "learning_rate": 2.7190238776123482e-05, "loss": 0.0851, "num_input_tokens_seen": 138863792, "step": 64350 }, { "epoch": 10.49836867862969, "grad_norm": 0.3338201940059662, "learning_rate": 2.7186693447517087e-05, "loss": 0.0821, "num_input_tokens_seen": 138875056, "step": 64355 }, { "epoch": 10.499184339314844, "grad_norm": 2.0173728466033936, "learning_rate": 2.718314807459458e-05, "loss": 0.2311, "num_input_tokens_seen": 138885424, "step": 64360 }, { "epoch": 10.5, "grad_norm": 0.08593765646219254, "learning_rate": 2.71796026574278e-05, "loss": 0.1338, "num_input_tokens_seen": 138896272, "step": 64365 }, { "epoch": 10.500815660685156, "grad_norm": 1.3014631271362305, "learning_rate": 2.717605719608861e-05, "loss": 0.1093, "num_input_tokens_seen": 138908080, "step": 64370 }, { "epoch": 10.50163132137031, "grad_norm": 0.34349653124809265, "learning_rate": 2.7172511690648867e-05, "loss": 0.0427, "num_input_tokens_seen": 138918576, "step": 64375 }, { "epoch": 10.502446982055465, "grad_norm": 0.7561155557632446, "learning_rate": 2.7168966141180406e-05, "loss": 0.0587, "num_input_tokens_seen": 138929552, "step": 64380 }, { "epoch": 10.50326264274062, "grad_norm": 1.4843299388885498, "learning_rate": 2.7165420547755106e-05, "loss": 0.1127, "num_input_tokens_seen": 138939504, "step": 64385 }, { "epoch": 10.504078303425775, "grad_norm": 0.08532635122537613, "learning_rate": 2.7161874910444806e-05, "loss": 0.0725, "num_input_tokens_seen": 138951408, "step": 64390 }, { "epoch": 10.50489396411093, "grad_norm": 0.6746095418930054, "learning_rate": 2.7158329229321373e-05, "loss": 0.075, "num_input_tokens_seen": 138962224, "step": 64395 }, { "epoch": 10.505709624796085, "grad_norm": 1.4213320016860962, "learning_rate": 2.7154783504456655e-05, "loss": 0.3134, "num_input_tokens_seen": 138973680, "step": 64400 }, { "epoch": 10.50652528548124, "grad_norm": 2.0067808628082275, "learning_rate": 2.715123773592252e-05, "loss": 0.1364, "num_input_tokens_seen": 138985072, "step": 64405 }, { "epoch": 10.507340946166394, "grad_norm": 1.044880747795105, "learning_rate": 2.7147691923790812e-05, "loss": 0.1211, "num_input_tokens_seen": 138995440, "step": 64410 }, { "epoch": 10.50815660685155, "grad_norm": 0.6624925136566162, "learning_rate": 2.7144146068133415e-05, "loss": 0.0547, "num_input_tokens_seen": 139006960, "step": 64415 }, { "epoch": 10.508972267536706, "grad_norm": 0.3595723807811737, "learning_rate": 2.7140600169022168e-05, "loss": 0.1275, "num_input_tokens_seen": 139017424, "step": 64420 }, { "epoch": 10.50978792822186, "grad_norm": 0.06697256863117218, "learning_rate": 2.7137054226528952e-05, "loss": 0.1107, "num_input_tokens_seen": 139028144, "step": 64425 }, { "epoch": 10.510603588907015, "grad_norm": 0.8567570447921753, "learning_rate": 2.713350824072562e-05, "loss": 0.1999, "num_input_tokens_seen": 139038128, "step": 64430 }, { "epoch": 10.51141924959217, "grad_norm": 1.7138385772705078, "learning_rate": 2.712996221168403e-05, "loss": 0.1675, "num_input_tokens_seen": 139049616, "step": 64435 }, { "epoch": 10.512234910277325, "grad_norm": 1.5820364952087402, "learning_rate": 2.712641613947606e-05, "loss": 0.2321, "num_input_tokens_seen": 139061136, "step": 64440 }, { "epoch": 10.513050570962479, "grad_norm": 0.5337540507316589, "learning_rate": 2.7122870024173563e-05, "loss": 0.0494, "num_input_tokens_seen": 139071312, "step": 64445 }, { "epoch": 10.513866231647635, "grad_norm": 0.3350483775138855, "learning_rate": 2.7119323865848418e-05, "loss": 0.0247, "num_input_tokens_seen": 139082672, "step": 64450 }, { "epoch": 10.51468189233279, "grad_norm": 2.3421857357025146, "learning_rate": 2.7115777664572485e-05, "loss": 0.1874, "num_input_tokens_seen": 139093776, "step": 64455 }, { "epoch": 10.515497553017944, "grad_norm": 0.08223574608564377, "learning_rate": 2.7112231420417633e-05, "loss": 0.1036, "num_input_tokens_seen": 139104112, "step": 64460 }, { "epoch": 10.5163132137031, "grad_norm": 0.14938418567180634, "learning_rate": 2.7108685133455735e-05, "loss": 0.1091, "num_input_tokens_seen": 139114384, "step": 64465 }, { "epoch": 10.517128874388254, "grad_norm": 0.3941308557987213, "learning_rate": 2.7105138803758657e-05, "loss": 0.0503, "num_input_tokens_seen": 139125232, "step": 64470 }, { "epoch": 10.51794453507341, "grad_norm": 1.7095544338226318, "learning_rate": 2.7101592431398264e-05, "loss": 0.2653, "num_input_tokens_seen": 139135472, "step": 64475 }, { "epoch": 10.518760195758565, "grad_norm": 0.07790171355009079, "learning_rate": 2.7098046016446442e-05, "loss": 0.0652, "num_input_tokens_seen": 139147024, "step": 64480 }, { "epoch": 10.51957585644372, "grad_norm": 0.6145319938659668, "learning_rate": 2.7094499558975057e-05, "loss": 0.1411, "num_input_tokens_seen": 139157680, "step": 64485 }, { "epoch": 10.520391517128875, "grad_norm": 0.9541204571723938, "learning_rate": 2.7090953059055974e-05, "loss": 0.1245, "num_input_tokens_seen": 139168272, "step": 64490 }, { "epoch": 10.521207177814029, "grad_norm": 0.3905286192893982, "learning_rate": 2.7087406516761076e-05, "loss": 0.0696, "num_input_tokens_seen": 139179344, "step": 64495 }, { "epoch": 10.522022838499185, "grad_norm": 1.6833126544952393, "learning_rate": 2.7083859932162243e-05, "loss": 0.1187, "num_input_tokens_seen": 139191888, "step": 64500 }, { "epoch": 10.522838499184338, "grad_norm": 1.1671637296676636, "learning_rate": 2.708031330533134e-05, "loss": 0.1292, "num_input_tokens_seen": 139200592, "step": 64505 }, { "epoch": 10.523654159869494, "grad_norm": 0.36533668637275696, "learning_rate": 2.707676663634025e-05, "loss": 0.0848, "num_input_tokens_seen": 139212080, "step": 64510 }, { "epoch": 10.52446982055465, "grad_norm": 0.49916282296180725, "learning_rate": 2.707321992526085e-05, "loss": 0.1359, "num_input_tokens_seen": 139223088, "step": 64515 }, { "epoch": 10.525285481239804, "grad_norm": 1.4208316802978516, "learning_rate": 2.7069673172165015e-05, "loss": 0.0534, "num_input_tokens_seen": 139234448, "step": 64520 }, { "epoch": 10.52610114192496, "grad_norm": 0.14922353625297546, "learning_rate": 2.7066126377124628e-05, "loss": 0.0333, "num_input_tokens_seen": 139245008, "step": 64525 }, { "epoch": 10.526916802610113, "grad_norm": 0.03158047795295715, "learning_rate": 2.706257954021157e-05, "loss": 0.2567, "num_input_tokens_seen": 139255024, "step": 64530 }, { "epoch": 10.52773246329527, "grad_norm": 0.20447252690792084, "learning_rate": 2.7059032661497723e-05, "loss": 0.1828, "num_input_tokens_seen": 139265680, "step": 64535 }, { "epoch": 10.528548123980425, "grad_norm": 0.6093783378601074, "learning_rate": 2.7055485741054964e-05, "loss": 0.087, "num_input_tokens_seen": 139276240, "step": 64540 }, { "epoch": 10.529363784665579, "grad_norm": 0.15770794451236725, "learning_rate": 2.7051938778955183e-05, "loss": 0.0667, "num_input_tokens_seen": 139285680, "step": 64545 }, { "epoch": 10.530179445350734, "grad_norm": 0.42847979068756104, "learning_rate": 2.7048391775270258e-05, "loss": 0.0618, "num_input_tokens_seen": 139296656, "step": 64550 }, { "epoch": 10.530995106035888, "grad_norm": 0.04306803643703461, "learning_rate": 2.704484473007208e-05, "loss": 0.0584, "num_input_tokens_seen": 139306672, "step": 64555 }, { "epoch": 10.531810766721044, "grad_norm": 0.11495884507894516, "learning_rate": 2.704129764343252e-05, "loss": 0.1301, "num_input_tokens_seen": 139316624, "step": 64560 }, { "epoch": 10.5326264274062, "grad_norm": 0.11736750602722168, "learning_rate": 2.7037750515423476e-05, "loss": 0.0431, "num_input_tokens_seen": 139327472, "step": 64565 }, { "epoch": 10.533442088091354, "grad_norm": 1.4761215448379517, "learning_rate": 2.7034203346116837e-05, "loss": 0.0816, "num_input_tokens_seen": 139338480, "step": 64570 }, { "epoch": 10.53425774877651, "grad_norm": 0.9886019229888916, "learning_rate": 2.703065613558448e-05, "loss": 0.1293, "num_input_tokens_seen": 139350032, "step": 64575 }, { "epoch": 10.535073409461663, "grad_norm": 0.2220025658607483, "learning_rate": 2.70271088838983e-05, "loss": 0.1279, "num_input_tokens_seen": 139361104, "step": 64580 }, { "epoch": 10.535889070146819, "grad_norm": 0.3614498972892761, "learning_rate": 2.7023561591130192e-05, "loss": 0.0579, "num_input_tokens_seen": 139371056, "step": 64585 }, { "epoch": 10.536704730831975, "grad_norm": 0.7080522775650024, "learning_rate": 2.7020014257352046e-05, "loss": 0.1495, "num_input_tokens_seen": 139382544, "step": 64590 }, { "epoch": 10.537520391517129, "grad_norm": 1.3556373119354248, "learning_rate": 2.701646688263574e-05, "loss": 0.1514, "num_input_tokens_seen": 139391888, "step": 64595 }, { "epoch": 10.538336052202284, "grad_norm": 0.6656185388565063, "learning_rate": 2.7012919467053176e-05, "loss": 0.1178, "num_input_tokens_seen": 139402832, "step": 64600 }, { "epoch": 10.539151712887438, "grad_norm": 0.09217469394207001, "learning_rate": 2.7009372010676247e-05, "loss": 0.0229, "num_input_tokens_seen": 139413648, "step": 64605 }, { "epoch": 10.539967373572594, "grad_norm": 1.7763903141021729, "learning_rate": 2.700582451357685e-05, "loss": 0.1108, "num_input_tokens_seen": 139425584, "step": 64610 }, { "epoch": 10.540783034257748, "grad_norm": 0.06116560474038124, "learning_rate": 2.7002276975826867e-05, "loss": 0.0301, "num_input_tokens_seen": 139436336, "step": 64615 }, { "epoch": 10.541598694942904, "grad_norm": 0.37098440527915955, "learning_rate": 2.6998729397498205e-05, "loss": 0.0982, "num_input_tokens_seen": 139447984, "step": 64620 }, { "epoch": 10.54241435562806, "grad_norm": 0.4457875192165375, "learning_rate": 2.699518177866276e-05, "loss": 0.1412, "num_input_tokens_seen": 139459344, "step": 64625 }, { "epoch": 10.543230016313213, "grad_norm": 0.08177978545427322, "learning_rate": 2.699163411939241e-05, "loss": 0.038, "num_input_tokens_seen": 139470128, "step": 64630 }, { "epoch": 10.544045676998369, "grad_norm": 0.10114390403032303, "learning_rate": 2.6988086419759078e-05, "loss": 0.1307, "num_input_tokens_seen": 139480784, "step": 64635 }, { "epoch": 10.544861337683523, "grad_norm": 0.05408439785242081, "learning_rate": 2.6984538679834654e-05, "loss": 0.0981, "num_input_tokens_seen": 139490608, "step": 64640 }, { "epoch": 10.545676998368679, "grad_norm": 0.8150448203086853, "learning_rate": 2.6980990899691038e-05, "loss": 0.0984, "num_input_tokens_seen": 139501360, "step": 64645 }, { "epoch": 10.546492659053834, "grad_norm": 1.6092103719711304, "learning_rate": 2.697744307940012e-05, "loss": 0.1781, "num_input_tokens_seen": 139513040, "step": 64650 }, { "epoch": 10.547308319738988, "grad_norm": 0.2569628357887268, "learning_rate": 2.697389521903382e-05, "loss": 0.0241, "num_input_tokens_seen": 139523632, "step": 64655 }, { "epoch": 10.548123980424144, "grad_norm": 2.0215964317321777, "learning_rate": 2.6970347318664024e-05, "loss": 0.1407, "num_input_tokens_seen": 139534928, "step": 64660 }, { "epoch": 10.548939641109298, "grad_norm": 0.22021925449371338, "learning_rate": 2.696679937836264e-05, "loss": 0.1715, "num_input_tokens_seen": 139546256, "step": 64665 }, { "epoch": 10.549755301794454, "grad_norm": 0.21954049170017242, "learning_rate": 2.6963251398201572e-05, "loss": 0.2017, "num_input_tokens_seen": 139557712, "step": 64670 }, { "epoch": 10.550570962479608, "grad_norm": 0.5143432021141052, "learning_rate": 2.6959703378252727e-05, "loss": 0.0801, "num_input_tokens_seen": 139567984, "step": 64675 }, { "epoch": 10.551386623164763, "grad_norm": 0.1184430941939354, "learning_rate": 2.6956155318588e-05, "loss": 0.0281, "num_input_tokens_seen": 139579248, "step": 64680 }, { "epoch": 10.552202283849919, "grad_norm": 0.7666539549827576, "learning_rate": 2.695260721927931e-05, "loss": 0.347, "num_input_tokens_seen": 139590288, "step": 64685 }, { "epoch": 10.553017944535073, "grad_norm": 1.3153799772262573, "learning_rate": 2.6949059080398552e-05, "loss": 0.1668, "num_input_tokens_seen": 139601712, "step": 64690 }, { "epoch": 10.553833605220229, "grad_norm": 0.08189196139574051, "learning_rate": 2.6945510902017646e-05, "loss": 0.0654, "num_input_tokens_seen": 139613424, "step": 64695 }, { "epoch": 10.554649265905383, "grad_norm": 0.17256596684455872, "learning_rate": 2.6941962684208488e-05, "loss": 0.0143, "num_input_tokens_seen": 139623696, "step": 64700 }, { "epoch": 10.555464926590538, "grad_norm": 0.09589505940675735, "learning_rate": 2.6938414427042995e-05, "loss": 0.1584, "num_input_tokens_seen": 139634800, "step": 64705 }, { "epoch": 10.556280587275694, "grad_norm": 1.0059243440628052, "learning_rate": 2.6934866130593072e-05, "loss": 0.0592, "num_input_tokens_seen": 139645392, "step": 64710 }, { "epoch": 10.557096247960848, "grad_norm": 0.0687960535287857, "learning_rate": 2.6931317794930637e-05, "loss": 0.0513, "num_input_tokens_seen": 139656144, "step": 64715 }, { "epoch": 10.557911908646004, "grad_norm": 0.04747374355792999, "learning_rate": 2.6927769420127595e-05, "loss": 0.0415, "num_input_tokens_seen": 139667632, "step": 64720 }, { "epoch": 10.558727569331158, "grad_norm": 0.5053068399429321, "learning_rate": 2.692422100625585e-05, "loss": 0.0284, "num_input_tokens_seen": 139679216, "step": 64725 }, { "epoch": 10.559543230016313, "grad_norm": 0.12904377281665802, "learning_rate": 2.6920672553387337e-05, "loss": 0.0994, "num_input_tokens_seen": 139690416, "step": 64730 }, { "epoch": 10.560358890701469, "grad_norm": 0.06448891758918762, "learning_rate": 2.6917124061593958e-05, "loss": 0.0709, "num_input_tokens_seen": 139702000, "step": 64735 }, { "epoch": 10.561174551386623, "grad_norm": 0.07147584110498428, "learning_rate": 2.691357553094763e-05, "loss": 0.2124, "num_input_tokens_seen": 139713168, "step": 64740 }, { "epoch": 10.561990212071779, "grad_norm": 0.06770938634872437, "learning_rate": 2.6910026961520258e-05, "loss": 0.1651, "num_input_tokens_seen": 139722800, "step": 64745 }, { "epoch": 10.562805872756933, "grad_norm": 0.06375313550233841, "learning_rate": 2.6906478353383762e-05, "loss": 0.0467, "num_input_tokens_seen": 139734000, "step": 64750 }, { "epoch": 10.563621533442088, "grad_norm": 0.7890685796737671, "learning_rate": 2.6902929706610076e-05, "loss": 0.0363, "num_input_tokens_seen": 139744784, "step": 64755 }, { "epoch": 10.564437194127244, "grad_norm": 1.6690424680709839, "learning_rate": 2.68993810212711e-05, "loss": 0.2734, "num_input_tokens_seen": 139757264, "step": 64760 }, { "epoch": 10.565252854812398, "grad_norm": 0.5238447785377502, "learning_rate": 2.6895832297438762e-05, "loss": 0.1682, "num_input_tokens_seen": 139768912, "step": 64765 }, { "epoch": 10.566068515497554, "grad_norm": 0.1642126888036728, "learning_rate": 2.6892283535184974e-05, "loss": 0.1062, "num_input_tokens_seen": 139778800, "step": 64770 }, { "epoch": 10.566884176182707, "grad_norm": 1.3095670938491821, "learning_rate": 2.688873473458166e-05, "loss": 0.1284, "num_input_tokens_seen": 139789392, "step": 64775 }, { "epoch": 10.567699836867863, "grad_norm": 0.03388221189379692, "learning_rate": 2.6885185895700743e-05, "loss": 0.1761, "num_input_tokens_seen": 139801136, "step": 64780 }, { "epoch": 10.568515497553017, "grad_norm": 0.03435925021767616, "learning_rate": 2.6881637018614143e-05, "loss": 0.0563, "num_input_tokens_seen": 139812560, "step": 64785 }, { "epoch": 10.569331158238173, "grad_norm": 0.11282086372375488, "learning_rate": 2.6878088103393778e-05, "loss": 0.1312, "num_input_tokens_seen": 139822832, "step": 64790 }, { "epoch": 10.570146818923329, "grad_norm": 0.17233729362487793, "learning_rate": 2.6874539150111577e-05, "loss": 0.047, "num_input_tokens_seen": 139833488, "step": 64795 }, { "epoch": 10.570962479608482, "grad_norm": 0.14684340357780457, "learning_rate": 2.687099015883946e-05, "loss": 0.0234, "num_input_tokens_seen": 139845296, "step": 64800 }, { "epoch": 10.571778140293638, "grad_norm": 0.31441470980644226, "learning_rate": 2.6867441129649355e-05, "loss": 0.1238, "num_input_tokens_seen": 139856112, "step": 64805 }, { "epoch": 10.572593800978792, "grad_norm": 1.7505970001220703, "learning_rate": 2.6863892062613187e-05, "loss": 0.0755, "num_input_tokens_seen": 139868560, "step": 64810 }, { "epoch": 10.573409461663948, "grad_norm": 0.15406426787376404, "learning_rate": 2.6860342957802887e-05, "loss": 0.0385, "num_input_tokens_seen": 139880592, "step": 64815 }, { "epoch": 10.574225122349104, "grad_norm": 1.7936930656433105, "learning_rate": 2.685679381529037e-05, "loss": 0.1908, "num_input_tokens_seen": 139891440, "step": 64820 }, { "epoch": 10.575040783034257, "grad_norm": 0.04991921782493591, "learning_rate": 2.6853244635147574e-05, "loss": 0.1997, "num_input_tokens_seen": 139901904, "step": 64825 }, { "epoch": 10.575856443719413, "grad_norm": 0.07881487905979156, "learning_rate": 2.684969541744642e-05, "loss": 0.0193, "num_input_tokens_seen": 139913936, "step": 64830 }, { "epoch": 10.576672104404567, "grad_norm": 2.264988899230957, "learning_rate": 2.6846146162258844e-05, "loss": 0.1406, "num_input_tokens_seen": 139926224, "step": 64835 }, { "epoch": 10.577487765089723, "grad_norm": 0.1307251900434494, "learning_rate": 2.684259686965678e-05, "loss": 0.1858, "num_input_tokens_seen": 139936976, "step": 64840 }, { "epoch": 10.578303425774878, "grad_norm": 0.3358730673789978, "learning_rate": 2.6839047539712142e-05, "loss": 0.1522, "num_input_tokens_seen": 139947056, "step": 64845 }, { "epoch": 10.579119086460032, "grad_norm": 1.0541383028030396, "learning_rate": 2.6835498172496876e-05, "loss": 0.1673, "num_input_tokens_seen": 139957456, "step": 64850 }, { "epoch": 10.579934747145188, "grad_norm": 0.564172089099884, "learning_rate": 2.6831948768082915e-05, "loss": 0.1203, "num_input_tokens_seen": 139967952, "step": 64855 }, { "epoch": 10.580750407830342, "grad_norm": 0.17340245842933655, "learning_rate": 2.6828399326542185e-05, "loss": 0.0646, "num_input_tokens_seen": 139977776, "step": 64860 }, { "epoch": 10.581566068515498, "grad_norm": 0.4544779360294342, "learning_rate": 2.6824849847946627e-05, "loss": 0.082, "num_input_tokens_seen": 139989136, "step": 64865 }, { "epoch": 10.582381729200652, "grad_norm": 0.05695665255188942, "learning_rate": 2.682130033236817e-05, "loss": 0.2361, "num_input_tokens_seen": 139999504, "step": 64870 }, { "epoch": 10.583197389885807, "grad_norm": 0.08153241872787476, "learning_rate": 2.6817750779878748e-05, "loss": 0.1865, "num_input_tokens_seen": 140011024, "step": 64875 }, { "epoch": 10.584013050570963, "grad_norm": 0.07594533264636993, "learning_rate": 2.6814201190550305e-05, "loss": 0.1175, "num_input_tokens_seen": 140021328, "step": 64880 }, { "epoch": 10.584828711256117, "grad_norm": 0.7583290934562683, "learning_rate": 2.6810651564454774e-05, "loss": 0.0762, "num_input_tokens_seen": 140033008, "step": 64885 }, { "epoch": 10.585644371941273, "grad_norm": 1.4445582628250122, "learning_rate": 2.6807101901664088e-05, "loss": 0.1037, "num_input_tokens_seen": 140042832, "step": 64890 }, { "epoch": 10.586460032626427, "grad_norm": 0.8067874908447266, "learning_rate": 2.6803552202250188e-05, "loss": 0.0939, "num_input_tokens_seen": 140053296, "step": 64895 }, { "epoch": 10.587275693311582, "grad_norm": 0.0684249997138977, "learning_rate": 2.6800002466285017e-05, "loss": 0.0928, "num_input_tokens_seen": 140064528, "step": 64900 }, { "epoch": 10.588091353996738, "grad_norm": 1.3687502145767212, "learning_rate": 2.6796452693840512e-05, "loss": 0.189, "num_input_tokens_seen": 140074576, "step": 64905 }, { "epoch": 10.588907014681892, "grad_norm": 1.2337357997894287, "learning_rate": 2.679290288498862e-05, "loss": 0.1748, "num_input_tokens_seen": 140084368, "step": 64910 }, { "epoch": 10.589722675367048, "grad_norm": 0.1075628399848938, "learning_rate": 2.6789353039801275e-05, "loss": 0.0347, "num_input_tokens_seen": 140094480, "step": 64915 }, { "epoch": 10.590538336052202, "grad_norm": 1.8046988248825073, "learning_rate": 2.6785803158350416e-05, "loss": 0.1665, "num_input_tokens_seen": 140105776, "step": 64920 }, { "epoch": 10.591353996737357, "grad_norm": 0.0870145633816719, "learning_rate": 2.6782253240707994e-05, "loss": 0.0564, "num_input_tokens_seen": 140116400, "step": 64925 }, { "epoch": 10.592169657422513, "grad_norm": 0.0886254534125328, "learning_rate": 2.6778703286945943e-05, "loss": 0.0528, "num_input_tokens_seen": 140127632, "step": 64930 }, { "epoch": 10.592985318107667, "grad_norm": 0.08519622683525085, "learning_rate": 2.6775153297136223e-05, "loss": 0.0592, "num_input_tokens_seen": 140137072, "step": 64935 }, { "epoch": 10.593800978792823, "grad_norm": 0.3508817255496979, "learning_rate": 2.6771603271350763e-05, "loss": 0.2083, "num_input_tokens_seen": 140147088, "step": 64940 }, { "epoch": 10.594616639477977, "grad_norm": 0.38159993290901184, "learning_rate": 2.676805320966152e-05, "loss": 0.1202, "num_input_tokens_seen": 140158064, "step": 64945 }, { "epoch": 10.595432300163132, "grad_norm": 0.0683908760547638, "learning_rate": 2.6764503112140426e-05, "loss": 0.0976, "num_input_tokens_seen": 140168688, "step": 64950 }, { "epoch": 10.596247960848288, "grad_norm": 0.08336813747882843, "learning_rate": 2.6760952978859445e-05, "loss": 0.0367, "num_input_tokens_seen": 140179024, "step": 64955 }, { "epoch": 10.597063621533442, "grad_norm": 1.7622908353805542, "learning_rate": 2.675740280989052e-05, "loss": 0.297, "num_input_tokens_seen": 140190064, "step": 64960 }, { "epoch": 10.597879282218598, "grad_norm": 1.0336883068084717, "learning_rate": 2.675385260530559e-05, "loss": 0.0614, "num_input_tokens_seen": 140202160, "step": 64965 }, { "epoch": 10.598694942903752, "grad_norm": 0.035801198333501816, "learning_rate": 2.675030236517662e-05, "loss": 0.0591, "num_input_tokens_seen": 140212304, "step": 64970 }, { "epoch": 10.599510603588907, "grad_norm": 0.13536714017391205, "learning_rate": 2.674675208957555e-05, "loss": 0.1287, "num_input_tokens_seen": 140223152, "step": 64975 }, { "epoch": 10.600326264274061, "grad_norm": 0.21871505677700043, "learning_rate": 2.6743201778574333e-05, "loss": 0.106, "num_input_tokens_seen": 140233872, "step": 64980 }, { "epoch": 10.601141924959217, "grad_norm": 1.2254663705825806, "learning_rate": 2.673965143224492e-05, "loss": 0.2604, "num_input_tokens_seen": 140244464, "step": 64985 }, { "epoch": 10.601957585644373, "grad_norm": 0.4616449773311615, "learning_rate": 2.673610105065926e-05, "loss": 0.0805, "num_input_tokens_seen": 140253552, "step": 64990 }, { "epoch": 10.602773246329527, "grad_norm": 1.7046794891357422, "learning_rate": 2.673255063388932e-05, "loss": 0.2146, "num_input_tokens_seen": 140264144, "step": 64995 }, { "epoch": 10.603588907014682, "grad_norm": 1.0739599466323853, "learning_rate": 2.6729000182007036e-05, "loss": 0.1212, "num_input_tokens_seen": 140272784, "step": 65000 }, { "epoch": 10.604404567699836, "grad_norm": 0.7822679877281189, "learning_rate": 2.672544969508437e-05, "loss": 0.1292, "num_input_tokens_seen": 140282768, "step": 65005 }, { "epoch": 10.605220228384992, "grad_norm": 0.03106611594557762, "learning_rate": 2.6721899173193284e-05, "loss": 0.0109, "num_input_tokens_seen": 140291952, "step": 65010 }, { "epoch": 10.606035889070148, "grad_norm": 0.053424276411533356, "learning_rate": 2.671834861640572e-05, "loss": 0.2271, "num_input_tokens_seen": 140303568, "step": 65015 }, { "epoch": 10.606851549755302, "grad_norm": 0.1895991712808609, "learning_rate": 2.6714798024793642e-05, "loss": 0.1473, "num_input_tokens_seen": 140314960, "step": 65020 }, { "epoch": 10.607667210440457, "grad_norm": 0.8324997425079346, "learning_rate": 2.6711247398429006e-05, "loss": 0.059, "num_input_tokens_seen": 140325936, "step": 65025 }, { "epoch": 10.608482871125611, "grad_norm": 0.06486763805150986, "learning_rate": 2.670769673738377e-05, "loss": 0.0138, "num_input_tokens_seen": 140337392, "step": 65030 }, { "epoch": 10.609298531810767, "grad_norm": 2.572834014892578, "learning_rate": 2.6704146041729895e-05, "loss": 0.0916, "num_input_tokens_seen": 140349488, "step": 65035 }, { "epoch": 10.61011419249592, "grad_norm": 2.3603508472442627, "learning_rate": 2.6700595311539334e-05, "loss": 0.1977, "num_input_tokens_seen": 140362160, "step": 65040 }, { "epoch": 10.610929853181077, "grad_norm": 0.12398417294025421, "learning_rate": 2.669704454688406e-05, "loss": 0.2491, "num_input_tokens_seen": 140373904, "step": 65045 }, { "epoch": 10.611745513866232, "grad_norm": 0.7558384537696838, "learning_rate": 2.669349374783602e-05, "loss": 0.0641, "num_input_tokens_seen": 140384240, "step": 65050 }, { "epoch": 10.612561174551386, "grad_norm": 0.3973892033100128, "learning_rate": 2.668994291446718e-05, "loss": 0.1541, "num_input_tokens_seen": 140396144, "step": 65055 }, { "epoch": 10.613376835236542, "grad_norm": 0.06520067900419235, "learning_rate": 2.6686392046849508e-05, "loss": 0.0206, "num_input_tokens_seen": 140407536, "step": 65060 }, { "epoch": 10.614192495921696, "grad_norm": 0.09856624901294708, "learning_rate": 2.6682841145054955e-05, "loss": 0.0145, "num_input_tokens_seen": 140417616, "step": 65065 }, { "epoch": 10.615008156606851, "grad_norm": 0.4153487980365753, "learning_rate": 2.6679290209155494e-05, "loss": 0.0517, "num_input_tokens_seen": 140429072, "step": 65070 }, { "epoch": 10.615823817292007, "grad_norm": 0.23301491141319275, "learning_rate": 2.667573923922309e-05, "loss": 0.0785, "num_input_tokens_seen": 140440016, "step": 65075 }, { "epoch": 10.616639477977161, "grad_norm": 0.9313338994979858, "learning_rate": 2.6672188235329702e-05, "loss": 0.1526, "num_input_tokens_seen": 140450672, "step": 65080 }, { "epoch": 10.617455138662317, "grad_norm": 0.07175803929567337, "learning_rate": 2.6668637197547296e-05, "loss": 0.0628, "num_input_tokens_seen": 140460944, "step": 65085 }, { "epoch": 10.61827079934747, "grad_norm": 0.3389871120452881, "learning_rate": 2.6665086125947848e-05, "loss": 0.1037, "num_input_tokens_seen": 140471280, "step": 65090 }, { "epoch": 10.619086460032626, "grad_norm": 0.10248202830553055, "learning_rate": 2.6661535020603312e-05, "loss": 0.1256, "num_input_tokens_seen": 140481584, "step": 65095 }, { "epoch": 10.619902120717782, "grad_norm": 1.336905837059021, "learning_rate": 2.6657983881585662e-05, "loss": 0.0516, "num_input_tokens_seen": 140493104, "step": 65100 }, { "epoch": 10.620717781402936, "grad_norm": 0.17211991548538208, "learning_rate": 2.6654432708966866e-05, "loss": 0.1131, "num_input_tokens_seen": 140503952, "step": 65105 }, { "epoch": 10.621533442088092, "grad_norm": 0.7252758145332336, "learning_rate": 2.665088150281889e-05, "loss": 0.1321, "num_input_tokens_seen": 140514256, "step": 65110 }, { "epoch": 10.622349102773246, "grad_norm": 1.392928957939148, "learning_rate": 2.6647330263213703e-05, "loss": 0.1031, "num_input_tokens_seen": 140523984, "step": 65115 }, { "epoch": 10.623164763458401, "grad_norm": 1.2309141159057617, "learning_rate": 2.6643778990223288e-05, "loss": 0.0675, "num_input_tokens_seen": 140535504, "step": 65120 }, { "epoch": 10.623980424143557, "grad_norm": 0.43227073550224304, "learning_rate": 2.66402276839196e-05, "loss": 0.1153, "num_input_tokens_seen": 140547120, "step": 65125 }, { "epoch": 10.624796084828711, "grad_norm": 0.13062311708927155, "learning_rate": 2.6636676344374617e-05, "loss": 0.1166, "num_input_tokens_seen": 140557200, "step": 65130 }, { "epoch": 10.625611745513867, "grad_norm": 0.028988327831029892, "learning_rate": 2.6633124971660316e-05, "loss": 0.0768, "num_input_tokens_seen": 140567696, "step": 65135 }, { "epoch": 10.62642740619902, "grad_norm": 0.050070345401763916, "learning_rate": 2.6629573565848664e-05, "loss": 0.1239, "num_input_tokens_seen": 140578224, "step": 65140 }, { "epoch": 10.627243066884176, "grad_norm": 2.6791884899139404, "learning_rate": 2.6626022127011634e-05, "loss": 0.1819, "num_input_tokens_seen": 140588304, "step": 65145 }, { "epoch": 10.62805872756933, "grad_norm": 0.2613397240638733, "learning_rate": 2.6622470655221204e-05, "loss": 0.1098, "num_input_tokens_seen": 140598288, "step": 65150 }, { "epoch": 10.628874388254486, "grad_norm": 1.8194618225097656, "learning_rate": 2.6618919150549348e-05, "loss": 0.1821, "num_input_tokens_seen": 140608080, "step": 65155 }, { "epoch": 10.629690048939642, "grad_norm": 0.07459208369255066, "learning_rate": 2.6615367613068043e-05, "loss": 0.0981, "num_input_tokens_seen": 140618512, "step": 65160 }, { "epoch": 10.630505709624796, "grad_norm": 0.8034950494766235, "learning_rate": 2.6611816042849258e-05, "loss": 0.1969, "num_input_tokens_seen": 140630096, "step": 65165 }, { "epoch": 10.631321370309951, "grad_norm": 1.065717101097107, "learning_rate": 2.660826443996498e-05, "loss": 0.088, "num_input_tokens_seen": 140640368, "step": 65170 }, { "epoch": 10.632137030995105, "grad_norm": 0.16089002788066864, "learning_rate": 2.660471280448718e-05, "loss": 0.0496, "num_input_tokens_seen": 140651408, "step": 65175 }, { "epoch": 10.632952691680261, "grad_norm": 1.2744042873382568, "learning_rate": 2.6601161136487845e-05, "loss": 0.0485, "num_input_tokens_seen": 140661296, "step": 65180 }, { "epoch": 10.633768352365417, "grad_norm": 0.06989675760269165, "learning_rate": 2.659760943603895e-05, "loss": 0.0711, "num_input_tokens_seen": 140671088, "step": 65185 }, { "epoch": 10.63458401305057, "grad_norm": 0.040145013481378555, "learning_rate": 2.6594057703212468e-05, "loss": 0.0595, "num_input_tokens_seen": 140682288, "step": 65190 }, { "epoch": 10.635399673735726, "grad_norm": 1.274838924407959, "learning_rate": 2.6590505938080386e-05, "loss": 0.26, "num_input_tokens_seen": 140692080, "step": 65195 }, { "epoch": 10.63621533442088, "grad_norm": 0.7972574234008789, "learning_rate": 2.6586954140714682e-05, "loss": 0.1467, "num_input_tokens_seen": 140702640, "step": 65200 }, { "epoch": 10.637030995106036, "grad_norm": 0.0819842591881752, "learning_rate": 2.6583402311187344e-05, "loss": 0.2561, "num_input_tokens_seen": 140712880, "step": 65205 }, { "epoch": 10.63784665579119, "grad_norm": 0.8712549209594727, "learning_rate": 2.6579850449570336e-05, "loss": 0.0527, "num_input_tokens_seen": 140724496, "step": 65210 }, { "epoch": 10.638662316476346, "grad_norm": 0.1755421757698059, "learning_rate": 2.6576298555935668e-05, "loss": 0.0946, "num_input_tokens_seen": 140734096, "step": 65215 }, { "epoch": 10.639477977161501, "grad_norm": 0.19381418824195862, "learning_rate": 2.6572746630355305e-05, "loss": 0.0497, "num_input_tokens_seen": 140745136, "step": 65220 }, { "epoch": 10.640293637846655, "grad_norm": 0.1343945860862732, "learning_rate": 2.656919467290124e-05, "loss": 0.0535, "num_input_tokens_seen": 140755088, "step": 65225 }, { "epoch": 10.641109298531811, "grad_norm": 1.2551634311676025, "learning_rate": 2.6565642683645452e-05, "loss": 0.0581, "num_input_tokens_seen": 140766416, "step": 65230 }, { "epoch": 10.641924959216965, "grad_norm": 1.3459938764572144, "learning_rate": 2.6562090662659926e-05, "loss": 0.0591, "num_input_tokens_seen": 140777520, "step": 65235 }, { "epoch": 10.64274061990212, "grad_norm": 1.327182650566101, "learning_rate": 2.655853861001666e-05, "loss": 0.0408, "num_input_tokens_seen": 140787632, "step": 65240 }, { "epoch": 10.643556280587276, "grad_norm": 1.2670457363128662, "learning_rate": 2.6554986525787622e-05, "loss": 0.0723, "num_input_tokens_seen": 140798896, "step": 65245 }, { "epoch": 10.64437194127243, "grad_norm": 1.3203973770141602, "learning_rate": 2.6551434410044812e-05, "loss": 0.2562, "num_input_tokens_seen": 140809232, "step": 65250 }, { "epoch": 10.645187601957586, "grad_norm": 1.5967150926589966, "learning_rate": 2.654788226286022e-05, "loss": 0.1347, "num_input_tokens_seen": 140821776, "step": 65255 }, { "epoch": 10.64600326264274, "grad_norm": 0.7623922824859619, "learning_rate": 2.654433008430583e-05, "loss": 0.0967, "num_input_tokens_seen": 140831856, "step": 65260 }, { "epoch": 10.646818923327896, "grad_norm": 1.3726102113723755, "learning_rate": 2.6540777874453625e-05, "loss": 0.131, "num_input_tokens_seen": 140842896, "step": 65265 }, { "epoch": 10.647634584013051, "grad_norm": 0.4008682668209076, "learning_rate": 2.65372256333756e-05, "loss": 0.0137, "num_input_tokens_seen": 140852336, "step": 65270 }, { "epoch": 10.648450244698205, "grad_norm": 3.1053988933563232, "learning_rate": 2.653367336114376e-05, "loss": 0.1383, "num_input_tokens_seen": 140864816, "step": 65275 }, { "epoch": 10.649265905383361, "grad_norm": 0.07882755994796753, "learning_rate": 2.653012105783007e-05, "loss": 0.0338, "num_input_tokens_seen": 140875088, "step": 65280 }, { "epoch": 10.650081566068515, "grad_norm": 0.7525316476821899, "learning_rate": 2.6526568723506545e-05, "loss": 0.2568, "num_input_tokens_seen": 140886544, "step": 65285 }, { "epoch": 10.65089722675367, "grad_norm": 0.17894664406776428, "learning_rate": 2.652301635824517e-05, "loss": 0.0494, "num_input_tokens_seen": 140897008, "step": 65290 }, { "epoch": 10.651712887438826, "grad_norm": 1.457327127456665, "learning_rate": 2.651946396211793e-05, "loss": 0.0751, "num_input_tokens_seen": 140906960, "step": 65295 }, { "epoch": 10.65252854812398, "grad_norm": 1.6090651750564575, "learning_rate": 2.6515911535196824e-05, "loss": 0.2567, "num_input_tokens_seen": 140917808, "step": 65300 }, { "epoch": 10.653344208809136, "grad_norm": 1.618809700012207, "learning_rate": 2.6512359077553854e-05, "loss": 0.1401, "num_input_tokens_seen": 140929168, "step": 65305 }, { "epoch": 10.65415986949429, "grad_norm": 0.4027964472770691, "learning_rate": 2.6508806589261003e-05, "loss": 0.0324, "num_input_tokens_seen": 140938832, "step": 65310 }, { "epoch": 10.654975530179446, "grad_norm": 0.5608228445053101, "learning_rate": 2.6505254070390277e-05, "loss": 0.0752, "num_input_tokens_seen": 140949328, "step": 65315 }, { "epoch": 10.655791190864601, "grad_norm": 0.06199905276298523, "learning_rate": 2.6501701521013666e-05, "loss": 0.0793, "num_input_tokens_seen": 140958256, "step": 65320 }, { "epoch": 10.656606851549755, "grad_norm": 0.09519942104816437, "learning_rate": 2.649814894120317e-05, "loss": 0.0952, "num_input_tokens_seen": 140968848, "step": 65325 }, { "epoch": 10.65742251223491, "grad_norm": 2.4416213035583496, "learning_rate": 2.6494596331030787e-05, "loss": 0.1618, "num_input_tokens_seen": 140979760, "step": 65330 }, { "epoch": 10.658238172920065, "grad_norm": 0.08998853713274002, "learning_rate": 2.6491043690568508e-05, "loss": 0.0449, "num_input_tokens_seen": 140990960, "step": 65335 }, { "epoch": 10.65905383360522, "grad_norm": 0.6272014379501343, "learning_rate": 2.648749101988834e-05, "loss": 0.0325, "num_input_tokens_seen": 141002960, "step": 65340 }, { "epoch": 10.659869494290374, "grad_norm": 0.5701807141304016, "learning_rate": 2.6483938319062278e-05, "loss": 0.0978, "num_input_tokens_seen": 141013360, "step": 65345 }, { "epoch": 10.66068515497553, "grad_norm": 0.04831570386886597, "learning_rate": 2.648038558816232e-05, "loss": 0.0959, "num_input_tokens_seen": 141025264, "step": 65350 }, { "epoch": 10.661500815660686, "grad_norm": 0.32810238003730774, "learning_rate": 2.6476832827260472e-05, "loss": 0.03, "num_input_tokens_seen": 141036048, "step": 65355 }, { "epoch": 10.66231647634584, "grad_norm": 0.22669470310211182, "learning_rate": 2.6473280036428737e-05, "loss": 0.1427, "num_input_tokens_seen": 141047120, "step": 65360 }, { "epoch": 10.663132137030995, "grad_norm": 1.5197396278381348, "learning_rate": 2.646972721573911e-05, "loss": 0.1921, "num_input_tokens_seen": 141056464, "step": 65365 }, { "epoch": 10.66394779771615, "grad_norm": 0.1516294926404953, "learning_rate": 2.6466174365263602e-05, "loss": 0.0311, "num_input_tokens_seen": 141066992, "step": 65370 }, { "epoch": 10.664763458401305, "grad_norm": 0.7767549753189087, "learning_rate": 2.6462621485074207e-05, "loss": 0.0794, "num_input_tokens_seen": 141077008, "step": 65375 }, { "epoch": 10.66557911908646, "grad_norm": 2.1078684329986572, "learning_rate": 2.6459068575242922e-05, "loss": 0.185, "num_input_tokens_seen": 141086160, "step": 65380 }, { "epoch": 10.666394779771615, "grad_norm": 0.15878330171108246, "learning_rate": 2.645551563584177e-05, "loss": 0.1373, "num_input_tokens_seen": 141096560, "step": 65385 }, { "epoch": 10.66721044045677, "grad_norm": 0.8622483015060425, "learning_rate": 2.6451962666942755e-05, "loss": 0.1445, "num_input_tokens_seen": 141108304, "step": 65390 }, { "epoch": 10.668026101141924, "grad_norm": 0.5302728414535522, "learning_rate": 2.644840966861786e-05, "loss": 0.0332, "num_input_tokens_seen": 141118256, "step": 65395 }, { "epoch": 10.66884176182708, "grad_norm": 0.9855493903160095, "learning_rate": 2.644485664093912e-05, "loss": 0.0367, "num_input_tokens_seen": 141129456, "step": 65400 }, { "epoch": 10.669657422512234, "grad_norm": 0.20876020193099976, "learning_rate": 2.6441303583978523e-05, "loss": 0.0275, "num_input_tokens_seen": 141140208, "step": 65405 }, { "epoch": 10.67047308319739, "grad_norm": 0.9268416166305542, "learning_rate": 2.6437750497808073e-05, "loss": 0.0421, "num_input_tokens_seen": 141152144, "step": 65410 }, { "epoch": 10.671288743882545, "grad_norm": 2.4886598587036133, "learning_rate": 2.643419738249979e-05, "loss": 0.113, "num_input_tokens_seen": 141163696, "step": 65415 }, { "epoch": 10.6721044045677, "grad_norm": 1.028226375579834, "learning_rate": 2.643064423812568e-05, "loss": 0.1036, "num_input_tokens_seen": 141173488, "step": 65420 }, { "epoch": 10.672920065252855, "grad_norm": 0.842592716217041, "learning_rate": 2.6427091064757748e-05, "loss": 0.1992, "num_input_tokens_seen": 141182320, "step": 65425 }, { "epoch": 10.673735725938009, "grad_norm": 0.16584299504756927, "learning_rate": 2.6423537862468005e-05, "loss": 0.206, "num_input_tokens_seen": 141191824, "step": 65430 }, { "epoch": 10.674551386623165, "grad_norm": 0.29311010241508484, "learning_rate": 2.641998463132846e-05, "loss": 0.1661, "num_input_tokens_seen": 141202864, "step": 65435 }, { "epoch": 10.67536704730832, "grad_norm": 0.19487516582012177, "learning_rate": 2.6416431371411128e-05, "loss": 0.0634, "num_input_tokens_seen": 141213968, "step": 65440 }, { "epoch": 10.676182707993474, "grad_norm": 0.19535496830940247, "learning_rate": 2.6412878082788017e-05, "loss": 0.0606, "num_input_tokens_seen": 141224624, "step": 65445 }, { "epoch": 10.67699836867863, "grad_norm": 0.06691059470176697, "learning_rate": 2.6409324765531136e-05, "loss": 0.0673, "num_input_tokens_seen": 141236336, "step": 65450 }, { "epoch": 10.677814029363784, "grad_norm": 0.05274812504649162, "learning_rate": 2.6405771419712504e-05, "loss": 0.1721, "num_input_tokens_seen": 141247408, "step": 65455 }, { "epoch": 10.67862969004894, "grad_norm": 1.5523566007614136, "learning_rate": 2.6402218045404132e-05, "loss": 0.2539, "num_input_tokens_seen": 141259760, "step": 65460 }, { "epoch": 10.679445350734095, "grad_norm": 0.5293524265289307, "learning_rate": 2.639866464267803e-05, "loss": 0.0749, "num_input_tokens_seen": 141270960, "step": 65465 }, { "epoch": 10.68026101141925, "grad_norm": 1.9086134433746338, "learning_rate": 2.639511121160621e-05, "loss": 0.1024, "num_input_tokens_seen": 141280976, "step": 65470 }, { "epoch": 10.681076672104405, "grad_norm": 0.7868368029594421, "learning_rate": 2.63915577522607e-05, "loss": 0.0229, "num_input_tokens_seen": 141290320, "step": 65475 }, { "epoch": 10.681892332789559, "grad_norm": 0.10241437703371048, "learning_rate": 2.63880042647135e-05, "loss": 0.0195, "num_input_tokens_seen": 141301488, "step": 65480 }, { "epoch": 10.682707993474715, "grad_norm": 0.6022205352783203, "learning_rate": 2.6384450749036634e-05, "loss": 0.084, "num_input_tokens_seen": 141312368, "step": 65485 }, { "epoch": 10.68352365415987, "grad_norm": 0.7159411907196045, "learning_rate": 2.638089720530212e-05, "loss": 0.0448, "num_input_tokens_seen": 141323760, "step": 65490 }, { "epoch": 10.684339314845024, "grad_norm": 0.13663624227046967, "learning_rate": 2.637734363358197e-05, "loss": 0.1356, "num_input_tokens_seen": 141335760, "step": 65495 }, { "epoch": 10.68515497553018, "grad_norm": 0.051233213394880295, "learning_rate": 2.6373790033948204e-05, "loss": 0.0568, "num_input_tokens_seen": 141346704, "step": 65500 }, { "epoch": 10.685970636215334, "grad_norm": 0.041402529925107956, "learning_rate": 2.637023640647284e-05, "loss": 0.0378, "num_input_tokens_seen": 141357840, "step": 65505 }, { "epoch": 10.68678629690049, "grad_norm": 0.3703213930130005, "learning_rate": 2.63666827512279e-05, "loss": 0.1232, "num_input_tokens_seen": 141368144, "step": 65510 }, { "epoch": 10.687601957585644, "grad_norm": 0.6854356527328491, "learning_rate": 2.63631290682854e-05, "loss": 0.0581, "num_input_tokens_seen": 141379248, "step": 65515 }, { "epoch": 10.6884176182708, "grad_norm": 0.46711522340774536, "learning_rate": 2.6359575357717354e-05, "loss": 0.1262, "num_input_tokens_seen": 141390544, "step": 65520 }, { "epoch": 10.689233278955955, "grad_norm": 1.7422821521759033, "learning_rate": 2.6356021619595796e-05, "loss": 0.3799, "num_input_tokens_seen": 141400976, "step": 65525 }, { "epoch": 10.690048939641109, "grad_norm": 2.1867928504943848, "learning_rate": 2.6352467853992734e-05, "loss": 0.2541, "num_input_tokens_seen": 141410608, "step": 65530 }, { "epoch": 10.690864600326265, "grad_norm": 0.059223901480436325, "learning_rate": 2.6348914060980202e-05, "loss": 0.0467, "num_input_tokens_seen": 141420816, "step": 65535 }, { "epoch": 10.691680261011419, "grad_norm": 0.4034169018268585, "learning_rate": 2.6345360240630206e-05, "loss": 0.0315, "num_input_tokens_seen": 141431920, "step": 65540 }, { "epoch": 10.692495921696574, "grad_norm": 0.1921885907649994, "learning_rate": 2.634180639301479e-05, "loss": 0.0584, "num_input_tokens_seen": 141441136, "step": 65545 }, { "epoch": 10.69331158238173, "grad_norm": 0.8952107429504395, "learning_rate": 2.633825251820596e-05, "loss": 0.0768, "num_input_tokens_seen": 141451760, "step": 65550 }, { "epoch": 10.694127243066884, "grad_norm": 1.567111849784851, "learning_rate": 2.633469861627574e-05, "loss": 0.1053, "num_input_tokens_seen": 141462384, "step": 65555 }, { "epoch": 10.69494290375204, "grad_norm": 0.031513430178165436, "learning_rate": 2.6331144687296167e-05, "loss": 0.0395, "num_input_tokens_seen": 141471984, "step": 65560 }, { "epoch": 10.695758564437194, "grad_norm": 0.6814562678337097, "learning_rate": 2.6327590731339252e-05, "loss": 0.1221, "num_input_tokens_seen": 141482384, "step": 65565 }, { "epoch": 10.69657422512235, "grad_norm": 0.1482807695865631, "learning_rate": 2.632403674847703e-05, "loss": 0.0603, "num_input_tokens_seen": 141492848, "step": 65570 }, { "epoch": 10.697389885807503, "grad_norm": 0.24553586542606354, "learning_rate": 2.632048273878152e-05, "loss": 0.1242, "num_input_tokens_seen": 141502864, "step": 65575 }, { "epoch": 10.698205546492659, "grad_norm": 0.019333556294441223, "learning_rate": 2.6316928702324756e-05, "loss": 0.0106, "num_input_tokens_seen": 141513296, "step": 65580 }, { "epoch": 10.699021207177815, "grad_norm": 1.6275826692581177, "learning_rate": 2.6313374639178763e-05, "loss": 0.0689, "num_input_tokens_seen": 141524496, "step": 65585 }, { "epoch": 10.699836867862969, "grad_norm": 0.8856335878372192, "learning_rate": 2.630982054941556e-05, "loss": 0.0895, "num_input_tokens_seen": 141534576, "step": 65590 }, { "epoch": 10.700652528548124, "grad_norm": 0.07258090376853943, "learning_rate": 2.6306266433107187e-05, "loss": 0.1061, "num_input_tokens_seen": 141545104, "step": 65595 }, { "epoch": 10.701468189233278, "grad_norm": 0.02217632159590721, "learning_rate": 2.6302712290325666e-05, "loss": 0.1029, "num_input_tokens_seen": 141556528, "step": 65600 }, { "epoch": 10.702283849918434, "grad_norm": 0.9769904017448425, "learning_rate": 2.6299158121143024e-05, "loss": 0.0858, "num_input_tokens_seen": 141567600, "step": 65605 }, { "epoch": 10.70309951060359, "grad_norm": 2.069478750228882, "learning_rate": 2.62956039256313e-05, "loss": 0.1231, "num_input_tokens_seen": 141579088, "step": 65610 }, { "epoch": 10.703915171288743, "grad_norm": 0.09849763661623001, "learning_rate": 2.629204970386252e-05, "loss": 0.1317, "num_input_tokens_seen": 141589808, "step": 65615 }, { "epoch": 10.7047308319739, "grad_norm": 0.2631145417690277, "learning_rate": 2.628849545590871e-05, "loss": 0.065, "num_input_tokens_seen": 141600592, "step": 65620 }, { "epoch": 10.705546492659053, "grad_norm": 0.06191059574484825, "learning_rate": 2.628494118184191e-05, "loss": 0.0344, "num_input_tokens_seen": 141610928, "step": 65625 }, { "epoch": 10.706362153344209, "grad_norm": 0.05083785951137543, "learning_rate": 2.6281386881734143e-05, "loss": 0.0844, "num_input_tokens_seen": 141622384, "step": 65630 }, { "epoch": 10.707177814029365, "grad_norm": 0.12290488928556442, "learning_rate": 2.6277832555657444e-05, "loss": 0.0117, "num_input_tokens_seen": 141632144, "step": 65635 }, { "epoch": 10.707993474714518, "grad_norm": 0.09122046083211899, "learning_rate": 2.627427820368385e-05, "loss": 0.1142, "num_input_tokens_seen": 141642608, "step": 65640 }, { "epoch": 10.708809135399674, "grad_norm": 0.04179667681455612, "learning_rate": 2.627072382588539e-05, "loss": 0.0173, "num_input_tokens_seen": 141654064, "step": 65645 }, { "epoch": 10.709624796084828, "grad_norm": 0.5932027101516724, "learning_rate": 2.6267169422334104e-05, "loss": 0.2368, "num_input_tokens_seen": 141663984, "step": 65650 }, { "epoch": 10.710440456769984, "grad_norm": 0.7071725726127625, "learning_rate": 2.6263614993102016e-05, "loss": 0.0778, "num_input_tokens_seen": 141673104, "step": 65655 }, { "epoch": 10.71125611745514, "grad_norm": 0.3533751368522644, "learning_rate": 2.626006053826117e-05, "loss": 0.146, "num_input_tokens_seen": 141684784, "step": 65660 }, { "epoch": 10.712071778140293, "grad_norm": 0.27809756994247437, "learning_rate": 2.6256506057883594e-05, "loss": 0.0169, "num_input_tokens_seen": 141696560, "step": 65665 }, { "epoch": 10.71288743882545, "grad_norm": 2.7437403202056885, "learning_rate": 2.6252951552041333e-05, "loss": 0.193, "num_input_tokens_seen": 141707824, "step": 65670 }, { "epoch": 10.713703099510603, "grad_norm": 0.3099699318408966, "learning_rate": 2.624939702080642e-05, "loss": 0.0253, "num_input_tokens_seen": 141719088, "step": 65675 }, { "epoch": 10.714518760195759, "grad_norm": 0.17583125829696655, "learning_rate": 2.624584246425089e-05, "loss": 0.0199, "num_input_tokens_seen": 141729840, "step": 65680 }, { "epoch": 10.715334420880914, "grad_norm": 0.08397413790225983, "learning_rate": 2.624228788244678e-05, "loss": 0.1191, "num_input_tokens_seen": 141740720, "step": 65685 }, { "epoch": 10.716150081566068, "grad_norm": 0.08385391533374786, "learning_rate": 2.6238733275466136e-05, "loss": 0.1112, "num_input_tokens_seen": 141752752, "step": 65690 }, { "epoch": 10.716965742251224, "grad_norm": 2.5906918048858643, "learning_rate": 2.623517864338098e-05, "loss": 0.11, "num_input_tokens_seen": 141762896, "step": 65695 }, { "epoch": 10.717781402936378, "grad_norm": 0.415906697511673, "learning_rate": 2.623162398626337e-05, "loss": 0.0711, "num_input_tokens_seen": 141774224, "step": 65700 }, { "epoch": 10.718597063621534, "grad_norm": 0.5308367609977722, "learning_rate": 2.6228069304185336e-05, "loss": 0.0557, "num_input_tokens_seen": 141784688, "step": 65705 }, { "epoch": 10.719412724306688, "grad_norm": 0.23082391917705536, "learning_rate": 2.6224514597218924e-05, "loss": 0.0435, "num_input_tokens_seen": 141794768, "step": 65710 }, { "epoch": 10.720228384991843, "grad_norm": 1.5443685054779053, "learning_rate": 2.6220959865436168e-05, "loss": 0.2228, "num_input_tokens_seen": 141806480, "step": 65715 }, { "epoch": 10.721044045676999, "grad_norm": 0.20077434182167053, "learning_rate": 2.621740510890911e-05, "loss": 0.2748, "num_input_tokens_seen": 141817936, "step": 65720 }, { "epoch": 10.721859706362153, "grad_norm": 0.9266568422317505, "learning_rate": 2.6213850327709793e-05, "loss": 0.0781, "num_input_tokens_seen": 141828336, "step": 65725 }, { "epoch": 10.722675367047309, "grad_norm": 1.9321192502975464, "learning_rate": 2.6210295521910262e-05, "loss": 0.1024, "num_input_tokens_seen": 141839792, "step": 65730 }, { "epoch": 10.723491027732463, "grad_norm": 0.09129854291677475, "learning_rate": 2.6206740691582554e-05, "loss": 0.0647, "num_input_tokens_seen": 141851088, "step": 65735 }, { "epoch": 10.724306688417618, "grad_norm": 0.07246187329292297, "learning_rate": 2.6203185836798715e-05, "loss": 0.0339, "num_input_tokens_seen": 141861680, "step": 65740 }, { "epoch": 10.725122349102774, "grad_norm": 0.9394606351852417, "learning_rate": 2.619963095763079e-05, "loss": 0.0353, "num_input_tokens_seen": 141873008, "step": 65745 }, { "epoch": 10.725938009787928, "grad_norm": 0.4245823323726654, "learning_rate": 2.619607605415082e-05, "loss": 0.0232, "num_input_tokens_seen": 141884816, "step": 65750 }, { "epoch": 10.726753670473084, "grad_norm": 0.057648394256830215, "learning_rate": 2.6192521126430853e-05, "loss": 0.1197, "num_input_tokens_seen": 141895696, "step": 65755 }, { "epoch": 10.727569331158238, "grad_norm": 0.08797495067119598, "learning_rate": 2.618896617454293e-05, "loss": 0.0392, "num_input_tokens_seen": 141906512, "step": 65760 }, { "epoch": 10.728384991843393, "grad_norm": 0.03664983808994293, "learning_rate": 2.61854111985591e-05, "loss": 0.0637, "num_input_tokens_seen": 141917424, "step": 65765 }, { "epoch": 10.729200652528547, "grad_norm": 1.8992012739181519, "learning_rate": 2.6181856198551415e-05, "loss": 0.1279, "num_input_tokens_seen": 141927088, "step": 65770 }, { "epoch": 10.730016313213703, "grad_norm": 1.0911102294921875, "learning_rate": 2.6178301174591907e-05, "loss": 0.0775, "num_input_tokens_seen": 141938256, "step": 65775 }, { "epoch": 10.730831973898859, "grad_norm": 0.2887430489063263, "learning_rate": 2.617474612675263e-05, "loss": 0.1741, "num_input_tokens_seen": 141947952, "step": 65780 }, { "epoch": 10.731647634584013, "grad_norm": 1.7468881607055664, "learning_rate": 2.6171191055105636e-05, "loss": 0.172, "num_input_tokens_seen": 141958800, "step": 65785 }, { "epoch": 10.732463295269168, "grad_norm": 0.6308638453483582, "learning_rate": 2.616763595972297e-05, "loss": 0.1187, "num_input_tokens_seen": 141969296, "step": 65790 }, { "epoch": 10.733278955954322, "grad_norm": 0.5421092510223389, "learning_rate": 2.616408084067668e-05, "loss": 0.0424, "num_input_tokens_seen": 141980176, "step": 65795 }, { "epoch": 10.734094616639478, "grad_norm": 0.5521730184555054, "learning_rate": 2.616052569803881e-05, "loss": 0.0331, "num_input_tokens_seen": 141991472, "step": 65800 }, { "epoch": 10.734910277324634, "grad_norm": 0.2540993392467499, "learning_rate": 2.615697053188142e-05, "loss": 0.1168, "num_input_tokens_seen": 142001744, "step": 65805 }, { "epoch": 10.735725938009788, "grad_norm": 0.05179016292095184, "learning_rate": 2.6153415342276548e-05, "loss": 0.0676, "num_input_tokens_seen": 142011504, "step": 65810 }, { "epoch": 10.736541598694943, "grad_norm": 1.8249300718307495, "learning_rate": 2.614986012929626e-05, "loss": 0.1003, "num_input_tokens_seen": 142020784, "step": 65815 }, { "epoch": 10.737357259380097, "grad_norm": 0.13033367693424225, "learning_rate": 2.614630489301259e-05, "loss": 0.1014, "num_input_tokens_seen": 142031312, "step": 65820 }, { "epoch": 10.738172920065253, "grad_norm": 0.03458337113261223, "learning_rate": 2.6142749633497605e-05, "loss": 0.1365, "num_input_tokens_seen": 142042096, "step": 65825 }, { "epoch": 10.738988580750409, "grad_norm": 0.5010349750518799, "learning_rate": 2.6139194350823343e-05, "loss": 0.1019, "num_input_tokens_seen": 142052784, "step": 65830 }, { "epoch": 10.739804241435563, "grad_norm": 0.1552705466747284, "learning_rate": 2.613563904506186e-05, "loss": 0.0911, "num_input_tokens_seen": 142064432, "step": 65835 }, { "epoch": 10.740619902120718, "grad_norm": 0.016183247789740562, "learning_rate": 2.6132083716285216e-05, "loss": 0.0371, "num_input_tokens_seen": 142076048, "step": 65840 }, { "epoch": 10.741435562805872, "grad_norm": 0.19114579260349274, "learning_rate": 2.6128528364565456e-05, "loss": 0.0499, "num_input_tokens_seen": 142086992, "step": 65845 }, { "epoch": 10.742251223491028, "grad_norm": 1.241660237312317, "learning_rate": 2.6124972989974644e-05, "loss": 0.08, "num_input_tokens_seen": 142097616, "step": 65850 }, { "epoch": 10.743066884176184, "grad_norm": 0.5017013549804688, "learning_rate": 2.6121417592584823e-05, "loss": 0.021, "num_input_tokens_seen": 142109008, "step": 65855 }, { "epoch": 10.743882544861338, "grad_norm": 0.0905316025018692, "learning_rate": 2.611786217246805e-05, "loss": 0.0408, "num_input_tokens_seen": 142118448, "step": 65860 }, { "epoch": 10.744698205546493, "grad_norm": 0.9327778220176697, "learning_rate": 2.6114306729696385e-05, "loss": 0.227, "num_input_tokens_seen": 142130480, "step": 65865 }, { "epoch": 10.745513866231647, "grad_norm": 0.061806149780750275, "learning_rate": 2.6110751264341876e-05, "loss": 0.2122, "num_input_tokens_seen": 142141712, "step": 65870 }, { "epoch": 10.746329526916803, "grad_norm": 1.1693668365478516, "learning_rate": 2.610719577647659e-05, "loss": 0.0549, "num_input_tokens_seen": 142153232, "step": 65875 }, { "epoch": 10.747145187601957, "grad_norm": 0.1200244352221489, "learning_rate": 2.6103640266172573e-05, "loss": 0.1462, "num_input_tokens_seen": 142163664, "step": 65880 }, { "epoch": 10.747960848287113, "grad_norm": 0.060155946761369705, "learning_rate": 2.610008473350189e-05, "loss": 0.0325, "num_input_tokens_seen": 142174800, "step": 65885 }, { "epoch": 10.748776508972268, "grad_norm": 1.426317572593689, "learning_rate": 2.6096529178536587e-05, "loss": 0.1197, "num_input_tokens_seen": 142185776, "step": 65890 }, { "epoch": 10.749592169657422, "grad_norm": 0.2819140553474426, "learning_rate": 2.6092973601348735e-05, "loss": 0.0883, "num_input_tokens_seen": 142195856, "step": 65895 }, { "epoch": 10.750407830342578, "grad_norm": 0.032669227570295334, "learning_rate": 2.6089418002010386e-05, "loss": 0.063, "num_input_tokens_seen": 142206160, "step": 65900 }, { "epoch": 10.751223491027732, "grad_norm": 0.723630428314209, "learning_rate": 2.6085862380593594e-05, "loss": 0.2228, "num_input_tokens_seen": 142217040, "step": 65905 }, { "epoch": 10.752039151712887, "grad_norm": 0.24286575615406036, "learning_rate": 2.6082306737170425e-05, "loss": 0.0135, "num_input_tokens_seen": 142226384, "step": 65910 }, { "epoch": 10.752854812398043, "grad_norm": 0.2710131108760834, "learning_rate": 2.6078751071812942e-05, "loss": 0.1195, "num_input_tokens_seen": 142238288, "step": 65915 }, { "epoch": 10.753670473083197, "grad_norm": 0.029672857373952866, "learning_rate": 2.6075195384593192e-05, "loss": 0.0053, "num_input_tokens_seen": 142249040, "step": 65920 }, { "epoch": 10.754486133768353, "grad_norm": 1.7131767272949219, "learning_rate": 2.6071639675583253e-05, "loss": 0.2055, "num_input_tokens_seen": 142260208, "step": 65925 }, { "epoch": 10.755301794453507, "grad_norm": 0.022151345387101173, "learning_rate": 2.6068083944855172e-05, "loss": 0.1091, "num_input_tokens_seen": 142270320, "step": 65930 }, { "epoch": 10.756117455138662, "grad_norm": 0.15497885644435883, "learning_rate": 2.6064528192481015e-05, "loss": 0.1678, "num_input_tokens_seen": 142280560, "step": 65935 }, { "epoch": 10.756933115823816, "grad_norm": 2.356313467025757, "learning_rate": 2.6060972418532844e-05, "loss": 0.0841, "num_input_tokens_seen": 142290608, "step": 65940 }, { "epoch": 10.757748776508972, "grad_norm": 0.10960733890533447, "learning_rate": 2.605741662308272e-05, "loss": 0.0401, "num_input_tokens_seen": 142302128, "step": 65945 }, { "epoch": 10.758564437194128, "grad_norm": 0.9688495397567749, "learning_rate": 2.6053860806202705e-05, "loss": 0.0539, "num_input_tokens_seen": 142313232, "step": 65950 }, { "epoch": 10.759380097879282, "grad_norm": 0.066693976521492, "learning_rate": 2.6050304967964866e-05, "loss": 0.0914, "num_input_tokens_seen": 142323920, "step": 65955 }, { "epoch": 10.760195758564437, "grad_norm": 0.11653497070074081, "learning_rate": 2.604674910844127e-05, "loss": 0.0095, "num_input_tokens_seen": 142333424, "step": 65960 }, { "epoch": 10.761011419249591, "grad_norm": 0.03501233085989952, "learning_rate": 2.604319322770397e-05, "loss": 0.1796, "num_input_tokens_seen": 142344272, "step": 65965 }, { "epoch": 10.761827079934747, "grad_norm": 0.05439986288547516, "learning_rate": 2.6039637325825035e-05, "loss": 0.1495, "num_input_tokens_seen": 142355664, "step": 65970 }, { "epoch": 10.762642740619903, "grad_norm": 0.01532242726534605, "learning_rate": 2.6036081402876535e-05, "loss": 0.015, "num_input_tokens_seen": 142366704, "step": 65975 }, { "epoch": 10.763458401305057, "grad_norm": 0.03234870359301567, "learning_rate": 2.6032525458930518e-05, "loss": 0.0136, "num_input_tokens_seen": 142378704, "step": 65980 }, { "epoch": 10.764274061990212, "grad_norm": 0.02427360601723194, "learning_rate": 2.6028969494059074e-05, "loss": 0.0114, "num_input_tokens_seen": 142388432, "step": 65985 }, { "epoch": 10.765089722675366, "grad_norm": 1.9323824644088745, "learning_rate": 2.6025413508334255e-05, "loss": 0.1003, "num_input_tokens_seen": 142399152, "step": 65990 }, { "epoch": 10.765905383360522, "grad_norm": 0.04967112094163895, "learning_rate": 2.6021857501828134e-05, "loss": 0.1747, "num_input_tokens_seen": 142410448, "step": 65995 }, { "epoch": 10.766721044045678, "grad_norm": 0.07959871739149094, "learning_rate": 2.601830147461277e-05, "loss": 0.1138, "num_input_tokens_seen": 142421424, "step": 66000 }, { "epoch": 10.767536704730832, "grad_norm": 0.5319891571998596, "learning_rate": 2.601474542676024e-05, "loss": 0.0199, "num_input_tokens_seen": 142431568, "step": 66005 }, { "epoch": 10.768352365415987, "grad_norm": 0.9857124090194702, "learning_rate": 2.6011189358342603e-05, "loss": 0.1413, "num_input_tokens_seen": 142443152, "step": 66010 }, { "epoch": 10.769168026101141, "grad_norm": 0.10468605905771255, "learning_rate": 2.6007633269431926e-05, "loss": 0.0441, "num_input_tokens_seen": 142453808, "step": 66015 }, { "epoch": 10.769983686786297, "grad_norm": 1.036263108253479, "learning_rate": 2.6004077160100287e-05, "loss": 0.1018, "num_input_tokens_seen": 142463760, "step": 66020 }, { "epoch": 10.770799347471453, "grad_norm": 0.02415172941982746, "learning_rate": 2.6000521030419755e-05, "loss": 0.031, "num_input_tokens_seen": 142474032, "step": 66025 }, { "epoch": 10.771615008156607, "grad_norm": 0.21053892374038696, "learning_rate": 2.5996964880462394e-05, "loss": 0.0342, "num_input_tokens_seen": 142484368, "step": 66030 }, { "epoch": 10.772430668841762, "grad_norm": 0.13546617329120636, "learning_rate": 2.599340871030027e-05, "loss": 0.0588, "num_input_tokens_seen": 142495344, "step": 66035 }, { "epoch": 10.773246329526916, "grad_norm": 0.09561976790428162, "learning_rate": 2.5989852520005458e-05, "loss": 0.0865, "num_input_tokens_seen": 142505392, "step": 66040 }, { "epoch": 10.774061990212072, "grad_norm": 0.06494847685098648, "learning_rate": 2.598629630965004e-05, "loss": 0.1225, "num_input_tokens_seen": 142515760, "step": 66045 }, { "epoch": 10.774877650897226, "grad_norm": 3.4764137268066406, "learning_rate": 2.5982740079306062e-05, "loss": 0.1822, "num_input_tokens_seen": 142525552, "step": 66050 }, { "epoch": 10.775693311582382, "grad_norm": 0.536199152469635, "learning_rate": 2.5979183829045617e-05, "loss": 0.0373, "num_input_tokens_seen": 142535600, "step": 66055 }, { "epoch": 10.776508972267537, "grad_norm": 2.141201972961426, "learning_rate": 2.597562755894077e-05, "loss": 0.3373, "num_input_tokens_seen": 142546928, "step": 66060 }, { "epoch": 10.777324632952691, "grad_norm": 0.9295910000801086, "learning_rate": 2.5972071269063592e-05, "loss": 0.0259, "num_input_tokens_seen": 142557936, "step": 66065 }, { "epoch": 10.778140293637847, "grad_norm": 1.5344096422195435, "learning_rate": 2.596851495948615e-05, "loss": 0.1559, "num_input_tokens_seen": 142567856, "step": 66070 }, { "epoch": 10.778955954323001, "grad_norm": 0.2555323541164398, "learning_rate": 2.5964958630280534e-05, "loss": 0.0367, "num_input_tokens_seen": 142578544, "step": 66075 }, { "epoch": 10.779771615008157, "grad_norm": 0.32490766048431396, "learning_rate": 2.5961402281518797e-05, "loss": 0.0342, "num_input_tokens_seen": 142589968, "step": 66080 }, { "epoch": 10.780587275693312, "grad_norm": 0.6967642307281494, "learning_rate": 2.595784591327303e-05, "loss": 0.1946, "num_input_tokens_seen": 142601488, "step": 66085 }, { "epoch": 10.781402936378466, "grad_norm": 0.26206713914871216, "learning_rate": 2.59542895256153e-05, "loss": 0.1218, "num_input_tokens_seen": 142611280, "step": 66090 }, { "epoch": 10.782218597063622, "grad_norm": 0.05939271301031113, "learning_rate": 2.5950733118617683e-05, "loss": 0.0082, "num_input_tokens_seen": 142621904, "step": 66095 }, { "epoch": 10.783034257748776, "grad_norm": 0.5948855876922607, "learning_rate": 2.5947176692352255e-05, "loss": 0.0195, "num_input_tokens_seen": 142632976, "step": 66100 }, { "epoch": 10.783849918433932, "grad_norm": 0.252593457698822, "learning_rate": 2.594362024689108e-05, "loss": 0.1177, "num_input_tokens_seen": 142643696, "step": 66105 }, { "epoch": 10.784665579119086, "grad_norm": 1.8490593433380127, "learning_rate": 2.5940063782306255e-05, "loss": 0.1089, "num_input_tokens_seen": 142654448, "step": 66110 }, { "epoch": 10.785481239804241, "grad_norm": 0.062980517745018, "learning_rate": 2.593650729866984e-05, "loss": 0.0699, "num_input_tokens_seen": 142665488, "step": 66115 }, { "epoch": 10.786296900489397, "grad_norm": 1.2732418775558472, "learning_rate": 2.5932950796053917e-05, "loss": 0.1921, "num_input_tokens_seen": 142676496, "step": 66120 }, { "epoch": 10.78711256117455, "grad_norm": 0.10164153575897217, "learning_rate": 2.5929394274530554e-05, "loss": 0.1853, "num_input_tokens_seen": 142686960, "step": 66125 }, { "epoch": 10.787928221859707, "grad_norm": 3.672947406768799, "learning_rate": 2.5925837734171844e-05, "loss": 0.1409, "num_input_tokens_seen": 142697776, "step": 66130 }, { "epoch": 10.78874388254486, "grad_norm": 0.1843724250793457, "learning_rate": 2.592228117504986e-05, "loss": 0.1495, "num_input_tokens_seen": 142709072, "step": 66135 }, { "epoch": 10.789559543230016, "grad_norm": 1.9111803770065308, "learning_rate": 2.5918724597236675e-05, "loss": 0.1377, "num_input_tokens_seen": 142720144, "step": 66140 }, { "epoch": 10.790375203915172, "grad_norm": 0.7110637426376343, "learning_rate": 2.5915168000804374e-05, "loss": 0.0507, "num_input_tokens_seen": 142730512, "step": 66145 }, { "epoch": 10.791190864600326, "grad_norm": 0.05092078819870949, "learning_rate": 2.5911611385825023e-05, "loss": 0.019, "num_input_tokens_seen": 142741488, "step": 66150 }, { "epoch": 10.792006525285482, "grad_norm": 1.6994264125823975, "learning_rate": 2.5908054752370718e-05, "loss": 0.0835, "num_input_tokens_seen": 142751888, "step": 66155 }, { "epoch": 10.792822185970635, "grad_norm": 0.06444184482097626, "learning_rate": 2.5904498100513534e-05, "loss": 0.0265, "num_input_tokens_seen": 142763312, "step": 66160 }, { "epoch": 10.793637846655791, "grad_norm": 0.6925466060638428, "learning_rate": 2.5900941430325548e-05, "loss": 0.3269, "num_input_tokens_seen": 142774928, "step": 66165 }, { "epoch": 10.794453507340947, "grad_norm": 0.201179638504982, "learning_rate": 2.5897384741878838e-05, "loss": 0.031, "num_input_tokens_seen": 142786704, "step": 66170 }, { "epoch": 10.7952691680261, "grad_norm": 3.0813374519348145, "learning_rate": 2.5893828035245494e-05, "loss": 0.1964, "num_input_tokens_seen": 142797936, "step": 66175 }, { "epoch": 10.796084828711257, "grad_norm": 0.04600302129983902, "learning_rate": 2.589027131049758e-05, "loss": 0.0648, "num_input_tokens_seen": 142808784, "step": 66180 }, { "epoch": 10.79690048939641, "grad_norm": 1.0595672130584717, "learning_rate": 2.58867145677072e-05, "loss": 0.1303, "num_input_tokens_seen": 142820944, "step": 66185 }, { "epoch": 10.797716150081566, "grad_norm": 1.0174481868743896, "learning_rate": 2.5883157806946417e-05, "loss": 0.0341, "num_input_tokens_seen": 142832624, "step": 66190 }, { "epoch": 10.798531810766722, "grad_norm": 0.18032194674015045, "learning_rate": 2.5879601028287327e-05, "loss": 0.1248, "num_input_tokens_seen": 142842224, "step": 66195 }, { "epoch": 10.799347471451876, "grad_norm": 0.2629144787788391, "learning_rate": 2.5876044231802e-05, "loss": 0.0673, "num_input_tokens_seen": 142852912, "step": 66200 }, { "epoch": 10.800163132137031, "grad_norm": 0.15956498682498932, "learning_rate": 2.587248741756253e-05, "loss": 0.0382, "num_input_tokens_seen": 142862064, "step": 66205 }, { "epoch": 10.800978792822185, "grad_norm": 0.050437040627002716, "learning_rate": 2.5868930585640993e-05, "loss": 0.0228, "num_input_tokens_seen": 142873712, "step": 66210 }, { "epoch": 10.801794453507341, "grad_norm": 0.1795123666524887, "learning_rate": 2.586537373610947e-05, "loss": 0.0569, "num_input_tokens_seen": 142884304, "step": 66215 }, { "epoch": 10.802610114192497, "grad_norm": 0.18216076493263245, "learning_rate": 2.5861816869040063e-05, "loss": 0.0686, "num_input_tokens_seen": 142895632, "step": 66220 }, { "epoch": 10.80342577487765, "grad_norm": 0.5857537388801575, "learning_rate": 2.5858259984504833e-05, "loss": 0.0759, "num_input_tokens_seen": 142906736, "step": 66225 }, { "epoch": 10.804241435562806, "grad_norm": 1.4540141820907593, "learning_rate": 2.5854703082575876e-05, "loss": 0.176, "num_input_tokens_seen": 142918384, "step": 66230 }, { "epoch": 10.80505709624796, "grad_norm": 2.2588701248168945, "learning_rate": 2.585114616332528e-05, "loss": 0.2192, "num_input_tokens_seen": 142929424, "step": 66235 }, { "epoch": 10.805872756933116, "grad_norm": 0.07028123736381531, "learning_rate": 2.5847589226825126e-05, "loss": 0.0416, "num_input_tokens_seen": 142940464, "step": 66240 }, { "epoch": 10.80668841761827, "grad_norm": 0.9337247014045715, "learning_rate": 2.5844032273147505e-05, "loss": 0.0514, "num_input_tokens_seen": 142951184, "step": 66245 }, { "epoch": 10.807504078303426, "grad_norm": 0.1937645524740219, "learning_rate": 2.5840475302364497e-05, "loss": 0.0128, "num_input_tokens_seen": 142962096, "step": 66250 }, { "epoch": 10.808319738988581, "grad_norm": 0.8402410745620728, "learning_rate": 2.583691831454819e-05, "loss": 0.0374, "num_input_tokens_seen": 142972368, "step": 66255 }, { "epoch": 10.809135399673735, "grad_norm": 1.6358669996261597, "learning_rate": 2.583336130977067e-05, "loss": 0.0642, "num_input_tokens_seen": 142982160, "step": 66260 }, { "epoch": 10.809951060358891, "grad_norm": 0.7227904796600342, "learning_rate": 2.5829804288104032e-05, "loss": 0.0453, "num_input_tokens_seen": 142992656, "step": 66265 }, { "epoch": 10.810766721044045, "grad_norm": 1.7240219116210938, "learning_rate": 2.582624724962035e-05, "loss": 0.1264, "num_input_tokens_seen": 143001936, "step": 66270 }, { "epoch": 10.8115823817292, "grad_norm": 1.6591671705245972, "learning_rate": 2.5822690194391726e-05, "loss": 0.0952, "num_input_tokens_seen": 143013040, "step": 66275 }, { "epoch": 10.812398042414356, "grad_norm": 0.05453094094991684, "learning_rate": 2.5819133122490236e-05, "loss": 0.3456, "num_input_tokens_seen": 143024688, "step": 66280 }, { "epoch": 10.81321370309951, "grad_norm": 2.073108673095703, "learning_rate": 2.581557603398798e-05, "loss": 0.0578, "num_input_tokens_seen": 143035120, "step": 66285 }, { "epoch": 10.814029363784666, "grad_norm": 1.2551952600479126, "learning_rate": 2.5812018928957038e-05, "loss": 0.2163, "num_input_tokens_seen": 143045264, "step": 66290 }, { "epoch": 10.81484502446982, "grad_norm": 0.03452342376112938, "learning_rate": 2.5808461807469497e-05, "loss": 0.1542, "num_input_tokens_seen": 143056016, "step": 66295 }, { "epoch": 10.815660685154976, "grad_norm": 0.6276972889900208, "learning_rate": 2.5804904669597456e-05, "loss": 0.0523, "num_input_tokens_seen": 143066800, "step": 66300 }, { "epoch": 10.81647634584013, "grad_norm": 2.266195297241211, "learning_rate": 2.5801347515412997e-05, "loss": 0.1623, "num_input_tokens_seen": 143077456, "step": 66305 }, { "epoch": 10.817292006525285, "grad_norm": 1.7435803413391113, "learning_rate": 2.579779034498822e-05, "loss": 0.1287, "num_input_tokens_seen": 143087856, "step": 66310 }, { "epoch": 10.818107667210441, "grad_norm": 0.2512047290802002, "learning_rate": 2.579423315839521e-05, "loss": 0.0709, "num_input_tokens_seen": 143098640, "step": 66315 }, { "epoch": 10.818923327895595, "grad_norm": 1.9329193830490112, "learning_rate": 2.5790675955706057e-05, "loss": 0.1447, "num_input_tokens_seen": 143109456, "step": 66320 }, { "epoch": 10.81973898858075, "grad_norm": 0.6094695329666138, "learning_rate": 2.578711873699285e-05, "loss": 0.0868, "num_input_tokens_seen": 143120784, "step": 66325 }, { "epoch": 10.820554649265905, "grad_norm": 1.8309699296951294, "learning_rate": 2.5783561502327687e-05, "loss": 0.0983, "num_input_tokens_seen": 143131568, "step": 66330 }, { "epoch": 10.82137030995106, "grad_norm": 0.06740979105234146, "learning_rate": 2.5780004251782646e-05, "loss": 0.1314, "num_input_tokens_seen": 143142960, "step": 66335 }, { "epoch": 10.822185970636216, "grad_norm": 0.3645698130130768, "learning_rate": 2.5776446985429835e-05, "loss": 0.0749, "num_input_tokens_seen": 143152624, "step": 66340 }, { "epoch": 10.82300163132137, "grad_norm": 0.3681076169013977, "learning_rate": 2.5772889703341345e-05, "loss": 0.1692, "num_input_tokens_seen": 143163920, "step": 66345 }, { "epoch": 10.823817292006526, "grad_norm": 0.17774026095867157, "learning_rate": 2.5769332405589264e-05, "loss": 0.1795, "num_input_tokens_seen": 143174032, "step": 66350 }, { "epoch": 10.82463295269168, "grad_norm": 0.1821865737438202, "learning_rate": 2.5765775092245688e-05, "loss": 0.109, "num_input_tokens_seen": 143182704, "step": 66355 }, { "epoch": 10.825448613376835, "grad_norm": 1.0409657955169678, "learning_rate": 2.57622177633827e-05, "loss": 0.0698, "num_input_tokens_seen": 143193744, "step": 66360 }, { "epoch": 10.826264274061991, "grad_norm": 0.7613281011581421, "learning_rate": 2.5758660419072405e-05, "loss": 0.1282, "num_input_tokens_seen": 143205712, "step": 66365 }, { "epoch": 10.827079934747145, "grad_norm": 1.7354505062103271, "learning_rate": 2.5755103059386898e-05, "loss": 0.1017, "num_input_tokens_seen": 143215824, "step": 66370 }, { "epoch": 10.8278955954323, "grad_norm": 0.22383907437324524, "learning_rate": 2.575154568439827e-05, "loss": 0.175, "num_input_tokens_seen": 143226192, "step": 66375 }, { "epoch": 10.828711256117455, "grad_norm": 1.0973589420318604, "learning_rate": 2.574798829417861e-05, "loss": 0.0953, "num_input_tokens_seen": 143236848, "step": 66380 }, { "epoch": 10.82952691680261, "grad_norm": 0.14978769421577454, "learning_rate": 2.5744430888800026e-05, "loss": 0.0319, "num_input_tokens_seen": 143248176, "step": 66385 }, { "epoch": 10.830342577487766, "grad_norm": 0.030596913769841194, "learning_rate": 2.57408734683346e-05, "loss": 0.0229, "num_input_tokens_seen": 143259248, "step": 66390 }, { "epoch": 10.83115823817292, "grad_norm": 0.0812201276421547, "learning_rate": 2.573731603285443e-05, "loss": 0.0471, "num_input_tokens_seen": 143270192, "step": 66395 }, { "epoch": 10.831973898858076, "grad_norm": 1.1213129758834839, "learning_rate": 2.573375858243162e-05, "loss": 0.267, "num_input_tokens_seen": 143280336, "step": 66400 }, { "epoch": 10.83278955954323, "grad_norm": 0.17448559403419495, "learning_rate": 2.5730201117138262e-05, "loss": 0.0413, "num_input_tokens_seen": 143291184, "step": 66405 }, { "epoch": 10.833605220228385, "grad_norm": 0.3670622408390045, "learning_rate": 2.5726643637046456e-05, "loss": 0.0557, "num_input_tokens_seen": 143301712, "step": 66410 }, { "epoch": 10.83442088091354, "grad_norm": 0.16444194316864014, "learning_rate": 2.5723086142228287e-05, "loss": 0.1848, "num_input_tokens_seen": 143311728, "step": 66415 }, { "epoch": 10.835236541598695, "grad_norm": 0.8412098288536072, "learning_rate": 2.571952863275586e-05, "loss": 0.076, "num_input_tokens_seen": 143322960, "step": 66420 }, { "epoch": 10.83605220228385, "grad_norm": 0.01569049432873726, "learning_rate": 2.5715971108701276e-05, "loss": 0.1414, "num_input_tokens_seen": 143332496, "step": 66425 }, { "epoch": 10.836867862969005, "grad_norm": 0.07185675948858261, "learning_rate": 2.571241357013663e-05, "loss": 0.2921, "num_input_tokens_seen": 143343760, "step": 66430 }, { "epoch": 10.83768352365416, "grad_norm": 0.9809530973434448, "learning_rate": 2.5708856017134013e-05, "loss": 0.1088, "num_input_tokens_seen": 143354800, "step": 66435 }, { "epoch": 10.838499184339314, "grad_norm": 0.5366761684417725, "learning_rate": 2.5705298449765537e-05, "loss": 0.0829, "num_input_tokens_seen": 143365168, "step": 66440 }, { "epoch": 10.83931484502447, "grad_norm": 0.18697398900985718, "learning_rate": 2.5701740868103286e-05, "loss": 0.0175, "num_input_tokens_seen": 143375600, "step": 66445 }, { "epoch": 10.840130505709626, "grad_norm": 0.1389477699995041, "learning_rate": 2.569818327221937e-05, "loss": 0.0236, "num_input_tokens_seen": 143385264, "step": 66450 }, { "epoch": 10.84094616639478, "grad_norm": 0.6027160882949829, "learning_rate": 2.569462566218589e-05, "loss": 0.2197, "num_input_tokens_seen": 143395056, "step": 66455 }, { "epoch": 10.841761827079935, "grad_norm": 0.4001179337501526, "learning_rate": 2.569106803807493e-05, "loss": 0.0967, "num_input_tokens_seen": 143406832, "step": 66460 }, { "epoch": 10.84257748776509, "grad_norm": 0.024303780868649483, "learning_rate": 2.5687510399958602e-05, "loss": 0.0082, "num_input_tokens_seen": 143418544, "step": 66465 }, { "epoch": 10.843393148450245, "grad_norm": 0.17485877871513367, "learning_rate": 2.5683952747909007e-05, "loss": 0.1166, "num_input_tokens_seen": 143428368, "step": 66470 }, { "epoch": 10.844208809135399, "grad_norm": 1.0953985452651978, "learning_rate": 2.5680395081998236e-05, "loss": 0.0576, "num_input_tokens_seen": 143439472, "step": 66475 }, { "epoch": 10.845024469820554, "grad_norm": 0.18358276784420013, "learning_rate": 2.5676837402298402e-05, "loss": 0.0903, "num_input_tokens_seen": 143451280, "step": 66480 }, { "epoch": 10.84584013050571, "grad_norm": 0.27190491557121277, "learning_rate": 2.56732797088816e-05, "loss": 0.1298, "num_input_tokens_seen": 143461136, "step": 66485 }, { "epoch": 10.846655791190864, "grad_norm": 0.07172790169715881, "learning_rate": 2.5669722001819928e-05, "loss": 0.0126, "num_input_tokens_seen": 143471952, "step": 66490 }, { "epoch": 10.84747145187602, "grad_norm": 1.7190792560577393, "learning_rate": 2.566616428118549e-05, "loss": 0.1638, "num_input_tokens_seen": 143483696, "step": 66495 }, { "epoch": 10.848287112561174, "grad_norm": 0.17150135338306427, "learning_rate": 2.5662606547050388e-05, "loss": 0.2044, "num_input_tokens_seen": 143494768, "step": 66500 }, { "epoch": 10.84910277324633, "grad_norm": 0.13755199313163757, "learning_rate": 2.5659048799486722e-05, "loss": 0.2719, "num_input_tokens_seen": 143506608, "step": 66505 }, { "epoch": 10.849918433931485, "grad_norm": 0.04291917383670807, "learning_rate": 2.5655491038566597e-05, "loss": 0.1391, "num_input_tokens_seen": 143517168, "step": 66510 }, { "epoch": 10.850734094616639, "grad_norm": 0.05987805128097534, "learning_rate": 2.5651933264362115e-05, "loss": 0.0771, "num_input_tokens_seen": 143528144, "step": 66515 }, { "epoch": 10.851549755301795, "grad_norm": 0.48840656876564026, "learning_rate": 2.564837547694538e-05, "loss": 0.1599, "num_input_tokens_seen": 143538224, "step": 66520 }, { "epoch": 10.852365415986949, "grad_norm": 0.11423817276954651, "learning_rate": 2.5644817676388492e-05, "loss": 0.0351, "num_input_tokens_seen": 143548464, "step": 66525 }, { "epoch": 10.853181076672104, "grad_norm": 2.6358680725097656, "learning_rate": 2.5641259862763554e-05, "loss": 0.2158, "num_input_tokens_seen": 143559824, "step": 66530 }, { "epoch": 10.85399673735726, "grad_norm": 0.13343295454978943, "learning_rate": 2.5637702036142675e-05, "loss": 0.1868, "num_input_tokens_seen": 143570864, "step": 66535 }, { "epoch": 10.854812398042414, "grad_norm": 1.1184604167938232, "learning_rate": 2.5634144196597952e-05, "loss": 0.1532, "num_input_tokens_seen": 143582224, "step": 66540 }, { "epoch": 10.85562805872757, "grad_norm": 1.2432113885879517, "learning_rate": 2.5630586344201498e-05, "loss": 0.0489, "num_input_tokens_seen": 143593904, "step": 66545 }, { "epoch": 10.856443719412724, "grad_norm": 1.330024242401123, "learning_rate": 2.56270284790254e-05, "loss": 0.1005, "num_input_tokens_seen": 143603952, "step": 66550 }, { "epoch": 10.85725938009788, "grad_norm": 0.5531420707702637, "learning_rate": 2.5623470601141787e-05, "loss": 0.0914, "num_input_tokens_seen": 143614512, "step": 66555 }, { "epoch": 10.858075040783035, "grad_norm": 0.4811558127403259, "learning_rate": 2.5619912710622744e-05, "loss": 0.024, "num_input_tokens_seen": 143625808, "step": 66560 }, { "epoch": 10.858890701468189, "grad_norm": 3.017343044281006, "learning_rate": 2.5616354807540387e-05, "loss": 0.1826, "num_input_tokens_seen": 143634544, "step": 66565 }, { "epoch": 10.859706362153345, "grad_norm": 1.7160379886627197, "learning_rate": 2.5612796891966817e-05, "loss": 0.1542, "num_input_tokens_seen": 143643824, "step": 66570 }, { "epoch": 10.860522022838499, "grad_norm": 0.1276598423719406, "learning_rate": 2.5609238963974142e-05, "loss": 0.0333, "num_input_tokens_seen": 143654672, "step": 66575 }, { "epoch": 10.861337683523654, "grad_norm": 1.0828988552093506, "learning_rate": 2.5605681023634465e-05, "loss": 0.1122, "num_input_tokens_seen": 143663792, "step": 66580 }, { "epoch": 10.86215334420881, "grad_norm": 1.1419358253479004, "learning_rate": 2.560212307101989e-05, "loss": 0.1629, "num_input_tokens_seen": 143673872, "step": 66585 }, { "epoch": 10.862969004893964, "grad_norm": 0.15941272675991058, "learning_rate": 2.5598565106202533e-05, "loss": 0.1771, "num_input_tokens_seen": 143685456, "step": 66590 }, { "epoch": 10.86378466557912, "grad_norm": 0.08036700636148453, "learning_rate": 2.5595007129254494e-05, "loss": 0.1569, "num_input_tokens_seen": 143695536, "step": 66595 }, { "epoch": 10.864600326264274, "grad_norm": 0.244331955909729, "learning_rate": 2.5591449140247874e-05, "loss": 0.3211, "num_input_tokens_seen": 143706672, "step": 66600 }, { "epoch": 10.86541598694943, "grad_norm": 1.3358339071273804, "learning_rate": 2.558789113925479e-05, "loss": 0.1006, "num_input_tokens_seen": 143716976, "step": 66605 }, { "epoch": 10.866231647634583, "grad_norm": 1.8839665651321411, "learning_rate": 2.558433312634735e-05, "loss": 0.0575, "num_input_tokens_seen": 143726352, "step": 66610 }, { "epoch": 10.867047308319739, "grad_norm": 1.3134897947311401, "learning_rate": 2.5580775101597654e-05, "loss": 0.1022, "num_input_tokens_seen": 143737808, "step": 66615 }, { "epoch": 10.867862969004895, "grad_norm": 0.04262102022767067, "learning_rate": 2.557721706507782e-05, "loss": 0.0783, "num_input_tokens_seen": 143749680, "step": 66620 }, { "epoch": 10.868678629690049, "grad_norm": 1.4088129997253418, "learning_rate": 2.557365901685994e-05, "loss": 0.2401, "num_input_tokens_seen": 143759056, "step": 66625 }, { "epoch": 10.869494290375204, "grad_norm": 0.42404940724372864, "learning_rate": 2.557010095701614e-05, "loss": 0.2156, "num_input_tokens_seen": 143769744, "step": 66630 }, { "epoch": 10.870309951060358, "grad_norm": 1.8214203119277954, "learning_rate": 2.556654288561852e-05, "loss": 0.2751, "num_input_tokens_seen": 143779504, "step": 66635 }, { "epoch": 10.871125611745514, "grad_norm": 0.3645426630973816, "learning_rate": 2.5562984802739194e-05, "loss": 0.1379, "num_input_tokens_seen": 143790096, "step": 66640 }, { "epoch": 10.87194127243067, "grad_norm": 0.04920249432325363, "learning_rate": 2.555942670845026e-05, "loss": 0.1554, "num_input_tokens_seen": 143801008, "step": 66645 }, { "epoch": 10.872756933115824, "grad_norm": 0.02114030532538891, "learning_rate": 2.5555868602823834e-05, "loss": 0.0912, "num_input_tokens_seen": 143811664, "step": 66650 }, { "epoch": 10.87357259380098, "grad_norm": 1.6452062129974365, "learning_rate": 2.5552310485932024e-05, "loss": 0.1019, "num_input_tokens_seen": 143822288, "step": 66655 }, { "epoch": 10.874388254486133, "grad_norm": 0.11502192914485931, "learning_rate": 2.5548752357846945e-05, "loss": 0.1356, "num_input_tokens_seen": 143833104, "step": 66660 }, { "epoch": 10.875203915171289, "grad_norm": 1.5122143030166626, "learning_rate": 2.55451942186407e-05, "loss": 0.044, "num_input_tokens_seen": 143844336, "step": 66665 }, { "epoch": 10.876019575856443, "grad_norm": 0.06638689339160919, "learning_rate": 2.5541636068385406e-05, "loss": 0.0216, "num_input_tokens_seen": 143855216, "step": 66670 }, { "epoch": 10.876835236541599, "grad_norm": 0.10416115820407867, "learning_rate": 2.553807790715317e-05, "loss": 0.0133, "num_input_tokens_seen": 143865008, "step": 66675 }, { "epoch": 10.877650897226754, "grad_norm": 0.8665609359741211, "learning_rate": 2.5534519735016106e-05, "loss": 0.0938, "num_input_tokens_seen": 143875568, "step": 66680 }, { "epoch": 10.878466557911908, "grad_norm": 2.249379873275757, "learning_rate": 2.5530961552046317e-05, "loss": 0.1498, "num_input_tokens_seen": 143886192, "step": 66685 }, { "epoch": 10.879282218597064, "grad_norm": 1.2531052827835083, "learning_rate": 2.5527403358315916e-05, "loss": 0.1701, "num_input_tokens_seen": 143896656, "step": 66690 }, { "epoch": 10.880097879282218, "grad_norm": 1.877951979637146, "learning_rate": 2.552384515389702e-05, "loss": 0.065, "num_input_tokens_seen": 143906352, "step": 66695 }, { "epoch": 10.880913539967374, "grad_norm": 0.3007655441761017, "learning_rate": 2.5520286938861736e-05, "loss": 0.0393, "num_input_tokens_seen": 143917296, "step": 66700 }, { "epoch": 10.88172920065253, "grad_norm": 0.13835354149341583, "learning_rate": 2.5516728713282185e-05, "loss": 0.1035, "num_input_tokens_seen": 143928080, "step": 66705 }, { "epoch": 10.882544861337683, "grad_norm": 1.4039262533187866, "learning_rate": 2.5513170477230464e-05, "loss": 0.1259, "num_input_tokens_seen": 143938992, "step": 66710 }, { "epoch": 10.883360522022839, "grad_norm": 0.17409664392471313, "learning_rate": 2.550961223077869e-05, "loss": 0.0358, "num_input_tokens_seen": 143949840, "step": 66715 }, { "epoch": 10.884176182707993, "grad_norm": 0.02837751992046833, "learning_rate": 2.5506053973998982e-05, "loss": 0.0328, "num_input_tokens_seen": 143961008, "step": 66720 }, { "epoch": 10.884991843393149, "grad_norm": 1.170297384262085, "learning_rate": 2.5502495706963447e-05, "loss": 0.1339, "num_input_tokens_seen": 143971888, "step": 66725 }, { "epoch": 10.885807504078304, "grad_norm": 0.8836472630500793, "learning_rate": 2.5498937429744197e-05, "loss": 0.0544, "num_input_tokens_seen": 143983312, "step": 66730 }, { "epoch": 10.886623164763458, "grad_norm": 0.9389909505844116, "learning_rate": 2.549537914241335e-05, "loss": 0.0638, "num_input_tokens_seen": 143992816, "step": 66735 }, { "epoch": 10.887438825448614, "grad_norm": 0.24247795343399048, "learning_rate": 2.549182084504302e-05, "loss": 0.2282, "num_input_tokens_seen": 144005104, "step": 66740 }, { "epoch": 10.888254486133768, "grad_norm": 1.075434923171997, "learning_rate": 2.548826253770531e-05, "loss": 0.0564, "num_input_tokens_seen": 144015088, "step": 66745 }, { "epoch": 10.889070146818923, "grad_norm": 0.10878819972276688, "learning_rate": 2.5484704220472333e-05, "loss": 0.0972, "num_input_tokens_seen": 144026416, "step": 66750 }, { "epoch": 10.88988580750408, "grad_norm": 0.03641286864876747, "learning_rate": 2.548114589341622e-05, "loss": 0.0053, "num_input_tokens_seen": 144035440, "step": 66755 }, { "epoch": 10.890701468189233, "grad_norm": 1.5965758562088013, "learning_rate": 2.547758755660908e-05, "loss": 0.2301, "num_input_tokens_seen": 144046544, "step": 66760 }, { "epoch": 10.891517128874389, "grad_norm": 0.12001105397939682, "learning_rate": 2.5474029210123012e-05, "loss": 0.2657, "num_input_tokens_seen": 144055984, "step": 66765 }, { "epoch": 10.892332789559543, "grad_norm": 0.9766988158226013, "learning_rate": 2.5470470854030148e-05, "loss": 0.1247, "num_input_tokens_seen": 144065744, "step": 66770 }, { "epoch": 10.893148450244698, "grad_norm": 0.6065764427185059, "learning_rate": 2.5466912488402583e-05, "loss": 0.1749, "num_input_tokens_seen": 144076112, "step": 66775 }, { "epoch": 10.893964110929852, "grad_norm": 1.6741997003555298, "learning_rate": 2.5463354113312454e-05, "loss": 0.1476, "num_input_tokens_seen": 144087824, "step": 66780 }, { "epoch": 10.894779771615008, "grad_norm": 0.15981318056583405, "learning_rate": 2.545979572883187e-05, "loss": 0.0255, "num_input_tokens_seen": 144098448, "step": 66785 }, { "epoch": 10.895595432300164, "grad_norm": 1.883838176727295, "learning_rate": 2.545623733503294e-05, "loss": 0.2206, "num_input_tokens_seen": 144108752, "step": 66790 }, { "epoch": 10.896411092985318, "grad_norm": 0.2944166362285614, "learning_rate": 2.5452678931987778e-05, "loss": 0.0747, "num_input_tokens_seen": 144118800, "step": 66795 }, { "epoch": 10.897226753670473, "grad_norm": 2.9190566539764404, "learning_rate": 2.5449120519768505e-05, "loss": 0.1012, "num_input_tokens_seen": 144129488, "step": 66800 }, { "epoch": 10.898042414355627, "grad_norm": 0.12241009622812271, "learning_rate": 2.5445562098447233e-05, "loss": 0.045, "num_input_tokens_seen": 144140176, "step": 66805 }, { "epoch": 10.898858075040783, "grad_norm": 0.7575669288635254, "learning_rate": 2.544200366809608e-05, "loss": 0.0719, "num_input_tokens_seen": 144150320, "step": 66810 }, { "epoch": 10.899673735725939, "grad_norm": 0.8949501514434814, "learning_rate": 2.5438445228787167e-05, "loss": 0.0764, "num_input_tokens_seen": 144162032, "step": 66815 }, { "epoch": 10.900489396411093, "grad_norm": 0.6346423029899597, "learning_rate": 2.5434886780592606e-05, "loss": 0.0641, "num_input_tokens_seen": 144173136, "step": 66820 }, { "epoch": 10.901305057096248, "grad_norm": 2.3229079246520996, "learning_rate": 2.5431328323584504e-05, "loss": 0.1232, "num_input_tokens_seen": 144183760, "step": 66825 }, { "epoch": 10.902120717781402, "grad_norm": 1.3964143991470337, "learning_rate": 2.542776985783499e-05, "loss": 0.1609, "num_input_tokens_seen": 144194480, "step": 66830 }, { "epoch": 10.902936378466558, "grad_norm": 0.8538684248924255, "learning_rate": 2.542421138341618e-05, "loss": 0.0899, "num_input_tokens_seen": 144205680, "step": 66835 }, { "epoch": 10.903752039151712, "grad_norm": 0.2028626799583435, "learning_rate": 2.5420652900400188e-05, "loss": 0.0684, "num_input_tokens_seen": 144216560, "step": 66840 }, { "epoch": 10.904567699836868, "grad_norm": 0.8712891936302185, "learning_rate": 2.5417094408859125e-05, "loss": 0.0625, "num_input_tokens_seen": 144227472, "step": 66845 }, { "epoch": 10.905383360522023, "grad_norm": 0.10644117742776871, "learning_rate": 2.541353590886512e-05, "loss": 0.0657, "num_input_tokens_seen": 144237552, "step": 66850 }, { "epoch": 10.906199021207177, "grad_norm": 0.4864615201950073, "learning_rate": 2.5409977400490286e-05, "loss": 0.0738, "num_input_tokens_seen": 144248400, "step": 66855 }, { "epoch": 10.907014681892333, "grad_norm": 1.4232425689697266, "learning_rate": 2.5406418883806737e-05, "loss": 0.2973, "num_input_tokens_seen": 144259344, "step": 66860 }, { "epoch": 10.907830342577487, "grad_norm": 1.698242425918579, "learning_rate": 2.54028603588866e-05, "loss": 0.2708, "num_input_tokens_seen": 144269936, "step": 66865 }, { "epoch": 10.908646003262643, "grad_norm": 0.5911740064620972, "learning_rate": 2.5399301825801974e-05, "loss": 0.1196, "num_input_tokens_seen": 144279568, "step": 66870 }, { "epoch": 10.909461663947798, "grad_norm": 0.1399400234222412, "learning_rate": 2.5395743284624994e-05, "loss": 0.045, "num_input_tokens_seen": 144289168, "step": 66875 }, { "epoch": 10.910277324632952, "grad_norm": 0.8021982312202454, "learning_rate": 2.539218473542778e-05, "loss": 0.0459, "num_input_tokens_seen": 144301104, "step": 66880 }, { "epoch": 10.911092985318108, "grad_norm": 1.7386977672576904, "learning_rate": 2.538862617828244e-05, "loss": 0.0941, "num_input_tokens_seen": 144311664, "step": 66885 }, { "epoch": 10.911908646003262, "grad_norm": 0.14457286894321442, "learning_rate": 2.5385067613261105e-05, "loss": 0.0249, "num_input_tokens_seen": 144322288, "step": 66890 }, { "epoch": 10.912724306688418, "grad_norm": 0.13874275982379913, "learning_rate": 2.538150904043588e-05, "loss": 0.0165, "num_input_tokens_seen": 144334448, "step": 66895 }, { "epoch": 10.913539967373573, "grad_norm": 0.0716642513871193, "learning_rate": 2.5377950459878897e-05, "loss": 0.0486, "num_input_tokens_seen": 144344208, "step": 66900 }, { "epoch": 10.914355628058727, "grad_norm": 0.1823706328868866, "learning_rate": 2.537439187166226e-05, "loss": 0.0161, "num_input_tokens_seen": 144355120, "step": 66905 }, { "epoch": 10.915171288743883, "grad_norm": 0.23979435861110687, "learning_rate": 2.5370833275858104e-05, "loss": 0.0991, "num_input_tokens_seen": 144366864, "step": 66910 }, { "epoch": 10.915986949429037, "grad_norm": 2.1123127937316895, "learning_rate": 2.5367274672538538e-05, "loss": 0.0689, "num_input_tokens_seen": 144377552, "step": 66915 }, { "epoch": 10.916802610114193, "grad_norm": 1.5151472091674805, "learning_rate": 2.5363716061775678e-05, "loss": 0.1698, "num_input_tokens_seen": 144388208, "step": 66920 }, { "epoch": 10.917618270799348, "grad_norm": 0.11351282894611359, "learning_rate": 2.536015744364166e-05, "loss": 0.0694, "num_input_tokens_seen": 144398192, "step": 66925 }, { "epoch": 10.918433931484502, "grad_norm": 0.024000847712159157, "learning_rate": 2.5356598818208587e-05, "loss": 0.048, "num_input_tokens_seen": 144408304, "step": 66930 }, { "epoch": 10.919249592169658, "grad_norm": 0.22719044983386993, "learning_rate": 2.5353040185548593e-05, "loss": 0.1469, "num_input_tokens_seen": 144418160, "step": 66935 }, { "epoch": 10.920065252854812, "grad_norm": 0.04002726078033447, "learning_rate": 2.534948154573379e-05, "loss": 0.1952, "num_input_tokens_seen": 144428304, "step": 66940 }, { "epoch": 10.920880913539968, "grad_norm": 0.04872860759496689, "learning_rate": 2.53459228988363e-05, "loss": 0.0652, "num_input_tokens_seen": 144438864, "step": 66945 }, { "epoch": 10.921696574225122, "grad_norm": 1.0006099939346313, "learning_rate": 2.534236424492824e-05, "loss": 0.1449, "num_input_tokens_seen": 144448720, "step": 66950 }, { "epoch": 10.922512234910277, "grad_norm": 0.05521850287914276, "learning_rate": 2.533880558408174e-05, "loss": 0.0261, "num_input_tokens_seen": 144459248, "step": 66955 }, { "epoch": 10.923327895595433, "grad_norm": 0.9870822429656982, "learning_rate": 2.5335246916368916e-05, "loss": 0.1193, "num_input_tokens_seen": 144468912, "step": 66960 }, { "epoch": 10.924143556280587, "grad_norm": 0.15696890652179718, "learning_rate": 2.5331688241861883e-05, "loss": 0.0889, "num_input_tokens_seen": 144479920, "step": 66965 }, { "epoch": 10.924959216965743, "grad_norm": 1.3269460201263428, "learning_rate": 2.532812956063277e-05, "loss": 0.1776, "num_input_tokens_seen": 144490320, "step": 66970 }, { "epoch": 10.925774877650896, "grad_norm": 2.7991719245910645, "learning_rate": 2.5324570872753688e-05, "loss": 0.2248, "num_input_tokens_seen": 144500240, "step": 66975 }, { "epoch": 10.926590538336052, "grad_norm": 1.563442587852478, "learning_rate": 2.5321012178296773e-05, "loss": 0.1525, "num_input_tokens_seen": 144511472, "step": 66980 }, { "epoch": 10.927406199021208, "grad_norm": 0.0363902673125267, "learning_rate": 2.5317453477334136e-05, "loss": 0.042, "num_input_tokens_seen": 144522000, "step": 66985 }, { "epoch": 10.928221859706362, "grad_norm": 0.38737842440605164, "learning_rate": 2.53138947699379e-05, "loss": 0.0681, "num_input_tokens_seen": 144533552, "step": 66990 }, { "epoch": 10.929037520391518, "grad_norm": 0.9274747967720032, "learning_rate": 2.5310336056180194e-05, "loss": 0.1086, "num_input_tokens_seen": 144545648, "step": 66995 }, { "epoch": 10.929853181076671, "grad_norm": 1.594929814338684, "learning_rate": 2.530677733613313e-05, "loss": 0.0952, "num_input_tokens_seen": 144556784, "step": 67000 }, { "epoch": 10.930668841761827, "grad_norm": 0.07625290006399155, "learning_rate": 2.5303218609868833e-05, "loss": 0.1379, "num_input_tokens_seen": 144567504, "step": 67005 }, { "epoch": 10.931484502446983, "grad_norm": 0.16281168162822723, "learning_rate": 2.529965987745943e-05, "loss": 0.1077, "num_input_tokens_seen": 144579120, "step": 67010 }, { "epoch": 10.932300163132137, "grad_norm": 1.5525356531143188, "learning_rate": 2.5296101138977042e-05, "loss": 0.0939, "num_input_tokens_seen": 144590000, "step": 67015 }, { "epoch": 10.933115823817293, "grad_norm": 0.6844959259033203, "learning_rate": 2.5292542394493785e-05, "loss": 0.0933, "num_input_tokens_seen": 144600176, "step": 67020 }, { "epoch": 10.933931484502446, "grad_norm": 0.03163904696702957, "learning_rate": 2.5288983644081788e-05, "loss": 0.0707, "num_input_tokens_seen": 144611888, "step": 67025 }, { "epoch": 10.934747145187602, "grad_norm": 0.03361023589968681, "learning_rate": 2.5285424887813165e-05, "loss": 0.0487, "num_input_tokens_seen": 144622544, "step": 67030 }, { "epoch": 10.935562805872756, "grad_norm": 1.7714039087295532, "learning_rate": 2.5281866125760045e-05, "loss": 0.1078, "num_input_tokens_seen": 144633392, "step": 67035 }, { "epoch": 10.936378466557912, "grad_norm": 0.5392457842826843, "learning_rate": 2.5278307357994556e-05, "loss": 0.0477, "num_input_tokens_seen": 144644112, "step": 67040 }, { "epoch": 10.937194127243067, "grad_norm": 1.5591264963150024, "learning_rate": 2.5274748584588813e-05, "loss": 0.2578, "num_input_tokens_seen": 144654768, "step": 67045 }, { "epoch": 10.938009787928221, "grad_norm": 1.5681135654449463, "learning_rate": 2.5271189805614943e-05, "loss": 0.2123, "num_input_tokens_seen": 144665264, "step": 67050 }, { "epoch": 10.938825448613377, "grad_norm": 0.5648176074028015, "learning_rate": 2.526763102114506e-05, "loss": 0.0979, "num_input_tokens_seen": 144676720, "step": 67055 }, { "epoch": 10.939641109298531, "grad_norm": 1.676950216293335, "learning_rate": 2.5264072231251302e-05, "loss": 0.1562, "num_input_tokens_seen": 144688880, "step": 67060 }, { "epoch": 10.940456769983687, "grad_norm": 1.0875051021575928, "learning_rate": 2.5260513436005777e-05, "loss": 0.2362, "num_input_tokens_seen": 144699824, "step": 67065 }, { "epoch": 10.941272430668842, "grad_norm": 1.2124145030975342, "learning_rate": 2.5256954635480624e-05, "loss": 0.0868, "num_input_tokens_seen": 144710256, "step": 67070 }, { "epoch": 10.942088091353996, "grad_norm": 0.08124788105487823, "learning_rate": 2.525339582974796e-05, "loss": 0.0964, "num_input_tokens_seen": 144722000, "step": 67075 }, { "epoch": 10.942903752039152, "grad_norm": 0.38167527318000793, "learning_rate": 2.5249837018879908e-05, "loss": 0.0799, "num_input_tokens_seen": 144732528, "step": 67080 }, { "epoch": 10.943719412724306, "grad_norm": 1.6864858865737915, "learning_rate": 2.524627820294859e-05, "loss": 0.1317, "num_input_tokens_seen": 144744176, "step": 67085 }, { "epoch": 10.944535073409462, "grad_norm": 0.22608718276023865, "learning_rate": 2.5242719382026127e-05, "loss": 0.0818, "num_input_tokens_seen": 144754832, "step": 67090 }, { "epoch": 10.945350734094617, "grad_norm": 0.5652875900268555, "learning_rate": 2.5239160556184653e-05, "loss": 0.1649, "num_input_tokens_seen": 144765808, "step": 67095 }, { "epoch": 10.946166394779771, "grad_norm": 0.06719867140054703, "learning_rate": 2.5235601725496288e-05, "loss": 0.2108, "num_input_tokens_seen": 144776912, "step": 67100 }, { "epoch": 10.946982055464927, "grad_norm": 2.2554802894592285, "learning_rate": 2.5232042890033154e-05, "loss": 0.1286, "num_input_tokens_seen": 144788240, "step": 67105 }, { "epoch": 10.947797716150081, "grad_norm": 0.08198022842407227, "learning_rate": 2.5228484049867374e-05, "loss": 0.0942, "num_input_tokens_seen": 144796816, "step": 67110 }, { "epoch": 10.948613376835237, "grad_norm": 0.24073459208011627, "learning_rate": 2.5224925205071083e-05, "loss": 0.024, "num_input_tokens_seen": 144807440, "step": 67115 }, { "epoch": 10.949429037520392, "grad_norm": 0.06953910738229752, "learning_rate": 2.5221366355716385e-05, "loss": 0.0177, "num_input_tokens_seen": 144819056, "step": 67120 }, { "epoch": 10.950244698205546, "grad_norm": 0.22513632476329803, "learning_rate": 2.5217807501875427e-05, "loss": 0.0627, "num_input_tokens_seen": 144829872, "step": 67125 }, { "epoch": 10.951060358890702, "grad_norm": 1.9225128889083862, "learning_rate": 2.521424864362032e-05, "loss": 0.1134, "num_input_tokens_seen": 144839984, "step": 67130 }, { "epoch": 10.951876019575856, "grad_norm": 0.06548475474119186, "learning_rate": 2.5210689781023194e-05, "loss": 0.0557, "num_input_tokens_seen": 144850640, "step": 67135 }, { "epoch": 10.952691680261012, "grad_norm": 0.07339843362569809, "learning_rate": 2.5207130914156168e-05, "loss": 0.0898, "num_input_tokens_seen": 144861456, "step": 67140 }, { "epoch": 10.953507340946166, "grad_norm": 0.6234319806098938, "learning_rate": 2.520357204309138e-05, "loss": 0.1565, "num_input_tokens_seen": 144872176, "step": 67145 }, { "epoch": 10.954323001631321, "grad_norm": 0.2669171690940857, "learning_rate": 2.520001316790094e-05, "loss": 0.0467, "num_input_tokens_seen": 144882736, "step": 67150 }, { "epoch": 10.955138662316477, "grad_norm": 0.17642071843147278, "learning_rate": 2.519645428865698e-05, "loss": 0.0332, "num_input_tokens_seen": 144893328, "step": 67155 }, { "epoch": 10.955954323001631, "grad_norm": 1.1310590505599976, "learning_rate": 2.5192895405431623e-05, "loss": 0.1388, "num_input_tokens_seen": 144903728, "step": 67160 }, { "epoch": 10.956769983686787, "grad_norm": 0.6670477986335754, "learning_rate": 2.5189336518296995e-05, "loss": 0.0274, "num_input_tokens_seen": 144914512, "step": 67165 }, { "epoch": 10.95758564437194, "grad_norm": 0.33492180705070496, "learning_rate": 2.518577762732523e-05, "loss": 0.1036, "num_input_tokens_seen": 144925904, "step": 67170 }, { "epoch": 10.958401305057096, "grad_norm": 0.17893090844154358, "learning_rate": 2.5182218732588442e-05, "loss": 0.1771, "num_input_tokens_seen": 144938352, "step": 67175 }, { "epoch": 10.959216965742252, "grad_norm": 0.49200600385665894, "learning_rate": 2.517865983415876e-05, "loss": 0.0635, "num_input_tokens_seen": 144949072, "step": 67180 }, { "epoch": 10.960032626427406, "grad_norm": 0.19511614739894867, "learning_rate": 2.517510093210831e-05, "loss": 0.034, "num_input_tokens_seen": 144959248, "step": 67185 }, { "epoch": 10.960848287112562, "grad_norm": 0.020827440544962883, "learning_rate": 2.517154202650921e-05, "loss": 0.0414, "num_input_tokens_seen": 144969168, "step": 67190 }, { "epoch": 10.961663947797716, "grad_norm": 0.4847290515899658, "learning_rate": 2.5167983117433597e-05, "loss": 0.0189, "num_input_tokens_seen": 144979984, "step": 67195 }, { "epoch": 10.962479608482871, "grad_norm": 0.6935114860534668, "learning_rate": 2.5164424204953596e-05, "loss": 0.2956, "num_input_tokens_seen": 144990672, "step": 67200 }, { "epoch": 10.963295269168025, "grad_norm": 0.14868183434009552, "learning_rate": 2.5160865289141326e-05, "loss": 0.0334, "num_input_tokens_seen": 145001616, "step": 67205 }, { "epoch": 10.964110929853181, "grad_norm": 0.20080454647541046, "learning_rate": 2.5157306370068922e-05, "loss": 0.0542, "num_input_tokens_seen": 145012400, "step": 67210 }, { "epoch": 10.964926590538337, "grad_norm": 0.8976837396621704, "learning_rate": 2.5153747447808506e-05, "loss": 0.1005, "num_input_tokens_seen": 145023408, "step": 67215 }, { "epoch": 10.96574225122349, "grad_norm": 0.3697241246700287, "learning_rate": 2.51501885224322e-05, "loss": 0.0224, "num_input_tokens_seen": 145034576, "step": 67220 }, { "epoch": 10.966557911908646, "grad_norm": 0.056037914007902145, "learning_rate": 2.5146629594012134e-05, "loss": 0.0763, "num_input_tokens_seen": 145045616, "step": 67225 }, { "epoch": 10.9673735725938, "grad_norm": 0.13210858404636383, "learning_rate": 2.514307066262043e-05, "loss": 0.1326, "num_input_tokens_seen": 145055696, "step": 67230 }, { "epoch": 10.968189233278956, "grad_norm": 0.14835961163043976, "learning_rate": 2.5139511728329214e-05, "loss": 0.0421, "num_input_tokens_seen": 145066896, "step": 67235 }, { "epoch": 10.969004893964112, "grad_norm": 0.014704921282827854, "learning_rate": 2.513595279121062e-05, "loss": 0.1058, "num_input_tokens_seen": 145077744, "step": 67240 }, { "epoch": 10.969820554649266, "grad_norm": 1.0937366485595703, "learning_rate": 2.5132393851336773e-05, "loss": 0.0919, "num_input_tokens_seen": 145087824, "step": 67245 }, { "epoch": 10.970636215334421, "grad_norm": 1.4106262922286987, "learning_rate": 2.5128834908779798e-05, "loss": 0.1342, "num_input_tokens_seen": 145099824, "step": 67250 }, { "epoch": 10.971451876019575, "grad_norm": 1.7792683839797974, "learning_rate": 2.5125275963611816e-05, "loss": 0.2257, "num_input_tokens_seen": 145110512, "step": 67255 }, { "epoch": 10.97226753670473, "grad_norm": 0.08386397361755371, "learning_rate": 2.512171701590496e-05, "loss": 0.1115, "num_input_tokens_seen": 145121488, "step": 67260 }, { "epoch": 10.973083197389887, "grad_norm": 1.0658612251281738, "learning_rate": 2.5118158065731357e-05, "loss": 0.0761, "num_input_tokens_seen": 145132112, "step": 67265 }, { "epoch": 10.97389885807504, "grad_norm": 2.619152069091797, "learning_rate": 2.5114599113163123e-05, "loss": 0.0493, "num_input_tokens_seen": 145142736, "step": 67270 }, { "epoch": 10.974714518760196, "grad_norm": 0.21190191805362701, "learning_rate": 2.51110401582724e-05, "loss": 0.0595, "num_input_tokens_seen": 145152528, "step": 67275 }, { "epoch": 10.97553017944535, "grad_norm": 0.3487613797187805, "learning_rate": 2.5107481201131305e-05, "loss": 0.0322, "num_input_tokens_seen": 145163280, "step": 67280 }, { "epoch": 10.976345840130506, "grad_norm": 0.059022191911935806, "learning_rate": 2.5103922241811967e-05, "loss": 0.1509, "num_input_tokens_seen": 145173840, "step": 67285 }, { "epoch": 10.977161500815662, "grad_norm": 0.06647855788469315, "learning_rate": 2.5100363280386513e-05, "loss": 0.1617, "num_input_tokens_seen": 145183088, "step": 67290 }, { "epoch": 10.977977161500815, "grad_norm": 1.6212029457092285, "learning_rate": 2.5096804316927065e-05, "loss": 0.2052, "num_input_tokens_seen": 145193776, "step": 67295 }, { "epoch": 10.978792822185971, "grad_norm": 0.3668856620788574, "learning_rate": 2.5093245351505756e-05, "loss": 0.0841, "num_input_tokens_seen": 145205552, "step": 67300 }, { "epoch": 10.979608482871125, "grad_norm": 0.09035888314247131, "learning_rate": 2.5089686384194717e-05, "loss": 0.0082, "num_input_tokens_seen": 145215504, "step": 67305 }, { "epoch": 10.98042414355628, "grad_norm": 0.22127458453178406, "learning_rate": 2.5086127415066064e-05, "loss": 0.096, "num_input_tokens_seen": 145226064, "step": 67310 }, { "epoch": 10.981239804241435, "grad_norm": 1.6852447986602783, "learning_rate": 2.5082568444191933e-05, "loss": 0.2952, "num_input_tokens_seen": 145236944, "step": 67315 }, { "epoch": 10.98205546492659, "grad_norm": 0.9854992032051086, "learning_rate": 2.5079009471644448e-05, "loss": 0.1239, "num_input_tokens_seen": 145247856, "step": 67320 }, { "epoch": 10.982871125611746, "grad_norm": 0.856762707233429, "learning_rate": 2.507545049749574e-05, "loss": 0.123, "num_input_tokens_seen": 145259344, "step": 67325 }, { "epoch": 10.9836867862969, "grad_norm": 0.028753044083714485, "learning_rate": 2.5071891521817925e-05, "loss": 0.1241, "num_input_tokens_seen": 145269648, "step": 67330 }, { "epoch": 10.984502446982056, "grad_norm": 2.17620849609375, "learning_rate": 2.5068332544683137e-05, "loss": 0.1627, "num_input_tokens_seen": 145280304, "step": 67335 }, { "epoch": 10.98531810766721, "grad_norm": 0.3351021409034729, "learning_rate": 2.5064773566163507e-05, "loss": 0.1532, "num_input_tokens_seen": 145292208, "step": 67340 }, { "epoch": 10.986133768352365, "grad_norm": 0.0620255284011364, "learning_rate": 2.506121458633116e-05, "loss": 0.1059, "num_input_tokens_seen": 145303024, "step": 67345 }, { "epoch": 10.986949429037521, "grad_norm": 0.9967605471611023, "learning_rate": 2.5057655605258217e-05, "loss": 0.1757, "num_input_tokens_seen": 145313296, "step": 67350 }, { "epoch": 10.987765089722675, "grad_norm": 0.032301533967256546, "learning_rate": 2.5054096623016816e-05, "loss": 0.0253, "num_input_tokens_seen": 145324208, "step": 67355 }, { "epoch": 10.98858075040783, "grad_norm": 0.23305432498455048, "learning_rate": 2.5050537639679073e-05, "loss": 0.0809, "num_input_tokens_seen": 145334000, "step": 67360 }, { "epoch": 10.989396411092985, "grad_norm": 0.19757825136184692, "learning_rate": 2.5046978655317122e-05, "loss": 0.0523, "num_input_tokens_seen": 145346128, "step": 67365 }, { "epoch": 10.99021207177814, "grad_norm": 0.7301763892173767, "learning_rate": 2.504341967000309e-05, "loss": 0.0738, "num_input_tokens_seen": 145357264, "step": 67370 }, { "epoch": 10.991027732463294, "grad_norm": 0.19378776848316193, "learning_rate": 2.5039860683809103e-05, "loss": 0.1402, "num_input_tokens_seen": 145368592, "step": 67375 }, { "epoch": 10.99184339314845, "grad_norm": 1.0059608221054077, "learning_rate": 2.503630169680728e-05, "loss": 0.1549, "num_input_tokens_seen": 145378576, "step": 67380 }, { "epoch": 10.992659053833606, "grad_norm": 2.1528079509735107, "learning_rate": 2.503274270906977e-05, "loss": 0.1363, "num_input_tokens_seen": 145389296, "step": 67385 }, { "epoch": 10.99347471451876, "grad_norm": 0.07292528450489044, "learning_rate": 2.502918372066868e-05, "loss": 0.1346, "num_input_tokens_seen": 145400624, "step": 67390 }, { "epoch": 10.994290375203915, "grad_norm": 0.21652933955192566, "learning_rate": 2.5025624731676146e-05, "loss": 0.0606, "num_input_tokens_seen": 145411760, "step": 67395 }, { "epoch": 10.99510603588907, "grad_norm": 0.6419284343719482, "learning_rate": 2.5022065742164298e-05, "loss": 0.0576, "num_input_tokens_seen": 145422544, "step": 67400 }, { "epoch": 10.995921696574225, "grad_norm": 0.14147667586803436, "learning_rate": 2.501850675220525e-05, "loss": 0.0241, "num_input_tokens_seen": 145433840, "step": 67405 }, { "epoch": 10.99673735725938, "grad_norm": 0.8340199589729309, "learning_rate": 2.501494776187115e-05, "loss": 0.1359, "num_input_tokens_seen": 145444560, "step": 67410 }, { "epoch": 10.997553017944535, "grad_norm": 0.661568820476532, "learning_rate": 2.5011388771234106e-05, "loss": 0.094, "num_input_tokens_seen": 145455504, "step": 67415 }, { "epoch": 10.99836867862969, "grad_norm": 0.033980004489421844, "learning_rate": 2.500782978036626e-05, "loss": 0.0354, "num_input_tokens_seen": 145466928, "step": 67420 }, { "epoch": 10.999184339314844, "grad_norm": 0.9096682667732239, "learning_rate": 2.5004270789339736e-05, "loss": 0.0328, "num_input_tokens_seen": 145476880, "step": 67425 }, { "epoch": 11.0, "grad_norm": 3.2204177379608154, "learning_rate": 2.5000711798226658e-05, "loss": 0.2578, "num_input_tokens_seen": 145487264, "step": 67430 }, { "epoch": 11.0, "eval_loss": 0.13585010170936584, "eval_runtime": 90.6894, "eval_samples_per_second": 30.048, "eval_steps_per_second": 7.52, "num_input_tokens_seen": 145487264, "step": 67430 }, { "epoch": 11.000815660685156, "grad_norm": 0.8357563614845276, "learning_rate": 2.4997152807099156e-05, "loss": 0.0958, "num_input_tokens_seen": 145498432, "step": 67435 }, { "epoch": 11.00163132137031, "grad_norm": 0.5255961418151855, "learning_rate": 2.4993593816029347e-05, "loss": 0.0942, "num_input_tokens_seen": 145508448, "step": 67440 }, { "epoch": 11.002446982055465, "grad_norm": 1.6774957180023193, "learning_rate": 2.499003482508938e-05, "loss": 0.1847, "num_input_tokens_seen": 145518944, "step": 67445 }, { "epoch": 11.00326264274062, "grad_norm": 0.9726468920707703, "learning_rate": 2.4986475834351357e-05, "loss": 0.2401, "num_input_tokens_seen": 145529664, "step": 67450 }, { "epoch": 11.004078303425775, "grad_norm": 0.08296722173690796, "learning_rate": 2.4982916843887417e-05, "loss": 0.1216, "num_input_tokens_seen": 145542240, "step": 67455 }, { "epoch": 11.00489396411093, "grad_norm": 0.05502428114414215, "learning_rate": 2.49793578537697e-05, "loss": 0.0557, "num_input_tokens_seen": 145551520, "step": 67460 }, { "epoch": 11.005709624796085, "grad_norm": 0.049485787749290466, "learning_rate": 2.4975798864070317e-05, "loss": 0.0655, "num_input_tokens_seen": 145562112, "step": 67465 }, { "epoch": 11.00652528548124, "grad_norm": 1.148514986038208, "learning_rate": 2.4972239874861406e-05, "loss": 0.0438, "num_input_tokens_seen": 145574208, "step": 67470 }, { "epoch": 11.007340946166394, "grad_norm": 0.15612095594406128, "learning_rate": 2.4968680886215078e-05, "loss": 0.1477, "num_input_tokens_seen": 145585184, "step": 67475 }, { "epoch": 11.00815660685155, "grad_norm": 0.030604898929595947, "learning_rate": 2.4965121898203485e-05, "loss": 0.0367, "num_input_tokens_seen": 145596192, "step": 67480 }, { "epoch": 11.008972267536704, "grad_norm": 2.512209892272949, "learning_rate": 2.496156291089873e-05, "loss": 0.1128, "num_input_tokens_seen": 145606304, "step": 67485 }, { "epoch": 11.00978792822186, "grad_norm": 0.02629421092569828, "learning_rate": 2.4958003924372963e-05, "loss": 0.107, "num_input_tokens_seen": 145616800, "step": 67490 }, { "epoch": 11.010603588907015, "grad_norm": 0.4663010835647583, "learning_rate": 2.4954444938698287e-05, "loss": 0.0329, "num_input_tokens_seen": 145626784, "step": 67495 }, { "epoch": 11.01141924959217, "grad_norm": 0.17126619815826416, "learning_rate": 2.4950885953946853e-05, "loss": 0.1011, "num_input_tokens_seen": 145638048, "step": 67500 }, { "epoch": 11.012234910277325, "grad_norm": 0.07857450097799301, "learning_rate": 2.4947326970190767e-05, "loss": 0.1296, "num_input_tokens_seen": 145648512, "step": 67505 }, { "epoch": 11.013050570962479, "grad_norm": 0.07891818135976791, "learning_rate": 2.4943767987502177e-05, "loss": 0.1396, "num_input_tokens_seen": 145659008, "step": 67510 }, { "epoch": 11.013866231647635, "grad_norm": 0.5834202766418457, "learning_rate": 2.494020900595319e-05, "loss": 0.1262, "num_input_tokens_seen": 145670240, "step": 67515 }, { "epoch": 11.01468189233279, "grad_norm": 0.05606532841920853, "learning_rate": 2.493665002561595e-05, "loss": 0.1734, "num_input_tokens_seen": 145680832, "step": 67520 }, { "epoch": 11.015497553017944, "grad_norm": 1.5309282541275024, "learning_rate": 2.4933091046562576e-05, "loss": 0.0903, "num_input_tokens_seen": 145691360, "step": 67525 }, { "epoch": 11.0163132137031, "grad_norm": 0.24476754665374756, "learning_rate": 2.4929532068865198e-05, "loss": 0.1109, "num_input_tokens_seen": 145702304, "step": 67530 }, { "epoch": 11.017128874388254, "grad_norm": 0.13304217159748077, "learning_rate": 2.4925973092595942e-05, "loss": 0.0425, "num_input_tokens_seen": 145714720, "step": 67535 }, { "epoch": 11.01794453507341, "grad_norm": 0.059045735746622086, "learning_rate": 2.4922414117826934e-05, "loss": 0.2024, "num_input_tokens_seen": 145726048, "step": 67540 }, { "epoch": 11.018760195758565, "grad_norm": 3.253984212875366, "learning_rate": 2.49188551446303e-05, "loss": 0.2302, "num_input_tokens_seen": 145736000, "step": 67545 }, { "epoch": 11.01957585644372, "grad_norm": 0.44922634959220886, "learning_rate": 2.4915296173078177e-05, "loss": 0.1334, "num_input_tokens_seen": 145746112, "step": 67550 }, { "epoch": 11.020391517128875, "grad_norm": 0.6595988869667053, "learning_rate": 2.491173720324267e-05, "loss": 0.0464, "num_input_tokens_seen": 145757664, "step": 67555 }, { "epoch": 11.021207177814029, "grad_norm": 0.30862176418304443, "learning_rate": 2.4908178235195933e-05, "loss": 0.1115, "num_input_tokens_seen": 145768224, "step": 67560 }, { "epoch": 11.022022838499185, "grad_norm": 0.3454076647758484, "learning_rate": 2.4904619269010075e-05, "loss": 0.2495, "num_input_tokens_seen": 145780160, "step": 67565 }, { "epoch": 11.022838499184338, "grad_norm": 0.6270619034767151, "learning_rate": 2.4901060304757235e-05, "loss": 0.1198, "num_input_tokens_seen": 145790848, "step": 67570 }, { "epoch": 11.023654159869494, "grad_norm": 0.08966661989688873, "learning_rate": 2.489750134250953e-05, "loss": 0.2402, "num_input_tokens_seen": 145801344, "step": 67575 }, { "epoch": 11.02446982055465, "grad_norm": 0.25948959589004517, "learning_rate": 2.4893942382339082e-05, "loss": 0.1769, "num_input_tokens_seen": 145811680, "step": 67580 }, { "epoch": 11.025285481239804, "grad_norm": 0.8553170561790466, "learning_rate": 2.489038342431804e-05, "loss": 0.1548, "num_input_tokens_seen": 145822784, "step": 67585 }, { "epoch": 11.02610114192496, "grad_norm": 0.3521290123462677, "learning_rate": 2.4886824468518507e-05, "loss": 0.2155, "num_input_tokens_seen": 145833792, "step": 67590 }, { "epoch": 11.026916802610113, "grad_norm": 0.06242314353585243, "learning_rate": 2.488326551501263e-05, "loss": 0.16, "num_input_tokens_seen": 145844384, "step": 67595 }, { "epoch": 11.02773246329527, "grad_norm": 0.16536082327365875, "learning_rate": 2.487970656387251e-05, "loss": 0.171, "num_input_tokens_seen": 145855648, "step": 67600 }, { "epoch": 11.028548123980425, "grad_norm": 0.10414011776447296, "learning_rate": 2.4876147615170305e-05, "loss": 0.1187, "num_input_tokens_seen": 145865792, "step": 67605 }, { "epoch": 11.029363784665579, "grad_norm": 1.2197548151016235, "learning_rate": 2.4872588668978114e-05, "loss": 0.0514, "num_input_tokens_seen": 145875200, "step": 67610 }, { "epoch": 11.030179445350734, "grad_norm": 1.6792734861373901, "learning_rate": 2.4869029725368087e-05, "loss": 0.176, "num_input_tokens_seen": 145886912, "step": 67615 }, { "epoch": 11.030995106035888, "grad_norm": 0.37882059812545776, "learning_rate": 2.4865470784412326e-05, "loss": 0.0314, "num_input_tokens_seen": 145898176, "step": 67620 }, { "epoch": 11.031810766721044, "grad_norm": 0.0791369304060936, "learning_rate": 2.4861911846182984e-05, "loss": 0.1166, "num_input_tokens_seen": 145908128, "step": 67625 }, { "epoch": 11.0326264274062, "grad_norm": 3.138303518295288, "learning_rate": 2.4858352910752157e-05, "loss": 0.1709, "num_input_tokens_seen": 145920544, "step": 67630 }, { "epoch": 11.033442088091354, "grad_norm": 1.9845086336135864, "learning_rate": 2.4854793978192003e-05, "loss": 0.109, "num_input_tokens_seen": 145930208, "step": 67635 }, { "epoch": 11.03425774877651, "grad_norm": 0.9995412230491638, "learning_rate": 2.485123504857462e-05, "loss": 0.1402, "num_input_tokens_seen": 145941536, "step": 67640 }, { "epoch": 11.035073409461663, "grad_norm": 1.6245115995407104, "learning_rate": 2.4847676121972162e-05, "loss": 0.2038, "num_input_tokens_seen": 145953312, "step": 67645 }, { "epoch": 11.035889070146819, "grad_norm": 0.7025054097175598, "learning_rate": 2.4844117198456728e-05, "loss": 0.0684, "num_input_tokens_seen": 145963456, "step": 67650 }, { "epoch": 11.036704730831975, "grad_norm": 0.3382619321346283, "learning_rate": 2.484055827810047e-05, "loss": 0.0452, "num_input_tokens_seen": 145975008, "step": 67655 }, { "epoch": 11.037520391517129, "grad_norm": 1.2319368124008179, "learning_rate": 2.4836999360975484e-05, "loss": 0.2426, "num_input_tokens_seen": 145984928, "step": 67660 }, { "epoch": 11.038336052202284, "grad_norm": 1.2266727685928345, "learning_rate": 2.4833440447153922e-05, "loss": 0.123, "num_input_tokens_seen": 145995584, "step": 67665 }, { "epoch": 11.039151712887438, "grad_norm": 0.6907438039779663, "learning_rate": 2.48298815367079e-05, "loss": 0.1839, "num_input_tokens_seen": 146006368, "step": 67670 }, { "epoch": 11.039967373572594, "grad_norm": 2.3277690410614014, "learning_rate": 2.4826322629709538e-05, "loss": 0.2864, "num_input_tokens_seen": 146017184, "step": 67675 }, { "epoch": 11.040783034257748, "grad_norm": 1.585151195526123, "learning_rate": 2.4822763726230983e-05, "loss": 0.0934, "num_input_tokens_seen": 146027392, "step": 67680 }, { "epoch": 11.041598694942904, "grad_norm": 1.165251612663269, "learning_rate": 2.481920482634433e-05, "loss": 0.122, "num_input_tokens_seen": 146037280, "step": 67685 }, { "epoch": 11.04241435562806, "grad_norm": 0.053970012813806534, "learning_rate": 2.481564593012173e-05, "loss": 0.1465, "num_input_tokens_seen": 146048640, "step": 67690 }, { "epoch": 11.043230016313213, "grad_norm": 0.47207656502723694, "learning_rate": 2.4812087037635297e-05, "loss": 0.1618, "num_input_tokens_seen": 146059296, "step": 67695 }, { "epoch": 11.044045676998369, "grad_norm": 0.9289039969444275, "learning_rate": 2.480852814895716e-05, "loss": 0.1412, "num_input_tokens_seen": 146070240, "step": 67700 }, { "epoch": 11.044861337683523, "grad_norm": 0.03860383480787277, "learning_rate": 2.480496926415944e-05, "loss": 0.0202, "num_input_tokens_seen": 146080736, "step": 67705 }, { "epoch": 11.045676998368679, "grad_norm": 0.06457996368408203, "learning_rate": 2.480141038331427e-05, "loss": 0.1033, "num_input_tokens_seen": 146091488, "step": 67710 }, { "epoch": 11.046492659053834, "grad_norm": 0.04743264615535736, "learning_rate": 2.479785150649376e-05, "loss": 0.0372, "num_input_tokens_seen": 146101664, "step": 67715 }, { "epoch": 11.047308319738988, "grad_norm": 0.7038825750350952, "learning_rate": 2.4794292633770057e-05, "loss": 0.136, "num_input_tokens_seen": 146112448, "step": 67720 }, { "epoch": 11.048123980424144, "grad_norm": 0.21257930994033813, "learning_rate": 2.4790733765215267e-05, "loss": 0.1214, "num_input_tokens_seen": 146123648, "step": 67725 }, { "epoch": 11.048939641109298, "grad_norm": 0.07507006078958511, "learning_rate": 2.4787174900901533e-05, "loss": 0.1045, "num_input_tokens_seen": 146134368, "step": 67730 }, { "epoch": 11.049755301794454, "grad_norm": 0.17383423447608948, "learning_rate": 2.478361604090096e-05, "loss": 0.1348, "num_input_tokens_seen": 146145152, "step": 67735 }, { "epoch": 11.05057096247961, "grad_norm": 0.08180267363786697, "learning_rate": 2.478005718528569e-05, "loss": 0.0724, "num_input_tokens_seen": 146154752, "step": 67740 }, { "epoch": 11.051386623164763, "grad_norm": 0.35674595832824707, "learning_rate": 2.477649833412783e-05, "loss": 0.1469, "num_input_tokens_seen": 146165248, "step": 67745 }, { "epoch": 11.052202283849919, "grad_norm": 0.04192353039979935, "learning_rate": 2.4772939487499528e-05, "loss": 0.0819, "num_input_tokens_seen": 146176512, "step": 67750 }, { "epoch": 11.053017944535073, "grad_norm": 1.3312911987304688, "learning_rate": 2.476938064547288e-05, "loss": 0.0902, "num_input_tokens_seen": 146188608, "step": 67755 }, { "epoch": 11.053833605220229, "grad_norm": 0.11304809898138046, "learning_rate": 2.476582180812004e-05, "loss": 0.1728, "num_input_tokens_seen": 146199424, "step": 67760 }, { "epoch": 11.054649265905383, "grad_norm": 0.05317404493689537, "learning_rate": 2.4762262975513105e-05, "loss": 0.031, "num_input_tokens_seen": 146210688, "step": 67765 }, { "epoch": 11.055464926590538, "grad_norm": 0.2322806715965271, "learning_rate": 2.4758704147724228e-05, "loss": 0.0314, "num_input_tokens_seen": 146222176, "step": 67770 }, { "epoch": 11.056280587275694, "grad_norm": 0.8920825123786926, "learning_rate": 2.4755145324825503e-05, "loss": 0.1103, "num_input_tokens_seen": 146233632, "step": 67775 }, { "epoch": 11.057096247960848, "grad_norm": 1.352427363395691, "learning_rate": 2.475158650688907e-05, "loss": 0.1898, "num_input_tokens_seen": 146244512, "step": 67780 }, { "epoch": 11.057911908646004, "grad_norm": 1.2850098609924316, "learning_rate": 2.474802769398706e-05, "loss": 0.2383, "num_input_tokens_seen": 146255520, "step": 67785 }, { "epoch": 11.058727569331158, "grad_norm": 0.40865492820739746, "learning_rate": 2.474446888619158e-05, "loss": 0.0626, "num_input_tokens_seen": 146266592, "step": 67790 }, { "epoch": 11.059543230016313, "grad_norm": 0.6927269697189331, "learning_rate": 2.4740910083574772e-05, "loss": 0.0503, "num_input_tokens_seen": 146277472, "step": 67795 }, { "epoch": 11.060358890701469, "grad_norm": 1.271425724029541, "learning_rate": 2.4737351286208737e-05, "loss": 0.0913, "num_input_tokens_seen": 146287872, "step": 67800 }, { "epoch": 11.061174551386623, "grad_norm": 0.04083160683512688, "learning_rate": 2.4733792494165627e-05, "loss": 0.0846, "num_input_tokens_seen": 146298848, "step": 67805 }, { "epoch": 11.061990212071779, "grad_norm": 0.19053582847118378, "learning_rate": 2.4730233707517537e-05, "loss": 0.1223, "num_input_tokens_seen": 146309184, "step": 67810 }, { "epoch": 11.062805872756933, "grad_norm": 0.1254521608352661, "learning_rate": 2.4726674926336614e-05, "loss": 0.0349, "num_input_tokens_seen": 146319200, "step": 67815 }, { "epoch": 11.063621533442088, "grad_norm": 0.3484839200973511, "learning_rate": 2.4723116150694955e-05, "loss": 0.0596, "num_input_tokens_seen": 146330688, "step": 67820 }, { "epoch": 11.064437194127244, "grad_norm": 0.07206244766712189, "learning_rate": 2.4719557380664716e-05, "loss": 0.075, "num_input_tokens_seen": 146340384, "step": 67825 }, { "epoch": 11.065252854812398, "grad_norm": 0.4093727171421051, "learning_rate": 2.4715998616317987e-05, "loss": 0.0949, "num_input_tokens_seen": 146349888, "step": 67830 }, { "epoch": 11.066068515497554, "grad_norm": 0.5383801460266113, "learning_rate": 2.4712439857726915e-05, "loss": 0.0217, "num_input_tokens_seen": 146359648, "step": 67835 }, { "epoch": 11.066884176182707, "grad_norm": 0.053849197924137115, "learning_rate": 2.4708881104963613e-05, "loss": 0.0201, "num_input_tokens_seen": 146370656, "step": 67840 }, { "epoch": 11.067699836867863, "grad_norm": 0.12196069210767746, "learning_rate": 2.4705322358100203e-05, "loss": 0.1628, "num_input_tokens_seen": 146381216, "step": 67845 }, { "epoch": 11.068515497553017, "grad_norm": 0.05193619802594185, "learning_rate": 2.4701763617208812e-05, "loss": 0.1065, "num_input_tokens_seen": 146391520, "step": 67850 }, { "epoch": 11.069331158238173, "grad_norm": 1.4098707437515259, "learning_rate": 2.4698204882361554e-05, "loss": 0.1217, "num_input_tokens_seen": 146402016, "step": 67855 }, { "epoch": 11.070146818923329, "grad_norm": 0.20700211822986603, "learning_rate": 2.469464615363056e-05, "loss": 0.0917, "num_input_tokens_seen": 146412832, "step": 67860 }, { "epoch": 11.070962479608482, "grad_norm": 0.2403932809829712, "learning_rate": 2.4691087431087955e-05, "loss": 0.0958, "num_input_tokens_seen": 146424128, "step": 67865 }, { "epoch": 11.071778140293638, "grad_norm": 0.94744473695755, "learning_rate": 2.4687528714805843e-05, "loss": 0.2141, "num_input_tokens_seen": 146433632, "step": 67870 }, { "epoch": 11.072593800978792, "grad_norm": 0.2167527824640274, "learning_rate": 2.4683970004856373e-05, "loss": 0.0717, "num_input_tokens_seen": 146444512, "step": 67875 }, { "epoch": 11.073409461663948, "grad_norm": 2.1718080043792725, "learning_rate": 2.4680411301311637e-05, "loss": 0.1046, "num_input_tokens_seen": 146456192, "step": 67880 }, { "epoch": 11.074225122349104, "grad_norm": 0.3045082688331604, "learning_rate": 2.4676852604243773e-05, "loss": 0.0471, "num_input_tokens_seen": 146466560, "step": 67885 }, { "epoch": 11.075040783034257, "grad_norm": 1.8595339059829712, "learning_rate": 2.4673293913724914e-05, "loss": 0.1443, "num_input_tokens_seen": 146478080, "step": 67890 }, { "epoch": 11.075856443719413, "grad_norm": 0.09343697130680084, "learning_rate": 2.4669735229827155e-05, "loss": 0.0094, "num_input_tokens_seen": 146488608, "step": 67895 }, { "epoch": 11.076672104404567, "grad_norm": 0.5919698476791382, "learning_rate": 2.466617655262264e-05, "loss": 0.0766, "num_input_tokens_seen": 146498720, "step": 67900 }, { "epoch": 11.077487765089723, "grad_norm": 0.18072478473186493, "learning_rate": 2.4662617882183473e-05, "loss": 0.0733, "num_input_tokens_seen": 146509856, "step": 67905 }, { "epoch": 11.078303425774878, "grad_norm": 0.3899756669998169, "learning_rate": 2.465905921858179e-05, "loss": 0.169, "num_input_tokens_seen": 146520032, "step": 67910 }, { "epoch": 11.079119086460032, "grad_norm": 0.5160039663314819, "learning_rate": 2.46555005618897e-05, "loss": 0.0448, "num_input_tokens_seen": 146532096, "step": 67915 }, { "epoch": 11.079934747145188, "grad_norm": 1.1298532485961914, "learning_rate": 2.465194191217933e-05, "loss": 0.1214, "num_input_tokens_seen": 146541856, "step": 67920 }, { "epoch": 11.080750407830342, "grad_norm": 0.024656126275658607, "learning_rate": 2.4648383269522793e-05, "loss": 0.0634, "num_input_tokens_seen": 146552032, "step": 67925 }, { "epoch": 11.081566068515498, "grad_norm": 0.27143174409866333, "learning_rate": 2.4644824633992225e-05, "loss": 0.0465, "num_input_tokens_seen": 146564032, "step": 67930 }, { "epoch": 11.082381729200652, "grad_norm": 0.22808955609798431, "learning_rate": 2.4641266005659726e-05, "loss": 0.0193, "num_input_tokens_seen": 146574688, "step": 67935 }, { "epoch": 11.083197389885807, "grad_norm": 0.018299918621778488, "learning_rate": 2.4637707384597436e-05, "loss": 0.062, "num_input_tokens_seen": 146586464, "step": 67940 }, { "epoch": 11.084013050570963, "grad_norm": 0.13722388446331024, "learning_rate": 2.4634148770877458e-05, "loss": 0.1014, "num_input_tokens_seen": 146599008, "step": 67945 }, { "epoch": 11.084828711256117, "grad_norm": 0.6676216721534729, "learning_rate": 2.4630590164571926e-05, "loss": 0.0741, "num_input_tokens_seen": 146610336, "step": 67950 }, { "epoch": 11.085644371941273, "grad_norm": 0.45057547092437744, "learning_rate": 2.4627031565752944e-05, "loss": 0.2049, "num_input_tokens_seen": 146619680, "step": 67955 }, { "epoch": 11.086460032626427, "grad_norm": 1.1637479066848755, "learning_rate": 2.462347297449265e-05, "loss": 0.1733, "num_input_tokens_seen": 146629792, "step": 67960 }, { "epoch": 11.087275693311582, "grad_norm": 0.24363194406032562, "learning_rate": 2.4619914390863146e-05, "loss": 0.0266, "num_input_tokens_seen": 146640192, "step": 67965 }, { "epoch": 11.088091353996738, "grad_norm": 0.3733276128768921, "learning_rate": 2.461635581493657e-05, "loss": 0.1398, "num_input_tokens_seen": 146651904, "step": 67970 }, { "epoch": 11.088907014681892, "grad_norm": 0.0956702008843422, "learning_rate": 2.4612797246785015e-05, "loss": 0.0368, "num_input_tokens_seen": 146662080, "step": 67975 }, { "epoch": 11.089722675367048, "grad_norm": 1.3062328100204468, "learning_rate": 2.4609238686480622e-05, "loss": 0.1635, "num_input_tokens_seen": 146672352, "step": 67980 }, { "epoch": 11.090538336052202, "grad_norm": 0.052008822560310364, "learning_rate": 2.4605680134095506e-05, "loss": 0.0251, "num_input_tokens_seen": 146683040, "step": 67985 }, { "epoch": 11.091353996737357, "grad_norm": 0.19903817772865295, "learning_rate": 2.4602121589701773e-05, "loss": 0.0133, "num_input_tokens_seen": 146694720, "step": 67990 }, { "epoch": 11.092169657422513, "grad_norm": 1.0265947580337524, "learning_rate": 2.4598563053371564e-05, "loss": 0.0281, "num_input_tokens_seen": 146703776, "step": 67995 }, { "epoch": 11.092985318107667, "grad_norm": 1.328320860862732, "learning_rate": 2.459500452517697e-05, "loss": 0.082, "num_input_tokens_seen": 146714720, "step": 68000 }, { "epoch": 11.093800978792823, "grad_norm": 0.08102182298898697, "learning_rate": 2.4591446005190132e-05, "loss": 0.0117, "num_input_tokens_seen": 146724736, "step": 68005 }, { "epoch": 11.094616639477977, "grad_norm": 0.04237160086631775, "learning_rate": 2.4587887493483157e-05, "loss": 0.0297, "num_input_tokens_seen": 146736544, "step": 68010 }, { "epoch": 11.095432300163132, "grad_norm": 0.08851220458745956, "learning_rate": 2.4584328990128167e-05, "loss": 0.0186, "num_input_tokens_seen": 146748192, "step": 68015 }, { "epoch": 11.096247960848286, "grad_norm": 0.787174642086029, "learning_rate": 2.4580770495197275e-05, "loss": 0.0346, "num_input_tokens_seen": 146760384, "step": 68020 }, { "epoch": 11.097063621533442, "grad_norm": 0.11852854490280151, "learning_rate": 2.4577212008762602e-05, "loss": 0.1058, "num_input_tokens_seen": 146771904, "step": 68025 }, { "epoch": 11.097879282218598, "grad_norm": 0.23249347507953644, "learning_rate": 2.4573653530896256e-05, "loss": 0.1969, "num_input_tokens_seen": 146783296, "step": 68030 }, { "epoch": 11.098694942903752, "grad_norm": 0.030970975756645203, "learning_rate": 2.4570095061670375e-05, "loss": 0.1577, "num_input_tokens_seen": 146794176, "step": 68035 }, { "epoch": 11.099510603588907, "grad_norm": 0.1582077592611313, "learning_rate": 2.4566536601157055e-05, "loss": 0.2588, "num_input_tokens_seen": 146805504, "step": 68040 }, { "epoch": 11.100326264274061, "grad_norm": 0.08530991524457932, "learning_rate": 2.4562978149428425e-05, "loss": 0.0845, "num_input_tokens_seen": 146816096, "step": 68045 }, { "epoch": 11.101141924959217, "grad_norm": 0.03301296755671501, "learning_rate": 2.455941970655659e-05, "loss": 0.187, "num_input_tokens_seen": 146825888, "step": 68050 }, { "epoch": 11.101957585644373, "grad_norm": 0.9836829900741577, "learning_rate": 2.4555861272613688e-05, "loss": 0.1586, "num_input_tokens_seen": 146837344, "step": 68055 }, { "epoch": 11.102773246329527, "grad_norm": 1.9434661865234375, "learning_rate": 2.4552302847671804e-05, "loss": 0.1116, "num_input_tokens_seen": 146848128, "step": 68060 }, { "epoch": 11.103588907014682, "grad_norm": 0.37251442670822144, "learning_rate": 2.4548744431803083e-05, "loss": 0.0162, "num_input_tokens_seen": 146857632, "step": 68065 }, { "epoch": 11.104404567699836, "grad_norm": 0.07969661802053452, "learning_rate": 2.454518602507962e-05, "loss": 0.039, "num_input_tokens_seen": 146867488, "step": 68070 }, { "epoch": 11.105220228384992, "grad_norm": 0.7826928496360779, "learning_rate": 2.454162762757355e-05, "loss": 0.1376, "num_input_tokens_seen": 146878432, "step": 68075 }, { "epoch": 11.106035889070148, "grad_norm": 0.06751958280801773, "learning_rate": 2.4538069239356962e-05, "loss": 0.0263, "num_input_tokens_seen": 146888736, "step": 68080 }, { "epoch": 11.106851549755302, "grad_norm": 0.03801910579204559, "learning_rate": 2.4534510860502002e-05, "loss": 0.0939, "num_input_tokens_seen": 146896352, "step": 68085 }, { "epoch": 11.107667210440457, "grad_norm": 0.3391757905483246, "learning_rate": 2.453095249108076e-05, "loss": 0.1409, "num_input_tokens_seen": 146905792, "step": 68090 }, { "epoch": 11.108482871125611, "grad_norm": 1.5424010753631592, "learning_rate": 2.4527394131165358e-05, "loss": 0.2248, "num_input_tokens_seen": 146918272, "step": 68095 }, { "epoch": 11.109298531810767, "grad_norm": 0.06828660517930984, "learning_rate": 2.4523835780827927e-05, "loss": 0.2532, "num_input_tokens_seen": 146928832, "step": 68100 }, { "epoch": 11.11011419249592, "grad_norm": 0.3801545202732086, "learning_rate": 2.4520277440140552e-05, "loss": 0.0327, "num_input_tokens_seen": 146939776, "step": 68105 }, { "epoch": 11.110929853181077, "grad_norm": 0.13793671131134033, "learning_rate": 2.4516719109175375e-05, "loss": 0.1079, "num_input_tokens_seen": 146949728, "step": 68110 }, { "epoch": 11.111745513866232, "grad_norm": 0.041356321424245834, "learning_rate": 2.451316078800449e-05, "loss": 0.2157, "num_input_tokens_seen": 146960480, "step": 68115 }, { "epoch": 11.112561174551386, "grad_norm": 0.9376341104507446, "learning_rate": 2.4509602476700035e-05, "loss": 0.06, "num_input_tokens_seen": 146971136, "step": 68120 }, { "epoch": 11.113376835236542, "grad_norm": 0.13018669188022614, "learning_rate": 2.4506044175334088e-05, "loss": 0.0405, "num_input_tokens_seen": 146981536, "step": 68125 }, { "epoch": 11.114192495921696, "grad_norm": 0.5597754716873169, "learning_rate": 2.45024858839788e-05, "loss": 0.2205, "num_input_tokens_seen": 146991840, "step": 68130 }, { "epoch": 11.115008156606851, "grad_norm": 0.24036826193332672, "learning_rate": 2.449892760270625e-05, "loss": 0.018, "num_input_tokens_seen": 147001632, "step": 68135 }, { "epoch": 11.115823817292007, "grad_norm": 1.746964693069458, "learning_rate": 2.4495369331588582e-05, "loss": 0.2337, "num_input_tokens_seen": 147012448, "step": 68140 }, { "epoch": 11.116639477977161, "grad_norm": 0.9175130724906921, "learning_rate": 2.4491811070697884e-05, "loss": 0.217, "num_input_tokens_seen": 147022336, "step": 68145 }, { "epoch": 11.117455138662317, "grad_norm": 1.3970295190811157, "learning_rate": 2.4488252820106288e-05, "loss": 0.3417, "num_input_tokens_seen": 147033344, "step": 68150 }, { "epoch": 11.11827079934747, "grad_norm": 0.053646985441446304, "learning_rate": 2.448469457988589e-05, "loss": 0.1997, "num_input_tokens_seen": 147043360, "step": 68155 }, { "epoch": 11.119086460032626, "grad_norm": 0.8003740906715393, "learning_rate": 2.4481136350108818e-05, "loss": 0.0623, "num_input_tokens_seen": 147053696, "step": 68160 }, { "epoch": 11.119902120717782, "grad_norm": 0.07027627527713776, "learning_rate": 2.447757813084717e-05, "loss": 0.0484, "num_input_tokens_seen": 147065248, "step": 68165 }, { "epoch": 11.120717781402936, "grad_norm": 0.05093909800052643, "learning_rate": 2.4474019922173066e-05, "loss": 0.1232, "num_input_tokens_seen": 147075392, "step": 68170 }, { "epoch": 11.121533442088092, "grad_norm": 0.4270585775375366, "learning_rate": 2.447046172415861e-05, "loss": 0.0353, "num_input_tokens_seen": 147086944, "step": 68175 }, { "epoch": 11.122349102773246, "grad_norm": 0.9329542517662048, "learning_rate": 2.446690353687593e-05, "loss": 0.0367, "num_input_tokens_seen": 147096640, "step": 68180 }, { "epoch": 11.123164763458401, "grad_norm": 0.2397291660308838, "learning_rate": 2.4463345360397116e-05, "loss": 0.1616, "num_input_tokens_seen": 147107136, "step": 68185 }, { "epoch": 11.123980424143557, "grad_norm": 0.09336364269256592, "learning_rate": 2.4459787194794298e-05, "loss": 0.0599, "num_input_tokens_seen": 147118144, "step": 68190 }, { "epoch": 11.124796084828711, "grad_norm": 0.14294375479221344, "learning_rate": 2.445622904013957e-05, "loss": 0.0147, "num_input_tokens_seen": 147129504, "step": 68195 }, { "epoch": 11.125611745513867, "grad_norm": 0.22692757844924927, "learning_rate": 2.4452670896505047e-05, "loss": 0.0124, "num_input_tokens_seen": 147140928, "step": 68200 }, { "epoch": 11.12642740619902, "grad_norm": 0.1644526720046997, "learning_rate": 2.4449112763962856e-05, "loss": 0.1029, "num_input_tokens_seen": 147152064, "step": 68205 }, { "epoch": 11.127243066884176, "grad_norm": 0.04025828838348389, "learning_rate": 2.4445554642585082e-05, "loss": 0.0104, "num_input_tokens_seen": 147162528, "step": 68210 }, { "epoch": 11.12805872756933, "grad_norm": 0.405274361371994, "learning_rate": 2.4441996532443857e-05, "loss": 0.0405, "num_input_tokens_seen": 147173056, "step": 68215 }, { "epoch": 11.128874388254486, "grad_norm": 0.2521989643573761, "learning_rate": 2.443843843361127e-05, "loss": 0.0381, "num_input_tokens_seen": 147184416, "step": 68220 }, { "epoch": 11.129690048939642, "grad_norm": 1.2021982669830322, "learning_rate": 2.443488034615945e-05, "loss": 0.4588, "num_input_tokens_seen": 147196224, "step": 68225 }, { "epoch": 11.130505709624796, "grad_norm": 1.6817951202392578, "learning_rate": 2.4431322270160486e-05, "loss": 0.0997, "num_input_tokens_seen": 147206336, "step": 68230 }, { "epoch": 11.131321370309951, "grad_norm": 1.651524543762207, "learning_rate": 2.442776420568651e-05, "loss": 0.2269, "num_input_tokens_seen": 147216288, "step": 68235 }, { "epoch": 11.132137030995105, "grad_norm": 0.19307133555412292, "learning_rate": 2.4424206152809608e-05, "loss": 0.1261, "num_input_tokens_seen": 147227744, "step": 68240 }, { "epoch": 11.132952691680261, "grad_norm": 0.15324239432811737, "learning_rate": 2.4420648111601906e-05, "loss": 0.0123, "num_input_tokens_seen": 147238496, "step": 68245 }, { "epoch": 11.133768352365417, "grad_norm": 0.22166146337985992, "learning_rate": 2.4417090082135497e-05, "loss": 0.0512, "num_input_tokens_seen": 147250944, "step": 68250 }, { "epoch": 11.13458401305057, "grad_norm": 0.04003307223320007, "learning_rate": 2.4413532064482508e-05, "loss": 0.0331, "num_input_tokens_seen": 147261760, "step": 68255 }, { "epoch": 11.135399673735726, "grad_norm": 1.9500963687896729, "learning_rate": 2.4409974058715023e-05, "loss": 0.2248, "num_input_tokens_seen": 147272000, "step": 68260 }, { "epoch": 11.13621533442088, "grad_norm": 0.059859953820705414, "learning_rate": 2.4406416064905175e-05, "loss": 0.0365, "num_input_tokens_seen": 147283296, "step": 68265 }, { "epoch": 11.137030995106036, "grad_norm": 0.26601919531822205, "learning_rate": 2.4402858083125047e-05, "loss": 0.0335, "num_input_tokens_seen": 147293664, "step": 68270 }, { "epoch": 11.137846655791192, "grad_norm": 1.3185609579086304, "learning_rate": 2.439930011344677e-05, "loss": 0.1476, "num_input_tokens_seen": 147304032, "step": 68275 }, { "epoch": 11.138662316476346, "grad_norm": 0.2748582363128662, "learning_rate": 2.439574215594242e-05, "loss": 0.1965, "num_input_tokens_seen": 147313536, "step": 68280 }, { "epoch": 11.139477977161501, "grad_norm": 0.53524249792099, "learning_rate": 2.439218421068414e-05, "loss": 0.0824, "num_input_tokens_seen": 147324704, "step": 68285 }, { "epoch": 11.140293637846655, "grad_norm": 0.27792683243751526, "learning_rate": 2.4388626277744008e-05, "loss": 0.0342, "num_input_tokens_seen": 147333952, "step": 68290 }, { "epoch": 11.141109298531811, "grad_norm": 0.07953805476427078, "learning_rate": 2.438506835719414e-05, "loss": 0.2448, "num_input_tokens_seen": 147345120, "step": 68295 }, { "epoch": 11.141924959216965, "grad_norm": 1.4429244995117188, "learning_rate": 2.4381510449106644e-05, "loss": 0.0887, "num_input_tokens_seen": 147356864, "step": 68300 }, { "epoch": 11.14274061990212, "grad_norm": 0.9431722164154053, "learning_rate": 2.437795255355362e-05, "loss": 0.0528, "num_input_tokens_seen": 147367264, "step": 68305 }, { "epoch": 11.143556280587276, "grad_norm": 0.7604250907897949, "learning_rate": 2.4374394670607187e-05, "loss": 0.1863, "num_input_tokens_seen": 147378208, "step": 68310 }, { "epoch": 11.14437194127243, "grad_norm": 0.028236249461770058, "learning_rate": 2.4370836800339428e-05, "loss": 0.1667, "num_input_tokens_seen": 147389568, "step": 68315 }, { "epoch": 11.145187601957586, "grad_norm": 0.21316590905189514, "learning_rate": 2.4367278942822463e-05, "loss": 0.2057, "num_input_tokens_seen": 147400768, "step": 68320 }, { "epoch": 11.14600326264274, "grad_norm": 1.18587327003479, "learning_rate": 2.4363721098128394e-05, "loss": 0.0965, "num_input_tokens_seen": 147411840, "step": 68325 }, { "epoch": 11.146818923327896, "grad_norm": 0.34392380714416504, "learning_rate": 2.4360163266329324e-05, "loss": 0.1146, "num_input_tokens_seen": 147421472, "step": 68330 }, { "epoch": 11.147634584013051, "grad_norm": 0.3555492162704468, "learning_rate": 2.435660544749736e-05, "loss": 0.0472, "num_input_tokens_seen": 147431744, "step": 68335 }, { "epoch": 11.148450244698205, "grad_norm": 0.7113432288169861, "learning_rate": 2.43530476417046e-05, "loss": 0.0599, "num_input_tokens_seen": 147443232, "step": 68340 }, { "epoch": 11.149265905383361, "grad_norm": 0.05485956370830536, "learning_rate": 2.4349489849023144e-05, "loss": 0.0863, "num_input_tokens_seen": 147453248, "step": 68345 }, { "epoch": 11.150081566068515, "grad_norm": 1.110236406326294, "learning_rate": 2.4345932069525116e-05, "loss": 0.1857, "num_input_tokens_seen": 147464704, "step": 68350 }, { "epoch": 11.15089722675367, "grad_norm": 0.10864017903804779, "learning_rate": 2.4342374303282588e-05, "loss": 0.0433, "num_input_tokens_seen": 147474976, "step": 68355 }, { "epoch": 11.151712887438826, "grad_norm": 0.5716484785079956, "learning_rate": 2.4338816550367693e-05, "loss": 0.0532, "num_input_tokens_seen": 147485536, "step": 68360 }, { "epoch": 11.15252854812398, "grad_norm": 0.2857135534286499, "learning_rate": 2.433525881085251e-05, "loss": 0.1071, "num_input_tokens_seen": 147497408, "step": 68365 }, { "epoch": 11.153344208809136, "grad_norm": 0.02969321794807911, "learning_rate": 2.4331701084809163e-05, "loss": 0.1193, "num_input_tokens_seen": 147507360, "step": 68370 }, { "epoch": 11.15415986949429, "grad_norm": 2.18011474609375, "learning_rate": 2.4328143372309726e-05, "loss": 0.0386, "num_input_tokens_seen": 147517088, "step": 68375 }, { "epoch": 11.154975530179446, "grad_norm": 0.14289626479148865, "learning_rate": 2.4324585673426324e-05, "loss": 0.0645, "num_input_tokens_seen": 147526880, "step": 68380 }, { "epoch": 11.1557911908646, "grad_norm": 0.06431286782026291, "learning_rate": 2.4321027988231046e-05, "loss": 0.012, "num_input_tokens_seen": 147538816, "step": 68385 }, { "epoch": 11.156606851549755, "grad_norm": 0.11011897772550583, "learning_rate": 2.4317470316796004e-05, "loss": 0.0263, "num_input_tokens_seen": 147550752, "step": 68390 }, { "epoch": 11.15742251223491, "grad_norm": 0.0548817403614521, "learning_rate": 2.431391265919328e-05, "loss": 0.1232, "num_input_tokens_seen": 147562080, "step": 68395 }, { "epoch": 11.158238172920065, "grad_norm": 1.6454840898513794, "learning_rate": 2.4310355015495005e-05, "loss": 0.1011, "num_input_tokens_seen": 147573472, "step": 68400 }, { "epoch": 11.15905383360522, "grad_norm": 2.4642226696014404, "learning_rate": 2.4306797385773242e-05, "loss": 0.1216, "num_input_tokens_seen": 147583648, "step": 68405 }, { "epoch": 11.159869494290374, "grad_norm": 0.5053693652153015, "learning_rate": 2.4303239770100124e-05, "loss": 0.0306, "num_input_tokens_seen": 147593728, "step": 68410 }, { "epoch": 11.16068515497553, "grad_norm": 0.6978227496147156, "learning_rate": 2.4299682168547723e-05, "loss": 0.0278, "num_input_tokens_seen": 147604352, "step": 68415 }, { "epoch": 11.161500815660686, "grad_norm": 0.09660188853740692, "learning_rate": 2.4296124581188152e-05, "loss": 0.1237, "num_input_tokens_seen": 147616320, "step": 68420 }, { "epoch": 11.16231647634584, "grad_norm": 0.15440548956394196, "learning_rate": 2.429256700809352e-05, "loss": 0.0341, "num_input_tokens_seen": 147627616, "step": 68425 }, { "epoch": 11.163132137030995, "grad_norm": 1.371533989906311, "learning_rate": 2.4289009449335903e-05, "loss": 0.2268, "num_input_tokens_seen": 147638080, "step": 68430 }, { "epoch": 11.16394779771615, "grad_norm": 1.146743893623352, "learning_rate": 2.4285451904987422e-05, "loss": 0.163, "num_input_tokens_seen": 147648832, "step": 68435 }, { "epoch": 11.164763458401305, "grad_norm": 0.3588942289352417, "learning_rate": 2.4281894375120158e-05, "loss": 0.046, "num_input_tokens_seen": 147659168, "step": 68440 }, { "epoch": 11.16557911908646, "grad_norm": 1.3703206777572632, "learning_rate": 2.4278336859806224e-05, "loss": 0.1535, "num_input_tokens_seen": 147669920, "step": 68445 }, { "epoch": 11.166394779771615, "grad_norm": 0.3249145448207855, "learning_rate": 2.4274779359117693e-05, "loss": 0.1254, "num_input_tokens_seen": 147680672, "step": 68450 }, { "epoch": 11.16721044045677, "grad_norm": 0.07359687238931656, "learning_rate": 2.4271221873126695e-05, "loss": 0.0933, "num_input_tokens_seen": 147691712, "step": 68455 }, { "epoch": 11.168026101141924, "grad_norm": 0.1906709372997284, "learning_rate": 2.42676644019053e-05, "loss": 0.0461, "num_input_tokens_seen": 147701280, "step": 68460 }, { "epoch": 11.16884176182708, "grad_norm": 0.27921634912490845, "learning_rate": 2.4264106945525617e-05, "loss": 0.1392, "num_input_tokens_seen": 147712320, "step": 68465 }, { "epoch": 11.169657422512234, "grad_norm": 0.07571344822645187, "learning_rate": 2.4260549504059742e-05, "loss": 0.2002, "num_input_tokens_seen": 147723008, "step": 68470 }, { "epoch": 11.17047308319739, "grad_norm": 0.16844503581523895, "learning_rate": 2.425699207757977e-05, "loss": 0.0183, "num_input_tokens_seen": 147732736, "step": 68475 }, { "epoch": 11.171288743882545, "grad_norm": 0.3898528218269348, "learning_rate": 2.4253434666157796e-05, "loss": 0.0554, "num_input_tokens_seen": 147741792, "step": 68480 }, { "epoch": 11.1721044045677, "grad_norm": 1.2180935144424438, "learning_rate": 2.4249877269865913e-05, "loss": 0.0486, "num_input_tokens_seen": 147752672, "step": 68485 }, { "epoch": 11.172920065252855, "grad_norm": 0.8251121640205383, "learning_rate": 2.424631988877622e-05, "loss": 0.0637, "num_input_tokens_seen": 147763296, "step": 68490 }, { "epoch": 11.173735725938009, "grad_norm": 0.0373954251408577, "learning_rate": 2.4242762522960812e-05, "loss": 0.0704, "num_input_tokens_seen": 147775584, "step": 68495 }, { "epoch": 11.174551386623165, "grad_norm": 1.2204227447509766, "learning_rate": 2.423920517249178e-05, "loss": 0.1957, "num_input_tokens_seen": 147785568, "step": 68500 }, { "epoch": 11.17536704730832, "grad_norm": 0.6486819386482239, "learning_rate": 2.4235647837441225e-05, "loss": 0.0795, "num_input_tokens_seen": 147796416, "step": 68505 }, { "epoch": 11.176182707993474, "grad_norm": 0.6605256199836731, "learning_rate": 2.4232090517881227e-05, "loss": 0.066, "num_input_tokens_seen": 147806848, "step": 68510 }, { "epoch": 11.17699836867863, "grad_norm": 0.07258416712284088, "learning_rate": 2.42285332138839e-05, "loss": 0.0134, "num_input_tokens_seen": 147816992, "step": 68515 }, { "epoch": 11.177814029363784, "grad_norm": 0.7045605182647705, "learning_rate": 2.4224975925521315e-05, "loss": 0.0477, "num_input_tokens_seen": 147828384, "step": 68520 }, { "epoch": 11.17862969004894, "grad_norm": 0.7427496314048767, "learning_rate": 2.4221418652865576e-05, "loss": 0.038, "num_input_tokens_seen": 147838528, "step": 68525 }, { "epoch": 11.179445350734095, "grad_norm": 0.32940909266471863, "learning_rate": 2.4217861395988788e-05, "loss": 0.0587, "num_input_tokens_seen": 147850688, "step": 68530 }, { "epoch": 11.18026101141925, "grad_norm": 2.1069865226745605, "learning_rate": 2.4214304154963018e-05, "loss": 0.0658, "num_input_tokens_seen": 147861504, "step": 68535 }, { "epoch": 11.181076672104405, "grad_norm": 0.16151341795921326, "learning_rate": 2.4210746929860383e-05, "loss": 0.046, "num_input_tokens_seen": 147872032, "step": 68540 }, { "epoch": 11.181892332789559, "grad_norm": 1.1703846454620361, "learning_rate": 2.420718972075295e-05, "loss": 0.1221, "num_input_tokens_seen": 147881664, "step": 68545 }, { "epoch": 11.182707993474715, "grad_norm": 0.293515145778656, "learning_rate": 2.420363252771284e-05, "loss": 0.157, "num_input_tokens_seen": 147891936, "step": 68550 }, { "epoch": 11.18352365415987, "grad_norm": 0.6459084749221802, "learning_rate": 2.420007535081211e-05, "loss": 0.0425, "num_input_tokens_seen": 147903264, "step": 68555 }, { "epoch": 11.184339314845024, "grad_norm": 0.10097154974937439, "learning_rate": 2.419651819012288e-05, "loss": 0.0299, "num_input_tokens_seen": 147913984, "step": 68560 }, { "epoch": 11.18515497553018, "grad_norm": 0.5257571935653687, "learning_rate": 2.4192961045717218e-05, "loss": 0.1099, "num_input_tokens_seen": 147924704, "step": 68565 }, { "epoch": 11.185970636215334, "grad_norm": 0.045710060745477676, "learning_rate": 2.4189403917667236e-05, "loss": 0.0481, "num_input_tokens_seen": 147935872, "step": 68570 }, { "epoch": 11.18678629690049, "grad_norm": 0.39849844574928284, "learning_rate": 2.4185846806045e-05, "loss": 0.056, "num_input_tokens_seen": 147946720, "step": 68575 }, { "epoch": 11.187601957585644, "grad_norm": 0.04385187849402428, "learning_rate": 2.4182289710922624e-05, "loss": 0.0269, "num_input_tokens_seen": 147957920, "step": 68580 }, { "epoch": 11.1884176182708, "grad_norm": 2.4265365600585938, "learning_rate": 2.417873263237217e-05, "loss": 0.1281, "num_input_tokens_seen": 147968768, "step": 68585 }, { "epoch": 11.189233278955955, "grad_norm": 0.13967365026474, "learning_rate": 2.4175175570465753e-05, "loss": 0.0231, "num_input_tokens_seen": 147979808, "step": 68590 }, { "epoch": 11.190048939641109, "grad_norm": 2.027646064758301, "learning_rate": 2.4171618525275437e-05, "loss": 0.1032, "num_input_tokens_seen": 147990080, "step": 68595 }, { "epoch": 11.190864600326265, "grad_norm": 1.2878361940383911, "learning_rate": 2.416806149687334e-05, "loss": 0.1062, "num_input_tokens_seen": 148001408, "step": 68600 }, { "epoch": 11.191680261011419, "grad_norm": 0.061018213629722595, "learning_rate": 2.4164504485331515e-05, "loss": 0.0666, "num_input_tokens_seen": 148011936, "step": 68605 }, { "epoch": 11.192495921696574, "grad_norm": 1.175412654876709, "learning_rate": 2.4160947490722073e-05, "loss": 0.1751, "num_input_tokens_seen": 148022752, "step": 68610 }, { "epoch": 11.19331158238173, "grad_norm": 1.1872999668121338, "learning_rate": 2.4157390513117095e-05, "loss": 0.0653, "num_input_tokens_seen": 148033536, "step": 68615 }, { "epoch": 11.194127243066884, "grad_norm": 0.027252042666077614, "learning_rate": 2.4153833552588668e-05, "loss": 0.0459, "num_input_tokens_seen": 148045216, "step": 68620 }, { "epoch": 11.19494290375204, "grad_norm": 0.03562507405877113, "learning_rate": 2.4150276609208877e-05, "loss": 0.0222, "num_input_tokens_seen": 148055904, "step": 68625 }, { "epoch": 11.195758564437194, "grad_norm": 0.09410952776670456, "learning_rate": 2.4146719683049804e-05, "loss": 0.027, "num_input_tokens_seen": 148066080, "step": 68630 }, { "epoch": 11.19657422512235, "grad_norm": 0.018496602773666382, "learning_rate": 2.4143162774183542e-05, "loss": 0.2434, "num_input_tokens_seen": 148076992, "step": 68635 }, { "epoch": 11.197389885807505, "grad_norm": 0.13010399043560028, "learning_rate": 2.4139605882682177e-05, "loss": 0.0924, "num_input_tokens_seen": 148086624, "step": 68640 }, { "epoch": 11.198205546492659, "grad_norm": 0.6958179473876953, "learning_rate": 2.4136049008617786e-05, "loss": 0.0377, "num_input_tokens_seen": 148096672, "step": 68645 }, { "epoch": 11.199021207177815, "grad_norm": 0.12161976099014282, "learning_rate": 2.413249215206246e-05, "loss": 0.072, "num_input_tokens_seen": 148106496, "step": 68650 }, { "epoch": 11.199836867862969, "grad_norm": 0.7909783720970154, "learning_rate": 2.412893531308828e-05, "loss": 0.2589, "num_input_tokens_seen": 148117472, "step": 68655 }, { "epoch": 11.200652528548124, "grad_norm": 0.18064621090888977, "learning_rate": 2.4125378491767327e-05, "loss": 0.0555, "num_input_tokens_seen": 148126592, "step": 68660 }, { "epoch": 11.201468189233278, "grad_norm": 0.9686538577079773, "learning_rate": 2.41218216881717e-05, "loss": 0.1562, "num_input_tokens_seen": 148138272, "step": 68665 }, { "epoch": 11.202283849918434, "grad_norm": 0.8825687170028687, "learning_rate": 2.411826490237346e-05, "loss": 0.0832, "num_input_tokens_seen": 148148864, "step": 68670 }, { "epoch": 11.20309951060359, "grad_norm": 0.18006475269794464, "learning_rate": 2.4114708134444708e-05, "loss": 0.1106, "num_input_tokens_seen": 148159296, "step": 68675 }, { "epoch": 11.203915171288743, "grad_norm": 0.9291273355484009, "learning_rate": 2.4111151384457507e-05, "loss": 0.034, "num_input_tokens_seen": 148169088, "step": 68680 }, { "epoch": 11.2047308319739, "grad_norm": 0.18576033413410187, "learning_rate": 2.4107594652483967e-05, "loss": 0.2972, "num_input_tokens_seen": 148180384, "step": 68685 }, { "epoch": 11.205546492659053, "grad_norm": 2.090148687362671, "learning_rate": 2.4104037938596138e-05, "loss": 0.198, "num_input_tokens_seen": 148190528, "step": 68690 }, { "epoch": 11.206362153344209, "grad_norm": 1.2263036966323853, "learning_rate": 2.410048124286613e-05, "loss": 0.0609, "num_input_tokens_seen": 148201216, "step": 68695 }, { "epoch": 11.207177814029365, "grad_norm": 0.04676644876599312, "learning_rate": 2.4096924565366e-05, "loss": 0.0365, "num_input_tokens_seen": 148212160, "step": 68700 }, { "epoch": 11.207993474714518, "grad_norm": 2.3418967723846436, "learning_rate": 2.409336790616785e-05, "loss": 0.1142, "num_input_tokens_seen": 148223712, "step": 68705 }, { "epoch": 11.208809135399674, "grad_norm": 0.07113327085971832, "learning_rate": 2.4089811265343738e-05, "loss": 0.0333, "num_input_tokens_seen": 148234592, "step": 68710 }, { "epoch": 11.209624796084828, "grad_norm": 1.758745551109314, "learning_rate": 2.4086254642965767e-05, "loss": 0.0814, "num_input_tokens_seen": 148245440, "step": 68715 }, { "epoch": 11.210440456769984, "grad_norm": 0.12467214465141296, "learning_rate": 2.408269803910599e-05, "loss": 0.1752, "num_input_tokens_seen": 148255904, "step": 68720 }, { "epoch": 11.21125611745514, "grad_norm": 0.6722206473350525, "learning_rate": 2.407914145383652e-05, "loss": 0.0811, "num_input_tokens_seen": 148266880, "step": 68725 }, { "epoch": 11.212071778140293, "grad_norm": 0.0364130362868309, "learning_rate": 2.40755848872294e-05, "loss": 0.1031, "num_input_tokens_seen": 148277760, "step": 68730 }, { "epoch": 11.21288743882545, "grad_norm": 0.15293793380260468, "learning_rate": 2.4072028339356722e-05, "loss": 0.0178, "num_input_tokens_seen": 148288160, "step": 68735 }, { "epoch": 11.213703099510603, "grad_norm": 1.6355352401733398, "learning_rate": 2.406847181029058e-05, "loss": 0.1361, "num_input_tokens_seen": 148298336, "step": 68740 }, { "epoch": 11.214518760195759, "grad_norm": 2.8928709030151367, "learning_rate": 2.4064915300103033e-05, "loss": 0.1532, "num_input_tokens_seen": 148309248, "step": 68745 }, { "epoch": 11.215334420880913, "grad_norm": 1.7642605304718018, "learning_rate": 2.4061358808866173e-05, "loss": 0.1708, "num_input_tokens_seen": 148319904, "step": 68750 }, { "epoch": 11.216150081566068, "grad_norm": 0.7133986949920654, "learning_rate": 2.4057802336652053e-05, "loss": 0.0166, "num_input_tokens_seen": 148331264, "step": 68755 }, { "epoch": 11.216965742251224, "grad_norm": 0.9844759106636047, "learning_rate": 2.4054245883532777e-05, "loss": 0.257, "num_input_tokens_seen": 148342624, "step": 68760 }, { "epoch": 11.217781402936378, "grad_norm": 0.11645258963108063, "learning_rate": 2.4050689449580392e-05, "loss": 0.1364, "num_input_tokens_seen": 148353632, "step": 68765 }, { "epoch": 11.218597063621534, "grad_norm": 0.372880756855011, "learning_rate": 2.4047133034867006e-05, "loss": 0.0577, "num_input_tokens_seen": 148362816, "step": 68770 }, { "epoch": 11.219412724306688, "grad_norm": 0.2667055130004883, "learning_rate": 2.4043576639464666e-05, "loss": 0.036, "num_input_tokens_seen": 148373792, "step": 68775 }, { "epoch": 11.220228384991843, "grad_norm": 0.1552053987979889, "learning_rate": 2.4040020263445466e-05, "loss": 0.0469, "num_input_tokens_seen": 148384640, "step": 68780 }, { "epoch": 11.221044045676999, "grad_norm": 1.8859872817993164, "learning_rate": 2.403646390688147e-05, "loss": 0.1179, "num_input_tokens_seen": 148395424, "step": 68785 }, { "epoch": 11.221859706362153, "grad_norm": 1.2131867408752441, "learning_rate": 2.4032907569844757e-05, "loss": 0.2035, "num_input_tokens_seen": 148406304, "step": 68790 }, { "epoch": 11.222675367047309, "grad_norm": 0.08506540954113007, "learning_rate": 2.40293512524074e-05, "loss": 0.2548, "num_input_tokens_seen": 148416928, "step": 68795 }, { "epoch": 11.223491027732463, "grad_norm": 0.14269225299358368, "learning_rate": 2.4025794954641466e-05, "loss": 0.0985, "num_input_tokens_seen": 148428768, "step": 68800 }, { "epoch": 11.224306688417618, "grad_norm": 0.08552254736423492, "learning_rate": 2.402223867661903e-05, "loss": 0.0153, "num_input_tokens_seen": 148440256, "step": 68805 }, { "epoch": 11.225122349102774, "grad_norm": 0.03698950633406639, "learning_rate": 2.401868241841218e-05, "loss": 0.0274, "num_input_tokens_seen": 148450016, "step": 68810 }, { "epoch": 11.225938009787928, "grad_norm": 0.06970974057912827, "learning_rate": 2.4015126180092965e-05, "loss": 0.0944, "num_input_tokens_seen": 148462336, "step": 68815 }, { "epoch": 11.226753670473084, "grad_norm": 1.5857720375061035, "learning_rate": 2.4011569961733478e-05, "loss": 0.2142, "num_input_tokens_seen": 148473344, "step": 68820 }, { "epoch": 11.227569331158238, "grad_norm": 0.056459926068782806, "learning_rate": 2.4008013763405767e-05, "loss": 0.0417, "num_input_tokens_seen": 148483936, "step": 68825 }, { "epoch": 11.228384991843393, "grad_norm": 0.0186348557472229, "learning_rate": 2.4004457585181927e-05, "loss": 0.1108, "num_input_tokens_seen": 148493440, "step": 68830 }, { "epoch": 11.229200652528547, "grad_norm": 0.032635707408189774, "learning_rate": 2.400090142713401e-05, "loss": 0.0293, "num_input_tokens_seen": 148504192, "step": 68835 }, { "epoch": 11.230016313213703, "grad_norm": 0.8280782699584961, "learning_rate": 2.399734528933409e-05, "loss": 0.1718, "num_input_tokens_seen": 148515520, "step": 68840 }, { "epoch": 11.230831973898859, "grad_norm": 0.025271417573094368, "learning_rate": 2.399378917185425e-05, "loss": 0.065, "num_input_tokens_seen": 148526464, "step": 68845 }, { "epoch": 11.231647634584013, "grad_norm": 0.46877846121788025, "learning_rate": 2.3990233074766537e-05, "loss": 0.1069, "num_input_tokens_seen": 148537248, "step": 68850 }, { "epoch": 11.232463295269168, "grad_norm": 1.6848506927490234, "learning_rate": 2.3986676998143045e-05, "loss": 0.2552, "num_input_tokens_seen": 148547904, "step": 68855 }, { "epoch": 11.233278955954322, "grad_norm": 0.9591245651245117, "learning_rate": 2.3983120942055816e-05, "loss": 0.0326, "num_input_tokens_seen": 148559712, "step": 68860 }, { "epoch": 11.234094616639478, "grad_norm": 0.11547929048538208, "learning_rate": 2.3979564906576942e-05, "loss": 0.0674, "num_input_tokens_seen": 148571040, "step": 68865 }, { "epoch": 11.234910277324634, "grad_norm": 1.5891586542129517, "learning_rate": 2.3976008891778473e-05, "loss": 0.0626, "num_input_tokens_seen": 148581728, "step": 68870 }, { "epoch": 11.235725938009788, "grad_norm": 0.30598604679107666, "learning_rate": 2.397245289773249e-05, "loss": 0.0357, "num_input_tokens_seen": 148593056, "step": 68875 }, { "epoch": 11.236541598694943, "grad_norm": 0.5453816652297974, "learning_rate": 2.3968896924511043e-05, "loss": 0.0708, "num_input_tokens_seen": 148603136, "step": 68880 }, { "epoch": 11.237357259380097, "grad_norm": 1.919753074645996, "learning_rate": 2.3965340972186218e-05, "loss": 0.158, "num_input_tokens_seen": 148613472, "step": 68885 }, { "epoch": 11.238172920065253, "grad_norm": 1.7008111476898193, "learning_rate": 2.3961785040830058e-05, "loss": 0.1124, "num_input_tokens_seen": 148624000, "step": 68890 }, { "epoch": 11.238988580750409, "grad_norm": 0.01691552996635437, "learning_rate": 2.3958229130514655e-05, "loss": 0.0464, "num_input_tokens_seen": 148636000, "step": 68895 }, { "epoch": 11.239804241435563, "grad_norm": 0.042283795773983, "learning_rate": 2.3954673241312052e-05, "loss": 0.0214, "num_input_tokens_seen": 148646720, "step": 68900 }, { "epoch": 11.240619902120718, "grad_norm": 0.042992815375328064, "learning_rate": 2.3951117373294327e-05, "loss": 0.0358, "num_input_tokens_seen": 148656288, "step": 68905 }, { "epoch": 11.241435562805872, "grad_norm": 1.3425467014312744, "learning_rate": 2.394756152653353e-05, "loss": 0.0967, "num_input_tokens_seen": 148666752, "step": 68910 }, { "epoch": 11.242251223491028, "grad_norm": 1.1414947509765625, "learning_rate": 2.3944005701101747e-05, "loss": 0.0429, "num_input_tokens_seen": 148677248, "step": 68915 }, { "epoch": 11.243066884176184, "grad_norm": 1.8934178352355957, "learning_rate": 2.394044989707101e-05, "loss": 0.0801, "num_input_tokens_seen": 148686624, "step": 68920 }, { "epoch": 11.243882544861338, "grad_norm": 0.03255809098482132, "learning_rate": 2.3936894114513415e-05, "loss": 0.0286, "num_input_tokens_seen": 148696864, "step": 68925 }, { "epoch": 11.244698205546493, "grad_norm": 0.5808064937591553, "learning_rate": 2.3933338353501005e-05, "loss": 0.0349, "num_input_tokens_seen": 148707264, "step": 68930 }, { "epoch": 11.245513866231647, "grad_norm": 0.04721597582101822, "learning_rate": 2.3929782614105846e-05, "loss": 0.0186, "num_input_tokens_seen": 148717856, "step": 68935 }, { "epoch": 11.246329526916803, "grad_norm": 1.4094209671020508, "learning_rate": 2.3926226896400002e-05, "loss": 0.162, "num_input_tokens_seen": 148728416, "step": 68940 }, { "epoch": 11.247145187601957, "grad_norm": 0.8127332329750061, "learning_rate": 2.3922671200455525e-05, "loss": 0.1017, "num_input_tokens_seen": 148739904, "step": 68945 }, { "epoch": 11.247960848287113, "grad_norm": 0.06257767230272293, "learning_rate": 2.391911552634449e-05, "loss": 0.1551, "num_input_tokens_seen": 148749984, "step": 68950 }, { "epoch": 11.248776508972268, "grad_norm": 0.1151905357837677, "learning_rate": 2.3915559874138947e-05, "loss": 0.0861, "num_input_tokens_seen": 148761216, "step": 68955 }, { "epoch": 11.249592169657422, "grad_norm": 0.07420717924833298, "learning_rate": 2.391200424391096e-05, "loss": 0.0423, "num_input_tokens_seen": 148772288, "step": 68960 }, { "epoch": 11.250407830342578, "grad_norm": 0.057432979345321655, "learning_rate": 2.3908448635732587e-05, "loss": 0.2519, "num_input_tokens_seen": 148784640, "step": 68965 }, { "epoch": 11.251223491027732, "grad_norm": 0.24143655598163605, "learning_rate": 2.390489304967589e-05, "loss": 0.1075, "num_input_tokens_seen": 148796224, "step": 68970 }, { "epoch": 11.252039151712887, "grad_norm": 1.314875602722168, "learning_rate": 2.3901337485812913e-05, "loss": 0.0989, "num_input_tokens_seen": 148807360, "step": 68975 }, { "epoch": 11.252854812398043, "grad_norm": 1.423966407775879, "learning_rate": 2.3897781944215743e-05, "loss": 0.0863, "num_input_tokens_seen": 148818592, "step": 68980 }, { "epoch": 11.253670473083197, "grad_norm": 0.8150648474693298, "learning_rate": 2.3894226424956406e-05, "loss": 0.1054, "num_input_tokens_seen": 148828448, "step": 68985 }, { "epoch": 11.254486133768353, "grad_norm": 0.22422528266906738, "learning_rate": 2.3890670928106983e-05, "loss": 0.0334, "num_input_tokens_seen": 148839456, "step": 68990 }, { "epoch": 11.255301794453507, "grad_norm": 0.38680392503738403, "learning_rate": 2.3887115453739515e-05, "loss": 0.195, "num_input_tokens_seen": 148849056, "step": 68995 }, { "epoch": 11.256117455138662, "grad_norm": 1.153710126876831, "learning_rate": 2.3883560001926068e-05, "loss": 0.0459, "num_input_tokens_seen": 148858368, "step": 69000 }, { "epoch": 11.256933115823816, "grad_norm": 0.06198173388838768, "learning_rate": 2.388000457273869e-05, "loss": 0.1305, "num_input_tokens_seen": 148869792, "step": 69005 }, { "epoch": 11.257748776508972, "grad_norm": 1.003201961517334, "learning_rate": 2.3876449166249448e-05, "loss": 0.1311, "num_input_tokens_seen": 148880800, "step": 69010 }, { "epoch": 11.258564437194128, "grad_norm": 1.119303822517395, "learning_rate": 2.387289378253038e-05, "loss": 0.0866, "num_input_tokens_seen": 148891392, "step": 69015 }, { "epoch": 11.259380097879282, "grad_norm": 0.2697938084602356, "learning_rate": 2.3869338421653556e-05, "loss": 0.0163, "num_input_tokens_seen": 148903200, "step": 69020 }, { "epoch": 11.260195758564437, "grad_norm": 1.3942266702651978, "learning_rate": 2.3865783083691014e-05, "loss": 0.1806, "num_input_tokens_seen": 148913856, "step": 69025 }, { "epoch": 11.261011419249591, "grad_norm": 0.027574660256505013, "learning_rate": 2.3862227768714828e-05, "loss": 0.0723, "num_input_tokens_seen": 148924192, "step": 69030 }, { "epoch": 11.261827079934747, "grad_norm": 3.923081159591675, "learning_rate": 2.3858672476797027e-05, "loss": 0.1787, "num_input_tokens_seen": 148934240, "step": 69035 }, { "epoch": 11.262642740619903, "grad_norm": 0.18497009575366974, "learning_rate": 2.385511720800969e-05, "loss": 0.1448, "num_input_tokens_seen": 148946304, "step": 69040 }, { "epoch": 11.263458401305057, "grad_norm": 1.0190950632095337, "learning_rate": 2.3851561962424842e-05, "loss": 0.0815, "num_input_tokens_seen": 148955584, "step": 69045 }, { "epoch": 11.264274061990212, "grad_norm": 0.043263401836156845, "learning_rate": 2.3848006740114552e-05, "loss": 0.0442, "num_input_tokens_seen": 148965760, "step": 69050 }, { "epoch": 11.265089722675366, "grad_norm": 0.050646647810935974, "learning_rate": 2.3844451541150872e-05, "loss": 0.0409, "num_input_tokens_seen": 148976544, "step": 69055 }, { "epoch": 11.265905383360522, "grad_norm": 0.048165347427129745, "learning_rate": 2.384089636560584e-05, "loss": 0.0263, "num_input_tokens_seen": 148987872, "step": 69060 }, { "epoch": 11.266721044045678, "grad_norm": 0.060792356729507446, "learning_rate": 2.3837341213551523e-05, "loss": 0.0159, "num_input_tokens_seen": 148999296, "step": 69065 }, { "epoch": 11.267536704730832, "grad_norm": 1.397181510925293, "learning_rate": 2.3833786085059954e-05, "loss": 0.0455, "num_input_tokens_seen": 149010464, "step": 69070 }, { "epoch": 11.268352365415987, "grad_norm": 0.07055113464593887, "learning_rate": 2.3830230980203196e-05, "loss": 0.0524, "num_input_tokens_seen": 149022368, "step": 69075 }, { "epoch": 11.269168026101141, "grad_norm": 1.6395000219345093, "learning_rate": 2.382667589905328e-05, "loss": 0.1199, "num_input_tokens_seen": 149033152, "step": 69080 }, { "epoch": 11.269983686786297, "grad_norm": 0.394561767578125, "learning_rate": 2.3823120841682278e-05, "loss": 0.1115, "num_input_tokens_seen": 149044768, "step": 69085 }, { "epoch": 11.270799347471453, "grad_norm": 0.4510360658168793, "learning_rate": 2.3819565808162215e-05, "loss": 0.1666, "num_input_tokens_seen": 149055168, "step": 69090 }, { "epoch": 11.271615008156607, "grad_norm": 0.3099696934223175, "learning_rate": 2.3816010798565154e-05, "loss": 0.1413, "num_input_tokens_seen": 149066720, "step": 69095 }, { "epoch": 11.272430668841762, "grad_norm": 1.161145806312561, "learning_rate": 2.3812455812963134e-05, "loss": 0.0986, "num_input_tokens_seen": 149076768, "step": 69100 }, { "epoch": 11.273246329526916, "grad_norm": 0.027826650068163872, "learning_rate": 2.3808900851428203e-05, "loss": 0.0243, "num_input_tokens_seen": 149087488, "step": 69105 }, { "epoch": 11.274061990212072, "grad_norm": 1.1205685138702393, "learning_rate": 2.380534591403241e-05, "loss": 0.2205, "num_input_tokens_seen": 149099296, "step": 69110 }, { "epoch": 11.274877650897226, "grad_norm": 1.9474109411239624, "learning_rate": 2.3801791000847794e-05, "loss": 0.1097, "num_input_tokens_seen": 149109888, "step": 69115 }, { "epoch": 11.275693311582382, "grad_norm": 0.34558382630348206, "learning_rate": 2.3798236111946402e-05, "loss": 0.0548, "num_input_tokens_seen": 149120992, "step": 69120 }, { "epoch": 11.276508972267537, "grad_norm": 0.05002514272928238, "learning_rate": 2.3794681247400288e-05, "loss": 0.1353, "num_input_tokens_seen": 149131232, "step": 69125 }, { "epoch": 11.277324632952691, "grad_norm": 0.3151094615459442, "learning_rate": 2.379112640728148e-05, "loss": 0.0284, "num_input_tokens_seen": 149143168, "step": 69130 }, { "epoch": 11.278140293637847, "grad_norm": 0.052609167993068695, "learning_rate": 2.3787571591662037e-05, "loss": 0.0152, "num_input_tokens_seen": 149153024, "step": 69135 }, { "epoch": 11.278955954323001, "grad_norm": 0.34385305643081665, "learning_rate": 2.378401680061398e-05, "loss": 0.1588, "num_input_tokens_seen": 149164000, "step": 69140 }, { "epoch": 11.279771615008157, "grad_norm": 0.41256266832351685, "learning_rate": 2.3780462034209384e-05, "loss": 0.0674, "num_input_tokens_seen": 149175520, "step": 69145 }, { "epoch": 11.280587275693312, "grad_norm": 0.11173729598522186, "learning_rate": 2.3776907292520255e-05, "loss": 0.0371, "num_input_tokens_seen": 149186432, "step": 69150 }, { "epoch": 11.281402936378466, "grad_norm": 2.0779197216033936, "learning_rate": 2.377335257561866e-05, "loss": 0.1626, "num_input_tokens_seen": 149197216, "step": 69155 }, { "epoch": 11.282218597063622, "grad_norm": 0.7495021820068359, "learning_rate": 2.3769797883576628e-05, "loss": 0.1379, "num_input_tokens_seen": 149208000, "step": 69160 }, { "epoch": 11.283034257748776, "grad_norm": 0.08009158819913864, "learning_rate": 2.3766243216466196e-05, "loss": 0.0963, "num_input_tokens_seen": 149218592, "step": 69165 }, { "epoch": 11.283849918433932, "grad_norm": 0.25238630175590515, "learning_rate": 2.3762688574359424e-05, "loss": 0.147, "num_input_tokens_seen": 149229856, "step": 69170 }, { "epoch": 11.284665579119087, "grad_norm": 0.16728021204471588, "learning_rate": 2.3759133957328327e-05, "loss": 0.1127, "num_input_tokens_seen": 149241472, "step": 69175 }, { "epoch": 11.285481239804241, "grad_norm": 0.3889654278755188, "learning_rate": 2.3755579365444965e-05, "loss": 0.1884, "num_input_tokens_seen": 149252256, "step": 69180 }, { "epoch": 11.286296900489397, "grad_norm": 1.3752788305282593, "learning_rate": 2.3752024798781354e-05, "loss": 0.1401, "num_input_tokens_seen": 149262752, "step": 69185 }, { "epoch": 11.28711256117455, "grad_norm": 2.2264750003814697, "learning_rate": 2.3748470257409555e-05, "loss": 0.1124, "num_input_tokens_seen": 149273408, "step": 69190 }, { "epoch": 11.287928221859707, "grad_norm": 1.277882695198059, "learning_rate": 2.3744915741401584e-05, "loss": 0.0791, "num_input_tokens_seen": 149284384, "step": 69195 }, { "epoch": 11.28874388254486, "grad_norm": 0.10700864344835281, "learning_rate": 2.3741361250829498e-05, "loss": 0.0876, "num_input_tokens_seen": 149295808, "step": 69200 }, { "epoch": 11.289559543230016, "grad_norm": 0.16962659358978271, "learning_rate": 2.3737806785765314e-05, "loss": 0.0235, "num_input_tokens_seen": 149306560, "step": 69205 }, { "epoch": 11.290375203915172, "grad_norm": 0.674608051776886, "learning_rate": 2.373425234628108e-05, "loss": 0.1331, "num_input_tokens_seen": 149318272, "step": 69210 }, { "epoch": 11.291190864600326, "grad_norm": 0.9091984629631042, "learning_rate": 2.3730697932448825e-05, "loss": 0.146, "num_input_tokens_seen": 149329344, "step": 69215 }, { "epoch": 11.292006525285482, "grad_norm": 0.4504617750644684, "learning_rate": 2.3727143544340597e-05, "loss": 0.0761, "num_input_tokens_seen": 149340032, "step": 69220 }, { "epoch": 11.292822185970635, "grad_norm": 2.8542919158935547, "learning_rate": 2.3723589182028405e-05, "loss": 0.2687, "num_input_tokens_seen": 149351072, "step": 69225 }, { "epoch": 11.293637846655791, "grad_norm": 0.05936959758400917, "learning_rate": 2.372003484558431e-05, "loss": 0.017, "num_input_tokens_seen": 149362176, "step": 69230 }, { "epoch": 11.294453507340947, "grad_norm": 0.4272592067718506, "learning_rate": 2.3716480535080323e-05, "loss": 0.0822, "num_input_tokens_seen": 149373056, "step": 69235 }, { "epoch": 11.2952691680261, "grad_norm": 0.34982508420944214, "learning_rate": 2.371292625058849e-05, "loss": 0.0344, "num_input_tokens_seen": 149383008, "step": 69240 }, { "epoch": 11.296084828711257, "grad_norm": 0.22152847051620483, "learning_rate": 2.370937199218084e-05, "loss": 0.0459, "num_input_tokens_seen": 149393280, "step": 69245 }, { "epoch": 11.29690048939641, "grad_norm": 0.03678363189101219, "learning_rate": 2.3705817759929404e-05, "loss": 0.0121, "num_input_tokens_seen": 149404352, "step": 69250 }, { "epoch": 11.297716150081566, "grad_norm": 1.8666917085647583, "learning_rate": 2.3702263553906212e-05, "loss": 0.0451, "num_input_tokens_seen": 149415040, "step": 69255 }, { "epoch": 11.298531810766722, "grad_norm": 0.23108798265457153, "learning_rate": 2.3698709374183298e-05, "loss": 0.0435, "num_input_tokens_seen": 149426528, "step": 69260 }, { "epoch": 11.299347471451876, "grad_norm": 0.17356574535369873, "learning_rate": 2.3695155220832684e-05, "loss": 0.0181, "num_input_tokens_seen": 149437056, "step": 69265 }, { "epoch": 11.300163132137031, "grad_norm": 0.5831885933876038, "learning_rate": 2.3691601093926404e-05, "loss": 0.1222, "num_input_tokens_seen": 149448320, "step": 69270 }, { "epoch": 11.300978792822185, "grad_norm": 3.234773635864258, "learning_rate": 2.368804699353649e-05, "loss": 0.0864, "num_input_tokens_seen": 149458496, "step": 69275 }, { "epoch": 11.301794453507341, "grad_norm": 0.907797634601593, "learning_rate": 2.3684492919734972e-05, "loss": 0.1481, "num_input_tokens_seen": 149469792, "step": 69280 }, { "epoch": 11.302610114192497, "grad_norm": 0.2008611410856247, "learning_rate": 2.3680938872593874e-05, "loss": 0.0392, "num_input_tokens_seen": 149481440, "step": 69285 }, { "epoch": 11.30342577487765, "grad_norm": 0.20454810559749603, "learning_rate": 2.3677384852185212e-05, "loss": 0.0426, "num_input_tokens_seen": 149493216, "step": 69290 }, { "epoch": 11.304241435562806, "grad_norm": 0.8814966082572937, "learning_rate": 2.3673830858581038e-05, "loss": 0.2856, "num_input_tokens_seen": 149503680, "step": 69295 }, { "epoch": 11.30505709624796, "grad_norm": 0.028848690912127495, "learning_rate": 2.3670276891853354e-05, "loss": 0.1865, "num_input_tokens_seen": 149514464, "step": 69300 }, { "epoch": 11.305872756933116, "grad_norm": 0.037400588393211365, "learning_rate": 2.3666722952074206e-05, "loss": 0.0683, "num_input_tokens_seen": 149524896, "step": 69305 }, { "epoch": 11.30668841761827, "grad_norm": 0.13020826876163483, "learning_rate": 2.3663169039315596e-05, "loss": 0.0674, "num_input_tokens_seen": 149536832, "step": 69310 }, { "epoch": 11.307504078303426, "grad_norm": 1.352245569229126, "learning_rate": 2.3659615153649575e-05, "loss": 0.1283, "num_input_tokens_seen": 149548352, "step": 69315 }, { "epoch": 11.308319738988581, "grad_norm": 1.6295673847198486, "learning_rate": 2.365606129514814e-05, "loss": 0.1722, "num_input_tokens_seen": 149558848, "step": 69320 }, { "epoch": 11.309135399673735, "grad_norm": 1.9345446825027466, "learning_rate": 2.365250746388334e-05, "loss": 0.1389, "num_input_tokens_seen": 149570080, "step": 69325 }, { "epoch": 11.309951060358891, "grad_norm": 0.9403092861175537, "learning_rate": 2.3648953659927176e-05, "loss": 0.058, "num_input_tokens_seen": 149581920, "step": 69330 }, { "epoch": 11.310766721044045, "grad_norm": 0.100438192486763, "learning_rate": 2.364539988335169e-05, "loss": 0.168, "num_input_tokens_seen": 149593408, "step": 69335 }, { "epoch": 11.3115823817292, "grad_norm": 0.12356019020080566, "learning_rate": 2.364184613422888e-05, "loss": 0.2411, "num_input_tokens_seen": 149604640, "step": 69340 }, { "epoch": 11.312398042414356, "grad_norm": 0.09495795518159866, "learning_rate": 2.3638292412630798e-05, "loss": 0.0181, "num_input_tokens_seen": 149614880, "step": 69345 }, { "epoch": 11.31321370309951, "grad_norm": 0.7286322712898254, "learning_rate": 2.363473871862943e-05, "loss": 0.1304, "num_input_tokens_seen": 149625824, "step": 69350 }, { "epoch": 11.314029363784666, "grad_norm": 0.026092322543263435, "learning_rate": 2.363118505229683e-05, "loss": 0.1006, "num_input_tokens_seen": 149637376, "step": 69355 }, { "epoch": 11.31484502446982, "grad_norm": 0.5125482678413391, "learning_rate": 2.3627631413704988e-05, "loss": 0.07, "num_input_tokens_seen": 149648480, "step": 69360 }, { "epoch": 11.315660685154976, "grad_norm": 0.4503583610057831, "learning_rate": 2.362407780292595e-05, "loss": 0.0313, "num_input_tokens_seen": 149659584, "step": 69365 }, { "epoch": 11.31647634584013, "grad_norm": 0.9214933514595032, "learning_rate": 2.3620524220031707e-05, "loss": 0.0775, "num_input_tokens_seen": 149670112, "step": 69370 }, { "epoch": 11.317292006525285, "grad_norm": 0.40380576252937317, "learning_rate": 2.361697066509429e-05, "loss": 0.2446, "num_input_tokens_seen": 149680608, "step": 69375 }, { "epoch": 11.318107667210441, "grad_norm": 0.2887084484100342, "learning_rate": 2.3613417138185727e-05, "loss": 0.084, "num_input_tokens_seen": 149692128, "step": 69380 }, { "epoch": 11.318923327895595, "grad_norm": 0.1420767605304718, "learning_rate": 2.3609863639378017e-05, "loss": 0.148, "num_input_tokens_seen": 149702816, "step": 69385 }, { "epoch": 11.31973898858075, "grad_norm": 0.0276385135948658, "learning_rate": 2.3606310168743195e-05, "loss": 0.0755, "num_input_tokens_seen": 149714016, "step": 69390 }, { "epoch": 11.320554649265905, "grad_norm": 0.060134612023830414, "learning_rate": 2.3602756726353253e-05, "loss": 0.0534, "num_input_tokens_seen": 149725376, "step": 69395 }, { "epoch": 11.32137030995106, "grad_norm": 1.3029789924621582, "learning_rate": 2.359920331228023e-05, "loss": 0.0984, "num_input_tokens_seen": 149735584, "step": 69400 }, { "epoch": 11.322185970636216, "grad_norm": 0.02803897298872471, "learning_rate": 2.3595649926596116e-05, "loss": 0.0845, "num_input_tokens_seen": 149746432, "step": 69405 }, { "epoch": 11.32300163132137, "grad_norm": 0.07283228635787964, "learning_rate": 2.3592096569372945e-05, "loss": 0.0449, "num_input_tokens_seen": 149756800, "step": 69410 }, { "epoch": 11.323817292006526, "grad_norm": 2.165677309036255, "learning_rate": 2.3588543240682722e-05, "loss": 0.1868, "num_input_tokens_seen": 149767040, "step": 69415 }, { "epoch": 11.32463295269168, "grad_norm": 0.6942663192749023, "learning_rate": 2.3584989940597462e-05, "loss": 0.1356, "num_input_tokens_seen": 149777312, "step": 69420 }, { "epoch": 11.325448613376835, "grad_norm": 0.8561126589775085, "learning_rate": 2.3581436669189177e-05, "loss": 0.207, "num_input_tokens_seen": 149788000, "step": 69425 }, { "epoch": 11.326264274061991, "grad_norm": 0.38390418887138367, "learning_rate": 2.3577883426529875e-05, "loss": 0.0983, "num_input_tokens_seen": 149798880, "step": 69430 }, { "epoch": 11.327079934747145, "grad_norm": 1.867508053779602, "learning_rate": 2.3574330212691563e-05, "loss": 0.2107, "num_input_tokens_seen": 149808864, "step": 69435 }, { "epoch": 11.3278955954323, "grad_norm": 1.4232059717178345, "learning_rate": 2.3570777027746274e-05, "loss": 0.2006, "num_input_tokens_seen": 149820192, "step": 69440 }, { "epoch": 11.328711256117455, "grad_norm": 1.3834160566329956, "learning_rate": 2.3567223871765987e-05, "loss": 0.1756, "num_input_tokens_seen": 149831296, "step": 69445 }, { "epoch": 11.32952691680261, "grad_norm": 0.6258243322372437, "learning_rate": 2.3563670744822736e-05, "loss": 0.0457, "num_input_tokens_seen": 149842688, "step": 69450 }, { "epoch": 11.330342577487766, "grad_norm": 0.08054181188344955, "learning_rate": 2.356011764698851e-05, "loss": 0.2172, "num_input_tokens_seen": 149853344, "step": 69455 }, { "epoch": 11.33115823817292, "grad_norm": 0.05181559547781944, "learning_rate": 2.355656457833534e-05, "loss": 0.0635, "num_input_tokens_seen": 149865568, "step": 69460 }, { "epoch": 11.331973898858076, "grad_norm": 3.984116315841675, "learning_rate": 2.3553011538935205e-05, "loss": 0.2907, "num_input_tokens_seen": 149877504, "step": 69465 }, { "epoch": 11.33278955954323, "grad_norm": 0.4924349784851074, "learning_rate": 2.354945852886014e-05, "loss": 0.0277, "num_input_tokens_seen": 149888256, "step": 69470 }, { "epoch": 11.333605220228385, "grad_norm": 0.06797084212303162, "learning_rate": 2.3545905548182123e-05, "loss": 0.1815, "num_input_tokens_seen": 149899424, "step": 69475 }, { "epoch": 11.33442088091354, "grad_norm": 1.336840271949768, "learning_rate": 2.354235259697318e-05, "loss": 0.1116, "num_input_tokens_seen": 149911424, "step": 69480 }, { "epoch": 11.335236541598695, "grad_norm": 0.5907074809074402, "learning_rate": 2.3538799675305318e-05, "loss": 0.0538, "num_input_tokens_seen": 149920544, "step": 69485 }, { "epoch": 11.33605220228385, "grad_norm": 2.188410520553589, "learning_rate": 2.3535246783250526e-05, "loss": 0.2498, "num_input_tokens_seen": 149932032, "step": 69490 }, { "epoch": 11.336867862969005, "grad_norm": 0.11007368564605713, "learning_rate": 2.3531693920880825e-05, "loss": 0.0333, "num_input_tokens_seen": 149942624, "step": 69495 }, { "epoch": 11.33768352365416, "grad_norm": 0.2297830432653427, "learning_rate": 2.35281410882682e-05, "loss": 0.0326, "num_input_tokens_seen": 149954336, "step": 69500 }, { "epoch": 11.338499184339314, "grad_norm": 0.030799513682723045, "learning_rate": 2.352458828548467e-05, "loss": 0.099, "num_input_tokens_seen": 149964544, "step": 69505 }, { "epoch": 11.33931484502447, "grad_norm": 0.04186058044433594, "learning_rate": 2.3521035512602225e-05, "loss": 0.078, "num_input_tokens_seen": 149975744, "step": 69510 }, { "epoch": 11.340130505709626, "grad_norm": 0.7262038588523865, "learning_rate": 2.351748276969288e-05, "loss": 0.3501, "num_input_tokens_seen": 149986944, "step": 69515 }, { "epoch": 11.34094616639478, "grad_norm": 1.6647173166275024, "learning_rate": 2.3513930056828617e-05, "loss": 0.1598, "num_input_tokens_seen": 149997664, "step": 69520 }, { "epoch": 11.341761827079935, "grad_norm": 0.043123576790094376, "learning_rate": 2.3510377374081457e-05, "loss": 0.1064, "num_input_tokens_seen": 150008800, "step": 69525 }, { "epoch": 11.34257748776509, "grad_norm": 1.2557727098464966, "learning_rate": 2.3506824721523375e-05, "loss": 0.2188, "num_input_tokens_seen": 150019872, "step": 69530 }, { "epoch": 11.343393148450245, "grad_norm": 2.3529701232910156, "learning_rate": 2.35032720992264e-05, "loss": 0.0699, "num_input_tokens_seen": 150031456, "step": 69535 }, { "epoch": 11.3442088091354, "grad_norm": 0.13214613497257233, "learning_rate": 2.3499719507262503e-05, "loss": 0.1205, "num_input_tokens_seen": 150041600, "step": 69540 }, { "epoch": 11.345024469820554, "grad_norm": 0.19943539798259735, "learning_rate": 2.3496166945703697e-05, "loss": 0.0187, "num_input_tokens_seen": 150052160, "step": 69545 }, { "epoch": 11.34584013050571, "grad_norm": 0.07018015533685684, "learning_rate": 2.3492614414621978e-05, "loss": 0.0867, "num_input_tokens_seen": 150062880, "step": 69550 }, { "epoch": 11.346655791190864, "grad_norm": 0.7571620345115662, "learning_rate": 2.348906191408934e-05, "loss": 0.1562, "num_input_tokens_seen": 150072480, "step": 69555 }, { "epoch": 11.34747145187602, "grad_norm": 0.20045018196105957, "learning_rate": 2.3485509444177778e-05, "loss": 0.1134, "num_input_tokens_seen": 150082656, "step": 69560 }, { "epoch": 11.348287112561174, "grad_norm": 0.13722562789916992, "learning_rate": 2.348195700495929e-05, "loss": 0.0461, "num_input_tokens_seen": 150093600, "step": 69565 }, { "epoch": 11.34910277324633, "grad_norm": 0.2890269160270691, "learning_rate": 2.3478404596505866e-05, "loss": 0.0481, "num_input_tokens_seen": 150105312, "step": 69570 }, { "epoch": 11.349918433931485, "grad_norm": 0.20226150751113892, "learning_rate": 2.3474852218889505e-05, "loss": 0.0706, "num_input_tokens_seen": 150116416, "step": 69575 }, { "epoch": 11.350734094616639, "grad_norm": 0.0949653759598732, "learning_rate": 2.347129987218219e-05, "loss": 0.0686, "num_input_tokens_seen": 150127360, "step": 69580 }, { "epoch": 11.351549755301795, "grad_norm": 1.6111723184585571, "learning_rate": 2.346774755645593e-05, "loss": 0.1098, "num_input_tokens_seen": 150138688, "step": 69585 }, { "epoch": 11.352365415986949, "grad_norm": 0.33903905749320984, "learning_rate": 2.346419527178271e-05, "loss": 0.0194, "num_input_tokens_seen": 150149088, "step": 69590 }, { "epoch": 11.353181076672104, "grad_norm": 0.034616854041814804, "learning_rate": 2.346064301823452e-05, "loss": 0.3264, "num_input_tokens_seen": 150159488, "step": 69595 }, { "epoch": 11.35399673735726, "grad_norm": 2.3649260997772217, "learning_rate": 2.3457090795883354e-05, "loss": 0.1293, "num_input_tokens_seen": 150170560, "step": 69600 }, { "epoch": 11.354812398042414, "grad_norm": 1.382372260093689, "learning_rate": 2.3453538604801195e-05, "loss": 0.2434, "num_input_tokens_seen": 150181760, "step": 69605 }, { "epoch": 11.35562805872757, "grad_norm": 0.4471416175365448, "learning_rate": 2.3449986445060044e-05, "loss": 0.1331, "num_input_tokens_seen": 150192928, "step": 69610 }, { "epoch": 11.356443719412724, "grad_norm": 0.10683291405439377, "learning_rate": 2.344643431673188e-05, "loss": 0.1463, "num_input_tokens_seen": 150203040, "step": 69615 }, { "epoch": 11.35725938009788, "grad_norm": 1.3814620971679688, "learning_rate": 2.3442882219888702e-05, "loss": 0.1962, "num_input_tokens_seen": 150214464, "step": 69620 }, { "epoch": 11.358075040783035, "grad_norm": 0.09540391713380814, "learning_rate": 2.3439330154602483e-05, "loss": 0.1752, "num_input_tokens_seen": 150224864, "step": 69625 }, { "epoch": 11.358890701468189, "grad_norm": 1.0263605117797852, "learning_rate": 2.343577812094523e-05, "loss": 0.0961, "num_input_tokens_seen": 150235072, "step": 69630 }, { "epoch": 11.359706362153345, "grad_norm": 0.6652857065200806, "learning_rate": 2.34322261189889e-05, "loss": 0.0309, "num_input_tokens_seen": 150246816, "step": 69635 }, { "epoch": 11.360522022838499, "grad_norm": 0.0322938896715641, "learning_rate": 2.3428674148805512e-05, "loss": 0.1661, "num_input_tokens_seen": 150258624, "step": 69640 }, { "epoch": 11.361337683523654, "grad_norm": 0.2331029772758484, "learning_rate": 2.3425122210467027e-05, "loss": 0.1146, "num_input_tokens_seen": 150268896, "step": 69645 }, { "epoch": 11.362153344208808, "grad_norm": 1.1440753936767578, "learning_rate": 2.342157030404545e-05, "loss": 0.0697, "num_input_tokens_seen": 150279552, "step": 69650 }, { "epoch": 11.362969004893964, "grad_norm": 0.6238888502120972, "learning_rate": 2.3418018429612737e-05, "loss": 0.0481, "num_input_tokens_seen": 150290304, "step": 69655 }, { "epoch": 11.36378466557912, "grad_norm": 0.1444743424654007, "learning_rate": 2.34144665872409e-05, "loss": 0.1002, "num_input_tokens_seen": 150301632, "step": 69660 }, { "epoch": 11.364600326264274, "grad_norm": 0.2884942293167114, "learning_rate": 2.34109147770019e-05, "loss": 0.0211, "num_input_tokens_seen": 150312096, "step": 69665 }, { "epoch": 11.36541598694943, "grad_norm": 0.4229019284248352, "learning_rate": 2.3407362998967736e-05, "loss": 0.3639, "num_input_tokens_seen": 150323840, "step": 69670 }, { "epoch": 11.366231647634583, "grad_norm": 0.0725867971777916, "learning_rate": 2.3403811253210374e-05, "loss": 0.1295, "num_input_tokens_seen": 150335104, "step": 69675 }, { "epoch": 11.367047308319739, "grad_norm": 0.7437372803688049, "learning_rate": 2.3400259539801807e-05, "loss": 0.1488, "num_input_tokens_seen": 150346016, "step": 69680 }, { "epoch": 11.367862969004895, "grad_norm": 0.20621877908706665, "learning_rate": 2.3396707858814005e-05, "loss": 0.1045, "num_input_tokens_seen": 150358144, "step": 69685 }, { "epoch": 11.368678629690049, "grad_norm": 0.9137030839920044, "learning_rate": 2.339315621031895e-05, "loss": 0.1203, "num_input_tokens_seen": 150368672, "step": 69690 }, { "epoch": 11.369494290375204, "grad_norm": 0.0170965027064085, "learning_rate": 2.338960459438863e-05, "loss": 0.0987, "num_input_tokens_seen": 150379520, "step": 69695 }, { "epoch": 11.370309951060358, "grad_norm": 1.518362045288086, "learning_rate": 2.338605301109501e-05, "loss": 0.1725, "num_input_tokens_seen": 150390400, "step": 69700 }, { "epoch": 11.371125611745514, "grad_norm": 1.3970515727996826, "learning_rate": 2.3382501460510083e-05, "loss": 0.2018, "num_input_tokens_seen": 150399808, "step": 69705 }, { "epoch": 11.37194127243067, "grad_norm": 0.674321711063385, "learning_rate": 2.3378949942705805e-05, "loss": 0.0294, "num_input_tokens_seen": 150409728, "step": 69710 }, { "epoch": 11.372756933115824, "grad_norm": 0.5347917079925537, "learning_rate": 2.3375398457754166e-05, "loss": 0.0232, "num_input_tokens_seen": 150418624, "step": 69715 }, { "epoch": 11.37357259380098, "grad_norm": 0.9435471296310425, "learning_rate": 2.337184700572714e-05, "loss": 0.1472, "num_input_tokens_seen": 150429056, "step": 69720 }, { "epoch": 11.374388254486133, "grad_norm": 2.1390907764434814, "learning_rate": 2.33682955866967e-05, "loss": 0.141, "num_input_tokens_seen": 150440192, "step": 69725 }, { "epoch": 11.375203915171289, "grad_norm": 0.41122233867645264, "learning_rate": 2.336474420073482e-05, "loss": 0.0331, "num_input_tokens_seen": 150450944, "step": 69730 }, { "epoch": 11.376019575856443, "grad_norm": 0.239602193236351, "learning_rate": 2.3361192847913473e-05, "loss": 0.1455, "num_input_tokens_seen": 150461632, "step": 69735 }, { "epoch": 11.376835236541599, "grad_norm": 0.055647771805524826, "learning_rate": 2.3357641528304627e-05, "loss": 0.1917, "num_input_tokens_seen": 150473056, "step": 69740 }, { "epoch": 11.377650897226754, "grad_norm": 0.8058245182037354, "learning_rate": 2.335409024198027e-05, "loss": 0.0459, "num_input_tokens_seen": 150484512, "step": 69745 }, { "epoch": 11.378466557911908, "grad_norm": 0.5431366562843323, "learning_rate": 2.335053898901235e-05, "loss": 0.0341, "num_input_tokens_seen": 150493760, "step": 69750 }, { "epoch": 11.379282218597064, "grad_norm": 0.3041154742240906, "learning_rate": 2.334698776947286e-05, "loss": 0.0145, "num_input_tokens_seen": 150503424, "step": 69755 }, { "epoch": 11.380097879282218, "grad_norm": 2.166212320327759, "learning_rate": 2.3343436583433752e-05, "loss": 0.2252, "num_input_tokens_seen": 150513984, "step": 69760 }, { "epoch": 11.380913539967374, "grad_norm": 0.1271744817495346, "learning_rate": 2.333988543096701e-05, "loss": 0.0408, "num_input_tokens_seen": 150525024, "step": 69765 }, { "epoch": 11.38172920065253, "grad_norm": 0.20101580023765564, "learning_rate": 2.333633431214459e-05, "loss": 0.0939, "num_input_tokens_seen": 150534720, "step": 69770 }, { "epoch": 11.382544861337683, "grad_norm": 0.11296307295560837, "learning_rate": 2.3332783227038475e-05, "loss": 0.1324, "num_input_tokens_seen": 150545856, "step": 69775 }, { "epoch": 11.383360522022839, "grad_norm": 2.0522634983062744, "learning_rate": 2.3329232175720608e-05, "loss": 0.0397, "num_input_tokens_seen": 150556672, "step": 69780 }, { "epoch": 11.384176182707993, "grad_norm": 0.10851255804300308, "learning_rate": 2.3325681158262984e-05, "loss": 0.1642, "num_input_tokens_seen": 150568608, "step": 69785 }, { "epoch": 11.384991843393149, "grad_norm": 2.2848379611968994, "learning_rate": 2.3322130174737544e-05, "loss": 0.1899, "num_input_tokens_seen": 150578432, "step": 69790 }, { "epoch": 11.385807504078304, "grad_norm": 0.6251578330993652, "learning_rate": 2.331857922521628e-05, "loss": 0.0548, "num_input_tokens_seen": 150589504, "step": 69795 }, { "epoch": 11.386623164763458, "grad_norm": 0.10009916126728058, "learning_rate": 2.3315028309771126e-05, "loss": 0.0639, "num_input_tokens_seen": 150599616, "step": 69800 }, { "epoch": 11.387438825448614, "grad_norm": 1.7742714881896973, "learning_rate": 2.331147742847406e-05, "loss": 0.193, "num_input_tokens_seen": 150610816, "step": 69805 }, { "epoch": 11.388254486133768, "grad_norm": 0.7763766646385193, "learning_rate": 2.3307926581397058e-05, "loss": 0.0973, "num_input_tokens_seen": 150622592, "step": 69810 }, { "epoch": 11.389070146818923, "grad_norm": 0.08181990683078766, "learning_rate": 2.3304375768612057e-05, "loss": 0.1995, "num_input_tokens_seen": 150633568, "step": 69815 }, { "epoch": 11.38988580750408, "grad_norm": 0.9330437183380127, "learning_rate": 2.3300824990191047e-05, "loss": 0.0615, "num_input_tokens_seen": 150644384, "step": 69820 }, { "epoch": 11.390701468189233, "grad_norm": 0.3157954216003418, "learning_rate": 2.329727424620596e-05, "loss": 0.184, "num_input_tokens_seen": 150655232, "step": 69825 }, { "epoch": 11.391517128874389, "grad_norm": 1.521544098854065, "learning_rate": 2.3293723536728783e-05, "loss": 0.0529, "num_input_tokens_seen": 150665760, "step": 69830 }, { "epoch": 11.392332789559543, "grad_norm": 1.5978487730026245, "learning_rate": 2.3290172861831453e-05, "loss": 0.2749, "num_input_tokens_seen": 150675648, "step": 69835 }, { "epoch": 11.393148450244698, "grad_norm": 0.04790804907679558, "learning_rate": 2.328662222158595e-05, "loss": 0.1256, "num_input_tokens_seen": 150686784, "step": 69840 }, { "epoch": 11.393964110929852, "grad_norm": 0.10720008611679077, "learning_rate": 2.3283071616064208e-05, "loss": 0.0264, "num_input_tokens_seen": 150697952, "step": 69845 }, { "epoch": 11.394779771615008, "grad_norm": 2.1078550815582275, "learning_rate": 2.3279521045338208e-05, "loss": 0.1041, "num_input_tokens_seen": 150709696, "step": 69850 }, { "epoch": 11.395595432300164, "grad_norm": 0.021770145744085312, "learning_rate": 2.3275970509479893e-05, "loss": 0.1449, "num_input_tokens_seen": 150719648, "step": 69855 }, { "epoch": 11.396411092985318, "grad_norm": 0.139311745762825, "learning_rate": 2.327242000856122e-05, "loss": 0.0368, "num_input_tokens_seen": 150730656, "step": 69860 }, { "epoch": 11.397226753670473, "grad_norm": 2.148458242416382, "learning_rate": 2.326886954265415e-05, "loss": 0.1391, "num_input_tokens_seen": 150740896, "step": 69865 }, { "epoch": 11.398042414355627, "grad_norm": 0.9132635593414307, "learning_rate": 2.3265319111830637e-05, "loss": 0.0465, "num_input_tokens_seen": 150751776, "step": 69870 }, { "epoch": 11.398858075040783, "grad_norm": 0.07682815939188004, "learning_rate": 2.326176871616263e-05, "loss": 0.0497, "num_input_tokens_seen": 150762240, "step": 69875 }, { "epoch": 11.399673735725939, "grad_norm": 1.745833158493042, "learning_rate": 2.3258218355722084e-05, "loss": 0.3252, "num_input_tokens_seen": 150773664, "step": 69880 }, { "epoch": 11.400489396411093, "grad_norm": 1.0681668519973755, "learning_rate": 2.325466803058095e-05, "loss": 0.0444, "num_input_tokens_seen": 150784608, "step": 69885 }, { "epoch": 11.401305057096248, "grad_norm": 2.5629677772521973, "learning_rate": 2.3251117740811192e-05, "loss": 0.144, "num_input_tokens_seen": 150795584, "step": 69890 }, { "epoch": 11.402120717781402, "grad_norm": 0.8290407657623291, "learning_rate": 2.324756748648474e-05, "loss": 0.2706, "num_input_tokens_seen": 150806080, "step": 69895 }, { "epoch": 11.402936378466558, "grad_norm": 1.721725583076477, "learning_rate": 2.3244017267673568e-05, "loss": 0.0788, "num_input_tokens_seen": 150818016, "step": 69900 }, { "epoch": 11.403752039151712, "grad_norm": 1.8937422037124634, "learning_rate": 2.3240467084449604e-05, "loss": 0.1111, "num_input_tokens_seen": 150828352, "step": 69905 }, { "epoch": 11.404567699836868, "grad_norm": 1.7481850385665894, "learning_rate": 2.3236916936884805e-05, "loss": 0.2934, "num_input_tokens_seen": 150838816, "step": 69910 }, { "epoch": 11.405383360522023, "grad_norm": 0.03705111891031265, "learning_rate": 2.323336682505113e-05, "loss": 0.0334, "num_input_tokens_seen": 150849632, "step": 69915 }, { "epoch": 11.406199021207177, "grad_norm": 1.4968377351760864, "learning_rate": 2.322981674902051e-05, "loss": 0.1389, "num_input_tokens_seen": 150861408, "step": 69920 }, { "epoch": 11.407014681892333, "grad_norm": 0.09821999073028564, "learning_rate": 2.322626670886491e-05, "loss": 0.0157, "num_input_tokens_seen": 150871456, "step": 69925 }, { "epoch": 11.407830342577487, "grad_norm": 0.07639025896787643, "learning_rate": 2.3222716704656254e-05, "loss": 0.0918, "num_input_tokens_seen": 150882112, "step": 69930 }, { "epoch": 11.408646003262643, "grad_norm": 0.06489415466785431, "learning_rate": 2.3219166736466508e-05, "loss": 0.1207, "num_input_tokens_seen": 150892576, "step": 69935 }, { "epoch": 11.409461663947798, "grad_norm": 0.035732682794332504, "learning_rate": 2.3215616804367595e-05, "loss": 0.0199, "num_input_tokens_seen": 150901984, "step": 69940 }, { "epoch": 11.410277324632952, "grad_norm": 0.10621672868728638, "learning_rate": 2.3212066908431484e-05, "loss": 0.0319, "num_input_tokens_seen": 150912736, "step": 69945 }, { "epoch": 11.411092985318108, "grad_norm": 0.04823204502463341, "learning_rate": 2.3208517048730096e-05, "loss": 0.0082, "num_input_tokens_seen": 150922848, "step": 69950 }, { "epoch": 11.411908646003262, "grad_norm": 0.2883662283420563, "learning_rate": 2.320496722533539e-05, "loss": 0.1072, "num_input_tokens_seen": 150933824, "step": 69955 }, { "epoch": 11.412724306688418, "grad_norm": 0.07026911526918411, "learning_rate": 2.3201417438319294e-05, "loss": 0.0409, "num_input_tokens_seen": 150945344, "step": 69960 }, { "epoch": 11.413539967373573, "grad_norm": 0.12568555772304535, "learning_rate": 2.319786768775376e-05, "loss": 0.0172, "num_input_tokens_seen": 150956352, "step": 69965 }, { "epoch": 11.414355628058727, "grad_norm": 0.15695048868656158, "learning_rate": 2.3194317973710718e-05, "loss": 0.0329, "num_input_tokens_seen": 150968160, "step": 69970 }, { "epoch": 11.415171288743883, "grad_norm": 1.2898000478744507, "learning_rate": 2.319076829626212e-05, "loss": 0.0535, "num_input_tokens_seen": 150979008, "step": 69975 }, { "epoch": 11.415986949429037, "grad_norm": 0.21469083428382874, "learning_rate": 2.3187218655479892e-05, "loss": 0.1256, "num_input_tokens_seen": 150989536, "step": 69980 }, { "epoch": 11.416802610114193, "grad_norm": 0.5050853490829468, "learning_rate": 2.3183669051435986e-05, "loss": 0.0437, "num_input_tokens_seen": 151000544, "step": 69985 }, { "epoch": 11.417618270799348, "grad_norm": 0.3174484670162201, "learning_rate": 2.3180119484202316e-05, "loss": 0.121, "num_input_tokens_seen": 151010944, "step": 69990 }, { "epoch": 11.418433931484502, "grad_norm": 1.2869629859924316, "learning_rate": 2.317656995385085e-05, "loss": 0.0661, "num_input_tokens_seen": 151020448, "step": 69995 }, { "epoch": 11.419249592169658, "grad_norm": 1.6159546375274658, "learning_rate": 2.317302046045349e-05, "loss": 0.0604, "num_input_tokens_seen": 151031904, "step": 70000 }, { "epoch": 11.420065252854812, "grad_norm": 2.1094183921813965, "learning_rate": 2.31694710040822e-05, "loss": 0.1808, "num_input_tokens_seen": 151043904, "step": 70005 }, { "epoch": 11.420880913539968, "grad_norm": 0.035336535423994064, "learning_rate": 2.31659215848089e-05, "loss": 0.0162, "num_input_tokens_seen": 151053888, "step": 70010 }, { "epoch": 11.421696574225122, "grad_norm": 0.2607364058494568, "learning_rate": 2.316237220270552e-05, "loss": 0.149, "num_input_tokens_seen": 151064576, "step": 70015 }, { "epoch": 11.422512234910277, "grad_norm": 0.9299678206443787, "learning_rate": 2.3158822857844008e-05, "loss": 0.1245, "num_input_tokens_seen": 151075520, "step": 70020 }, { "epoch": 11.423327895595433, "grad_norm": 1.2080152034759521, "learning_rate": 2.3155273550296277e-05, "loss": 0.0496, "num_input_tokens_seen": 151085632, "step": 70025 }, { "epoch": 11.424143556280587, "grad_norm": 1.6738945245742798, "learning_rate": 2.3151724280134273e-05, "loss": 0.0924, "num_input_tokens_seen": 151096032, "step": 70030 }, { "epoch": 11.424959216965743, "grad_norm": 0.14459101855754852, "learning_rate": 2.314817504742992e-05, "loss": 0.1684, "num_input_tokens_seen": 151106752, "step": 70035 }, { "epoch": 11.425774877650896, "grad_norm": 0.15011867880821228, "learning_rate": 2.314462585225515e-05, "loss": 0.1616, "num_input_tokens_seen": 151117408, "step": 70040 }, { "epoch": 11.426590538336052, "grad_norm": 0.057657063007354736, "learning_rate": 2.314107669468189e-05, "loss": 0.0501, "num_input_tokens_seen": 151128704, "step": 70045 }, { "epoch": 11.427406199021208, "grad_norm": 0.18123479187488556, "learning_rate": 2.313752757478207e-05, "loss": 0.0635, "num_input_tokens_seen": 151139392, "step": 70050 }, { "epoch": 11.428221859706362, "grad_norm": 0.07411346584558487, "learning_rate": 2.3133978492627612e-05, "loss": 0.1105, "num_input_tokens_seen": 151150400, "step": 70055 }, { "epoch": 11.429037520391518, "grad_norm": 0.2715538740158081, "learning_rate": 2.3130429448290457e-05, "loss": 0.1445, "num_input_tokens_seen": 151161504, "step": 70060 }, { "epoch": 11.429853181076671, "grad_norm": 0.13483816385269165, "learning_rate": 2.312688044184251e-05, "loss": 0.0444, "num_input_tokens_seen": 151171872, "step": 70065 }, { "epoch": 11.430668841761827, "grad_norm": 2.5575649738311768, "learning_rate": 2.3123331473355717e-05, "loss": 0.0754, "num_input_tokens_seen": 151182880, "step": 70070 }, { "epoch": 11.431484502446983, "grad_norm": 0.04021628201007843, "learning_rate": 2.3119782542901982e-05, "loss": 0.0474, "num_input_tokens_seen": 151193696, "step": 70075 }, { "epoch": 11.432300163132137, "grad_norm": 0.47470802068710327, "learning_rate": 2.311623365055325e-05, "loss": 0.0878, "num_input_tokens_seen": 151203872, "step": 70080 }, { "epoch": 11.433115823817293, "grad_norm": 0.07600348442792892, "learning_rate": 2.3112684796381423e-05, "loss": 0.0091, "num_input_tokens_seen": 151214720, "step": 70085 }, { "epoch": 11.433931484502446, "grad_norm": 1.9088859558105469, "learning_rate": 2.3109135980458442e-05, "loss": 0.041, "num_input_tokens_seen": 151225664, "step": 70090 }, { "epoch": 11.434747145187602, "grad_norm": 0.19367530941963196, "learning_rate": 2.310558720285621e-05, "loss": 0.0833, "num_input_tokens_seen": 151235744, "step": 70095 }, { "epoch": 11.435562805872756, "grad_norm": 0.09110569208860397, "learning_rate": 2.3102038463646666e-05, "loss": 0.1426, "num_input_tokens_seen": 151244544, "step": 70100 }, { "epoch": 11.436378466557912, "grad_norm": 0.1079137921333313, "learning_rate": 2.309848976290171e-05, "loss": 0.1155, "num_input_tokens_seen": 151256000, "step": 70105 }, { "epoch": 11.437194127243067, "grad_norm": 0.05069030821323395, "learning_rate": 2.309494110069328e-05, "loss": 0.0623, "num_input_tokens_seen": 151267616, "step": 70110 }, { "epoch": 11.438009787928221, "grad_norm": 0.07512661814689636, "learning_rate": 2.3091392477093278e-05, "loss": 0.0074, "num_input_tokens_seen": 151277664, "step": 70115 }, { "epoch": 11.438825448613377, "grad_norm": 0.050636760890483856, "learning_rate": 2.3087843892173627e-05, "loss": 0.1008, "num_input_tokens_seen": 151287392, "step": 70120 }, { "epoch": 11.439641109298531, "grad_norm": 0.512220025062561, "learning_rate": 2.3084295346006255e-05, "loss": 0.112, "num_input_tokens_seen": 151297696, "step": 70125 }, { "epoch": 11.440456769983687, "grad_norm": 1.976865291595459, "learning_rate": 2.308074683866306e-05, "loss": 0.1205, "num_input_tokens_seen": 151308992, "step": 70130 }, { "epoch": 11.441272430668842, "grad_norm": 0.44311854243278503, "learning_rate": 2.3077198370215974e-05, "loss": 0.0317, "num_input_tokens_seen": 151320608, "step": 70135 }, { "epoch": 11.442088091353996, "grad_norm": 0.7569528222084045, "learning_rate": 2.307364994073689e-05, "loss": 0.0759, "num_input_tokens_seen": 151330336, "step": 70140 }, { "epoch": 11.442903752039152, "grad_norm": 0.6478731036186218, "learning_rate": 2.3070101550297747e-05, "loss": 0.0715, "num_input_tokens_seen": 151342848, "step": 70145 }, { "epoch": 11.443719412724306, "grad_norm": 0.08592263609170914, "learning_rate": 2.306655319897043e-05, "loss": 0.0312, "num_input_tokens_seen": 151354272, "step": 70150 }, { "epoch": 11.444535073409462, "grad_norm": 1.279575228691101, "learning_rate": 2.306300488682688e-05, "loss": 0.0343, "num_input_tokens_seen": 151366080, "step": 70155 }, { "epoch": 11.445350734094617, "grad_norm": 0.3050525486469269, "learning_rate": 2.3059456613938982e-05, "loss": 0.0107, "num_input_tokens_seen": 151376032, "step": 70160 }, { "epoch": 11.446166394779771, "grad_norm": 1.4219675064086914, "learning_rate": 2.3055908380378667e-05, "loss": 0.2034, "num_input_tokens_seen": 151387648, "step": 70165 }, { "epoch": 11.446982055464927, "grad_norm": 0.1973392814397812, "learning_rate": 2.3052360186217825e-05, "loss": 0.0883, "num_input_tokens_seen": 151397568, "step": 70170 }, { "epoch": 11.447797716150081, "grad_norm": 1.4468684196472168, "learning_rate": 2.304881203152838e-05, "loss": 0.0893, "num_input_tokens_seen": 151407968, "step": 70175 }, { "epoch": 11.448613376835237, "grad_norm": 0.1334230899810791, "learning_rate": 2.3045263916382235e-05, "loss": 0.052, "num_input_tokens_seen": 151418784, "step": 70180 }, { "epoch": 11.449429037520392, "grad_norm": 2.8134357929229736, "learning_rate": 2.3041715840851295e-05, "loss": 0.0558, "num_input_tokens_seen": 151428704, "step": 70185 }, { "epoch": 11.450244698205546, "grad_norm": 0.13587646186351776, "learning_rate": 2.3038167805007464e-05, "loss": 0.0394, "num_input_tokens_seen": 151439168, "step": 70190 }, { "epoch": 11.451060358890702, "grad_norm": 0.07914586365222931, "learning_rate": 2.3034619808922658e-05, "loss": 0.0327, "num_input_tokens_seen": 151450720, "step": 70195 }, { "epoch": 11.451876019575856, "grad_norm": 1.088333249092102, "learning_rate": 2.3031071852668767e-05, "loss": 0.1486, "num_input_tokens_seen": 151462208, "step": 70200 }, { "epoch": 11.452691680261012, "grad_norm": 0.40501171350479126, "learning_rate": 2.3027523936317713e-05, "loss": 0.0409, "num_input_tokens_seen": 151472800, "step": 70205 }, { "epoch": 11.453507340946166, "grad_norm": 2.162271738052368, "learning_rate": 2.3023976059941377e-05, "loss": 0.084, "num_input_tokens_seen": 151483968, "step": 70210 }, { "epoch": 11.454323001631321, "grad_norm": 1.1977756023406982, "learning_rate": 2.3020428223611688e-05, "loss": 0.1583, "num_input_tokens_seen": 151494752, "step": 70215 }, { "epoch": 11.455138662316477, "grad_norm": 2.0043089389801025, "learning_rate": 2.3016880427400518e-05, "loss": 0.2274, "num_input_tokens_seen": 151505024, "step": 70220 }, { "epoch": 11.455954323001631, "grad_norm": 0.10548142343759537, "learning_rate": 2.301333267137978e-05, "loss": 0.1275, "num_input_tokens_seen": 151515968, "step": 70225 }, { "epoch": 11.456769983686787, "grad_norm": 2.120899200439453, "learning_rate": 2.300978495562139e-05, "loss": 0.0577, "num_input_tokens_seen": 151525472, "step": 70230 }, { "epoch": 11.45758564437194, "grad_norm": 0.11255965381860733, "learning_rate": 2.300623728019722e-05, "loss": 0.0172, "num_input_tokens_seen": 151536288, "step": 70235 }, { "epoch": 11.458401305057096, "grad_norm": 0.08275299519300461, "learning_rate": 2.3002689645179195e-05, "loss": 0.0614, "num_input_tokens_seen": 151545536, "step": 70240 }, { "epoch": 11.459216965742252, "grad_norm": 0.09507807344198227, "learning_rate": 2.2999142050639183e-05, "loss": 0.0306, "num_input_tokens_seen": 151556640, "step": 70245 }, { "epoch": 11.460032626427406, "grad_norm": 2.0449039936065674, "learning_rate": 2.2995594496649106e-05, "loss": 0.0738, "num_input_tokens_seen": 151566464, "step": 70250 }, { "epoch": 11.460848287112562, "grad_norm": 0.10514180362224579, "learning_rate": 2.299204698328084e-05, "loss": 0.2534, "num_input_tokens_seen": 151576384, "step": 70255 }, { "epoch": 11.461663947797716, "grad_norm": 1.325512409210205, "learning_rate": 2.29884995106063e-05, "loss": 0.2115, "num_input_tokens_seen": 151587936, "step": 70260 }, { "epoch": 11.462479608482871, "grad_norm": 1.4841303825378418, "learning_rate": 2.298495207869736e-05, "loss": 0.1742, "num_input_tokens_seen": 151599584, "step": 70265 }, { "epoch": 11.463295269168025, "grad_norm": 1.6888068914413452, "learning_rate": 2.298140468762593e-05, "loss": 0.2221, "num_input_tokens_seen": 151609280, "step": 70270 }, { "epoch": 11.464110929853181, "grad_norm": 0.5660707354545593, "learning_rate": 2.2977857337463887e-05, "loss": 0.0121, "num_input_tokens_seen": 151618528, "step": 70275 }, { "epoch": 11.464926590538337, "grad_norm": 1.5238134860992432, "learning_rate": 2.297431002828314e-05, "loss": 0.0452, "num_input_tokens_seen": 151628928, "step": 70280 }, { "epoch": 11.46574225122349, "grad_norm": 0.9358682632446289, "learning_rate": 2.297076276015556e-05, "loss": 0.0371, "num_input_tokens_seen": 151639808, "step": 70285 }, { "epoch": 11.466557911908646, "grad_norm": 0.017059452831745148, "learning_rate": 2.2967215533153058e-05, "loss": 0.067, "num_input_tokens_seen": 151649952, "step": 70290 }, { "epoch": 11.4673735725938, "grad_norm": 2.548048973083496, "learning_rate": 2.29636683473475e-05, "loss": 0.0595, "num_input_tokens_seen": 151660576, "step": 70295 }, { "epoch": 11.468189233278956, "grad_norm": 0.07620873302221298, "learning_rate": 2.2960121202810795e-05, "loss": 0.0672, "num_input_tokens_seen": 151670272, "step": 70300 }, { "epoch": 11.469004893964112, "grad_norm": 1.3213744163513184, "learning_rate": 2.2956574099614814e-05, "loss": 0.1129, "num_input_tokens_seen": 151681088, "step": 70305 }, { "epoch": 11.469820554649266, "grad_norm": 2.112036943435669, "learning_rate": 2.2953027037831462e-05, "loss": 0.2852, "num_input_tokens_seen": 151693312, "step": 70310 }, { "epoch": 11.470636215334421, "grad_norm": 1.120818018913269, "learning_rate": 2.2949480017532603e-05, "loss": 0.2201, "num_input_tokens_seen": 151704928, "step": 70315 }, { "epoch": 11.471451876019575, "grad_norm": 1.589975118637085, "learning_rate": 2.2945933038790135e-05, "loss": 0.0483, "num_input_tokens_seen": 151716096, "step": 70320 }, { "epoch": 11.47226753670473, "grad_norm": 0.9493570327758789, "learning_rate": 2.2942386101675942e-05, "loss": 0.0685, "num_input_tokens_seen": 151725440, "step": 70325 }, { "epoch": 11.473083197389887, "grad_norm": 0.6233099102973938, "learning_rate": 2.29388392062619e-05, "loss": 0.0468, "num_input_tokens_seen": 151735360, "step": 70330 }, { "epoch": 11.47389885807504, "grad_norm": 0.061846230179071426, "learning_rate": 2.2935292352619906e-05, "loss": 0.1271, "num_input_tokens_seen": 151746368, "step": 70335 }, { "epoch": 11.474714518760196, "grad_norm": 1.2297197580337524, "learning_rate": 2.293174554082182e-05, "loss": 0.0518, "num_input_tokens_seen": 151756928, "step": 70340 }, { "epoch": 11.47553017944535, "grad_norm": 0.08495844155550003, "learning_rate": 2.2928198770939542e-05, "loss": 0.0482, "num_input_tokens_seen": 151767136, "step": 70345 }, { "epoch": 11.476345840130506, "grad_norm": 0.26078617572784424, "learning_rate": 2.2924652043044943e-05, "loss": 0.0238, "num_input_tokens_seen": 151777664, "step": 70350 }, { "epoch": 11.477161500815662, "grad_norm": 0.1591256707906723, "learning_rate": 2.2921105357209905e-05, "loss": 0.1643, "num_input_tokens_seen": 151789376, "step": 70355 }, { "epoch": 11.477977161500815, "grad_norm": 0.14225856959819794, "learning_rate": 2.29175587135063e-05, "loss": 0.2363, "num_input_tokens_seen": 151800800, "step": 70360 }, { "epoch": 11.478792822185971, "grad_norm": 1.3610707521438599, "learning_rate": 2.2914012112006013e-05, "loss": 0.0506, "num_input_tokens_seen": 151811648, "step": 70365 }, { "epoch": 11.479608482871125, "grad_norm": 2.004364252090454, "learning_rate": 2.291046555278091e-05, "loss": 0.0821, "num_input_tokens_seen": 151822688, "step": 70370 }, { "epoch": 11.48042414355628, "grad_norm": 0.1648600995540619, "learning_rate": 2.2906919035902885e-05, "loss": 0.157, "num_input_tokens_seen": 151832224, "step": 70375 }, { "epoch": 11.481239804241435, "grad_norm": 0.054689470678567886, "learning_rate": 2.2903372561443793e-05, "loss": 0.2147, "num_input_tokens_seen": 151842304, "step": 70380 }, { "epoch": 11.48205546492659, "grad_norm": 0.2496601939201355, "learning_rate": 2.289982612947552e-05, "loss": 0.0369, "num_input_tokens_seen": 151852576, "step": 70385 }, { "epoch": 11.482871125611746, "grad_norm": 0.035259146243333817, "learning_rate": 2.2896279740069927e-05, "loss": 0.1815, "num_input_tokens_seen": 151863328, "step": 70390 }, { "epoch": 11.4836867862969, "grad_norm": 0.26282811164855957, "learning_rate": 2.2892733393298907e-05, "loss": 0.0276, "num_input_tokens_seen": 151873920, "step": 70395 }, { "epoch": 11.484502446982056, "grad_norm": 0.29412683844566345, "learning_rate": 2.2889187089234303e-05, "loss": 0.118, "num_input_tokens_seen": 151883552, "step": 70400 }, { "epoch": 11.48531810766721, "grad_norm": 1.3771264553070068, "learning_rate": 2.2885640827948017e-05, "loss": 0.1844, "num_input_tokens_seen": 151895136, "step": 70405 }, { "epoch": 11.486133768352365, "grad_norm": 1.2529972791671753, "learning_rate": 2.2882094609511888e-05, "loss": 0.1363, "num_input_tokens_seen": 151907296, "step": 70410 }, { "epoch": 11.486949429037521, "grad_norm": 0.04993043094873428, "learning_rate": 2.2878548433997808e-05, "loss": 0.2013, "num_input_tokens_seen": 151918944, "step": 70415 }, { "epoch": 11.487765089722675, "grad_norm": 0.08355923742055893, "learning_rate": 2.2875002301477626e-05, "loss": 0.1282, "num_input_tokens_seen": 151929536, "step": 70420 }, { "epoch": 11.48858075040783, "grad_norm": 0.4074321985244751, "learning_rate": 2.2871456212023225e-05, "loss": 0.0796, "num_input_tokens_seen": 151940608, "step": 70425 }, { "epoch": 11.489396411092985, "grad_norm": 0.07279534637928009, "learning_rate": 2.2867910165706457e-05, "loss": 0.3842, "num_input_tokens_seen": 151951584, "step": 70430 }, { "epoch": 11.49021207177814, "grad_norm": 1.8660989999771118, "learning_rate": 2.2864364162599195e-05, "loss": 0.0776, "num_input_tokens_seen": 151962720, "step": 70435 }, { "epoch": 11.491027732463296, "grad_norm": 0.9051462411880493, "learning_rate": 2.286081820277331e-05, "loss": 0.0683, "num_input_tokens_seen": 151972800, "step": 70440 }, { "epoch": 11.49184339314845, "grad_norm": 0.09654656052589417, "learning_rate": 2.285727228630065e-05, "loss": 0.0121, "num_input_tokens_seen": 151983520, "step": 70445 }, { "epoch": 11.492659053833606, "grad_norm": 1.852718472480774, "learning_rate": 2.2853726413253096e-05, "loss": 0.0926, "num_input_tokens_seen": 151994624, "step": 70450 }, { "epoch": 11.49347471451876, "grad_norm": 2.2267253398895264, "learning_rate": 2.2850180583702485e-05, "loss": 0.2939, "num_input_tokens_seen": 152004512, "step": 70455 }, { "epoch": 11.494290375203915, "grad_norm": 0.10238856077194214, "learning_rate": 2.2846634797720707e-05, "loss": 0.0285, "num_input_tokens_seen": 152014368, "step": 70460 }, { "epoch": 11.49510603588907, "grad_norm": 0.48625296354293823, "learning_rate": 2.2843089055379595e-05, "loss": 0.1553, "num_input_tokens_seen": 152025536, "step": 70465 }, { "epoch": 11.495921696574225, "grad_norm": 2.1473398208618164, "learning_rate": 2.2839543356751027e-05, "loss": 0.0668, "num_input_tokens_seen": 152036640, "step": 70470 }, { "epoch": 11.49673735725938, "grad_norm": 0.44922497868537903, "learning_rate": 2.2835997701906843e-05, "loss": 0.0322, "num_input_tokens_seen": 152048192, "step": 70475 }, { "epoch": 11.497553017944535, "grad_norm": 0.07268359512090683, "learning_rate": 2.2832452090918928e-05, "loss": 0.071, "num_input_tokens_seen": 152059808, "step": 70480 }, { "epoch": 11.49836867862969, "grad_norm": 0.723131537437439, "learning_rate": 2.2828906523859104e-05, "loss": 0.1469, "num_input_tokens_seen": 152070816, "step": 70485 }, { "epoch": 11.499184339314844, "grad_norm": 1.2246564626693726, "learning_rate": 2.282536100079925e-05, "loss": 0.1725, "num_input_tokens_seen": 152082240, "step": 70490 }, { "epoch": 11.5, "grad_norm": 0.13620692491531372, "learning_rate": 2.282181552181121e-05, "loss": 0.0477, "num_input_tokens_seen": 152093312, "step": 70495 }, { "epoch": 11.500815660685156, "grad_norm": 1.0039259195327759, "learning_rate": 2.281827008696685e-05, "loss": 0.1353, "num_input_tokens_seen": 152103616, "step": 70500 }, { "epoch": 11.50163132137031, "grad_norm": 0.08060311526060104, "learning_rate": 2.281472469633801e-05, "loss": 0.1118, "num_input_tokens_seen": 152113440, "step": 70505 }, { "epoch": 11.502446982055465, "grad_norm": 0.3393334746360779, "learning_rate": 2.2811179349996547e-05, "loss": 0.074, "num_input_tokens_seen": 152124736, "step": 70510 }, { "epoch": 11.50326264274062, "grad_norm": 0.43885207176208496, "learning_rate": 2.2807634048014308e-05, "loss": 0.1581, "num_input_tokens_seen": 152136192, "step": 70515 }, { "epoch": 11.504078303425775, "grad_norm": 3.19777250289917, "learning_rate": 2.2804088790463155e-05, "loss": 0.2061, "num_input_tokens_seen": 152147200, "step": 70520 }, { "epoch": 11.50489396411093, "grad_norm": 0.12074912339448929, "learning_rate": 2.2800543577414917e-05, "loss": 0.0609, "num_input_tokens_seen": 152158368, "step": 70525 }, { "epoch": 11.505709624796085, "grad_norm": 0.3243964910507202, "learning_rate": 2.2796998408941462e-05, "loss": 0.0633, "num_input_tokens_seen": 152168352, "step": 70530 }, { "epoch": 11.50652528548124, "grad_norm": 1.282220482826233, "learning_rate": 2.2793453285114623e-05, "loss": 0.1935, "num_input_tokens_seen": 152178016, "step": 70535 }, { "epoch": 11.507340946166394, "grad_norm": 0.343064546585083, "learning_rate": 2.2789908206006262e-05, "loss": 0.0675, "num_input_tokens_seen": 152188768, "step": 70540 }, { "epoch": 11.50815660685155, "grad_norm": 2.662937641143799, "learning_rate": 2.2786363171688203e-05, "loss": 0.1543, "num_input_tokens_seen": 152199296, "step": 70545 }, { "epoch": 11.508972267536706, "grad_norm": 1.1879936456680298, "learning_rate": 2.2782818182232302e-05, "loss": 0.115, "num_input_tokens_seen": 152210624, "step": 70550 }, { "epoch": 11.50978792822186, "grad_norm": 0.1796954870223999, "learning_rate": 2.2779273237710415e-05, "loss": 0.073, "num_input_tokens_seen": 152222208, "step": 70555 }, { "epoch": 11.510603588907015, "grad_norm": 1.9769574403762817, "learning_rate": 2.2775728338194363e-05, "loss": 0.3122, "num_input_tokens_seen": 152234080, "step": 70560 }, { "epoch": 11.51141924959217, "grad_norm": 0.0820116326212883, "learning_rate": 2.277218348375601e-05, "loss": 0.0518, "num_input_tokens_seen": 152243840, "step": 70565 }, { "epoch": 11.512234910277325, "grad_norm": 0.02629297785460949, "learning_rate": 2.276863867446717e-05, "loss": 0.0555, "num_input_tokens_seen": 152254752, "step": 70570 }, { "epoch": 11.513050570962479, "grad_norm": 0.04126632958650589, "learning_rate": 2.276509391039971e-05, "loss": 0.0099, "num_input_tokens_seen": 152265856, "step": 70575 }, { "epoch": 11.513866231647635, "grad_norm": 0.2259594351053238, "learning_rate": 2.276154919162545e-05, "loss": 0.0348, "num_input_tokens_seen": 152277408, "step": 70580 }, { "epoch": 11.51468189233279, "grad_norm": 0.5474713444709778, "learning_rate": 2.275800451821624e-05, "loss": 0.2036, "num_input_tokens_seen": 152288416, "step": 70585 }, { "epoch": 11.515497553017944, "grad_norm": 1.98784339427948, "learning_rate": 2.2754459890243904e-05, "loss": 0.1614, "num_input_tokens_seen": 152297312, "step": 70590 }, { "epoch": 11.5163132137031, "grad_norm": 0.4336516261100769, "learning_rate": 2.2750915307780298e-05, "loss": 0.139, "num_input_tokens_seen": 152308480, "step": 70595 }, { "epoch": 11.517128874388254, "grad_norm": 0.1460307240486145, "learning_rate": 2.2747370770897235e-05, "loss": 0.0218, "num_input_tokens_seen": 152319648, "step": 70600 }, { "epoch": 11.51794453507341, "grad_norm": 0.40804722905158997, "learning_rate": 2.2743826279666574e-05, "loss": 0.144, "num_input_tokens_seen": 152329984, "step": 70605 }, { "epoch": 11.518760195758565, "grad_norm": 2.1374096870422363, "learning_rate": 2.2740281834160125e-05, "loss": 0.2446, "num_input_tokens_seen": 152340800, "step": 70610 }, { "epoch": 11.51957585644372, "grad_norm": 0.055132050067186356, "learning_rate": 2.273673743444974e-05, "loss": 0.0333, "num_input_tokens_seen": 152351424, "step": 70615 }, { "epoch": 11.520391517128875, "grad_norm": 1.8735871315002441, "learning_rate": 2.2733193080607232e-05, "loss": 0.1126, "num_input_tokens_seen": 152360832, "step": 70620 }, { "epoch": 11.521207177814029, "grad_norm": 0.04159785434603691, "learning_rate": 2.2729648772704452e-05, "loss": 0.023, "num_input_tokens_seen": 152370464, "step": 70625 }, { "epoch": 11.522022838499185, "grad_norm": 0.8963729739189148, "learning_rate": 2.272610451081321e-05, "loss": 0.1219, "num_input_tokens_seen": 152381696, "step": 70630 }, { "epoch": 11.522838499184338, "grad_norm": 1.5062339305877686, "learning_rate": 2.2722560295005352e-05, "loss": 0.1105, "num_input_tokens_seen": 152391552, "step": 70635 }, { "epoch": 11.523654159869494, "grad_norm": 0.2024487853050232, "learning_rate": 2.2719016125352695e-05, "loss": 0.0698, "num_input_tokens_seen": 152403104, "step": 70640 }, { "epoch": 11.52446982055465, "grad_norm": 2.11645245552063, "learning_rate": 2.271547200192707e-05, "loss": 0.1427, "num_input_tokens_seen": 152414208, "step": 70645 }, { "epoch": 11.525285481239804, "grad_norm": 0.1131075918674469, "learning_rate": 2.271192792480031e-05, "loss": 0.0171, "num_input_tokens_seen": 152425504, "step": 70650 }, { "epoch": 11.52610114192496, "grad_norm": 1.0750600099563599, "learning_rate": 2.270838389404422e-05, "loss": 0.1332, "num_input_tokens_seen": 152436128, "step": 70655 }, { "epoch": 11.526916802610113, "grad_norm": 0.042521730065345764, "learning_rate": 2.2704839909730644e-05, "loss": 0.0082, "num_input_tokens_seen": 152447872, "step": 70660 }, { "epoch": 11.52773246329527, "grad_norm": 0.5969802737236023, "learning_rate": 2.27012959719314e-05, "loss": 0.1043, "num_input_tokens_seen": 152458048, "step": 70665 }, { "epoch": 11.528548123980425, "grad_norm": 0.09579924494028091, "learning_rate": 2.269775208071831e-05, "loss": 0.1277, "num_input_tokens_seen": 152469728, "step": 70670 }, { "epoch": 11.529363784665579, "grad_norm": 1.1243274211883545, "learning_rate": 2.2694208236163196e-05, "loss": 0.1899, "num_input_tokens_seen": 152480384, "step": 70675 }, { "epoch": 11.530179445350734, "grad_norm": 0.801002562046051, "learning_rate": 2.269066443833787e-05, "loss": 0.1642, "num_input_tokens_seen": 152490528, "step": 70680 }, { "epoch": 11.530995106035888, "grad_norm": 0.16185542941093445, "learning_rate": 2.268712068731416e-05, "loss": 0.0645, "num_input_tokens_seen": 152501888, "step": 70685 }, { "epoch": 11.531810766721044, "grad_norm": 0.46599969267845154, "learning_rate": 2.2683576983163897e-05, "loss": 0.0868, "num_input_tokens_seen": 152513056, "step": 70690 }, { "epoch": 11.5326264274062, "grad_norm": 2.732158899307251, "learning_rate": 2.2680033325958864e-05, "loss": 0.3947, "num_input_tokens_seen": 152523296, "step": 70695 }, { "epoch": 11.533442088091354, "grad_norm": 0.17452265322208405, "learning_rate": 2.267648971577092e-05, "loss": 0.1303, "num_input_tokens_seen": 152534368, "step": 70700 }, { "epoch": 11.53425774877651, "grad_norm": 1.3373088836669922, "learning_rate": 2.267294615267184e-05, "loss": 0.0532, "num_input_tokens_seen": 152546336, "step": 70705 }, { "epoch": 11.535073409461663, "grad_norm": 0.17893896996974945, "learning_rate": 2.2669402636733476e-05, "loss": 0.026, "num_input_tokens_seen": 152557504, "step": 70710 }, { "epoch": 11.535889070146819, "grad_norm": 0.020875386893749237, "learning_rate": 2.266585916802761e-05, "loss": 0.0089, "num_input_tokens_seen": 152568480, "step": 70715 }, { "epoch": 11.536704730831975, "grad_norm": 1.2523949146270752, "learning_rate": 2.266231574662608e-05, "loss": 0.1479, "num_input_tokens_seen": 152579392, "step": 70720 }, { "epoch": 11.537520391517129, "grad_norm": 0.1756470948457718, "learning_rate": 2.2658772372600677e-05, "loss": 0.2697, "num_input_tokens_seen": 152589856, "step": 70725 }, { "epoch": 11.538336052202284, "grad_norm": 0.041130006313323975, "learning_rate": 2.2655229046023232e-05, "loss": 0.1632, "num_input_tokens_seen": 152601280, "step": 70730 }, { "epoch": 11.539151712887438, "grad_norm": 1.9173557758331299, "learning_rate": 2.2651685766965536e-05, "loss": 0.1635, "num_input_tokens_seen": 152612320, "step": 70735 }, { "epoch": 11.539967373572594, "grad_norm": 0.7600729465484619, "learning_rate": 2.2648142535499416e-05, "loss": 0.233, "num_input_tokens_seen": 152622592, "step": 70740 }, { "epoch": 11.540783034257748, "grad_norm": 0.4623905122280121, "learning_rate": 2.264459935169666e-05, "loss": 0.0246, "num_input_tokens_seen": 152632800, "step": 70745 }, { "epoch": 11.541598694942904, "grad_norm": 1.8016812801361084, "learning_rate": 2.2641056215629098e-05, "loss": 0.0741, "num_input_tokens_seen": 152644256, "step": 70750 }, { "epoch": 11.54241435562806, "grad_norm": 0.03842872753739357, "learning_rate": 2.2637513127368514e-05, "loss": 0.0196, "num_input_tokens_seen": 152655552, "step": 70755 }, { "epoch": 11.543230016313213, "grad_norm": 0.2706871032714844, "learning_rate": 2.263397008698672e-05, "loss": 0.1631, "num_input_tokens_seen": 152666656, "step": 70760 }, { "epoch": 11.544045676998369, "grad_norm": 0.22716301679611206, "learning_rate": 2.2630427094555538e-05, "loss": 0.0535, "num_input_tokens_seen": 152678944, "step": 70765 }, { "epoch": 11.544861337683523, "grad_norm": 1.3203407526016235, "learning_rate": 2.2626884150146746e-05, "loss": 0.1307, "num_input_tokens_seen": 152690336, "step": 70770 }, { "epoch": 11.545676998368679, "grad_norm": 0.06563768535852432, "learning_rate": 2.262334125383217e-05, "loss": 0.074, "num_input_tokens_seen": 152700736, "step": 70775 }, { "epoch": 11.546492659053834, "grad_norm": 0.16966533660888672, "learning_rate": 2.2619798405683582e-05, "loss": 0.0298, "num_input_tokens_seen": 152711968, "step": 70780 }, { "epoch": 11.547308319738988, "grad_norm": 1.539516568183899, "learning_rate": 2.2616255605772813e-05, "loss": 0.1765, "num_input_tokens_seen": 152722336, "step": 70785 }, { "epoch": 11.548123980424144, "grad_norm": 0.03927411511540413, "learning_rate": 2.2612712854171637e-05, "loss": 0.0779, "num_input_tokens_seen": 152733344, "step": 70790 }, { "epoch": 11.548939641109298, "grad_norm": 0.19912685453891754, "learning_rate": 2.2609170150951874e-05, "loss": 0.1146, "num_input_tokens_seen": 152743424, "step": 70795 }, { "epoch": 11.549755301794454, "grad_norm": 0.10889675468206406, "learning_rate": 2.26056274961853e-05, "loss": 0.03, "num_input_tokens_seen": 152754272, "step": 70800 }, { "epoch": 11.550570962479608, "grad_norm": 0.08023516088724136, "learning_rate": 2.260208488994373e-05, "loss": 0.0634, "num_input_tokens_seen": 152765408, "step": 70805 }, { "epoch": 11.551386623164763, "grad_norm": 0.9148570895195007, "learning_rate": 2.2598542332298945e-05, "loss": 0.2589, "num_input_tokens_seen": 152776064, "step": 70810 }, { "epoch": 11.552202283849919, "grad_norm": 2.669903039932251, "learning_rate": 2.2594999823322754e-05, "loss": 0.1437, "num_input_tokens_seen": 152787104, "step": 70815 }, { "epoch": 11.553017944535073, "grad_norm": 1.8352484703063965, "learning_rate": 2.259145736308694e-05, "loss": 0.2677, "num_input_tokens_seen": 152798112, "step": 70820 }, { "epoch": 11.553833605220229, "grad_norm": 0.8246452808380127, "learning_rate": 2.2587914951663298e-05, "loss": 0.1197, "num_input_tokens_seen": 152809664, "step": 70825 }, { "epoch": 11.554649265905383, "grad_norm": 0.2243553102016449, "learning_rate": 2.2584372589123612e-05, "loss": 0.1025, "num_input_tokens_seen": 152820960, "step": 70830 }, { "epoch": 11.555464926590538, "grad_norm": 0.5885673761367798, "learning_rate": 2.258083027553969e-05, "loss": 0.1513, "num_input_tokens_seen": 152832064, "step": 70835 }, { "epoch": 11.556280587275694, "grad_norm": 0.04362769052386284, "learning_rate": 2.2577288010983302e-05, "loss": 0.0917, "num_input_tokens_seen": 152842496, "step": 70840 }, { "epoch": 11.557096247960848, "grad_norm": 0.49506065249443054, "learning_rate": 2.2573745795526256e-05, "loss": 0.0475, "num_input_tokens_seen": 152852608, "step": 70845 }, { "epoch": 11.557911908646004, "grad_norm": 1.7233392000198364, "learning_rate": 2.2570203629240323e-05, "loss": 0.3092, "num_input_tokens_seen": 152862560, "step": 70850 }, { "epoch": 11.558727569331158, "grad_norm": 0.49763667583465576, "learning_rate": 2.2566661512197305e-05, "loss": 0.0384, "num_input_tokens_seen": 152873216, "step": 70855 }, { "epoch": 11.559543230016313, "grad_norm": 1.0326884984970093, "learning_rate": 2.2563119444468965e-05, "loss": 0.073, "num_input_tokens_seen": 152884096, "step": 70860 }, { "epoch": 11.560358890701469, "grad_norm": 0.254951536655426, "learning_rate": 2.25595774261271e-05, "loss": 0.097, "num_input_tokens_seen": 152895168, "step": 70865 }, { "epoch": 11.561174551386623, "grad_norm": 0.09251773357391357, "learning_rate": 2.2556035457243507e-05, "loss": 0.2033, "num_input_tokens_seen": 152906528, "step": 70870 }, { "epoch": 11.561990212071779, "grad_norm": 1.2752368450164795, "learning_rate": 2.2552493537889946e-05, "loss": 0.218, "num_input_tokens_seen": 152917728, "step": 70875 }, { "epoch": 11.562805872756933, "grad_norm": 2.013962745666504, "learning_rate": 2.254895166813822e-05, "loss": 0.248, "num_input_tokens_seen": 152927392, "step": 70880 }, { "epoch": 11.563621533442088, "grad_norm": 0.16309793293476105, "learning_rate": 2.2545409848060084e-05, "loss": 0.0136, "num_input_tokens_seen": 152938624, "step": 70885 }, { "epoch": 11.564437194127244, "grad_norm": 0.5876662731170654, "learning_rate": 2.2541868077727342e-05, "loss": 0.3265, "num_input_tokens_seen": 152949888, "step": 70890 }, { "epoch": 11.565252854812398, "grad_norm": 1.608111023902893, "learning_rate": 2.2538326357211753e-05, "loss": 0.2008, "num_input_tokens_seen": 152961376, "step": 70895 }, { "epoch": 11.566068515497554, "grad_norm": 1.2849479913711548, "learning_rate": 2.2534784686585113e-05, "loss": 0.0424, "num_input_tokens_seen": 152972256, "step": 70900 }, { "epoch": 11.566884176182707, "grad_norm": 0.14833100140094757, "learning_rate": 2.253124306591918e-05, "loss": 0.1223, "num_input_tokens_seen": 152982880, "step": 70905 }, { "epoch": 11.567699836867863, "grad_norm": 0.1789298951625824, "learning_rate": 2.2527701495285745e-05, "loss": 0.1506, "num_input_tokens_seen": 152995296, "step": 70910 }, { "epoch": 11.568515497553017, "grad_norm": 0.4309707283973694, "learning_rate": 2.2524159974756566e-05, "loss": 0.1218, "num_input_tokens_seen": 153005728, "step": 70915 }, { "epoch": 11.569331158238173, "grad_norm": 2.0869252681732178, "learning_rate": 2.252061850440344e-05, "loss": 0.1411, "num_input_tokens_seen": 153017312, "step": 70920 }, { "epoch": 11.570146818923329, "grad_norm": 1.7251472473144531, "learning_rate": 2.2517077084298113e-05, "loss": 0.0631, "num_input_tokens_seen": 153029472, "step": 70925 }, { "epoch": 11.570962479608482, "grad_norm": 0.15465372800827026, "learning_rate": 2.251353571451238e-05, "loss": 0.1058, "num_input_tokens_seen": 153041344, "step": 70930 }, { "epoch": 11.571778140293638, "grad_norm": 0.6729331612586975, "learning_rate": 2.2509994395117986e-05, "loss": 0.1587, "num_input_tokens_seen": 153052352, "step": 70935 }, { "epoch": 11.572593800978792, "grad_norm": 0.9905197024345398, "learning_rate": 2.2506453126186727e-05, "loss": 0.1854, "num_input_tokens_seen": 153062592, "step": 70940 }, { "epoch": 11.573409461663948, "grad_norm": 2.0569751262664795, "learning_rate": 2.2502911907790347e-05, "loss": 0.2734, "num_input_tokens_seen": 153073408, "step": 70945 }, { "epoch": 11.574225122349104, "grad_norm": 0.40869560837745667, "learning_rate": 2.2499370740000632e-05, "loss": 0.0575, "num_input_tokens_seen": 153084416, "step": 70950 }, { "epoch": 11.575040783034257, "grad_norm": 0.3114251494407654, "learning_rate": 2.2495829622889342e-05, "loss": 0.1511, "num_input_tokens_seen": 153095680, "step": 70955 }, { "epoch": 11.575856443719413, "grad_norm": 0.9358269572257996, "learning_rate": 2.249228855652824e-05, "loss": 0.0349, "num_input_tokens_seen": 153106944, "step": 70960 }, { "epoch": 11.576672104404567, "grad_norm": 0.14965367317199707, "learning_rate": 2.248874754098909e-05, "loss": 0.2002, "num_input_tokens_seen": 153117984, "step": 70965 }, { "epoch": 11.577487765089723, "grad_norm": 0.1851363629102707, "learning_rate": 2.2485206576343653e-05, "loss": 0.1124, "num_input_tokens_seen": 153129440, "step": 70970 }, { "epoch": 11.578303425774878, "grad_norm": 0.3164094388484955, "learning_rate": 2.24816656626637e-05, "loss": 0.0672, "num_input_tokens_seen": 153140288, "step": 70975 }, { "epoch": 11.579119086460032, "grad_norm": 0.09163682907819748, "learning_rate": 2.247812480002099e-05, "loss": 0.1771, "num_input_tokens_seen": 153150848, "step": 70980 }, { "epoch": 11.579934747145188, "grad_norm": 0.466545969247818, "learning_rate": 2.2474583988487277e-05, "loss": 0.0679, "num_input_tokens_seen": 153160320, "step": 70985 }, { "epoch": 11.580750407830342, "grad_norm": 1.5247693061828613, "learning_rate": 2.247104322813432e-05, "loss": 0.1221, "num_input_tokens_seen": 153171104, "step": 70990 }, { "epoch": 11.581566068515498, "grad_norm": 0.0706140324473381, "learning_rate": 2.2467502519033886e-05, "loss": 0.1126, "num_input_tokens_seen": 153180640, "step": 70995 }, { "epoch": 11.582381729200652, "grad_norm": 1.6735522747039795, "learning_rate": 2.246396186125772e-05, "loss": 0.1412, "num_input_tokens_seen": 153190624, "step": 71000 }, { "epoch": 11.583197389885807, "grad_norm": 2.043191909790039, "learning_rate": 2.2460421254877594e-05, "loss": 0.3088, "num_input_tokens_seen": 153200864, "step": 71005 }, { "epoch": 11.584013050570963, "grad_norm": 0.8780702948570251, "learning_rate": 2.245688069996524e-05, "loss": 0.2727, "num_input_tokens_seen": 153210688, "step": 71010 }, { "epoch": 11.584828711256117, "grad_norm": 1.2010940313339233, "learning_rate": 2.245334019659244e-05, "loss": 0.0843, "num_input_tokens_seen": 153222816, "step": 71015 }, { "epoch": 11.585644371941273, "grad_norm": 1.373867392539978, "learning_rate": 2.2449799744830918e-05, "loss": 0.1177, "num_input_tokens_seen": 153232832, "step": 71020 }, { "epoch": 11.586460032626427, "grad_norm": 0.04962332174181938, "learning_rate": 2.2446259344752452e-05, "loss": 0.0226, "num_input_tokens_seen": 153244160, "step": 71025 }, { "epoch": 11.587275693311582, "grad_norm": 0.5304173827171326, "learning_rate": 2.2442718996428767e-05, "loss": 0.0469, "num_input_tokens_seen": 153254816, "step": 71030 }, { "epoch": 11.588091353996738, "grad_norm": 1.1346997022628784, "learning_rate": 2.2439178699931636e-05, "loss": 0.146, "num_input_tokens_seen": 153266688, "step": 71035 }, { "epoch": 11.588907014681892, "grad_norm": 0.20464922487735748, "learning_rate": 2.243563845533279e-05, "loss": 0.1022, "num_input_tokens_seen": 153279040, "step": 71040 }, { "epoch": 11.589722675367048, "grad_norm": 0.12036187946796417, "learning_rate": 2.2432098262703992e-05, "loss": 0.0367, "num_input_tokens_seen": 153290240, "step": 71045 }, { "epoch": 11.590538336052202, "grad_norm": 0.2766795754432678, "learning_rate": 2.2428558122116966e-05, "loss": 0.1221, "num_input_tokens_seen": 153302080, "step": 71050 }, { "epoch": 11.591353996737357, "grad_norm": 0.9921755790710449, "learning_rate": 2.2425018033643487e-05, "loss": 0.1335, "num_input_tokens_seen": 153313632, "step": 71055 }, { "epoch": 11.592169657422513, "grad_norm": 1.2967077493667603, "learning_rate": 2.2421477997355274e-05, "loss": 0.0378, "num_input_tokens_seen": 153324960, "step": 71060 }, { "epoch": 11.592985318107667, "grad_norm": 0.06186086684465408, "learning_rate": 2.2417938013324087e-05, "loss": 0.0513, "num_input_tokens_seen": 153335552, "step": 71065 }, { "epoch": 11.593800978792823, "grad_norm": 0.2057032436132431, "learning_rate": 2.2414398081621658e-05, "loss": 0.1261, "num_input_tokens_seen": 153346304, "step": 71070 }, { "epoch": 11.594616639477977, "grad_norm": 0.0948290228843689, "learning_rate": 2.2410858202319726e-05, "loss": 0.0102, "num_input_tokens_seen": 153357472, "step": 71075 }, { "epoch": 11.595432300163132, "grad_norm": 0.08903000503778458, "learning_rate": 2.240731837549005e-05, "loss": 0.2585, "num_input_tokens_seen": 153368800, "step": 71080 }, { "epoch": 11.596247960848288, "grad_norm": 1.635521411895752, "learning_rate": 2.2403778601204345e-05, "loss": 0.0691, "num_input_tokens_seen": 153380800, "step": 71085 }, { "epoch": 11.597063621533442, "grad_norm": 0.07038092613220215, "learning_rate": 2.240023887953437e-05, "loss": 0.009, "num_input_tokens_seen": 153391968, "step": 71090 }, { "epoch": 11.597879282218598, "grad_norm": 1.961808443069458, "learning_rate": 2.239669921055184e-05, "loss": 0.1813, "num_input_tokens_seen": 153401920, "step": 71095 }, { "epoch": 11.598694942903752, "grad_norm": 0.15211905539035797, "learning_rate": 2.239315959432852e-05, "loss": 0.1861, "num_input_tokens_seen": 153413408, "step": 71100 }, { "epoch": 11.599510603588907, "grad_norm": 1.7331358194351196, "learning_rate": 2.238962003093611e-05, "loss": 0.0962, "num_input_tokens_seen": 153424224, "step": 71105 }, { "epoch": 11.600326264274061, "grad_norm": 0.36355751752853394, "learning_rate": 2.2386080520446378e-05, "loss": 0.046, "num_input_tokens_seen": 153435392, "step": 71110 }, { "epoch": 11.601141924959217, "grad_norm": 1.5106152296066284, "learning_rate": 2.2382541062931028e-05, "loss": 0.1104, "num_input_tokens_seen": 153445792, "step": 71115 }, { "epoch": 11.601957585644373, "grad_norm": 0.38031989336013794, "learning_rate": 2.2379001658461807e-05, "loss": 0.1148, "num_input_tokens_seen": 153455936, "step": 71120 }, { "epoch": 11.602773246329527, "grad_norm": 1.264976143836975, "learning_rate": 2.2375462307110446e-05, "loss": 0.1181, "num_input_tokens_seen": 153466752, "step": 71125 }, { "epoch": 11.603588907014682, "grad_norm": 0.8670953512191772, "learning_rate": 2.2371923008948667e-05, "loss": 0.0793, "num_input_tokens_seen": 153477888, "step": 71130 }, { "epoch": 11.604404567699836, "grad_norm": 0.10271702706813812, "learning_rate": 2.2368383764048202e-05, "loss": 0.0134, "num_input_tokens_seen": 153489696, "step": 71135 }, { "epoch": 11.605220228384992, "grad_norm": 0.3491033613681793, "learning_rate": 2.236484457248078e-05, "loss": 0.031, "num_input_tokens_seen": 153500576, "step": 71140 }, { "epoch": 11.606035889070148, "grad_norm": 1.772983431816101, "learning_rate": 2.236130543431812e-05, "loss": 0.1545, "num_input_tokens_seen": 153511104, "step": 71145 }, { "epoch": 11.606851549755302, "grad_norm": 0.08246323466300964, "learning_rate": 2.2357766349631963e-05, "loss": 0.0659, "num_input_tokens_seen": 153521376, "step": 71150 }, { "epoch": 11.607667210440457, "grad_norm": 1.7178480625152588, "learning_rate": 2.2354227318494015e-05, "loss": 0.3213, "num_input_tokens_seen": 153531808, "step": 71155 }, { "epoch": 11.608482871125611, "grad_norm": 0.23512186110019684, "learning_rate": 2.2350688340976016e-05, "loss": 0.1603, "num_input_tokens_seen": 153542400, "step": 71160 }, { "epoch": 11.609298531810767, "grad_norm": 0.9201487898826599, "learning_rate": 2.2347149417149668e-05, "loss": 0.0966, "num_input_tokens_seen": 153553856, "step": 71165 }, { "epoch": 11.61011419249592, "grad_norm": 2.834913969039917, "learning_rate": 2.2343610547086713e-05, "loss": 0.1174, "num_input_tokens_seen": 153564416, "step": 71170 }, { "epoch": 11.610929853181077, "grad_norm": 0.0646994560956955, "learning_rate": 2.234007173085885e-05, "loss": 0.0589, "num_input_tokens_seen": 153576352, "step": 71175 }, { "epoch": 11.611745513866232, "grad_norm": 1.3067232370376587, "learning_rate": 2.2336532968537814e-05, "loss": 0.1242, "num_input_tokens_seen": 153587232, "step": 71180 }, { "epoch": 11.612561174551386, "grad_norm": 1.092307209968567, "learning_rate": 2.2332994260195312e-05, "loss": 0.0494, "num_input_tokens_seen": 153598304, "step": 71185 }, { "epoch": 11.613376835236542, "grad_norm": 2.077354669570923, "learning_rate": 2.232945560590306e-05, "loss": 0.2171, "num_input_tokens_seen": 153610304, "step": 71190 }, { "epoch": 11.614192495921696, "grad_norm": 0.5014381408691406, "learning_rate": 2.2325917005732792e-05, "loss": 0.0305, "num_input_tokens_seen": 153621056, "step": 71195 }, { "epoch": 11.615008156606851, "grad_norm": 0.49203765392303467, "learning_rate": 2.2322378459756194e-05, "loss": 0.1386, "num_input_tokens_seen": 153630592, "step": 71200 }, { "epoch": 11.615823817292007, "grad_norm": 1.4838441610336304, "learning_rate": 2.2318839968045008e-05, "loss": 0.1022, "num_input_tokens_seen": 153640768, "step": 71205 }, { "epoch": 11.616639477977161, "grad_norm": 1.6106514930725098, "learning_rate": 2.2315301530670918e-05, "loss": 0.045, "num_input_tokens_seen": 153651808, "step": 71210 }, { "epoch": 11.617455138662317, "grad_norm": 1.1413484811782837, "learning_rate": 2.2311763147705657e-05, "loss": 0.0813, "num_input_tokens_seen": 153661856, "step": 71215 }, { "epoch": 11.61827079934747, "grad_norm": 0.35308608412742615, "learning_rate": 2.230822481922092e-05, "loss": 0.0249, "num_input_tokens_seen": 153672896, "step": 71220 }, { "epoch": 11.619086460032626, "grad_norm": 0.6589916944503784, "learning_rate": 2.2304686545288432e-05, "loss": 0.0268, "num_input_tokens_seen": 153682112, "step": 71225 }, { "epoch": 11.619902120717782, "grad_norm": 0.6950991153717041, "learning_rate": 2.2301148325979874e-05, "loss": 0.1249, "num_input_tokens_seen": 153691776, "step": 71230 }, { "epoch": 11.620717781402936, "grad_norm": 0.7030717134475708, "learning_rate": 2.2297610161366985e-05, "loss": 0.0646, "num_input_tokens_seen": 153702048, "step": 71235 }, { "epoch": 11.621533442088092, "grad_norm": 0.5841343402862549, "learning_rate": 2.229407205152144e-05, "loss": 0.171, "num_input_tokens_seen": 153713184, "step": 71240 }, { "epoch": 11.622349102773246, "grad_norm": 0.034083291888237, "learning_rate": 2.2290533996514974e-05, "loss": 0.1399, "num_input_tokens_seen": 153725088, "step": 71245 }, { "epoch": 11.623164763458401, "grad_norm": 1.2467373609542847, "learning_rate": 2.228699599641926e-05, "loss": 0.21, "num_input_tokens_seen": 153736608, "step": 71250 }, { "epoch": 11.623980424143557, "grad_norm": 1.4406925439834595, "learning_rate": 2.2283458051306022e-05, "loss": 0.0794, "num_input_tokens_seen": 153747616, "step": 71255 }, { "epoch": 11.624796084828711, "grad_norm": 0.40825116634368896, "learning_rate": 2.2279920161246946e-05, "loss": 0.1294, "num_input_tokens_seen": 153758464, "step": 71260 }, { "epoch": 11.625611745513867, "grad_norm": 0.9569984078407288, "learning_rate": 2.227638232631374e-05, "loss": 0.2496, "num_input_tokens_seen": 153770176, "step": 71265 }, { "epoch": 11.62642740619902, "grad_norm": 1.1128960847854614, "learning_rate": 2.2272844546578108e-05, "loss": 0.082, "num_input_tokens_seen": 153781152, "step": 71270 }, { "epoch": 11.627243066884176, "grad_norm": 1.012009620666504, "learning_rate": 2.2269306822111734e-05, "loss": 0.0786, "num_input_tokens_seen": 153791264, "step": 71275 }, { "epoch": 11.62805872756933, "grad_norm": 1.9046484231948853, "learning_rate": 2.2265769152986325e-05, "loss": 0.1628, "num_input_tokens_seen": 153802784, "step": 71280 }, { "epoch": 11.628874388254486, "grad_norm": 0.11836719512939453, "learning_rate": 2.226223153927357e-05, "loss": 0.02, "num_input_tokens_seen": 153813088, "step": 71285 }, { "epoch": 11.629690048939642, "grad_norm": 0.6201183795928955, "learning_rate": 2.2258693981045162e-05, "loss": 0.1772, "num_input_tokens_seen": 153823744, "step": 71290 }, { "epoch": 11.630505709624796, "grad_norm": 0.6496106386184692, "learning_rate": 2.2255156478372804e-05, "loss": 0.0348, "num_input_tokens_seen": 153834752, "step": 71295 }, { "epoch": 11.631321370309951, "grad_norm": 1.1800585985183716, "learning_rate": 2.2251619031328182e-05, "loss": 0.0592, "num_input_tokens_seen": 153845632, "step": 71300 }, { "epoch": 11.632137030995105, "grad_norm": 0.14327801764011383, "learning_rate": 2.2248081639982983e-05, "loss": 0.0328, "num_input_tokens_seen": 153856704, "step": 71305 }, { "epoch": 11.632952691680261, "grad_norm": 2.1016080379486084, "learning_rate": 2.2244544304408907e-05, "loss": 0.1026, "num_input_tokens_seen": 153867584, "step": 71310 }, { "epoch": 11.633768352365417, "grad_norm": 0.15683196485042572, "learning_rate": 2.2241007024677626e-05, "loss": 0.0149, "num_input_tokens_seen": 153878976, "step": 71315 }, { "epoch": 11.63458401305057, "grad_norm": 0.45844560861587524, "learning_rate": 2.2237469800860853e-05, "loss": 0.1422, "num_input_tokens_seen": 153890880, "step": 71320 }, { "epoch": 11.635399673735726, "grad_norm": 0.09854397922754288, "learning_rate": 2.2233932633030244e-05, "loss": 0.0621, "num_input_tokens_seen": 153902048, "step": 71325 }, { "epoch": 11.63621533442088, "grad_norm": 0.28301748633384705, "learning_rate": 2.2230395521257512e-05, "loss": 0.1428, "num_input_tokens_seen": 153913792, "step": 71330 }, { "epoch": 11.637030995106036, "grad_norm": 0.08974439650774002, "learning_rate": 2.2226858465614317e-05, "loss": 0.0387, "num_input_tokens_seen": 153923744, "step": 71335 }, { "epoch": 11.63784665579119, "grad_norm": 1.015078067779541, "learning_rate": 2.2223321466172364e-05, "loss": 0.221, "num_input_tokens_seen": 153935552, "step": 71340 }, { "epoch": 11.638662316476346, "grad_norm": 1.7577146291732788, "learning_rate": 2.2219784523003315e-05, "loss": 0.2473, "num_input_tokens_seen": 153946272, "step": 71345 }, { "epoch": 11.639477977161501, "grad_norm": 0.6800237894058228, "learning_rate": 2.2216247636178865e-05, "loss": 0.0653, "num_input_tokens_seen": 153957056, "step": 71350 }, { "epoch": 11.640293637846655, "grad_norm": 0.2680055797100067, "learning_rate": 2.2212710805770685e-05, "loss": 0.1616, "num_input_tokens_seen": 153966400, "step": 71355 }, { "epoch": 11.641109298531811, "grad_norm": 0.5071640014648438, "learning_rate": 2.220917403185046e-05, "loss": 0.0352, "num_input_tokens_seen": 153976928, "step": 71360 }, { "epoch": 11.641924959216965, "grad_norm": 0.3483330309391022, "learning_rate": 2.2205637314489856e-05, "loss": 0.1234, "num_input_tokens_seen": 153987840, "step": 71365 }, { "epoch": 11.64274061990212, "grad_norm": 2.281287670135498, "learning_rate": 2.220210065376057e-05, "loss": 0.1728, "num_input_tokens_seen": 153998368, "step": 71370 }, { "epoch": 11.643556280587276, "grad_norm": 0.36765098571777344, "learning_rate": 2.219856404973425e-05, "loss": 0.1311, "num_input_tokens_seen": 154008864, "step": 71375 }, { "epoch": 11.64437194127243, "grad_norm": 0.9658650755882263, "learning_rate": 2.2195027502482595e-05, "loss": 0.2419, "num_input_tokens_seen": 154019168, "step": 71380 }, { "epoch": 11.645187601957586, "grad_norm": 0.6851235032081604, "learning_rate": 2.2191491012077255e-05, "loss": 0.2705, "num_input_tokens_seen": 154031040, "step": 71385 }, { "epoch": 11.64600326264274, "grad_norm": 0.32559430599212646, "learning_rate": 2.218795457858992e-05, "loss": 0.0317, "num_input_tokens_seen": 154041504, "step": 71390 }, { "epoch": 11.646818923327896, "grad_norm": 1.5505768060684204, "learning_rate": 2.2184418202092246e-05, "loss": 0.1719, "num_input_tokens_seen": 154052448, "step": 71395 }, { "epoch": 11.647634584013051, "grad_norm": 1.535341501235962, "learning_rate": 2.2180881882655908e-05, "loss": 0.0633, "num_input_tokens_seen": 154063840, "step": 71400 }, { "epoch": 11.648450244698205, "grad_norm": 0.14389893412590027, "learning_rate": 2.2177345620352587e-05, "loss": 0.0538, "num_input_tokens_seen": 154075360, "step": 71405 }, { "epoch": 11.649265905383361, "grad_norm": 0.19218319654464722, "learning_rate": 2.2173809415253924e-05, "loss": 0.0289, "num_input_tokens_seen": 154086560, "step": 71410 }, { "epoch": 11.650081566068515, "grad_norm": 0.122139111161232, "learning_rate": 2.217027326743161e-05, "loss": 0.0388, "num_input_tokens_seen": 154097344, "step": 71415 }, { "epoch": 11.65089722675367, "grad_norm": 0.1447196751832962, "learning_rate": 2.216673717695729e-05, "loss": 0.0193, "num_input_tokens_seen": 154108128, "step": 71420 }, { "epoch": 11.651712887438826, "grad_norm": 0.0458432175219059, "learning_rate": 2.2163201143902645e-05, "loss": 0.0543, "num_input_tokens_seen": 154119264, "step": 71425 }, { "epoch": 11.65252854812398, "grad_norm": 0.7194188237190247, "learning_rate": 2.2159665168339316e-05, "loss": 0.0564, "num_input_tokens_seen": 154130304, "step": 71430 }, { "epoch": 11.653344208809136, "grad_norm": 0.12153944373130798, "learning_rate": 2.215612925033898e-05, "loss": 0.1448, "num_input_tokens_seen": 154141664, "step": 71435 }, { "epoch": 11.65415986949429, "grad_norm": 0.4997771680355072, "learning_rate": 2.2152593389973294e-05, "loss": 0.0325, "num_input_tokens_seen": 154153536, "step": 71440 }, { "epoch": 11.654975530179446, "grad_norm": 0.05600794032216072, "learning_rate": 2.2149057587313916e-05, "loss": 0.1204, "num_input_tokens_seen": 154163936, "step": 71445 }, { "epoch": 11.655791190864601, "grad_norm": 0.1200203150510788, "learning_rate": 2.2145521842432502e-05, "loss": 0.0963, "num_input_tokens_seen": 154175520, "step": 71450 }, { "epoch": 11.656606851549755, "grad_norm": 0.08995931595563889, "learning_rate": 2.2141986155400707e-05, "loss": 0.0409, "num_input_tokens_seen": 154186976, "step": 71455 }, { "epoch": 11.65742251223491, "grad_norm": 1.1241157054901123, "learning_rate": 2.2138450526290184e-05, "loss": 0.069, "num_input_tokens_seen": 154198528, "step": 71460 }, { "epoch": 11.658238172920065, "grad_norm": 0.13394489884376526, "learning_rate": 2.2134914955172602e-05, "loss": 0.1368, "num_input_tokens_seen": 154209504, "step": 71465 }, { "epoch": 11.65905383360522, "grad_norm": 1.6833219528198242, "learning_rate": 2.2131379442119593e-05, "loss": 0.0513, "num_input_tokens_seen": 154219232, "step": 71470 }, { "epoch": 11.659869494290374, "grad_norm": 0.5416054725646973, "learning_rate": 2.212784398720283e-05, "loss": 0.1228, "num_input_tokens_seen": 154230528, "step": 71475 }, { "epoch": 11.66068515497553, "grad_norm": 0.06872204691171646, "learning_rate": 2.2124308590493938e-05, "loss": 0.0991, "num_input_tokens_seen": 154241568, "step": 71480 }, { "epoch": 11.661500815660686, "grad_norm": 0.994709849357605, "learning_rate": 2.2120773252064594e-05, "loss": 0.0937, "num_input_tokens_seen": 154249472, "step": 71485 }, { "epoch": 11.66231647634584, "grad_norm": 0.08537066727876663, "learning_rate": 2.211723797198642e-05, "loss": 0.0883, "num_input_tokens_seen": 154259872, "step": 71490 }, { "epoch": 11.663132137030995, "grad_norm": 0.3049289882183075, "learning_rate": 2.211370275033109e-05, "loss": 0.042, "num_input_tokens_seen": 154269792, "step": 71495 }, { "epoch": 11.66394779771615, "grad_norm": 0.0317438542842865, "learning_rate": 2.2110167587170222e-05, "loss": 0.0369, "num_input_tokens_seen": 154278336, "step": 71500 }, { "epoch": 11.664763458401305, "grad_norm": 1.097507357597351, "learning_rate": 2.2106632482575473e-05, "loss": 0.2016, "num_input_tokens_seen": 154289248, "step": 71505 }, { "epoch": 11.66557911908646, "grad_norm": 0.11923238635063171, "learning_rate": 2.21030974366185e-05, "loss": 0.1242, "num_input_tokens_seen": 154300608, "step": 71510 }, { "epoch": 11.666394779771615, "grad_norm": 0.17135946452617645, "learning_rate": 2.209956244937092e-05, "loss": 0.1874, "num_input_tokens_seen": 154310976, "step": 71515 }, { "epoch": 11.66721044045677, "grad_norm": 0.8062114715576172, "learning_rate": 2.20960275209044e-05, "loss": 0.0902, "num_input_tokens_seen": 154321760, "step": 71520 }, { "epoch": 11.668026101141924, "grad_norm": 0.67345130443573, "learning_rate": 2.2092492651290552e-05, "loss": 0.1024, "num_input_tokens_seen": 154333216, "step": 71525 }, { "epoch": 11.66884176182708, "grad_norm": 0.6466853022575378, "learning_rate": 2.2088957840601044e-05, "loss": 0.0903, "num_input_tokens_seen": 154344352, "step": 71530 }, { "epoch": 11.669657422512234, "grad_norm": 0.06418163329362869, "learning_rate": 2.2085423088907485e-05, "loss": 0.0962, "num_input_tokens_seen": 154353984, "step": 71535 }, { "epoch": 11.67047308319739, "grad_norm": 1.0835708379745483, "learning_rate": 2.2081888396281532e-05, "loss": 0.0606, "num_input_tokens_seen": 154364512, "step": 71540 }, { "epoch": 11.671288743882545, "grad_norm": 0.1695062816143036, "learning_rate": 2.2078353762794806e-05, "loss": 0.0801, "num_input_tokens_seen": 154374368, "step": 71545 }, { "epoch": 11.6721044045677, "grad_norm": 1.770332932472229, "learning_rate": 2.207481918851896e-05, "loss": 0.1301, "num_input_tokens_seen": 154385728, "step": 71550 }, { "epoch": 11.672920065252855, "grad_norm": 0.5528629422187805, "learning_rate": 2.2071284673525596e-05, "loss": 0.036, "num_input_tokens_seen": 154397504, "step": 71555 }, { "epoch": 11.673735725938009, "grad_norm": 0.3673337399959564, "learning_rate": 2.206775021788638e-05, "loss": 0.0859, "num_input_tokens_seen": 154408576, "step": 71560 }, { "epoch": 11.674551386623165, "grad_norm": 0.13144774734973907, "learning_rate": 2.2064215821672912e-05, "loss": 0.1369, "num_input_tokens_seen": 154419904, "step": 71565 }, { "epoch": 11.67536704730832, "grad_norm": 1.1563478708267212, "learning_rate": 2.206068148495684e-05, "loss": 0.0604, "num_input_tokens_seen": 154431776, "step": 71570 }, { "epoch": 11.676182707993474, "grad_norm": 0.04347466677427292, "learning_rate": 2.205714720780979e-05, "loss": 0.0824, "num_input_tokens_seen": 154443040, "step": 71575 }, { "epoch": 11.67699836867863, "grad_norm": 0.23658044636249542, "learning_rate": 2.205361299030338e-05, "loss": 0.0693, "num_input_tokens_seen": 154453920, "step": 71580 }, { "epoch": 11.677814029363784, "grad_norm": 0.458448201417923, "learning_rate": 2.2050078832509247e-05, "loss": 0.0593, "num_input_tokens_seen": 154465504, "step": 71585 }, { "epoch": 11.67862969004894, "grad_norm": 0.061862193048000336, "learning_rate": 2.2046544734499003e-05, "loss": 0.0069, "num_input_tokens_seen": 154476096, "step": 71590 }, { "epoch": 11.679445350734095, "grad_norm": 0.06035364791750908, "learning_rate": 2.204301069634428e-05, "loss": 0.1882, "num_input_tokens_seen": 154487200, "step": 71595 }, { "epoch": 11.68026101141925, "grad_norm": 1.6845651865005493, "learning_rate": 2.2039476718116693e-05, "loss": 0.2881, "num_input_tokens_seen": 154498784, "step": 71600 }, { "epoch": 11.681076672104405, "grad_norm": 0.1830216944217682, "learning_rate": 2.2035942799887864e-05, "loss": 0.0817, "num_input_tokens_seen": 154508608, "step": 71605 }, { "epoch": 11.681892332789559, "grad_norm": 0.08768809586763382, "learning_rate": 2.203240894172942e-05, "loss": 0.0913, "num_input_tokens_seen": 154519488, "step": 71610 }, { "epoch": 11.682707993474715, "grad_norm": 0.13853690028190613, "learning_rate": 2.202887514371297e-05, "loss": 0.0249, "num_input_tokens_seen": 154530272, "step": 71615 }, { "epoch": 11.68352365415987, "grad_norm": 1.7435916662216187, "learning_rate": 2.2025341405910138e-05, "loss": 0.1455, "num_input_tokens_seen": 154541344, "step": 71620 }, { "epoch": 11.684339314845024, "grad_norm": 0.036040835082530975, "learning_rate": 2.202180772839253e-05, "loss": 0.074, "num_input_tokens_seen": 154552000, "step": 71625 }, { "epoch": 11.68515497553018, "grad_norm": 1.8276020288467407, "learning_rate": 2.2018274111231768e-05, "loss": 0.2445, "num_input_tokens_seen": 154562880, "step": 71630 }, { "epoch": 11.685970636215334, "grad_norm": 0.3641231954097748, "learning_rate": 2.201474055449947e-05, "loss": 0.0301, "num_input_tokens_seen": 154573760, "step": 71635 }, { "epoch": 11.68678629690049, "grad_norm": 1.5511198043823242, "learning_rate": 2.2011207058267237e-05, "loss": 0.123, "num_input_tokens_seen": 154584832, "step": 71640 }, { "epoch": 11.687601957585644, "grad_norm": 0.1494094282388687, "learning_rate": 2.2007673622606688e-05, "loss": 0.135, "num_input_tokens_seen": 154594816, "step": 71645 }, { "epoch": 11.6884176182708, "grad_norm": 1.9654464721679688, "learning_rate": 2.2004140247589422e-05, "loss": 0.1418, "num_input_tokens_seen": 154605216, "step": 71650 }, { "epoch": 11.689233278955955, "grad_norm": 0.2310582995414734, "learning_rate": 2.2000606933287065e-05, "loss": 0.1199, "num_input_tokens_seen": 154616736, "step": 71655 }, { "epoch": 11.690048939641109, "grad_norm": 1.0282363891601562, "learning_rate": 2.19970736797712e-05, "loss": 0.1304, "num_input_tokens_seen": 154627744, "step": 71660 }, { "epoch": 11.690864600326265, "grad_norm": 0.3990475833415985, "learning_rate": 2.199354048711346e-05, "loss": 0.1252, "num_input_tokens_seen": 154638752, "step": 71665 }, { "epoch": 11.691680261011419, "grad_norm": 0.18896083533763885, "learning_rate": 2.199000735538543e-05, "loss": 0.035, "num_input_tokens_seen": 154649504, "step": 71670 }, { "epoch": 11.692495921696574, "grad_norm": 0.2734231650829315, "learning_rate": 2.1986474284658723e-05, "loss": 0.0632, "num_input_tokens_seen": 154659808, "step": 71675 }, { "epoch": 11.69331158238173, "grad_norm": 0.06812288612127304, "learning_rate": 2.1982941275004928e-05, "loss": 0.0304, "num_input_tokens_seen": 154670336, "step": 71680 }, { "epoch": 11.694127243066884, "grad_norm": 0.036676883697509766, "learning_rate": 2.1979408326495667e-05, "loss": 0.0744, "num_input_tokens_seen": 154680576, "step": 71685 }, { "epoch": 11.69494290375204, "grad_norm": 0.0743834525346756, "learning_rate": 2.1975875439202517e-05, "loss": 0.0766, "num_input_tokens_seen": 154692128, "step": 71690 }, { "epoch": 11.695758564437194, "grad_norm": 0.12382251024246216, "learning_rate": 2.19723426131971e-05, "loss": 0.0551, "num_input_tokens_seen": 154703424, "step": 71695 }, { "epoch": 11.69657422512235, "grad_norm": 0.2780958414077759, "learning_rate": 2.1968809848550986e-05, "loss": 0.0404, "num_input_tokens_seen": 154713056, "step": 71700 }, { "epoch": 11.697389885807503, "grad_norm": 0.1836766004562378, "learning_rate": 2.19652771453358e-05, "loss": 0.1396, "num_input_tokens_seen": 154723872, "step": 71705 }, { "epoch": 11.698205546492659, "grad_norm": 0.03601265698671341, "learning_rate": 2.196174450362311e-05, "loss": 0.1451, "num_input_tokens_seen": 154734304, "step": 71710 }, { "epoch": 11.699021207177815, "grad_norm": 0.09719846397638321, "learning_rate": 2.1958211923484524e-05, "loss": 0.0117, "num_input_tokens_seen": 154745888, "step": 71715 }, { "epoch": 11.699836867862969, "grad_norm": 1.8620809316635132, "learning_rate": 2.1954679404991644e-05, "loss": 0.2277, "num_input_tokens_seen": 154756544, "step": 71720 }, { "epoch": 11.700652528548124, "grad_norm": 0.527864933013916, "learning_rate": 2.1951146948216033e-05, "loss": 0.0204, "num_input_tokens_seen": 154768096, "step": 71725 }, { "epoch": 11.701468189233278, "grad_norm": 0.5303065180778503, "learning_rate": 2.194761455322931e-05, "loss": 0.0274, "num_input_tokens_seen": 154779296, "step": 71730 }, { "epoch": 11.702283849918434, "grad_norm": 1.0498732328414917, "learning_rate": 2.1944082220103042e-05, "loss": 0.0363, "num_input_tokens_seen": 154789632, "step": 71735 }, { "epoch": 11.70309951060359, "grad_norm": 1.287557601928711, "learning_rate": 2.1940549948908827e-05, "loss": 0.0672, "num_input_tokens_seen": 154800576, "step": 71740 }, { "epoch": 11.703915171288743, "grad_norm": 0.06512561440467834, "learning_rate": 2.193701773971825e-05, "loss": 0.1128, "num_input_tokens_seen": 154811328, "step": 71745 }, { "epoch": 11.7047308319739, "grad_norm": 2.2657628059387207, "learning_rate": 2.1933485592602896e-05, "loss": 0.2697, "num_input_tokens_seen": 154822208, "step": 71750 }, { "epoch": 11.705546492659053, "grad_norm": 1.7862155437469482, "learning_rate": 2.1929953507634343e-05, "loss": 0.1274, "num_input_tokens_seen": 154833088, "step": 71755 }, { "epoch": 11.706362153344209, "grad_norm": 1.0197663307189941, "learning_rate": 2.1926421484884178e-05, "loss": 0.1699, "num_input_tokens_seen": 154844288, "step": 71760 }, { "epoch": 11.707177814029365, "grad_norm": 2.307173490524292, "learning_rate": 2.1922889524423976e-05, "loss": 0.1409, "num_input_tokens_seen": 154855552, "step": 71765 }, { "epoch": 11.707993474714518, "grad_norm": 0.05354458838701248, "learning_rate": 2.191935762632533e-05, "loss": 0.042, "num_input_tokens_seen": 154866656, "step": 71770 }, { "epoch": 11.708809135399674, "grad_norm": 0.08509000390768051, "learning_rate": 2.1915825790659804e-05, "loss": 0.1046, "num_input_tokens_seen": 154877600, "step": 71775 }, { "epoch": 11.709624796084828, "grad_norm": 2.414236068725586, "learning_rate": 2.191229401749899e-05, "loss": 0.2246, "num_input_tokens_seen": 154889312, "step": 71780 }, { "epoch": 11.710440456769984, "grad_norm": 1.9786494970321655, "learning_rate": 2.1908762306914442e-05, "loss": 0.1412, "num_input_tokens_seen": 154900736, "step": 71785 }, { "epoch": 11.71125611745514, "grad_norm": 0.07262859493494034, "learning_rate": 2.190523065897776e-05, "loss": 0.0981, "num_input_tokens_seen": 154911552, "step": 71790 }, { "epoch": 11.712071778140293, "grad_norm": 0.0245384331792593, "learning_rate": 2.190169907376049e-05, "loss": 0.0363, "num_input_tokens_seen": 154922496, "step": 71795 }, { "epoch": 11.71288743882545, "grad_norm": 0.15329650044441223, "learning_rate": 2.189816755133423e-05, "loss": 0.1822, "num_input_tokens_seen": 154933504, "step": 71800 }, { "epoch": 11.713703099510603, "grad_norm": 0.5817785263061523, "learning_rate": 2.189463609177053e-05, "loss": 0.0256, "num_input_tokens_seen": 154944256, "step": 71805 }, { "epoch": 11.714518760195759, "grad_norm": 1.91278076171875, "learning_rate": 2.1891104695140982e-05, "loss": 0.12, "num_input_tokens_seen": 154953824, "step": 71810 }, { "epoch": 11.715334420880914, "grad_norm": 0.16218820214271545, "learning_rate": 2.1887573361517128e-05, "loss": 0.1228, "num_input_tokens_seen": 154964192, "step": 71815 }, { "epoch": 11.716150081566068, "grad_norm": 0.5790812373161316, "learning_rate": 2.188404209097055e-05, "loss": 0.1148, "num_input_tokens_seen": 154975456, "step": 71820 }, { "epoch": 11.716965742251224, "grad_norm": 1.1949416399002075, "learning_rate": 2.1880510883572823e-05, "loss": 0.0608, "num_input_tokens_seen": 154986304, "step": 71825 }, { "epoch": 11.717781402936378, "grad_norm": 1.3186224699020386, "learning_rate": 2.1876979739395487e-05, "loss": 0.2262, "num_input_tokens_seen": 154997024, "step": 71830 }, { "epoch": 11.718597063621534, "grad_norm": 1.1592273712158203, "learning_rate": 2.1873448658510133e-05, "loss": 0.0827, "num_input_tokens_seen": 155009312, "step": 71835 }, { "epoch": 11.719412724306688, "grad_norm": 0.5764561295509338, "learning_rate": 2.1869917640988295e-05, "loss": 0.1549, "num_input_tokens_seen": 155019008, "step": 71840 }, { "epoch": 11.720228384991843, "grad_norm": 0.04367675259709358, "learning_rate": 2.1866386686901555e-05, "loss": 0.0531, "num_input_tokens_seen": 155029824, "step": 71845 }, { "epoch": 11.721044045676999, "grad_norm": 0.8123282194137573, "learning_rate": 2.1862855796321458e-05, "loss": 0.1109, "num_input_tokens_seen": 155039872, "step": 71850 }, { "epoch": 11.721859706362153, "grad_norm": 0.12883858382701874, "learning_rate": 2.1859324969319577e-05, "loss": 0.0408, "num_input_tokens_seen": 155050208, "step": 71855 }, { "epoch": 11.722675367047309, "grad_norm": 0.031155744567513466, "learning_rate": 2.1855794205967447e-05, "loss": 0.0486, "num_input_tokens_seen": 155060288, "step": 71860 }, { "epoch": 11.723491027732463, "grad_norm": 1.651487946510315, "learning_rate": 2.1852263506336648e-05, "loss": 0.1107, "num_input_tokens_seen": 155071552, "step": 71865 }, { "epoch": 11.724306688417618, "grad_norm": 1.0011276006698608, "learning_rate": 2.184873287049871e-05, "loss": 0.1843, "num_input_tokens_seen": 155083104, "step": 71870 }, { "epoch": 11.725122349102774, "grad_norm": 1.3598849773406982, "learning_rate": 2.1845202298525213e-05, "loss": 0.1267, "num_input_tokens_seen": 155093824, "step": 71875 }, { "epoch": 11.725938009787928, "grad_norm": 1.8257702589035034, "learning_rate": 2.184167179048768e-05, "loss": 0.1414, "num_input_tokens_seen": 155105728, "step": 71880 }, { "epoch": 11.726753670473084, "grad_norm": 1.2053662538528442, "learning_rate": 2.1838141346457684e-05, "loss": 0.062, "num_input_tokens_seen": 155116352, "step": 71885 }, { "epoch": 11.727569331158238, "grad_norm": 1.6391335725784302, "learning_rate": 2.183461096650676e-05, "loss": 0.1967, "num_input_tokens_seen": 155126528, "step": 71890 }, { "epoch": 11.728384991843393, "grad_norm": 1.214876651763916, "learning_rate": 2.1831080650706462e-05, "loss": 0.0731, "num_input_tokens_seen": 155137664, "step": 71895 }, { "epoch": 11.729200652528547, "grad_norm": 0.20922371745109558, "learning_rate": 2.1827550399128336e-05, "loss": 0.0496, "num_input_tokens_seen": 155148128, "step": 71900 }, { "epoch": 11.730016313213703, "grad_norm": 2.5136637687683105, "learning_rate": 2.1824020211843924e-05, "loss": 0.1945, "num_input_tokens_seen": 155159648, "step": 71905 }, { "epoch": 11.730831973898859, "grad_norm": 0.13368813693523407, "learning_rate": 2.1820490088924766e-05, "loss": 0.1404, "num_input_tokens_seen": 155171872, "step": 71910 }, { "epoch": 11.731647634584013, "grad_norm": 0.2741169035434723, "learning_rate": 2.181696003044242e-05, "loss": 0.0586, "num_input_tokens_seen": 155183104, "step": 71915 }, { "epoch": 11.732463295269168, "grad_norm": 0.03828870505094528, "learning_rate": 2.1813430036468406e-05, "loss": 0.0107, "num_input_tokens_seen": 155194752, "step": 71920 }, { "epoch": 11.733278955954322, "grad_norm": 0.30195772647857666, "learning_rate": 2.1809900107074288e-05, "loss": 0.0802, "num_input_tokens_seen": 155206336, "step": 71925 }, { "epoch": 11.734094616639478, "grad_norm": 0.21147361397743225, "learning_rate": 2.180637024233158e-05, "loss": 0.1024, "num_input_tokens_seen": 155217440, "step": 71930 }, { "epoch": 11.734910277324634, "grad_norm": 0.05096254125237465, "learning_rate": 2.1802840442311827e-05, "loss": 0.0711, "num_input_tokens_seen": 155229056, "step": 71935 }, { "epoch": 11.735725938009788, "grad_norm": 1.0018776655197144, "learning_rate": 2.1799310707086584e-05, "loss": 0.0646, "num_input_tokens_seen": 155240672, "step": 71940 }, { "epoch": 11.736541598694943, "grad_norm": 1.2574772834777832, "learning_rate": 2.1795781036727356e-05, "loss": 0.2504, "num_input_tokens_seen": 155250464, "step": 71945 }, { "epoch": 11.737357259380097, "grad_norm": 0.09322245419025421, "learning_rate": 2.17922514313057e-05, "loss": 0.0306, "num_input_tokens_seen": 155260384, "step": 71950 }, { "epoch": 11.738172920065253, "grad_norm": 0.8025752305984497, "learning_rate": 2.178872189089313e-05, "loss": 0.1377, "num_input_tokens_seen": 155270336, "step": 71955 }, { "epoch": 11.738988580750409, "grad_norm": 0.9805739521980286, "learning_rate": 2.1785192415561195e-05, "loss": 0.1004, "num_input_tokens_seen": 155281312, "step": 71960 }, { "epoch": 11.739804241435563, "grad_norm": 0.15525826811790466, "learning_rate": 2.1781663005381404e-05, "loss": 0.063, "num_input_tokens_seen": 155293312, "step": 71965 }, { "epoch": 11.740619902120718, "grad_norm": 0.9560561776161194, "learning_rate": 2.1778133660425305e-05, "loss": 0.1259, "num_input_tokens_seen": 155305184, "step": 71970 }, { "epoch": 11.741435562805872, "grad_norm": 0.176616832613945, "learning_rate": 2.17746043807644e-05, "loss": 0.0466, "num_input_tokens_seen": 155314560, "step": 71975 }, { "epoch": 11.742251223491028, "grad_norm": 0.7417149543762207, "learning_rate": 2.1771075166470245e-05, "loss": 0.1214, "num_input_tokens_seen": 155324832, "step": 71980 }, { "epoch": 11.743066884176184, "grad_norm": 0.05028005689382553, "learning_rate": 2.1767546017614335e-05, "loss": 0.0117, "num_input_tokens_seen": 155334720, "step": 71985 }, { "epoch": 11.743882544861338, "grad_norm": 0.480392724275589, "learning_rate": 2.1764016934268217e-05, "loss": 0.0209, "num_input_tokens_seen": 155345280, "step": 71990 }, { "epoch": 11.744698205546493, "grad_norm": 1.8202316761016846, "learning_rate": 2.176048791650339e-05, "loss": 0.1216, "num_input_tokens_seen": 155357184, "step": 71995 }, { "epoch": 11.745513866231647, "grad_norm": 0.17430654168128967, "learning_rate": 2.1756958964391395e-05, "loss": 0.0831, "num_input_tokens_seen": 155368352, "step": 72000 }, { "epoch": 11.746329526916803, "grad_norm": 0.4297568202018738, "learning_rate": 2.1753430078003728e-05, "loss": 0.1561, "num_input_tokens_seen": 155378528, "step": 72005 }, { "epoch": 11.747145187601957, "grad_norm": 0.8241751194000244, "learning_rate": 2.1749901257411933e-05, "loss": 0.0481, "num_input_tokens_seen": 155389408, "step": 72010 }, { "epoch": 11.747960848287113, "grad_norm": 1.2295210361480713, "learning_rate": 2.17463725026875e-05, "loss": 0.2501, "num_input_tokens_seen": 155400864, "step": 72015 }, { "epoch": 11.748776508972268, "grad_norm": 2.7945940494537354, "learning_rate": 2.1742843813901967e-05, "loss": 0.1164, "num_input_tokens_seen": 155410720, "step": 72020 }, { "epoch": 11.749592169657422, "grad_norm": 0.886651337146759, "learning_rate": 2.1739315191126823e-05, "loss": 0.0701, "num_input_tokens_seen": 155421536, "step": 72025 }, { "epoch": 11.750407830342578, "grad_norm": 1.4198424816131592, "learning_rate": 2.17357866344336e-05, "loss": 0.0882, "num_input_tokens_seen": 155431744, "step": 72030 }, { "epoch": 11.751223491027732, "grad_norm": 1.7454309463500977, "learning_rate": 2.173225814389381e-05, "loss": 0.2312, "num_input_tokens_seen": 155442464, "step": 72035 }, { "epoch": 11.752039151712887, "grad_norm": 0.8966797590255737, "learning_rate": 2.1728729719578938e-05, "loss": 0.1294, "num_input_tokens_seen": 155454016, "step": 72040 }, { "epoch": 11.752854812398043, "grad_norm": 0.024963712319731712, "learning_rate": 2.1725201361560522e-05, "loss": 0.0607, "num_input_tokens_seen": 155465088, "step": 72045 }, { "epoch": 11.753670473083197, "grad_norm": 0.09828583896160126, "learning_rate": 2.1721673069910042e-05, "loss": 0.0432, "num_input_tokens_seen": 155474624, "step": 72050 }, { "epoch": 11.754486133768353, "grad_norm": 0.9671932458877563, "learning_rate": 2.1718144844699028e-05, "loss": 0.1017, "num_input_tokens_seen": 155484960, "step": 72055 }, { "epoch": 11.755301794453507, "grad_norm": 0.19198890030384064, "learning_rate": 2.1714616685998964e-05, "loss": 0.0343, "num_input_tokens_seen": 155495264, "step": 72060 }, { "epoch": 11.756117455138662, "grad_norm": 0.4486143887042999, "learning_rate": 2.1711088593881366e-05, "loss": 0.0747, "num_input_tokens_seen": 155506656, "step": 72065 }, { "epoch": 11.756933115823816, "grad_norm": 0.15413445234298706, "learning_rate": 2.1707560568417728e-05, "loss": 0.0188, "num_input_tokens_seen": 155517664, "step": 72070 }, { "epoch": 11.757748776508972, "grad_norm": 0.3020089864730835, "learning_rate": 2.1704032609679552e-05, "loss": 0.142, "num_input_tokens_seen": 155528480, "step": 72075 }, { "epoch": 11.758564437194128, "grad_norm": 0.03300056234002113, "learning_rate": 2.1700504717738332e-05, "loss": 0.1846, "num_input_tokens_seen": 155539072, "step": 72080 }, { "epoch": 11.759380097879282, "grad_norm": 0.035258591175079346, "learning_rate": 2.169697689266558e-05, "loss": 0.0081, "num_input_tokens_seen": 155548768, "step": 72085 }, { "epoch": 11.760195758564437, "grad_norm": 0.6669556498527527, "learning_rate": 2.169344913453277e-05, "loss": 0.1209, "num_input_tokens_seen": 155560320, "step": 72090 }, { "epoch": 11.761011419249591, "grad_norm": 1.4958211183547974, "learning_rate": 2.168992144341142e-05, "loss": 0.1846, "num_input_tokens_seen": 155570368, "step": 72095 }, { "epoch": 11.761827079934747, "grad_norm": 1.358277440071106, "learning_rate": 2.1686393819372995e-05, "loss": 0.1208, "num_input_tokens_seen": 155579808, "step": 72100 }, { "epoch": 11.762642740619903, "grad_norm": 0.7783116698265076, "learning_rate": 2.168286626248902e-05, "loss": 0.0296, "num_input_tokens_seen": 155590304, "step": 72105 }, { "epoch": 11.763458401305057, "grad_norm": 0.6728605628013611, "learning_rate": 2.1679338772830955e-05, "loss": 0.38, "num_input_tokens_seen": 155602336, "step": 72110 }, { "epoch": 11.764274061990212, "grad_norm": 0.19064126908779144, "learning_rate": 2.1675811350470314e-05, "loss": 0.1327, "num_input_tokens_seen": 155613376, "step": 72115 }, { "epoch": 11.765089722675366, "grad_norm": 1.7454179525375366, "learning_rate": 2.167228399547856e-05, "loss": 0.1254, "num_input_tokens_seen": 155624992, "step": 72120 }, { "epoch": 11.765905383360522, "grad_norm": 0.03442038968205452, "learning_rate": 2.1668756707927208e-05, "loss": 0.0352, "num_input_tokens_seen": 155635904, "step": 72125 }, { "epoch": 11.766721044045678, "grad_norm": 1.353100061416626, "learning_rate": 2.166522948788771e-05, "loss": 0.1803, "num_input_tokens_seen": 155646624, "step": 72130 }, { "epoch": 11.767536704730832, "grad_norm": 0.5936353206634521, "learning_rate": 2.166170233543158e-05, "loss": 0.0462, "num_input_tokens_seen": 155656384, "step": 72135 }, { "epoch": 11.768352365415987, "grad_norm": 0.23048067092895508, "learning_rate": 2.165817525063028e-05, "loss": 0.1074, "num_input_tokens_seen": 155667264, "step": 72140 }, { "epoch": 11.769168026101141, "grad_norm": 0.1819506734609604, "learning_rate": 2.1654648233555297e-05, "loss": 0.0582, "num_input_tokens_seen": 155677024, "step": 72145 }, { "epoch": 11.769983686786297, "grad_norm": 0.06574428081512451, "learning_rate": 2.165112128427812e-05, "loss": 0.1273, "num_input_tokens_seen": 155688640, "step": 72150 }, { "epoch": 11.770799347471453, "grad_norm": 0.1374538689851761, "learning_rate": 2.1647594402870208e-05, "loss": 0.0313, "num_input_tokens_seen": 155698592, "step": 72155 }, { "epoch": 11.771615008156607, "grad_norm": 1.4198116064071655, "learning_rate": 2.164406758940306e-05, "loss": 0.1777, "num_input_tokens_seen": 155710144, "step": 72160 }, { "epoch": 11.772430668841762, "grad_norm": 0.035795364528894424, "learning_rate": 2.1640540843948124e-05, "loss": 0.0574, "num_input_tokens_seen": 155720192, "step": 72165 }, { "epoch": 11.773246329526916, "grad_norm": 0.30315157771110535, "learning_rate": 2.1637014166576908e-05, "loss": 0.1142, "num_input_tokens_seen": 155731040, "step": 72170 }, { "epoch": 11.774061990212072, "grad_norm": 0.1948399394750595, "learning_rate": 2.1633487557360852e-05, "loss": 0.1109, "num_input_tokens_seen": 155740736, "step": 72175 }, { "epoch": 11.774877650897226, "grad_norm": 2.3523988723754883, "learning_rate": 2.1629961016371454e-05, "loss": 0.1967, "num_input_tokens_seen": 155751840, "step": 72180 }, { "epoch": 11.775693311582382, "grad_norm": 0.6412901282310486, "learning_rate": 2.162643454368016e-05, "loss": 0.0452, "num_input_tokens_seen": 155762912, "step": 72185 }, { "epoch": 11.776508972267537, "grad_norm": 0.11217783391475677, "learning_rate": 2.1622908139358456e-05, "loss": 0.0643, "num_input_tokens_seen": 155774688, "step": 72190 }, { "epoch": 11.777324632952691, "grad_norm": 1.2600609064102173, "learning_rate": 2.1619381803477795e-05, "loss": 0.0366, "num_input_tokens_seen": 155785760, "step": 72195 }, { "epoch": 11.778140293637847, "grad_norm": 0.7759982943534851, "learning_rate": 2.1615855536109654e-05, "loss": 0.0282, "num_input_tokens_seen": 155796256, "step": 72200 }, { "epoch": 11.778955954323001, "grad_norm": 0.9377126693725586, "learning_rate": 2.1612329337325497e-05, "loss": 0.0213, "num_input_tokens_seen": 155806080, "step": 72205 }, { "epoch": 11.779771615008157, "grad_norm": 0.8110496401786804, "learning_rate": 2.1608803207196782e-05, "loss": 0.0304, "num_input_tokens_seen": 155816288, "step": 72210 }, { "epoch": 11.780587275693312, "grad_norm": 0.020884698256850243, "learning_rate": 2.160527714579497e-05, "loss": 0.0269, "num_input_tokens_seen": 155827488, "step": 72215 }, { "epoch": 11.781402936378466, "grad_norm": 0.3171876072883606, "learning_rate": 2.1601751153191522e-05, "loss": 0.0827, "num_input_tokens_seen": 155838016, "step": 72220 }, { "epoch": 11.782218597063622, "grad_norm": 0.11925777047872543, "learning_rate": 2.1598225229457896e-05, "loss": 0.0863, "num_input_tokens_seen": 155848928, "step": 72225 }, { "epoch": 11.783034257748776, "grad_norm": 1.2906756401062012, "learning_rate": 2.1594699374665565e-05, "loss": 0.0974, "num_input_tokens_seen": 155860352, "step": 72230 }, { "epoch": 11.783849918433932, "grad_norm": 0.05222009867429733, "learning_rate": 2.1591173588885955e-05, "loss": 0.0825, "num_input_tokens_seen": 155871040, "step": 72235 }, { "epoch": 11.784665579119086, "grad_norm": 0.9452759623527527, "learning_rate": 2.158764787219055e-05, "loss": 0.0518, "num_input_tokens_seen": 155882048, "step": 72240 }, { "epoch": 11.785481239804241, "grad_norm": 1.0817322731018066, "learning_rate": 2.1584122224650775e-05, "loss": 0.0897, "num_input_tokens_seen": 155893600, "step": 72245 }, { "epoch": 11.786296900489397, "grad_norm": 0.9942450523376465, "learning_rate": 2.1580596646338103e-05, "loss": 0.1402, "num_input_tokens_seen": 155904832, "step": 72250 }, { "epoch": 11.78711256117455, "grad_norm": 1.004464030265808, "learning_rate": 2.1577071137323984e-05, "loss": 0.0914, "num_input_tokens_seen": 155915968, "step": 72255 }, { "epoch": 11.787928221859707, "grad_norm": 0.025694571435451508, "learning_rate": 2.1573545697679852e-05, "loss": 0.0751, "num_input_tokens_seen": 155927072, "step": 72260 }, { "epoch": 11.78874388254486, "grad_norm": 0.10167305916547775, "learning_rate": 2.157002032747717e-05, "loss": 0.0962, "num_input_tokens_seen": 155936384, "step": 72265 }, { "epoch": 11.789559543230016, "grad_norm": 0.21725936233997345, "learning_rate": 2.1566495026787374e-05, "loss": 0.0812, "num_input_tokens_seen": 155947744, "step": 72270 }, { "epoch": 11.790375203915172, "grad_norm": 0.5919586420059204, "learning_rate": 2.1562969795681916e-05, "loss": 0.1541, "num_input_tokens_seen": 155957536, "step": 72275 }, { "epoch": 11.791190864600326, "grad_norm": 0.11330591887235641, "learning_rate": 2.1559444634232226e-05, "loss": 0.0682, "num_input_tokens_seen": 155969376, "step": 72280 }, { "epoch": 11.792006525285482, "grad_norm": 0.7995628118515015, "learning_rate": 2.1555919542509766e-05, "loss": 0.1099, "num_input_tokens_seen": 155980032, "step": 72285 }, { "epoch": 11.792822185970635, "grad_norm": 1.8165888786315918, "learning_rate": 2.1552394520585956e-05, "loss": 0.1062, "num_input_tokens_seen": 155990560, "step": 72290 }, { "epoch": 11.793637846655791, "grad_norm": 0.3118176758289337, "learning_rate": 2.1548869568532256e-05, "loss": 0.0212, "num_input_tokens_seen": 156000320, "step": 72295 }, { "epoch": 11.794453507340947, "grad_norm": 0.5182856917381287, "learning_rate": 2.154534468642008e-05, "loss": 0.0394, "num_input_tokens_seen": 156011488, "step": 72300 }, { "epoch": 11.7952691680261, "grad_norm": 0.07705949991941452, "learning_rate": 2.154181987432089e-05, "loss": 0.04, "num_input_tokens_seen": 156022816, "step": 72305 }, { "epoch": 11.796084828711257, "grad_norm": 0.9650600552558899, "learning_rate": 2.1538295132306092e-05, "loss": 0.0437, "num_input_tokens_seen": 156033824, "step": 72310 }, { "epoch": 11.79690048939641, "grad_norm": 0.4057360589504242, "learning_rate": 2.1534770460447148e-05, "loss": 0.1962, "num_input_tokens_seen": 156045088, "step": 72315 }, { "epoch": 11.797716150081566, "grad_norm": 0.12706927955150604, "learning_rate": 2.1531245858815465e-05, "loss": 0.0152, "num_input_tokens_seen": 156055616, "step": 72320 }, { "epoch": 11.798531810766722, "grad_norm": 3.458996057510376, "learning_rate": 2.15277213274825e-05, "loss": 0.2019, "num_input_tokens_seen": 156066528, "step": 72325 }, { "epoch": 11.799347471451876, "grad_norm": 0.7829803228378296, "learning_rate": 2.152419686651965e-05, "loss": 0.026, "num_input_tokens_seen": 156077984, "step": 72330 }, { "epoch": 11.800163132137031, "grad_norm": 2.9794728755950928, "learning_rate": 2.1520672475998373e-05, "loss": 0.172, "num_input_tokens_seen": 156088032, "step": 72335 }, { "epoch": 11.800978792822185, "grad_norm": 0.02395249530673027, "learning_rate": 2.151714815599007e-05, "loss": 0.1907, "num_input_tokens_seen": 156097728, "step": 72340 }, { "epoch": 11.801794453507341, "grad_norm": 1.2060054540634155, "learning_rate": 2.1513623906566187e-05, "loss": 0.2369, "num_input_tokens_seen": 156107712, "step": 72345 }, { "epoch": 11.802610114192497, "grad_norm": 0.1110033169388771, "learning_rate": 2.1510099727798135e-05, "loss": 0.0165, "num_input_tokens_seen": 156118912, "step": 72350 }, { "epoch": 11.80342577487765, "grad_norm": 0.12512585520744324, "learning_rate": 2.1506575619757335e-05, "loss": 0.0593, "num_input_tokens_seen": 156130048, "step": 72355 }, { "epoch": 11.804241435562806, "grad_norm": 0.22646427154541016, "learning_rate": 2.1503051582515224e-05, "loss": 0.0317, "num_input_tokens_seen": 156139840, "step": 72360 }, { "epoch": 11.80505709624796, "grad_norm": 0.24537383019924164, "learning_rate": 2.1499527616143194e-05, "loss": 0.0428, "num_input_tokens_seen": 156150816, "step": 72365 }, { "epoch": 11.805872756933116, "grad_norm": 0.0775226429104805, "learning_rate": 2.1496003720712688e-05, "loss": 0.1157, "num_input_tokens_seen": 156164160, "step": 72370 }, { "epoch": 11.80668841761827, "grad_norm": 0.46216467022895813, "learning_rate": 2.149247989629511e-05, "loss": 0.1293, "num_input_tokens_seen": 156174464, "step": 72375 }, { "epoch": 11.807504078303426, "grad_norm": 0.6570327877998352, "learning_rate": 2.1488956142961875e-05, "loss": 0.0444, "num_input_tokens_seen": 156185760, "step": 72380 }, { "epoch": 11.808319738988581, "grad_norm": 0.07721158862113953, "learning_rate": 2.14854324607844e-05, "loss": 0.1763, "num_input_tokens_seen": 156196128, "step": 72385 }, { "epoch": 11.809135399673735, "grad_norm": 0.3895094096660614, "learning_rate": 2.1481908849834092e-05, "loss": 0.11, "num_input_tokens_seen": 156205920, "step": 72390 }, { "epoch": 11.809951060358891, "grad_norm": 1.4356619119644165, "learning_rate": 2.147838531018236e-05, "loss": 0.0669, "num_input_tokens_seen": 156217184, "step": 72395 }, { "epoch": 11.810766721044045, "grad_norm": 0.07656122744083405, "learning_rate": 2.147486184190063e-05, "loss": 0.0353, "num_input_tokens_seen": 156227968, "step": 72400 }, { "epoch": 11.8115823817292, "grad_norm": 0.058505211025476456, "learning_rate": 2.1471338445060285e-05, "loss": 0.0956, "num_input_tokens_seen": 156239360, "step": 72405 }, { "epoch": 11.812398042414356, "grad_norm": 1.4795176982879639, "learning_rate": 2.146781511973275e-05, "loss": 0.1266, "num_input_tokens_seen": 156248928, "step": 72410 }, { "epoch": 11.81321370309951, "grad_norm": 1.254655122756958, "learning_rate": 2.1464291865989415e-05, "loss": 0.1564, "num_input_tokens_seen": 156259648, "step": 72415 }, { "epoch": 11.814029363784666, "grad_norm": 1.8028042316436768, "learning_rate": 2.1460768683901702e-05, "loss": 0.2172, "num_input_tokens_seen": 156269664, "step": 72420 }, { "epoch": 11.81484502446982, "grad_norm": 0.35112670063972473, "learning_rate": 2.145724557354099e-05, "loss": 0.1829, "num_input_tokens_seen": 156279552, "step": 72425 }, { "epoch": 11.815660685154976, "grad_norm": 1.7442626953125, "learning_rate": 2.1453722534978702e-05, "loss": 0.1513, "num_input_tokens_seen": 156290944, "step": 72430 }, { "epoch": 11.81647634584013, "grad_norm": 0.02948615700006485, "learning_rate": 2.1450199568286213e-05, "loss": 0.1661, "num_input_tokens_seen": 156301856, "step": 72435 }, { "epoch": 11.817292006525285, "grad_norm": 0.10124872624874115, "learning_rate": 2.1446676673534945e-05, "loss": 0.0643, "num_input_tokens_seen": 156313056, "step": 72440 }, { "epoch": 11.818107667210441, "grad_norm": 0.22210337221622467, "learning_rate": 2.144315385079627e-05, "loss": 0.0326, "num_input_tokens_seen": 156324224, "step": 72445 }, { "epoch": 11.818923327895595, "grad_norm": 0.6677703857421875, "learning_rate": 2.1439631100141607e-05, "loss": 0.0762, "num_input_tokens_seen": 156334944, "step": 72450 }, { "epoch": 11.81973898858075, "grad_norm": 0.3709598481655121, "learning_rate": 2.1436108421642327e-05, "loss": 0.019, "num_input_tokens_seen": 156345664, "step": 72455 }, { "epoch": 11.820554649265905, "grad_norm": 0.6490896940231323, "learning_rate": 2.1432585815369827e-05, "loss": 0.0332, "num_input_tokens_seen": 156357504, "step": 72460 }, { "epoch": 11.82137030995106, "grad_norm": 0.12364698201417923, "learning_rate": 2.1429063281395512e-05, "loss": 0.0347, "num_input_tokens_seen": 156366208, "step": 72465 }, { "epoch": 11.822185970636216, "grad_norm": 0.3269157111644745, "learning_rate": 2.142554081979075e-05, "loss": 0.0148, "num_input_tokens_seen": 156377376, "step": 72470 }, { "epoch": 11.82300163132137, "grad_norm": 0.01496219914406538, "learning_rate": 2.142201843062695e-05, "loss": 0.0221, "num_input_tokens_seen": 156387616, "step": 72475 }, { "epoch": 11.823817292006526, "grad_norm": 0.7331241965293884, "learning_rate": 2.1418496113975473e-05, "loss": 0.2087, "num_input_tokens_seen": 156396960, "step": 72480 }, { "epoch": 11.82463295269168, "grad_norm": 0.10415130108594894, "learning_rate": 2.141497386990773e-05, "loss": 0.0505, "num_input_tokens_seen": 156407104, "step": 72485 }, { "epoch": 11.825448613376835, "grad_norm": 1.8091648817062378, "learning_rate": 2.1411451698495074e-05, "loss": 0.1598, "num_input_tokens_seen": 156418784, "step": 72490 }, { "epoch": 11.826264274061991, "grad_norm": 2.283612012863159, "learning_rate": 2.1407929599808914e-05, "loss": 0.1627, "num_input_tokens_seen": 156428768, "step": 72495 }, { "epoch": 11.827079934747145, "grad_norm": 1.297397255897522, "learning_rate": 2.1404407573920608e-05, "loss": 0.1396, "num_input_tokens_seen": 156440448, "step": 72500 }, { "epoch": 11.8278955954323, "grad_norm": 0.791180431842804, "learning_rate": 2.1400885620901553e-05, "loss": 0.0228, "num_input_tokens_seen": 156452544, "step": 72505 }, { "epoch": 11.828711256117455, "grad_norm": 0.0690278485417366, "learning_rate": 2.1397363740823108e-05, "loss": 0.0723, "num_input_tokens_seen": 156462624, "step": 72510 }, { "epoch": 11.82952691680261, "grad_norm": 0.11320072412490845, "learning_rate": 2.139384193375666e-05, "loss": 0.2265, "num_input_tokens_seen": 156473088, "step": 72515 }, { "epoch": 11.830342577487766, "grad_norm": 0.16362887620925903, "learning_rate": 2.139032019977358e-05, "loss": 0.0176, "num_input_tokens_seen": 156483936, "step": 72520 }, { "epoch": 11.83115823817292, "grad_norm": 1.5470556020736694, "learning_rate": 2.1386798538945243e-05, "loss": 0.0856, "num_input_tokens_seen": 156495136, "step": 72525 }, { "epoch": 11.831973898858076, "grad_norm": 0.09813224524259567, "learning_rate": 2.1383276951343015e-05, "loss": 0.0111, "num_input_tokens_seen": 156505984, "step": 72530 }, { "epoch": 11.83278955954323, "grad_norm": 0.7180408835411072, "learning_rate": 2.137975543703827e-05, "loss": 0.0825, "num_input_tokens_seen": 156516352, "step": 72535 }, { "epoch": 11.833605220228385, "grad_norm": 0.6611080169677734, "learning_rate": 2.1376233996102364e-05, "loss": 0.0241, "num_input_tokens_seen": 156527040, "step": 72540 }, { "epoch": 11.83442088091354, "grad_norm": 1.8787225484848022, "learning_rate": 2.137271262860669e-05, "loss": 0.1146, "num_input_tokens_seen": 156538656, "step": 72545 }, { "epoch": 11.835236541598695, "grad_norm": 0.10267730057239532, "learning_rate": 2.136919133462258e-05, "loss": 0.1434, "num_input_tokens_seen": 156548000, "step": 72550 }, { "epoch": 11.83605220228385, "grad_norm": 0.1366681605577469, "learning_rate": 2.1365670114221426e-05, "loss": 0.131, "num_input_tokens_seen": 156559200, "step": 72555 }, { "epoch": 11.836867862969005, "grad_norm": 0.06741610914468765, "learning_rate": 2.136214896747457e-05, "loss": 0.0141, "num_input_tokens_seen": 156571232, "step": 72560 }, { "epoch": 11.83768352365416, "grad_norm": 0.03418995067477226, "learning_rate": 2.1358627894453375e-05, "loss": 0.0183, "num_input_tokens_seen": 156582016, "step": 72565 }, { "epoch": 11.838499184339314, "grad_norm": 0.5928446054458618, "learning_rate": 2.1355106895229218e-05, "loss": 0.1277, "num_input_tokens_seen": 156591136, "step": 72570 }, { "epoch": 11.83931484502447, "grad_norm": 0.018051112070679665, "learning_rate": 2.1351585969873433e-05, "loss": 0.0125, "num_input_tokens_seen": 156602176, "step": 72575 }, { "epoch": 11.840130505709626, "grad_norm": 0.14932465553283691, "learning_rate": 2.1348065118457396e-05, "loss": 0.0357, "num_input_tokens_seen": 156613760, "step": 72580 }, { "epoch": 11.84094616639478, "grad_norm": 0.1952497661113739, "learning_rate": 2.1344544341052445e-05, "loss": 0.0579, "num_input_tokens_seen": 156624192, "step": 72585 }, { "epoch": 11.841761827079935, "grad_norm": 0.047067370265722275, "learning_rate": 2.1341023637729948e-05, "loss": 0.0398, "num_input_tokens_seen": 156635232, "step": 72590 }, { "epoch": 11.84257748776509, "grad_norm": 0.8588661551475525, "learning_rate": 2.1337503008561238e-05, "loss": 0.0604, "num_input_tokens_seen": 156646144, "step": 72595 }, { "epoch": 11.843393148450245, "grad_norm": 0.776509702205658, "learning_rate": 2.133398245361769e-05, "loss": 0.2112, "num_input_tokens_seen": 156657056, "step": 72600 }, { "epoch": 11.844208809135399, "grad_norm": 0.6884551644325256, "learning_rate": 2.133046197297062e-05, "loss": 0.0584, "num_input_tokens_seen": 156667616, "step": 72605 }, { "epoch": 11.845024469820554, "grad_norm": 0.3213008642196655, "learning_rate": 2.1326941566691414e-05, "loss": 0.0836, "num_input_tokens_seen": 156679488, "step": 72610 }, { "epoch": 11.84584013050571, "grad_norm": 1.86896550655365, "learning_rate": 2.1323421234851378e-05, "loss": 0.0407, "num_input_tokens_seen": 156691136, "step": 72615 }, { "epoch": 11.846655791190864, "grad_norm": 0.2791233956813812, "learning_rate": 2.131990097752189e-05, "loss": 0.0716, "num_input_tokens_seen": 156703008, "step": 72620 }, { "epoch": 11.84747145187602, "grad_norm": 1.6661176681518555, "learning_rate": 2.1316380794774267e-05, "loss": 0.1695, "num_input_tokens_seen": 156713696, "step": 72625 }, { "epoch": 11.848287112561174, "grad_norm": 0.025937797501683235, "learning_rate": 2.1312860686679867e-05, "loss": 0.1049, "num_input_tokens_seen": 156725248, "step": 72630 }, { "epoch": 11.84910277324633, "grad_norm": 0.5306939482688904, "learning_rate": 2.1309340653310013e-05, "loss": 0.0206, "num_input_tokens_seen": 156736608, "step": 72635 }, { "epoch": 11.849918433931485, "grad_norm": 0.3216594457626343, "learning_rate": 2.1305820694736064e-05, "loss": 0.0867, "num_input_tokens_seen": 156747040, "step": 72640 }, { "epoch": 11.850734094616639, "grad_norm": 0.019796887412667274, "learning_rate": 2.1302300811029335e-05, "loss": 0.0166, "num_input_tokens_seen": 156758592, "step": 72645 }, { "epoch": 11.851549755301795, "grad_norm": 0.26427873969078064, "learning_rate": 2.129878100226118e-05, "loss": 0.106, "num_input_tokens_seen": 156768928, "step": 72650 }, { "epoch": 11.852365415986949, "grad_norm": 0.9606610536575317, "learning_rate": 2.1295261268502915e-05, "loss": 0.0355, "num_input_tokens_seen": 156779840, "step": 72655 }, { "epoch": 11.853181076672104, "grad_norm": 0.09921054542064667, "learning_rate": 2.1291741609825884e-05, "loss": 0.1253, "num_input_tokens_seen": 156791616, "step": 72660 }, { "epoch": 11.85399673735726, "grad_norm": 1.769745111465454, "learning_rate": 2.1288222026301414e-05, "loss": 0.276, "num_input_tokens_seen": 156801536, "step": 72665 }, { "epoch": 11.854812398042414, "grad_norm": 0.4961727559566498, "learning_rate": 2.1284702518000832e-05, "loss": 0.0551, "num_input_tokens_seen": 156812032, "step": 72670 }, { "epoch": 11.85562805872757, "grad_norm": 0.1374891996383667, "learning_rate": 2.128118308499547e-05, "loss": 0.0476, "num_input_tokens_seen": 156821920, "step": 72675 }, { "epoch": 11.856443719412724, "grad_norm": 0.40700531005859375, "learning_rate": 2.1277663727356638e-05, "loss": 0.1071, "num_input_tokens_seen": 156833056, "step": 72680 }, { "epoch": 11.85725938009788, "grad_norm": 0.6527365446090698, "learning_rate": 2.1274144445155686e-05, "loss": 0.0357, "num_input_tokens_seen": 156844192, "step": 72685 }, { "epoch": 11.858075040783035, "grad_norm": 0.15215502679347992, "learning_rate": 2.127062523846392e-05, "loss": 0.1629, "num_input_tokens_seen": 156855488, "step": 72690 }, { "epoch": 11.858890701468189, "grad_norm": 0.4347357451915741, "learning_rate": 2.1267106107352662e-05, "loss": 0.0355, "num_input_tokens_seen": 156866336, "step": 72695 }, { "epoch": 11.859706362153345, "grad_norm": 0.15616363286972046, "learning_rate": 2.1263587051893242e-05, "loss": 0.0112, "num_input_tokens_seen": 156876928, "step": 72700 }, { "epoch": 11.860522022838499, "grad_norm": 0.9713245630264282, "learning_rate": 2.1260068072156964e-05, "loss": 0.3339, "num_input_tokens_seen": 156886656, "step": 72705 }, { "epoch": 11.861337683523654, "grad_norm": 0.26892149448394775, "learning_rate": 2.1256549168215146e-05, "loss": 0.16, "num_input_tokens_seen": 156897120, "step": 72710 }, { "epoch": 11.86215334420881, "grad_norm": 0.0568917840719223, "learning_rate": 2.1253030340139118e-05, "loss": 0.078, "num_input_tokens_seen": 156907584, "step": 72715 }, { "epoch": 11.862969004893964, "grad_norm": 1.2847198247909546, "learning_rate": 2.1249511588000176e-05, "loss": 0.1519, "num_input_tokens_seen": 156918432, "step": 72720 }, { "epoch": 11.86378466557912, "grad_norm": 0.026531729847192764, "learning_rate": 2.124599291186965e-05, "loss": 0.1677, "num_input_tokens_seen": 156928640, "step": 72725 }, { "epoch": 11.864600326264274, "grad_norm": 1.2268834114074707, "learning_rate": 2.1242474311818826e-05, "loss": 0.0663, "num_input_tokens_seen": 156938528, "step": 72730 }, { "epoch": 11.86541598694943, "grad_norm": 0.05080350488424301, "learning_rate": 2.123895578791904e-05, "loss": 0.0377, "num_input_tokens_seen": 156948704, "step": 72735 }, { "epoch": 11.866231647634583, "grad_norm": 0.03570926561951637, "learning_rate": 2.1235437340241576e-05, "loss": 0.0565, "num_input_tokens_seen": 156959072, "step": 72740 }, { "epoch": 11.867047308319739, "grad_norm": 0.8024483919143677, "learning_rate": 2.1231918968857757e-05, "loss": 0.0566, "num_input_tokens_seen": 156969184, "step": 72745 }, { "epoch": 11.867862969004895, "grad_norm": 0.2969823181629181, "learning_rate": 2.1228400673838875e-05, "loss": 0.1089, "num_input_tokens_seen": 156980256, "step": 72750 }, { "epoch": 11.868678629690049, "grad_norm": 1.154237985610962, "learning_rate": 2.1224882455256246e-05, "loss": 0.1293, "num_input_tokens_seen": 156991872, "step": 72755 }, { "epoch": 11.869494290375204, "grad_norm": 1.0633134841918945, "learning_rate": 2.1221364313181154e-05, "loss": 0.243, "num_input_tokens_seen": 157002336, "step": 72760 }, { "epoch": 11.870309951060358, "grad_norm": 0.3473311960697174, "learning_rate": 2.1217846247684913e-05, "loss": 0.0161, "num_input_tokens_seen": 157014592, "step": 72765 }, { "epoch": 11.871125611745514, "grad_norm": 1.392315149307251, "learning_rate": 2.1214328258838813e-05, "loss": 0.061, "num_input_tokens_seen": 157025152, "step": 72770 }, { "epoch": 11.87194127243067, "grad_norm": 1.8327574729919434, "learning_rate": 2.1210810346714158e-05, "loss": 0.0551, "num_input_tokens_seen": 157036608, "step": 72775 }, { "epoch": 11.872756933115824, "grad_norm": 0.027173642069101334, "learning_rate": 2.1207292511382228e-05, "loss": 0.0085, "num_input_tokens_seen": 157049568, "step": 72780 }, { "epoch": 11.87357259380098, "grad_norm": 1.3133554458618164, "learning_rate": 2.120377475291433e-05, "loss": 0.1028, "num_input_tokens_seen": 157059424, "step": 72785 }, { "epoch": 11.874388254486133, "grad_norm": 0.13062959909439087, "learning_rate": 2.120025707138176e-05, "loss": 0.0662, "num_input_tokens_seen": 157070432, "step": 72790 }, { "epoch": 11.875203915171289, "grad_norm": 0.11110221594572067, "learning_rate": 2.1196739466855792e-05, "loss": 0.0714, "num_input_tokens_seen": 157081664, "step": 72795 }, { "epoch": 11.876019575856443, "grad_norm": 0.9885464906692505, "learning_rate": 2.119322193940773e-05, "loss": 0.0821, "num_input_tokens_seen": 157092352, "step": 72800 }, { "epoch": 11.876835236541599, "grad_norm": 1.7061225175857544, "learning_rate": 2.118970448910885e-05, "loss": 0.0875, "num_input_tokens_seen": 157103328, "step": 72805 }, { "epoch": 11.877650897226754, "grad_norm": 2.828047037124634, "learning_rate": 2.118618711603045e-05, "loss": 0.2415, "num_input_tokens_seen": 157114144, "step": 72810 }, { "epoch": 11.878466557911908, "grad_norm": 0.7099462747573853, "learning_rate": 2.1182669820243797e-05, "loss": 0.0415, "num_input_tokens_seen": 157125216, "step": 72815 }, { "epoch": 11.879282218597064, "grad_norm": 0.06970258057117462, "learning_rate": 2.117915260182019e-05, "loss": 0.0218, "num_input_tokens_seen": 157136608, "step": 72820 }, { "epoch": 11.880097879282218, "grad_norm": 0.42703357338905334, "learning_rate": 2.1175635460830894e-05, "loss": 0.0374, "num_input_tokens_seen": 157147552, "step": 72825 }, { "epoch": 11.880913539967374, "grad_norm": 0.024778077378869057, "learning_rate": 2.1172118397347204e-05, "loss": 0.1337, "num_input_tokens_seen": 157159552, "step": 72830 }, { "epoch": 11.88172920065253, "grad_norm": 1.407976746559143, "learning_rate": 2.1168601411440388e-05, "loss": 0.3493, "num_input_tokens_seen": 157170176, "step": 72835 }, { "epoch": 11.882544861337683, "grad_norm": 0.028810368850827217, "learning_rate": 2.116508450318173e-05, "loss": 0.2455, "num_input_tokens_seen": 157181952, "step": 72840 }, { "epoch": 11.883360522022839, "grad_norm": 0.056707289069890976, "learning_rate": 2.1161567672642494e-05, "loss": 0.0942, "num_input_tokens_seen": 157194176, "step": 72845 }, { "epoch": 11.884176182707993, "grad_norm": 0.05209004133939743, "learning_rate": 2.115805091989396e-05, "loss": 0.0305, "num_input_tokens_seen": 157204992, "step": 72850 }, { "epoch": 11.884991843393149, "grad_norm": 0.017402542755007744, "learning_rate": 2.1154534245007392e-05, "loss": 0.01, "num_input_tokens_seen": 157214496, "step": 72855 }, { "epoch": 11.885807504078304, "grad_norm": 1.5148818492889404, "learning_rate": 2.1151017648054077e-05, "loss": 0.1315, "num_input_tokens_seen": 157225248, "step": 72860 }, { "epoch": 11.886623164763458, "grad_norm": 2.127035617828369, "learning_rate": 2.1147501129105263e-05, "loss": 0.128, "num_input_tokens_seen": 157236416, "step": 72865 }, { "epoch": 11.887438825448614, "grad_norm": 0.029150210320949554, "learning_rate": 2.1143984688232236e-05, "loss": 0.0201, "num_input_tokens_seen": 157246304, "step": 72870 }, { "epoch": 11.888254486133768, "grad_norm": 0.07195594161748886, "learning_rate": 2.1140468325506238e-05, "loss": 0.0206, "num_input_tokens_seen": 157256352, "step": 72875 }, { "epoch": 11.889070146818923, "grad_norm": 1.3512359857559204, "learning_rate": 2.113695204099856e-05, "loss": 0.0994, "num_input_tokens_seen": 157267136, "step": 72880 }, { "epoch": 11.88988580750408, "grad_norm": 2.061378240585327, "learning_rate": 2.1133435834780435e-05, "loss": 0.1608, "num_input_tokens_seen": 157277952, "step": 72885 }, { "epoch": 11.890701468189233, "grad_norm": 1.0467725992202759, "learning_rate": 2.1129919706923138e-05, "loss": 0.1344, "num_input_tokens_seen": 157288704, "step": 72890 }, { "epoch": 11.891517128874389, "grad_norm": 0.03796739503741264, "learning_rate": 2.112640365749794e-05, "loss": 0.1208, "num_input_tokens_seen": 157300928, "step": 72895 }, { "epoch": 11.892332789559543, "grad_norm": 0.03628406301140785, "learning_rate": 2.1122887686576076e-05, "loss": 0.1396, "num_input_tokens_seen": 157312000, "step": 72900 }, { "epoch": 11.893148450244698, "grad_norm": 0.019105052575469017, "learning_rate": 2.111937179422882e-05, "loss": 0.101, "num_input_tokens_seen": 157322848, "step": 72905 }, { "epoch": 11.893964110929852, "grad_norm": 0.4723277986049652, "learning_rate": 2.111585598052741e-05, "loss": 0.1382, "num_input_tokens_seen": 157334528, "step": 72910 }, { "epoch": 11.894779771615008, "grad_norm": 0.03458566963672638, "learning_rate": 2.1112340245543112e-05, "loss": 0.0181, "num_input_tokens_seen": 157343392, "step": 72915 }, { "epoch": 11.895595432300164, "grad_norm": 0.05621443688869476, "learning_rate": 2.1108824589347164e-05, "loss": 0.0407, "num_input_tokens_seen": 157353248, "step": 72920 }, { "epoch": 11.896411092985318, "grad_norm": 0.6661778688430786, "learning_rate": 2.1105309012010833e-05, "loss": 0.1101, "num_input_tokens_seen": 157364736, "step": 72925 }, { "epoch": 11.897226753670473, "grad_norm": 0.1518319696187973, "learning_rate": 2.1101793513605342e-05, "loss": 0.0478, "num_input_tokens_seen": 157373728, "step": 72930 }, { "epoch": 11.898042414355627, "grad_norm": 0.36098551750183105, "learning_rate": 2.1098278094201964e-05, "loss": 0.0525, "num_input_tokens_seen": 157384800, "step": 72935 }, { "epoch": 11.898858075040783, "grad_norm": 2.2144882678985596, "learning_rate": 2.1094762753871916e-05, "loss": 0.3259, "num_input_tokens_seen": 157393696, "step": 72940 }, { "epoch": 11.899673735725939, "grad_norm": 0.4421558976173401, "learning_rate": 2.109124749268647e-05, "loss": 0.0131, "num_input_tokens_seen": 157404608, "step": 72945 }, { "epoch": 11.900489396411093, "grad_norm": 0.9172877073287964, "learning_rate": 2.108773231071684e-05, "loss": 0.0517, "num_input_tokens_seen": 157416288, "step": 72950 }, { "epoch": 11.901305057096248, "grad_norm": 2.100754737854004, "learning_rate": 2.1084217208034284e-05, "loss": 0.137, "num_input_tokens_seen": 157427200, "step": 72955 }, { "epoch": 11.902120717781402, "grad_norm": 0.3705274760723114, "learning_rate": 2.108070218471003e-05, "loss": 0.0618, "num_input_tokens_seen": 157437344, "step": 72960 }, { "epoch": 11.902936378466558, "grad_norm": 0.2719741761684418, "learning_rate": 2.1077187240815326e-05, "loss": 0.135, "num_input_tokens_seen": 157448960, "step": 72965 }, { "epoch": 11.903752039151712, "grad_norm": 0.198203906416893, "learning_rate": 2.107367237642139e-05, "loss": 0.016, "num_input_tokens_seen": 157461088, "step": 72970 }, { "epoch": 11.904567699836868, "grad_norm": 1.9707072973251343, "learning_rate": 2.1070157591599465e-05, "loss": 0.1449, "num_input_tokens_seen": 157470720, "step": 72975 }, { "epoch": 11.905383360522023, "grad_norm": 0.11609303951263428, "learning_rate": 2.1066642886420783e-05, "loss": 0.0149, "num_input_tokens_seen": 157481792, "step": 72980 }, { "epoch": 11.906199021207177, "grad_norm": 1.1629830598831177, "learning_rate": 2.1063128260956575e-05, "loss": 0.0605, "num_input_tokens_seen": 157492544, "step": 72985 }, { "epoch": 11.907014681892333, "grad_norm": 0.031209362670779228, "learning_rate": 2.1059613715278066e-05, "loss": 0.1831, "num_input_tokens_seen": 157503808, "step": 72990 }, { "epoch": 11.907830342577487, "grad_norm": 0.6733864545822144, "learning_rate": 2.105609924945648e-05, "loss": 0.0321, "num_input_tokens_seen": 157513760, "step": 72995 }, { "epoch": 11.908646003262643, "grad_norm": 2.7115752696990967, "learning_rate": 2.1052584863563048e-05, "loss": 0.1567, "num_input_tokens_seen": 157523648, "step": 73000 }, { "epoch": 11.909461663947798, "grad_norm": 0.0374724380671978, "learning_rate": 2.1049070557668992e-05, "loss": 0.0489, "num_input_tokens_seen": 157534496, "step": 73005 }, { "epoch": 11.910277324632952, "grad_norm": 0.37206485867500305, "learning_rate": 2.1045556331845537e-05, "loss": 0.0124, "num_input_tokens_seen": 157544544, "step": 73010 }, { "epoch": 11.911092985318108, "grad_norm": 2.0913891792297363, "learning_rate": 2.1042042186163897e-05, "loss": 0.2494, "num_input_tokens_seen": 157555488, "step": 73015 }, { "epoch": 11.911908646003262, "grad_norm": 1.295657992362976, "learning_rate": 2.1038528120695295e-05, "loss": 0.1052, "num_input_tokens_seen": 157567232, "step": 73020 }, { "epoch": 11.912724306688418, "grad_norm": 0.01843605749309063, "learning_rate": 2.103501413551094e-05, "loss": 0.2822, "num_input_tokens_seen": 157578080, "step": 73025 }, { "epoch": 11.913539967373573, "grad_norm": 0.0590214729309082, "learning_rate": 2.103150023068206e-05, "loss": 0.0332, "num_input_tokens_seen": 157588352, "step": 73030 }, { "epoch": 11.914355628058727, "grad_norm": 1.7791272401809692, "learning_rate": 2.1027986406279852e-05, "loss": 0.1391, "num_input_tokens_seen": 157599584, "step": 73035 }, { "epoch": 11.915171288743883, "grad_norm": 0.12268344312906265, "learning_rate": 2.1024472662375555e-05, "loss": 0.1067, "num_input_tokens_seen": 157610208, "step": 73040 }, { "epoch": 11.915986949429037, "grad_norm": 0.1160314753651619, "learning_rate": 2.102095899904035e-05, "loss": 0.2321, "num_input_tokens_seen": 157620896, "step": 73045 }, { "epoch": 11.916802610114193, "grad_norm": 0.024631265550851822, "learning_rate": 2.1017445416345465e-05, "loss": 0.169, "num_input_tokens_seen": 157631520, "step": 73050 }, { "epoch": 11.917618270799348, "grad_norm": 0.5933998227119446, "learning_rate": 2.1013931914362096e-05, "loss": 0.0428, "num_input_tokens_seen": 157642848, "step": 73055 }, { "epoch": 11.918433931484502, "grad_norm": 1.678772211074829, "learning_rate": 2.101041849316146e-05, "loss": 0.1242, "num_input_tokens_seen": 157654624, "step": 73060 }, { "epoch": 11.919249592169658, "grad_norm": 0.09893500059843063, "learning_rate": 2.1006905152814746e-05, "loss": 0.0627, "num_input_tokens_seen": 157664864, "step": 73065 }, { "epoch": 11.920065252854812, "grad_norm": 0.10200654715299606, "learning_rate": 2.1003391893393173e-05, "loss": 0.0148, "num_input_tokens_seen": 157675808, "step": 73070 }, { "epoch": 11.920880913539968, "grad_norm": 0.5482355952262878, "learning_rate": 2.0999878714967926e-05, "loss": 0.0898, "num_input_tokens_seen": 157687392, "step": 73075 }, { "epoch": 11.921696574225122, "grad_norm": 0.25344255566596985, "learning_rate": 2.099636561761022e-05, "loss": 0.0306, "num_input_tokens_seen": 157699040, "step": 73080 }, { "epoch": 11.922512234910277, "grad_norm": 0.0443185493350029, "learning_rate": 2.0992852601391233e-05, "loss": 0.0805, "num_input_tokens_seen": 157710048, "step": 73085 }, { "epoch": 11.923327895595433, "grad_norm": 0.9558950066566467, "learning_rate": 2.098933966638218e-05, "loss": 0.1467, "num_input_tokens_seen": 157722144, "step": 73090 }, { "epoch": 11.924143556280587, "grad_norm": 0.13499321043491364, "learning_rate": 2.098582681265424e-05, "loss": 0.0794, "num_input_tokens_seen": 157734112, "step": 73095 }, { "epoch": 11.924959216965743, "grad_norm": 2.134669542312622, "learning_rate": 2.098231404027861e-05, "loss": 0.069, "num_input_tokens_seen": 157743776, "step": 73100 }, { "epoch": 11.925774877650896, "grad_norm": 0.0796905979514122, "learning_rate": 2.0978801349326493e-05, "loss": 0.0961, "num_input_tokens_seen": 157754496, "step": 73105 }, { "epoch": 11.926590538336052, "grad_norm": 0.04767826199531555, "learning_rate": 2.097528873986906e-05, "loss": 0.0884, "num_input_tokens_seen": 157764928, "step": 73110 }, { "epoch": 11.927406199021208, "grad_norm": 1.072357416152954, "learning_rate": 2.0971776211977516e-05, "loss": 0.1477, "num_input_tokens_seen": 157775104, "step": 73115 }, { "epoch": 11.928221859706362, "grad_norm": 0.781954824924469, "learning_rate": 2.096826376572302e-05, "loss": 0.1336, "num_input_tokens_seen": 157785984, "step": 73120 }, { "epoch": 11.929037520391518, "grad_norm": 1.4920066595077515, "learning_rate": 2.0964751401176795e-05, "loss": 0.1149, "num_input_tokens_seen": 157796704, "step": 73125 }, { "epoch": 11.929853181076671, "grad_norm": 0.029796583577990532, "learning_rate": 2.0961239118409983e-05, "loss": 0.0299, "num_input_tokens_seen": 157808288, "step": 73130 }, { "epoch": 11.930668841761827, "grad_norm": 0.07652982324361801, "learning_rate": 2.09577269174938e-05, "loss": 0.022, "num_input_tokens_seen": 157818848, "step": 73135 }, { "epoch": 11.931484502446983, "grad_norm": 1.3338309526443481, "learning_rate": 2.0954214798499394e-05, "loss": 0.0997, "num_input_tokens_seen": 157830240, "step": 73140 }, { "epoch": 11.932300163132137, "grad_norm": 1.2362732887268066, "learning_rate": 2.0950702761497966e-05, "loss": 0.0422, "num_input_tokens_seen": 157841632, "step": 73145 }, { "epoch": 11.933115823817293, "grad_norm": 0.7093543410301208, "learning_rate": 2.094719080656068e-05, "loss": 0.2414, "num_input_tokens_seen": 157854048, "step": 73150 }, { "epoch": 11.933931484502446, "grad_norm": 0.11652828007936478, "learning_rate": 2.0943678933758717e-05, "loss": 0.0609, "num_input_tokens_seen": 157866112, "step": 73155 }, { "epoch": 11.934747145187602, "grad_norm": 1.6183925867080688, "learning_rate": 2.0940167143163245e-05, "loss": 0.0631, "num_input_tokens_seen": 157877984, "step": 73160 }, { "epoch": 11.935562805872756, "grad_norm": 0.02456682175397873, "learning_rate": 2.0936655434845437e-05, "loss": 0.0191, "num_input_tokens_seen": 157889536, "step": 73165 }, { "epoch": 11.936378466557912, "grad_norm": 0.06936760246753693, "learning_rate": 2.0933143808876453e-05, "loss": 0.0556, "num_input_tokens_seen": 157899936, "step": 73170 }, { "epoch": 11.937194127243067, "grad_norm": 0.11794636398553848, "learning_rate": 2.092963226532748e-05, "loss": 0.0408, "num_input_tokens_seen": 157909728, "step": 73175 }, { "epoch": 11.938009787928221, "grad_norm": 2.2933833599090576, "learning_rate": 2.0926120804269666e-05, "loss": 0.1702, "num_input_tokens_seen": 157920768, "step": 73180 }, { "epoch": 11.938825448613377, "grad_norm": 0.21811452507972717, "learning_rate": 2.092260942577419e-05, "loss": 0.2968, "num_input_tokens_seen": 157931104, "step": 73185 }, { "epoch": 11.939641109298531, "grad_norm": 0.2419249266386032, "learning_rate": 2.0919098129912197e-05, "loss": 0.0819, "num_input_tokens_seen": 157941984, "step": 73190 }, { "epoch": 11.940456769983687, "grad_norm": 0.077720046043396, "learning_rate": 2.0915586916754867e-05, "loss": 0.0071, "num_input_tokens_seen": 157952512, "step": 73195 }, { "epoch": 11.941272430668842, "grad_norm": 0.8026115298271179, "learning_rate": 2.0912075786373338e-05, "loss": 0.1241, "num_input_tokens_seen": 157963584, "step": 73200 }, { "epoch": 11.942088091353996, "grad_norm": 1.6525017023086548, "learning_rate": 2.0908564738838783e-05, "loss": 0.169, "num_input_tokens_seen": 157975200, "step": 73205 }, { "epoch": 11.942903752039152, "grad_norm": 3.058586359024048, "learning_rate": 2.090505377422236e-05, "loss": 0.0313, "num_input_tokens_seen": 157985696, "step": 73210 }, { "epoch": 11.943719412724306, "grad_norm": 1.8760377168655396, "learning_rate": 2.0901542892595207e-05, "loss": 0.1621, "num_input_tokens_seen": 157996896, "step": 73215 }, { "epoch": 11.944535073409462, "grad_norm": 0.8849329948425293, "learning_rate": 2.08980320940285e-05, "loss": 0.0899, "num_input_tokens_seen": 158009216, "step": 73220 }, { "epoch": 11.945350734094617, "grad_norm": 2.104907751083374, "learning_rate": 2.089452137859336e-05, "loss": 0.2162, "num_input_tokens_seen": 158020512, "step": 73225 }, { "epoch": 11.946166394779771, "grad_norm": 1.898099422454834, "learning_rate": 2.089101074636097e-05, "loss": 0.0534, "num_input_tokens_seen": 158030432, "step": 73230 }, { "epoch": 11.946982055464927, "grad_norm": 0.14442938566207886, "learning_rate": 2.0887500197402447e-05, "loss": 0.1392, "num_input_tokens_seen": 158040576, "step": 73235 }, { "epoch": 11.947797716150081, "grad_norm": 0.055668190121650696, "learning_rate": 2.088398973178896e-05, "loss": 0.1278, "num_input_tokens_seen": 158052032, "step": 73240 }, { "epoch": 11.948613376835237, "grad_norm": 0.7143192291259766, "learning_rate": 2.088047934959163e-05, "loss": 0.084, "num_input_tokens_seen": 158062016, "step": 73245 }, { "epoch": 11.949429037520392, "grad_norm": 0.022025611251592636, "learning_rate": 2.0876969050881627e-05, "loss": 0.0317, "num_input_tokens_seen": 158072384, "step": 73250 }, { "epoch": 11.950244698205546, "grad_norm": 0.0857517197728157, "learning_rate": 2.0873458835730063e-05, "loss": 0.059, "num_input_tokens_seen": 158083712, "step": 73255 }, { "epoch": 11.951060358890702, "grad_norm": 0.1055351123213768, "learning_rate": 2.08699487042081e-05, "loss": 0.0462, "num_input_tokens_seen": 158093664, "step": 73260 }, { "epoch": 11.951876019575856, "grad_norm": 0.031427353620529175, "learning_rate": 2.0866438656386855e-05, "loss": 0.0104, "num_input_tokens_seen": 158104352, "step": 73265 }, { "epoch": 11.952691680261012, "grad_norm": 0.03718220815062523, "learning_rate": 2.0862928692337487e-05, "loss": 0.1766, "num_input_tokens_seen": 158114240, "step": 73270 }, { "epoch": 11.953507340946166, "grad_norm": 0.3441012501716614, "learning_rate": 2.0859418812131105e-05, "loss": 0.0706, "num_input_tokens_seen": 158124704, "step": 73275 }, { "epoch": 11.954323001631321, "grad_norm": 0.9539501667022705, "learning_rate": 2.0855909015838868e-05, "loss": 0.1496, "num_input_tokens_seen": 158135136, "step": 73280 }, { "epoch": 11.955138662316477, "grad_norm": 0.371997594833374, "learning_rate": 2.085239930353188e-05, "loss": 0.0965, "num_input_tokens_seen": 158145024, "step": 73285 }, { "epoch": 11.955954323001631, "grad_norm": 0.10013372451066971, "learning_rate": 2.0848889675281286e-05, "loss": 0.0139, "num_input_tokens_seen": 158155040, "step": 73290 }, { "epoch": 11.956769983686787, "grad_norm": 0.09368204325437546, "learning_rate": 2.0845380131158208e-05, "loss": 0.2377, "num_input_tokens_seen": 158166208, "step": 73295 }, { "epoch": 11.95758564437194, "grad_norm": 0.03634195029735565, "learning_rate": 2.0841870671233772e-05, "loss": 0.0445, "num_input_tokens_seen": 158176608, "step": 73300 }, { "epoch": 11.958401305057096, "grad_norm": 1.9069223403930664, "learning_rate": 2.08383612955791e-05, "loss": 0.047, "num_input_tokens_seen": 158187552, "step": 73305 }, { "epoch": 11.959216965742252, "grad_norm": 1.0116807222366333, "learning_rate": 2.083485200426532e-05, "loss": 0.1039, "num_input_tokens_seen": 158199136, "step": 73310 }, { "epoch": 11.960032626427406, "grad_norm": 0.04661991819739342, "learning_rate": 2.083134279736354e-05, "loss": 0.066, "num_input_tokens_seen": 158209600, "step": 73315 }, { "epoch": 11.960848287112562, "grad_norm": 1.5758074522018433, "learning_rate": 2.0827833674944893e-05, "loss": 0.21, "num_input_tokens_seen": 158220544, "step": 73320 }, { "epoch": 11.961663947797716, "grad_norm": 1.2537420988082886, "learning_rate": 2.082432463708049e-05, "loss": 0.0779, "num_input_tokens_seen": 158231456, "step": 73325 }, { "epoch": 11.962479608482871, "grad_norm": 0.1849859356880188, "learning_rate": 2.0820815683841445e-05, "loss": 0.0242, "num_input_tokens_seen": 158241920, "step": 73330 }, { "epoch": 11.963295269168025, "grad_norm": 0.05130463466048241, "learning_rate": 2.081730681529887e-05, "loss": 0.1127, "num_input_tokens_seen": 158251360, "step": 73335 }, { "epoch": 11.964110929853181, "grad_norm": 0.03785865381360054, "learning_rate": 2.0813798031523878e-05, "loss": 0.0325, "num_input_tokens_seen": 158262528, "step": 73340 }, { "epoch": 11.964926590538337, "grad_norm": 2.1472039222717285, "learning_rate": 2.0810289332587585e-05, "loss": 0.2136, "num_input_tokens_seen": 158272160, "step": 73345 }, { "epoch": 11.96574225122349, "grad_norm": 0.09142717719078064, "learning_rate": 2.0806780718561082e-05, "loss": 0.1883, "num_input_tokens_seen": 158282176, "step": 73350 }, { "epoch": 11.966557911908646, "grad_norm": 2.3442373275756836, "learning_rate": 2.0803272189515505e-05, "loss": 0.1147, "num_input_tokens_seen": 158291136, "step": 73355 }, { "epoch": 11.9673735725938, "grad_norm": 0.060374755412340164, "learning_rate": 2.0799763745521926e-05, "loss": 0.0525, "num_input_tokens_seen": 158301184, "step": 73360 }, { "epoch": 11.968189233278956, "grad_norm": 1.5976709127426147, "learning_rate": 2.079625538665147e-05, "loss": 0.1061, "num_input_tokens_seen": 158311936, "step": 73365 }, { "epoch": 11.969004893964112, "grad_norm": 0.10851141810417175, "learning_rate": 2.0792747112975225e-05, "loss": 0.2993, "num_input_tokens_seen": 158323456, "step": 73370 }, { "epoch": 11.969820554649266, "grad_norm": 0.5926616787910461, "learning_rate": 2.0789238924564307e-05, "loss": 0.1107, "num_input_tokens_seen": 158333568, "step": 73375 }, { "epoch": 11.970636215334421, "grad_norm": 0.04016895219683647, "learning_rate": 2.0785730821489797e-05, "loss": 0.043, "num_input_tokens_seen": 158343232, "step": 73380 }, { "epoch": 11.971451876019575, "grad_norm": 0.05404955893754959, "learning_rate": 2.0782222803822804e-05, "loss": 0.2173, "num_input_tokens_seen": 158354912, "step": 73385 }, { "epoch": 11.97226753670473, "grad_norm": 2.4084203243255615, "learning_rate": 2.0778714871634406e-05, "loss": 0.1954, "num_input_tokens_seen": 158364992, "step": 73390 }, { "epoch": 11.973083197389887, "grad_norm": 2.6299235820770264, "learning_rate": 2.077520702499572e-05, "loss": 0.2738, "num_input_tokens_seen": 158375904, "step": 73395 }, { "epoch": 11.97389885807504, "grad_norm": 0.05190694332122803, "learning_rate": 2.0771699263977808e-05, "loss": 0.0131, "num_input_tokens_seen": 158386752, "step": 73400 }, { "epoch": 11.974714518760196, "grad_norm": 0.05597255378961563, "learning_rate": 2.076819158865179e-05, "loss": 0.2214, "num_input_tokens_seen": 158397984, "step": 73405 }, { "epoch": 11.97553017944535, "grad_norm": 0.5357603430747986, "learning_rate": 2.0764683999088726e-05, "loss": 0.123, "num_input_tokens_seen": 158409440, "step": 73410 }, { "epoch": 11.976345840130506, "grad_norm": 0.06218096613883972, "learning_rate": 2.076117649535972e-05, "loss": 0.0167, "num_input_tokens_seen": 158421152, "step": 73415 }, { "epoch": 11.977161500815662, "grad_norm": 0.47601935267448425, "learning_rate": 2.0757669077535842e-05, "loss": 0.0793, "num_input_tokens_seen": 158430848, "step": 73420 }, { "epoch": 11.977977161500815, "grad_norm": 1.9587618112564087, "learning_rate": 2.0754161745688185e-05, "loss": 0.2426, "num_input_tokens_seen": 158441728, "step": 73425 }, { "epoch": 11.978792822185971, "grad_norm": 2.1285178661346436, "learning_rate": 2.0750654499887832e-05, "loss": 0.2592, "num_input_tokens_seen": 158452928, "step": 73430 }, { "epoch": 11.979608482871125, "grad_norm": 0.23481829464435577, "learning_rate": 2.0747147340205845e-05, "loss": 0.0287, "num_input_tokens_seen": 158465152, "step": 73435 }, { "epoch": 11.98042414355628, "grad_norm": 1.51812744140625, "learning_rate": 2.0743640266713326e-05, "loss": 0.0645, "num_input_tokens_seen": 158476160, "step": 73440 }, { "epoch": 11.981239804241435, "grad_norm": 0.25081387162208557, "learning_rate": 2.0740133279481328e-05, "loss": 0.0423, "num_input_tokens_seen": 158486816, "step": 73445 }, { "epoch": 11.98205546492659, "grad_norm": 2.184952974319458, "learning_rate": 2.073662637858094e-05, "loss": 0.0886, "num_input_tokens_seen": 158497376, "step": 73450 }, { "epoch": 11.982871125611746, "grad_norm": 0.049492981284856796, "learning_rate": 2.0733119564083216e-05, "loss": 0.1405, "num_input_tokens_seen": 158509312, "step": 73455 }, { "epoch": 11.9836867862969, "grad_norm": 0.5182894468307495, "learning_rate": 2.0729612836059244e-05, "loss": 0.2096, "num_input_tokens_seen": 158519424, "step": 73460 }, { "epoch": 11.984502446982056, "grad_norm": 0.022250553593039513, "learning_rate": 2.0726106194580084e-05, "loss": 0.0916, "num_input_tokens_seen": 158529536, "step": 73465 }, { "epoch": 11.98531810766721, "grad_norm": 0.3875635862350464, "learning_rate": 2.0722599639716806e-05, "loss": 0.1471, "num_input_tokens_seen": 158540096, "step": 73470 }, { "epoch": 11.986133768352365, "grad_norm": 0.10276295244693756, "learning_rate": 2.0719093171540473e-05, "loss": 0.019, "num_input_tokens_seen": 158551328, "step": 73475 }, { "epoch": 11.986949429037521, "grad_norm": 0.5723543763160706, "learning_rate": 2.0715586790122142e-05, "loss": 0.0497, "num_input_tokens_seen": 158562720, "step": 73480 }, { "epoch": 11.987765089722675, "grad_norm": 0.10113150626420975, "learning_rate": 2.071208049553288e-05, "loss": 0.2946, "num_input_tokens_seen": 158574208, "step": 73485 }, { "epoch": 11.98858075040783, "grad_norm": 0.5522232055664062, "learning_rate": 2.0708574287843756e-05, "loss": 0.1385, "num_input_tokens_seen": 158586752, "step": 73490 }, { "epoch": 11.989396411092985, "grad_norm": 2.464944362640381, "learning_rate": 2.0705068167125806e-05, "loss": 0.0641, "num_input_tokens_seen": 158597088, "step": 73495 }, { "epoch": 11.99021207177814, "grad_norm": 0.07181546092033386, "learning_rate": 2.070156213345011e-05, "loss": 0.1834, "num_input_tokens_seen": 158608320, "step": 73500 }, { "epoch": 11.991027732463294, "grad_norm": 1.1933398246765137, "learning_rate": 2.06980561868877e-05, "loss": 0.0651, "num_input_tokens_seen": 158618976, "step": 73505 }, { "epoch": 11.99184339314845, "grad_norm": 0.02452344074845314, "learning_rate": 2.069455032750965e-05, "loss": 0.0305, "num_input_tokens_seen": 158630208, "step": 73510 }, { "epoch": 11.992659053833606, "grad_norm": 2.2290918827056885, "learning_rate": 2.0691044555386987e-05, "loss": 0.1372, "num_input_tokens_seen": 158641600, "step": 73515 }, { "epoch": 11.99347471451876, "grad_norm": 2.246546745300293, "learning_rate": 2.0687538870590782e-05, "loss": 0.2701, "num_input_tokens_seen": 158651840, "step": 73520 }, { "epoch": 11.994290375203915, "grad_norm": 1.7637687921524048, "learning_rate": 2.0684033273192067e-05, "loss": 0.0901, "num_input_tokens_seen": 158662912, "step": 73525 }, { "epoch": 11.99510603588907, "grad_norm": 0.11010996252298355, "learning_rate": 2.0680527763261887e-05, "loss": 0.1099, "num_input_tokens_seen": 158673216, "step": 73530 }, { "epoch": 11.995921696574225, "grad_norm": 0.05402909964323044, "learning_rate": 2.0677022340871305e-05, "loss": 0.1532, "num_input_tokens_seen": 158684320, "step": 73535 }, { "epoch": 11.99673735725938, "grad_norm": 1.3932863473892212, "learning_rate": 2.067351700609134e-05, "loss": 0.3267, "num_input_tokens_seen": 158695424, "step": 73540 }, { "epoch": 11.997553017944535, "grad_norm": 2.9552290439605713, "learning_rate": 2.0670011758993048e-05, "loss": 0.1404, "num_input_tokens_seen": 158706240, "step": 73545 }, { "epoch": 11.99836867862969, "grad_norm": 0.19738292694091797, "learning_rate": 2.0666506599647454e-05, "loss": 0.0604, "num_input_tokens_seen": 158716864, "step": 73550 }, { "epoch": 11.999184339314844, "grad_norm": 1.9511076211929321, "learning_rate": 2.066300152812561e-05, "loss": 0.0813, "num_input_tokens_seen": 158727616, "step": 73555 }, { "epoch": 12.0, "grad_norm": 0.03927404060959816, "learning_rate": 2.065949654449853e-05, "loss": 0.1963, "num_input_tokens_seen": 158737232, "step": 73560 }, { "epoch": 12.0, "eval_loss": 0.13565713167190552, "eval_runtime": 90.6724, "eval_samples_per_second": 30.053, "eval_steps_per_second": 7.522, "num_input_tokens_seen": 158737232, "step": 73560 }, { "epoch": 12.000815660685156, "grad_norm": 1.9506754875183105, "learning_rate": 2.0655991648837272e-05, "loss": 0.1972, "num_input_tokens_seen": 158748016, "step": 73565 }, { "epoch": 12.00163132137031, "grad_norm": 0.08614024519920349, "learning_rate": 2.0652486841212843e-05, "loss": 0.0257, "num_input_tokens_seen": 158758480, "step": 73570 }, { "epoch": 12.002446982055465, "grad_norm": 0.5036981105804443, "learning_rate": 2.0648982121696292e-05, "loss": 0.0653, "num_input_tokens_seen": 158768688, "step": 73575 }, { "epoch": 12.00326264274062, "grad_norm": 0.026510238647460938, "learning_rate": 2.064547749035863e-05, "loss": 0.0222, "num_input_tokens_seen": 158780016, "step": 73580 }, { "epoch": 12.004078303425775, "grad_norm": 0.06842823326587677, "learning_rate": 2.0641972947270897e-05, "loss": 0.2394, "num_input_tokens_seen": 158790832, "step": 73585 }, { "epoch": 12.00489396411093, "grad_norm": 0.641148030757904, "learning_rate": 2.0638468492504107e-05, "loss": 0.0565, "num_input_tokens_seen": 158801680, "step": 73590 }, { "epoch": 12.005709624796085, "grad_norm": 1.2772945165634155, "learning_rate": 2.063496412612929e-05, "loss": 0.1101, "num_input_tokens_seen": 158811760, "step": 73595 }, { "epoch": 12.00652528548124, "grad_norm": 0.11294888705015182, "learning_rate": 2.0631459848217457e-05, "loss": 0.1735, "num_input_tokens_seen": 158821392, "step": 73600 }, { "epoch": 12.007340946166394, "grad_norm": 1.022948145866394, "learning_rate": 2.0627955658839635e-05, "loss": 0.0729, "num_input_tokens_seen": 158832176, "step": 73605 }, { "epoch": 12.00815660685155, "grad_norm": 0.48225826025009155, "learning_rate": 2.062445155806684e-05, "loss": 0.0606, "num_input_tokens_seen": 158842192, "step": 73610 }, { "epoch": 12.008972267536704, "grad_norm": 1.389450192451477, "learning_rate": 2.062094754597008e-05, "loss": 0.037, "num_input_tokens_seen": 158853456, "step": 73615 }, { "epoch": 12.00978792822186, "grad_norm": 0.1719331443309784, "learning_rate": 2.0617443622620378e-05, "loss": 0.038, "num_input_tokens_seen": 158863952, "step": 73620 }, { "epoch": 12.010603588907015, "grad_norm": 0.8534601330757141, "learning_rate": 2.0613939788088737e-05, "loss": 0.213, "num_input_tokens_seen": 158875024, "step": 73625 }, { "epoch": 12.01141924959217, "grad_norm": 0.0974598154425621, "learning_rate": 2.061043604244617e-05, "loss": 0.0349, "num_input_tokens_seen": 158884624, "step": 73630 }, { "epoch": 12.012234910277325, "grad_norm": 0.6340378522872925, "learning_rate": 2.060693238576368e-05, "loss": 0.0374, "num_input_tokens_seen": 158895664, "step": 73635 }, { "epoch": 12.013050570962479, "grad_norm": 0.6723273992538452, "learning_rate": 2.060342881811229e-05, "loss": 0.1786, "num_input_tokens_seen": 158906576, "step": 73640 }, { "epoch": 12.013866231647635, "grad_norm": 2.8572590351104736, "learning_rate": 2.0599925339562987e-05, "loss": 0.3073, "num_input_tokens_seen": 158917648, "step": 73645 }, { "epoch": 12.01468189233279, "grad_norm": 1.8398621082305908, "learning_rate": 2.059642195018678e-05, "loss": 0.1591, "num_input_tokens_seen": 158927664, "step": 73650 }, { "epoch": 12.015497553017944, "grad_norm": 2.363193988800049, "learning_rate": 2.0592918650054663e-05, "loss": 0.3212, "num_input_tokens_seen": 158939280, "step": 73655 }, { "epoch": 12.0163132137031, "grad_norm": 0.05516498535871506, "learning_rate": 2.0589415439237653e-05, "loss": 0.1068, "num_input_tokens_seen": 158949456, "step": 73660 }, { "epoch": 12.017128874388254, "grad_norm": 1.1229599714279175, "learning_rate": 2.0585912317806723e-05, "loss": 0.1504, "num_input_tokens_seen": 158960944, "step": 73665 }, { "epoch": 12.01794453507341, "grad_norm": 1.687677025794983, "learning_rate": 2.0582409285832887e-05, "loss": 0.2216, "num_input_tokens_seen": 158971056, "step": 73670 }, { "epoch": 12.018760195758565, "grad_norm": 0.043958161026239395, "learning_rate": 2.0578906343387123e-05, "loss": 0.0599, "num_input_tokens_seen": 158982544, "step": 73675 }, { "epoch": 12.01957585644372, "grad_norm": 0.04704050347208977, "learning_rate": 2.057540349054044e-05, "loss": 0.076, "num_input_tokens_seen": 158993552, "step": 73680 }, { "epoch": 12.020391517128875, "grad_norm": 0.08342497795820236, "learning_rate": 2.057190072736381e-05, "loss": 0.1125, "num_input_tokens_seen": 159003824, "step": 73685 }, { "epoch": 12.021207177814029, "grad_norm": 0.15649984776973724, "learning_rate": 2.0568398053928235e-05, "loss": 0.0879, "num_input_tokens_seen": 159014736, "step": 73690 }, { "epoch": 12.022022838499185, "grad_norm": 0.06338181346654892, "learning_rate": 2.0564895470304686e-05, "loss": 0.2206, "num_input_tokens_seen": 159026448, "step": 73695 }, { "epoch": 12.022838499184338, "grad_norm": 0.9456972479820251, "learning_rate": 2.056139297656417e-05, "loss": 0.1866, "num_input_tokens_seen": 159037392, "step": 73700 }, { "epoch": 12.023654159869494, "grad_norm": 0.5327953696250916, "learning_rate": 2.0557890572777643e-05, "loss": 0.2377, "num_input_tokens_seen": 159046064, "step": 73705 }, { "epoch": 12.02446982055465, "grad_norm": 3.1971774101257324, "learning_rate": 2.055438825901611e-05, "loss": 0.1164, "num_input_tokens_seen": 159057360, "step": 73710 }, { "epoch": 12.025285481239804, "grad_norm": 0.723044753074646, "learning_rate": 2.0550886035350532e-05, "loss": 0.053, "num_input_tokens_seen": 159067216, "step": 73715 }, { "epoch": 12.02610114192496, "grad_norm": 0.06450019776821136, "learning_rate": 2.0547383901851896e-05, "loss": 0.0444, "num_input_tokens_seen": 159077680, "step": 73720 }, { "epoch": 12.026916802610113, "grad_norm": 2.520843505859375, "learning_rate": 2.054388185859117e-05, "loss": 0.213, "num_input_tokens_seen": 159087856, "step": 73725 }, { "epoch": 12.02773246329527, "grad_norm": 0.33985719084739685, "learning_rate": 2.0540379905639336e-05, "loss": 0.0693, "num_input_tokens_seen": 159099376, "step": 73730 }, { "epoch": 12.028548123980425, "grad_norm": 0.31160658597946167, "learning_rate": 2.0536878043067353e-05, "loss": 0.1278, "num_input_tokens_seen": 159111184, "step": 73735 }, { "epoch": 12.029363784665579, "grad_norm": 0.20695538818836212, "learning_rate": 2.05333762709462e-05, "loss": 0.0375, "num_input_tokens_seen": 159122608, "step": 73740 }, { "epoch": 12.030179445350734, "grad_norm": 0.052701566368341446, "learning_rate": 2.052987458934685e-05, "loss": 0.0292, "num_input_tokens_seen": 159134160, "step": 73745 }, { "epoch": 12.030995106035888, "grad_norm": 0.38179948925971985, "learning_rate": 2.0526372998340252e-05, "loss": 0.0411, "num_input_tokens_seen": 159144912, "step": 73750 }, { "epoch": 12.031810766721044, "grad_norm": 2.0624332427978516, "learning_rate": 2.0522871497997392e-05, "loss": 0.1021, "num_input_tokens_seen": 159155984, "step": 73755 }, { "epoch": 12.0326264274062, "grad_norm": 1.0350006818771362, "learning_rate": 2.0519370088389213e-05, "loss": 0.0536, "num_input_tokens_seen": 159167088, "step": 73760 }, { "epoch": 12.033442088091354, "grad_norm": 0.8987144231796265, "learning_rate": 2.0515868769586686e-05, "loss": 0.0444, "num_input_tokens_seen": 159177840, "step": 73765 }, { "epoch": 12.03425774877651, "grad_norm": 0.04123670607805252, "learning_rate": 2.0512367541660765e-05, "loss": 0.0241, "num_input_tokens_seen": 159189360, "step": 73770 }, { "epoch": 12.035073409461663, "grad_norm": 0.25917863845825195, "learning_rate": 2.050886640468241e-05, "loss": 0.2531, "num_input_tokens_seen": 159199600, "step": 73775 }, { "epoch": 12.035889070146819, "grad_norm": 0.6491506695747375, "learning_rate": 2.050536535872258e-05, "loss": 0.2321, "num_input_tokens_seen": 159210832, "step": 73780 }, { "epoch": 12.036704730831975, "grad_norm": 0.04684143140912056, "learning_rate": 2.0501864403852217e-05, "loss": 0.0311, "num_input_tokens_seen": 159219984, "step": 73785 }, { "epoch": 12.037520391517129, "grad_norm": 1.4259302616119385, "learning_rate": 2.049836354014228e-05, "loss": 0.236, "num_input_tokens_seen": 159230608, "step": 73790 }, { "epoch": 12.038336052202284, "grad_norm": 0.5318652391433716, "learning_rate": 2.0494862767663712e-05, "loss": 0.0259, "num_input_tokens_seen": 159240784, "step": 73795 }, { "epoch": 12.039151712887438, "grad_norm": 0.9451073408126831, "learning_rate": 2.0491362086487466e-05, "loss": 0.2851, "num_input_tokens_seen": 159251312, "step": 73800 }, { "epoch": 12.039967373572594, "grad_norm": 0.16515007615089417, "learning_rate": 2.0487861496684495e-05, "loss": 0.3321, "num_input_tokens_seen": 159262384, "step": 73805 }, { "epoch": 12.040783034257748, "grad_norm": 2.345787525177002, "learning_rate": 2.0484360998325724e-05, "loss": 0.139, "num_input_tokens_seen": 159273552, "step": 73810 }, { "epoch": 12.041598694942904, "grad_norm": 0.045102763921022415, "learning_rate": 2.0480860591482117e-05, "loss": 0.1258, "num_input_tokens_seen": 159284080, "step": 73815 }, { "epoch": 12.04241435562806, "grad_norm": 0.046780042350292206, "learning_rate": 2.0477360276224595e-05, "loss": 0.0414, "num_input_tokens_seen": 159294256, "step": 73820 }, { "epoch": 12.043230016313213, "grad_norm": 0.4072761535644531, "learning_rate": 2.0473860052624112e-05, "loss": 0.1566, "num_input_tokens_seen": 159303568, "step": 73825 }, { "epoch": 12.044045676998369, "grad_norm": 0.04586273059248924, "learning_rate": 2.0470359920751588e-05, "loss": 0.04, "num_input_tokens_seen": 159314576, "step": 73830 }, { "epoch": 12.044861337683523, "grad_norm": 0.06254685670137405, "learning_rate": 2.0466859880677973e-05, "loss": 0.1222, "num_input_tokens_seen": 159326480, "step": 73835 }, { "epoch": 12.045676998368679, "grad_norm": 0.04516229405999184, "learning_rate": 2.0463359932474186e-05, "loss": 0.0978, "num_input_tokens_seen": 159336592, "step": 73840 }, { "epoch": 12.046492659053834, "grad_norm": 0.05880981311202049, "learning_rate": 2.0459860076211164e-05, "loss": 0.0428, "num_input_tokens_seen": 159347536, "step": 73845 }, { "epoch": 12.047308319738988, "grad_norm": 0.2749021351337433, "learning_rate": 2.045636031195985e-05, "loss": 0.0636, "num_input_tokens_seen": 159359184, "step": 73850 }, { "epoch": 12.048123980424144, "grad_norm": 0.06265296787023544, "learning_rate": 2.0452860639791144e-05, "loss": 0.2652, "num_input_tokens_seen": 159371920, "step": 73855 }, { "epoch": 12.048939641109298, "grad_norm": 0.8042765259742737, "learning_rate": 2.0449361059775997e-05, "loss": 0.2017, "num_input_tokens_seen": 159382480, "step": 73860 }, { "epoch": 12.049755301794454, "grad_norm": 0.25490760803222656, "learning_rate": 2.0445861571985313e-05, "loss": 0.1181, "num_input_tokens_seen": 159393392, "step": 73865 }, { "epoch": 12.05057096247961, "grad_norm": 1.1391807794570923, "learning_rate": 2.0442362176490033e-05, "loss": 0.0909, "num_input_tokens_seen": 159404656, "step": 73870 }, { "epoch": 12.051386623164763, "grad_norm": 1.0278229713439941, "learning_rate": 2.043886287336105e-05, "loss": 0.059, "num_input_tokens_seen": 159415056, "step": 73875 }, { "epoch": 12.052202283849919, "grad_norm": 0.07825042307376862, "learning_rate": 2.043536366266931e-05, "loss": 0.0336, "num_input_tokens_seen": 159426960, "step": 73880 }, { "epoch": 12.053017944535073, "grad_norm": 1.257601022720337, "learning_rate": 2.0431864544485706e-05, "loss": 0.0535, "num_input_tokens_seen": 159438384, "step": 73885 }, { "epoch": 12.053833605220229, "grad_norm": 0.8331125378608704, "learning_rate": 2.0428365518881172e-05, "loss": 0.1427, "num_input_tokens_seen": 159448016, "step": 73890 }, { "epoch": 12.054649265905383, "grad_norm": 0.2373921275138855, "learning_rate": 2.04248665859266e-05, "loss": 0.0605, "num_input_tokens_seen": 159459568, "step": 73895 }, { "epoch": 12.055464926590538, "grad_norm": 0.05223085358738899, "learning_rate": 2.042136774569292e-05, "loss": 0.0679, "num_input_tokens_seen": 159469680, "step": 73900 }, { "epoch": 12.056280587275694, "grad_norm": 0.39562657475471497, "learning_rate": 2.041786899825102e-05, "loss": 0.1226, "num_input_tokens_seen": 159480464, "step": 73905 }, { "epoch": 12.057096247960848, "grad_norm": 0.8447901606559753, "learning_rate": 2.0414370343671828e-05, "loss": 0.1225, "num_input_tokens_seen": 159489840, "step": 73910 }, { "epoch": 12.057911908646004, "grad_norm": 1.5449974536895752, "learning_rate": 2.0410871782026233e-05, "loss": 0.0595, "num_input_tokens_seen": 159501168, "step": 73915 }, { "epoch": 12.058727569331158, "grad_norm": 0.7123900055885315, "learning_rate": 2.0407373313385145e-05, "loss": 0.0926, "num_input_tokens_seen": 159512528, "step": 73920 }, { "epoch": 12.059543230016313, "grad_norm": 0.0496111661195755, "learning_rate": 2.0403874937819465e-05, "loss": 0.2247, "num_input_tokens_seen": 159523376, "step": 73925 }, { "epoch": 12.060358890701469, "grad_norm": 0.8766421675682068, "learning_rate": 2.0400376655400084e-05, "loss": 0.0347, "num_input_tokens_seen": 159534256, "step": 73930 }, { "epoch": 12.061174551386623, "grad_norm": 0.05502549931406975, "learning_rate": 2.0396878466197906e-05, "loss": 0.1647, "num_input_tokens_seen": 159545072, "step": 73935 }, { "epoch": 12.061990212071779, "grad_norm": 2.979703664779663, "learning_rate": 2.0393380370283833e-05, "loss": 0.2427, "num_input_tokens_seen": 159555024, "step": 73940 }, { "epoch": 12.062805872756933, "grad_norm": 0.3116609752178192, "learning_rate": 2.0389882367728743e-05, "loss": 0.0307, "num_input_tokens_seen": 159566288, "step": 73945 }, { "epoch": 12.063621533442088, "grad_norm": 0.2234586477279663, "learning_rate": 2.0386384458603534e-05, "loss": 0.0393, "num_input_tokens_seen": 159576528, "step": 73950 }, { "epoch": 12.064437194127244, "grad_norm": 1.615018606185913, "learning_rate": 2.03828866429791e-05, "loss": 0.0595, "num_input_tokens_seen": 159587088, "step": 73955 }, { "epoch": 12.065252854812398, "grad_norm": 0.06159951537847519, "learning_rate": 2.0379388920926323e-05, "loss": 0.1138, "num_input_tokens_seen": 159597648, "step": 73960 }, { "epoch": 12.066068515497554, "grad_norm": 0.053101908415555954, "learning_rate": 2.03758912925161e-05, "loss": 0.0974, "num_input_tokens_seen": 159607728, "step": 73965 }, { "epoch": 12.066884176182707, "grad_norm": 2.1072685718536377, "learning_rate": 2.0372393757819293e-05, "loss": 0.0829, "num_input_tokens_seen": 159618224, "step": 73970 }, { "epoch": 12.067699836867863, "grad_norm": 2.451535224914551, "learning_rate": 2.0368896316906813e-05, "loss": 0.0684, "num_input_tokens_seen": 159629136, "step": 73975 }, { "epoch": 12.068515497553017, "grad_norm": 0.5374718308448792, "learning_rate": 2.0365398969849507e-05, "loss": 0.1046, "num_input_tokens_seen": 159640016, "step": 73980 }, { "epoch": 12.069331158238173, "grad_norm": 0.0691375881433487, "learning_rate": 2.036190171671829e-05, "loss": 0.0245, "num_input_tokens_seen": 159651696, "step": 73985 }, { "epoch": 12.070146818923329, "grad_norm": 0.019315442070364952, "learning_rate": 2.0358404557583997e-05, "loss": 0.0591, "num_input_tokens_seen": 159663536, "step": 73990 }, { "epoch": 12.070962479608482, "grad_norm": 0.016872724518179893, "learning_rate": 2.0354907492517542e-05, "loss": 0.1998, "num_input_tokens_seen": 159674032, "step": 73995 }, { "epoch": 12.071778140293638, "grad_norm": 0.18093980848789215, "learning_rate": 2.0351410521589764e-05, "loss": 0.1843, "num_input_tokens_seen": 159684208, "step": 74000 }, { "epoch": 12.072593800978792, "grad_norm": 0.05286658555269241, "learning_rate": 2.0347913644871564e-05, "loss": 0.0989, "num_input_tokens_seen": 159695408, "step": 74005 }, { "epoch": 12.073409461663948, "grad_norm": 0.31082627177238464, "learning_rate": 2.0344416862433783e-05, "loss": 0.0943, "num_input_tokens_seen": 159706064, "step": 74010 }, { "epoch": 12.074225122349104, "grad_norm": 1.302574872970581, "learning_rate": 2.034092017434731e-05, "loss": 0.0975, "num_input_tokens_seen": 159716048, "step": 74015 }, { "epoch": 12.075040783034257, "grad_norm": 0.029176020994782448, "learning_rate": 2.033742358068299e-05, "loss": 0.0195, "num_input_tokens_seen": 159726352, "step": 74020 }, { "epoch": 12.075856443719413, "grad_norm": 0.536223828792572, "learning_rate": 2.0333927081511708e-05, "loss": 0.1612, "num_input_tokens_seen": 159737648, "step": 74025 }, { "epoch": 12.076672104404567, "grad_norm": 2.231325387954712, "learning_rate": 2.03304306769043e-05, "loss": 0.2349, "num_input_tokens_seen": 159748400, "step": 74030 }, { "epoch": 12.077487765089723, "grad_norm": 1.7566887140274048, "learning_rate": 2.032693436693165e-05, "loss": 0.0939, "num_input_tokens_seen": 159760144, "step": 74035 }, { "epoch": 12.078303425774878, "grad_norm": 1.7144023180007935, "learning_rate": 2.0323438151664597e-05, "loss": 0.1205, "num_input_tokens_seen": 159772048, "step": 74040 }, { "epoch": 12.079119086460032, "grad_norm": 0.24385520815849304, "learning_rate": 2.0319942031174004e-05, "loss": 0.0249, "num_input_tokens_seen": 159782128, "step": 74045 }, { "epoch": 12.079934747145188, "grad_norm": 1.0993090867996216, "learning_rate": 2.0316446005530722e-05, "loss": 0.0414, "num_input_tokens_seen": 159793840, "step": 74050 }, { "epoch": 12.080750407830342, "grad_norm": 1.7387663125991821, "learning_rate": 2.03129500748056e-05, "loss": 0.081, "num_input_tokens_seen": 159805488, "step": 74055 }, { "epoch": 12.081566068515498, "grad_norm": 1.2267296314239502, "learning_rate": 2.0309454239069496e-05, "loss": 0.095, "num_input_tokens_seen": 159815056, "step": 74060 }, { "epoch": 12.082381729200652, "grad_norm": 0.16081389784812927, "learning_rate": 2.0305958498393244e-05, "loss": 0.1264, "num_input_tokens_seen": 159826064, "step": 74065 }, { "epoch": 12.083197389885807, "grad_norm": 2.672398328781128, "learning_rate": 2.030246285284771e-05, "loss": 0.1241, "num_input_tokens_seen": 159836944, "step": 74070 }, { "epoch": 12.084013050570963, "grad_norm": 1.4557304382324219, "learning_rate": 2.0298967302503712e-05, "loss": 0.1839, "num_input_tokens_seen": 159847056, "step": 74075 }, { "epoch": 12.084828711256117, "grad_norm": 1.517777442932129, "learning_rate": 2.0295471847432116e-05, "loss": 0.3657, "num_input_tokens_seen": 159856848, "step": 74080 }, { "epoch": 12.085644371941273, "grad_norm": 0.05096715688705444, "learning_rate": 2.029197648770375e-05, "loss": 0.0308, "num_input_tokens_seen": 159869040, "step": 74085 }, { "epoch": 12.086460032626427, "grad_norm": 0.2837909162044525, "learning_rate": 2.0288481223389448e-05, "loss": 0.0526, "num_input_tokens_seen": 159880368, "step": 74090 }, { "epoch": 12.087275693311582, "grad_norm": 0.11179125308990479, "learning_rate": 2.0284986054560057e-05, "loss": 0.0294, "num_input_tokens_seen": 159891600, "step": 74095 }, { "epoch": 12.088091353996738, "grad_norm": 0.06888510286808014, "learning_rate": 2.0281490981286404e-05, "loss": 0.0608, "num_input_tokens_seen": 159903344, "step": 74100 }, { "epoch": 12.088907014681892, "grad_norm": 0.3845067322254181, "learning_rate": 2.027799600363931e-05, "loss": 0.0395, "num_input_tokens_seen": 159913904, "step": 74105 }, { "epoch": 12.089722675367048, "grad_norm": 0.2707254886627197, "learning_rate": 2.0274501121689635e-05, "loss": 0.0232, "num_input_tokens_seen": 159923952, "step": 74110 }, { "epoch": 12.090538336052202, "grad_norm": 0.03893445432186127, "learning_rate": 2.0271006335508178e-05, "loss": 0.0139, "num_input_tokens_seen": 159935280, "step": 74115 }, { "epoch": 12.091353996737357, "grad_norm": 0.8198590874671936, "learning_rate": 2.0267511645165787e-05, "loss": 0.099, "num_input_tokens_seen": 159947312, "step": 74120 }, { "epoch": 12.092169657422513, "grad_norm": 0.3597632348537445, "learning_rate": 2.0264017050733263e-05, "loss": 0.1177, "num_input_tokens_seen": 159957936, "step": 74125 }, { "epoch": 12.092985318107667, "grad_norm": 0.07743720710277557, "learning_rate": 2.0260522552281454e-05, "loss": 0.0325, "num_input_tokens_seen": 159969776, "step": 74130 }, { "epoch": 12.093800978792823, "grad_norm": 0.09640678763389587, "learning_rate": 2.0257028149881158e-05, "loss": 0.2416, "num_input_tokens_seen": 159981232, "step": 74135 }, { "epoch": 12.094616639477977, "grad_norm": 0.06010131910443306, "learning_rate": 2.025353384360321e-05, "loss": 0.114, "num_input_tokens_seen": 159990864, "step": 74140 }, { "epoch": 12.095432300163132, "grad_norm": 1.3333498239517212, "learning_rate": 2.0250039633518416e-05, "loss": 0.0527, "num_input_tokens_seen": 160002352, "step": 74145 }, { "epoch": 12.096247960848286, "grad_norm": 0.3321632742881775, "learning_rate": 2.0246545519697603e-05, "loss": 0.0272, "num_input_tokens_seen": 160013072, "step": 74150 }, { "epoch": 12.097063621533442, "grad_norm": 0.2694147527217865, "learning_rate": 2.0243051502211562e-05, "loss": 0.0844, "num_input_tokens_seen": 160024080, "step": 74155 }, { "epoch": 12.097879282218598, "grad_norm": 0.09302544593811035, "learning_rate": 2.023955758113113e-05, "loss": 0.1128, "num_input_tokens_seen": 160034480, "step": 74160 }, { "epoch": 12.098694942903752, "grad_norm": 0.08967649191617966, "learning_rate": 2.023606375652709e-05, "loss": 0.1127, "num_input_tokens_seen": 160044400, "step": 74165 }, { "epoch": 12.099510603588907, "grad_norm": 0.6812257170677185, "learning_rate": 2.0232570028470263e-05, "loss": 0.0237, "num_input_tokens_seen": 160055024, "step": 74170 }, { "epoch": 12.100326264274061, "grad_norm": 0.5748637318611145, "learning_rate": 2.022907639703146e-05, "loss": 0.0764, "num_input_tokens_seen": 160065808, "step": 74175 }, { "epoch": 12.101141924959217, "grad_norm": 0.577610194683075, "learning_rate": 2.022558286228147e-05, "loss": 0.0859, "num_input_tokens_seen": 160076592, "step": 74180 }, { "epoch": 12.101957585644373, "grad_norm": 1.0766456127166748, "learning_rate": 2.0222089424291106e-05, "loss": 0.0955, "num_input_tokens_seen": 160087760, "step": 74185 }, { "epoch": 12.102773246329527, "grad_norm": 0.2926221489906311, "learning_rate": 2.0218596083131156e-05, "loss": 0.0251, "num_input_tokens_seen": 160098256, "step": 74190 }, { "epoch": 12.103588907014682, "grad_norm": 1.6522119045257568, "learning_rate": 2.021510283887243e-05, "loss": 0.0986, "num_input_tokens_seen": 160107792, "step": 74195 }, { "epoch": 12.104404567699836, "grad_norm": 0.42952582240104675, "learning_rate": 2.0211609691585704e-05, "loss": 0.2646, "num_input_tokens_seen": 160118544, "step": 74200 }, { "epoch": 12.105220228384992, "grad_norm": 0.07167519629001617, "learning_rate": 2.020811664134179e-05, "loss": 0.1207, "num_input_tokens_seen": 160128528, "step": 74205 }, { "epoch": 12.106035889070148, "grad_norm": 0.7992002964019775, "learning_rate": 2.0204623688211467e-05, "loss": 0.1272, "num_input_tokens_seen": 160139024, "step": 74210 }, { "epoch": 12.106851549755302, "grad_norm": 0.053128406405448914, "learning_rate": 2.0201130832265535e-05, "loss": 0.0479, "num_input_tokens_seen": 160150192, "step": 74215 }, { "epoch": 12.107667210440457, "grad_norm": 0.06527543067932129, "learning_rate": 2.019763807357476e-05, "loss": 0.0369, "num_input_tokens_seen": 160162064, "step": 74220 }, { "epoch": 12.108482871125611, "grad_norm": 0.7819750905036926, "learning_rate": 2.0194145412209953e-05, "loss": 0.0407, "num_input_tokens_seen": 160172880, "step": 74225 }, { "epoch": 12.109298531810767, "grad_norm": 0.14624089002609253, "learning_rate": 2.0190652848241885e-05, "loss": 0.1439, "num_input_tokens_seen": 160183600, "step": 74230 }, { "epoch": 12.11011419249592, "grad_norm": 0.129075288772583, "learning_rate": 2.0187160381741337e-05, "loss": 0.0824, "num_input_tokens_seen": 160194000, "step": 74235 }, { "epoch": 12.110929853181077, "grad_norm": 0.9373652338981628, "learning_rate": 2.0183668012779088e-05, "loss": 0.2475, "num_input_tokens_seen": 160204176, "step": 74240 }, { "epoch": 12.111745513866232, "grad_norm": 0.42237377166748047, "learning_rate": 2.018017574142592e-05, "loss": 0.144, "num_input_tokens_seen": 160214896, "step": 74245 }, { "epoch": 12.112561174551386, "grad_norm": 0.07851959019899368, "learning_rate": 2.0176683567752598e-05, "loss": 0.0662, "num_input_tokens_seen": 160225456, "step": 74250 }, { "epoch": 12.113376835236542, "grad_norm": 0.24078570306301117, "learning_rate": 2.0173191491829912e-05, "loss": 0.2235, "num_input_tokens_seen": 160236464, "step": 74255 }, { "epoch": 12.114192495921696, "grad_norm": 0.10952159017324448, "learning_rate": 2.0169699513728615e-05, "loss": 0.0764, "num_input_tokens_seen": 160248624, "step": 74260 }, { "epoch": 12.115008156606851, "grad_norm": 0.08215100318193436, "learning_rate": 2.0166207633519495e-05, "loss": 0.0645, "num_input_tokens_seen": 160258672, "step": 74265 }, { "epoch": 12.115823817292007, "grad_norm": 0.34323108196258545, "learning_rate": 2.0162715851273296e-05, "loss": 0.0907, "num_input_tokens_seen": 160268400, "step": 74270 }, { "epoch": 12.116639477977161, "grad_norm": 2.0769448280334473, "learning_rate": 2.01592241670608e-05, "loss": 0.1226, "num_input_tokens_seen": 160279344, "step": 74275 }, { "epoch": 12.117455138662317, "grad_norm": 0.14278213679790497, "learning_rate": 2.0155732580952778e-05, "loss": 0.0143, "num_input_tokens_seen": 160289808, "step": 74280 }, { "epoch": 12.11827079934747, "grad_norm": 2.4996068477630615, "learning_rate": 2.0152241093019968e-05, "loss": 0.1597, "num_input_tokens_seen": 160300240, "step": 74285 }, { "epoch": 12.119086460032626, "grad_norm": 0.015686752274632454, "learning_rate": 2.0148749703333157e-05, "loss": 0.0105, "num_input_tokens_seen": 160311856, "step": 74290 }, { "epoch": 12.119902120717782, "grad_norm": 0.8145051598548889, "learning_rate": 2.0145258411963073e-05, "loss": 0.0294, "num_input_tokens_seen": 160323024, "step": 74295 }, { "epoch": 12.120717781402936, "grad_norm": 1.5695199966430664, "learning_rate": 2.0141767218980494e-05, "loss": 0.0918, "num_input_tokens_seen": 160334576, "step": 74300 }, { "epoch": 12.121533442088092, "grad_norm": 1.121124029159546, "learning_rate": 2.013827612445616e-05, "loss": 0.1943, "num_input_tokens_seen": 160344464, "step": 74305 }, { "epoch": 12.122349102773246, "grad_norm": 0.09190990030765533, "learning_rate": 2.0134785128460835e-05, "loss": 0.1083, "num_input_tokens_seen": 160355856, "step": 74310 }, { "epoch": 12.123164763458401, "grad_norm": 1.222235083580017, "learning_rate": 2.0131294231065254e-05, "loss": 0.2363, "num_input_tokens_seen": 160367216, "step": 74315 }, { "epoch": 12.123980424143557, "grad_norm": 0.9170172214508057, "learning_rate": 2.012780343234018e-05, "loss": 0.0815, "num_input_tokens_seen": 160378288, "step": 74320 }, { "epoch": 12.124796084828711, "grad_norm": 0.7440828680992126, "learning_rate": 2.012431273235634e-05, "loss": 0.1074, "num_input_tokens_seen": 160388432, "step": 74325 }, { "epoch": 12.125611745513867, "grad_norm": 0.38856422901153564, "learning_rate": 2.0120822131184498e-05, "loss": 0.2285, "num_input_tokens_seen": 160399664, "step": 74330 }, { "epoch": 12.12642740619902, "grad_norm": 1.7418615818023682, "learning_rate": 2.0117331628895376e-05, "loss": 0.1959, "num_input_tokens_seen": 160410064, "step": 74335 }, { "epoch": 12.127243066884176, "grad_norm": 0.1644587516784668, "learning_rate": 2.0113841225559733e-05, "loss": 0.0375, "num_input_tokens_seen": 160421488, "step": 74340 }, { "epoch": 12.12805872756933, "grad_norm": 0.029300455003976822, "learning_rate": 2.0110350921248283e-05, "loss": 0.0369, "num_input_tokens_seen": 160431088, "step": 74345 }, { "epoch": 12.128874388254486, "grad_norm": 0.06192915514111519, "learning_rate": 2.010686071603179e-05, "loss": 0.0499, "num_input_tokens_seen": 160441808, "step": 74350 }, { "epoch": 12.129690048939642, "grad_norm": 0.10280640423297882, "learning_rate": 2.0103370609980957e-05, "loss": 0.0096, "num_input_tokens_seen": 160452080, "step": 74355 }, { "epoch": 12.130505709624796, "grad_norm": 0.43344882130622864, "learning_rate": 2.009988060316654e-05, "loss": 0.1761, "num_input_tokens_seen": 160463824, "step": 74360 }, { "epoch": 12.131321370309951, "grad_norm": 0.9892233610153198, "learning_rate": 2.009639069565925e-05, "loss": 0.041, "num_input_tokens_seen": 160475120, "step": 74365 }, { "epoch": 12.132137030995105, "grad_norm": 0.036820702254772186, "learning_rate": 2.009290088752983e-05, "loss": 0.0743, "num_input_tokens_seen": 160486512, "step": 74370 }, { "epoch": 12.132952691680261, "grad_norm": 0.14375914633274078, "learning_rate": 2.0089411178849e-05, "loss": 0.0572, "num_input_tokens_seen": 160498128, "step": 74375 }, { "epoch": 12.133768352365417, "grad_norm": 0.023073261603713036, "learning_rate": 2.008592156968747e-05, "loss": 0.0273, "num_input_tokens_seen": 160509072, "step": 74380 }, { "epoch": 12.13458401305057, "grad_norm": 0.6984167695045471, "learning_rate": 2.0082432060115992e-05, "loss": 0.0802, "num_input_tokens_seen": 160519056, "step": 74385 }, { "epoch": 12.135399673735726, "grad_norm": 5.7674760818481445, "learning_rate": 2.0078942650205252e-05, "loss": 0.1791, "num_input_tokens_seen": 160529936, "step": 74390 }, { "epoch": 12.13621533442088, "grad_norm": 0.033660639077425, "learning_rate": 2.0075453340025987e-05, "loss": 0.0556, "num_input_tokens_seen": 160541296, "step": 74395 }, { "epoch": 12.137030995106036, "grad_norm": 0.03343970701098442, "learning_rate": 2.007196412964891e-05, "loss": 0.1621, "num_input_tokens_seen": 160552016, "step": 74400 }, { "epoch": 12.137846655791192, "grad_norm": 0.054792460054159164, "learning_rate": 2.0068475019144728e-05, "loss": 0.1214, "num_input_tokens_seen": 160562480, "step": 74405 }, { "epoch": 12.138662316476346, "grad_norm": 0.03273330628871918, "learning_rate": 2.0064986008584157e-05, "loss": 0.0827, "num_input_tokens_seen": 160572368, "step": 74410 }, { "epoch": 12.139477977161501, "grad_norm": 1.0769425630569458, "learning_rate": 2.0061497098037907e-05, "loss": 0.178, "num_input_tokens_seen": 160583088, "step": 74415 }, { "epoch": 12.140293637846655, "grad_norm": 0.8201781511306763, "learning_rate": 2.0058008287576673e-05, "loss": 0.0562, "num_input_tokens_seen": 160594576, "step": 74420 }, { "epoch": 12.141109298531811, "grad_norm": 2.0064857006073, "learning_rate": 2.0054519577271186e-05, "loss": 0.0737, "num_input_tokens_seen": 160605264, "step": 74425 }, { "epoch": 12.141924959216965, "grad_norm": 0.0616990327835083, "learning_rate": 2.0051030967192123e-05, "loss": 0.0776, "num_input_tokens_seen": 160617232, "step": 74430 }, { "epoch": 12.14274061990212, "grad_norm": 0.04557167738676071, "learning_rate": 2.0047542457410205e-05, "loss": 0.0189, "num_input_tokens_seen": 160630224, "step": 74435 }, { "epoch": 12.143556280587276, "grad_norm": 0.9132311940193176, "learning_rate": 2.0044054047996106e-05, "loss": 0.0677, "num_input_tokens_seen": 160641712, "step": 74440 }, { "epoch": 12.14437194127243, "grad_norm": 0.22538447380065918, "learning_rate": 2.004056573902056e-05, "loss": 0.1464, "num_input_tokens_seen": 160654256, "step": 74445 }, { "epoch": 12.145187601957586, "grad_norm": 0.2022649496793747, "learning_rate": 2.003707753055422e-05, "loss": 0.0712, "num_input_tokens_seen": 160664464, "step": 74450 }, { "epoch": 12.14600326264274, "grad_norm": 1.9083900451660156, "learning_rate": 2.003358942266782e-05, "loss": 0.1968, "num_input_tokens_seen": 160675248, "step": 74455 }, { "epoch": 12.146818923327896, "grad_norm": 1.3537554740905762, "learning_rate": 2.003010141543201e-05, "loss": 0.0849, "num_input_tokens_seen": 160685872, "step": 74460 }, { "epoch": 12.147634584013051, "grad_norm": 0.32955846190452576, "learning_rate": 2.0026613508917515e-05, "loss": 0.2282, "num_input_tokens_seen": 160696752, "step": 74465 }, { "epoch": 12.148450244698205, "grad_norm": 0.03912152349948883, "learning_rate": 2.0023125703195e-05, "loss": 0.0828, "num_input_tokens_seen": 160707824, "step": 74470 }, { "epoch": 12.149265905383361, "grad_norm": 0.1475573629140854, "learning_rate": 2.001963799833516e-05, "loss": 0.0075, "num_input_tokens_seen": 160719152, "step": 74475 }, { "epoch": 12.150081566068515, "grad_norm": 0.09313581138849258, "learning_rate": 2.0016150394408667e-05, "loss": 0.0773, "num_input_tokens_seen": 160729616, "step": 74480 }, { "epoch": 12.15089722675367, "grad_norm": 0.2843400835990906, "learning_rate": 2.0012662891486207e-05, "loss": 0.0818, "num_input_tokens_seen": 160740816, "step": 74485 }, { "epoch": 12.151712887438826, "grad_norm": 1.8626543283462524, "learning_rate": 2.000917548963847e-05, "loss": 0.1818, "num_input_tokens_seen": 160751600, "step": 74490 }, { "epoch": 12.15252854812398, "grad_norm": 0.24483810365200043, "learning_rate": 2.0005688188936118e-05, "loss": 0.0478, "num_input_tokens_seen": 160761936, "step": 74495 }, { "epoch": 12.153344208809136, "grad_norm": 0.3695647120475769, "learning_rate": 2.0002200989449834e-05, "loss": 0.0875, "num_input_tokens_seen": 160772272, "step": 74500 }, { "epoch": 12.15415986949429, "grad_norm": 1.093919277191162, "learning_rate": 1.999871389125028e-05, "loss": 0.1355, "num_input_tokens_seen": 160783568, "step": 74505 }, { "epoch": 12.154975530179446, "grad_norm": 0.8413657546043396, "learning_rate": 1.9995226894408146e-05, "loss": 0.1201, "num_input_tokens_seen": 160794480, "step": 74510 }, { "epoch": 12.1557911908646, "grad_norm": 1.2106894254684448, "learning_rate": 1.9991739998994074e-05, "loss": 0.0952, "num_input_tokens_seen": 160806512, "step": 74515 }, { "epoch": 12.156606851549755, "grad_norm": 1.0668931007385254, "learning_rate": 1.9988253205078756e-05, "loss": 0.1233, "num_input_tokens_seen": 160818448, "step": 74520 }, { "epoch": 12.15742251223491, "grad_norm": 0.03524381294846535, "learning_rate": 1.9984766512732837e-05, "loss": 0.2528, "num_input_tokens_seen": 160828976, "step": 74525 }, { "epoch": 12.158238172920065, "grad_norm": 0.027791626751422882, "learning_rate": 1.9981279922026995e-05, "loss": 0.022, "num_input_tokens_seen": 160839728, "step": 74530 }, { "epoch": 12.15905383360522, "grad_norm": 2.916539192199707, "learning_rate": 1.997779343303187e-05, "loss": 0.1832, "num_input_tokens_seen": 160849040, "step": 74535 }, { "epoch": 12.159869494290374, "grad_norm": 0.19271895289421082, "learning_rate": 1.997430704581814e-05, "loss": 0.0744, "num_input_tokens_seen": 160858960, "step": 74540 }, { "epoch": 12.16068515497553, "grad_norm": 0.07169803231954575, "learning_rate": 1.9970820760456453e-05, "loss": 0.0381, "num_input_tokens_seen": 160869200, "step": 74545 }, { "epoch": 12.161500815660686, "grad_norm": 1.4404113292694092, "learning_rate": 1.996733457701746e-05, "loss": 0.0343, "num_input_tokens_seen": 160880080, "step": 74550 }, { "epoch": 12.16231647634584, "grad_norm": 0.17403201758861542, "learning_rate": 1.996384849557182e-05, "loss": 0.112, "num_input_tokens_seen": 160891888, "step": 74555 }, { "epoch": 12.163132137030995, "grad_norm": 0.09635686129331589, "learning_rate": 1.9960362516190174e-05, "loss": 0.0303, "num_input_tokens_seen": 160902288, "step": 74560 }, { "epoch": 12.16394779771615, "grad_norm": 0.12137240171432495, "learning_rate": 1.9956876638943176e-05, "loss": 0.1144, "num_input_tokens_seen": 160912464, "step": 74565 }, { "epoch": 12.164763458401305, "grad_norm": 0.057487718760967255, "learning_rate": 1.9953390863901477e-05, "loss": 0.0337, "num_input_tokens_seen": 160924624, "step": 74570 }, { "epoch": 12.16557911908646, "grad_norm": 0.9543459415435791, "learning_rate": 1.9949905191135706e-05, "loss": 0.1731, "num_input_tokens_seen": 160936144, "step": 74575 }, { "epoch": 12.166394779771615, "grad_norm": 0.11427773535251617, "learning_rate": 1.9946419620716523e-05, "loss": 0.0709, "num_input_tokens_seen": 160946928, "step": 74580 }, { "epoch": 12.16721044045677, "grad_norm": 0.026557862758636475, "learning_rate": 1.9942934152714546e-05, "loss": 0.1039, "num_input_tokens_seen": 160957808, "step": 74585 }, { "epoch": 12.168026101141924, "grad_norm": 0.4987471401691437, "learning_rate": 1.9939448787200422e-05, "loss": 0.0542, "num_input_tokens_seen": 160968400, "step": 74590 }, { "epoch": 12.16884176182708, "grad_norm": 0.34112030267715454, "learning_rate": 1.99359635242448e-05, "loss": 0.0145, "num_input_tokens_seen": 160979472, "step": 74595 }, { "epoch": 12.169657422512234, "grad_norm": 0.027303988113999367, "learning_rate": 1.9932478363918293e-05, "loss": 0.0777, "num_input_tokens_seen": 160990512, "step": 74600 }, { "epoch": 12.17047308319739, "grad_norm": 0.23595838248729706, "learning_rate": 1.9928993306291548e-05, "loss": 0.012, "num_input_tokens_seen": 161002000, "step": 74605 }, { "epoch": 12.171288743882545, "grad_norm": 2.271609306335449, "learning_rate": 1.992550835143518e-05, "loss": 0.1415, "num_input_tokens_seen": 161013488, "step": 74610 }, { "epoch": 12.1721044045677, "grad_norm": 0.7814710140228271, "learning_rate": 1.9922023499419827e-05, "loss": 0.1968, "num_input_tokens_seen": 161024144, "step": 74615 }, { "epoch": 12.172920065252855, "grad_norm": 0.29116109013557434, "learning_rate": 1.9918538750316106e-05, "loss": 0.104, "num_input_tokens_seen": 161034736, "step": 74620 }, { "epoch": 12.173735725938009, "grad_norm": 0.2633648216724396, "learning_rate": 1.9915054104194654e-05, "loss": 0.1428, "num_input_tokens_seen": 161045904, "step": 74625 }, { "epoch": 12.174551386623165, "grad_norm": 0.05613790825009346, "learning_rate": 1.9911569561126066e-05, "loss": 0.0856, "num_input_tokens_seen": 161056272, "step": 74630 }, { "epoch": 12.17536704730832, "grad_norm": 0.5320746898651123, "learning_rate": 1.9908085121180984e-05, "loss": 0.1137, "num_input_tokens_seen": 161064784, "step": 74635 }, { "epoch": 12.176182707993474, "grad_norm": 1.2845540046691895, "learning_rate": 1.9904600784430015e-05, "loss": 0.1241, "num_input_tokens_seen": 161075568, "step": 74640 }, { "epoch": 12.17699836867863, "grad_norm": 0.3769015073776245, "learning_rate": 1.990111655094378e-05, "loss": 0.0294, "num_input_tokens_seen": 161086288, "step": 74645 }, { "epoch": 12.177814029363784, "grad_norm": 0.1622220277786255, "learning_rate": 1.9897632420792878e-05, "loss": 0.0765, "num_input_tokens_seen": 161095184, "step": 74650 }, { "epoch": 12.17862969004894, "grad_norm": 1.0033085346221924, "learning_rate": 1.9894148394047934e-05, "loss": 0.2527, "num_input_tokens_seen": 161106640, "step": 74655 }, { "epoch": 12.179445350734095, "grad_norm": 0.5154435038566589, "learning_rate": 1.9890664470779544e-05, "loss": 0.2586, "num_input_tokens_seen": 161118832, "step": 74660 }, { "epoch": 12.18026101141925, "grad_norm": 0.062215156853199005, "learning_rate": 1.988718065105833e-05, "loss": 0.0664, "num_input_tokens_seen": 161129136, "step": 74665 }, { "epoch": 12.181076672104405, "grad_norm": 0.5191398859024048, "learning_rate": 1.9883696934954872e-05, "loss": 0.0533, "num_input_tokens_seen": 161140848, "step": 74670 }, { "epoch": 12.181892332789559, "grad_norm": 0.026106391102075577, "learning_rate": 1.9880213322539804e-05, "loss": 0.0577, "num_input_tokens_seen": 161151344, "step": 74675 }, { "epoch": 12.182707993474715, "grad_norm": 0.28519320487976074, "learning_rate": 1.987672981388369e-05, "loss": 0.0982, "num_input_tokens_seen": 161162896, "step": 74680 }, { "epoch": 12.18352365415987, "grad_norm": 0.29181039333343506, "learning_rate": 1.9873246409057154e-05, "loss": 0.0297, "num_input_tokens_seen": 161174416, "step": 74685 }, { "epoch": 12.184339314845024, "grad_norm": 0.49279215931892395, "learning_rate": 1.9869763108130786e-05, "loss": 0.085, "num_input_tokens_seen": 161185520, "step": 74690 }, { "epoch": 12.18515497553018, "grad_norm": 1.6734888553619385, "learning_rate": 1.9866279911175167e-05, "loss": 0.1089, "num_input_tokens_seen": 161196528, "step": 74695 }, { "epoch": 12.185970636215334, "grad_norm": 0.02999173291027546, "learning_rate": 1.986279681826091e-05, "loss": 0.0623, "num_input_tokens_seen": 161206032, "step": 74700 }, { "epoch": 12.18678629690049, "grad_norm": 0.6789134740829468, "learning_rate": 1.9859313829458583e-05, "loss": 0.032, "num_input_tokens_seen": 161216944, "step": 74705 }, { "epoch": 12.187601957585644, "grad_norm": 0.11151740700006485, "learning_rate": 1.9855830944838787e-05, "loss": 0.0119, "num_input_tokens_seen": 161228176, "step": 74710 }, { "epoch": 12.1884176182708, "grad_norm": 0.3571975827217102, "learning_rate": 1.9852348164472105e-05, "loss": 0.0233, "num_input_tokens_seen": 161239376, "step": 74715 }, { "epoch": 12.189233278955955, "grad_norm": 1.094821810722351, "learning_rate": 1.9848865488429114e-05, "loss": 0.1916, "num_input_tokens_seen": 161250352, "step": 74720 }, { "epoch": 12.190048939641109, "grad_norm": 0.2345483899116516, "learning_rate": 1.98453829167804e-05, "loss": 0.0458, "num_input_tokens_seen": 161261392, "step": 74725 }, { "epoch": 12.190864600326265, "grad_norm": 0.28208765387535095, "learning_rate": 1.984190044959654e-05, "loss": 0.0352, "num_input_tokens_seen": 161273360, "step": 74730 }, { "epoch": 12.191680261011419, "grad_norm": 0.04701214283704758, "learning_rate": 1.9838418086948107e-05, "loss": 0.0905, "num_input_tokens_seen": 161285232, "step": 74735 }, { "epoch": 12.192495921696574, "grad_norm": 1.192848801612854, "learning_rate": 1.983493582890569e-05, "loss": 0.2042, "num_input_tokens_seen": 161296208, "step": 74740 }, { "epoch": 12.19331158238173, "grad_norm": 1.6078203916549683, "learning_rate": 1.9831453675539837e-05, "loss": 0.0406, "num_input_tokens_seen": 161307248, "step": 74745 }, { "epoch": 12.194127243066884, "grad_norm": 2.2138025760650635, "learning_rate": 1.9827971626921147e-05, "loss": 0.1614, "num_input_tokens_seen": 161317488, "step": 74750 }, { "epoch": 12.19494290375204, "grad_norm": 1.5686500072479248, "learning_rate": 1.9824489683120162e-05, "loss": 0.0325, "num_input_tokens_seen": 161327280, "step": 74755 }, { "epoch": 12.195758564437194, "grad_norm": 0.3999914526939392, "learning_rate": 1.9821007844207467e-05, "loss": 0.0589, "num_input_tokens_seen": 161338800, "step": 74760 }, { "epoch": 12.19657422512235, "grad_norm": 0.07817590236663818, "learning_rate": 1.981752611025361e-05, "loss": 0.0506, "num_input_tokens_seen": 161350960, "step": 74765 }, { "epoch": 12.197389885807505, "grad_norm": 0.057994354516267776, "learning_rate": 1.981404448132917e-05, "loss": 0.225, "num_input_tokens_seen": 161361360, "step": 74770 }, { "epoch": 12.198205546492659, "grad_norm": 0.13602153956890106, "learning_rate": 1.9810562957504684e-05, "loss": 0.1885, "num_input_tokens_seen": 161372400, "step": 74775 }, { "epoch": 12.199021207177815, "grad_norm": 0.18148812651634216, "learning_rate": 1.980708153885074e-05, "loss": 0.0165, "num_input_tokens_seen": 161383728, "step": 74780 }, { "epoch": 12.199836867862969, "grad_norm": 0.19615915417671204, "learning_rate": 1.980360022543786e-05, "loss": 0.0475, "num_input_tokens_seen": 161394416, "step": 74785 }, { "epoch": 12.200652528548124, "grad_norm": 0.8611027598381042, "learning_rate": 1.9800119017336626e-05, "loss": 0.1456, "num_input_tokens_seen": 161404944, "step": 74790 }, { "epoch": 12.201468189233278, "grad_norm": 0.14365936815738678, "learning_rate": 1.9796637914617568e-05, "loss": 0.0158, "num_input_tokens_seen": 161415760, "step": 74795 }, { "epoch": 12.202283849918434, "grad_norm": 0.6359469294548035, "learning_rate": 1.9793156917351253e-05, "loss": 0.1186, "num_input_tokens_seen": 161425872, "step": 74800 }, { "epoch": 12.20309951060359, "grad_norm": 0.05764611437916756, "learning_rate": 1.9789676025608207e-05, "loss": 0.0107, "num_input_tokens_seen": 161436688, "step": 74805 }, { "epoch": 12.203915171288743, "grad_norm": 0.5004457831382751, "learning_rate": 1.978619523945899e-05, "loss": 0.0939, "num_input_tokens_seen": 161447376, "step": 74810 }, { "epoch": 12.2047308319739, "grad_norm": 0.09798445552587509, "learning_rate": 1.9782714558974143e-05, "loss": 0.0151, "num_input_tokens_seen": 161457744, "step": 74815 }, { "epoch": 12.205546492659053, "grad_norm": 0.5141426920890808, "learning_rate": 1.97792339842242e-05, "loss": 0.1302, "num_input_tokens_seen": 161468912, "step": 74820 }, { "epoch": 12.206362153344209, "grad_norm": 0.11525008827447891, "learning_rate": 1.9775753515279713e-05, "loss": 0.0079, "num_input_tokens_seen": 161479920, "step": 74825 }, { "epoch": 12.207177814029365, "grad_norm": 0.5174884796142578, "learning_rate": 1.9772273152211197e-05, "loss": 0.1269, "num_input_tokens_seen": 161491088, "step": 74830 }, { "epoch": 12.207993474714518, "grad_norm": 1.031777262687683, "learning_rate": 1.9768792895089207e-05, "loss": 0.0564, "num_input_tokens_seen": 161500912, "step": 74835 }, { "epoch": 12.208809135399674, "grad_norm": 0.2583431005477905, "learning_rate": 1.9765312743984255e-05, "loss": 0.1535, "num_input_tokens_seen": 161511376, "step": 74840 }, { "epoch": 12.209624796084828, "grad_norm": 0.18030187487602234, "learning_rate": 1.976183269896689e-05, "loss": 0.104, "num_input_tokens_seen": 161521936, "step": 74845 }, { "epoch": 12.210440456769984, "grad_norm": 0.3714359402656555, "learning_rate": 1.9758352760107627e-05, "loss": 0.1333, "num_input_tokens_seen": 161531984, "step": 74850 }, { "epoch": 12.21125611745514, "grad_norm": 0.024721357971429825, "learning_rate": 1.9754872927476994e-05, "loss": 0.0767, "num_input_tokens_seen": 161542288, "step": 74855 }, { "epoch": 12.212071778140293, "grad_norm": 0.10154357552528381, "learning_rate": 1.9751393201145517e-05, "loss": 0.0766, "num_input_tokens_seen": 161552016, "step": 74860 }, { "epoch": 12.21288743882545, "grad_norm": 0.035953227430582047, "learning_rate": 1.9747913581183715e-05, "loss": 0.0239, "num_input_tokens_seen": 161562352, "step": 74865 }, { "epoch": 12.213703099510603, "grad_norm": 0.03805174678564072, "learning_rate": 1.9744434067662103e-05, "loss": 0.0079, "num_input_tokens_seen": 161572400, "step": 74870 }, { "epoch": 12.214518760195759, "grad_norm": 0.2315869927406311, "learning_rate": 1.9740954660651205e-05, "loss": 0.1192, "num_input_tokens_seen": 161582992, "step": 74875 }, { "epoch": 12.215334420880913, "grad_norm": 0.31006932258605957, "learning_rate": 1.973747536022153e-05, "loss": 0.0205, "num_input_tokens_seen": 161593808, "step": 74880 }, { "epoch": 12.216150081566068, "grad_norm": 0.6043400764465332, "learning_rate": 1.97339961664436e-05, "loss": 0.1444, "num_input_tokens_seen": 161603952, "step": 74885 }, { "epoch": 12.216965742251224, "grad_norm": 1.8452306985855103, "learning_rate": 1.973051707938791e-05, "loss": 0.1953, "num_input_tokens_seen": 161615600, "step": 74890 }, { "epoch": 12.217781402936378, "grad_norm": 0.15303437411785126, "learning_rate": 1.9727038099124983e-05, "loss": 0.0627, "num_input_tokens_seen": 161627088, "step": 74895 }, { "epoch": 12.218597063621534, "grad_norm": 2.6295197010040283, "learning_rate": 1.972355922572531e-05, "loss": 0.1976, "num_input_tokens_seen": 161638288, "step": 74900 }, { "epoch": 12.219412724306688, "grad_norm": 2.596317768096924, "learning_rate": 1.9720080459259414e-05, "loss": 0.1595, "num_input_tokens_seen": 161648048, "step": 74905 }, { "epoch": 12.220228384991843, "grad_norm": 1.426421046257019, "learning_rate": 1.9716601799797775e-05, "loss": 0.0724, "num_input_tokens_seen": 161659056, "step": 74910 }, { "epoch": 12.221044045676999, "grad_norm": 0.050621628761291504, "learning_rate": 1.9713123247410896e-05, "loss": 0.0449, "num_input_tokens_seen": 161670864, "step": 74915 }, { "epoch": 12.221859706362153, "grad_norm": 0.5563998818397522, "learning_rate": 1.97096448021693e-05, "loss": 0.1155, "num_input_tokens_seen": 161681328, "step": 74920 }, { "epoch": 12.222675367047309, "grad_norm": 0.15014925599098206, "learning_rate": 1.9706166464143448e-05, "loss": 0.0081, "num_input_tokens_seen": 161691312, "step": 74925 }, { "epoch": 12.223491027732463, "grad_norm": 0.08516101539134979, "learning_rate": 1.9702688233403866e-05, "loss": 0.0449, "num_input_tokens_seen": 161703120, "step": 74930 }, { "epoch": 12.224306688417618, "grad_norm": 0.19748817384243011, "learning_rate": 1.969921011002101e-05, "loss": 0.1531, "num_input_tokens_seen": 161714608, "step": 74935 }, { "epoch": 12.225122349102774, "grad_norm": 0.20027567446231842, "learning_rate": 1.96957320940654e-05, "loss": 0.1115, "num_input_tokens_seen": 161725552, "step": 74940 }, { "epoch": 12.225938009787928, "grad_norm": 0.37908926606178284, "learning_rate": 1.9692254185607496e-05, "loss": 0.1609, "num_input_tokens_seen": 161736816, "step": 74945 }, { "epoch": 12.226753670473084, "grad_norm": 1.0549378395080566, "learning_rate": 1.9688776384717802e-05, "loss": 0.2341, "num_input_tokens_seen": 161747152, "step": 74950 }, { "epoch": 12.227569331158238, "grad_norm": 0.019952120259404182, "learning_rate": 1.9685298691466785e-05, "loss": 0.0759, "num_input_tokens_seen": 161757808, "step": 74955 }, { "epoch": 12.228384991843393, "grad_norm": 0.23841767013072968, "learning_rate": 1.9681821105924942e-05, "loss": 0.0895, "num_input_tokens_seen": 161768880, "step": 74960 }, { "epoch": 12.229200652528547, "grad_norm": 0.04277052357792854, "learning_rate": 1.9678343628162732e-05, "loss": 0.1401, "num_input_tokens_seen": 161780816, "step": 74965 }, { "epoch": 12.230016313213703, "grad_norm": 0.03008297272026539, "learning_rate": 1.9674866258250646e-05, "loss": 0.0369, "num_input_tokens_seen": 161790864, "step": 74970 }, { "epoch": 12.230831973898859, "grad_norm": 0.06233006343245506, "learning_rate": 1.9671388996259143e-05, "loss": 0.0843, "num_input_tokens_seen": 161802352, "step": 74975 }, { "epoch": 12.231647634584013, "grad_norm": 0.09542547911405563, "learning_rate": 1.9667911842258712e-05, "loss": 0.0946, "num_input_tokens_seen": 161813584, "step": 74980 }, { "epoch": 12.232463295269168, "grad_norm": 1.368531584739685, "learning_rate": 1.96644347963198e-05, "loss": 0.1354, "num_input_tokens_seen": 161824208, "step": 74985 }, { "epoch": 12.233278955954322, "grad_norm": 0.014979783445596695, "learning_rate": 1.96609578585129e-05, "loss": 0.0896, "num_input_tokens_seen": 161835248, "step": 74990 }, { "epoch": 12.234094616639478, "grad_norm": 1.1386536359786987, "learning_rate": 1.965748102890845e-05, "loss": 0.2501, "num_input_tokens_seen": 161845392, "step": 74995 }, { "epoch": 12.234910277324634, "grad_norm": 0.8827974200248718, "learning_rate": 1.9654004307576928e-05, "loss": 0.0154, "num_input_tokens_seen": 161855984, "step": 75000 }, { "epoch": 12.235725938009788, "grad_norm": 1.5649909973144531, "learning_rate": 1.9650527694588787e-05, "loss": 0.2543, "num_input_tokens_seen": 161866800, "step": 75005 }, { "epoch": 12.236541598694943, "grad_norm": 0.3613203763961792, "learning_rate": 1.9647051190014494e-05, "loss": 0.2076, "num_input_tokens_seen": 161877648, "step": 75010 }, { "epoch": 12.237357259380097, "grad_norm": 0.5120903253555298, "learning_rate": 1.9643574793924495e-05, "loss": 0.0183, "num_input_tokens_seen": 161888400, "step": 75015 }, { "epoch": 12.238172920065253, "grad_norm": 1.0442898273468018, "learning_rate": 1.964009850638925e-05, "loss": 0.1065, "num_input_tokens_seen": 161898672, "step": 75020 }, { "epoch": 12.238988580750409, "grad_norm": 0.2557477355003357, "learning_rate": 1.9636622327479205e-05, "loss": 0.0299, "num_input_tokens_seen": 161908464, "step": 75025 }, { "epoch": 12.239804241435563, "grad_norm": 0.10171698778867722, "learning_rate": 1.9633146257264817e-05, "loss": 0.0477, "num_input_tokens_seen": 161919120, "step": 75030 }, { "epoch": 12.240619902120718, "grad_norm": 1.867599606513977, "learning_rate": 1.962967029581653e-05, "loss": 0.1717, "num_input_tokens_seen": 161929232, "step": 75035 }, { "epoch": 12.241435562805872, "grad_norm": 0.04638472571969032, "learning_rate": 1.9626194443204784e-05, "loss": 0.0813, "num_input_tokens_seen": 161939792, "step": 75040 }, { "epoch": 12.242251223491028, "grad_norm": 0.06667906790971756, "learning_rate": 1.9622718699500025e-05, "loss": 0.0302, "num_input_tokens_seen": 161951184, "step": 75045 }, { "epoch": 12.243066884176184, "grad_norm": 0.686163067817688, "learning_rate": 1.961924306477269e-05, "loss": 0.0405, "num_input_tokens_seen": 161963184, "step": 75050 }, { "epoch": 12.243882544861338, "grad_norm": 1.2084211111068726, "learning_rate": 1.9615767539093227e-05, "loss": 0.1175, "num_input_tokens_seen": 161973808, "step": 75055 }, { "epoch": 12.244698205546493, "grad_norm": 1.6355539560317993, "learning_rate": 1.9612292122532057e-05, "loss": 0.0636, "num_input_tokens_seen": 161985040, "step": 75060 }, { "epoch": 12.245513866231647, "grad_norm": 0.07616948336362839, "learning_rate": 1.9608816815159634e-05, "loss": 0.0604, "num_input_tokens_seen": 161996624, "step": 75065 }, { "epoch": 12.246329526916803, "grad_norm": 0.1285456120967865, "learning_rate": 1.9605341617046367e-05, "loss": 0.0294, "num_input_tokens_seen": 162007248, "step": 75070 }, { "epoch": 12.247145187601957, "grad_norm": 0.14178980886936188, "learning_rate": 1.9601866528262702e-05, "loss": 0.2414, "num_input_tokens_seen": 162018320, "step": 75075 }, { "epoch": 12.247960848287113, "grad_norm": 2.554535388946533, "learning_rate": 1.959839154887905e-05, "loss": 0.2894, "num_input_tokens_seen": 162029424, "step": 75080 }, { "epoch": 12.248776508972268, "grad_norm": 0.2689424455165863, "learning_rate": 1.9594916678965864e-05, "loss": 0.1539, "num_input_tokens_seen": 162041424, "step": 75085 }, { "epoch": 12.249592169657422, "grad_norm": 0.1549011468887329, "learning_rate": 1.959144191859353e-05, "loss": 0.1054, "num_input_tokens_seen": 162052240, "step": 75090 }, { "epoch": 12.250407830342578, "grad_norm": 1.5218535661697388, "learning_rate": 1.9587967267832498e-05, "loss": 0.0816, "num_input_tokens_seen": 162061904, "step": 75095 }, { "epoch": 12.251223491027732, "grad_norm": 0.07033657282590866, "learning_rate": 1.9584492726753168e-05, "loss": 0.1255, "num_input_tokens_seen": 162072272, "step": 75100 }, { "epoch": 12.252039151712887, "grad_norm": 0.06597834080457687, "learning_rate": 1.958101829542597e-05, "loss": 0.1692, "num_input_tokens_seen": 162084016, "step": 75105 }, { "epoch": 12.252854812398043, "grad_norm": 0.5871928334236145, "learning_rate": 1.9577543973921303e-05, "loss": 0.0666, "num_input_tokens_seen": 162094448, "step": 75110 }, { "epoch": 12.253670473083197, "grad_norm": 0.7521191239356995, "learning_rate": 1.9574069762309593e-05, "loss": 0.0856, "num_input_tokens_seen": 162105712, "step": 75115 }, { "epoch": 12.254486133768353, "grad_norm": 0.04270326346158981, "learning_rate": 1.9570595660661235e-05, "loss": 0.0098, "num_input_tokens_seen": 162117552, "step": 75120 }, { "epoch": 12.255301794453507, "grad_norm": 1.1773788928985596, "learning_rate": 1.956712166904664e-05, "loss": 0.1291, "num_input_tokens_seen": 162129264, "step": 75125 }, { "epoch": 12.256117455138662, "grad_norm": 0.17970198392868042, "learning_rate": 1.9563647787536234e-05, "loss": 0.1367, "num_input_tokens_seen": 162139504, "step": 75130 }, { "epoch": 12.256933115823816, "grad_norm": 0.05467544496059418, "learning_rate": 1.9560174016200384e-05, "loss": 0.0422, "num_input_tokens_seen": 162149584, "step": 75135 }, { "epoch": 12.257748776508972, "grad_norm": 1.198948621749878, "learning_rate": 1.9556700355109526e-05, "loss": 0.1018, "num_input_tokens_seen": 162160624, "step": 75140 }, { "epoch": 12.258564437194128, "grad_norm": 0.492478609085083, "learning_rate": 1.9553226804334025e-05, "loss": 0.0622, "num_input_tokens_seen": 162172016, "step": 75145 }, { "epoch": 12.259380097879282, "grad_norm": 0.7386695742607117, "learning_rate": 1.9549753363944306e-05, "loss": 0.175, "num_input_tokens_seen": 162182000, "step": 75150 }, { "epoch": 12.260195758564437, "grad_norm": 0.06437308341264725, "learning_rate": 1.9546280034010736e-05, "loss": 0.0657, "num_input_tokens_seen": 162192080, "step": 75155 }, { "epoch": 12.261011419249591, "grad_norm": 0.04338047280907631, "learning_rate": 1.954280681460373e-05, "loss": 0.0866, "num_input_tokens_seen": 162203600, "step": 75160 }, { "epoch": 12.261827079934747, "grad_norm": 1.7515634298324585, "learning_rate": 1.9539333705793657e-05, "loss": 0.16, "num_input_tokens_seen": 162214096, "step": 75165 }, { "epoch": 12.262642740619903, "grad_norm": 1.5081162452697754, "learning_rate": 1.9535860707650922e-05, "loss": 0.1321, "num_input_tokens_seen": 162223952, "step": 75170 }, { "epoch": 12.263458401305057, "grad_norm": 0.010753311216831207, "learning_rate": 1.9532387820245897e-05, "loss": 0.1114, "num_input_tokens_seen": 162235312, "step": 75175 }, { "epoch": 12.264274061990212, "grad_norm": 1.6787173748016357, "learning_rate": 1.9528915043648973e-05, "loss": 0.1762, "num_input_tokens_seen": 162245456, "step": 75180 }, { "epoch": 12.265089722675366, "grad_norm": 0.7054939866065979, "learning_rate": 1.9525442377930525e-05, "loss": 0.041, "num_input_tokens_seen": 162256080, "step": 75185 }, { "epoch": 12.265905383360522, "grad_norm": 1.2335515022277832, "learning_rate": 1.9521969823160932e-05, "loss": 0.1187, "num_input_tokens_seen": 162266256, "step": 75190 }, { "epoch": 12.266721044045678, "grad_norm": 1.2707830667495728, "learning_rate": 1.9518497379410565e-05, "loss": 0.1842, "num_input_tokens_seen": 162277456, "step": 75195 }, { "epoch": 12.267536704730832, "grad_norm": 0.12934285402297974, "learning_rate": 1.9515025046749812e-05, "loss": 0.0605, "num_input_tokens_seen": 162288432, "step": 75200 }, { "epoch": 12.268352365415987, "grad_norm": 2.7201836109161377, "learning_rate": 1.9511552825249025e-05, "loss": 0.3252, "num_input_tokens_seen": 162297968, "step": 75205 }, { "epoch": 12.269168026101141, "grad_norm": 2.4636566638946533, "learning_rate": 1.9508080714978592e-05, "loss": 0.1439, "num_input_tokens_seen": 162309520, "step": 75210 }, { "epoch": 12.269983686786297, "grad_norm": 2.349158763885498, "learning_rate": 1.950460871600886e-05, "loss": 0.2304, "num_input_tokens_seen": 162319440, "step": 75215 }, { "epoch": 12.270799347471453, "grad_norm": 1.205628752708435, "learning_rate": 1.9501136828410214e-05, "loss": 0.0444, "num_input_tokens_seen": 162331248, "step": 75220 }, { "epoch": 12.271615008156607, "grad_norm": 0.16212154924869537, "learning_rate": 1.9497665052252994e-05, "loss": 0.1176, "num_input_tokens_seen": 162341872, "step": 75225 }, { "epoch": 12.272430668841762, "grad_norm": 0.22598202526569366, "learning_rate": 1.9494193387607577e-05, "loss": 0.1872, "num_input_tokens_seen": 162352880, "step": 75230 }, { "epoch": 12.273246329526916, "grad_norm": 1.791427731513977, "learning_rate": 1.9490721834544324e-05, "loss": 0.261, "num_input_tokens_seen": 162364144, "step": 75235 }, { "epoch": 12.274061990212072, "grad_norm": 0.03324737399816513, "learning_rate": 1.948725039313357e-05, "loss": 0.1185, "num_input_tokens_seen": 162373616, "step": 75240 }, { "epoch": 12.274877650897226, "grad_norm": 1.7920039892196655, "learning_rate": 1.948377906344569e-05, "loss": 0.1312, "num_input_tokens_seen": 162383536, "step": 75245 }, { "epoch": 12.275693311582382, "grad_norm": 0.1904032826423645, "learning_rate": 1.9480307845551015e-05, "loss": 0.0431, "num_input_tokens_seen": 162394128, "step": 75250 }, { "epoch": 12.276508972267537, "grad_norm": 1.5685757398605347, "learning_rate": 1.9476836739519917e-05, "loss": 0.0431, "num_input_tokens_seen": 162404112, "step": 75255 }, { "epoch": 12.277324632952691, "grad_norm": 0.023385820910334587, "learning_rate": 1.9473365745422716e-05, "loss": 0.0432, "num_input_tokens_seen": 162414384, "step": 75260 }, { "epoch": 12.278140293637847, "grad_norm": 1.3877822160720825, "learning_rate": 1.9469894863329778e-05, "loss": 0.0596, "num_input_tokens_seen": 162425200, "step": 75265 }, { "epoch": 12.278955954323001, "grad_norm": 0.14892666041851044, "learning_rate": 1.946642409331143e-05, "loss": 0.1055, "num_input_tokens_seen": 162435312, "step": 75270 }, { "epoch": 12.279771615008157, "grad_norm": 0.7500154376029968, "learning_rate": 1.9462953435438027e-05, "loss": 0.0342, "num_input_tokens_seen": 162444816, "step": 75275 }, { "epoch": 12.280587275693312, "grad_norm": 0.03206545114517212, "learning_rate": 1.9459482889779884e-05, "loss": 0.057, "num_input_tokens_seen": 162456592, "step": 75280 }, { "epoch": 12.281402936378466, "grad_norm": 0.26585906744003296, "learning_rate": 1.945601245640736e-05, "loss": 0.1246, "num_input_tokens_seen": 162468432, "step": 75285 }, { "epoch": 12.282218597063622, "grad_norm": 0.04315520450472832, "learning_rate": 1.9452542135390768e-05, "loss": 0.197, "num_input_tokens_seen": 162478192, "step": 75290 }, { "epoch": 12.283034257748776, "grad_norm": 0.05081027001142502, "learning_rate": 1.9449071926800456e-05, "loss": 0.0407, "num_input_tokens_seen": 162489168, "step": 75295 }, { "epoch": 12.283849918433932, "grad_norm": 0.057815030217170715, "learning_rate": 1.9445601830706735e-05, "loss": 0.0282, "num_input_tokens_seen": 162500688, "step": 75300 }, { "epoch": 12.284665579119087, "grad_norm": 0.5509018898010254, "learning_rate": 1.9442131847179944e-05, "loss": 0.0222, "num_input_tokens_seen": 162510384, "step": 75305 }, { "epoch": 12.285481239804241, "grad_norm": 0.042243592441082, "learning_rate": 1.9438661976290396e-05, "loss": 0.0225, "num_input_tokens_seen": 162520848, "step": 75310 }, { "epoch": 12.286296900489397, "grad_norm": 0.1946413367986679, "learning_rate": 1.9435192218108424e-05, "loss": 0.1491, "num_input_tokens_seen": 162531824, "step": 75315 }, { "epoch": 12.28711256117455, "grad_norm": 1.4486994743347168, "learning_rate": 1.943172257270434e-05, "loss": 0.0628, "num_input_tokens_seen": 162542736, "step": 75320 }, { "epoch": 12.287928221859707, "grad_norm": 0.9837527871131897, "learning_rate": 1.942825304014846e-05, "loss": 0.0715, "num_input_tokens_seen": 162553968, "step": 75325 }, { "epoch": 12.28874388254486, "grad_norm": 1.4844192266464233, "learning_rate": 1.9424783620511103e-05, "loss": 0.1244, "num_input_tokens_seen": 162564432, "step": 75330 }, { "epoch": 12.289559543230016, "grad_norm": 0.17243173718452454, "learning_rate": 1.942131431386257e-05, "loss": 0.0331, "num_input_tokens_seen": 162574992, "step": 75335 }, { "epoch": 12.290375203915172, "grad_norm": 0.10694414377212524, "learning_rate": 1.941784512027319e-05, "loss": 0.0195, "num_input_tokens_seen": 162585936, "step": 75340 }, { "epoch": 12.291190864600326, "grad_norm": 0.02635575272142887, "learning_rate": 1.9414376039813255e-05, "loss": 0.1906, "num_input_tokens_seen": 162597520, "step": 75345 }, { "epoch": 12.292006525285482, "grad_norm": 1.8786777257919312, "learning_rate": 1.941090707255308e-05, "loss": 0.1061, "num_input_tokens_seen": 162607664, "step": 75350 }, { "epoch": 12.292822185970635, "grad_norm": 0.10403099656105042, "learning_rate": 1.9407438218562958e-05, "loss": 0.133, "num_input_tokens_seen": 162618160, "step": 75355 }, { "epoch": 12.293637846655791, "grad_norm": 1.5818403959274292, "learning_rate": 1.94039694779132e-05, "loss": 0.1583, "num_input_tokens_seen": 162628592, "step": 75360 }, { "epoch": 12.294453507340947, "grad_norm": 1.254078984260559, "learning_rate": 1.9400500850674093e-05, "loss": 0.0363, "num_input_tokens_seen": 162639056, "step": 75365 }, { "epoch": 12.2952691680261, "grad_norm": 0.21508660912513733, "learning_rate": 1.939703233691595e-05, "loss": 0.1075, "num_input_tokens_seen": 162650384, "step": 75370 }, { "epoch": 12.296084828711257, "grad_norm": 0.09858773648738861, "learning_rate": 1.9393563936709046e-05, "loss": 0.1975, "num_input_tokens_seen": 162661712, "step": 75375 }, { "epoch": 12.29690048939641, "grad_norm": 1.640403389930725, "learning_rate": 1.9390095650123685e-05, "loss": 0.1997, "num_input_tokens_seen": 162671344, "step": 75380 }, { "epoch": 12.297716150081566, "grad_norm": 0.08846747130155563, "learning_rate": 1.938662747723015e-05, "loss": 0.0184, "num_input_tokens_seen": 162682096, "step": 75385 }, { "epoch": 12.298531810766722, "grad_norm": 0.166397824883461, "learning_rate": 1.9383159418098735e-05, "loss": 0.1949, "num_input_tokens_seen": 162692432, "step": 75390 }, { "epoch": 12.299347471451876, "grad_norm": 1.445372462272644, "learning_rate": 1.937969147279971e-05, "loss": 0.0956, "num_input_tokens_seen": 162703984, "step": 75395 }, { "epoch": 12.300163132137031, "grad_norm": 0.4747191369533539, "learning_rate": 1.937622364140338e-05, "loss": 0.0315, "num_input_tokens_seen": 162714608, "step": 75400 }, { "epoch": 12.300978792822185, "grad_norm": 0.03447328507900238, "learning_rate": 1.937275592398e-05, "loss": 0.1015, "num_input_tokens_seen": 162726928, "step": 75405 }, { "epoch": 12.301794453507341, "grad_norm": 0.06624813377857208, "learning_rate": 1.936928832059987e-05, "loss": 0.107, "num_input_tokens_seen": 162738736, "step": 75410 }, { "epoch": 12.302610114192497, "grad_norm": 0.9302448034286499, "learning_rate": 1.9365820831333247e-05, "loss": 0.1494, "num_input_tokens_seen": 162749360, "step": 75415 }, { "epoch": 12.30342577487765, "grad_norm": 0.8006057739257812, "learning_rate": 1.9362353456250422e-05, "loss": 0.063, "num_input_tokens_seen": 162759632, "step": 75420 }, { "epoch": 12.304241435562806, "grad_norm": 1.307708501815796, "learning_rate": 1.9358886195421648e-05, "loss": 0.0357, "num_input_tokens_seen": 162770864, "step": 75425 }, { "epoch": 12.30505709624796, "grad_norm": 0.25295475125312805, "learning_rate": 1.9355419048917206e-05, "loss": 0.1926, "num_input_tokens_seen": 162781456, "step": 75430 }, { "epoch": 12.305872756933116, "grad_norm": 0.25154203176498413, "learning_rate": 1.9351952016807352e-05, "loss": 0.0873, "num_input_tokens_seen": 162792528, "step": 75435 }, { "epoch": 12.30668841761827, "grad_norm": 1.8916095495224, "learning_rate": 1.9348485099162362e-05, "loss": 0.2372, "num_input_tokens_seen": 162803408, "step": 75440 }, { "epoch": 12.307504078303426, "grad_norm": 0.9025664329528809, "learning_rate": 1.9345018296052487e-05, "loss": 0.1774, "num_input_tokens_seen": 162814704, "step": 75445 }, { "epoch": 12.308319738988581, "grad_norm": 0.14677000045776367, "learning_rate": 1.9341551607547982e-05, "loss": 0.0735, "num_input_tokens_seen": 162825360, "step": 75450 }, { "epoch": 12.309135399673735, "grad_norm": 0.23174485564231873, "learning_rate": 1.9338085033719127e-05, "loss": 0.2112, "num_input_tokens_seen": 162836272, "step": 75455 }, { "epoch": 12.309951060358891, "grad_norm": 0.043188583105802536, "learning_rate": 1.9334618574636152e-05, "loss": 0.0259, "num_input_tokens_seen": 162848240, "step": 75460 }, { "epoch": 12.310766721044045, "grad_norm": 0.36339271068573, "learning_rate": 1.9331152230369327e-05, "loss": 0.0335, "num_input_tokens_seen": 162859472, "step": 75465 }, { "epoch": 12.3115823817292, "grad_norm": 0.03307906165719032, "learning_rate": 1.932768600098888e-05, "loss": 0.1123, "num_input_tokens_seen": 162870864, "step": 75470 }, { "epoch": 12.312398042414356, "grad_norm": 0.7691472768783569, "learning_rate": 1.932421988656509e-05, "loss": 0.0457, "num_input_tokens_seen": 162880528, "step": 75475 }, { "epoch": 12.31321370309951, "grad_norm": 0.21589204668998718, "learning_rate": 1.9320753887168168e-05, "loss": 0.3091, "num_input_tokens_seen": 162891536, "step": 75480 }, { "epoch": 12.314029363784666, "grad_norm": 0.2492888867855072, "learning_rate": 1.931728800286838e-05, "loss": 0.0572, "num_input_tokens_seen": 162901360, "step": 75485 }, { "epoch": 12.31484502446982, "grad_norm": 0.23662707209587097, "learning_rate": 1.9313822233735958e-05, "loss": 0.0662, "num_input_tokens_seen": 162912624, "step": 75490 }, { "epoch": 12.315660685154976, "grad_norm": 0.05791477859020233, "learning_rate": 1.9310356579841144e-05, "loss": 0.0236, "num_input_tokens_seen": 162922032, "step": 75495 }, { "epoch": 12.31647634584013, "grad_norm": 0.059256140142679214, "learning_rate": 1.9306891041254172e-05, "loss": 0.0376, "num_input_tokens_seen": 162933488, "step": 75500 }, { "epoch": 12.317292006525285, "grad_norm": 0.4551151692867279, "learning_rate": 1.9303425618045275e-05, "loss": 0.2047, "num_input_tokens_seen": 162944976, "step": 75505 }, { "epoch": 12.318107667210441, "grad_norm": 0.027389371767640114, "learning_rate": 1.9299960310284678e-05, "loss": 0.0439, "num_input_tokens_seen": 162955664, "step": 75510 }, { "epoch": 12.318923327895595, "grad_norm": 0.04829046502709389, "learning_rate": 1.929649511804263e-05, "loss": 0.0197, "num_input_tokens_seen": 162966768, "step": 75515 }, { "epoch": 12.31973898858075, "grad_norm": 0.38460636138916016, "learning_rate": 1.929303004138933e-05, "loss": 0.0149, "num_input_tokens_seen": 162978192, "step": 75520 }, { "epoch": 12.320554649265905, "grad_norm": 0.4917415976524353, "learning_rate": 1.9289565080395024e-05, "loss": 0.1083, "num_input_tokens_seen": 162990160, "step": 75525 }, { "epoch": 12.32137030995106, "grad_norm": 1.2188445329666138, "learning_rate": 1.928610023512992e-05, "loss": 0.087, "num_input_tokens_seen": 163000848, "step": 75530 }, { "epoch": 12.322185970636216, "grad_norm": 0.7387290000915527, "learning_rate": 1.9282635505664253e-05, "loss": 0.0737, "num_input_tokens_seen": 163012048, "step": 75535 }, { "epoch": 12.32300163132137, "grad_norm": 1.8478163480758667, "learning_rate": 1.927917089206822e-05, "loss": 0.0695, "num_input_tokens_seen": 163021712, "step": 75540 }, { "epoch": 12.323817292006526, "grad_norm": 0.10504989326000214, "learning_rate": 1.9275706394412054e-05, "loss": 0.1128, "num_input_tokens_seen": 163032496, "step": 75545 }, { "epoch": 12.32463295269168, "grad_norm": 0.024937767535448074, "learning_rate": 1.9272242012765955e-05, "loss": 0.0171, "num_input_tokens_seen": 163043504, "step": 75550 }, { "epoch": 12.325448613376835, "grad_norm": 0.4664900302886963, "learning_rate": 1.926877774720013e-05, "loss": 0.0384, "num_input_tokens_seen": 163053520, "step": 75555 }, { "epoch": 12.326264274061991, "grad_norm": 1.0935287475585938, "learning_rate": 1.926531359778481e-05, "loss": 0.0591, "num_input_tokens_seen": 163065552, "step": 75560 }, { "epoch": 12.327079934747145, "grad_norm": 0.42380526661872864, "learning_rate": 1.9261849564590177e-05, "loss": 0.1821, "num_input_tokens_seen": 163076560, "step": 75565 }, { "epoch": 12.3278955954323, "grad_norm": 0.5201406478881836, "learning_rate": 1.9258385647686448e-05, "loss": 0.0322, "num_input_tokens_seen": 163087472, "step": 75570 }, { "epoch": 12.328711256117455, "grad_norm": 0.41276097297668457, "learning_rate": 1.925492184714381e-05, "loss": 0.3676, "num_input_tokens_seen": 163099120, "step": 75575 }, { "epoch": 12.32952691680261, "grad_norm": 0.06944406777620316, "learning_rate": 1.9251458163032475e-05, "loss": 0.1465, "num_input_tokens_seen": 163109456, "step": 75580 }, { "epoch": 12.330342577487766, "grad_norm": 1.9031834602355957, "learning_rate": 1.9247994595422626e-05, "loss": 0.1745, "num_input_tokens_seen": 163120816, "step": 75585 }, { "epoch": 12.33115823817292, "grad_norm": 0.2779527008533478, "learning_rate": 1.9244531144384474e-05, "loss": 0.1021, "num_input_tokens_seen": 163129456, "step": 75590 }, { "epoch": 12.331973898858076, "grad_norm": 0.039299190044403076, "learning_rate": 1.9241067809988186e-05, "loss": 0.1656, "num_input_tokens_seen": 163140336, "step": 75595 }, { "epoch": 12.33278955954323, "grad_norm": 0.38283514976501465, "learning_rate": 1.9237604592303975e-05, "loss": 0.03, "num_input_tokens_seen": 163151504, "step": 75600 }, { "epoch": 12.333605220228385, "grad_norm": 1.456646203994751, "learning_rate": 1.9234141491402012e-05, "loss": 0.1263, "num_input_tokens_seen": 163161648, "step": 75605 }, { "epoch": 12.33442088091354, "grad_norm": 1.451423168182373, "learning_rate": 1.9230678507352493e-05, "loss": 0.2136, "num_input_tokens_seen": 163172464, "step": 75610 }, { "epoch": 12.335236541598695, "grad_norm": 1.257140040397644, "learning_rate": 1.9227215640225583e-05, "loss": 0.2332, "num_input_tokens_seen": 163182800, "step": 75615 }, { "epoch": 12.33605220228385, "grad_norm": 1.3729543685913086, "learning_rate": 1.922375289009148e-05, "loss": 0.0494, "num_input_tokens_seen": 163193008, "step": 75620 }, { "epoch": 12.336867862969005, "grad_norm": 0.6638264656066895, "learning_rate": 1.9220290257020346e-05, "loss": 0.1692, "num_input_tokens_seen": 163202128, "step": 75625 }, { "epoch": 12.33768352365416, "grad_norm": 0.26147764921188354, "learning_rate": 1.9216827741082362e-05, "loss": 0.0308, "num_input_tokens_seen": 163213424, "step": 75630 }, { "epoch": 12.338499184339314, "grad_norm": 1.993900179862976, "learning_rate": 1.9213365342347704e-05, "loss": 0.219, "num_input_tokens_seen": 163222096, "step": 75635 }, { "epoch": 12.33931484502447, "grad_norm": 0.07868210971355438, "learning_rate": 1.920990306088654e-05, "loss": 0.1124, "num_input_tokens_seen": 163232336, "step": 75640 }, { "epoch": 12.340130505709626, "grad_norm": 0.05594965070486069, "learning_rate": 1.920644089676903e-05, "loss": 0.0538, "num_input_tokens_seen": 163243024, "step": 75645 }, { "epoch": 12.34094616639478, "grad_norm": 0.08240380883216858, "learning_rate": 1.920297885006535e-05, "loss": 0.023, "num_input_tokens_seen": 163253264, "step": 75650 }, { "epoch": 12.341761827079935, "grad_norm": 0.6653852462768555, "learning_rate": 1.919951692084565e-05, "loss": 0.0263, "num_input_tokens_seen": 163262768, "step": 75655 }, { "epoch": 12.34257748776509, "grad_norm": 0.4504534900188446, "learning_rate": 1.91960551091801e-05, "loss": 0.1948, "num_input_tokens_seen": 163274992, "step": 75660 }, { "epoch": 12.343393148450245, "grad_norm": 0.04209383949637413, "learning_rate": 1.919259341513886e-05, "loss": 0.0234, "num_input_tokens_seen": 163285584, "step": 75665 }, { "epoch": 12.3442088091354, "grad_norm": 0.08267101645469666, "learning_rate": 1.9189131838792086e-05, "loss": 0.2865, "num_input_tokens_seen": 163296528, "step": 75670 }, { "epoch": 12.345024469820554, "grad_norm": 0.8899127244949341, "learning_rate": 1.9185670380209918e-05, "loss": 0.1508, "num_input_tokens_seen": 163306768, "step": 75675 }, { "epoch": 12.34584013050571, "grad_norm": 0.1217726543545723, "learning_rate": 1.9182209039462518e-05, "loss": 0.1167, "num_input_tokens_seen": 163317744, "step": 75680 }, { "epoch": 12.346655791190864, "grad_norm": 1.4421793222427368, "learning_rate": 1.917874781662004e-05, "loss": 0.1095, "num_input_tokens_seen": 163329584, "step": 75685 }, { "epoch": 12.34747145187602, "grad_norm": 0.23403795063495636, "learning_rate": 1.917528671175261e-05, "loss": 0.0249, "num_input_tokens_seen": 163339344, "step": 75690 }, { "epoch": 12.348287112561174, "grad_norm": 1.2839738130569458, "learning_rate": 1.9171825724930393e-05, "loss": 0.2658, "num_input_tokens_seen": 163350096, "step": 75695 }, { "epoch": 12.34910277324633, "grad_norm": 1.520164132118225, "learning_rate": 1.9168364856223518e-05, "loss": 0.2355, "num_input_tokens_seen": 163359952, "step": 75700 }, { "epoch": 12.349918433931485, "grad_norm": 1.8262965679168701, "learning_rate": 1.916490410570213e-05, "loss": 0.1125, "num_input_tokens_seen": 163369776, "step": 75705 }, { "epoch": 12.350734094616639, "grad_norm": 0.12622544169425964, "learning_rate": 1.9161443473436357e-05, "loss": 0.1793, "num_input_tokens_seen": 163380688, "step": 75710 }, { "epoch": 12.351549755301795, "grad_norm": 0.07200136035680771, "learning_rate": 1.9157982959496347e-05, "loss": 0.0618, "num_input_tokens_seen": 163391920, "step": 75715 }, { "epoch": 12.352365415986949, "grad_norm": 0.6640815734863281, "learning_rate": 1.9154522563952217e-05, "loss": 0.0522, "num_input_tokens_seen": 163403728, "step": 75720 }, { "epoch": 12.353181076672104, "grad_norm": 0.7301119565963745, "learning_rate": 1.9151062286874104e-05, "loss": 0.3483, "num_input_tokens_seen": 163414416, "step": 75725 }, { "epoch": 12.35399673735726, "grad_norm": 0.1537054479122162, "learning_rate": 1.914760212833213e-05, "loss": 0.0753, "num_input_tokens_seen": 163425968, "step": 75730 }, { "epoch": 12.354812398042414, "grad_norm": 1.7298654317855835, "learning_rate": 1.9144142088396433e-05, "loss": 0.0827, "num_input_tokens_seen": 163434960, "step": 75735 }, { "epoch": 12.35562805872757, "grad_norm": 0.9321191906929016, "learning_rate": 1.9140682167137112e-05, "loss": 0.0485, "num_input_tokens_seen": 163446352, "step": 75740 }, { "epoch": 12.356443719412724, "grad_norm": 0.09315461665391922, "learning_rate": 1.9137222364624316e-05, "loss": 0.1918, "num_input_tokens_seen": 163457104, "step": 75745 }, { "epoch": 12.35725938009788, "grad_norm": 0.1512223780155182, "learning_rate": 1.9133762680928132e-05, "loss": 0.1068, "num_input_tokens_seen": 163468304, "step": 75750 }, { "epoch": 12.358075040783035, "grad_norm": 0.6888826489448547, "learning_rate": 1.91303031161187e-05, "loss": 0.1947, "num_input_tokens_seen": 163480368, "step": 75755 }, { "epoch": 12.358890701468189, "grad_norm": 0.10905945301055908, "learning_rate": 1.912684367026611e-05, "loss": 0.04, "num_input_tokens_seen": 163490448, "step": 75760 }, { "epoch": 12.359706362153345, "grad_norm": 0.14428561925888062, "learning_rate": 1.9123384343440486e-05, "loss": 0.087, "num_input_tokens_seen": 163501072, "step": 75765 }, { "epoch": 12.360522022838499, "grad_norm": 1.0426850318908691, "learning_rate": 1.9119925135711945e-05, "loss": 0.0543, "num_input_tokens_seen": 163511120, "step": 75770 }, { "epoch": 12.361337683523654, "grad_norm": 0.11861254274845123, "learning_rate": 1.9116466047150566e-05, "loss": 0.0719, "num_input_tokens_seen": 163521008, "step": 75775 }, { "epoch": 12.362153344208808, "grad_norm": 0.9977145791053772, "learning_rate": 1.911300707782648e-05, "loss": 0.2663, "num_input_tokens_seen": 163531856, "step": 75780 }, { "epoch": 12.362969004893964, "grad_norm": 1.5694736242294312, "learning_rate": 1.910954822780976e-05, "loss": 0.1496, "num_input_tokens_seen": 163542960, "step": 75785 }, { "epoch": 12.36378466557912, "grad_norm": 0.1701546460390091, "learning_rate": 1.9106089497170532e-05, "loss": 0.0427, "num_input_tokens_seen": 163552848, "step": 75790 }, { "epoch": 12.364600326264274, "grad_norm": 1.393155574798584, "learning_rate": 1.9102630885978863e-05, "loss": 0.0238, "num_input_tokens_seen": 163563952, "step": 75795 }, { "epoch": 12.36541598694943, "grad_norm": 0.5944346785545349, "learning_rate": 1.9099172394304867e-05, "loss": 0.1884, "num_input_tokens_seen": 163573904, "step": 75800 }, { "epoch": 12.366231647634583, "grad_norm": 0.05180436745285988, "learning_rate": 1.909571402221863e-05, "loss": 0.0908, "num_input_tokens_seen": 163584624, "step": 75805 }, { "epoch": 12.367047308319739, "grad_norm": 1.7779712677001953, "learning_rate": 1.9092255769790234e-05, "loss": 0.2064, "num_input_tokens_seen": 163596016, "step": 75810 }, { "epoch": 12.367862969004895, "grad_norm": 1.68020761013031, "learning_rate": 1.9088797637089774e-05, "loss": 0.0675, "num_input_tokens_seen": 163607344, "step": 75815 }, { "epoch": 12.368678629690049, "grad_norm": 0.08122847229242325, "learning_rate": 1.9085339624187325e-05, "loss": 0.1821, "num_input_tokens_seen": 163617936, "step": 75820 }, { "epoch": 12.369494290375204, "grad_norm": 0.08310708403587341, "learning_rate": 1.9081881731152974e-05, "loss": 0.0913, "num_input_tokens_seen": 163628400, "step": 75825 }, { "epoch": 12.370309951060358, "grad_norm": 2.517685651779175, "learning_rate": 1.90784239580568e-05, "loss": 0.1902, "num_input_tokens_seen": 163638704, "step": 75830 }, { "epoch": 12.371125611745514, "grad_norm": 0.1540214568376541, "learning_rate": 1.907496630496887e-05, "loss": 0.0147, "num_input_tokens_seen": 163650128, "step": 75835 }, { "epoch": 12.37194127243067, "grad_norm": 0.16256096959114075, "learning_rate": 1.9071508771959272e-05, "loss": 0.0536, "num_input_tokens_seen": 163660592, "step": 75840 }, { "epoch": 12.372756933115824, "grad_norm": 2.135932445526123, "learning_rate": 1.906805135909806e-05, "loss": 0.0834, "num_input_tokens_seen": 163670000, "step": 75845 }, { "epoch": 12.37357259380098, "grad_norm": 0.0917845144867897, "learning_rate": 1.9064594066455326e-05, "loss": 0.0929, "num_input_tokens_seen": 163680336, "step": 75850 }, { "epoch": 12.374388254486133, "grad_norm": 0.681501030921936, "learning_rate": 1.906113689410111e-05, "loss": 0.0914, "num_input_tokens_seen": 163690224, "step": 75855 }, { "epoch": 12.375203915171289, "grad_norm": 2.0988821983337402, "learning_rate": 1.9057679842105497e-05, "loss": 0.0927, "num_input_tokens_seen": 163700528, "step": 75860 }, { "epoch": 12.376019575856443, "grad_norm": 0.23872782289981842, "learning_rate": 1.9054222910538532e-05, "loss": 0.1226, "num_input_tokens_seen": 163711696, "step": 75865 }, { "epoch": 12.376835236541599, "grad_norm": 1.812633752822876, "learning_rate": 1.9050766099470282e-05, "loss": 0.1778, "num_input_tokens_seen": 163722640, "step": 75870 }, { "epoch": 12.377650897226754, "grad_norm": 0.18163399398326874, "learning_rate": 1.9047309408970815e-05, "loss": 0.0428, "num_input_tokens_seen": 163732400, "step": 75875 }, { "epoch": 12.378466557911908, "grad_norm": 1.6394224166870117, "learning_rate": 1.9043852839110165e-05, "loss": 0.193, "num_input_tokens_seen": 163742128, "step": 75880 }, { "epoch": 12.379282218597064, "grad_norm": 2.552729606628418, "learning_rate": 1.90403963899584e-05, "loss": 0.1578, "num_input_tokens_seen": 163752144, "step": 75885 }, { "epoch": 12.380097879282218, "grad_norm": 0.3477784991264343, "learning_rate": 1.9036940061585557e-05, "loss": 0.1495, "num_input_tokens_seen": 163762224, "step": 75890 }, { "epoch": 12.380913539967374, "grad_norm": 0.3073251247406006, "learning_rate": 1.9033483854061695e-05, "loss": 0.06, "num_input_tokens_seen": 163773584, "step": 75895 }, { "epoch": 12.38172920065253, "grad_norm": 0.03036167286336422, "learning_rate": 1.9030027767456844e-05, "loss": 0.107, "num_input_tokens_seen": 163785264, "step": 75900 }, { "epoch": 12.382544861337683, "grad_norm": 0.40789249539375305, "learning_rate": 1.9026571801841062e-05, "loss": 0.0759, "num_input_tokens_seen": 163795312, "step": 75905 }, { "epoch": 12.383360522022839, "grad_norm": 0.4812869429588318, "learning_rate": 1.902311595728437e-05, "loss": 0.0239, "num_input_tokens_seen": 163806064, "step": 75910 }, { "epoch": 12.384176182707993, "grad_norm": 2.596982955932617, "learning_rate": 1.901966023385683e-05, "loss": 0.1356, "num_input_tokens_seen": 163816016, "step": 75915 }, { "epoch": 12.384991843393149, "grad_norm": 0.48078805208206177, "learning_rate": 1.9016204631628444e-05, "loss": 0.0592, "num_input_tokens_seen": 163826704, "step": 75920 }, { "epoch": 12.385807504078304, "grad_norm": 0.883352518081665, "learning_rate": 1.9012749150669278e-05, "loss": 0.1315, "num_input_tokens_seen": 163838064, "step": 75925 }, { "epoch": 12.386623164763458, "grad_norm": 1.6432417631149292, "learning_rate": 1.9009293791049333e-05, "loss": 0.1396, "num_input_tokens_seen": 163848592, "step": 75930 }, { "epoch": 12.387438825448614, "grad_norm": 0.22621147334575653, "learning_rate": 1.9005838552838656e-05, "loss": 0.0841, "num_input_tokens_seen": 163858704, "step": 75935 }, { "epoch": 12.388254486133768, "grad_norm": 0.3280675709247589, "learning_rate": 1.9002383436107262e-05, "loss": 0.0778, "num_input_tokens_seen": 163868816, "step": 75940 }, { "epoch": 12.389070146818923, "grad_norm": 0.09132494032382965, "learning_rate": 1.8998928440925178e-05, "loss": 0.0535, "num_input_tokens_seen": 163878960, "step": 75945 }, { "epoch": 12.38988580750408, "grad_norm": 1.4176433086395264, "learning_rate": 1.8995473567362425e-05, "loss": 0.1361, "num_input_tokens_seen": 163889424, "step": 75950 }, { "epoch": 12.390701468189233, "grad_norm": 0.056380800902843475, "learning_rate": 1.8992018815489013e-05, "loss": 0.0433, "num_input_tokens_seen": 163899920, "step": 75955 }, { "epoch": 12.391517128874389, "grad_norm": 0.17557232081890106, "learning_rate": 1.8988564185374958e-05, "loss": 0.0799, "num_input_tokens_seen": 163910704, "step": 75960 }, { "epoch": 12.392332789559543, "grad_norm": 0.11090121418237686, "learning_rate": 1.8985109677090288e-05, "loss": 0.2098, "num_input_tokens_seen": 163923312, "step": 75965 }, { "epoch": 12.393148450244698, "grad_norm": 0.9733942151069641, "learning_rate": 1.8981655290704986e-05, "loss": 0.0902, "num_input_tokens_seen": 163933776, "step": 75970 }, { "epoch": 12.393964110929852, "grad_norm": 0.6566357612609863, "learning_rate": 1.8978201026289084e-05, "loss": 0.1825, "num_input_tokens_seen": 163944432, "step": 75975 }, { "epoch": 12.394779771615008, "grad_norm": 1.2877610921859741, "learning_rate": 1.8974746883912574e-05, "loss": 0.0828, "num_input_tokens_seen": 163954736, "step": 75980 }, { "epoch": 12.395595432300164, "grad_norm": 0.20733442902565002, "learning_rate": 1.8971292863645463e-05, "loss": 0.118, "num_input_tokens_seen": 163965104, "step": 75985 }, { "epoch": 12.396411092985318, "grad_norm": 0.13760113716125488, "learning_rate": 1.896783896555775e-05, "loss": 0.1551, "num_input_tokens_seen": 163975728, "step": 75990 }, { "epoch": 12.397226753670473, "grad_norm": 0.9972676634788513, "learning_rate": 1.896438518971943e-05, "loss": 0.1436, "num_input_tokens_seen": 163987600, "step": 75995 }, { "epoch": 12.398042414355627, "grad_norm": 0.2760924994945526, "learning_rate": 1.896093153620051e-05, "loss": 0.0491, "num_input_tokens_seen": 163999248, "step": 76000 }, { "epoch": 12.398858075040783, "grad_norm": 0.1825961321592331, "learning_rate": 1.8957478005070962e-05, "loss": 0.0648, "num_input_tokens_seen": 164008848, "step": 76005 }, { "epoch": 12.399673735725939, "grad_norm": 0.16942265629768372, "learning_rate": 1.8954024596400798e-05, "loss": 0.0115, "num_input_tokens_seen": 164019920, "step": 76010 }, { "epoch": 12.400489396411093, "grad_norm": 0.658324658870697, "learning_rate": 1.8950571310259985e-05, "loss": 0.1303, "num_input_tokens_seen": 164031632, "step": 76015 }, { "epoch": 12.401305057096248, "grad_norm": 0.40400269627571106, "learning_rate": 1.894711814671853e-05, "loss": 0.0446, "num_input_tokens_seen": 164042544, "step": 76020 }, { "epoch": 12.402120717781402, "grad_norm": 1.4261219501495361, "learning_rate": 1.8943665105846393e-05, "loss": 0.2279, "num_input_tokens_seen": 164054384, "step": 76025 }, { "epoch": 12.402936378466558, "grad_norm": 0.6421956419944763, "learning_rate": 1.894021218771358e-05, "loss": 0.0739, "num_input_tokens_seen": 164064208, "step": 76030 }, { "epoch": 12.403752039151712, "grad_norm": 0.34739378094673157, "learning_rate": 1.8936759392390038e-05, "loss": 0.1479, "num_input_tokens_seen": 164074576, "step": 76035 }, { "epoch": 12.404567699836868, "grad_norm": 1.4770294427871704, "learning_rate": 1.893330671994577e-05, "loss": 0.1596, "num_input_tokens_seen": 164084720, "step": 76040 }, { "epoch": 12.405383360522023, "grad_norm": 0.17884469032287598, "learning_rate": 1.8929854170450728e-05, "loss": 0.2985, "num_input_tokens_seen": 164095664, "step": 76045 }, { "epoch": 12.406199021207177, "grad_norm": 0.38647544384002686, "learning_rate": 1.8926401743974904e-05, "loss": 0.1408, "num_input_tokens_seen": 164106864, "step": 76050 }, { "epoch": 12.407014681892333, "grad_norm": 0.3689406216144562, "learning_rate": 1.8922949440588242e-05, "loss": 0.0395, "num_input_tokens_seen": 164118096, "step": 76055 }, { "epoch": 12.407830342577487, "grad_norm": 0.06862427294254303, "learning_rate": 1.891949726036073e-05, "loss": 0.0621, "num_input_tokens_seen": 164128912, "step": 76060 }, { "epoch": 12.408646003262643, "grad_norm": 0.6953986883163452, "learning_rate": 1.891604520336231e-05, "loss": 0.1178, "num_input_tokens_seen": 164139856, "step": 76065 }, { "epoch": 12.409461663947798, "grad_norm": 0.6638372540473938, "learning_rate": 1.891259326966296e-05, "loss": 0.114, "num_input_tokens_seen": 164152016, "step": 76070 }, { "epoch": 12.410277324632952, "grad_norm": 0.5118614435195923, "learning_rate": 1.890914145933262e-05, "loss": 0.0961, "num_input_tokens_seen": 164162512, "step": 76075 }, { "epoch": 12.411092985318108, "grad_norm": 0.039295315742492676, "learning_rate": 1.8905689772441255e-05, "loss": 0.0076, "num_input_tokens_seen": 164173232, "step": 76080 }, { "epoch": 12.411908646003262, "grad_norm": 2.0143465995788574, "learning_rate": 1.8902238209058828e-05, "loss": 0.2062, "num_input_tokens_seen": 164183408, "step": 76085 }, { "epoch": 12.412724306688418, "grad_norm": 0.07353629916906357, "learning_rate": 1.8898786769255273e-05, "loss": 0.019, "num_input_tokens_seen": 164195120, "step": 76090 }, { "epoch": 12.413539967373573, "grad_norm": 1.7910650968551636, "learning_rate": 1.889533545310055e-05, "loss": 0.1041, "num_input_tokens_seen": 164206288, "step": 76095 }, { "epoch": 12.414355628058727, "grad_norm": 0.29493191838264465, "learning_rate": 1.889188426066459e-05, "loss": 0.0954, "num_input_tokens_seen": 164216752, "step": 76100 }, { "epoch": 12.415171288743883, "grad_norm": 1.4698731899261475, "learning_rate": 1.8888433192017345e-05, "loss": 0.1578, "num_input_tokens_seen": 164228432, "step": 76105 }, { "epoch": 12.415986949429037, "grad_norm": 0.7583357095718384, "learning_rate": 1.888498224722876e-05, "loss": 0.2102, "num_input_tokens_seen": 164239216, "step": 76110 }, { "epoch": 12.416802610114193, "grad_norm": 0.9230397343635559, "learning_rate": 1.8881531426368766e-05, "loss": 0.159, "num_input_tokens_seen": 164249424, "step": 76115 }, { "epoch": 12.417618270799348, "grad_norm": 1.2247990369796753, "learning_rate": 1.88780807295073e-05, "loss": 0.1139, "num_input_tokens_seen": 164260944, "step": 76120 }, { "epoch": 12.418433931484502, "grad_norm": 2.958885669708252, "learning_rate": 1.887463015671429e-05, "loss": 0.0969, "num_input_tokens_seen": 164271408, "step": 76125 }, { "epoch": 12.419249592169658, "grad_norm": 0.06623557955026627, "learning_rate": 1.8871179708059668e-05, "loss": 0.1, "num_input_tokens_seen": 164282064, "step": 76130 }, { "epoch": 12.420065252854812, "grad_norm": 0.6495779156684875, "learning_rate": 1.8867729383613377e-05, "loss": 0.0495, "num_input_tokens_seen": 164293520, "step": 76135 }, { "epoch": 12.420880913539968, "grad_norm": 0.4839000105857849, "learning_rate": 1.8864279183445314e-05, "loss": 0.0245, "num_input_tokens_seen": 164304560, "step": 76140 }, { "epoch": 12.421696574225122, "grad_norm": 0.752739667892456, "learning_rate": 1.8860829107625433e-05, "loss": 0.1024, "num_input_tokens_seen": 164314288, "step": 76145 }, { "epoch": 12.422512234910277, "grad_norm": 0.0411144383251667, "learning_rate": 1.8857379156223622e-05, "loss": 0.0364, "num_input_tokens_seen": 164324688, "step": 76150 }, { "epoch": 12.423327895595433, "grad_norm": 0.7106959819793701, "learning_rate": 1.885392932930983e-05, "loss": 0.05, "num_input_tokens_seen": 164336304, "step": 76155 }, { "epoch": 12.424143556280587, "grad_norm": 0.08534025400876999, "learning_rate": 1.8850479626953944e-05, "loss": 0.0425, "num_input_tokens_seen": 164347248, "step": 76160 }, { "epoch": 12.424959216965743, "grad_norm": 2.1268444061279297, "learning_rate": 1.88470300492259e-05, "loss": 0.0899, "num_input_tokens_seen": 164357136, "step": 76165 }, { "epoch": 12.425774877650896, "grad_norm": 0.023307746276259422, "learning_rate": 1.884358059619559e-05, "loss": 0.0332, "num_input_tokens_seen": 164368112, "step": 76170 }, { "epoch": 12.426590538336052, "grad_norm": 0.07492318004369736, "learning_rate": 1.8840131267932938e-05, "loss": 0.0282, "num_input_tokens_seen": 164379376, "step": 76175 }, { "epoch": 12.427406199021208, "grad_norm": 0.10504747182130814, "learning_rate": 1.8836682064507826e-05, "loss": 0.0765, "num_input_tokens_seen": 164390896, "step": 76180 }, { "epoch": 12.428221859706362, "grad_norm": 0.7409861087799072, "learning_rate": 1.883323298599019e-05, "loss": 0.1581, "num_input_tokens_seen": 164402480, "step": 76185 }, { "epoch": 12.429037520391518, "grad_norm": 0.6537163257598877, "learning_rate": 1.882978403244989e-05, "loss": 0.0342, "num_input_tokens_seen": 164414224, "step": 76190 }, { "epoch": 12.429853181076671, "grad_norm": 0.4873903691768646, "learning_rate": 1.882633520395685e-05, "loss": 0.1587, "num_input_tokens_seen": 164425072, "step": 76195 }, { "epoch": 12.430668841761827, "grad_norm": 0.5719048976898193, "learning_rate": 1.882288650058097e-05, "loss": 0.0721, "num_input_tokens_seen": 164436400, "step": 76200 }, { "epoch": 12.431484502446983, "grad_norm": 1.163010835647583, "learning_rate": 1.8819437922392116e-05, "loss": 0.1711, "num_input_tokens_seen": 164446544, "step": 76205 }, { "epoch": 12.432300163132137, "grad_norm": 0.07322462648153305, "learning_rate": 1.881598946946021e-05, "loss": 0.0439, "num_input_tokens_seen": 164457680, "step": 76210 }, { "epoch": 12.433115823817293, "grad_norm": 0.1418592482805252, "learning_rate": 1.8812541141855107e-05, "loss": 0.1438, "num_input_tokens_seen": 164467824, "step": 76215 }, { "epoch": 12.433931484502446, "grad_norm": 0.02978726476430893, "learning_rate": 1.880909293964672e-05, "loss": 0.0338, "num_input_tokens_seen": 164478224, "step": 76220 }, { "epoch": 12.434747145187602, "grad_norm": 0.15192106366157532, "learning_rate": 1.8805644862904907e-05, "loss": 0.1576, "num_input_tokens_seen": 164489616, "step": 76225 }, { "epoch": 12.435562805872756, "grad_norm": 0.14209501445293427, "learning_rate": 1.880219691169957e-05, "loss": 0.0664, "num_input_tokens_seen": 164500144, "step": 76230 }, { "epoch": 12.436378466557912, "grad_norm": 0.10199271887540817, "learning_rate": 1.879874908610056e-05, "loss": 0.1068, "num_input_tokens_seen": 164510800, "step": 76235 }, { "epoch": 12.437194127243067, "grad_norm": 2.820265054702759, "learning_rate": 1.8795301386177783e-05, "loss": 0.145, "num_input_tokens_seen": 164523120, "step": 76240 }, { "epoch": 12.438009787928221, "grad_norm": 0.14775413274765015, "learning_rate": 1.879185381200108e-05, "loss": 0.1159, "num_input_tokens_seen": 164534288, "step": 76245 }, { "epoch": 12.438825448613377, "grad_norm": 1.1922602653503418, "learning_rate": 1.8788406363640342e-05, "loss": 0.1394, "num_input_tokens_seen": 164546480, "step": 76250 }, { "epoch": 12.439641109298531, "grad_norm": 0.37890008091926575, "learning_rate": 1.878495904116543e-05, "loss": 0.1046, "num_input_tokens_seen": 164557264, "step": 76255 }, { "epoch": 12.440456769983687, "grad_norm": 2.4100751876831055, "learning_rate": 1.8781511844646208e-05, "loss": 0.0611, "num_input_tokens_seen": 164568848, "step": 76260 }, { "epoch": 12.441272430668842, "grad_norm": 0.7109119296073914, "learning_rate": 1.877806477415254e-05, "loss": 0.2072, "num_input_tokens_seen": 164579920, "step": 76265 }, { "epoch": 12.442088091353996, "grad_norm": 0.09100636839866638, "learning_rate": 1.8774617829754276e-05, "loss": 0.0874, "num_input_tokens_seen": 164590416, "step": 76270 }, { "epoch": 12.442903752039152, "grad_norm": 0.8230168223381042, "learning_rate": 1.8771171011521272e-05, "loss": 0.0738, "num_input_tokens_seen": 164599312, "step": 76275 }, { "epoch": 12.443719412724306, "grad_norm": 0.0709991306066513, "learning_rate": 1.876772431952341e-05, "loss": 0.0553, "num_input_tokens_seen": 164610448, "step": 76280 }, { "epoch": 12.444535073409462, "grad_norm": 1.2785040140151978, "learning_rate": 1.87642777538305e-05, "loss": 0.1832, "num_input_tokens_seen": 164622448, "step": 76285 }, { "epoch": 12.445350734094617, "grad_norm": 0.21938267350196838, "learning_rate": 1.8760831314512427e-05, "loss": 0.1928, "num_input_tokens_seen": 164632432, "step": 76290 }, { "epoch": 12.446166394779771, "grad_norm": 1.0658239126205444, "learning_rate": 1.8757385001639012e-05, "loss": 0.0651, "num_input_tokens_seen": 164641744, "step": 76295 }, { "epoch": 12.446982055464927, "grad_norm": 0.45284828543663025, "learning_rate": 1.875393881528011e-05, "loss": 0.102, "num_input_tokens_seen": 164652816, "step": 76300 }, { "epoch": 12.447797716150081, "grad_norm": 0.03867784142494202, "learning_rate": 1.8750492755505573e-05, "loss": 0.2344, "num_input_tokens_seen": 164662992, "step": 76305 }, { "epoch": 12.448613376835237, "grad_norm": 0.42196714878082275, "learning_rate": 1.8747046822385215e-05, "loss": 0.0736, "num_input_tokens_seen": 164674288, "step": 76310 }, { "epoch": 12.449429037520392, "grad_norm": 0.13869835436344147, "learning_rate": 1.87436010159889e-05, "loss": 0.0424, "num_input_tokens_seen": 164686064, "step": 76315 }, { "epoch": 12.450244698205546, "grad_norm": 0.12803827226161957, "learning_rate": 1.874015533638643e-05, "loss": 0.1248, "num_input_tokens_seen": 164695472, "step": 76320 }, { "epoch": 12.451060358890702, "grad_norm": 0.19400106370449066, "learning_rate": 1.8736709783647667e-05, "loss": 0.1844, "num_input_tokens_seen": 164706704, "step": 76325 }, { "epoch": 12.451876019575856, "grad_norm": 1.0876277685165405, "learning_rate": 1.873326435784242e-05, "loss": 0.0305, "num_input_tokens_seen": 164718288, "step": 76330 }, { "epoch": 12.452691680261012, "grad_norm": 0.045701753348112106, "learning_rate": 1.8729819059040524e-05, "loss": 0.045, "num_input_tokens_seen": 164728496, "step": 76335 }, { "epoch": 12.453507340946166, "grad_norm": 0.4322223961353302, "learning_rate": 1.8726373887311793e-05, "loss": 0.0659, "num_input_tokens_seen": 164738768, "step": 76340 }, { "epoch": 12.454323001631321, "grad_norm": 1.0677671432495117, "learning_rate": 1.8722928842726062e-05, "loss": 0.035, "num_input_tokens_seen": 164750224, "step": 76345 }, { "epoch": 12.455138662316477, "grad_norm": 0.1979563683271408, "learning_rate": 1.8719483925353133e-05, "loss": 0.0965, "num_input_tokens_seen": 164762096, "step": 76350 }, { "epoch": 12.455954323001631, "grad_norm": 0.0580456480383873, "learning_rate": 1.8716039135262835e-05, "loss": 0.1148, "num_input_tokens_seen": 164773424, "step": 76355 }, { "epoch": 12.456769983686787, "grad_norm": 0.18883419036865234, "learning_rate": 1.8712594472524968e-05, "loss": 0.045, "num_input_tokens_seen": 164784496, "step": 76360 }, { "epoch": 12.45758564437194, "grad_norm": 0.9928102493286133, "learning_rate": 1.8709149937209363e-05, "loss": 0.0651, "num_input_tokens_seen": 164796752, "step": 76365 }, { "epoch": 12.458401305057096, "grad_norm": 0.6154401898384094, "learning_rate": 1.8705705529385804e-05, "loss": 0.0562, "num_input_tokens_seen": 164807696, "step": 76370 }, { "epoch": 12.459216965742252, "grad_norm": 0.047447603195905685, "learning_rate": 1.8702261249124118e-05, "loss": 0.1373, "num_input_tokens_seen": 164818064, "step": 76375 }, { "epoch": 12.460032626427406, "grad_norm": 0.0559195876121521, "learning_rate": 1.8698817096494083e-05, "loss": 0.0672, "num_input_tokens_seen": 164827504, "step": 76380 }, { "epoch": 12.460848287112562, "grad_norm": 0.28209394216537476, "learning_rate": 1.8695373071565526e-05, "loss": 0.0173, "num_input_tokens_seen": 164838992, "step": 76385 }, { "epoch": 12.461663947797716, "grad_norm": 0.3538020849227905, "learning_rate": 1.8691929174408223e-05, "loss": 0.0273, "num_input_tokens_seen": 164849136, "step": 76390 }, { "epoch": 12.462479608482871, "grad_norm": 0.13868097960948944, "learning_rate": 1.8688485405091984e-05, "loss": 0.0469, "num_input_tokens_seen": 164860048, "step": 76395 }, { "epoch": 12.463295269168025, "grad_norm": 0.14626744389533997, "learning_rate": 1.868504176368659e-05, "loss": 0.1842, "num_input_tokens_seen": 164871536, "step": 76400 }, { "epoch": 12.464110929853181, "grad_norm": 1.7536698579788208, "learning_rate": 1.8681598250261832e-05, "loss": 0.1248, "num_input_tokens_seen": 164881808, "step": 76405 }, { "epoch": 12.464926590538337, "grad_norm": 0.6889705061912537, "learning_rate": 1.867815486488751e-05, "loss": 0.1368, "num_input_tokens_seen": 164892432, "step": 76410 }, { "epoch": 12.46574225122349, "grad_norm": 1.0238348245620728, "learning_rate": 1.8674711607633396e-05, "loss": 0.2129, "num_input_tokens_seen": 164903024, "step": 76415 }, { "epoch": 12.466557911908646, "grad_norm": 1.481048583984375, "learning_rate": 1.867126847856927e-05, "loss": 0.3065, "num_input_tokens_seen": 164913680, "step": 76420 }, { "epoch": 12.4673735725938, "grad_norm": 0.23145700991153717, "learning_rate": 1.8667825477764925e-05, "loss": 0.0198, "num_input_tokens_seen": 164923632, "step": 76425 }, { "epoch": 12.468189233278956, "grad_norm": 2.420769453048706, "learning_rate": 1.8664382605290125e-05, "loss": 0.0522, "num_input_tokens_seen": 164934544, "step": 76430 }, { "epoch": 12.469004893964112, "grad_norm": 0.8170453906059265, "learning_rate": 1.8660939861214653e-05, "loss": 0.1205, "num_input_tokens_seen": 164945072, "step": 76435 }, { "epoch": 12.469820554649266, "grad_norm": 0.09194774925708771, "learning_rate": 1.8657497245608273e-05, "loss": 0.0282, "num_input_tokens_seen": 164955504, "step": 76440 }, { "epoch": 12.470636215334421, "grad_norm": 0.9089285135269165, "learning_rate": 1.8654054758540755e-05, "loss": 0.168, "num_input_tokens_seen": 164966160, "step": 76445 }, { "epoch": 12.471451876019575, "grad_norm": 1.765101671218872, "learning_rate": 1.8650612400081877e-05, "loss": 0.0759, "num_input_tokens_seen": 164976976, "step": 76450 }, { "epoch": 12.47226753670473, "grad_norm": 0.1205984503030777, "learning_rate": 1.8647170170301383e-05, "loss": 0.0284, "num_input_tokens_seen": 164987248, "step": 76455 }, { "epoch": 12.473083197389887, "grad_norm": 0.7150914072990417, "learning_rate": 1.8643728069269052e-05, "loss": 0.0573, "num_input_tokens_seen": 164998032, "step": 76460 }, { "epoch": 12.47389885807504, "grad_norm": 0.3437867760658264, "learning_rate": 1.864028609705463e-05, "loss": 0.0521, "num_input_tokens_seen": 165007856, "step": 76465 }, { "epoch": 12.474714518760196, "grad_norm": 1.532265305519104, "learning_rate": 1.8636844253727887e-05, "loss": 0.1483, "num_input_tokens_seen": 165018352, "step": 76470 }, { "epoch": 12.47553017944535, "grad_norm": 0.415626585483551, "learning_rate": 1.8633402539358555e-05, "loss": 0.0823, "num_input_tokens_seen": 165029040, "step": 76475 }, { "epoch": 12.476345840130506, "grad_norm": 0.5060930848121643, "learning_rate": 1.8629960954016407e-05, "loss": 0.0223, "num_input_tokens_seen": 165040080, "step": 76480 }, { "epoch": 12.477161500815662, "grad_norm": 1.9091323614120483, "learning_rate": 1.8626519497771174e-05, "loss": 0.1577, "num_input_tokens_seen": 165051120, "step": 76485 }, { "epoch": 12.477977161500815, "grad_norm": 0.5361989736557007, "learning_rate": 1.8623078170692617e-05, "loss": 0.1464, "num_input_tokens_seen": 165061488, "step": 76490 }, { "epoch": 12.478792822185971, "grad_norm": 2.7795052528381348, "learning_rate": 1.861963697285046e-05, "loss": 0.1372, "num_input_tokens_seen": 165071888, "step": 76495 }, { "epoch": 12.479608482871125, "grad_norm": 0.4938884973526001, "learning_rate": 1.8616195904314464e-05, "loss": 0.0886, "num_input_tokens_seen": 165083056, "step": 76500 }, { "epoch": 12.48042414355628, "grad_norm": 0.6891107559204102, "learning_rate": 1.861275496515435e-05, "loss": 0.0331, "num_input_tokens_seen": 165093072, "step": 76505 }, { "epoch": 12.481239804241435, "grad_norm": 0.5408276319503784, "learning_rate": 1.8609314155439856e-05, "loss": 0.1333, "num_input_tokens_seen": 165102768, "step": 76510 }, { "epoch": 12.48205546492659, "grad_norm": 0.049196068197488785, "learning_rate": 1.8605873475240727e-05, "loss": 0.1836, "num_input_tokens_seen": 165112976, "step": 76515 }, { "epoch": 12.482871125611746, "grad_norm": 0.050704605877399445, "learning_rate": 1.8602432924626675e-05, "loss": 0.0153, "num_input_tokens_seen": 165123568, "step": 76520 }, { "epoch": 12.4836867862969, "grad_norm": 0.13747665286064148, "learning_rate": 1.8598992503667445e-05, "loss": 0.0666, "num_input_tokens_seen": 165135472, "step": 76525 }, { "epoch": 12.484502446982056, "grad_norm": 1.2685394287109375, "learning_rate": 1.8595552212432743e-05, "loss": 0.126, "num_input_tokens_seen": 165147184, "step": 76530 }, { "epoch": 12.48531810766721, "grad_norm": 0.47457802295684814, "learning_rate": 1.8592112050992312e-05, "loss": 0.0389, "num_input_tokens_seen": 165157904, "step": 76535 }, { "epoch": 12.486133768352365, "grad_norm": 0.4392145872116089, "learning_rate": 1.8588672019415846e-05, "loss": 0.0965, "num_input_tokens_seen": 165169008, "step": 76540 }, { "epoch": 12.486949429037521, "grad_norm": 0.09363207221031189, "learning_rate": 1.8585232117773088e-05, "loss": 0.1185, "num_input_tokens_seen": 165178672, "step": 76545 }, { "epoch": 12.487765089722675, "grad_norm": 0.4757803976535797, "learning_rate": 1.8581792346133726e-05, "loss": 0.0554, "num_input_tokens_seen": 165189392, "step": 76550 }, { "epoch": 12.48858075040783, "grad_norm": 0.08501426130533218, "learning_rate": 1.8578352704567495e-05, "loss": 0.0689, "num_input_tokens_seen": 165200208, "step": 76555 }, { "epoch": 12.489396411092985, "grad_norm": 0.059193965047597885, "learning_rate": 1.8574913193144084e-05, "loss": 0.0659, "num_input_tokens_seen": 165210448, "step": 76560 }, { "epoch": 12.49021207177814, "grad_norm": 0.10330141335725784, "learning_rate": 1.857147381193321e-05, "loss": 0.0264, "num_input_tokens_seen": 165220688, "step": 76565 }, { "epoch": 12.491027732463296, "grad_norm": 1.4337241649627686, "learning_rate": 1.8568034561004577e-05, "loss": 0.0774, "num_input_tokens_seen": 165231536, "step": 76570 }, { "epoch": 12.49184339314845, "grad_norm": 0.2876136898994446, "learning_rate": 1.856459544042788e-05, "loss": 0.0717, "num_input_tokens_seen": 165242000, "step": 76575 }, { "epoch": 12.492659053833606, "grad_norm": 3.9443960189819336, "learning_rate": 1.8561156450272822e-05, "loss": 0.0958, "num_input_tokens_seen": 165253712, "step": 76580 }, { "epoch": 12.49347471451876, "grad_norm": 0.8259261846542358, "learning_rate": 1.85577175906091e-05, "loss": 0.1094, "num_input_tokens_seen": 165264240, "step": 76585 }, { "epoch": 12.494290375203915, "grad_norm": 0.08652016520500183, "learning_rate": 1.8554278861506395e-05, "loss": 0.1113, "num_input_tokens_seen": 165275376, "step": 76590 }, { "epoch": 12.49510603588907, "grad_norm": 1.0827370882034302, "learning_rate": 1.8550840263034418e-05, "loss": 0.067, "num_input_tokens_seen": 165286544, "step": 76595 }, { "epoch": 12.495921696574225, "grad_norm": 0.12968656420707703, "learning_rate": 1.854740179526283e-05, "loss": 0.0609, "num_input_tokens_seen": 165296016, "step": 76600 }, { "epoch": 12.49673735725938, "grad_norm": 1.9721033573150635, "learning_rate": 1.8543963458261344e-05, "loss": 0.1151, "num_input_tokens_seen": 165306800, "step": 76605 }, { "epoch": 12.497553017944535, "grad_norm": 0.3034038841724396, "learning_rate": 1.8540525252099622e-05, "loss": 0.0427, "num_input_tokens_seen": 165316816, "step": 76610 }, { "epoch": 12.49836867862969, "grad_norm": 2.013265371322632, "learning_rate": 1.8537087176847344e-05, "loss": 0.0587, "num_input_tokens_seen": 165326544, "step": 76615 }, { "epoch": 12.499184339314844, "grad_norm": 0.06792496144771576, "learning_rate": 1.8533649232574206e-05, "loss": 0.0518, "num_input_tokens_seen": 165337520, "step": 76620 }, { "epoch": 12.5, "grad_norm": 1.644836664199829, "learning_rate": 1.853021141934986e-05, "loss": 0.1084, "num_input_tokens_seen": 165348432, "step": 76625 }, { "epoch": 12.500815660685156, "grad_norm": 0.26453569531440735, "learning_rate": 1.8526773737243996e-05, "loss": 0.0483, "num_input_tokens_seen": 165358640, "step": 76630 }, { "epoch": 12.50163132137031, "grad_norm": 0.28511443734169006, "learning_rate": 1.8523336186326263e-05, "loss": 0.1471, "num_input_tokens_seen": 165367920, "step": 76635 }, { "epoch": 12.502446982055465, "grad_norm": 0.199983611702919, "learning_rate": 1.851989876666635e-05, "loss": 0.1088, "num_input_tokens_seen": 165377904, "step": 76640 }, { "epoch": 12.50326264274062, "grad_norm": 0.027881724759936333, "learning_rate": 1.85164614783339e-05, "loss": 0.1178, "num_input_tokens_seen": 165389520, "step": 76645 }, { "epoch": 12.504078303425775, "grad_norm": 3.0891826152801514, "learning_rate": 1.851302432139859e-05, "loss": 0.1017, "num_input_tokens_seen": 165400432, "step": 76650 }, { "epoch": 12.50489396411093, "grad_norm": 0.12188366055488586, "learning_rate": 1.8509587295930063e-05, "loss": 0.0646, "num_input_tokens_seen": 165410448, "step": 76655 }, { "epoch": 12.505709624796085, "grad_norm": 0.2076563835144043, "learning_rate": 1.8506150401997997e-05, "loss": 0.0627, "num_input_tokens_seen": 165420720, "step": 76660 }, { "epoch": 12.50652528548124, "grad_norm": 1.3312911987304688, "learning_rate": 1.8502713639672015e-05, "loss": 0.1214, "num_input_tokens_seen": 165430960, "step": 76665 }, { "epoch": 12.507340946166394, "grad_norm": 0.6604925394058228, "learning_rate": 1.8499277009021797e-05, "loss": 0.025, "num_input_tokens_seen": 165440976, "step": 76670 }, { "epoch": 12.50815660685155, "grad_norm": 1.671004056930542, "learning_rate": 1.8495840510116962e-05, "loss": 0.1139, "num_input_tokens_seen": 165452848, "step": 76675 }, { "epoch": 12.508972267536706, "grad_norm": 2.1724867820739746, "learning_rate": 1.8492404143027185e-05, "loss": 0.1477, "num_input_tokens_seen": 165463984, "step": 76680 }, { "epoch": 12.50978792822186, "grad_norm": 0.24735969305038452, "learning_rate": 1.8488967907822085e-05, "loss": 0.0962, "num_input_tokens_seen": 165474320, "step": 76685 }, { "epoch": 12.510603588907015, "grad_norm": 2.6970489025115967, "learning_rate": 1.8485531804571317e-05, "loss": 0.3835, "num_input_tokens_seen": 165485232, "step": 76690 }, { "epoch": 12.51141924959217, "grad_norm": 3.762531042098999, "learning_rate": 1.8482095833344505e-05, "loss": 0.0969, "num_input_tokens_seen": 165496528, "step": 76695 }, { "epoch": 12.512234910277325, "grad_norm": 0.031074993312358856, "learning_rate": 1.8478659994211296e-05, "loss": 0.0445, "num_input_tokens_seen": 165506864, "step": 76700 }, { "epoch": 12.513050570962479, "grad_norm": 0.13066013157367706, "learning_rate": 1.8475224287241305e-05, "loss": 0.1172, "num_input_tokens_seen": 165517232, "step": 76705 }, { "epoch": 12.513866231647635, "grad_norm": 0.45558667182922363, "learning_rate": 1.8471788712504175e-05, "loss": 0.0935, "num_input_tokens_seen": 165528112, "step": 76710 }, { "epoch": 12.51468189233279, "grad_norm": 0.028918810188770294, "learning_rate": 1.8468353270069532e-05, "loss": 0.3414, "num_input_tokens_seen": 165538640, "step": 76715 }, { "epoch": 12.515497553017944, "grad_norm": 0.025252392515540123, "learning_rate": 1.846491796000699e-05, "loss": 0.3187, "num_input_tokens_seen": 165549360, "step": 76720 }, { "epoch": 12.5163132137031, "grad_norm": 0.04229237139225006, "learning_rate": 1.8461482782386187e-05, "loss": 0.0404, "num_input_tokens_seen": 165561040, "step": 76725 }, { "epoch": 12.517128874388254, "grad_norm": 0.47137969732284546, "learning_rate": 1.8458047737276715e-05, "loss": 0.023, "num_input_tokens_seen": 165572080, "step": 76730 }, { "epoch": 12.51794453507341, "grad_norm": 0.14538951218128204, "learning_rate": 1.8454612824748212e-05, "loss": 0.0334, "num_input_tokens_seen": 165583792, "step": 76735 }, { "epoch": 12.518760195758565, "grad_norm": 0.19860601425170898, "learning_rate": 1.8451178044870283e-05, "loss": 0.0997, "num_input_tokens_seen": 165594096, "step": 76740 }, { "epoch": 12.51957585644372, "grad_norm": 0.5665508508682251, "learning_rate": 1.844774339771254e-05, "loss": 0.1178, "num_input_tokens_seen": 165603472, "step": 76745 }, { "epoch": 12.520391517128875, "grad_norm": 0.05292172729969025, "learning_rate": 1.844430888334459e-05, "loss": 0.0316, "num_input_tokens_seen": 165614960, "step": 76750 }, { "epoch": 12.521207177814029, "grad_norm": 0.6122546792030334, "learning_rate": 1.8440874501836037e-05, "loss": 0.0748, "num_input_tokens_seen": 165625488, "step": 76755 }, { "epoch": 12.522022838499185, "grad_norm": 0.8459052443504333, "learning_rate": 1.8437440253256478e-05, "loss": 0.0606, "num_input_tokens_seen": 165636368, "step": 76760 }, { "epoch": 12.522838499184338, "grad_norm": 1.200573205947876, "learning_rate": 1.8434006137675525e-05, "loss": 0.1207, "num_input_tokens_seen": 165646832, "step": 76765 }, { "epoch": 12.523654159869494, "grad_norm": 1.95664381980896, "learning_rate": 1.843057215516276e-05, "loss": 0.1906, "num_input_tokens_seen": 165657744, "step": 76770 }, { "epoch": 12.52446982055465, "grad_norm": 3.040116786956787, "learning_rate": 1.8427138305787794e-05, "loss": 0.0426, "num_input_tokens_seen": 165668944, "step": 76775 }, { "epoch": 12.525285481239804, "grad_norm": 1.1881989240646362, "learning_rate": 1.8423704589620196e-05, "loss": 0.1862, "num_input_tokens_seen": 165679312, "step": 76780 }, { "epoch": 12.52610114192496, "grad_norm": 0.3671068847179413, "learning_rate": 1.842027100672958e-05, "loss": 0.2886, "num_input_tokens_seen": 165690704, "step": 76785 }, { "epoch": 12.526916802610113, "grad_norm": 0.7696678638458252, "learning_rate": 1.8416837557185507e-05, "loss": 0.0311, "num_input_tokens_seen": 165702000, "step": 76790 }, { "epoch": 12.52773246329527, "grad_norm": 0.0361882820725441, "learning_rate": 1.8413404241057584e-05, "loss": 0.1161, "num_input_tokens_seen": 165713360, "step": 76795 }, { "epoch": 12.528548123980425, "grad_norm": 1.767483115196228, "learning_rate": 1.840997105841537e-05, "loss": 0.061, "num_input_tokens_seen": 165724400, "step": 76800 }, { "epoch": 12.529363784665579, "grad_norm": 0.08073743432760239, "learning_rate": 1.840653800932846e-05, "loss": 0.052, "num_input_tokens_seen": 165735088, "step": 76805 }, { "epoch": 12.530179445350734, "grad_norm": 0.22539421916007996, "learning_rate": 1.8403105093866413e-05, "loss": 0.0497, "num_input_tokens_seen": 165746064, "step": 76810 }, { "epoch": 12.530995106035888, "grad_norm": 0.07125022262334824, "learning_rate": 1.839967231209882e-05, "loss": 0.095, "num_input_tokens_seen": 165755824, "step": 76815 }, { "epoch": 12.531810766721044, "grad_norm": 0.05227868631482124, "learning_rate": 1.839623966409523e-05, "loss": 0.0933, "num_input_tokens_seen": 165766896, "step": 76820 }, { "epoch": 12.5326264274062, "grad_norm": 0.2691635191440582, "learning_rate": 1.839280714992522e-05, "loss": 0.0843, "num_input_tokens_seen": 165777776, "step": 76825 }, { "epoch": 12.533442088091354, "grad_norm": 0.5459002256393433, "learning_rate": 1.8389374769658367e-05, "loss": 0.2908, "num_input_tokens_seen": 165789168, "step": 76830 }, { "epoch": 12.53425774877651, "grad_norm": 0.7022878527641296, "learning_rate": 1.8385942523364207e-05, "loss": 0.1481, "num_input_tokens_seen": 165799024, "step": 76835 }, { "epoch": 12.535073409461663, "grad_norm": 0.36172524094581604, "learning_rate": 1.8382510411112326e-05, "loss": 0.105, "num_input_tokens_seen": 165810256, "step": 76840 }, { "epoch": 12.535889070146819, "grad_norm": 0.9954974055290222, "learning_rate": 1.8379078432972256e-05, "loss": 0.147, "num_input_tokens_seen": 165822288, "step": 76845 }, { "epoch": 12.536704730831975, "grad_norm": 0.19250325858592987, "learning_rate": 1.8375646589013572e-05, "loss": 0.0275, "num_input_tokens_seen": 165833136, "step": 76850 }, { "epoch": 12.537520391517129, "grad_norm": 0.4342619776725769, "learning_rate": 1.83722148793058e-05, "loss": 0.02, "num_input_tokens_seen": 165842608, "step": 76855 }, { "epoch": 12.538336052202284, "grad_norm": 0.024853425100445747, "learning_rate": 1.8368783303918514e-05, "loss": 0.0587, "num_input_tokens_seen": 165853680, "step": 76860 }, { "epoch": 12.539151712887438, "grad_norm": 1.6722745895385742, "learning_rate": 1.8365351862921237e-05, "loss": 0.0853, "num_input_tokens_seen": 165864176, "step": 76865 }, { "epoch": 12.539967373572594, "grad_norm": 0.23489780724048615, "learning_rate": 1.8361920556383528e-05, "loss": 0.0306, "num_input_tokens_seen": 165874864, "step": 76870 }, { "epoch": 12.540783034257748, "grad_norm": 2.524444341659546, "learning_rate": 1.8358489384374912e-05, "loss": 0.0997, "num_input_tokens_seen": 165886032, "step": 76875 }, { "epoch": 12.541598694942904, "grad_norm": 0.09031470119953156, "learning_rate": 1.8355058346964938e-05, "loss": 0.069, "num_input_tokens_seen": 165897392, "step": 76880 }, { "epoch": 12.54241435562806, "grad_norm": 0.2928479015827179, "learning_rate": 1.835162744422314e-05, "loss": 0.2328, "num_input_tokens_seen": 165909008, "step": 76885 }, { "epoch": 12.543230016313213, "grad_norm": 0.7989332675933838, "learning_rate": 1.8348196676219044e-05, "loss": 0.1757, "num_input_tokens_seen": 165919888, "step": 76890 }, { "epoch": 12.544045676998369, "grad_norm": 0.06988247483968735, "learning_rate": 1.8344766043022177e-05, "loss": 0.1144, "num_input_tokens_seen": 165930896, "step": 76895 }, { "epoch": 12.544861337683523, "grad_norm": 0.0400506965816021, "learning_rate": 1.834133554470207e-05, "loss": 0.0248, "num_input_tokens_seen": 165942256, "step": 76900 }, { "epoch": 12.545676998368679, "grad_norm": 0.01629302278161049, "learning_rate": 1.8337905181328242e-05, "loss": 0.1013, "num_input_tokens_seen": 165953328, "step": 76905 }, { "epoch": 12.546492659053834, "grad_norm": 0.12270297110080719, "learning_rate": 1.8334474952970227e-05, "loss": 0.0265, "num_input_tokens_seen": 165964592, "step": 76910 }, { "epoch": 12.547308319738988, "grad_norm": 1.392996907234192, "learning_rate": 1.8331044859697525e-05, "loss": 0.0843, "num_input_tokens_seen": 165975344, "step": 76915 }, { "epoch": 12.548123980424144, "grad_norm": 0.03396030515432358, "learning_rate": 1.8327614901579666e-05, "loss": 0.1526, "num_input_tokens_seen": 165985936, "step": 76920 }, { "epoch": 12.548939641109298, "grad_norm": 1.0817210674285889, "learning_rate": 1.8324185078686144e-05, "loss": 0.2345, "num_input_tokens_seen": 165995952, "step": 76925 }, { "epoch": 12.549755301794454, "grad_norm": 0.16609205305576324, "learning_rate": 1.83207553910865e-05, "loss": 0.0612, "num_input_tokens_seen": 166005936, "step": 76930 }, { "epoch": 12.550570962479608, "grad_norm": 0.3850753903388977, "learning_rate": 1.8317325838850196e-05, "loss": 0.1388, "num_input_tokens_seen": 166016400, "step": 76935 }, { "epoch": 12.551386623164763, "grad_norm": 0.11056521534919739, "learning_rate": 1.8313896422046773e-05, "loss": 0.1714, "num_input_tokens_seen": 166026288, "step": 76940 }, { "epoch": 12.552202283849919, "grad_norm": 0.035191137343645096, "learning_rate": 1.8310467140745725e-05, "loss": 0.1175, "num_input_tokens_seen": 166037328, "step": 76945 }, { "epoch": 12.553017944535073, "grad_norm": 1.1450953483581543, "learning_rate": 1.830703799501654e-05, "loss": 0.1843, "num_input_tokens_seen": 166047600, "step": 76950 }, { "epoch": 12.553833605220229, "grad_norm": 2.0927624702453613, "learning_rate": 1.830360898492873e-05, "loss": 0.1803, "num_input_tokens_seen": 166057904, "step": 76955 }, { "epoch": 12.554649265905383, "grad_norm": 1.8560991287231445, "learning_rate": 1.830018011055177e-05, "loss": 0.2425, "num_input_tokens_seen": 166069424, "step": 76960 }, { "epoch": 12.555464926590538, "grad_norm": 1.0823357105255127, "learning_rate": 1.829675137195517e-05, "loss": 0.102, "num_input_tokens_seen": 166081392, "step": 76965 }, { "epoch": 12.556280587275694, "grad_norm": 0.7554728984832764, "learning_rate": 1.8293322769208394e-05, "loss": 0.0846, "num_input_tokens_seen": 166091344, "step": 76970 }, { "epoch": 12.557096247960848, "grad_norm": 0.1675126999616623, "learning_rate": 1.828989430238095e-05, "loss": 0.3174, "num_input_tokens_seen": 166102096, "step": 76975 }, { "epoch": 12.557911908646004, "grad_norm": 1.4240014553070068, "learning_rate": 1.82864659715423e-05, "loss": 0.1744, "num_input_tokens_seen": 166112112, "step": 76980 }, { "epoch": 12.558727569331158, "grad_norm": 0.7993730306625366, "learning_rate": 1.8283037776761948e-05, "loss": 0.0456, "num_input_tokens_seen": 166122672, "step": 76985 }, { "epoch": 12.559543230016313, "grad_norm": 3.4149858951568604, "learning_rate": 1.827960971810934e-05, "loss": 0.179, "num_input_tokens_seen": 166132912, "step": 76990 }, { "epoch": 12.560358890701469, "grad_norm": 0.20564617216587067, "learning_rate": 1.827618179565398e-05, "loss": 0.0858, "num_input_tokens_seen": 166143792, "step": 76995 }, { "epoch": 12.561174551386623, "grad_norm": 1.4686020612716675, "learning_rate": 1.8272754009465315e-05, "loss": 0.1417, "num_input_tokens_seen": 166155312, "step": 77000 }, { "epoch": 12.561990212071779, "grad_norm": 0.06672494113445282, "learning_rate": 1.8269326359612838e-05, "loss": 0.1031, "num_input_tokens_seen": 166166672, "step": 77005 }, { "epoch": 12.562805872756933, "grad_norm": 1.3651057481765747, "learning_rate": 1.8265898846165984e-05, "loss": 0.2011, "num_input_tokens_seen": 166176624, "step": 77010 }, { "epoch": 12.563621533442088, "grad_norm": 0.03352707251906395, "learning_rate": 1.8262471469194247e-05, "loss": 0.1918, "num_input_tokens_seen": 166187216, "step": 77015 }, { "epoch": 12.564437194127244, "grad_norm": 1.020595908164978, "learning_rate": 1.8259044228767064e-05, "loss": 0.0919, "num_input_tokens_seen": 166197872, "step": 77020 }, { "epoch": 12.565252854812398, "grad_norm": 0.06630910187959671, "learning_rate": 1.8255617124953906e-05, "loss": 0.1357, "num_input_tokens_seen": 166210288, "step": 77025 }, { "epoch": 12.566068515497554, "grad_norm": 1.1669647693634033, "learning_rate": 1.8252190157824224e-05, "loss": 0.0562, "num_input_tokens_seen": 166221968, "step": 77030 }, { "epoch": 12.566884176182707, "grad_norm": 1.077157735824585, "learning_rate": 1.8248763327447463e-05, "loss": 0.2378, "num_input_tokens_seen": 166233040, "step": 77035 }, { "epoch": 12.567699836867863, "grad_norm": 0.26108497381210327, "learning_rate": 1.824533663389308e-05, "loss": 0.0355, "num_input_tokens_seen": 166243760, "step": 77040 }, { "epoch": 12.568515497553017, "grad_norm": 0.7668368220329285, "learning_rate": 1.8241910077230516e-05, "loss": 0.1887, "num_input_tokens_seen": 166255024, "step": 77045 }, { "epoch": 12.569331158238173, "grad_norm": 0.15656065940856934, "learning_rate": 1.8238483657529222e-05, "loss": 0.0718, "num_input_tokens_seen": 166264784, "step": 77050 }, { "epoch": 12.570146818923329, "grad_norm": 0.1479988545179367, "learning_rate": 1.8235057374858632e-05, "loss": 0.1318, "num_input_tokens_seen": 166276016, "step": 77055 }, { "epoch": 12.570962479608482, "grad_norm": 1.9210307598114014, "learning_rate": 1.8231631229288188e-05, "loss": 0.1129, "num_input_tokens_seen": 166286800, "step": 77060 }, { "epoch": 12.571778140293638, "grad_norm": 0.3800853490829468, "learning_rate": 1.8228205220887324e-05, "loss": 0.1059, "num_input_tokens_seen": 166297872, "step": 77065 }, { "epoch": 12.572593800978792, "grad_norm": 0.7467457056045532, "learning_rate": 1.822477934972547e-05, "loss": 0.1493, "num_input_tokens_seen": 166309328, "step": 77070 }, { "epoch": 12.573409461663948, "grad_norm": 0.032934315502643585, "learning_rate": 1.8221353615872055e-05, "loss": 0.0514, "num_input_tokens_seen": 166320208, "step": 77075 }, { "epoch": 12.574225122349104, "grad_norm": 0.7296888828277588, "learning_rate": 1.8217928019396516e-05, "loss": 0.0453, "num_input_tokens_seen": 166331920, "step": 77080 }, { "epoch": 12.575040783034257, "grad_norm": 2.2570507526397705, "learning_rate": 1.821450256036826e-05, "loss": 0.1578, "num_input_tokens_seen": 166343728, "step": 77085 }, { "epoch": 12.575856443719413, "grad_norm": 1.0527632236480713, "learning_rate": 1.8211077238856728e-05, "loss": 0.0616, "num_input_tokens_seen": 166355152, "step": 77090 }, { "epoch": 12.576672104404567, "grad_norm": 1.0798412561416626, "learning_rate": 1.8207652054931317e-05, "loss": 0.0682, "num_input_tokens_seen": 166365296, "step": 77095 }, { "epoch": 12.577487765089723, "grad_norm": 0.14807309210300446, "learning_rate": 1.820422700866147e-05, "loss": 0.1113, "num_input_tokens_seen": 166376016, "step": 77100 }, { "epoch": 12.578303425774878, "grad_norm": 0.04603337123990059, "learning_rate": 1.8200802100116566e-05, "loss": 0.1481, "num_input_tokens_seen": 166386896, "step": 77105 }, { "epoch": 12.579119086460032, "grad_norm": 0.24907872080802917, "learning_rate": 1.8197377329366044e-05, "loss": 0.0199, "num_input_tokens_seen": 166397392, "step": 77110 }, { "epoch": 12.579934747145188, "grad_norm": 0.1266361027956009, "learning_rate": 1.819395269647929e-05, "loss": 0.0151, "num_input_tokens_seen": 166408944, "step": 77115 }, { "epoch": 12.580750407830342, "grad_norm": 0.26656895875930786, "learning_rate": 1.819052820152573e-05, "loss": 0.2611, "num_input_tokens_seen": 166420240, "step": 77120 }, { "epoch": 12.581566068515498, "grad_norm": 0.08842978626489639, "learning_rate": 1.8187103844574745e-05, "loss": 0.0458, "num_input_tokens_seen": 166430320, "step": 77125 }, { "epoch": 12.582381729200652, "grad_norm": 0.052599746733903885, "learning_rate": 1.8183679625695753e-05, "loss": 0.0805, "num_input_tokens_seen": 166439824, "step": 77130 }, { "epoch": 12.583197389885807, "grad_norm": 0.2139103263616562, "learning_rate": 1.818025554495813e-05, "loss": 0.059, "num_input_tokens_seen": 166451184, "step": 77135 }, { "epoch": 12.584013050570963, "grad_norm": 0.1254579722881317, "learning_rate": 1.817683160243129e-05, "loss": 0.0255, "num_input_tokens_seen": 166462448, "step": 77140 }, { "epoch": 12.584828711256117, "grad_norm": 0.4369843006134033, "learning_rate": 1.8173407798184605e-05, "loss": 0.101, "num_input_tokens_seen": 166473296, "step": 77145 }, { "epoch": 12.585644371941273, "grad_norm": 0.2491205632686615, "learning_rate": 1.816998413228747e-05, "loss": 0.2174, "num_input_tokens_seen": 166485136, "step": 77150 }, { "epoch": 12.586460032626427, "grad_norm": 1.0453156232833862, "learning_rate": 1.816656060480928e-05, "loss": 0.0563, "num_input_tokens_seen": 166495504, "step": 77155 }, { "epoch": 12.587275693311582, "grad_norm": 0.10249706357717514, "learning_rate": 1.8163137215819397e-05, "loss": 0.2052, "num_input_tokens_seen": 166506288, "step": 77160 }, { "epoch": 12.588091353996738, "grad_norm": 0.03337473422288895, "learning_rate": 1.8159713965387225e-05, "loss": 0.0208, "num_input_tokens_seen": 166516400, "step": 77165 }, { "epoch": 12.588907014681892, "grad_norm": 0.05489674583077431, "learning_rate": 1.8156290853582115e-05, "loss": 0.0292, "num_input_tokens_seen": 166528304, "step": 77170 }, { "epoch": 12.589722675367048, "grad_norm": 0.1578889638185501, "learning_rate": 1.8152867880473466e-05, "loss": 0.016, "num_input_tokens_seen": 166537616, "step": 77175 }, { "epoch": 12.590538336052202, "grad_norm": 1.8881276845932007, "learning_rate": 1.814944504613062e-05, "loss": 0.4074, "num_input_tokens_seen": 166549072, "step": 77180 }, { "epoch": 12.591353996737357, "grad_norm": 0.38225460052490234, "learning_rate": 1.8146022350622975e-05, "loss": 0.0218, "num_input_tokens_seen": 166559280, "step": 77185 }, { "epoch": 12.592169657422513, "grad_norm": 0.342747300863266, "learning_rate": 1.814259979401987e-05, "loss": 0.1333, "num_input_tokens_seen": 166571216, "step": 77190 }, { "epoch": 12.592985318107667, "grad_norm": 1.4944686889648438, "learning_rate": 1.8139177376390685e-05, "loss": 0.2855, "num_input_tokens_seen": 166583408, "step": 77195 }, { "epoch": 12.593800978792823, "grad_norm": 0.34710219502449036, "learning_rate": 1.8135755097804774e-05, "loss": 0.0387, "num_input_tokens_seen": 166593328, "step": 77200 }, { "epoch": 12.594616639477977, "grad_norm": 2.0432627201080322, "learning_rate": 1.8132332958331494e-05, "loss": 0.23, "num_input_tokens_seen": 166604976, "step": 77205 }, { "epoch": 12.595432300163132, "grad_norm": 0.10925938189029694, "learning_rate": 1.81289109580402e-05, "loss": 0.0147, "num_input_tokens_seen": 166615760, "step": 77210 }, { "epoch": 12.596247960848288, "grad_norm": 0.7828853130340576, "learning_rate": 1.812548909700024e-05, "loss": 0.0691, "num_input_tokens_seen": 166627408, "step": 77215 }, { "epoch": 12.597063621533442, "grad_norm": 0.2837566137313843, "learning_rate": 1.812206737528096e-05, "loss": 0.0154, "num_input_tokens_seen": 166637168, "step": 77220 }, { "epoch": 12.597879282218598, "grad_norm": 1.3415166139602661, "learning_rate": 1.811864579295172e-05, "loss": 0.1908, "num_input_tokens_seen": 166648080, "step": 77225 }, { "epoch": 12.598694942903752, "grad_norm": 1.2605278491973877, "learning_rate": 1.8115224350081842e-05, "loss": 0.035, "num_input_tokens_seen": 166658160, "step": 77230 }, { "epoch": 12.599510603588907, "grad_norm": 0.6852100491523743, "learning_rate": 1.8111803046740687e-05, "loss": 0.0805, "num_input_tokens_seen": 166669552, "step": 77235 }, { "epoch": 12.600326264274061, "grad_norm": 0.02602585218846798, "learning_rate": 1.8108381882997572e-05, "loss": 0.0389, "num_input_tokens_seen": 166681232, "step": 77240 }, { "epoch": 12.601141924959217, "grad_norm": 0.374140202999115, "learning_rate": 1.810496085892185e-05, "loss": 0.0102, "num_input_tokens_seen": 166691824, "step": 77245 }, { "epoch": 12.601957585644373, "grad_norm": 2.9701662063598633, "learning_rate": 1.810153997458283e-05, "loss": 0.2088, "num_input_tokens_seen": 166702800, "step": 77250 }, { "epoch": 12.602773246329527, "grad_norm": 1.0410617589950562, "learning_rate": 1.809811923004986e-05, "loss": 0.1909, "num_input_tokens_seen": 166713200, "step": 77255 }, { "epoch": 12.603588907014682, "grad_norm": 1.8241851329803467, "learning_rate": 1.8094698625392268e-05, "loss": 0.3109, "num_input_tokens_seen": 166723472, "step": 77260 }, { "epoch": 12.604404567699836, "grad_norm": 0.9479953646659851, "learning_rate": 1.8091278160679355e-05, "loss": 0.0931, "num_input_tokens_seen": 166734960, "step": 77265 }, { "epoch": 12.605220228384992, "grad_norm": 0.03954862058162689, "learning_rate": 1.8087857835980467e-05, "loss": 0.0573, "num_input_tokens_seen": 166745968, "step": 77270 }, { "epoch": 12.606035889070148, "grad_norm": 0.14177857339382172, "learning_rate": 1.8084437651364898e-05, "loss": 0.2603, "num_input_tokens_seen": 166756336, "step": 77275 }, { "epoch": 12.606851549755302, "grad_norm": 0.26521381735801697, "learning_rate": 1.8081017606901985e-05, "loss": 0.0291, "num_input_tokens_seen": 166765776, "step": 77280 }, { "epoch": 12.607667210440457, "grad_norm": 2.0783143043518066, "learning_rate": 1.8077597702661016e-05, "loss": 0.2491, "num_input_tokens_seen": 166777040, "step": 77285 }, { "epoch": 12.608482871125611, "grad_norm": 0.1511952430009842, "learning_rate": 1.807417793871132e-05, "loss": 0.1704, "num_input_tokens_seen": 166788336, "step": 77290 }, { "epoch": 12.609298531810767, "grad_norm": 0.380371630191803, "learning_rate": 1.8070758315122184e-05, "loss": 0.1, "num_input_tokens_seen": 166797712, "step": 77295 }, { "epoch": 12.61011419249592, "grad_norm": 0.1638551503419876, "learning_rate": 1.8067338831962933e-05, "loss": 0.1936, "num_input_tokens_seen": 166808880, "step": 77300 }, { "epoch": 12.610929853181077, "grad_norm": 0.5922892093658447, "learning_rate": 1.8063919489302843e-05, "loss": 0.0178, "num_input_tokens_seen": 166819952, "step": 77305 }, { "epoch": 12.611745513866232, "grad_norm": 0.38666674494743347, "learning_rate": 1.8060500287211235e-05, "loss": 0.0288, "num_input_tokens_seen": 166830608, "step": 77310 }, { "epoch": 12.612561174551386, "grad_norm": 0.5138888359069824, "learning_rate": 1.8057081225757382e-05, "loss": 0.0765, "num_input_tokens_seen": 166840816, "step": 77315 }, { "epoch": 12.613376835236542, "grad_norm": 0.023920288309454918, "learning_rate": 1.80536623050106e-05, "loss": 0.065, "num_input_tokens_seen": 166852016, "step": 77320 }, { "epoch": 12.614192495921696, "grad_norm": 1.1008100509643555, "learning_rate": 1.8050243525040145e-05, "loss": 0.0562, "num_input_tokens_seen": 166862896, "step": 77325 }, { "epoch": 12.615008156606851, "grad_norm": 0.5944487452507019, "learning_rate": 1.8046824885915338e-05, "loss": 0.0388, "num_input_tokens_seen": 166873616, "step": 77330 }, { "epoch": 12.615823817292007, "grad_norm": 1.6810061931610107, "learning_rate": 1.804340638770543e-05, "loss": 0.1866, "num_input_tokens_seen": 166885104, "step": 77335 }, { "epoch": 12.616639477977161, "grad_norm": 0.48554521799087524, "learning_rate": 1.8039988030479726e-05, "loss": 0.1134, "num_input_tokens_seen": 166895952, "step": 77340 }, { "epoch": 12.617455138662317, "grad_norm": 0.7390604615211487, "learning_rate": 1.8036569814307495e-05, "loss": 0.1336, "num_input_tokens_seen": 166905488, "step": 77345 }, { "epoch": 12.61827079934747, "grad_norm": 1.597743034362793, "learning_rate": 1.8033151739258008e-05, "loss": 0.0805, "num_input_tokens_seen": 166915728, "step": 77350 }, { "epoch": 12.619086460032626, "grad_norm": 1.830702543258667, "learning_rate": 1.802973380540054e-05, "loss": 0.1797, "num_input_tokens_seen": 166925872, "step": 77355 }, { "epoch": 12.619902120717782, "grad_norm": 0.9649238586425781, "learning_rate": 1.802631601280435e-05, "loss": 0.039, "num_input_tokens_seen": 166937520, "step": 77360 }, { "epoch": 12.620717781402936, "grad_norm": 0.07973326742649078, "learning_rate": 1.802289836153872e-05, "loss": 0.3206, "num_input_tokens_seen": 166949360, "step": 77365 }, { "epoch": 12.621533442088092, "grad_norm": 0.891948401927948, "learning_rate": 1.8019480851672907e-05, "loss": 0.0523, "num_input_tokens_seen": 166960944, "step": 77370 }, { "epoch": 12.622349102773246, "grad_norm": 0.24371157586574554, "learning_rate": 1.801606348327617e-05, "loss": 0.0763, "num_input_tokens_seen": 166971312, "step": 77375 }, { "epoch": 12.623164763458401, "grad_norm": 1.3303346633911133, "learning_rate": 1.8012646256417764e-05, "loss": 0.0432, "num_input_tokens_seen": 166981456, "step": 77380 }, { "epoch": 12.623980424143557, "grad_norm": 1.5605214834213257, "learning_rate": 1.800922917116695e-05, "loss": 0.0795, "num_input_tokens_seen": 166991984, "step": 77385 }, { "epoch": 12.624796084828711, "grad_norm": 0.6852317452430725, "learning_rate": 1.8005812227592968e-05, "loss": 0.0844, "num_input_tokens_seen": 167002800, "step": 77390 }, { "epoch": 12.625611745513867, "grad_norm": 0.1166413351893425, "learning_rate": 1.8002395425765084e-05, "loss": 0.0164, "num_input_tokens_seen": 167013168, "step": 77395 }, { "epoch": 12.62642740619902, "grad_norm": 0.47492334246635437, "learning_rate": 1.799897876575252e-05, "loss": 0.0725, "num_input_tokens_seen": 167023344, "step": 77400 }, { "epoch": 12.627243066884176, "grad_norm": 0.03807174414396286, "learning_rate": 1.7995562247624546e-05, "loss": 0.039, "num_input_tokens_seen": 167034800, "step": 77405 }, { "epoch": 12.62805872756933, "grad_norm": 1.182410478591919, "learning_rate": 1.799214587145038e-05, "loss": 0.0779, "num_input_tokens_seen": 167045584, "step": 77410 }, { "epoch": 12.628874388254486, "grad_norm": 1.3049384355545044, "learning_rate": 1.7988729637299277e-05, "loss": 0.0572, "num_input_tokens_seen": 167056880, "step": 77415 }, { "epoch": 12.629690048939642, "grad_norm": 0.5762107372283936, "learning_rate": 1.7985313545240452e-05, "loss": 0.2903, "num_input_tokens_seen": 167068176, "step": 77420 }, { "epoch": 12.630505709624796, "grad_norm": 0.41753631830215454, "learning_rate": 1.7981897595343158e-05, "loss": 0.0382, "num_input_tokens_seen": 167078736, "step": 77425 }, { "epoch": 12.631321370309951, "grad_norm": 2.4276936054229736, "learning_rate": 1.79784817876766e-05, "loss": 0.0291, "num_input_tokens_seen": 167089840, "step": 77430 }, { "epoch": 12.632137030995105, "grad_norm": 0.038605183362960815, "learning_rate": 1.797506612231003e-05, "loss": 0.0332, "num_input_tokens_seen": 167100816, "step": 77435 }, { "epoch": 12.632952691680261, "grad_norm": 0.21365435421466827, "learning_rate": 1.7971650599312645e-05, "loss": 0.1657, "num_input_tokens_seen": 167112240, "step": 77440 }, { "epoch": 12.633768352365417, "grad_norm": 1.2726693153381348, "learning_rate": 1.7968235218753683e-05, "loss": 0.035, "num_input_tokens_seen": 167124784, "step": 77445 }, { "epoch": 12.63458401305057, "grad_norm": 0.4749848544597626, "learning_rate": 1.796481998070235e-05, "loss": 0.0269, "num_input_tokens_seen": 167136144, "step": 77450 }, { "epoch": 12.635399673735726, "grad_norm": 0.06498627364635468, "learning_rate": 1.7961404885227873e-05, "loss": 0.0078, "num_input_tokens_seen": 167144688, "step": 77455 }, { "epoch": 12.63621533442088, "grad_norm": 1.3313642740249634, "learning_rate": 1.7957989932399445e-05, "loss": 0.0886, "num_input_tokens_seen": 167155824, "step": 77460 }, { "epoch": 12.637030995106036, "grad_norm": 0.37682121992111206, "learning_rate": 1.7954575122286283e-05, "loss": 0.0417, "num_input_tokens_seen": 167167536, "step": 77465 }, { "epoch": 12.63784665579119, "grad_norm": 1.0729700326919556, "learning_rate": 1.7951160454957604e-05, "loss": 0.1599, "num_input_tokens_seen": 167178576, "step": 77470 }, { "epoch": 12.638662316476346, "grad_norm": 0.23170818388462067, "learning_rate": 1.794774593048259e-05, "loss": 0.1569, "num_input_tokens_seen": 167189232, "step": 77475 }, { "epoch": 12.639477977161501, "grad_norm": 0.11007214337587357, "learning_rate": 1.7944331548930464e-05, "loss": 0.0576, "num_input_tokens_seen": 167200976, "step": 77480 }, { "epoch": 12.640293637846655, "grad_norm": 0.9660501480102539, "learning_rate": 1.7940917310370398e-05, "loss": 0.048, "num_input_tokens_seen": 167212464, "step": 77485 }, { "epoch": 12.641109298531811, "grad_norm": 0.07472510635852814, "learning_rate": 1.793750321487161e-05, "loss": 0.0233, "num_input_tokens_seen": 167221360, "step": 77490 }, { "epoch": 12.641924959216965, "grad_norm": 0.05966304987668991, "learning_rate": 1.7934089262503264e-05, "loss": 0.1646, "num_input_tokens_seen": 167232688, "step": 77495 }, { "epoch": 12.64274061990212, "grad_norm": 0.21834619343280792, "learning_rate": 1.7930675453334577e-05, "loss": 0.0267, "num_input_tokens_seen": 167243088, "step": 77500 }, { "epoch": 12.643556280587276, "grad_norm": 0.15743859112262726, "learning_rate": 1.7927261787434706e-05, "loss": 0.0303, "num_input_tokens_seen": 167254384, "step": 77505 }, { "epoch": 12.64437194127243, "grad_norm": 0.37573015689849854, "learning_rate": 1.7923848264872858e-05, "loss": 0.0148, "num_input_tokens_seen": 167265136, "step": 77510 }, { "epoch": 12.645187601957586, "grad_norm": 0.09281636029481888, "learning_rate": 1.7920434885718197e-05, "loss": 0.0617, "num_input_tokens_seen": 167276464, "step": 77515 }, { "epoch": 12.64600326264274, "grad_norm": 0.06791052967309952, "learning_rate": 1.7917021650039906e-05, "loss": 0.0163, "num_input_tokens_seen": 167286640, "step": 77520 }, { "epoch": 12.646818923327896, "grad_norm": 0.5268145799636841, "learning_rate": 1.7913608557907157e-05, "loss": 0.0794, "num_input_tokens_seen": 167297744, "step": 77525 }, { "epoch": 12.647634584013051, "grad_norm": 0.043193306773900986, "learning_rate": 1.7910195609389122e-05, "loss": 0.0332, "num_input_tokens_seen": 167307600, "step": 77530 }, { "epoch": 12.648450244698205, "grad_norm": 1.3249226808547974, "learning_rate": 1.790678280455496e-05, "loss": 0.0888, "num_input_tokens_seen": 167317296, "step": 77535 }, { "epoch": 12.649265905383361, "grad_norm": 0.17433568835258484, "learning_rate": 1.7903370143473852e-05, "loss": 0.121, "num_input_tokens_seen": 167328080, "step": 77540 }, { "epoch": 12.650081566068515, "grad_norm": 0.22981396317481995, "learning_rate": 1.789995762621494e-05, "loss": 0.1305, "num_input_tokens_seen": 167338480, "step": 77545 }, { "epoch": 12.65089722675367, "grad_norm": 0.27685707807540894, "learning_rate": 1.789654525284741e-05, "loss": 0.1892, "num_input_tokens_seen": 167348624, "step": 77550 }, { "epoch": 12.651712887438826, "grad_norm": 0.5827595591545105, "learning_rate": 1.7893133023440383e-05, "loss": 0.0247, "num_input_tokens_seen": 167359312, "step": 77555 }, { "epoch": 12.65252854812398, "grad_norm": 0.4091913104057312, "learning_rate": 1.7889720938063047e-05, "loss": 0.0177, "num_input_tokens_seen": 167370416, "step": 77560 }, { "epoch": 12.653344208809136, "grad_norm": 0.3021458089351654, "learning_rate": 1.7886308996784524e-05, "loss": 0.1294, "num_input_tokens_seen": 167382096, "step": 77565 }, { "epoch": 12.65415986949429, "grad_norm": 0.8272423148155212, "learning_rate": 1.788289719967398e-05, "loss": 0.0325, "num_input_tokens_seen": 167391920, "step": 77570 }, { "epoch": 12.654975530179446, "grad_norm": 2.4619271755218506, "learning_rate": 1.787948554680055e-05, "loss": 0.1278, "num_input_tokens_seen": 167403664, "step": 77575 }, { "epoch": 12.655791190864601, "grad_norm": 0.7369188666343689, "learning_rate": 1.7876074038233375e-05, "loss": 0.1909, "num_input_tokens_seen": 167413968, "step": 77580 }, { "epoch": 12.656606851549755, "grad_norm": 0.23751398921012878, "learning_rate": 1.7872662674041608e-05, "loss": 0.0117, "num_input_tokens_seen": 167425136, "step": 77585 }, { "epoch": 12.65742251223491, "grad_norm": 0.02352933958172798, "learning_rate": 1.7869251454294362e-05, "loss": 0.0422, "num_input_tokens_seen": 167436400, "step": 77590 }, { "epoch": 12.658238172920065, "grad_norm": 2.2964375019073486, "learning_rate": 1.786584037906079e-05, "loss": 0.1523, "num_input_tokens_seen": 167447728, "step": 77595 }, { "epoch": 12.65905383360522, "grad_norm": 0.07781749963760376, "learning_rate": 1.7862429448410006e-05, "loss": 0.0557, "num_input_tokens_seen": 167458160, "step": 77600 }, { "epoch": 12.659869494290374, "grad_norm": 2.6325862407684326, "learning_rate": 1.7859018662411155e-05, "loss": 0.2499, "num_input_tokens_seen": 167468496, "step": 77605 }, { "epoch": 12.66068515497553, "grad_norm": 1.5635217428207397, "learning_rate": 1.7855608021133334e-05, "loss": 0.2614, "num_input_tokens_seen": 167478864, "step": 77610 }, { "epoch": 12.661500815660686, "grad_norm": 0.5200498104095459, "learning_rate": 1.7852197524645696e-05, "loss": 0.1555, "num_input_tokens_seen": 167490768, "step": 77615 }, { "epoch": 12.66231647634584, "grad_norm": 3.1392266750335693, "learning_rate": 1.7848787173017327e-05, "loss": 0.0414, "num_input_tokens_seen": 167499920, "step": 77620 }, { "epoch": 12.663132137030995, "grad_norm": 0.07495531439781189, "learning_rate": 1.7845376966317373e-05, "loss": 0.0476, "num_input_tokens_seen": 167510096, "step": 77625 }, { "epoch": 12.66394779771615, "grad_norm": 0.05133059248328209, "learning_rate": 1.784196690461492e-05, "loss": 0.0678, "num_input_tokens_seen": 167521712, "step": 77630 }, { "epoch": 12.664763458401305, "grad_norm": 1.3761775493621826, "learning_rate": 1.7838556987979096e-05, "loss": 0.0958, "num_input_tokens_seen": 167532080, "step": 77635 }, { "epoch": 12.66557911908646, "grad_norm": 0.2825724482536316, "learning_rate": 1.7835147216478992e-05, "loss": 0.0326, "num_input_tokens_seen": 167542320, "step": 77640 }, { "epoch": 12.666394779771615, "grad_norm": 0.034927718341350555, "learning_rate": 1.7831737590183727e-05, "loss": 0.0262, "num_input_tokens_seen": 167551952, "step": 77645 }, { "epoch": 12.66721044045677, "grad_norm": 0.014450176618993282, "learning_rate": 1.7828328109162384e-05, "loss": 0.0294, "num_input_tokens_seen": 167563824, "step": 77650 }, { "epoch": 12.668026101141924, "grad_norm": 1.3906556367874146, "learning_rate": 1.7824918773484076e-05, "loss": 0.1537, "num_input_tokens_seen": 167574576, "step": 77655 }, { "epoch": 12.66884176182708, "grad_norm": 0.17405681312084198, "learning_rate": 1.782150958321789e-05, "loss": 0.0368, "num_input_tokens_seen": 167585616, "step": 77660 }, { "epoch": 12.669657422512234, "grad_norm": 0.5863036513328552, "learning_rate": 1.781810053843292e-05, "loss": 0.0652, "num_input_tokens_seen": 167595856, "step": 77665 }, { "epoch": 12.67047308319739, "grad_norm": 0.7505273818969727, "learning_rate": 1.781469163919825e-05, "loss": 0.0313, "num_input_tokens_seen": 167607472, "step": 77670 }, { "epoch": 12.671288743882545, "grad_norm": 0.07720326632261276, "learning_rate": 1.7811282885582976e-05, "loss": 0.0979, "num_input_tokens_seen": 167618032, "step": 77675 }, { "epoch": 12.6721044045677, "grad_norm": 0.06651750952005386, "learning_rate": 1.780787427765616e-05, "loss": 0.1413, "num_input_tokens_seen": 167628528, "step": 77680 }, { "epoch": 12.672920065252855, "grad_norm": 0.03280855342745781, "learning_rate": 1.7804465815486906e-05, "loss": 0.0348, "num_input_tokens_seen": 167639440, "step": 77685 }, { "epoch": 12.673735725938009, "grad_norm": 0.029570315033197403, "learning_rate": 1.780105749914428e-05, "loss": 0.0875, "num_input_tokens_seen": 167650608, "step": 77690 }, { "epoch": 12.674551386623165, "grad_norm": 0.14129263162612915, "learning_rate": 1.7797649328697357e-05, "loss": 0.1333, "num_input_tokens_seen": 167661296, "step": 77695 }, { "epoch": 12.67536704730832, "grad_norm": 0.02828107960522175, "learning_rate": 1.7794241304215205e-05, "loss": 0.226, "num_input_tokens_seen": 167671472, "step": 77700 }, { "epoch": 12.676182707993474, "grad_norm": 1.6335011720657349, "learning_rate": 1.779083342576689e-05, "loss": 0.0959, "num_input_tokens_seen": 167682256, "step": 77705 }, { "epoch": 12.67699836867863, "grad_norm": 1.001133680343628, "learning_rate": 1.7787425693421493e-05, "loss": 0.159, "num_input_tokens_seen": 167694096, "step": 77710 }, { "epoch": 12.677814029363784, "grad_norm": 0.04500194266438484, "learning_rate": 1.7784018107248053e-05, "loss": 0.0177, "num_input_tokens_seen": 167705584, "step": 77715 }, { "epoch": 12.67862969004894, "grad_norm": 0.04180154949426651, "learning_rate": 1.778061066731565e-05, "loss": 0.182, "num_input_tokens_seen": 167716624, "step": 77720 }, { "epoch": 12.679445350734095, "grad_norm": 1.0777299404144287, "learning_rate": 1.777720337369332e-05, "loss": 0.0693, "num_input_tokens_seen": 167727312, "step": 77725 }, { "epoch": 12.68026101141925, "grad_norm": 0.014272588305175304, "learning_rate": 1.7773796226450133e-05, "loss": 0.0368, "num_input_tokens_seen": 167738160, "step": 77730 }, { "epoch": 12.681076672104405, "grad_norm": 0.5158891081809998, "learning_rate": 1.7770389225655125e-05, "loss": 0.0132, "num_input_tokens_seen": 167749072, "step": 77735 }, { "epoch": 12.681892332789559, "grad_norm": 1.5022553205490112, "learning_rate": 1.7766982371377366e-05, "loss": 0.1215, "num_input_tokens_seen": 167760464, "step": 77740 }, { "epoch": 12.682707993474715, "grad_norm": 1.5216492414474487, "learning_rate": 1.7763575663685868e-05, "loss": 0.1762, "num_input_tokens_seen": 167771696, "step": 77745 }, { "epoch": 12.68352365415987, "grad_norm": 0.15619924664497375, "learning_rate": 1.7760169102649705e-05, "loss": 0.1035, "num_input_tokens_seen": 167782736, "step": 77750 }, { "epoch": 12.684339314845024, "grad_norm": 0.08958006650209427, "learning_rate": 1.7756762688337884e-05, "loss": 0.0073, "num_input_tokens_seen": 167792656, "step": 77755 }, { "epoch": 12.68515497553018, "grad_norm": 2.3670215606689453, "learning_rate": 1.775335642081947e-05, "loss": 0.0724, "num_input_tokens_seen": 167803568, "step": 77760 }, { "epoch": 12.685970636215334, "grad_norm": 1.4834414720535278, "learning_rate": 1.7749950300163466e-05, "loss": 0.1205, "num_input_tokens_seen": 167814736, "step": 77765 }, { "epoch": 12.68678629690049, "grad_norm": 0.03496318683028221, "learning_rate": 1.7746544326438933e-05, "loss": 0.0086, "num_input_tokens_seen": 167826192, "step": 77770 }, { "epoch": 12.687601957585644, "grad_norm": 0.5759903192520142, "learning_rate": 1.7743138499714867e-05, "loss": 0.1128, "num_input_tokens_seen": 167838736, "step": 77775 }, { "epoch": 12.6884176182708, "grad_norm": 0.17185159027576447, "learning_rate": 1.7739732820060318e-05, "loss": 0.2395, "num_input_tokens_seen": 167850416, "step": 77780 }, { "epoch": 12.689233278955955, "grad_norm": 1.649593710899353, "learning_rate": 1.773632728754428e-05, "loss": 0.0727, "num_input_tokens_seen": 167861840, "step": 77785 }, { "epoch": 12.690048939641109, "grad_norm": 0.8965033888816833, "learning_rate": 1.7732921902235787e-05, "loss": 0.0788, "num_input_tokens_seen": 167872240, "step": 77790 }, { "epoch": 12.690864600326265, "grad_norm": 1.192795991897583, "learning_rate": 1.772951666420386e-05, "loss": 0.0737, "num_input_tokens_seen": 167883024, "step": 77795 }, { "epoch": 12.691680261011419, "grad_norm": 0.18213626742362976, "learning_rate": 1.772611157351749e-05, "loss": 0.1215, "num_input_tokens_seen": 167894480, "step": 77800 }, { "epoch": 12.692495921696574, "grad_norm": 0.33763033151626587, "learning_rate": 1.7722706630245705e-05, "loss": 0.3414, "num_input_tokens_seen": 167904528, "step": 77805 }, { "epoch": 12.69331158238173, "grad_norm": 0.06884286552667618, "learning_rate": 1.7719301834457497e-05, "loss": 0.1909, "num_input_tokens_seen": 167915376, "step": 77810 }, { "epoch": 12.694127243066884, "grad_norm": 0.04451920837163925, "learning_rate": 1.771589718622188e-05, "loss": 0.1606, "num_input_tokens_seen": 167926672, "step": 77815 }, { "epoch": 12.69494290375204, "grad_norm": 0.09583073109388351, "learning_rate": 1.7712492685607836e-05, "loss": 0.0367, "num_input_tokens_seen": 167937360, "step": 77820 }, { "epoch": 12.695758564437194, "grad_norm": 0.7210690379142761, "learning_rate": 1.770908833268438e-05, "loss": 0.1145, "num_input_tokens_seen": 167948016, "step": 77825 }, { "epoch": 12.69657422512235, "grad_norm": 0.22521808743476868, "learning_rate": 1.7705684127520497e-05, "loss": 0.2493, "num_input_tokens_seen": 167959152, "step": 77830 }, { "epoch": 12.697389885807503, "grad_norm": 2.448201894760132, "learning_rate": 1.7702280070185177e-05, "loss": 0.0852, "num_input_tokens_seen": 167969456, "step": 77835 }, { "epoch": 12.698205546492659, "grad_norm": 0.8362283706665039, "learning_rate": 1.7698876160747414e-05, "loss": 0.1106, "num_input_tokens_seen": 167979728, "step": 77840 }, { "epoch": 12.699021207177815, "grad_norm": 0.029716340824961662, "learning_rate": 1.7695472399276187e-05, "loss": 0.0447, "num_input_tokens_seen": 167989936, "step": 77845 }, { "epoch": 12.699836867862969, "grad_norm": 2.627516031265259, "learning_rate": 1.7692068785840467e-05, "loss": 0.1178, "num_input_tokens_seen": 168000592, "step": 77850 }, { "epoch": 12.700652528548124, "grad_norm": 0.23203341662883759, "learning_rate": 1.768866532050926e-05, "loss": 0.0273, "num_input_tokens_seen": 168011856, "step": 77855 }, { "epoch": 12.701468189233278, "grad_norm": 0.06052650883793831, "learning_rate": 1.7685262003351514e-05, "loss": 0.0731, "num_input_tokens_seen": 168022896, "step": 77860 }, { "epoch": 12.702283849918434, "grad_norm": 0.035744741559028625, "learning_rate": 1.768185883443622e-05, "loss": 0.2064, "num_input_tokens_seen": 168033776, "step": 77865 }, { "epoch": 12.70309951060359, "grad_norm": 0.10094805806875229, "learning_rate": 1.7678455813832333e-05, "loss": 0.1142, "num_input_tokens_seen": 168043152, "step": 77870 }, { "epoch": 12.703915171288743, "grad_norm": 0.5166934728622437, "learning_rate": 1.767505294160884e-05, "loss": 0.1525, "num_input_tokens_seen": 168054640, "step": 77875 }, { "epoch": 12.7047308319739, "grad_norm": 2.2023637294769287, "learning_rate": 1.7671650217834677e-05, "loss": 0.1344, "num_input_tokens_seen": 168065648, "step": 77880 }, { "epoch": 12.705546492659053, "grad_norm": 0.8961057066917419, "learning_rate": 1.766824764257883e-05, "loss": 0.1137, "num_input_tokens_seen": 168076176, "step": 77885 }, { "epoch": 12.706362153344209, "grad_norm": 0.08185695111751556, "learning_rate": 1.7664845215910237e-05, "loss": 0.0086, "num_input_tokens_seen": 168087856, "step": 77890 }, { "epoch": 12.707177814029365, "grad_norm": 4.640097618103027, "learning_rate": 1.766144293789786e-05, "loss": 0.3716, "num_input_tokens_seen": 168098576, "step": 77895 }, { "epoch": 12.707993474714518, "grad_norm": 1.0843909978866577, "learning_rate": 1.765804080861066e-05, "loss": 0.2231, "num_input_tokens_seen": 168109264, "step": 77900 }, { "epoch": 12.708809135399674, "grad_norm": 2.312243938446045, "learning_rate": 1.765463882811757e-05, "loss": 0.1031, "num_input_tokens_seen": 168121104, "step": 77905 }, { "epoch": 12.709624796084828, "grad_norm": 0.42638686299324036, "learning_rate": 1.765123699648755e-05, "loss": 0.0854, "num_input_tokens_seen": 168132464, "step": 77910 }, { "epoch": 12.710440456769984, "grad_norm": 0.2748595178127289, "learning_rate": 1.7647835313789525e-05, "loss": 0.0392, "num_input_tokens_seen": 168143952, "step": 77915 }, { "epoch": 12.71125611745514, "grad_norm": 1.7116512060165405, "learning_rate": 1.7644433780092452e-05, "loss": 0.1094, "num_input_tokens_seen": 168154672, "step": 77920 }, { "epoch": 12.712071778140293, "grad_norm": 0.17794127762317657, "learning_rate": 1.7641032395465253e-05, "loss": 0.0239, "num_input_tokens_seen": 168166896, "step": 77925 }, { "epoch": 12.71288743882545, "grad_norm": 0.33763572573661804, "learning_rate": 1.763763115997688e-05, "loss": 0.1769, "num_input_tokens_seen": 168178448, "step": 77930 }, { "epoch": 12.713703099510603, "grad_norm": 0.059651929885149, "learning_rate": 1.763423007369624e-05, "loss": 0.1725, "num_input_tokens_seen": 168189904, "step": 77935 }, { "epoch": 12.714518760195759, "grad_norm": 0.2942887246608734, "learning_rate": 1.763082913669228e-05, "loss": 0.0986, "num_input_tokens_seen": 168200208, "step": 77940 }, { "epoch": 12.715334420880914, "grad_norm": 0.4100702404975891, "learning_rate": 1.7627428349033902e-05, "loss": 0.0304, "num_input_tokens_seen": 168210960, "step": 77945 }, { "epoch": 12.716150081566068, "grad_norm": 1.0314041376113892, "learning_rate": 1.762402771079006e-05, "loss": 0.2115, "num_input_tokens_seen": 168221456, "step": 77950 }, { "epoch": 12.716965742251224, "grad_norm": 1.655487060546875, "learning_rate": 1.762062722202964e-05, "loss": 0.1903, "num_input_tokens_seen": 168232944, "step": 77955 }, { "epoch": 12.717781402936378, "grad_norm": 1.395523190498352, "learning_rate": 1.7617226882821576e-05, "loss": 0.2483, "num_input_tokens_seen": 168243984, "step": 77960 }, { "epoch": 12.718597063621534, "grad_norm": 1.8476142883300781, "learning_rate": 1.7613826693234768e-05, "loss": 0.1038, "num_input_tokens_seen": 168254736, "step": 77965 }, { "epoch": 12.719412724306688, "grad_norm": 0.05010903999209404, "learning_rate": 1.761042665333814e-05, "loss": 0.0948, "num_input_tokens_seen": 168265456, "step": 77970 }, { "epoch": 12.720228384991843, "grad_norm": 1.3515359163284302, "learning_rate": 1.760702676320059e-05, "loss": 0.0647, "num_input_tokens_seen": 168275632, "step": 77975 }, { "epoch": 12.721044045676999, "grad_norm": 0.5709481835365295, "learning_rate": 1.7603627022891017e-05, "loss": 0.0414, "num_input_tokens_seen": 168286032, "step": 77980 }, { "epoch": 12.721859706362153, "grad_norm": 0.21252968907356262, "learning_rate": 1.7600227432478328e-05, "loss": 0.1437, "num_input_tokens_seen": 168297968, "step": 77985 }, { "epoch": 12.722675367047309, "grad_norm": 0.05794208496809006, "learning_rate": 1.7596827992031416e-05, "loss": 0.0945, "num_input_tokens_seen": 168307888, "step": 77990 }, { "epoch": 12.723491027732463, "grad_norm": 0.44242438673973083, "learning_rate": 1.7593428701619176e-05, "loss": 0.1959, "num_input_tokens_seen": 168318928, "step": 77995 }, { "epoch": 12.724306688417618, "grad_norm": 0.06354571133852005, "learning_rate": 1.75900295613105e-05, "loss": 0.0401, "num_input_tokens_seen": 168328592, "step": 78000 }, { "epoch": 12.725122349102774, "grad_norm": 1.4188809394836426, "learning_rate": 1.7586630571174277e-05, "loss": 0.1186, "num_input_tokens_seen": 168339056, "step": 78005 }, { "epoch": 12.725938009787928, "grad_norm": 0.5780890583992004, "learning_rate": 1.7583231731279386e-05, "loss": 0.0432, "num_input_tokens_seen": 168350640, "step": 78010 }, { "epoch": 12.726753670473084, "grad_norm": 2.0131750106811523, "learning_rate": 1.7579833041694717e-05, "loss": 0.0628, "num_input_tokens_seen": 168361424, "step": 78015 }, { "epoch": 12.727569331158238, "grad_norm": 0.04645226523280144, "learning_rate": 1.757643450248914e-05, "loss": 0.1379, "num_input_tokens_seen": 168372112, "step": 78020 }, { "epoch": 12.728384991843393, "grad_norm": 0.6240108013153076, "learning_rate": 1.7573036113731545e-05, "loss": 0.0308, "num_input_tokens_seen": 168384464, "step": 78025 }, { "epoch": 12.729200652528547, "grad_norm": 0.2728446125984192, "learning_rate": 1.756963787549078e-05, "loss": 0.0569, "num_input_tokens_seen": 168395024, "step": 78030 }, { "epoch": 12.730016313213703, "grad_norm": 0.6507329940795898, "learning_rate": 1.7566239787835744e-05, "loss": 0.0465, "num_input_tokens_seen": 168406800, "step": 78035 }, { "epoch": 12.730831973898859, "grad_norm": 0.1408867985010147, "learning_rate": 1.7562841850835278e-05, "loss": 0.0954, "num_input_tokens_seen": 168418800, "step": 78040 }, { "epoch": 12.731647634584013, "grad_norm": 0.44063469767570496, "learning_rate": 1.7559444064558266e-05, "loss": 0.1334, "num_input_tokens_seen": 168429392, "step": 78045 }, { "epoch": 12.732463295269168, "grad_norm": 0.11245367676019669, "learning_rate": 1.7556046429073554e-05, "loss": 0.3098, "num_input_tokens_seen": 168441008, "step": 78050 }, { "epoch": 12.733278955954322, "grad_norm": 0.0498557910323143, "learning_rate": 1.7552648944450007e-05, "loss": 0.0275, "num_input_tokens_seen": 168452496, "step": 78055 }, { "epoch": 12.734094616639478, "grad_norm": 0.07116755843162537, "learning_rate": 1.7549251610756472e-05, "loss": 0.0921, "num_input_tokens_seen": 168462224, "step": 78060 }, { "epoch": 12.734910277324634, "grad_norm": 1.8956395387649536, "learning_rate": 1.754585442806181e-05, "loss": 0.0496, "num_input_tokens_seen": 168472368, "step": 78065 }, { "epoch": 12.735725938009788, "grad_norm": 1.4264894723892212, "learning_rate": 1.7542457396434858e-05, "loss": 0.2011, "num_input_tokens_seen": 168482480, "step": 78070 }, { "epoch": 12.736541598694943, "grad_norm": 0.08058580011129379, "learning_rate": 1.753906051594448e-05, "loss": 0.0117, "num_input_tokens_seen": 168492368, "step": 78075 }, { "epoch": 12.737357259380097, "grad_norm": 0.33334460854530334, "learning_rate": 1.753566378665949e-05, "loss": 0.1619, "num_input_tokens_seen": 168503472, "step": 78080 }, { "epoch": 12.738172920065253, "grad_norm": 3.4343161582946777, "learning_rate": 1.7532267208648756e-05, "loss": 0.3366, "num_input_tokens_seen": 168513200, "step": 78085 }, { "epoch": 12.738988580750409, "grad_norm": 0.19533491134643555, "learning_rate": 1.7528870781981087e-05, "loss": 0.0781, "num_input_tokens_seen": 168523120, "step": 78090 }, { "epoch": 12.739804241435563, "grad_norm": 1.4593864679336548, "learning_rate": 1.7525474506725344e-05, "loss": 0.0715, "num_input_tokens_seen": 168533072, "step": 78095 }, { "epoch": 12.740619902120718, "grad_norm": 0.04470469057559967, "learning_rate": 1.752207838295033e-05, "loss": 0.0408, "num_input_tokens_seen": 168544464, "step": 78100 }, { "epoch": 12.741435562805872, "grad_norm": 0.7257256507873535, "learning_rate": 1.7518682410724883e-05, "loss": 0.0321, "num_input_tokens_seen": 168555312, "step": 78105 }, { "epoch": 12.742251223491028, "grad_norm": 1.9847356081008911, "learning_rate": 1.7515286590117842e-05, "loss": 0.1908, "num_input_tokens_seen": 168564624, "step": 78110 }, { "epoch": 12.743066884176184, "grad_norm": 0.03895273804664612, "learning_rate": 1.7511890921198e-05, "loss": 0.0507, "num_input_tokens_seen": 168576496, "step": 78115 }, { "epoch": 12.743882544861338, "grad_norm": 0.0634070560336113, "learning_rate": 1.7508495404034194e-05, "loss": 0.156, "num_input_tokens_seen": 168587792, "step": 78120 }, { "epoch": 12.744698205546493, "grad_norm": 0.599896252155304, "learning_rate": 1.7505100038695226e-05, "loss": 0.0475, "num_input_tokens_seen": 168598256, "step": 78125 }, { "epoch": 12.745513866231647, "grad_norm": 0.21462175250053406, "learning_rate": 1.750170482524992e-05, "loss": 0.039, "num_input_tokens_seen": 168608880, "step": 78130 }, { "epoch": 12.746329526916803, "grad_norm": 0.2823779284954071, "learning_rate": 1.7498309763767077e-05, "loss": 0.0093, "num_input_tokens_seen": 168620400, "step": 78135 }, { "epoch": 12.747145187601957, "grad_norm": 0.2779505252838135, "learning_rate": 1.7494914854315502e-05, "loss": 0.0266, "num_input_tokens_seen": 168631376, "step": 78140 }, { "epoch": 12.747960848287113, "grad_norm": 0.43167543411254883, "learning_rate": 1.7491520096963997e-05, "loss": 0.0294, "num_input_tokens_seen": 168641776, "step": 78145 }, { "epoch": 12.748776508972268, "grad_norm": 0.33109718561172485, "learning_rate": 1.7488125491781364e-05, "loss": 0.1664, "num_input_tokens_seen": 168652464, "step": 78150 }, { "epoch": 12.749592169657422, "grad_norm": 0.8349601626396179, "learning_rate": 1.7484731038836397e-05, "loss": 0.0256, "num_input_tokens_seen": 168662928, "step": 78155 }, { "epoch": 12.750407830342578, "grad_norm": 0.2011585384607315, "learning_rate": 1.7481336738197894e-05, "loss": 0.1634, "num_input_tokens_seen": 168674448, "step": 78160 }, { "epoch": 12.751223491027732, "grad_norm": 2.071765422821045, "learning_rate": 1.747794258993463e-05, "loss": 0.1113, "num_input_tokens_seen": 168683792, "step": 78165 }, { "epoch": 12.752039151712887, "grad_norm": 0.6336460709571838, "learning_rate": 1.7474548594115413e-05, "loss": 0.1549, "num_input_tokens_seen": 168695248, "step": 78170 }, { "epoch": 12.752854812398043, "grad_norm": 0.0578620582818985, "learning_rate": 1.747115475080901e-05, "loss": 0.0099, "num_input_tokens_seen": 168706992, "step": 78175 }, { "epoch": 12.753670473083197, "grad_norm": 0.09242139756679535, "learning_rate": 1.746776106008421e-05, "loss": 0.0192, "num_input_tokens_seen": 168718800, "step": 78180 }, { "epoch": 12.754486133768353, "grad_norm": 2.5589914321899414, "learning_rate": 1.746436752200978e-05, "loss": 0.3108, "num_input_tokens_seen": 168729552, "step": 78185 }, { "epoch": 12.755301794453507, "grad_norm": 0.03827889636158943, "learning_rate": 1.7460974136654512e-05, "loss": 0.111, "num_input_tokens_seen": 168740304, "step": 78190 }, { "epoch": 12.756117455138662, "grad_norm": 0.19984427094459534, "learning_rate": 1.7457580904087158e-05, "loss": 0.0948, "num_input_tokens_seen": 168751536, "step": 78195 }, { "epoch": 12.756933115823816, "grad_norm": 0.266124427318573, "learning_rate": 1.7454187824376504e-05, "loss": 0.0292, "num_input_tokens_seen": 168762288, "step": 78200 }, { "epoch": 12.757748776508972, "grad_norm": 2.0334906578063965, "learning_rate": 1.7450794897591296e-05, "loss": 0.154, "num_input_tokens_seen": 168772912, "step": 78205 }, { "epoch": 12.758564437194128, "grad_norm": 2.516111135482788, "learning_rate": 1.7447402123800307e-05, "loss": 0.2197, "num_input_tokens_seen": 168781808, "step": 78210 }, { "epoch": 12.759380097879282, "grad_norm": 1.1941967010498047, "learning_rate": 1.7444009503072307e-05, "loss": 0.1346, "num_input_tokens_seen": 168793104, "step": 78215 }, { "epoch": 12.760195758564437, "grad_norm": 1.5551533699035645, "learning_rate": 1.7440617035476027e-05, "loss": 0.1373, "num_input_tokens_seen": 168803312, "step": 78220 }, { "epoch": 12.761011419249591, "grad_norm": 0.4646497070789337, "learning_rate": 1.743722472108024e-05, "loss": 0.0742, "num_input_tokens_seen": 168813776, "step": 78225 }, { "epoch": 12.761827079934747, "grad_norm": 2.0087687969207764, "learning_rate": 1.7433832559953684e-05, "loss": 0.1375, "num_input_tokens_seen": 168824496, "step": 78230 }, { "epoch": 12.762642740619903, "grad_norm": 1.700279951095581, "learning_rate": 1.7430440552165116e-05, "loss": 0.0995, "num_input_tokens_seen": 168835440, "step": 78235 }, { "epoch": 12.763458401305057, "grad_norm": 0.98744136095047, "learning_rate": 1.7427048697783264e-05, "loss": 0.0763, "num_input_tokens_seen": 168845488, "step": 78240 }, { "epoch": 12.764274061990212, "grad_norm": 0.3450573682785034, "learning_rate": 1.7423656996876887e-05, "loss": 0.0241, "num_input_tokens_seen": 168857424, "step": 78245 }, { "epoch": 12.765089722675366, "grad_norm": 0.04824071004986763, "learning_rate": 1.7420265449514704e-05, "loss": 0.2822, "num_input_tokens_seen": 168868528, "step": 78250 }, { "epoch": 12.765905383360522, "grad_norm": 0.2209496647119522, "learning_rate": 1.7416874055765462e-05, "loss": 0.065, "num_input_tokens_seen": 168878864, "step": 78255 }, { "epoch": 12.766721044045678, "grad_norm": 0.5921001434326172, "learning_rate": 1.7413482815697884e-05, "loss": 0.0091, "num_input_tokens_seen": 168890128, "step": 78260 }, { "epoch": 12.767536704730832, "grad_norm": 0.8659346699714661, "learning_rate": 1.7410091729380708e-05, "loss": 0.0322, "num_input_tokens_seen": 168901392, "step": 78265 }, { "epoch": 12.768352365415987, "grad_norm": 0.15100090205669403, "learning_rate": 1.7406700796882638e-05, "loss": 0.0169, "num_input_tokens_seen": 168912464, "step": 78270 }, { "epoch": 12.769168026101141, "grad_norm": 0.06865692138671875, "learning_rate": 1.740331001827242e-05, "loss": 0.0241, "num_input_tokens_seen": 168923952, "step": 78275 }, { "epoch": 12.769983686786297, "grad_norm": 0.07578466087579727, "learning_rate": 1.7399919393618756e-05, "loss": 0.0103, "num_input_tokens_seen": 168933232, "step": 78280 }, { "epoch": 12.770799347471453, "grad_norm": 1.560034990310669, "learning_rate": 1.739652892299037e-05, "loss": 0.0508, "num_input_tokens_seen": 168944496, "step": 78285 }, { "epoch": 12.771615008156607, "grad_norm": 0.04631412401795387, "learning_rate": 1.739313860645597e-05, "loss": 0.0552, "num_input_tokens_seen": 168955952, "step": 78290 }, { "epoch": 12.772430668841762, "grad_norm": 0.10226115584373474, "learning_rate": 1.738974844408427e-05, "loss": 0.028, "num_input_tokens_seen": 168965456, "step": 78295 }, { "epoch": 12.773246329526916, "grad_norm": 0.6392276883125305, "learning_rate": 1.738635843594396e-05, "loss": 0.0216, "num_input_tokens_seen": 168975536, "step": 78300 }, { "epoch": 12.774061990212072, "grad_norm": 0.03507981449365616, "learning_rate": 1.738296858210377e-05, "loss": 0.0096, "num_input_tokens_seen": 168985648, "step": 78305 }, { "epoch": 12.774877650897226, "grad_norm": 0.10269838571548462, "learning_rate": 1.7379578882632375e-05, "loss": 0.0347, "num_input_tokens_seen": 168997136, "step": 78310 }, { "epoch": 12.775693311582382, "grad_norm": 0.21426215767860413, "learning_rate": 1.7376189337598488e-05, "loss": 0.0465, "num_input_tokens_seen": 169008336, "step": 78315 }, { "epoch": 12.776508972267537, "grad_norm": 0.27974483370780945, "learning_rate": 1.737279994707079e-05, "loss": 0.1296, "num_input_tokens_seen": 169019600, "step": 78320 }, { "epoch": 12.777324632952691, "grad_norm": 0.10596464574337006, "learning_rate": 1.7369410711117974e-05, "loss": 0.0608, "num_input_tokens_seen": 169030992, "step": 78325 }, { "epoch": 12.778140293637847, "grad_norm": 0.11379091441631317, "learning_rate": 1.7366021629808736e-05, "loss": 0.092, "num_input_tokens_seen": 169042032, "step": 78330 }, { "epoch": 12.778955954323001, "grad_norm": 0.065534308552742, "learning_rate": 1.736263270321175e-05, "loss": 0.1145, "num_input_tokens_seen": 169053200, "step": 78335 }, { "epoch": 12.779771615008157, "grad_norm": 2.229719877243042, "learning_rate": 1.7359243931395707e-05, "loss": 0.0473, "num_input_tokens_seen": 169064560, "step": 78340 }, { "epoch": 12.780587275693312, "grad_norm": 0.09028586000204086, "learning_rate": 1.735585531442927e-05, "loss": 0.112, "num_input_tokens_seen": 169075664, "step": 78345 }, { "epoch": 12.781402936378466, "grad_norm": 0.047029126435518265, "learning_rate": 1.7352466852381134e-05, "loss": 0.1143, "num_input_tokens_seen": 169086320, "step": 78350 }, { "epoch": 12.782218597063622, "grad_norm": 0.34689897298812866, "learning_rate": 1.7349078545319946e-05, "loss": 0.1987, "num_input_tokens_seen": 169096944, "step": 78355 }, { "epoch": 12.783034257748776, "grad_norm": 0.6782675981521606, "learning_rate": 1.73456903933144e-05, "loss": 0.1325, "num_input_tokens_seen": 169107664, "step": 78360 }, { "epoch": 12.783849918433932, "grad_norm": 0.029367901384830475, "learning_rate": 1.7342302396433138e-05, "loss": 0.0458, "num_input_tokens_seen": 169118480, "step": 78365 }, { "epoch": 12.784665579119086, "grad_norm": 0.07770401984453201, "learning_rate": 1.733891455474484e-05, "loss": 0.0806, "num_input_tokens_seen": 169129104, "step": 78370 }, { "epoch": 12.785481239804241, "grad_norm": 1.8441455364227295, "learning_rate": 1.733552686831815e-05, "loss": 0.1668, "num_input_tokens_seen": 169140080, "step": 78375 }, { "epoch": 12.786296900489397, "grad_norm": 0.8510557413101196, "learning_rate": 1.7332139337221743e-05, "loss": 0.0889, "num_input_tokens_seen": 169151760, "step": 78380 }, { "epoch": 12.78711256117455, "grad_norm": 0.8910442590713501, "learning_rate": 1.7328751961524248e-05, "loss": 0.0775, "num_input_tokens_seen": 169162416, "step": 78385 }, { "epoch": 12.787928221859707, "grad_norm": 0.4454052746295929, "learning_rate": 1.7325364741294335e-05, "loss": 0.0263, "num_input_tokens_seen": 169173072, "step": 78390 }, { "epoch": 12.78874388254486, "grad_norm": 0.027573835104703903, "learning_rate": 1.732197767660063e-05, "loss": 0.025, "num_input_tokens_seen": 169183504, "step": 78395 }, { "epoch": 12.789559543230016, "grad_norm": 2.2325029373168945, "learning_rate": 1.73185907675118e-05, "loss": 0.3502, "num_input_tokens_seen": 169193648, "step": 78400 }, { "epoch": 12.790375203915172, "grad_norm": 1.505352258682251, "learning_rate": 1.731520401409646e-05, "loss": 0.1583, "num_input_tokens_seen": 169204944, "step": 78405 }, { "epoch": 12.791190864600326, "grad_norm": 0.04166170209646225, "learning_rate": 1.731181741642327e-05, "loss": 0.0131, "num_input_tokens_seen": 169215728, "step": 78410 }, { "epoch": 12.792006525285482, "grad_norm": 0.11689214408397675, "learning_rate": 1.7308430974560846e-05, "loss": 0.2495, "num_input_tokens_seen": 169226640, "step": 78415 }, { "epoch": 12.792822185970635, "grad_norm": 0.0645563155412674, "learning_rate": 1.7305044688577828e-05, "loss": 0.0525, "num_input_tokens_seen": 169237584, "step": 78420 }, { "epoch": 12.793637846655791, "grad_norm": 0.14359769225120544, "learning_rate": 1.7301658558542845e-05, "loss": 0.0582, "num_input_tokens_seen": 169249392, "step": 78425 }, { "epoch": 12.794453507340947, "grad_norm": 0.23817631602287292, "learning_rate": 1.7298272584524508e-05, "loss": 0.1804, "num_input_tokens_seen": 169260112, "step": 78430 }, { "epoch": 12.7952691680261, "grad_norm": 1.117061972618103, "learning_rate": 1.7294886766591463e-05, "loss": 0.2245, "num_input_tokens_seen": 169270608, "step": 78435 }, { "epoch": 12.796084828711257, "grad_norm": 1.014007568359375, "learning_rate": 1.7291501104812295e-05, "loss": 0.0619, "num_input_tokens_seen": 169281456, "step": 78440 }, { "epoch": 12.79690048939641, "grad_norm": 2.033285617828369, "learning_rate": 1.7288115599255643e-05, "loss": 0.1047, "num_input_tokens_seen": 169291888, "step": 78445 }, { "epoch": 12.797716150081566, "grad_norm": 0.43539392948150635, "learning_rate": 1.7284730249990115e-05, "loss": 0.0416, "num_input_tokens_seen": 169302512, "step": 78450 }, { "epoch": 12.798531810766722, "grad_norm": 0.04932163655757904, "learning_rate": 1.7281345057084315e-05, "loss": 0.11, "num_input_tokens_seen": 169314128, "step": 78455 }, { "epoch": 12.799347471451876, "grad_norm": 0.2946796715259552, "learning_rate": 1.7277960020606848e-05, "loss": 0.1514, "num_input_tokens_seen": 169324848, "step": 78460 }, { "epoch": 12.800163132137031, "grad_norm": 0.4727557301521301, "learning_rate": 1.7274575140626318e-05, "loss": 0.0336, "num_input_tokens_seen": 169334416, "step": 78465 }, { "epoch": 12.800978792822185, "grad_norm": 1.1273261308670044, "learning_rate": 1.7271190417211317e-05, "loss": 0.1621, "num_input_tokens_seen": 169345456, "step": 78470 }, { "epoch": 12.801794453507341, "grad_norm": 0.9519625306129456, "learning_rate": 1.726780585043046e-05, "loss": 0.2879, "num_input_tokens_seen": 169355920, "step": 78475 }, { "epoch": 12.802610114192497, "grad_norm": 1.5144219398498535, "learning_rate": 1.726442144035231e-05, "loss": 0.2065, "num_input_tokens_seen": 169365968, "step": 78480 }, { "epoch": 12.80342577487765, "grad_norm": 0.03607725352048874, "learning_rate": 1.7261037187045486e-05, "loss": 0.0136, "num_input_tokens_seen": 169377776, "step": 78485 }, { "epoch": 12.804241435562806, "grad_norm": 1.0860909223556519, "learning_rate": 1.7257653090578553e-05, "loss": 0.0562, "num_input_tokens_seen": 169389456, "step": 78490 }, { "epoch": 12.80505709624796, "grad_norm": 0.08253359794616699, "learning_rate": 1.7254269151020113e-05, "loss": 0.0433, "num_input_tokens_seen": 169400336, "step": 78495 }, { "epoch": 12.805872756933116, "grad_norm": 1.681749939918518, "learning_rate": 1.7250885368438722e-05, "loss": 0.2718, "num_input_tokens_seen": 169411856, "step": 78500 }, { "epoch": 12.80668841761827, "grad_norm": 0.022420724853873253, "learning_rate": 1.724750174290298e-05, "loss": 0.0059, "num_input_tokens_seen": 169423056, "step": 78505 }, { "epoch": 12.807504078303426, "grad_norm": 0.06308262795209885, "learning_rate": 1.7244118274481442e-05, "loss": 0.0972, "num_input_tokens_seen": 169434320, "step": 78510 }, { "epoch": 12.808319738988581, "grad_norm": 0.13171906769275665, "learning_rate": 1.7240734963242695e-05, "loss": 0.2129, "num_input_tokens_seen": 169446384, "step": 78515 }, { "epoch": 12.809135399673735, "grad_norm": 0.05376764014363289, "learning_rate": 1.7237351809255283e-05, "loss": 0.2576, "num_input_tokens_seen": 169457264, "step": 78520 }, { "epoch": 12.809951060358891, "grad_norm": 1.465751051902771, "learning_rate": 1.7233968812587797e-05, "loss": 0.1573, "num_input_tokens_seen": 169467920, "step": 78525 }, { "epoch": 12.810766721044045, "grad_norm": 0.16089659929275513, "learning_rate": 1.7230585973308778e-05, "loss": 0.0952, "num_input_tokens_seen": 169478736, "step": 78530 }, { "epoch": 12.8115823817292, "grad_norm": 0.036334943026304245, "learning_rate": 1.7227203291486787e-05, "loss": 0.035, "num_input_tokens_seen": 169489200, "step": 78535 }, { "epoch": 12.812398042414356, "grad_norm": 0.09568361192941666, "learning_rate": 1.7223820767190392e-05, "loss": 0.1663, "num_input_tokens_seen": 169499824, "step": 78540 }, { "epoch": 12.81321370309951, "grad_norm": 0.08498751372098923, "learning_rate": 1.7220438400488127e-05, "loss": 0.0511, "num_input_tokens_seen": 169511280, "step": 78545 }, { "epoch": 12.814029363784666, "grad_norm": 0.062072135508060455, "learning_rate": 1.7217056191448556e-05, "loss": 0.0518, "num_input_tokens_seen": 169523216, "step": 78550 }, { "epoch": 12.81484502446982, "grad_norm": 0.5159264802932739, "learning_rate": 1.72136741401402e-05, "loss": 0.0404, "num_input_tokens_seen": 169534192, "step": 78555 }, { "epoch": 12.815660685154976, "grad_norm": 0.10303039103746414, "learning_rate": 1.7210292246631632e-05, "loss": 0.0304, "num_input_tokens_seen": 169544976, "step": 78560 }, { "epoch": 12.81647634584013, "grad_norm": 0.5858745574951172, "learning_rate": 1.720691051099136e-05, "loss": 0.0169, "num_input_tokens_seen": 169555568, "step": 78565 }, { "epoch": 12.817292006525285, "grad_norm": 0.08886207640171051, "learning_rate": 1.720352893328794e-05, "loss": 0.0486, "num_input_tokens_seen": 169565872, "step": 78570 }, { "epoch": 12.818107667210441, "grad_norm": 0.23098593950271606, "learning_rate": 1.720014751358989e-05, "loss": 0.0142, "num_input_tokens_seen": 169576944, "step": 78575 }, { "epoch": 12.818923327895595, "grad_norm": 0.032466791570186615, "learning_rate": 1.7196766251965757e-05, "loss": 0.279, "num_input_tokens_seen": 169587536, "step": 78580 }, { "epoch": 12.81973898858075, "grad_norm": 1.0999151468276978, "learning_rate": 1.719338514848404e-05, "loss": 0.2057, "num_input_tokens_seen": 169597648, "step": 78585 }, { "epoch": 12.820554649265905, "grad_norm": 0.02983030304312706, "learning_rate": 1.7190004203213288e-05, "loss": 0.039, "num_input_tokens_seen": 169607920, "step": 78590 }, { "epoch": 12.82137030995106, "grad_norm": 0.08369752764701843, "learning_rate": 1.7186623416222004e-05, "loss": 0.0966, "num_input_tokens_seen": 169618672, "step": 78595 }, { "epoch": 12.822185970636216, "grad_norm": 0.22173602879047394, "learning_rate": 1.718324278757871e-05, "loss": 0.0252, "num_input_tokens_seen": 169628816, "step": 78600 }, { "epoch": 12.82300163132137, "grad_norm": 1.564441204071045, "learning_rate": 1.7179862317351913e-05, "loss": 0.1589, "num_input_tokens_seen": 169639952, "step": 78605 }, { "epoch": 12.823817292006526, "grad_norm": 1.0250309705734253, "learning_rate": 1.7176482005610134e-05, "loss": 0.1533, "num_input_tokens_seen": 169650992, "step": 78610 }, { "epoch": 12.82463295269168, "grad_norm": 0.28857725858688354, "learning_rate": 1.717310185242186e-05, "loss": 0.0174, "num_input_tokens_seen": 169662320, "step": 78615 }, { "epoch": 12.825448613376835, "grad_norm": 0.04042555391788483, "learning_rate": 1.716972185785562e-05, "loss": 0.0746, "num_input_tokens_seen": 169672112, "step": 78620 }, { "epoch": 12.826264274061991, "grad_norm": 0.11038023978471756, "learning_rate": 1.716634202197989e-05, "loss": 0.0716, "num_input_tokens_seen": 169682384, "step": 78625 }, { "epoch": 12.827079934747145, "grad_norm": 0.1725560426712036, "learning_rate": 1.7162962344863187e-05, "loss": 0.0122, "num_input_tokens_seen": 169693232, "step": 78630 }, { "epoch": 12.8278955954323, "grad_norm": 0.1958256959915161, "learning_rate": 1.7159582826573983e-05, "loss": 0.0078, "num_input_tokens_seen": 169704432, "step": 78635 }, { "epoch": 12.828711256117455, "grad_norm": 0.04867057129740715, "learning_rate": 1.7156203467180776e-05, "loss": 0.0426, "num_input_tokens_seen": 169713840, "step": 78640 }, { "epoch": 12.82952691680261, "grad_norm": 0.16927559673786163, "learning_rate": 1.715282426675207e-05, "loss": 0.26, "num_input_tokens_seen": 169725136, "step": 78645 }, { "epoch": 12.830342577487766, "grad_norm": 0.1793067902326584, "learning_rate": 1.7149445225356326e-05, "loss": 0.1175, "num_input_tokens_seen": 169737008, "step": 78650 }, { "epoch": 12.83115823817292, "grad_norm": 1.2458311319351196, "learning_rate": 1.714606634306204e-05, "loss": 0.1235, "num_input_tokens_seen": 169747600, "step": 78655 }, { "epoch": 12.831973898858076, "grad_norm": 0.8706295490264893, "learning_rate": 1.7142687619937677e-05, "loss": 0.2004, "num_input_tokens_seen": 169757616, "step": 78660 }, { "epoch": 12.83278955954323, "grad_norm": 0.9867514967918396, "learning_rate": 1.7139309056051726e-05, "loss": 0.1043, "num_input_tokens_seen": 169768784, "step": 78665 }, { "epoch": 12.833605220228385, "grad_norm": 1.7291810512542725, "learning_rate": 1.713593065147264e-05, "loss": 0.1345, "num_input_tokens_seen": 169780208, "step": 78670 }, { "epoch": 12.83442088091354, "grad_norm": 0.056979645043611526, "learning_rate": 1.7132552406268902e-05, "loss": 0.1853, "num_input_tokens_seen": 169792048, "step": 78675 }, { "epoch": 12.835236541598695, "grad_norm": 1.9657354354858398, "learning_rate": 1.712917432050896e-05, "loss": 0.2169, "num_input_tokens_seen": 169802832, "step": 78680 }, { "epoch": 12.83605220228385, "grad_norm": 0.30423402786254883, "learning_rate": 1.71257963942613e-05, "loss": 0.0184, "num_input_tokens_seen": 169814032, "step": 78685 }, { "epoch": 12.836867862969005, "grad_norm": 1.1328938007354736, "learning_rate": 1.712241862759435e-05, "loss": 0.1811, "num_input_tokens_seen": 169824016, "step": 78690 }, { "epoch": 12.83768352365416, "grad_norm": 0.7515915632247925, "learning_rate": 1.7119041020576594e-05, "loss": 0.1297, "num_input_tokens_seen": 169834704, "step": 78695 }, { "epoch": 12.838499184339314, "grad_norm": 0.15632788836956024, "learning_rate": 1.7115663573276457e-05, "loss": 0.2102, "num_input_tokens_seen": 169845040, "step": 78700 }, { "epoch": 12.83931484502447, "grad_norm": 0.22529864311218262, "learning_rate": 1.7112286285762415e-05, "loss": 0.0409, "num_input_tokens_seen": 169856944, "step": 78705 }, { "epoch": 12.840130505709626, "grad_norm": 0.8297218680381775, "learning_rate": 1.710890915810288e-05, "loss": 0.1143, "num_input_tokens_seen": 169868048, "step": 78710 }, { "epoch": 12.84094616639478, "grad_norm": 1.9379075765609741, "learning_rate": 1.7105532190366323e-05, "loss": 0.3121, "num_input_tokens_seen": 169877776, "step": 78715 }, { "epoch": 12.841761827079935, "grad_norm": 0.10476222634315491, "learning_rate": 1.7102155382621164e-05, "loss": 0.1532, "num_input_tokens_seen": 169888592, "step": 78720 }, { "epoch": 12.84257748776509, "grad_norm": 0.03180732950568199, "learning_rate": 1.7098778734935854e-05, "loss": 0.1037, "num_input_tokens_seen": 169899824, "step": 78725 }, { "epoch": 12.843393148450245, "grad_norm": 0.042717527598142624, "learning_rate": 1.7095402247378806e-05, "loss": 0.0909, "num_input_tokens_seen": 169910064, "step": 78730 }, { "epoch": 12.844208809135399, "grad_norm": 1.2341430187225342, "learning_rate": 1.7092025920018466e-05, "loss": 0.1479, "num_input_tokens_seen": 169920112, "step": 78735 }, { "epoch": 12.845024469820554, "grad_norm": 0.40998274087905884, "learning_rate": 1.708864975292325e-05, "loss": 0.0162, "num_input_tokens_seen": 169931088, "step": 78740 }, { "epoch": 12.84584013050571, "grad_norm": 0.3148539364337921, "learning_rate": 1.708527374616158e-05, "loss": 0.1157, "num_input_tokens_seen": 169942160, "step": 78745 }, { "epoch": 12.846655791190864, "grad_norm": 1.7940845489501953, "learning_rate": 1.7081897899801885e-05, "loss": 0.1794, "num_input_tokens_seen": 169951216, "step": 78750 }, { "epoch": 12.84747145187602, "grad_norm": 0.4914838671684265, "learning_rate": 1.7078522213912566e-05, "loss": 0.0243, "num_input_tokens_seen": 169962288, "step": 78755 }, { "epoch": 12.848287112561174, "grad_norm": 0.807894766330719, "learning_rate": 1.707514668856205e-05, "loss": 0.0193, "num_input_tokens_seen": 169973648, "step": 78760 }, { "epoch": 12.84910277324633, "grad_norm": 1.1409434080123901, "learning_rate": 1.7071771323818736e-05, "loss": 0.2513, "num_input_tokens_seen": 169984688, "step": 78765 }, { "epoch": 12.849918433931485, "grad_norm": 0.03233710303902626, "learning_rate": 1.7068396119751038e-05, "loss": 0.1559, "num_input_tokens_seen": 169996240, "step": 78770 }, { "epoch": 12.850734094616639, "grad_norm": 1.0139901638031006, "learning_rate": 1.706502107642735e-05, "loss": 0.1927, "num_input_tokens_seen": 170006768, "step": 78775 }, { "epoch": 12.851549755301795, "grad_norm": 2.0797927379608154, "learning_rate": 1.7061646193916083e-05, "loss": 0.1858, "num_input_tokens_seen": 170017200, "step": 78780 }, { "epoch": 12.852365415986949, "grad_norm": 0.06800775229930878, "learning_rate": 1.7058271472285615e-05, "loss": 0.1307, "num_input_tokens_seen": 170029168, "step": 78785 }, { "epoch": 12.853181076672104, "grad_norm": 0.36431220173835754, "learning_rate": 1.7054896911604363e-05, "loss": 0.0851, "num_input_tokens_seen": 170040528, "step": 78790 }, { "epoch": 12.85399673735726, "grad_norm": 0.19308127462863922, "learning_rate": 1.7051522511940694e-05, "loss": 0.1869, "num_input_tokens_seen": 170051568, "step": 78795 }, { "epoch": 12.854812398042414, "grad_norm": 0.2706639766693115, "learning_rate": 1.7048148273363012e-05, "loss": 0.0167, "num_input_tokens_seen": 170062864, "step": 78800 }, { "epoch": 12.85562805872757, "grad_norm": 0.7273874878883362, "learning_rate": 1.704477419593969e-05, "loss": 0.063, "num_input_tokens_seen": 170073840, "step": 78805 }, { "epoch": 12.856443719412724, "grad_norm": 0.6555032134056091, "learning_rate": 1.7041400279739115e-05, "loss": 0.0468, "num_input_tokens_seen": 170085456, "step": 78810 }, { "epoch": 12.85725938009788, "grad_norm": 0.15594008564949036, "learning_rate": 1.7038026524829647e-05, "loss": 0.0361, "num_input_tokens_seen": 170095920, "step": 78815 }, { "epoch": 12.858075040783035, "grad_norm": 0.2035299837589264, "learning_rate": 1.7034652931279688e-05, "loss": 0.0142, "num_input_tokens_seen": 170105968, "step": 78820 }, { "epoch": 12.858890701468189, "grad_norm": 0.03793412446975708, "learning_rate": 1.7031279499157583e-05, "loss": 0.0664, "num_input_tokens_seen": 170116976, "step": 78825 }, { "epoch": 12.859706362153345, "grad_norm": 0.7729410529136658, "learning_rate": 1.702790622853171e-05, "loss": 0.1397, "num_input_tokens_seen": 170127504, "step": 78830 }, { "epoch": 12.860522022838499, "grad_norm": 0.10862909257411957, "learning_rate": 1.7024533119470426e-05, "loss": 0.0864, "num_input_tokens_seen": 170137904, "step": 78835 }, { "epoch": 12.861337683523654, "grad_norm": 0.08806788176298141, "learning_rate": 1.7021160172042105e-05, "loss": 0.0213, "num_input_tokens_seen": 170149840, "step": 78840 }, { "epoch": 12.86215334420881, "grad_norm": 0.039600349962711334, "learning_rate": 1.7017787386315087e-05, "loss": 0.0579, "num_input_tokens_seen": 170161072, "step": 78845 }, { "epoch": 12.862969004893964, "grad_norm": 1.2832614183425903, "learning_rate": 1.7014414762357733e-05, "loss": 0.0746, "num_input_tokens_seen": 170171024, "step": 78850 }, { "epoch": 12.86378466557912, "grad_norm": 2.0368285179138184, "learning_rate": 1.70110423002384e-05, "loss": 0.3242, "num_input_tokens_seen": 170181712, "step": 78855 }, { "epoch": 12.864600326264274, "grad_norm": 0.0662120059132576, "learning_rate": 1.7007670000025426e-05, "loss": 0.0425, "num_input_tokens_seen": 170193040, "step": 78860 }, { "epoch": 12.86541598694943, "grad_norm": 0.044275328516960144, "learning_rate": 1.7004297861787164e-05, "loss": 0.0289, "num_input_tokens_seen": 170205328, "step": 78865 }, { "epoch": 12.866231647634583, "grad_norm": 0.5161898732185364, "learning_rate": 1.700092588559194e-05, "loss": 0.0687, "num_input_tokens_seen": 170215888, "step": 78870 }, { "epoch": 12.867047308319739, "grad_norm": 0.026921892538666725, "learning_rate": 1.699755407150811e-05, "loss": 0.0615, "num_input_tokens_seen": 170227376, "step": 78875 }, { "epoch": 12.867862969004895, "grad_norm": 0.09594374895095825, "learning_rate": 1.699418241960399e-05, "loss": 0.1006, "num_input_tokens_seen": 170238416, "step": 78880 }, { "epoch": 12.868678629690049, "grad_norm": 1.718092441558838, "learning_rate": 1.6990810929947933e-05, "loss": 0.1508, "num_input_tokens_seen": 170249552, "step": 78885 }, { "epoch": 12.869494290375204, "grad_norm": 0.8452916741371155, "learning_rate": 1.6987439602608234e-05, "loss": 0.0374, "num_input_tokens_seen": 170260240, "step": 78890 }, { "epoch": 12.870309951060358, "grad_norm": 1.979227900505066, "learning_rate": 1.698406843765325e-05, "loss": 0.208, "num_input_tokens_seen": 170269808, "step": 78895 }, { "epoch": 12.871125611745514, "grad_norm": 1.1952173709869385, "learning_rate": 1.698069743515128e-05, "loss": 0.1474, "num_input_tokens_seen": 170281040, "step": 78900 }, { "epoch": 12.87194127243067, "grad_norm": 0.038220904767513275, "learning_rate": 1.6977326595170654e-05, "loss": 0.1272, "num_input_tokens_seen": 170290320, "step": 78905 }, { "epoch": 12.872756933115824, "grad_norm": 2.1527700424194336, "learning_rate": 1.6973955917779684e-05, "loss": 0.12, "num_input_tokens_seen": 170300944, "step": 78910 }, { "epoch": 12.87357259380098, "grad_norm": 0.6370394229888916, "learning_rate": 1.6970585403046674e-05, "loss": 0.1697, "num_input_tokens_seen": 170311600, "step": 78915 }, { "epoch": 12.874388254486133, "grad_norm": 0.15624135732650757, "learning_rate": 1.696721505103994e-05, "loss": 0.0151, "num_input_tokens_seen": 170322000, "step": 78920 }, { "epoch": 12.875203915171289, "grad_norm": 0.6866651773452759, "learning_rate": 1.6963844861827783e-05, "loss": 0.1405, "num_input_tokens_seen": 170332240, "step": 78925 }, { "epoch": 12.876019575856443, "grad_norm": 1.6835664510726929, "learning_rate": 1.69604748354785e-05, "loss": 0.1943, "num_input_tokens_seen": 170343120, "step": 78930 }, { "epoch": 12.876835236541599, "grad_norm": 0.5334681868553162, "learning_rate": 1.69571049720604e-05, "loss": 0.1189, "num_input_tokens_seen": 170352816, "step": 78935 }, { "epoch": 12.877650897226754, "grad_norm": 0.6118309497833252, "learning_rate": 1.6953735271641764e-05, "loss": 0.1662, "num_input_tokens_seen": 170362448, "step": 78940 }, { "epoch": 12.878466557911908, "grad_norm": 0.835306704044342, "learning_rate": 1.69503657342909e-05, "loss": 0.1785, "num_input_tokens_seen": 170373040, "step": 78945 }, { "epoch": 12.879282218597064, "grad_norm": 0.14784006774425507, "learning_rate": 1.6946996360076075e-05, "loss": 0.0418, "num_input_tokens_seen": 170383984, "step": 78950 }, { "epoch": 12.880097879282218, "grad_norm": 0.37812539935112, "learning_rate": 1.6943627149065587e-05, "loss": 0.1577, "num_input_tokens_seen": 170394608, "step": 78955 }, { "epoch": 12.880913539967374, "grad_norm": 0.27237188816070557, "learning_rate": 1.6940258101327722e-05, "loss": 0.0704, "num_input_tokens_seen": 170406480, "step": 78960 }, { "epoch": 12.88172920065253, "grad_norm": 0.7463412284851074, "learning_rate": 1.693688921693074e-05, "loss": 0.1629, "num_input_tokens_seen": 170417936, "step": 78965 }, { "epoch": 12.882544861337683, "grad_norm": 0.025812888517975807, "learning_rate": 1.6933520495942942e-05, "loss": 0.1805, "num_input_tokens_seen": 170428848, "step": 78970 }, { "epoch": 12.883360522022839, "grad_norm": 1.4469724893569946, "learning_rate": 1.693015193843257e-05, "loss": 0.2996, "num_input_tokens_seen": 170439440, "step": 78975 }, { "epoch": 12.884176182707993, "grad_norm": 0.8922411203384399, "learning_rate": 1.692678354446792e-05, "loss": 0.0544, "num_input_tokens_seen": 170450544, "step": 78980 }, { "epoch": 12.884991843393149, "grad_norm": 0.1139574646949768, "learning_rate": 1.692341531411723e-05, "loss": 0.084, "num_input_tokens_seen": 170462384, "step": 78985 }, { "epoch": 12.885807504078304, "grad_norm": 0.17886033654212952, "learning_rate": 1.6920047247448782e-05, "loss": 0.0183, "num_input_tokens_seen": 170473520, "step": 78990 }, { "epoch": 12.886623164763458, "grad_norm": 0.17637701332569122, "learning_rate": 1.6916679344530823e-05, "loss": 0.0128, "num_input_tokens_seen": 170484080, "step": 78995 }, { "epoch": 12.887438825448614, "grad_norm": 0.20520204305648804, "learning_rate": 1.691331160543162e-05, "loss": 0.0924, "num_input_tokens_seen": 170495568, "step": 79000 }, { "epoch": 12.888254486133768, "grad_norm": 0.8771510124206543, "learning_rate": 1.6909944030219404e-05, "loss": 0.1295, "num_input_tokens_seen": 170505968, "step": 79005 }, { "epoch": 12.889070146818923, "grad_norm": 0.6149004101753235, "learning_rate": 1.6906576618962444e-05, "loss": 0.0466, "num_input_tokens_seen": 170515536, "step": 79010 }, { "epoch": 12.88988580750408, "grad_norm": 0.07827336341142654, "learning_rate": 1.6903209371728966e-05, "loss": 0.0434, "num_input_tokens_seen": 170526160, "step": 79015 }, { "epoch": 12.890701468189233, "grad_norm": 0.3009965419769287, "learning_rate": 1.6899842288587237e-05, "loss": 0.1506, "num_input_tokens_seen": 170537776, "step": 79020 }, { "epoch": 12.891517128874389, "grad_norm": 0.06866800040006638, "learning_rate": 1.6896475369605467e-05, "loss": 0.0564, "num_input_tokens_seen": 170549104, "step": 79025 }, { "epoch": 12.892332789559543, "grad_norm": 0.21913976967334747, "learning_rate": 1.689310861485191e-05, "loss": 0.0206, "num_input_tokens_seen": 170559504, "step": 79030 }, { "epoch": 12.893148450244698, "grad_norm": 0.42868271470069885, "learning_rate": 1.6889742024394786e-05, "loss": 0.093, "num_input_tokens_seen": 170571312, "step": 79035 }, { "epoch": 12.893964110929852, "grad_norm": 0.1961197406053543, "learning_rate": 1.6886375598302335e-05, "loss": 0.0329, "num_input_tokens_seen": 170582896, "step": 79040 }, { "epoch": 12.894779771615008, "grad_norm": 0.09879224747419357, "learning_rate": 1.6883009336642768e-05, "loss": 0.133, "num_input_tokens_seen": 170592336, "step": 79045 }, { "epoch": 12.895595432300164, "grad_norm": 0.16867636144161224, "learning_rate": 1.6879643239484316e-05, "loss": 0.0112, "num_input_tokens_seen": 170604336, "step": 79050 }, { "epoch": 12.896411092985318, "grad_norm": 1.3109699487686157, "learning_rate": 1.68762773068952e-05, "loss": 0.2195, "num_input_tokens_seen": 170615504, "step": 79055 }, { "epoch": 12.897226753670473, "grad_norm": 0.6812476515769958, "learning_rate": 1.6872911538943624e-05, "loss": 0.1274, "num_input_tokens_seen": 170625264, "step": 79060 }, { "epoch": 12.898042414355627, "grad_norm": 0.26801127195358276, "learning_rate": 1.686954593569781e-05, "loss": 0.0418, "num_input_tokens_seen": 170636464, "step": 79065 }, { "epoch": 12.898858075040783, "grad_norm": 0.8946239352226257, "learning_rate": 1.6866180497225955e-05, "loss": 0.219, "num_input_tokens_seen": 170647632, "step": 79070 }, { "epoch": 12.899673735725939, "grad_norm": 0.10591210424900055, "learning_rate": 1.686281522359628e-05, "loss": 0.0922, "num_input_tokens_seen": 170658192, "step": 79075 }, { "epoch": 12.900489396411093, "grad_norm": 0.7002345323562622, "learning_rate": 1.685945011487697e-05, "loss": 0.0845, "num_input_tokens_seen": 170668272, "step": 79080 }, { "epoch": 12.901305057096248, "grad_norm": 0.09924004226922989, "learning_rate": 1.6856085171136236e-05, "loss": 0.0097, "num_input_tokens_seen": 170677232, "step": 79085 }, { "epoch": 12.902120717781402, "grad_norm": 0.3299320340156555, "learning_rate": 1.6852720392442264e-05, "loss": 0.1561, "num_input_tokens_seen": 170688688, "step": 79090 }, { "epoch": 12.902936378466558, "grad_norm": 0.5382140278816223, "learning_rate": 1.6849355778863253e-05, "loss": 0.1251, "num_input_tokens_seen": 170699536, "step": 79095 }, { "epoch": 12.903752039151712, "grad_norm": 0.10627488046884537, "learning_rate": 1.6845991330467377e-05, "loss": 0.1588, "num_input_tokens_seen": 170709872, "step": 79100 }, { "epoch": 12.904567699836868, "grad_norm": 0.3128858208656311, "learning_rate": 1.6842627047322844e-05, "loss": 0.0988, "num_input_tokens_seen": 170720432, "step": 79105 }, { "epoch": 12.905383360522023, "grad_norm": 0.22219857573509216, "learning_rate": 1.683926292949781e-05, "loss": 0.0302, "num_input_tokens_seen": 170730448, "step": 79110 }, { "epoch": 12.906199021207177, "grad_norm": 0.0698913186788559, "learning_rate": 1.6835898977060477e-05, "loss": 0.197, "num_input_tokens_seen": 170740176, "step": 79115 }, { "epoch": 12.907014681892333, "grad_norm": 0.9653496742248535, "learning_rate": 1.6832535190078995e-05, "loss": 0.0278, "num_input_tokens_seen": 170750832, "step": 79120 }, { "epoch": 12.907830342577487, "grad_norm": 0.05142999812960625, "learning_rate": 1.682917156862156e-05, "loss": 0.0783, "num_input_tokens_seen": 170761680, "step": 79125 }, { "epoch": 12.908646003262643, "grad_norm": 1.2782623767852783, "learning_rate": 1.682580811275632e-05, "loss": 0.0856, "num_input_tokens_seen": 170771536, "step": 79130 }, { "epoch": 12.909461663947798, "grad_norm": 0.6967428922653198, "learning_rate": 1.682244482255146e-05, "loss": 0.1621, "num_input_tokens_seen": 170783216, "step": 79135 }, { "epoch": 12.910277324632952, "grad_norm": 3.2200450897216797, "learning_rate": 1.6819081698075117e-05, "loss": 0.1558, "num_input_tokens_seen": 170793296, "step": 79140 }, { "epoch": 12.911092985318108, "grad_norm": 0.02954035811126232, "learning_rate": 1.681571873939547e-05, "loss": 0.026, "num_input_tokens_seen": 170803760, "step": 79145 }, { "epoch": 12.911908646003262, "grad_norm": 1.4886987209320068, "learning_rate": 1.681235594658066e-05, "loss": 0.1329, "num_input_tokens_seen": 170813968, "step": 79150 }, { "epoch": 12.912724306688418, "grad_norm": 1.6637446880340576, "learning_rate": 1.6808993319698853e-05, "loss": 0.0956, "num_input_tokens_seen": 170824080, "step": 79155 }, { "epoch": 12.913539967373573, "grad_norm": 0.02944220043718815, "learning_rate": 1.6805630858818173e-05, "loss": 0.0298, "num_input_tokens_seen": 170835600, "step": 79160 }, { "epoch": 12.914355628058727, "grad_norm": 0.014704996719956398, "learning_rate": 1.680226856400679e-05, "loss": 0.0796, "num_input_tokens_seen": 170847088, "step": 79165 }, { "epoch": 12.915171288743883, "grad_norm": 0.27737513184547424, "learning_rate": 1.679890643533283e-05, "loss": 0.0667, "num_input_tokens_seen": 170857040, "step": 79170 }, { "epoch": 12.915986949429037, "grad_norm": 0.01912410743534565, "learning_rate": 1.6795544472864426e-05, "loss": 0.1201, "num_input_tokens_seen": 170868432, "step": 79175 }, { "epoch": 12.916802610114193, "grad_norm": 1.2569153308868408, "learning_rate": 1.6792182676669738e-05, "loss": 0.0801, "num_input_tokens_seen": 170880176, "step": 79180 }, { "epoch": 12.917618270799348, "grad_norm": 0.06121936812996864, "learning_rate": 1.6788821046816865e-05, "loss": 0.0699, "num_input_tokens_seen": 170890576, "step": 79185 }, { "epoch": 12.918433931484502, "grad_norm": 0.5130727887153625, "learning_rate": 1.6785459583373964e-05, "loss": 0.1936, "num_input_tokens_seen": 170902384, "step": 79190 }, { "epoch": 12.919249592169658, "grad_norm": 1.5775202512741089, "learning_rate": 1.6782098286409135e-05, "loss": 0.0851, "num_input_tokens_seen": 170913936, "step": 79195 }, { "epoch": 12.920065252854812, "grad_norm": 0.2149566113948822, "learning_rate": 1.677873715599052e-05, "loss": 0.0247, "num_input_tokens_seen": 170924816, "step": 79200 }, { "epoch": 12.920880913539968, "grad_norm": 0.46521204710006714, "learning_rate": 1.6775376192186216e-05, "loss": 0.08, "num_input_tokens_seen": 170936080, "step": 79205 }, { "epoch": 12.921696574225122, "grad_norm": 0.09573189914226532, "learning_rate": 1.6772015395064355e-05, "loss": 0.06, "num_input_tokens_seen": 170947952, "step": 79210 }, { "epoch": 12.922512234910277, "grad_norm": 0.9786128997802734, "learning_rate": 1.6768654764693032e-05, "loss": 0.2175, "num_input_tokens_seen": 170959088, "step": 79215 }, { "epoch": 12.923327895595433, "grad_norm": 0.27529218792915344, "learning_rate": 1.6765294301140366e-05, "loss": 0.0323, "num_input_tokens_seen": 170969136, "step": 79220 }, { "epoch": 12.924143556280587, "grad_norm": 0.08934711664915085, "learning_rate": 1.6761934004474454e-05, "loss": 0.0261, "num_input_tokens_seen": 170979088, "step": 79225 }, { "epoch": 12.924959216965743, "grad_norm": 2.1341195106506348, "learning_rate": 1.6758573874763405e-05, "loss": 0.0797, "num_input_tokens_seen": 170989040, "step": 79230 }, { "epoch": 12.925774877650896, "grad_norm": 3.0233166217803955, "learning_rate": 1.6755213912075314e-05, "loss": 0.1181, "num_input_tokens_seen": 171000560, "step": 79235 }, { "epoch": 12.926590538336052, "grad_norm": 0.04242251440882683, "learning_rate": 1.6751854116478267e-05, "loss": 0.0212, "num_input_tokens_seen": 171010064, "step": 79240 }, { "epoch": 12.927406199021208, "grad_norm": 0.07611534744501114, "learning_rate": 1.6748494488040354e-05, "loss": 0.0392, "num_input_tokens_seen": 171021328, "step": 79245 }, { "epoch": 12.928221859706362, "grad_norm": 0.0926312729716301, "learning_rate": 1.6745135026829683e-05, "loss": 0.0359, "num_input_tokens_seen": 171032848, "step": 79250 }, { "epoch": 12.929037520391518, "grad_norm": 1.540789246559143, "learning_rate": 1.674177573291431e-05, "loss": 0.0968, "num_input_tokens_seen": 171043664, "step": 79255 }, { "epoch": 12.929853181076671, "grad_norm": 0.42603012919425964, "learning_rate": 1.6738416606362338e-05, "loss": 0.1736, "num_input_tokens_seen": 171055632, "step": 79260 }, { "epoch": 12.930668841761827, "grad_norm": 1.0926992893218994, "learning_rate": 1.6735057647241827e-05, "loss": 0.0688, "num_input_tokens_seen": 171066576, "step": 79265 }, { "epoch": 12.931484502446983, "grad_norm": 0.05504552274942398, "learning_rate": 1.6731698855620864e-05, "loss": 0.0389, "num_input_tokens_seen": 171077424, "step": 79270 }, { "epoch": 12.932300163132137, "grad_norm": 0.32236379384994507, "learning_rate": 1.67283402315675e-05, "loss": 0.0181, "num_input_tokens_seen": 171087600, "step": 79275 }, { "epoch": 12.933115823817293, "grad_norm": 0.04750223085284233, "learning_rate": 1.6724981775149823e-05, "loss": 0.1101, "num_input_tokens_seen": 171097968, "step": 79280 }, { "epoch": 12.933931484502446, "grad_norm": 1.233414888381958, "learning_rate": 1.6721623486435892e-05, "loss": 0.0274, "num_input_tokens_seen": 171108976, "step": 79285 }, { "epoch": 12.934747145187602, "grad_norm": 1.8415770530700684, "learning_rate": 1.671826536549375e-05, "loss": 0.1781, "num_input_tokens_seen": 171118576, "step": 79290 }, { "epoch": 12.935562805872756, "grad_norm": 1.3769680261611938, "learning_rate": 1.6714907412391483e-05, "loss": 0.1269, "num_input_tokens_seen": 171127760, "step": 79295 }, { "epoch": 12.936378466557912, "grad_norm": 0.12869566679000854, "learning_rate": 1.6711549627197114e-05, "loss": 0.0294, "num_input_tokens_seen": 171139472, "step": 79300 }, { "epoch": 12.937194127243067, "grad_norm": 0.12166541069746017, "learning_rate": 1.670819200997872e-05, "loss": 0.0612, "num_input_tokens_seen": 171149072, "step": 79305 }, { "epoch": 12.938009787928221, "grad_norm": 0.5477724671363831, "learning_rate": 1.6704834560804322e-05, "loss": 0.0519, "num_input_tokens_seen": 171160176, "step": 79310 }, { "epoch": 12.938825448613377, "grad_norm": 0.051106248050928116, "learning_rate": 1.6701477279741985e-05, "loss": 0.1915, "num_input_tokens_seen": 171170544, "step": 79315 }, { "epoch": 12.939641109298531, "grad_norm": 0.23163524270057678, "learning_rate": 1.6698120166859727e-05, "loss": 0.0606, "num_input_tokens_seen": 171180112, "step": 79320 }, { "epoch": 12.940456769983687, "grad_norm": 1.6473965644836426, "learning_rate": 1.6694763222225606e-05, "loss": 0.0985, "num_input_tokens_seen": 171192272, "step": 79325 }, { "epoch": 12.941272430668842, "grad_norm": 0.08036120235919952, "learning_rate": 1.6691406445907634e-05, "loss": 0.0252, "num_input_tokens_seen": 171202960, "step": 79330 }, { "epoch": 12.942088091353996, "grad_norm": 0.019074685871601105, "learning_rate": 1.6688049837973863e-05, "loss": 0.1433, "num_input_tokens_seen": 171213008, "step": 79335 }, { "epoch": 12.942903752039152, "grad_norm": 0.17922548949718475, "learning_rate": 1.6684693398492292e-05, "loss": 0.2648, "num_input_tokens_seen": 171224272, "step": 79340 }, { "epoch": 12.943719412724306, "grad_norm": 0.15689069032669067, "learning_rate": 1.668133712753097e-05, "loss": 0.0183, "num_input_tokens_seen": 171235440, "step": 79345 }, { "epoch": 12.944535073409462, "grad_norm": 0.6942062973976135, "learning_rate": 1.6677981025157895e-05, "loss": 0.0463, "num_input_tokens_seen": 171246928, "step": 79350 }, { "epoch": 12.945350734094617, "grad_norm": 0.08173768222332001, "learning_rate": 1.6674625091441103e-05, "loss": 0.1798, "num_input_tokens_seen": 171259024, "step": 79355 }, { "epoch": 12.946166394779771, "grad_norm": 0.060916196554899216, "learning_rate": 1.6671269326448584e-05, "loss": 0.0366, "num_input_tokens_seen": 171269808, "step": 79360 }, { "epoch": 12.946982055464927, "grad_norm": 0.013870242983102798, "learning_rate": 1.6667913730248363e-05, "loss": 0.2314, "num_input_tokens_seen": 171280208, "step": 79365 }, { "epoch": 12.947797716150081, "grad_norm": 0.1328904777765274, "learning_rate": 1.666455830290844e-05, "loss": 0.0932, "num_input_tokens_seen": 171291312, "step": 79370 }, { "epoch": 12.948613376835237, "grad_norm": 0.1416485756635666, "learning_rate": 1.6661203044496816e-05, "loss": 0.0427, "num_input_tokens_seen": 171303088, "step": 79375 }, { "epoch": 12.949429037520392, "grad_norm": 1.3642760515213013, "learning_rate": 1.665784795508149e-05, "loss": 0.1759, "num_input_tokens_seen": 171313840, "step": 79380 }, { "epoch": 12.950244698205546, "grad_norm": 1.364372730255127, "learning_rate": 1.6654493034730457e-05, "loss": 0.1067, "num_input_tokens_seen": 171324048, "step": 79385 }, { "epoch": 12.951060358890702, "grad_norm": 2.5523629188537598, "learning_rate": 1.6651138283511707e-05, "loss": 0.0492, "num_input_tokens_seen": 171334000, "step": 79390 }, { "epoch": 12.951876019575856, "grad_norm": 1.2263391017913818, "learning_rate": 1.664778370149324e-05, "loss": 0.1751, "num_input_tokens_seen": 171344208, "step": 79395 }, { "epoch": 12.952691680261012, "grad_norm": 1.7374197244644165, "learning_rate": 1.6644429288743027e-05, "loss": 0.2248, "num_input_tokens_seen": 171355888, "step": 79400 }, { "epoch": 12.953507340946166, "grad_norm": 0.6669625639915466, "learning_rate": 1.6641075045329057e-05, "loss": 0.0299, "num_input_tokens_seen": 171365392, "step": 79405 }, { "epoch": 12.954323001631321, "grad_norm": 0.10840458422899246, "learning_rate": 1.6637720971319304e-05, "loss": 0.0468, "num_input_tokens_seen": 171375696, "step": 79410 }, { "epoch": 12.955138662316477, "grad_norm": 0.4052591919898987, "learning_rate": 1.6634367066781735e-05, "loss": 0.0304, "num_input_tokens_seen": 171386096, "step": 79415 }, { "epoch": 12.955954323001631, "grad_norm": 2.115600109100342, "learning_rate": 1.6631013331784346e-05, "loss": 0.1758, "num_input_tokens_seen": 171395664, "step": 79420 }, { "epoch": 12.956769983686787, "grad_norm": 0.25360745191574097, "learning_rate": 1.6627659766395077e-05, "loss": 0.0572, "num_input_tokens_seen": 171406928, "step": 79425 }, { "epoch": 12.95758564437194, "grad_norm": 0.7904363870620728, "learning_rate": 1.6624306370681912e-05, "loss": 0.0503, "num_input_tokens_seen": 171417584, "step": 79430 }, { "epoch": 12.958401305057096, "grad_norm": 0.41989317536354065, "learning_rate": 1.662095314471279e-05, "loss": 0.0154, "num_input_tokens_seen": 171428912, "step": 79435 }, { "epoch": 12.959216965742252, "grad_norm": 0.5169289708137512, "learning_rate": 1.66176000885557e-05, "loss": 0.1502, "num_input_tokens_seen": 171440112, "step": 79440 }, { "epoch": 12.960032626427406, "grad_norm": 1.1919529438018799, "learning_rate": 1.6614247202278565e-05, "loss": 0.0356, "num_input_tokens_seen": 171450576, "step": 79445 }, { "epoch": 12.960848287112562, "grad_norm": 0.9101292490959167, "learning_rate": 1.661089448594936e-05, "loss": 0.034, "num_input_tokens_seen": 171460496, "step": 79450 }, { "epoch": 12.961663947797716, "grad_norm": 0.2984828054904938, "learning_rate": 1.6607541939636005e-05, "loss": 0.1952, "num_input_tokens_seen": 171472016, "step": 79455 }, { "epoch": 12.962479608482871, "grad_norm": 0.21788997948169708, "learning_rate": 1.6604189563406475e-05, "loss": 0.1055, "num_input_tokens_seen": 171483056, "step": 79460 }, { "epoch": 12.963295269168025, "grad_norm": 0.10568325221538544, "learning_rate": 1.660083735732868e-05, "loss": 0.1175, "num_input_tokens_seen": 171493488, "step": 79465 }, { "epoch": 12.964110929853181, "grad_norm": 0.028749944642186165, "learning_rate": 1.6597485321470586e-05, "loss": 0.0662, "num_input_tokens_seen": 171503696, "step": 79470 }, { "epoch": 12.964926590538337, "grad_norm": 0.6177870631217957, "learning_rate": 1.6594133455900096e-05, "loss": 0.0129, "num_input_tokens_seen": 171513936, "step": 79475 }, { "epoch": 12.96574225122349, "grad_norm": 0.20088544487953186, "learning_rate": 1.659078176068517e-05, "loss": 0.1284, "num_input_tokens_seen": 171524496, "step": 79480 }, { "epoch": 12.966557911908646, "grad_norm": 1.384828805923462, "learning_rate": 1.6587430235893707e-05, "loss": 0.0413, "num_input_tokens_seen": 171533808, "step": 79485 }, { "epoch": 12.9673735725938, "grad_norm": 0.7203127145767212, "learning_rate": 1.6584078881593642e-05, "loss": 0.1845, "num_input_tokens_seen": 171545296, "step": 79490 }, { "epoch": 12.968189233278956, "grad_norm": 0.2770227789878845, "learning_rate": 1.6580727697852904e-05, "loss": 0.1654, "num_input_tokens_seen": 171556464, "step": 79495 }, { "epoch": 12.969004893964112, "grad_norm": 2.0927305221557617, "learning_rate": 1.6577376684739394e-05, "loss": 0.1783, "num_input_tokens_seen": 171567696, "step": 79500 }, { "epoch": 12.969820554649266, "grad_norm": 1.3359081745147705, "learning_rate": 1.6574025842321038e-05, "loss": 0.0514, "num_input_tokens_seen": 171579248, "step": 79505 }, { "epoch": 12.970636215334421, "grad_norm": 0.17549018561840057, "learning_rate": 1.6570675170665726e-05, "loss": 0.1629, "num_input_tokens_seen": 171591216, "step": 79510 }, { "epoch": 12.971451876019575, "grad_norm": 0.15209732949733734, "learning_rate": 1.6567324669841385e-05, "loss": 0.0433, "num_input_tokens_seen": 171602160, "step": 79515 }, { "epoch": 12.97226753670473, "grad_norm": 1.63656485080719, "learning_rate": 1.65639743399159e-05, "loss": 0.2064, "num_input_tokens_seen": 171612368, "step": 79520 }, { "epoch": 12.973083197389887, "grad_norm": 0.5273931622505188, "learning_rate": 1.6560624180957186e-05, "loss": 0.0339, "num_input_tokens_seen": 171621872, "step": 79525 }, { "epoch": 12.97389885807504, "grad_norm": 0.05630730837583542, "learning_rate": 1.6557274193033117e-05, "loss": 0.0868, "num_input_tokens_seen": 171631888, "step": 79530 }, { "epoch": 12.974714518760196, "grad_norm": 0.21158406138420105, "learning_rate": 1.65539243762116e-05, "loss": 0.0954, "num_input_tokens_seen": 171643504, "step": 79535 }, { "epoch": 12.97553017944535, "grad_norm": 0.2647726535797119, "learning_rate": 1.6550574730560526e-05, "loss": 0.0809, "num_input_tokens_seen": 171654416, "step": 79540 }, { "epoch": 12.976345840130506, "grad_norm": 0.363643616437912, "learning_rate": 1.654722525614777e-05, "loss": 0.0346, "num_input_tokens_seen": 171664720, "step": 79545 }, { "epoch": 12.977161500815662, "grad_norm": 0.15412244200706482, "learning_rate": 1.654387595304122e-05, "loss": 0.0602, "num_input_tokens_seen": 171676240, "step": 79550 }, { "epoch": 12.977977161500815, "grad_norm": 0.9571519494056702, "learning_rate": 1.654052682130875e-05, "loss": 0.1702, "num_input_tokens_seen": 171687376, "step": 79555 }, { "epoch": 12.978792822185971, "grad_norm": 0.035684168338775635, "learning_rate": 1.6537177861018225e-05, "loss": 0.1545, "num_input_tokens_seen": 171697840, "step": 79560 }, { "epoch": 12.979608482871125, "grad_norm": 0.44963306188583374, "learning_rate": 1.6533829072237545e-05, "loss": 0.0585, "num_input_tokens_seen": 171708656, "step": 79565 }, { "epoch": 12.98042414355628, "grad_norm": 1.0076587200164795, "learning_rate": 1.6530480455034545e-05, "loss": 0.2492, "num_input_tokens_seen": 171718704, "step": 79570 }, { "epoch": 12.981239804241435, "grad_norm": 0.17646339535713196, "learning_rate": 1.6527132009477113e-05, "loss": 0.0546, "num_input_tokens_seen": 171729872, "step": 79575 }, { "epoch": 12.98205546492659, "grad_norm": 0.10819877684116364, "learning_rate": 1.652378373563309e-05, "loss": 0.1436, "num_input_tokens_seen": 171741104, "step": 79580 }, { "epoch": 12.982871125611746, "grad_norm": 0.08386317640542984, "learning_rate": 1.6520435633570352e-05, "loss": 0.0231, "num_input_tokens_seen": 171753040, "step": 79585 }, { "epoch": 12.9836867862969, "grad_norm": 0.39826610684394836, "learning_rate": 1.651708770335673e-05, "loss": 0.0593, "num_input_tokens_seen": 171763984, "step": 79590 }, { "epoch": 12.984502446982056, "grad_norm": 0.07711628824472427, "learning_rate": 1.6513739945060095e-05, "loss": 0.0879, "num_input_tokens_seen": 171775344, "step": 79595 }, { "epoch": 12.98531810766721, "grad_norm": 0.23146244883537292, "learning_rate": 1.6510392358748287e-05, "loss": 0.0734, "num_input_tokens_seen": 171785104, "step": 79600 }, { "epoch": 12.986133768352365, "grad_norm": 0.0461372435092926, "learning_rate": 1.6507044944489143e-05, "loss": 0.2274, "num_input_tokens_seen": 171796112, "step": 79605 }, { "epoch": 12.986949429037521, "grad_norm": 0.025968166068196297, "learning_rate": 1.650369770235052e-05, "loss": 0.1121, "num_input_tokens_seen": 171806864, "step": 79610 }, { "epoch": 12.987765089722675, "grad_norm": 1.266481637954712, "learning_rate": 1.6500350632400225e-05, "loss": 0.2377, "num_input_tokens_seen": 171817712, "step": 79615 }, { "epoch": 12.98858075040783, "grad_norm": 0.08274318277835846, "learning_rate": 1.649700373470612e-05, "loss": 0.031, "num_input_tokens_seen": 171829264, "step": 79620 }, { "epoch": 12.989396411092985, "grad_norm": 0.07195671647787094, "learning_rate": 1.649365700933601e-05, "loss": 0.042, "num_input_tokens_seen": 171838864, "step": 79625 }, { "epoch": 12.99021207177814, "grad_norm": 0.10291063785552979, "learning_rate": 1.649031045635774e-05, "loss": 0.0838, "num_input_tokens_seen": 171849456, "step": 79630 }, { "epoch": 12.991027732463294, "grad_norm": 0.09527216851711273, "learning_rate": 1.6486964075839117e-05, "loss": 0.1267, "num_input_tokens_seen": 171860976, "step": 79635 }, { "epoch": 12.99184339314845, "grad_norm": 0.359350323677063, "learning_rate": 1.6483617867847978e-05, "loss": 0.0242, "num_input_tokens_seen": 171872496, "step": 79640 }, { "epoch": 12.992659053833606, "grad_norm": 0.04575581103563309, "learning_rate": 1.648027183245211e-05, "loss": 0.038, "num_input_tokens_seen": 171883856, "step": 79645 }, { "epoch": 12.99347471451876, "grad_norm": 0.13662022352218628, "learning_rate": 1.6476925969719357e-05, "loss": 0.0144, "num_input_tokens_seen": 171895504, "step": 79650 }, { "epoch": 12.994290375203915, "grad_norm": 0.0618174783885479, "learning_rate": 1.6473580279717502e-05, "loss": 0.068, "num_input_tokens_seen": 171906096, "step": 79655 }, { "epoch": 12.99510603588907, "grad_norm": 1.0457600355148315, "learning_rate": 1.6470234762514367e-05, "loss": 0.1779, "num_input_tokens_seen": 171917072, "step": 79660 }, { "epoch": 12.995921696574225, "grad_norm": 0.21593578159809113, "learning_rate": 1.6466889418177737e-05, "loss": 0.0127, "num_input_tokens_seen": 171929008, "step": 79665 }, { "epoch": 12.99673735725938, "grad_norm": 0.10116396844387054, "learning_rate": 1.6463544246775425e-05, "loss": 0.0377, "num_input_tokens_seen": 171939568, "step": 79670 }, { "epoch": 12.997553017944535, "grad_norm": 0.04299512505531311, "learning_rate": 1.646019924837521e-05, "loss": 0.0137, "num_input_tokens_seen": 171948592, "step": 79675 }, { "epoch": 12.99836867862969, "grad_norm": 0.8233615159988403, "learning_rate": 1.6456854423044896e-05, "loss": 0.1181, "num_input_tokens_seen": 171959824, "step": 79680 }, { "epoch": 12.999184339314844, "grad_norm": 1.7324005365371704, "learning_rate": 1.6453509770852267e-05, "loss": 0.1863, "num_input_tokens_seen": 171970224, "step": 79685 }, { "epoch": 13.0, "grad_norm": 0.19784392416477203, "learning_rate": 1.64501652918651e-05, "loss": 0.0198, "num_input_tokens_seen": 171979232, "step": 79690 }, { "epoch": 13.0, "eval_loss": 0.143095463514328, "eval_runtime": 90.7171, "eval_samples_per_second": 30.038, "eval_steps_per_second": 7.518, "num_input_tokens_seen": 171979232, "step": 79690 }, { "epoch": 13.000815660685156, "grad_norm": 1.1899038553237915, "learning_rate": 1.6446820986151178e-05, "loss": 0.0847, "num_input_tokens_seen": 171989664, "step": 79695 }, { "epoch": 13.00163132137031, "grad_norm": 0.0443442203104496, "learning_rate": 1.6443476853778284e-05, "loss": 0.0729, "num_input_tokens_seen": 172000960, "step": 79700 }, { "epoch": 13.002446982055465, "grad_norm": 0.4486698508262634, "learning_rate": 1.644013289481418e-05, "loss": 0.1798, "num_input_tokens_seen": 172010528, "step": 79705 }, { "epoch": 13.00326264274062, "grad_norm": 0.21600575745105743, "learning_rate": 1.643678910932665e-05, "loss": 0.0806, "num_input_tokens_seen": 172020288, "step": 79710 }, { "epoch": 13.004078303425775, "grad_norm": 2.9651849269866943, "learning_rate": 1.6433445497383446e-05, "loss": 0.0544, "num_input_tokens_seen": 172032032, "step": 79715 }, { "epoch": 13.00489396411093, "grad_norm": 0.1518414169549942, "learning_rate": 1.6430102059052338e-05, "loss": 0.0506, "num_input_tokens_seen": 172042720, "step": 79720 }, { "epoch": 13.005709624796085, "grad_norm": 0.5938199162483215, "learning_rate": 1.6426758794401086e-05, "loss": 0.0547, "num_input_tokens_seen": 172054112, "step": 79725 }, { "epoch": 13.00652528548124, "grad_norm": 2.3050246238708496, "learning_rate": 1.6423415703497435e-05, "loss": 0.2129, "num_input_tokens_seen": 172064640, "step": 79730 }, { "epoch": 13.007340946166394, "grad_norm": 2.8189802169799805, "learning_rate": 1.6420072786409158e-05, "loss": 0.1621, "num_input_tokens_seen": 172076224, "step": 79735 }, { "epoch": 13.00815660685155, "grad_norm": 0.0687422975897789, "learning_rate": 1.6416730043203976e-05, "loss": 0.0281, "num_input_tokens_seen": 172087872, "step": 79740 }, { "epoch": 13.008972267536704, "grad_norm": 0.047543004155159, "learning_rate": 1.6413387473949665e-05, "loss": 0.0433, "num_input_tokens_seen": 172098976, "step": 79745 }, { "epoch": 13.00978792822186, "grad_norm": 3.3655803203582764, "learning_rate": 1.6410045078713935e-05, "loss": 0.1356, "num_input_tokens_seen": 172109696, "step": 79750 }, { "epoch": 13.010603588907015, "grad_norm": 1.343400478363037, "learning_rate": 1.6406702857564547e-05, "loss": 0.1201, "num_input_tokens_seen": 172120896, "step": 79755 }, { "epoch": 13.01141924959217, "grad_norm": 0.07088273763656616, "learning_rate": 1.6403360810569223e-05, "loss": 0.0325, "num_input_tokens_seen": 172132192, "step": 79760 }, { "epoch": 13.012234910277325, "grad_norm": 0.14771610498428345, "learning_rate": 1.6400018937795704e-05, "loss": 0.0218, "num_input_tokens_seen": 172143424, "step": 79765 }, { "epoch": 13.013050570962479, "grad_norm": 0.0776078924536705, "learning_rate": 1.63966772393117e-05, "loss": 0.0387, "num_input_tokens_seen": 172154304, "step": 79770 }, { "epoch": 13.013866231647635, "grad_norm": 0.19719672203063965, "learning_rate": 1.6393335715184958e-05, "loss": 0.0512, "num_input_tokens_seen": 172164896, "step": 79775 }, { "epoch": 13.01468189233279, "grad_norm": 0.046477947384119034, "learning_rate": 1.6389994365483174e-05, "loss": 0.0473, "num_input_tokens_seen": 172175200, "step": 79780 }, { "epoch": 13.015497553017944, "grad_norm": 0.047032665461301804, "learning_rate": 1.6386653190274088e-05, "loss": 0.0886, "num_input_tokens_seen": 172185856, "step": 79785 }, { "epoch": 13.0163132137031, "grad_norm": 0.5759778618812561, "learning_rate": 1.6383312189625394e-05, "loss": 0.3824, "num_input_tokens_seen": 172196672, "step": 79790 }, { "epoch": 13.017128874388254, "grad_norm": 0.10055718570947647, "learning_rate": 1.6379971363604817e-05, "loss": 0.1242, "num_input_tokens_seen": 172207712, "step": 79795 }, { "epoch": 13.01794453507341, "grad_norm": 0.2728129029273987, "learning_rate": 1.6376630712280045e-05, "loss": 0.0208, "num_input_tokens_seen": 172218656, "step": 79800 }, { "epoch": 13.018760195758565, "grad_norm": 1.572549819946289, "learning_rate": 1.6373290235718804e-05, "loss": 0.145, "num_input_tokens_seen": 172230176, "step": 79805 }, { "epoch": 13.01957585644372, "grad_norm": 0.2782912850379944, "learning_rate": 1.6369949933988765e-05, "loss": 0.1249, "num_input_tokens_seen": 172240928, "step": 79810 }, { "epoch": 13.020391517128875, "grad_norm": 2.118633270263672, "learning_rate": 1.6366609807157643e-05, "loss": 0.1079, "num_input_tokens_seen": 172251904, "step": 79815 }, { "epoch": 13.021207177814029, "grad_norm": 2.6566343307495117, "learning_rate": 1.6363269855293133e-05, "loss": 0.1407, "num_input_tokens_seen": 172262688, "step": 79820 }, { "epoch": 13.022022838499185, "grad_norm": 0.5612598657608032, "learning_rate": 1.635993007846291e-05, "loss": 0.1435, "num_input_tokens_seen": 172273376, "step": 79825 }, { "epoch": 13.022838499184338, "grad_norm": 1.7005327939987183, "learning_rate": 1.635659047673467e-05, "loss": 0.0167, "num_input_tokens_seen": 172284416, "step": 79830 }, { "epoch": 13.023654159869494, "grad_norm": 0.07297451049089432, "learning_rate": 1.635325105017608e-05, "loss": 0.0334, "num_input_tokens_seen": 172296032, "step": 79835 }, { "epoch": 13.02446982055465, "grad_norm": 0.08875102549791336, "learning_rate": 1.634991179885484e-05, "loss": 0.0263, "num_input_tokens_seen": 172306720, "step": 79840 }, { "epoch": 13.025285481239804, "grad_norm": 0.017846329137682915, "learning_rate": 1.6346572722838595e-05, "loss": 0.0951, "num_input_tokens_seen": 172317280, "step": 79845 }, { "epoch": 13.02610114192496, "grad_norm": 0.12879300117492676, "learning_rate": 1.634323382219504e-05, "loss": 0.0205, "num_input_tokens_seen": 172328064, "step": 79850 }, { "epoch": 13.026916802610113, "grad_norm": 0.042699288576841354, "learning_rate": 1.6339895096991834e-05, "loss": 0.0511, "num_input_tokens_seen": 172339040, "step": 79855 }, { "epoch": 13.02773246329527, "grad_norm": 0.09656640887260437, "learning_rate": 1.633655654729664e-05, "loss": 0.0093, "num_input_tokens_seen": 172349760, "step": 79860 }, { "epoch": 13.028548123980425, "grad_norm": 1.1884803771972656, "learning_rate": 1.6333218173177117e-05, "loss": 0.0429, "num_input_tokens_seen": 172361632, "step": 79865 }, { "epoch": 13.029363784665579, "grad_norm": 0.9996766448020935, "learning_rate": 1.6329879974700925e-05, "loss": 0.1606, "num_input_tokens_seen": 172372192, "step": 79870 }, { "epoch": 13.030179445350734, "grad_norm": 0.3182235658168793, "learning_rate": 1.6326541951935703e-05, "loss": 0.0332, "num_input_tokens_seen": 172383552, "step": 79875 }, { "epoch": 13.030995106035888, "grad_norm": 0.0455947183072567, "learning_rate": 1.6323204104949124e-05, "loss": 0.0252, "num_input_tokens_seen": 172394496, "step": 79880 }, { "epoch": 13.031810766721044, "grad_norm": 2.3502695560455322, "learning_rate": 1.631986643380881e-05, "loss": 0.1556, "num_input_tokens_seen": 172404096, "step": 79885 }, { "epoch": 13.0326264274062, "grad_norm": 1.939026951789856, "learning_rate": 1.6316528938582432e-05, "loss": 0.1445, "num_input_tokens_seen": 172414880, "step": 79890 }, { "epoch": 13.033442088091354, "grad_norm": 0.02776511386036873, "learning_rate": 1.6313191619337593e-05, "loss": 0.0348, "num_input_tokens_seen": 172424928, "step": 79895 }, { "epoch": 13.03425774877651, "grad_norm": 0.06977161020040512, "learning_rate": 1.6309854476141955e-05, "loss": 0.1348, "num_input_tokens_seen": 172436416, "step": 79900 }, { "epoch": 13.035073409461663, "grad_norm": 0.040181927382946014, "learning_rate": 1.6306517509063136e-05, "loss": 0.1228, "num_input_tokens_seen": 172447232, "step": 79905 }, { "epoch": 13.035889070146819, "grad_norm": 1.2878729104995728, "learning_rate": 1.6303180718168775e-05, "loss": 0.0467, "num_input_tokens_seen": 172457152, "step": 79910 }, { "epoch": 13.036704730831975, "grad_norm": 0.032545387744903564, "learning_rate": 1.629984410352648e-05, "loss": 0.0054, "num_input_tokens_seen": 172469088, "step": 79915 }, { "epoch": 13.037520391517129, "grad_norm": 1.4278761148452759, "learning_rate": 1.629650766520388e-05, "loss": 0.1585, "num_input_tokens_seen": 172479264, "step": 79920 }, { "epoch": 13.038336052202284, "grad_norm": 0.17394259572029114, "learning_rate": 1.6293171403268604e-05, "loss": 0.0255, "num_input_tokens_seen": 172490016, "step": 79925 }, { "epoch": 13.039151712887438, "grad_norm": 2.0906600952148438, "learning_rate": 1.6289835317788246e-05, "loss": 0.2084, "num_input_tokens_seen": 172500192, "step": 79930 }, { "epoch": 13.039967373572594, "grad_norm": 0.05727094039320946, "learning_rate": 1.6286499408830435e-05, "loss": 0.0112, "num_input_tokens_seen": 172509760, "step": 79935 }, { "epoch": 13.040783034257748, "grad_norm": 0.3873692750930786, "learning_rate": 1.6283163676462754e-05, "loss": 0.0644, "num_input_tokens_seen": 172520960, "step": 79940 }, { "epoch": 13.041598694942904, "grad_norm": 0.09895046800374985, "learning_rate": 1.6279828120752833e-05, "loss": 0.0457, "num_input_tokens_seen": 172532320, "step": 79945 }, { "epoch": 13.04241435562806, "grad_norm": 0.3609474003314972, "learning_rate": 1.6276492741768247e-05, "loss": 0.0267, "num_input_tokens_seen": 172542400, "step": 79950 }, { "epoch": 13.043230016313213, "grad_norm": 1.878759503364563, "learning_rate": 1.627315753957661e-05, "loss": 0.1739, "num_input_tokens_seen": 172553504, "step": 79955 }, { "epoch": 13.044045676998369, "grad_norm": 0.3428995907306671, "learning_rate": 1.6269822514245498e-05, "loss": 0.0951, "num_input_tokens_seen": 172564224, "step": 79960 }, { "epoch": 13.044861337683523, "grad_norm": 0.025558724999427795, "learning_rate": 1.6266487665842515e-05, "loss": 0.0925, "num_input_tokens_seen": 172572128, "step": 79965 }, { "epoch": 13.045676998368679, "grad_norm": 0.6665581464767456, "learning_rate": 1.6263152994435233e-05, "loss": 0.0996, "num_input_tokens_seen": 172582432, "step": 79970 }, { "epoch": 13.046492659053834, "grad_norm": 1.1414833068847656, "learning_rate": 1.625981850009125e-05, "loss": 0.0404, "num_input_tokens_seen": 172593984, "step": 79975 }, { "epoch": 13.047308319738988, "grad_norm": 3.0537352561950684, "learning_rate": 1.625648418287812e-05, "loss": 0.1177, "num_input_tokens_seen": 172604512, "step": 79980 }, { "epoch": 13.048123980424144, "grad_norm": 1.6084927320480347, "learning_rate": 1.6253150042863437e-05, "loss": 0.1548, "num_input_tokens_seen": 172613984, "step": 79985 }, { "epoch": 13.048939641109298, "grad_norm": 0.03277287259697914, "learning_rate": 1.6249816080114757e-05, "loss": 0.0147, "num_input_tokens_seen": 172626272, "step": 79990 }, { "epoch": 13.049755301794454, "grad_norm": 2.4749255180358887, "learning_rate": 1.624648229469966e-05, "loss": 0.1902, "num_input_tokens_seen": 172637856, "step": 79995 }, { "epoch": 13.05057096247961, "grad_norm": 0.044131193310022354, "learning_rate": 1.6243148686685706e-05, "loss": 0.0827, "num_input_tokens_seen": 172648032, "step": 80000 }, { "epoch": 13.051386623164763, "grad_norm": 0.07900460064411163, "learning_rate": 1.623981525614045e-05, "loss": 0.0264, "num_input_tokens_seen": 172659232, "step": 80005 }, { "epoch": 13.052202283849919, "grad_norm": 0.2997857332229614, "learning_rate": 1.623648200313145e-05, "loss": 0.0619, "num_input_tokens_seen": 172670336, "step": 80010 }, { "epoch": 13.053017944535073, "grad_norm": 0.11257197707891464, "learning_rate": 1.623314892772626e-05, "loss": 0.027, "num_input_tokens_seen": 172680480, "step": 80015 }, { "epoch": 13.053833605220229, "grad_norm": 0.060754209756851196, "learning_rate": 1.6229816029992426e-05, "loss": 0.0289, "num_input_tokens_seen": 172691264, "step": 80020 }, { "epoch": 13.054649265905383, "grad_norm": 1.5926532745361328, "learning_rate": 1.62264833099975e-05, "loss": 0.0838, "num_input_tokens_seen": 172702016, "step": 80025 }, { "epoch": 13.055464926590538, "grad_norm": 0.21814967691898346, "learning_rate": 1.6223150767809018e-05, "loss": 0.0132, "num_input_tokens_seen": 172713792, "step": 80030 }, { "epoch": 13.056280587275694, "grad_norm": 0.03837819769978523, "learning_rate": 1.6219818403494518e-05, "loss": 0.0121, "num_input_tokens_seen": 172724960, "step": 80035 }, { "epoch": 13.057096247960848, "grad_norm": 0.38472697138786316, "learning_rate": 1.621648621712154e-05, "loss": 0.0199, "num_input_tokens_seen": 172735264, "step": 80040 }, { "epoch": 13.057911908646004, "grad_norm": 0.02161514386534691, "learning_rate": 1.621315420875761e-05, "loss": 0.0272, "num_input_tokens_seen": 172747072, "step": 80045 }, { "epoch": 13.058727569331158, "grad_norm": 0.22021080553531647, "learning_rate": 1.6209822378470264e-05, "loss": 0.2969, "num_input_tokens_seen": 172757376, "step": 80050 }, { "epoch": 13.059543230016313, "grad_norm": 1.5743950605392456, "learning_rate": 1.6206490726327006e-05, "loss": 0.1632, "num_input_tokens_seen": 172768000, "step": 80055 }, { "epoch": 13.060358890701469, "grad_norm": 0.3913061320781708, "learning_rate": 1.6203159252395376e-05, "loss": 0.1056, "num_input_tokens_seen": 172778752, "step": 80060 }, { "epoch": 13.061174551386623, "grad_norm": 0.22882796823978424, "learning_rate": 1.619982795674288e-05, "loss": 0.041, "num_input_tokens_seen": 172789152, "step": 80065 }, { "epoch": 13.061990212071779, "grad_norm": 0.08720193058252335, "learning_rate": 1.6196496839437043e-05, "loss": 0.1636, "num_input_tokens_seen": 172801312, "step": 80070 }, { "epoch": 13.062805872756933, "grad_norm": 2.2546145915985107, "learning_rate": 1.6193165900545355e-05, "loss": 0.1483, "num_input_tokens_seen": 172812352, "step": 80075 }, { "epoch": 13.063621533442088, "grad_norm": 2.0173451900482178, "learning_rate": 1.618983514013534e-05, "loss": 0.1098, "num_input_tokens_seen": 172823264, "step": 80080 }, { "epoch": 13.064437194127244, "grad_norm": 0.11172063648700714, "learning_rate": 1.6186504558274485e-05, "loss": 0.1788, "num_input_tokens_seen": 172834464, "step": 80085 }, { "epoch": 13.065252854812398, "grad_norm": 0.6923021674156189, "learning_rate": 1.6183174155030306e-05, "loss": 0.1975, "num_input_tokens_seen": 172846016, "step": 80090 }, { "epoch": 13.066068515497554, "grad_norm": 1.6417144536972046, "learning_rate": 1.6179843930470275e-05, "loss": 0.1507, "num_input_tokens_seen": 172856544, "step": 80095 }, { "epoch": 13.066884176182707, "grad_norm": 0.07138203084468842, "learning_rate": 1.617651388466191e-05, "loss": 0.1084, "num_input_tokens_seen": 172868256, "step": 80100 }, { "epoch": 13.067699836867863, "grad_norm": 0.07191918790340424, "learning_rate": 1.617318401767267e-05, "loss": 0.1608, "num_input_tokens_seen": 172878912, "step": 80105 }, { "epoch": 13.068515497553017, "grad_norm": 0.07862996309995651, "learning_rate": 1.6169854329570067e-05, "loss": 0.0102, "num_input_tokens_seen": 172890336, "step": 80110 }, { "epoch": 13.069331158238173, "grad_norm": 1.4563077688217163, "learning_rate": 1.6166524820421555e-05, "loss": 0.2747, "num_input_tokens_seen": 172901312, "step": 80115 }, { "epoch": 13.070146818923329, "grad_norm": 0.1963096559047699, "learning_rate": 1.6163195490294635e-05, "loss": 0.0281, "num_input_tokens_seen": 172911264, "step": 80120 }, { "epoch": 13.070962479608482, "grad_norm": 1.2711981534957886, "learning_rate": 1.6159866339256758e-05, "loss": 0.0757, "num_input_tokens_seen": 172921280, "step": 80125 }, { "epoch": 13.071778140293638, "grad_norm": 0.05796998739242554, "learning_rate": 1.6156537367375406e-05, "loss": 0.0104, "num_input_tokens_seen": 172930432, "step": 80130 }, { "epoch": 13.072593800978792, "grad_norm": 0.013628375716507435, "learning_rate": 1.615320857471805e-05, "loss": 0.0691, "num_input_tokens_seen": 172941728, "step": 80135 }, { "epoch": 13.073409461663948, "grad_norm": 0.06801737844944, "learning_rate": 1.6149879961352138e-05, "loss": 0.063, "num_input_tokens_seen": 172952640, "step": 80140 }, { "epoch": 13.074225122349104, "grad_norm": 0.0809330865740776, "learning_rate": 1.6146551527345144e-05, "loss": 0.1115, "num_input_tokens_seen": 172964608, "step": 80145 }, { "epoch": 13.075040783034257, "grad_norm": 1.5474472045898438, "learning_rate": 1.614322327276451e-05, "loss": 0.1618, "num_input_tokens_seen": 172975904, "step": 80150 }, { "epoch": 13.075856443719413, "grad_norm": 0.06733548641204834, "learning_rate": 1.613989519767769e-05, "loss": 0.1264, "num_input_tokens_seen": 172985952, "step": 80155 }, { "epoch": 13.076672104404567, "grad_norm": 0.2101311981678009, "learning_rate": 1.6136567302152133e-05, "loss": 0.0343, "num_input_tokens_seen": 172996672, "step": 80160 }, { "epoch": 13.077487765089723, "grad_norm": 0.1243894025683403, "learning_rate": 1.6133239586255287e-05, "loss": 0.091, "num_input_tokens_seen": 173007392, "step": 80165 }, { "epoch": 13.078303425774878, "grad_norm": 0.25036659836769104, "learning_rate": 1.612991205005459e-05, "loss": 0.0255, "num_input_tokens_seen": 173018816, "step": 80170 }, { "epoch": 13.079119086460032, "grad_norm": 0.6435555815696716, "learning_rate": 1.6126584693617478e-05, "loss": 0.057, "num_input_tokens_seen": 173029952, "step": 80175 }, { "epoch": 13.079934747145188, "grad_norm": 0.18051999807357788, "learning_rate": 1.6123257517011382e-05, "loss": 0.0785, "num_input_tokens_seen": 173040416, "step": 80180 }, { "epoch": 13.080750407830342, "grad_norm": 0.08233722299337387, "learning_rate": 1.6119930520303734e-05, "loss": 0.2129, "num_input_tokens_seen": 173051392, "step": 80185 }, { "epoch": 13.081566068515498, "grad_norm": 0.201433926820755, "learning_rate": 1.6116603703561957e-05, "loss": 0.1545, "num_input_tokens_seen": 173061952, "step": 80190 }, { "epoch": 13.082381729200652, "grad_norm": 0.03511115163564682, "learning_rate": 1.611327706685348e-05, "loss": 0.0091, "num_input_tokens_seen": 173072928, "step": 80195 }, { "epoch": 13.083197389885807, "grad_norm": 0.08422616124153137, "learning_rate": 1.6109950610245707e-05, "loss": 0.2406, "num_input_tokens_seen": 173082848, "step": 80200 }, { "epoch": 13.084013050570963, "grad_norm": 0.2153671830892563, "learning_rate": 1.6106624333806074e-05, "loss": 0.0905, "num_input_tokens_seen": 173092032, "step": 80205 }, { "epoch": 13.084828711256117, "grad_norm": 0.9717006683349609, "learning_rate": 1.6103298237601974e-05, "loss": 0.03, "num_input_tokens_seen": 173102464, "step": 80210 }, { "epoch": 13.085644371941273, "grad_norm": 0.1677493155002594, "learning_rate": 1.6099972321700828e-05, "loss": 0.0384, "num_input_tokens_seen": 173113056, "step": 80215 }, { "epoch": 13.086460032626427, "grad_norm": 0.43875473737716675, "learning_rate": 1.609664658617003e-05, "loss": 0.1372, "num_input_tokens_seen": 173123552, "step": 80220 }, { "epoch": 13.087275693311582, "grad_norm": 2.3606960773468018, "learning_rate": 1.6093321031076986e-05, "loss": 0.1483, "num_input_tokens_seen": 173133120, "step": 80225 }, { "epoch": 13.088091353996738, "grad_norm": 0.20070038735866547, "learning_rate": 1.608999565648908e-05, "loss": 0.1418, "num_input_tokens_seen": 173144064, "step": 80230 }, { "epoch": 13.088907014681892, "grad_norm": 0.05615069344639778, "learning_rate": 1.608667046247372e-05, "loss": 0.1215, "num_input_tokens_seen": 173154560, "step": 80235 }, { "epoch": 13.089722675367048, "grad_norm": 3.0952272415161133, "learning_rate": 1.6083345449098297e-05, "loss": 0.2271, "num_input_tokens_seen": 173164192, "step": 80240 }, { "epoch": 13.090538336052202, "grad_norm": 1.3105639219284058, "learning_rate": 1.608002061643018e-05, "loss": 0.2001, "num_input_tokens_seen": 173173984, "step": 80245 }, { "epoch": 13.091353996737357, "grad_norm": 0.033957816660404205, "learning_rate": 1.6076695964536774e-05, "loss": 0.1087, "num_input_tokens_seen": 173184320, "step": 80250 }, { "epoch": 13.092169657422513, "grad_norm": 0.07932806760072708, "learning_rate": 1.6073371493485435e-05, "loss": 0.0922, "num_input_tokens_seen": 173195168, "step": 80255 }, { "epoch": 13.092985318107667, "grad_norm": 1.332526445388794, "learning_rate": 1.6070047203343553e-05, "loss": 0.0795, "num_input_tokens_seen": 173206112, "step": 80260 }, { "epoch": 13.093800978792823, "grad_norm": 0.04267574101686478, "learning_rate": 1.6066723094178483e-05, "loss": 0.0693, "num_input_tokens_seen": 173216992, "step": 80265 }, { "epoch": 13.094616639477977, "grad_norm": 1.824160099029541, "learning_rate": 1.6063399166057614e-05, "loss": 0.1448, "num_input_tokens_seen": 173229536, "step": 80270 }, { "epoch": 13.095432300163132, "grad_norm": 1.1316486597061157, "learning_rate": 1.6060075419048287e-05, "loss": 0.1082, "num_input_tokens_seen": 173240640, "step": 80275 }, { "epoch": 13.096247960848286, "grad_norm": 1.6050238609313965, "learning_rate": 1.605675185321788e-05, "loss": 0.1905, "num_input_tokens_seen": 173251968, "step": 80280 }, { "epoch": 13.097063621533442, "grad_norm": 0.07502494007349014, "learning_rate": 1.605342846863374e-05, "loss": 0.028, "num_input_tokens_seen": 173264224, "step": 80285 }, { "epoch": 13.097879282218598, "grad_norm": 0.08436498045921326, "learning_rate": 1.6050105265363223e-05, "loss": 0.1168, "num_input_tokens_seen": 173275360, "step": 80290 }, { "epoch": 13.098694942903752, "grad_norm": 1.134009599685669, "learning_rate": 1.6046782243473672e-05, "loss": 0.0541, "num_input_tokens_seen": 173286048, "step": 80295 }, { "epoch": 13.099510603588907, "grad_norm": 0.9279521107673645, "learning_rate": 1.604345940303244e-05, "loss": 0.1566, "num_input_tokens_seen": 173298208, "step": 80300 }, { "epoch": 13.100326264274061, "grad_norm": 1.2675666809082031, "learning_rate": 1.6040136744106864e-05, "loss": 0.0723, "num_input_tokens_seen": 173308672, "step": 80305 }, { "epoch": 13.101141924959217, "grad_norm": 0.021169167011976242, "learning_rate": 1.603681426676429e-05, "loss": 0.0504, "num_input_tokens_seen": 173319808, "step": 80310 }, { "epoch": 13.101957585644373, "grad_norm": 0.036458902060985565, "learning_rate": 1.6033491971072036e-05, "loss": 0.0113, "num_input_tokens_seen": 173330976, "step": 80315 }, { "epoch": 13.102773246329527, "grad_norm": 1.746688723564148, "learning_rate": 1.603016985709745e-05, "loss": 0.0806, "num_input_tokens_seen": 173341984, "step": 80320 }, { "epoch": 13.103588907014682, "grad_norm": 0.01982191391289234, "learning_rate": 1.602684792490784e-05, "loss": 0.0985, "num_input_tokens_seen": 173351808, "step": 80325 }, { "epoch": 13.104404567699836, "grad_norm": 0.06650844216346741, "learning_rate": 1.602352617457055e-05, "loss": 0.0301, "num_input_tokens_seen": 173362624, "step": 80330 }, { "epoch": 13.105220228384992, "grad_norm": 0.02583317644894123, "learning_rate": 1.6020204606152885e-05, "loss": 0.0388, "num_input_tokens_seen": 173374144, "step": 80335 }, { "epoch": 13.106035889070148, "grad_norm": 3.019765853881836, "learning_rate": 1.6016883219722165e-05, "loss": 0.1395, "num_input_tokens_seen": 173385344, "step": 80340 }, { "epoch": 13.106851549755302, "grad_norm": 0.06146698445081711, "learning_rate": 1.6013562015345704e-05, "loss": 0.1308, "num_input_tokens_seen": 173396480, "step": 80345 }, { "epoch": 13.107667210440457, "grad_norm": 2.0224642753601074, "learning_rate": 1.6010240993090803e-05, "loss": 0.045, "num_input_tokens_seen": 173406944, "step": 80350 }, { "epoch": 13.108482871125611, "grad_norm": 0.11245238035917282, "learning_rate": 1.6006920153024785e-05, "loss": 0.1349, "num_input_tokens_seen": 173417728, "step": 80355 }, { "epoch": 13.109298531810767, "grad_norm": 0.3276614248752594, "learning_rate": 1.6003599495214927e-05, "loss": 0.0275, "num_input_tokens_seen": 173428128, "step": 80360 }, { "epoch": 13.11011419249592, "grad_norm": 0.04868568480014801, "learning_rate": 1.600027901972855e-05, "loss": 0.0252, "num_input_tokens_seen": 173439104, "step": 80365 }, { "epoch": 13.110929853181077, "grad_norm": 1.920016884803772, "learning_rate": 1.5996958726632924e-05, "loss": 0.0968, "num_input_tokens_seen": 173451104, "step": 80370 }, { "epoch": 13.111745513866232, "grad_norm": 2.061507225036621, "learning_rate": 1.599363861599536e-05, "loss": 0.0945, "num_input_tokens_seen": 173462176, "step": 80375 }, { "epoch": 13.112561174551386, "grad_norm": 1.5421370267868042, "learning_rate": 1.5990318687883128e-05, "loss": 0.195, "num_input_tokens_seen": 173471904, "step": 80380 }, { "epoch": 13.113376835236542, "grad_norm": 1.3491530418395996, "learning_rate": 1.5986998942363523e-05, "loss": 0.1223, "num_input_tokens_seen": 173483232, "step": 80385 }, { "epoch": 13.114192495921696, "grad_norm": 0.07420562952756882, "learning_rate": 1.598367937950381e-05, "loss": 0.1703, "num_input_tokens_seen": 173494304, "step": 80390 }, { "epoch": 13.115008156606851, "grad_norm": 0.25106745958328247, "learning_rate": 1.5980359999371282e-05, "loss": 0.0816, "num_input_tokens_seen": 173504320, "step": 80395 }, { "epoch": 13.115823817292007, "grad_norm": 1.4852880239486694, "learning_rate": 1.5977040802033193e-05, "loss": 0.1284, "num_input_tokens_seen": 173515392, "step": 80400 }, { "epoch": 13.116639477977161, "grad_norm": 0.02228819765150547, "learning_rate": 1.5973721787556828e-05, "loss": 0.1024, "num_input_tokens_seen": 173525440, "step": 80405 }, { "epoch": 13.117455138662317, "grad_norm": 1.6961339712142944, "learning_rate": 1.597040295600943e-05, "loss": 0.0628, "num_input_tokens_seen": 173536000, "step": 80410 }, { "epoch": 13.11827079934747, "grad_norm": 0.2538699209690094, "learning_rate": 1.596708430745828e-05, "loss": 0.0247, "num_input_tokens_seen": 173546848, "step": 80415 }, { "epoch": 13.119086460032626, "grad_norm": 0.15071217715740204, "learning_rate": 1.596376584197062e-05, "loss": 0.1224, "num_input_tokens_seen": 173557824, "step": 80420 }, { "epoch": 13.119902120717782, "grad_norm": 0.3427389860153198, "learning_rate": 1.5960447559613712e-05, "loss": 0.019, "num_input_tokens_seen": 173568704, "step": 80425 }, { "epoch": 13.120717781402936, "grad_norm": 1.493349313735962, "learning_rate": 1.5957129460454794e-05, "loss": 0.2349, "num_input_tokens_seen": 173580896, "step": 80430 }, { "epoch": 13.121533442088092, "grad_norm": 1.487487554550171, "learning_rate": 1.595381154456113e-05, "loss": 0.0463, "num_input_tokens_seen": 173591616, "step": 80435 }, { "epoch": 13.122349102773246, "grad_norm": 0.0378468781709671, "learning_rate": 1.5950493811999946e-05, "loss": 0.0423, "num_input_tokens_seen": 173601600, "step": 80440 }, { "epoch": 13.123164763458401, "grad_norm": 0.023294515907764435, "learning_rate": 1.5947176262838484e-05, "loss": 0.0284, "num_input_tokens_seen": 173611488, "step": 80445 }, { "epoch": 13.123980424143557, "grad_norm": 0.05662208050489426, "learning_rate": 1.5943858897143982e-05, "loss": 0.1676, "num_input_tokens_seen": 173621376, "step": 80450 }, { "epoch": 13.124796084828711, "grad_norm": 1.4386231899261475, "learning_rate": 1.5940541714983658e-05, "loss": 0.0365, "num_input_tokens_seen": 173631616, "step": 80455 }, { "epoch": 13.125611745513867, "grad_norm": 0.17700357735157013, "learning_rate": 1.5937224716424766e-05, "loss": 0.0632, "num_input_tokens_seen": 173643008, "step": 80460 }, { "epoch": 13.12642740619902, "grad_norm": 2.14424204826355, "learning_rate": 1.59339079015345e-05, "loss": 0.0433, "num_input_tokens_seen": 173654336, "step": 80465 }, { "epoch": 13.127243066884176, "grad_norm": 0.2553407549858093, "learning_rate": 1.593059127038009e-05, "loss": 0.0678, "num_input_tokens_seen": 173664832, "step": 80470 }, { "epoch": 13.12805872756933, "grad_norm": 0.039766959846019745, "learning_rate": 1.592727482302876e-05, "loss": 0.0324, "num_input_tokens_seen": 173676672, "step": 80475 }, { "epoch": 13.128874388254486, "grad_norm": 0.2591179311275482, "learning_rate": 1.5923958559547712e-05, "loss": 0.1655, "num_input_tokens_seen": 173687584, "step": 80480 }, { "epoch": 13.129690048939642, "grad_norm": 1.9463449716567993, "learning_rate": 1.5920642480004162e-05, "loss": 0.2138, "num_input_tokens_seen": 173697856, "step": 80485 }, { "epoch": 13.130505709624796, "grad_norm": 2.5864436626434326, "learning_rate": 1.591732658446531e-05, "loss": 0.105, "num_input_tokens_seen": 173709312, "step": 80490 }, { "epoch": 13.131321370309951, "grad_norm": 2.1446938514709473, "learning_rate": 1.5914010872998347e-05, "loss": 0.1595, "num_input_tokens_seen": 173719392, "step": 80495 }, { "epoch": 13.132137030995105, "grad_norm": 2.5554091930389404, "learning_rate": 1.5910695345670493e-05, "loss": 0.0673, "num_input_tokens_seen": 173730432, "step": 80500 }, { "epoch": 13.132952691680261, "grad_norm": 0.055011048913002014, "learning_rate": 1.5907380002548917e-05, "loss": 0.0164, "num_input_tokens_seen": 173740224, "step": 80505 }, { "epoch": 13.133768352365417, "grad_norm": 0.06901530176401138, "learning_rate": 1.5904064843700832e-05, "loss": 0.0143, "num_input_tokens_seen": 173751680, "step": 80510 }, { "epoch": 13.13458401305057, "grad_norm": 0.02027691900730133, "learning_rate": 1.59007498691934e-05, "loss": 0.0103, "num_input_tokens_seen": 173763840, "step": 80515 }, { "epoch": 13.135399673735726, "grad_norm": 0.6201140880584717, "learning_rate": 1.5897435079093826e-05, "loss": 0.0322, "num_input_tokens_seen": 173774720, "step": 80520 }, { "epoch": 13.13621533442088, "grad_norm": 0.02549431473016739, "learning_rate": 1.5894120473469268e-05, "loss": 0.1297, "num_input_tokens_seen": 173787040, "step": 80525 }, { "epoch": 13.137030995106036, "grad_norm": 0.6662163734436035, "learning_rate": 1.589080605238692e-05, "loss": 0.1581, "num_input_tokens_seen": 173798112, "step": 80530 }, { "epoch": 13.137846655791192, "grad_norm": 0.05326750501990318, "learning_rate": 1.588749181591393e-05, "loss": 0.1581, "num_input_tokens_seen": 173808960, "step": 80535 }, { "epoch": 13.138662316476346, "grad_norm": 3.0225982666015625, "learning_rate": 1.5884177764117495e-05, "loss": 0.4208, "num_input_tokens_seen": 173820320, "step": 80540 }, { "epoch": 13.139477977161501, "grad_norm": 0.07280642539262772, "learning_rate": 1.5880863897064745e-05, "loss": 0.1071, "num_input_tokens_seen": 173831648, "step": 80545 }, { "epoch": 13.140293637846655, "grad_norm": 0.947096049785614, "learning_rate": 1.587755021482287e-05, "loss": 0.0426, "num_input_tokens_seen": 173842624, "step": 80550 }, { "epoch": 13.141109298531811, "grad_norm": 0.06487245857715607, "learning_rate": 1.5874236717459007e-05, "loss": 0.0814, "num_input_tokens_seen": 173852608, "step": 80555 }, { "epoch": 13.141924959216965, "grad_norm": 0.05781757831573486, "learning_rate": 1.5870923405040307e-05, "loss": 0.0902, "num_input_tokens_seen": 173863680, "step": 80560 }, { "epoch": 13.14274061990212, "grad_norm": 0.3566399812698364, "learning_rate": 1.586761027763394e-05, "loss": 0.0209, "num_input_tokens_seen": 173875328, "step": 80565 }, { "epoch": 13.143556280587276, "grad_norm": 0.06634693592786789, "learning_rate": 1.586429733530702e-05, "loss": 0.0176, "num_input_tokens_seen": 173887200, "step": 80570 }, { "epoch": 13.14437194127243, "grad_norm": 0.06798627972602844, "learning_rate": 1.586098457812672e-05, "loss": 0.0072, "num_input_tokens_seen": 173898080, "step": 80575 }, { "epoch": 13.145187601957586, "grad_norm": 0.11796282231807709, "learning_rate": 1.5857672006160144e-05, "loss": 0.1893, "num_input_tokens_seen": 173908800, "step": 80580 }, { "epoch": 13.14600326264274, "grad_norm": 0.09962530434131622, "learning_rate": 1.585435961947446e-05, "loss": 0.0388, "num_input_tokens_seen": 173920416, "step": 80585 }, { "epoch": 13.146818923327896, "grad_norm": 0.19329902529716492, "learning_rate": 1.5851047418136767e-05, "loss": 0.227, "num_input_tokens_seen": 173930592, "step": 80590 }, { "epoch": 13.147634584013051, "grad_norm": 1.1441675424575806, "learning_rate": 1.5847735402214215e-05, "loss": 0.1843, "num_input_tokens_seen": 173941344, "step": 80595 }, { "epoch": 13.148450244698205, "grad_norm": 1.8608318567276, "learning_rate": 1.5844423571773907e-05, "loss": 0.1006, "num_input_tokens_seen": 173951936, "step": 80600 }, { "epoch": 13.149265905383361, "grad_norm": 0.21313609182834625, "learning_rate": 1.584111192688298e-05, "loss": 0.1637, "num_input_tokens_seen": 173962080, "step": 80605 }, { "epoch": 13.150081566068515, "grad_norm": 0.03270283713936806, "learning_rate": 1.5837800467608522e-05, "loss": 0.0271, "num_input_tokens_seen": 173972544, "step": 80610 }, { "epoch": 13.15089722675367, "grad_norm": 0.13401928544044495, "learning_rate": 1.5834489194017673e-05, "loss": 0.1103, "num_input_tokens_seen": 173982560, "step": 80615 }, { "epoch": 13.151712887438826, "grad_norm": 0.22518381476402283, "learning_rate": 1.5831178106177523e-05, "loss": 0.057, "num_input_tokens_seen": 173992864, "step": 80620 }, { "epoch": 13.15252854812398, "grad_norm": 0.39136138558387756, "learning_rate": 1.5827867204155182e-05, "loss": 0.0389, "num_input_tokens_seen": 174002784, "step": 80625 }, { "epoch": 13.153344208809136, "grad_norm": 2.653383731842041, "learning_rate": 1.582455648801775e-05, "loss": 0.1801, "num_input_tokens_seen": 174012896, "step": 80630 }, { "epoch": 13.15415986949429, "grad_norm": 0.7730540633201599, "learning_rate": 1.582124595783232e-05, "loss": 0.0768, "num_input_tokens_seen": 174024448, "step": 80635 }, { "epoch": 13.154975530179446, "grad_norm": 0.12562814354896545, "learning_rate": 1.581793561366598e-05, "loss": 0.1299, "num_input_tokens_seen": 174035424, "step": 80640 }, { "epoch": 13.1557911908646, "grad_norm": 0.037804603576660156, "learning_rate": 1.581462545558583e-05, "loss": 0.0429, "num_input_tokens_seen": 174046016, "step": 80645 }, { "epoch": 13.156606851549755, "grad_norm": 0.41770139336586, "learning_rate": 1.581131548365894e-05, "loss": 0.0342, "num_input_tokens_seen": 174055936, "step": 80650 }, { "epoch": 13.15742251223491, "grad_norm": 1.378528118133545, "learning_rate": 1.580800569795241e-05, "loss": 0.1065, "num_input_tokens_seen": 174065696, "step": 80655 }, { "epoch": 13.158238172920065, "grad_norm": 1.75574791431427, "learning_rate": 1.5804696098533293e-05, "loss": 0.2362, "num_input_tokens_seen": 174075904, "step": 80660 }, { "epoch": 13.15905383360522, "grad_norm": 0.903826117515564, "learning_rate": 1.5801386685468673e-05, "loss": 0.0945, "num_input_tokens_seen": 174086528, "step": 80665 }, { "epoch": 13.159869494290374, "grad_norm": 0.9522660374641418, "learning_rate": 1.5798077458825632e-05, "loss": 0.3094, "num_input_tokens_seen": 174097696, "step": 80670 }, { "epoch": 13.16068515497553, "grad_norm": 0.03393416479229927, "learning_rate": 1.5794768418671217e-05, "loss": 0.0355, "num_input_tokens_seen": 174108192, "step": 80675 }, { "epoch": 13.161500815660686, "grad_norm": 0.9061732888221741, "learning_rate": 1.579145956507251e-05, "loss": 0.0635, "num_input_tokens_seen": 174120224, "step": 80680 }, { "epoch": 13.16231647634584, "grad_norm": 2.1015985012054443, "learning_rate": 1.578815089809654e-05, "loss": 0.2057, "num_input_tokens_seen": 174130048, "step": 80685 }, { "epoch": 13.163132137030995, "grad_norm": 0.15818984806537628, "learning_rate": 1.5784842417810392e-05, "loss": 0.0357, "num_input_tokens_seen": 174140992, "step": 80690 }, { "epoch": 13.16394779771615, "grad_norm": 0.07775472104549408, "learning_rate": 1.5781534124281093e-05, "loss": 0.0942, "num_input_tokens_seen": 174151264, "step": 80695 }, { "epoch": 13.164763458401305, "grad_norm": 0.09426063299179077, "learning_rate": 1.577822601757571e-05, "loss": 0.2023, "num_input_tokens_seen": 174163136, "step": 80700 }, { "epoch": 13.16557911908646, "grad_norm": 0.09903549402952194, "learning_rate": 1.577491809776126e-05, "loss": 0.0827, "num_input_tokens_seen": 174174080, "step": 80705 }, { "epoch": 13.166394779771615, "grad_norm": 0.07090318202972412, "learning_rate": 1.5771610364904813e-05, "loss": 0.0168, "num_input_tokens_seen": 174185536, "step": 80710 }, { "epoch": 13.16721044045677, "grad_norm": 0.0310872420668602, "learning_rate": 1.5768302819073377e-05, "loss": 0.1909, "num_input_tokens_seen": 174196064, "step": 80715 }, { "epoch": 13.168026101141924, "grad_norm": 2.5658464431762695, "learning_rate": 1.5764995460334006e-05, "loss": 0.1946, "num_input_tokens_seen": 174206560, "step": 80720 }, { "epoch": 13.16884176182708, "grad_norm": 0.3390193581581116, "learning_rate": 1.5761688288753707e-05, "loss": 0.0916, "num_input_tokens_seen": 174219136, "step": 80725 }, { "epoch": 13.169657422512234, "grad_norm": 0.10982620716094971, "learning_rate": 1.5758381304399524e-05, "loss": 0.0113, "num_input_tokens_seen": 174229952, "step": 80730 }, { "epoch": 13.17047308319739, "grad_norm": 0.46329206228256226, "learning_rate": 1.5755074507338458e-05, "loss": 0.2609, "num_input_tokens_seen": 174239168, "step": 80735 }, { "epoch": 13.171288743882545, "grad_norm": 0.027113202959299088, "learning_rate": 1.575176789763755e-05, "loss": 0.0095, "num_input_tokens_seen": 174248864, "step": 80740 }, { "epoch": 13.1721044045677, "grad_norm": 0.06635367125272751, "learning_rate": 1.574846147536378e-05, "loss": 0.0185, "num_input_tokens_seen": 174259872, "step": 80745 }, { "epoch": 13.172920065252855, "grad_norm": 0.44519391655921936, "learning_rate": 1.5745155240584187e-05, "loss": 0.029, "num_input_tokens_seen": 174271136, "step": 80750 }, { "epoch": 13.173735725938009, "grad_norm": 0.024702787399291992, "learning_rate": 1.5741849193365756e-05, "loss": 0.0921, "num_input_tokens_seen": 174282624, "step": 80755 }, { "epoch": 13.174551386623165, "grad_norm": 0.49708446860313416, "learning_rate": 1.57385433337755e-05, "loss": 0.2002, "num_input_tokens_seen": 174294368, "step": 80760 }, { "epoch": 13.17536704730832, "grad_norm": 0.2517167329788208, "learning_rate": 1.5735237661880414e-05, "loss": 0.0702, "num_input_tokens_seen": 174305536, "step": 80765 }, { "epoch": 13.176182707993474, "grad_norm": 0.49387305974960327, "learning_rate": 1.5731932177747484e-05, "loss": 0.0241, "num_input_tokens_seen": 174316192, "step": 80770 }, { "epoch": 13.17699836867863, "grad_norm": 0.06015712395310402, "learning_rate": 1.5728626881443713e-05, "loss": 0.0236, "num_input_tokens_seen": 174326336, "step": 80775 }, { "epoch": 13.177814029363784, "grad_norm": 1.603908658027649, "learning_rate": 1.5725321773036072e-05, "loss": 0.2805, "num_input_tokens_seen": 174336960, "step": 80780 }, { "epoch": 13.17862969004894, "grad_norm": 0.6962897777557373, "learning_rate": 1.5722016852591554e-05, "loss": 0.0457, "num_input_tokens_seen": 174347264, "step": 80785 }, { "epoch": 13.179445350734095, "grad_norm": 0.529670000076294, "learning_rate": 1.5718712120177138e-05, "loss": 0.2134, "num_input_tokens_seen": 174357984, "step": 80790 }, { "epoch": 13.18026101141925, "grad_norm": 0.474141925573349, "learning_rate": 1.5715407575859792e-05, "loss": 0.0598, "num_input_tokens_seen": 174367520, "step": 80795 }, { "epoch": 13.181076672104405, "grad_norm": 1.275763750076294, "learning_rate": 1.5712103219706492e-05, "loss": 0.129, "num_input_tokens_seen": 174380000, "step": 80800 }, { "epoch": 13.181892332789559, "grad_norm": 1.2286587953567505, "learning_rate": 1.57087990517842e-05, "loss": 0.1218, "num_input_tokens_seen": 174389440, "step": 80805 }, { "epoch": 13.182707993474715, "grad_norm": 1.7284703254699707, "learning_rate": 1.570549507215988e-05, "loss": 0.093, "num_input_tokens_seen": 174400224, "step": 80810 }, { "epoch": 13.18352365415987, "grad_norm": 1.9380213022232056, "learning_rate": 1.5702191280900502e-05, "loss": 0.2947, "num_input_tokens_seen": 174411904, "step": 80815 }, { "epoch": 13.184339314845024, "grad_norm": 0.46765998005867004, "learning_rate": 1.5698887678073003e-05, "loss": 0.0835, "num_input_tokens_seen": 174421984, "step": 80820 }, { "epoch": 13.18515497553018, "grad_norm": 0.14150255918502808, "learning_rate": 1.569558426374435e-05, "loss": 0.2056, "num_input_tokens_seen": 174433472, "step": 80825 }, { "epoch": 13.185970636215334, "grad_norm": 0.9388641715049744, "learning_rate": 1.5692281037981483e-05, "loss": 0.0416, "num_input_tokens_seen": 174444544, "step": 80830 }, { "epoch": 13.18678629690049, "grad_norm": 0.4710589051246643, "learning_rate": 1.5688978000851354e-05, "loss": 0.017, "num_input_tokens_seen": 174455584, "step": 80835 }, { "epoch": 13.187601957585644, "grad_norm": 0.042759399861097336, "learning_rate": 1.5685675152420888e-05, "loss": 0.131, "num_input_tokens_seen": 174465856, "step": 80840 }, { "epoch": 13.1884176182708, "grad_norm": 1.7757978439331055, "learning_rate": 1.568237249275704e-05, "loss": 0.2143, "num_input_tokens_seen": 174476448, "step": 80845 }, { "epoch": 13.189233278955955, "grad_norm": 0.08664428442716599, "learning_rate": 1.5679070021926727e-05, "loss": 0.0906, "num_input_tokens_seen": 174487648, "step": 80850 }, { "epoch": 13.190048939641109, "grad_norm": 0.026725122705101967, "learning_rate": 1.5675767739996893e-05, "loss": 0.0335, "num_input_tokens_seen": 174497952, "step": 80855 }, { "epoch": 13.190864600326265, "grad_norm": 0.08512270450592041, "learning_rate": 1.5672465647034445e-05, "loss": 0.0292, "num_input_tokens_seen": 174508320, "step": 80860 }, { "epoch": 13.191680261011419, "grad_norm": 1.6358729600906372, "learning_rate": 1.5669163743106326e-05, "loss": 0.081, "num_input_tokens_seen": 174519264, "step": 80865 }, { "epoch": 13.192495921696574, "grad_norm": 0.30159077048301697, "learning_rate": 1.5665862028279423e-05, "loss": 0.011, "num_input_tokens_seen": 174531168, "step": 80870 }, { "epoch": 13.19331158238173, "grad_norm": 1.4282020330429077, "learning_rate": 1.5662560502620673e-05, "loss": 0.0845, "num_input_tokens_seen": 174542240, "step": 80875 }, { "epoch": 13.194127243066884, "grad_norm": 0.594208836555481, "learning_rate": 1.5659259166196987e-05, "loss": 0.0213, "num_input_tokens_seen": 174552800, "step": 80880 }, { "epoch": 13.19494290375204, "grad_norm": 1.9192465543746948, "learning_rate": 1.5655958019075255e-05, "loss": 0.2487, "num_input_tokens_seen": 174564064, "step": 80885 }, { "epoch": 13.195758564437194, "grad_norm": 1.6630158424377441, "learning_rate": 1.56526570613224e-05, "loss": 0.1077, "num_input_tokens_seen": 174575680, "step": 80890 }, { "epoch": 13.19657422512235, "grad_norm": 0.8854870796203613, "learning_rate": 1.5649356293005298e-05, "loss": 0.0939, "num_input_tokens_seen": 174586432, "step": 80895 }, { "epoch": 13.197389885807505, "grad_norm": 0.1404576450586319, "learning_rate": 1.5646055714190855e-05, "loss": 0.0381, "num_input_tokens_seen": 174596896, "step": 80900 }, { "epoch": 13.198205546492659, "grad_norm": 0.03797446936368942, "learning_rate": 1.5642755324945958e-05, "loss": 0.1656, "num_input_tokens_seen": 174608768, "step": 80905 }, { "epoch": 13.199021207177815, "grad_norm": 0.4519462585449219, "learning_rate": 1.56394551253375e-05, "loss": 0.0315, "num_input_tokens_seen": 174619616, "step": 80910 }, { "epoch": 13.199836867862969, "grad_norm": 0.0569361113011837, "learning_rate": 1.563615511543235e-05, "loss": 0.0227, "num_input_tokens_seen": 174630784, "step": 80915 }, { "epoch": 13.200652528548124, "grad_norm": 0.05628475919365883, "learning_rate": 1.5632855295297404e-05, "loss": 0.1187, "num_input_tokens_seen": 174641664, "step": 80920 }, { "epoch": 13.201468189233278, "grad_norm": 0.3518078029155731, "learning_rate": 1.5629555664999516e-05, "loss": 0.0213, "num_input_tokens_seen": 174652736, "step": 80925 }, { "epoch": 13.202283849918434, "grad_norm": 0.9630137085914612, "learning_rate": 1.562625622460558e-05, "loss": 0.0497, "num_input_tokens_seen": 174662752, "step": 80930 }, { "epoch": 13.20309951060359, "grad_norm": 0.06894651055335999, "learning_rate": 1.5622956974182454e-05, "loss": 0.0791, "num_input_tokens_seen": 174673184, "step": 80935 }, { "epoch": 13.203915171288743, "grad_norm": 0.04023426026105881, "learning_rate": 1.5619657913796997e-05, "loss": 0.0839, "num_input_tokens_seen": 174684032, "step": 80940 }, { "epoch": 13.2047308319739, "grad_norm": 0.2628088891506195, "learning_rate": 1.5616359043516073e-05, "loss": 0.1897, "num_input_tokens_seen": 174695072, "step": 80945 }, { "epoch": 13.205546492659053, "grad_norm": 0.19032154977321625, "learning_rate": 1.561306036340654e-05, "loss": 0.2927, "num_input_tokens_seen": 174706272, "step": 80950 }, { "epoch": 13.206362153344209, "grad_norm": 1.680748701095581, "learning_rate": 1.560976187353524e-05, "loss": 0.1375, "num_input_tokens_seen": 174717120, "step": 80955 }, { "epoch": 13.207177814029365, "grad_norm": 0.04230321943759918, "learning_rate": 1.5606463573969033e-05, "loss": 0.0487, "num_input_tokens_seen": 174727520, "step": 80960 }, { "epoch": 13.207993474714518, "grad_norm": 0.9272189736366272, "learning_rate": 1.5603165464774755e-05, "loss": 0.2168, "num_input_tokens_seen": 174737152, "step": 80965 }, { "epoch": 13.208809135399674, "grad_norm": 1.4357109069824219, "learning_rate": 1.5599867546019257e-05, "loss": 0.1352, "num_input_tokens_seen": 174748192, "step": 80970 }, { "epoch": 13.209624796084828, "grad_norm": 0.05785117298364639, "learning_rate": 1.559656981776936e-05, "loss": 0.0871, "num_input_tokens_seen": 174759136, "step": 80975 }, { "epoch": 13.210440456769984, "grad_norm": 1.4432299137115479, "learning_rate": 1.5593272280091897e-05, "loss": 0.0933, "num_input_tokens_seen": 174770400, "step": 80980 }, { "epoch": 13.21125611745514, "grad_norm": 1.9157918691635132, "learning_rate": 1.558997493305372e-05, "loss": 0.2819, "num_input_tokens_seen": 174781568, "step": 80985 }, { "epoch": 13.212071778140293, "grad_norm": 1.8903706073760986, "learning_rate": 1.558667777672163e-05, "loss": 0.0736, "num_input_tokens_seen": 174792608, "step": 80990 }, { "epoch": 13.21288743882545, "grad_norm": 0.036078568547964096, "learning_rate": 1.5583380811162467e-05, "loss": 0.0679, "num_input_tokens_seen": 174802080, "step": 80995 }, { "epoch": 13.213703099510603, "grad_norm": 1.7912347316741943, "learning_rate": 1.5580084036443026e-05, "loss": 0.1843, "num_input_tokens_seen": 174813248, "step": 81000 }, { "epoch": 13.214518760195759, "grad_norm": 1.112001895904541, "learning_rate": 1.5576787452630142e-05, "loss": 0.0476, "num_input_tokens_seen": 174823904, "step": 81005 }, { "epoch": 13.215334420880913, "grad_norm": 0.8023386597633362, "learning_rate": 1.55734910597906e-05, "loss": 0.1193, "num_input_tokens_seen": 174835072, "step": 81010 }, { "epoch": 13.216150081566068, "grad_norm": 1.0950908660888672, "learning_rate": 1.557019485799124e-05, "loss": 0.1001, "num_input_tokens_seen": 174847264, "step": 81015 }, { "epoch": 13.216965742251224, "grad_norm": 0.07488419115543365, "learning_rate": 1.5566898847298827e-05, "loss": 0.0427, "num_input_tokens_seen": 174858432, "step": 81020 }, { "epoch": 13.217781402936378, "grad_norm": 0.04252652823925018, "learning_rate": 1.5563603027780186e-05, "loss": 0.0418, "num_input_tokens_seen": 174868000, "step": 81025 }, { "epoch": 13.218597063621534, "grad_norm": 0.08649387210607529, "learning_rate": 1.556030739950209e-05, "loss": 0.1656, "num_input_tokens_seen": 174877568, "step": 81030 }, { "epoch": 13.219412724306688, "grad_norm": 0.3521949052810669, "learning_rate": 1.5557011962531355e-05, "loss": 0.0111, "num_input_tokens_seen": 174887616, "step": 81035 }, { "epoch": 13.220228384991843, "grad_norm": 2.307359218597412, "learning_rate": 1.5553716716934735e-05, "loss": 0.1786, "num_input_tokens_seen": 174897568, "step": 81040 }, { "epoch": 13.221044045676999, "grad_norm": 0.2536602020263672, "learning_rate": 1.5550421662779043e-05, "loss": 0.0598, "num_input_tokens_seen": 174908224, "step": 81045 }, { "epoch": 13.221859706362153, "grad_norm": 0.24412775039672852, "learning_rate": 1.5547126800131032e-05, "loss": 0.0111, "num_input_tokens_seen": 174919200, "step": 81050 }, { "epoch": 13.222675367047309, "grad_norm": 0.10956675559282303, "learning_rate": 1.55438321290575e-05, "loss": 0.127, "num_input_tokens_seen": 174931104, "step": 81055 }, { "epoch": 13.223491027732463, "grad_norm": 0.0756341814994812, "learning_rate": 1.5540537649625193e-05, "loss": 0.0887, "num_input_tokens_seen": 174942464, "step": 81060 }, { "epoch": 13.224306688417618, "grad_norm": 0.06763011962175369, "learning_rate": 1.5537243361900904e-05, "loss": 0.1424, "num_input_tokens_seen": 174953856, "step": 81065 }, { "epoch": 13.225122349102774, "grad_norm": 0.06647464632987976, "learning_rate": 1.553394926595137e-05, "loss": 0.1554, "num_input_tokens_seen": 174963840, "step": 81070 }, { "epoch": 13.225938009787928, "grad_norm": 0.888420581817627, "learning_rate": 1.5530655361843368e-05, "loss": 0.0395, "num_input_tokens_seen": 174975424, "step": 81075 }, { "epoch": 13.226753670473084, "grad_norm": 0.7537036538124084, "learning_rate": 1.5527361649643652e-05, "loss": 0.022, "num_input_tokens_seen": 174985984, "step": 81080 }, { "epoch": 13.227569331158238, "grad_norm": 0.10522332787513733, "learning_rate": 1.5524068129418962e-05, "loss": 0.023, "num_input_tokens_seen": 174996768, "step": 81085 }, { "epoch": 13.228384991843393, "grad_norm": 1.4841151237487793, "learning_rate": 1.5520774801236055e-05, "loss": 0.2073, "num_input_tokens_seen": 175007360, "step": 81090 }, { "epoch": 13.229200652528547, "grad_norm": 1.423109769821167, "learning_rate": 1.551748166516167e-05, "loss": 0.0991, "num_input_tokens_seen": 175017312, "step": 81095 }, { "epoch": 13.230016313213703, "grad_norm": 0.7890161275863647, "learning_rate": 1.5514188721262553e-05, "loss": 0.0365, "num_input_tokens_seen": 175028512, "step": 81100 }, { "epoch": 13.230831973898859, "grad_norm": 0.05871128290891647, "learning_rate": 1.5510895969605432e-05, "loss": 0.018, "num_input_tokens_seen": 175038656, "step": 81105 }, { "epoch": 13.231647634584013, "grad_norm": 1.0438414812088013, "learning_rate": 1.5507603410257042e-05, "loss": 0.0488, "num_input_tokens_seen": 175049504, "step": 81110 }, { "epoch": 13.232463295269168, "grad_norm": 1.0994758605957031, "learning_rate": 1.5504311043284115e-05, "loss": 0.0733, "num_input_tokens_seen": 175060512, "step": 81115 }, { "epoch": 13.233278955954322, "grad_norm": 0.19136089086532593, "learning_rate": 1.550101886875337e-05, "loss": 0.0567, "num_input_tokens_seen": 175070304, "step": 81120 }, { "epoch": 13.234094616639478, "grad_norm": 1.7136421203613281, "learning_rate": 1.5497726886731518e-05, "loss": 0.3072, "num_input_tokens_seen": 175082368, "step": 81125 }, { "epoch": 13.234910277324634, "grad_norm": 0.3550803065299988, "learning_rate": 1.54944350972853e-05, "loss": 0.0851, "num_input_tokens_seen": 175092896, "step": 81130 }, { "epoch": 13.235725938009788, "grad_norm": 2.0387160778045654, "learning_rate": 1.5491143500481403e-05, "loss": 0.0838, "num_input_tokens_seen": 175103808, "step": 81135 }, { "epoch": 13.236541598694943, "grad_norm": 0.027518808841705322, "learning_rate": 1.5487852096386552e-05, "loss": 0.178, "num_input_tokens_seen": 175115136, "step": 81140 }, { "epoch": 13.237357259380097, "grad_norm": 1.6692808866500854, "learning_rate": 1.548456088506744e-05, "loss": 0.233, "num_input_tokens_seen": 175125568, "step": 81145 }, { "epoch": 13.238172920065253, "grad_norm": 0.10615024715662003, "learning_rate": 1.548126986659078e-05, "loss": 0.0751, "num_input_tokens_seen": 175136352, "step": 81150 }, { "epoch": 13.238988580750409, "grad_norm": 0.025706889107823372, "learning_rate": 1.547797904102325e-05, "loss": 0.1129, "num_input_tokens_seen": 175146912, "step": 81155 }, { "epoch": 13.239804241435563, "grad_norm": 2.152209758758545, "learning_rate": 1.547468840843157e-05, "loss": 0.1721, "num_input_tokens_seen": 175158208, "step": 81160 }, { "epoch": 13.240619902120718, "grad_norm": 0.1885959953069687, "learning_rate": 1.54713979688824e-05, "loss": 0.0188, "num_input_tokens_seen": 175169120, "step": 81165 }, { "epoch": 13.241435562805872, "grad_norm": 0.5156598687171936, "learning_rate": 1.5468107722442445e-05, "loss": 0.2085, "num_input_tokens_seen": 175178400, "step": 81170 }, { "epoch": 13.242251223491028, "grad_norm": 0.060466036200523376, "learning_rate": 1.546481766917837e-05, "loss": 0.1557, "num_input_tokens_seen": 175188576, "step": 81175 }, { "epoch": 13.243066884176184, "grad_norm": 2.2577004432678223, "learning_rate": 1.5461527809156874e-05, "loss": 0.0596, "num_input_tokens_seen": 175199872, "step": 81180 }, { "epoch": 13.243882544861338, "grad_norm": 0.025909701362252235, "learning_rate": 1.545823814244461e-05, "loss": 0.1138, "num_input_tokens_seen": 175210560, "step": 81185 }, { "epoch": 13.244698205546493, "grad_norm": 0.04031939059495926, "learning_rate": 1.545494866910826e-05, "loss": 0.0802, "num_input_tokens_seen": 175221440, "step": 81190 }, { "epoch": 13.245513866231647, "grad_norm": 2.7690885066986084, "learning_rate": 1.5451659389214473e-05, "loss": 0.2136, "num_input_tokens_seen": 175232800, "step": 81195 }, { "epoch": 13.246329526916803, "grad_norm": 0.060813549906015396, "learning_rate": 1.5448370302829922e-05, "loss": 0.0184, "num_input_tokens_seen": 175243296, "step": 81200 }, { "epoch": 13.247145187601957, "grad_norm": 1.775977611541748, "learning_rate": 1.5445081410021277e-05, "loss": 0.2286, "num_input_tokens_seen": 175255328, "step": 81205 }, { "epoch": 13.247960848287113, "grad_norm": 0.5820761322975159, "learning_rate": 1.544179271085517e-05, "loss": 0.0369, "num_input_tokens_seen": 175267616, "step": 81210 }, { "epoch": 13.248776508972268, "grad_norm": 0.030279044061899185, "learning_rate": 1.5438504205398262e-05, "loss": 0.0348, "num_input_tokens_seen": 175278048, "step": 81215 }, { "epoch": 13.249592169657422, "grad_norm": 0.057944100350141525, "learning_rate": 1.543521589371719e-05, "loss": 0.0224, "num_input_tokens_seen": 175289088, "step": 81220 }, { "epoch": 13.250407830342578, "grad_norm": 0.09588305652141571, "learning_rate": 1.5431927775878613e-05, "loss": 0.0248, "num_input_tokens_seen": 175299328, "step": 81225 }, { "epoch": 13.251223491027732, "grad_norm": 0.03886979818344116, "learning_rate": 1.5428639851949143e-05, "loss": 0.1643, "num_input_tokens_seen": 175310080, "step": 81230 }, { "epoch": 13.252039151712887, "grad_norm": 0.10388315469026566, "learning_rate": 1.5425352121995447e-05, "loss": 0.0233, "num_input_tokens_seen": 175319360, "step": 81235 }, { "epoch": 13.252854812398043, "grad_norm": 0.1372697502374649, "learning_rate": 1.5422064586084116e-05, "loss": 0.023, "num_input_tokens_seen": 175331136, "step": 81240 }, { "epoch": 13.253670473083197, "grad_norm": 0.035590533167123795, "learning_rate": 1.5418777244281808e-05, "loss": 0.1367, "num_input_tokens_seen": 175341632, "step": 81245 }, { "epoch": 13.254486133768353, "grad_norm": 1.9717942476272583, "learning_rate": 1.5415490096655134e-05, "loss": 0.1443, "num_input_tokens_seen": 175352992, "step": 81250 }, { "epoch": 13.255301794453507, "grad_norm": 0.11870366334915161, "learning_rate": 1.541220314327071e-05, "loss": 0.0305, "num_input_tokens_seen": 175364160, "step": 81255 }, { "epoch": 13.256117455138662, "grad_norm": 1.0149548053741455, "learning_rate": 1.5408916384195156e-05, "loss": 0.1999, "num_input_tokens_seen": 175374912, "step": 81260 }, { "epoch": 13.256933115823816, "grad_norm": 0.1506863683462143, "learning_rate": 1.5405629819495072e-05, "loss": 0.1396, "num_input_tokens_seen": 175385440, "step": 81265 }, { "epoch": 13.257748776508972, "grad_norm": 2.5855023860931396, "learning_rate": 1.540234344923707e-05, "loss": 0.3123, "num_input_tokens_seen": 175396416, "step": 81270 }, { "epoch": 13.258564437194128, "grad_norm": 0.08164649456739426, "learning_rate": 1.5399057273487764e-05, "loss": 0.1347, "num_input_tokens_seen": 175406912, "step": 81275 }, { "epoch": 13.259380097879282, "grad_norm": 0.9683102965354919, "learning_rate": 1.539577129231373e-05, "loss": 0.1872, "num_input_tokens_seen": 175417600, "step": 81280 }, { "epoch": 13.260195758564437, "grad_norm": 1.1153608560562134, "learning_rate": 1.5392485505781585e-05, "loss": 0.0476, "num_input_tokens_seen": 175427488, "step": 81285 }, { "epoch": 13.261011419249591, "grad_norm": 0.046021804213523865, "learning_rate": 1.53891999139579e-05, "loss": 0.014, "num_input_tokens_seen": 175438496, "step": 81290 }, { "epoch": 13.261827079934747, "grad_norm": 1.368870735168457, "learning_rate": 1.538591451690928e-05, "loss": 0.1645, "num_input_tokens_seen": 175449760, "step": 81295 }, { "epoch": 13.262642740619903, "grad_norm": 1.683776617050171, "learning_rate": 1.538262931470229e-05, "loss": 0.1459, "num_input_tokens_seen": 175460160, "step": 81300 }, { "epoch": 13.263458401305057, "grad_norm": 1.2345455884933472, "learning_rate": 1.5379344307403517e-05, "loss": 0.1126, "num_input_tokens_seen": 175469376, "step": 81305 }, { "epoch": 13.264274061990212, "grad_norm": 0.034673262387514114, "learning_rate": 1.5376059495079548e-05, "loss": 0.1118, "num_input_tokens_seen": 175480416, "step": 81310 }, { "epoch": 13.265089722675366, "grad_norm": 0.012724238447844982, "learning_rate": 1.537277487779693e-05, "loss": 0.0431, "num_input_tokens_seen": 175490624, "step": 81315 }, { "epoch": 13.265905383360522, "grad_norm": 0.13489899039268494, "learning_rate": 1.536949045562225e-05, "loss": 0.0294, "num_input_tokens_seen": 175501568, "step": 81320 }, { "epoch": 13.266721044045678, "grad_norm": 0.4904212951660156, "learning_rate": 1.5366206228622058e-05, "loss": 0.133, "num_input_tokens_seen": 175512928, "step": 81325 }, { "epoch": 13.267536704730832, "grad_norm": 0.032898467034101486, "learning_rate": 1.5362922196862924e-05, "loss": 0.0587, "num_input_tokens_seen": 175523424, "step": 81330 }, { "epoch": 13.268352365415987, "grad_norm": 0.9848911166191101, "learning_rate": 1.535963836041139e-05, "loss": 0.1806, "num_input_tokens_seen": 175534272, "step": 81335 }, { "epoch": 13.269168026101141, "grad_norm": 0.1543855220079422, "learning_rate": 1.535635471933402e-05, "loss": 0.098, "num_input_tokens_seen": 175545760, "step": 81340 }, { "epoch": 13.269983686786297, "grad_norm": 1.9064195156097412, "learning_rate": 1.535307127369735e-05, "loss": 0.2521, "num_input_tokens_seen": 175556416, "step": 81345 }, { "epoch": 13.270799347471453, "grad_norm": 1.625275731086731, "learning_rate": 1.5349788023567937e-05, "loss": 0.1077, "num_input_tokens_seen": 175566272, "step": 81350 }, { "epoch": 13.271615008156607, "grad_norm": 1.1835421323776245, "learning_rate": 1.5346504969012306e-05, "loss": 0.1027, "num_input_tokens_seen": 175577440, "step": 81355 }, { "epoch": 13.272430668841762, "grad_norm": 0.04049038514494896, "learning_rate": 1.5343222110097e-05, "loss": 0.0141, "num_input_tokens_seen": 175589184, "step": 81360 }, { "epoch": 13.273246329526916, "grad_norm": 1.1309853792190552, "learning_rate": 1.5339939446888544e-05, "loss": 0.0438, "num_input_tokens_seen": 175599712, "step": 81365 }, { "epoch": 13.274061990212072, "grad_norm": 0.21731430292129517, "learning_rate": 1.533665697945348e-05, "loss": 0.0787, "num_input_tokens_seen": 175610400, "step": 81370 }, { "epoch": 13.274877650897226, "grad_norm": 0.7030847668647766, "learning_rate": 1.5333374707858306e-05, "loss": 0.1362, "num_input_tokens_seen": 175621504, "step": 81375 }, { "epoch": 13.275693311582382, "grad_norm": 1.3379340171813965, "learning_rate": 1.533009263216957e-05, "loss": 0.073, "num_input_tokens_seen": 175632096, "step": 81380 }, { "epoch": 13.276508972267537, "grad_norm": 0.15565665066242218, "learning_rate": 1.5326810752453764e-05, "loss": 0.3976, "num_input_tokens_seen": 175643232, "step": 81385 }, { "epoch": 13.277324632952691, "grad_norm": 0.039307672530412674, "learning_rate": 1.5323529068777414e-05, "loss": 0.1297, "num_input_tokens_seen": 175654016, "step": 81390 }, { "epoch": 13.278140293637847, "grad_norm": 0.13371938467025757, "learning_rate": 1.5320247581207027e-05, "loss": 0.1807, "num_input_tokens_seen": 175665440, "step": 81395 }, { "epoch": 13.278955954323001, "grad_norm": 0.028908617794513702, "learning_rate": 1.5316966289809097e-05, "loss": 0.0262, "num_input_tokens_seen": 175676288, "step": 81400 }, { "epoch": 13.279771615008157, "grad_norm": 0.035620346665382385, "learning_rate": 1.531368519465013e-05, "loss": 0.0569, "num_input_tokens_seen": 175686592, "step": 81405 }, { "epoch": 13.280587275693312, "grad_norm": 0.32660073041915894, "learning_rate": 1.5310404295796617e-05, "loss": 0.0186, "num_input_tokens_seen": 175698112, "step": 81410 }, { "epoch": 13.281402936378466, "grad_norm": 2.0657308101654053, "learning_rate": 1.5307123593315057e-05, "loss": 0.2057, "num_input_tokens_seen": 175709728, "step": 81415 }, { "epoch": 13.282218597063622, "grad_norm": 0.9218515157699585, "learning_rate": 1.5303843087271934e-05, "loss": 0.1268, "num_input_tokens_seen": 175719968, "step": 81420 }, { "epoch": 13.283034257748776, "grad_norm": 1.463571548461914, "learning_rate": 1.530056277773373e-05, "loss": 0.0866, "num_input_tokens_seen": 175731072, "step": 81425 }, { "epoch": 13.283849918433932, "grad_norm": 0.1072855070233345, "learning_rate": 1.5297282664766928e-05, "loss": 0.0576, "num_input_tokens_seen": 175740768, "step": 81430 }, { "epoch": 13.284665579119087, "grad_norm": 0.07943668216466904, "learning_rate": 1.5294002748438e-05, "loss": 0.0933, "num_input_tokens_seen": 175751168, "step": 81435 }, { "epoch": 13.285481239804241, "grad_norm": 1.7676230669021606, "learning_rate": 1.5290723028813413e-05, "loss": 0.1533, "num_input_tokens_seen": 175762496, "step": 81440 }, { "epoch": 13.286296900489397, "grad_norm": 0.5064945816993713, "learning_rate": 1.5287443505959654e-05, "loss": 0.0423, "num_input_tokens_seen": 175773312, "step": 81445 }, { "epoch": 13.28711256117455, "grad_norm": 0.05882779508829117, "learning_rate": 1.528416417994316e-05, "loss": 0.0276, "num_input_tokens_seen": 175784960, "step": 81450 }, { "epoch": 13.287928221859707, "grad_norm": 0.2905551791191101, "learning_rate": 1.5280885050830417e-05, "loss": 0.0798, "num_input_tokens_seen": 175796192, "step": 81455 }, { "epoch": 13.28874388254486, "grad_norm": 1.6950840950012207, "learning_rate": 1.527760611868786e-05, "loss": 0.2071, "num_input_tokens_seen": 175807680, "step": 81460 }, { "epoch": 13.289559543230016, "grad_norm": 1.7409404516220093, "learning_rate": 1.5274327383581954e-05, "loss": 0.1395, "num_input_tokens_seen": 175816288, "step": 81465 }, { "epoch": 13.290375203915172, "grad_norm": 0.03043145313858986, "learning_rate": 1.5271048845579135e-05, "loss": 0.1465, "num_input_tokens_seen": 175826784, "step": 81470 }, { "epoch": 13.291190864600326, "grad_norm": 0.5236390829086304, "learning_rate": 1.5267770504745864e-05, "loss": 0.1448, "num_input_tokens_seen": 175836384, "step": 81475 }, { "epoch": 13.292006525285482, "grad_norm": 0.02497311681509018, "learning_rate": 1.5264492361148558e-05, "loss": 0.0908, "num_input_tokens_seen": 175848096, "step": 81480 }, { "epoch": 13.292822185970635, "grad_norm": 0.8390122056007385, "learning_rate": 1.5261214414853675e-05, "loss": 0.0237, "num_input_tokens_seen": 175858784, "step": 81485 }, { "epoch": 13.293637846655791, "grad_norm": 0.3435104489326477, "learning_rate": 1.5257936665927625e-05, "loss": 0.179, "num_input_tokens_seen": 175869792, "step": 81490 }, { "epoch": 13.294453507340947, "grad_norm": 0.07935085892677307, "learning_rate": 1.525465911443686e-05, "loss": 0.2372, "num_input_tokens_seen": 175880544, "step": 81495 }, { "epoch": 13.2952691680261, "grad_norm": 0.05469828471541405, "learning_rate": 1.5251381760447778e-05, "loss": 0.1647, "num_input_tokens_seen": 175890816, "step": 81500 }, { "epoch": 13.296084828711257, "grad_norm": 0.7810863256454468, "learning_rate": 1.5248104604026823e-05, "loss": 0.125, "num_input_tokens_seen": 175901024, "step": 81505 }, { "epoch": 13.29690048939641, "grad_norm": 0.8062900900840759, "learning_rate": 1.524482764524039e-05, "loss": 0.0936, "num_input_tokens_seen": 175910752, "step": 81510 }, { "epoch": 13.297716150081566, "grad_norm": 0.018393414095044136, "learning_rate": 1.5241550884154898e-05, "loss": 0.0149, "num_input_tokens_seen": 175921952, "step": 81515 }, { "epoch": 13.298531810766722, "grad_norm": 0.4307844936847687, "learning_rate": 1.5238274320836768e-05, "loss": 0.0154, "num_input_tokens_seen": 175932896, "step": 81520 }, { "epoch": 13.299347471451876, "grad_norm": 0.19323308765888214, "learning_rate": 1.5234997955352382e-05, "loss": 0.0463, "num_input_tokens_seen": 175942432, "step": 81525 }, { "epoch": 13.300163132137031, "grad_norm": 0.0874936580657959, "learning_rate": 1.523172178776816e-05, "loss": 0.0079, "num_input_tokens_seen": 175953120, "step": 81530 }, { "epoch": 13.300978792822185, "grad_norm": 0.40571340918540955, "learning_rate": 1.5228445818150477e-05, "loss": 0.042, "num_input_tokens_seen": 175965056, "step": 81535 }, { "epoch": 13.301794453507341, "grad_norm": 0.2585693895816803, "learning_rate": 1.522517004656575e-05, "loss": 0.13, "num_input_tokens_seen": 175976864, "step": 81540 }, { "epoch": 13.302610114192497, "grad_norm": 0.5007531046867371, "learning_rate": 1.522189447308034e-05, "loss": 0.1141, "num_input_tokens_seen": 175986688, "step": 81545 }, { "epoch": 13.30342577487765, "grad_norm": 0.2874946594238281, "learning_rate": 1.521861909776065e-05, "loss": 0.0241, "num_input_tokens_seen": 175996704, "step": 81550 }, { "epoch": 13.304241435562806, "grad_norm": 0.9267986416816711, "learning_rate": 1.5215343920673047e-05, "loss": 0.0243, "num_input_tokens_seen": 176006912, "step": 81555 }, { "epoch": 13.30505709624796, "grad_norm": 1.6436504125595093, "learning_rate": 1.5212068941883914e-05, "loss": 0.0762, "num_input_tokens_seen": 176018272, "step": 81560 }, { "epoch": 13.305872756933116, "grad_norm": 0.041463401168584824, "learning_rate": 1.5208794161459622e-05, "loss": 0.0765, "num_input_tokens_seen": 176028192, "step": 81565 }, { "epoch": 13.30668841761827, "grad_norm": 0.13306811451911926, "learning_rate": 1.5205519579466538e-05, "loss": 0.0499, "num_input_tokens_seen": 176039392, "step": 81570 }, { "epoch": 13.307504078303426, "grad_norm": 0.08848696947097778, "learning_rate": 1.5202245195971025e-05, "loss": 0.076, "num_input_tokens_seen": 176049696, "step": 81575 }, { "epoch": 13.308319738988581, "grad_norm": 0.12082011997699738, "learning_rate": 1.5198971011039442e-05, "loss": 0.0134, "num_input_tokens_seen": 176060576, "step": 81580 }, { "epoch": 13.309135399673735, "grad_norm": 0.48070111870765686, "learning_rate": 1.5195697024738137e-05, "loss": 0.0613, "num_input_tokens_seen": 176070944, "step": 81585 }, { "epoch": 13.309951060358891, "grad_norm": 0.34761369228363037, "learning_rate": 1.5192423237133482e-05, "loss": 0.1382, "num_input_tokens_seen": 176082240, "step": 81590 }, { "epoch": 13.310766721044045, "grad_norm": 0.08664678782224655, "learning_rate": 1.5189149648291803e-05, "loss": 0.0296, "num_input_tokens_seen": 176093888, "step": 81595 }, { "epoch": 13.3115823817292, "grad_norm": 0.2953043580055237, "learning_rate": 1.518587625827946e-05, "loss": 0.1738, "num_input_tokens_seen": 176104160, "step": 81600 }, { "epoch": 13.312398042414356, "grad_norm": 0.8454289436340332, "learning_rate": 1.5182603067162776e-05, "loss": 0.1305, "num_input_tokens_seen": 176114816, "step": 81605 }, { "epoch": 13.31321370309951, "grad_norm": 0.48043012619018555, "learning_rate": 1.5179330075008106e-05, "loss": 0.0375, "num_input_tokens_seen": 176125216, "step": 81610 }, { "epoch": 13.314029363784666, "grad_norm": 0.13315539062023163, "learning_rate": 1.517605728188176e-05, "loss": 0.0547, "num_input_tokens_seen": 176136320, "step": 81615 }, { "epoch": 13.31484502446982, "grad_norm": 2.3605384826660156, "learning_rate": 1.5172784687850072e-05, "loss": 0.2758, "num_input_tokens_seen": 176147040, "step": 81620 }, { "epoch": 13.315660685154976, "grad_norm": 0.22289127111434937, "learning_rate": 1.5169512292979385e-05, "loss": 0.02, "num_input_tokens_seen": 176158848, "step": 81625 }, { "epoch": 13.31647634584013, "grad_norm": 0.7792696356773376, "learning_rate": 1.5166240097335988e-05, "loss": 0.1215, "num_input_tokens_seen": 176169504, "step": 81630 }, { "epoch": 13.317292006525285, "grad_norm": 0.08516005426645279, "learning_rate": 1.5162968100986225e-05, "loss": 0.098, "num_input_tokens_seen": 176180480, "step": 81635 }, { "epoch": 13.318107667210441, "grad_norm": 0.49361947178840637, "learning_rate": 1.515969630399638e-05, "loss": 0.134, "num_input_tokens_seen": 176192288, "step": 81640 }, { "epoch": 13.318923327895595, "grad_norm": 0.3089146316051483, "learning_rate": 1.5156424706432781e-05, "loss": 0.0538, "num_input_tokens_seen": 176203072, "step": 81645 }, { "epoch": 13.31973898858075, "grad_norm": 0.4444041848182678, "learning_rate": 1.5153153308361712e-05, "loss": 0.0233, "num_input_tokens_seen": 176213568, "step": 81650 }, { "epoch": 13.320554649265905, "grad_norm": 0.05132400989532471, "learning_rate": 1.5149882109849495e-05, "loss": 0.0613, "num_input_tokens_seen": 176224032, "step": 81655 }, { "epoch": 13.32137030995106, "grad_norm": 0.25352999567985535, "learning_rate": 1.5146611110962402e-05, "loss": 0.1813, "num_input_tokens_seen": 176234368, "step": 81660 }, { "epoch": 13.322185970636216, "grad_norm": 0.8188713788986206, "learning_rate": 1.5143340311766747e-05, "loss": 0.1253, "num_input_tokens_seen": 176244992, "step": 81665 }, { "epoch": 13.32300163132137, "grad_norm": 0.049597762525081635, "learning_rate": 1.5140069712328792e-05, "loss": 0.2242, "num_input_tokens_seen": 176254752, "step": 81670 }, { "epoch": 13.323817292006526, "grad_norm": 0.8073980212211609, "learning_rate": 1.5136799312714845e-05, "loss": 0.0163, "num_input_tokens_seen": 176265696, "step": 81675 }, { "epoch": 13.32463295269168, "grad_norm": 0.03762498125433922, "learning_rate": 1.5133529112991157e-05, "loss": 0.0337, "num_input_tokens_seen": 176276064, "step": 81680 }, { "epoch": 13.325448613376835, "grad_norm": 0.08844557404518127, "learning_rate": 1.5130259113224032e-05, "loss": 0.1145, "num_input_tokens_seen": 176286272, "step": 81685 }, { "epoch": 13.326264274061991, "grad_norm": 0.58962482213974, "learning_rate": 1.5126989313479717e-05, "loss": 0.1317, "num_input_tokens_seen": 176297344, "step": 81690 }, { "epoch": 13.327079934747145, "grad_norm": 0.06155966594815254, "learning_rate": 1.51237197138245e-05, "loss": 0.0385, "num_input_tokens_seen": 176308896, "step": 81695 }, { "epoch": 13.3278955954323, "grad_norm": 0.13426220417022705, "learning_rate": 1.512045031432462e-05, "loss": 0.1319, "num_input_tokens_seen": 176319328, "step": 81700 }, { "epoch": 13.328711256117455, "grad_norm": 0.011274098418653011, "learning_rate": 1.5117181115046352e-05, "loss": 0.0217, "num_input_tokens_seen": 176330304, "step": 81705 }, { "epoch": 13.32952691680261, "grad_norm": 0.06514487415552139, "learning_rate": 1.5113912116055945e-05, "loss": 0.0797, "num_input_tokens_seen": 176340448, "step": 81710 }, { "epoch": 13.330342577487766, "grad_norm": 0.46605151891708374, "learning_rate": 1.5110643317419654e-05, "loss": 0.0317, "num_input_tokens_seen": 176351296, "step": 81715 }, { "epoch": 13.33115823817292, "grad_norm": 1.485315203666687, "learning_rate": 1.5107374719203715e-05, "loss": 0.1912, "num_input_tokens_seen": 176362304, "step": 81720 }, { "epoch": 13.331973898858076, "grad_norm": 0.10187487304210663, "learning_rate": 1.5104106321474379e-05, "loss": 0.0731, "num_input_tokens_seen": 176371168, "step": 81725 }, { "epoch": 13.33278955954323, "grad_norm": 1.6267733573913574, "learning_rate": 1.5100838124297884e-05, "loss": 0.2362, "num_input_tokens_seen": 176381984, "step": 81730 }, { "epoch": 13.333605220228385, "grad_norm": 2.774101972579956, "learning_rate": 1.5097570127740462e-05, "loss": 0.2084, "num_input_tokens_seen": 176393120, "step": 81735 }, { "epoch": 13.33442088091354, "grad_norm": 0.4602198302745819, "learning_rate": 1.5094302331868347e-05, "loss": 0.1734, "num_input_tokens_seen": 176404128, "step": 81740 }, { "epoch": 13.335236541598695, "grad_norm": 0.1246635913848877, "learning_rate": 1.5091034736747756e-05, "loss": 0.1787, "num_input_tokens_seen": 176414016, "step": 81745 }, { "epoch": 13.33605220228385, "grad_norm": 0.2575036287307739, "learning_rate": 1.508776734244492e-05, "loss": 0.0253, "num_input_tokens_seen": 176425376, "step": 81750 }, { "epoch": 13.336867862969005, "grad_norm": 0.5377529859542847, "learning_rate": 1.5084500149026048e-05, "loss": 0.175, "num_input_tokens_seen": 176439104, "step": 81755 }, { "epoch": 13.33768352365416, "grad_norm": 0.051797546446323395, "learning_rate": 1.5081233156557366e-05, "loss": 0.0067, "num_input_tokens_seen": 176448800, "step": 81760 }, { "epoch": 13.338499184339314, "grad_norm": 0.19196681678295135, "learning_rate": 1.5077966365105072e-05, "loss": 0.0347, "num_input_tokens_seen": 176459584, "step": 81765 }, { "epoch": 13.33931484502447, "grad_norm": 0.0506758876144886, "learning_rate": 1.5074699774735384e-05, "loss": 0.0462, "num_input_tokens_seen": 176469728, "step": 81770 }, { "epoch": 13.340130505709626, "grad_norm": 0.13671885430812836, "learning_rate": 1.507143338551449e-05, "loss": 0.0801, "num_input_tokens_seen": 176480320, "step": 81775 }, { "epoch": 13.34094616639478, "grad_norm": 0.11610323935747147, "learning_rate": 1.5068167197508599e-05, "loss": 0.1078, "num_input_tokens_seen": 176491392, "step": 81780 }, { "epoch": 13.341761827079935, "grad_norm": 0.20897994935512543, "learning_rate": 1.5064901210783888e-05, "loss": 0.0569, "num_input_tokens_seen": 176501856, "step": 81785 }, { "epoch": 13.34257748776509, "grad_norm": 0.06678997725248337, "learning_rate": 1.5061635425406572e-05, "loss": 0.0411, "num_input_tokens_seen": 176511616, "step": 81790 }, { "epoch": 13.343393148450245, "grad_norm": 0.09874594956636429, "learning_rate": 1.505836984144281e-05, "loss": 0.0304, "num_input_tokens_seen": 176522688, "step": 81795 }, { "epoch": 13.3442088091354, "grad_norm": 0.22993828356266022, "learning_rate": 1.5055104458958804e-05, "loss": 0.024, "num_input_tokens_seen": 176534496, "step": 81800 }, { "epoch": 13.345024469820554, "grad_norm": 1.0094197988510132, "learning_rate": 1.5051839278020713e-05, "loss": 0.159, "num_input_tokens_seen": 176545600, "step": 81805 }, { "epoch": 13.34584013050571, "grad_norm": 0.18439963459968567, "learning_rate": 1.5048574298694728e-05, "loss": 0.1624, "num_input_tokens_seen": 176555904, "step": 81810 }, { "epoch": 13.346655791190864, "grad_norm": 0.16963842511177063, "learning_rate": 1.5045309521047003e-05, "loss": 0.0087, "num_input_tokens_seen": 176567392, "step": 81815 }, { "epoch": 13.34747145187602, "grad_norm": 1.704397439956665, "learning_rate": 1.5042044945143716e-05, "loss": 0.0881, "num_input_tokens_seen": 176577536, "step": 81820 }, { "epoch": 13.348287112561174, "grad_norm": 0.4806655943393707, "learning_rate": 1.5038780571051012e-05, "loss": 0.135, "num_input_tokens_seen": 176586624, "step": 81825 }, { "epoch": 13.34910277324633, "grad_norm": 0.23081673681735992, "learning_rate": 1.5035516398835064e-05, "loss": 0.1169, "num_input_tokens_seen": 176597216, "step": 81830 }, { "epoch": 13.349918433931485, "grad_norm": 1.167635440826416, "learning_rate": 1.503225242856201e-05, "loss": 0.1626, "num_input_tokens_seen": 176608416, "step": 81835 }, { "epoch": 13.350734094616639, "grad_norm": 0.12195462733507156, "learning_rate": 1.5028988660298005e-05, "loss": 0.0764, "num_input_tokens_seen": 176619104, "step": 81840 }, { "epoch": 13.351549755301795, "grad_norm": 0.16067509353160858, "learning_rate": 1.50257250941092e-05, "loss": 0.1042, "num_input_tokens_seen": 176629952, "step": 81845 }, { "epoch": 13.352365415986949, "grad_norm": 0.2085188776254654, "learning_rate": 1.5022461730061723e-05, "loss": 0.1347, "num_input_tokens_seen": 176639744, "step": 81850 }, { "epoch": 13.353181076672104, "grad_norm": 0.03462900593876839, "learning_rate": 1.5019198568221727e-05, "loss": 0.1127, "num_input_tokens_seen": 176651392, "step": 81855 }, { "epoch": 13.35399673735726, "grad_norm": 0.33141621947288513, "learning_rate": 1.5015935608655322e-05, "loss": 0.0257, "num_input_tokens_seen": 176662208, "step": 81860 }, { "epoch": 13.354812398042414, "grad_norm": 0.07771515101194382, "learning_rate": 1.5012672851428655e-05, "loss": 0.0108, "num_input_tokens_seen": 176675232, "step": 81865 }, { "epoch": 13.35562805872757, "grad_norm": 0.24856655299663544, "learning_rate": 1.5009410296607838e-05, "loss": 0.0684, "num_input_tokens_seen": 176686304, "step": 81870 }, { "epoch": 13.356443719412724, "grad_norm": 0.05700708553195, "learning_rate": 1.5006147944258998e-05, "loss": 0.1123, "num_input_tokens_seen": 176696224, "step": 81875 }, { "epoch": 13.35725938009788, "grad_norm": 1.0494122505187988, "learning_rate": 1.5002885794448246e-05, "loss": 0.1234, "num_input_tokens_seen": 176707264, "step": 81880 }, { "epoch": 13.358075040783035, "grad_norm": 0.46496087312698364, "learning_rate": 1.4999623847241698e-05, "loss": 0.0914, "num_input_tokens_seen": 176719264, "step": 81885 }, { "epoch": 13.358890701468189, "grad_norm": 0.1756371706724167, "learning_rate": 1.4996362102705457e-05, "loss": 0.0317, "num_input_tokens_seen": 176730944, "step": 81890 }, { "epoch": 13.359706362153345, "grad_norm": 1.1622314453125, "learning_rate": 1.4993100560905632e-05, "loss": 0.0669, "num_input_tokens_seen": 176741568, "step": 81895 }, { "epoch": 13.360522022838499, "grad_norm": 0.14106306433677673, "learning_rate": 1.4989839221908309e-05, "loss": 0.0404, "num_input_tokens_seen": 176751552, "step": 81900 }, { "epoch": 13.361337683523654, "grad_norm": 0.17887760698795319, "learning_rate": 1.4986578085779607e-05, "loss": 0.083, "num_input_tokens_seen": 176761152, "step": 81905 }, { "epoch": 13.362153344208808, "grad_norm": 1.8647371530532837, "learning_rate": 1.4983317152585592e-05, "loss": 0.0339, "num_input_tokens_seen": 176771648, "step": 81910 }, { "epoch": 13.362969004893964, "grad_norm": 1.9013617038726807, "learning_rate": 1.4980056422392368e-05, "loss": 0.104, "num_input_tokens_seen": 176781664, "step": 81915 }, { "epoch": 13.36378466557912, "grad_norm": 1.9593030214309692, "learning_rate": 1.4976795895266007e-05, "loss": 0.0984, "num_input_tokens_seen": 176792512, "step": 81920 }, { "epoch": 13.364600326264274, "grad_norm": 0.6309439539909363, "learning_rate": 1.4973535571272602e-05, "loss": 0.0364, "num_input_tokens_seen": 176803488, "step": 81925 }, { "epoch": 13.36541598694943, "grad_norm": 0.23816514015197754, "learning_rate": 1.4970275450478204e-05, "loss": 0.195, "num_input_tokens_seen": 176813632, "step": 81930 }, { "epoch": 13.366231647634583, "grad_norm": 0.0708242803812027, "learning_rate": 1.4967015532948914e-05, "loss": 0.1836, "num_input_tokens_seen": 176825472, "step": 81935 }, { "epoch": 13.367047308319739, "grad_norm": 1.1218396425247192, "learning_rate": 1.4963755818750769e-05, "loss": 0.218, "num_input_tokens_seen": 176835776, "step": 81940 }, { "epoch": 13.367862969004895, "grad_norm": 1.2688132524490356, "learning_rate": 1.4960496307949842e-05, "loss": 0.1251, "num_input_tokens_seen": 176847744, "step": 81945 }, { "epoch": 13.368678629690049, "grad_norm": 0.3363891541957855, "learning_rate": 1.4957237000612206e-05, "loss": 0.0554, "num_input_tokens_seen": 176858368, "step": 81950 }, { "epoch": 13.369494290375204, "grad_norm": 0.07073040306568146, "learning_rate": 1.4953977896803892e-05, "loss": 0.1076, "num_input_tokens_seen": 176868448, "step": 81955 }, { "epoch": 13.370309951060358, "grad_norm": 0.07524210214614868, "learning_rate": 1.4950718996590968e-05, "loss": 0.0278, "num_input_tokens_seen": 176879008, "step": 81960 }, { "epoch": 13.371125611745514, "grad_norm": 0.14332282543182373, "learning_rate": 1.4947460300039464e-05, "loss": 0.0243, "num_input_tokens_seen": 176889056, "step": 81965 }, { "epoch": 13.37194127243067, "grad_norm": 0.772853434085846, "learning_rate": 1.494420180721544e-05, "loss": 0.1284, "num_input_tokens_seen": 176899008, "step": 81970 }, { "epoch": 13.372756933115824, "grad_norm": 0.09223171323537827, "learning_rate": 1.4940943518184913e-05, "loss": 0.0571, "num_input_tokens_seen": 176909856, "step": 81975 }, { "epoch": 13.37357259380098, "grad_norm": 0.05026876553893089, "learning_rate": 1.4937685433013934e-05, "loss": 0.1428, "num_input_tokens_seen": 176919648, "step": 81980 }, { "epoch": 13.374388254486133, "grad_norm": 0.09884996712207794, "learning_rate": 1.4934427551768515e-05, "loss": 0.0235, "num_input_tokens_seen": 176929536, "step": 81985 }, { "epoch": 13.375203915171289, "grad_norm": 0.04070119559764862, "learning_rate": 1.4931169874514705e-05, "loss": 0.0723, "num_input_tokens_seen": 176941152, "step": 81990 }, { "epoch": 13.376019575856443, "grad_norm": 0.027692357078194618, "learning_rate": 1.4927912401318494e-05, "loss": 0.1874, "num_input_tokens_seen": 176951328, "step": 81995 }, { "epoch": 13.376835236541599, "grad_norm": 0.1226174607872963, "learning_rate": 1.4924655132245933e-05, "loss": 0.1573, "num_input_tokens_seen": 176962496, "step": 82000 }, { "epoch": 13.377650897226754, "grad_norm": 0.29406654834747314, "learning_rate": 1.4921398067363001e-05, "loss": 0.021, "num_input_tokens_seen": 176972736, "step": 82005 }, { "epoch": 13.378466557911908, "grad_norm": 0.11074376106262207, "learning_rate": 1.4918141206735735e-05, "loss": 0.0607, "num_input_tokens_seen": 176983296, "step": 82010 }, { "epoch": 13.379282218597064, "grad_norm": 0.09693611413240433, "learning_rate": 1.491488455043012e-05, "loss": 0.1341, "num_input_tokens_seen": 176993824, "step": 82015 }, { "epoch": 13.380097879282218, "grad_norm": 0.14117050170898438, "learning_rate": 1.4911628098512164e-05, "loss": 0.3824, "num_input_tokens_seen": 177003488, "step": 82020 }, { "epoch": 13.380913539967374, "grad_norm": 0.058215949684381485, "learning_rate": 1.4908371851047865e-05, "loss": 0.0213, "num_input_tokens_seen": 177014304, "step": 82025 }, { "epoch": 13.38172920065253, "grad_norm": 0.08075210452079773, "learning_rate": 1.4905115808103211e-05, "loss": 0.0236, "num_input_tokens_seen": 177024352, "step": 82030 }, { "epoch": 13.382544861337683, "grad_norm": 0.1436203271150589, "learning_rate": 1.4901859969744194e-05, "loss": 0.0846, "num_input_tokens_seen": 177034944, "step": 82035 }, { "epoch": 13.383360522022839, "grad_norm": 0.24890924990177155, "learning_rate": 1.4898604336036792e-05, "loss": 0.088, "num_input_tokens_seen": 177046816, "step": 82040 }, { "epoch": 13.384176182707993, "grad_norm": 0.15794064104557037, "learning_rate": 1.4895348907046986e-05, "loss": 0.0455, "num_input_tokens_seen": 177056384, "step": 82045 }, { "epoch": 13.384991843393149, "grad_norm": 0.1516345888376236, "learning_rate": 1.4892093682840757e-05, "loss": 0.0367, "num_input_tokens_seen": 177067520, "step": 82050 }, { "epoch": 13.385807504078304, "grad_norm": 0.10764694213867188, "learning_rate": 1.4888838663484073e-05, "loss": 0.2509, "num_input_tokens_seen": 177077984, "step": 82055 }, { "epoch": 13.386623164763458, "grad_norm": 0.37662917375564575, "learning_rate": 1.4885583849042898e-05, "loss": 0.0339, "num_input_tokens_seen": 177088384, "step": 82060 }, { "epoch": 13.387438825448614, "grad_norm": 0.052992481738328934, "learning_rate": 1.4882329239583199e-05, "loss": 0.26, "num_input_tokens_seen": 177100160, "step": 82065 }, { "epoch": 13.388254486133768, "grad_norm": 0.8055438995361328, "learning_rate": 1.4879074835170925e-05, "loss": 0.131, "num_input_tokens_seen": 177110880, "step": 82070 }, { "epoch": 13.389070146818923, "grad_norm": 0.03696770220994949, "learning_rate": 1.487582063587205e-05, "loss": 0.1223, "num_input_tokens_seen": 177120832, "step": 82075 }, { "epoch": 13.38988580750408, "grad_norm": 1.4240306615829468, "learning_rate": 1.4872566641752506e-05, "loss": 0.15, "num_input_tokens_seen": 177132192, "step": 82080 }, { "epoch": 13.390701468189233, "grad_norm": 2.6730055809020996, "learning_rate": 1.4869312852878257e-05, "loss": 0.0811, "num_input_tokens_seen": 177143008, "step": 82085 }, { "epoch": 13.391517128874389, "grad_norm": 0.48406684398651123, "learning_rate": 1.4866059269315221e-05, "loss": 0.0326, "num_input_tokens_seen": 177152384, "step": 82090 }, { "epoch": 13.392332789559543, "grad_norm": 2.3199243545532227, "learning_rate": 1.486280589112936e-05, "loss": 0.1434, "num_input_tokens_seen": 177164608, "step": 82095 }, { "epoch": 13.393148450244698, "grad_norm": 0.03160716965794563, "learning_rate": 1.4859552718386594e-05, "loss": 0.0575, "num_input_tokens_seen": 177176320, "step": 82100 }, { "epoch": 13.393964110929852, "grad_norm": 0.24919521808624268, "learning_rate": 1.4856299751152857e-05, "loss": 0.0197, "num_input_tokens_seen": 177186656, "step": 82105 }, { "epoch": 13.394779771615008, "grad_norm": 0.3051297962665558, "learning_rate": 1.4853046989494071e-05, "loss": 0.0775, "num_input_tokens_seen": 177197536, "step": 82110 }, { "epoch": 13.395595432300164, "grad_norm": 0.26223379373550415, "learning_rate": 1.4849794433476166e-05, "loss": 0.0519, "num_input_tokens_seen": 177207712, "step": 82115 }, { "epoch": 13.396411092985318, "grad_norm": 0.08046263456344604, "learning_rate": 1.4846542083165043e-05, "loss": 0.0149, "num_input_tokens_seen": 177218208, "step": 82120 }, { "epoch": 13.397226753670473, "grad_norm": 0.43852534890174866, "learning_rate": 1.4843289938626639e-05, "loss": 0.1184, "num_input_tokens_seen": 177230240, "step": 82125 }, { "epoch": 13.398042414355627, "grad_norm": 0.21001243591308594, "learning_rate": 1.4840037999926838e-05, "loss": 0.0225, "num_input_tokens_seen": 177241088, "step": 82130 }, { "epoch": 13.398858075040783, "grad_norm": 0.03564786911010742, "learning_rate": 1.4836786267131564e-05, "loss": 0.0243, "num_input_tokens_seen": 177252256, "step": 82135 }, { "epoch": 13.399673735725939, "grad_norm": 0.038484424352645874, "learning_rate": 1.4833534740306699e-05, "loss": 0.1411, "num_input_tokens_seen": 177262784, "step": 82140 }, { "epoch": 13.400489396411093, "grad_norm": 0.14761292934417725, "learning_rate": 1.4830283419518159e-05, "loss": 0.2329, "num_input_tokens_seen": 177272704, "step": 82145 }, { "epoch": 13.401305057096248, "grad_norm": 2.1826770305633545, "learning_rate": 1.482703230483182e-05, "loss": 0.3669, "num_input_tokens_seen": 177283680, "step": 82150 }, { "epoch": 13.402120717781402, "grad_norm": 0.07534115016460419, "learning_rate": 1.4823781396313573e-05, "loss": 0.0933, "num_input_tokens_seen": 177293696, "step": 82155 }, { "epoch": 13.402936378466558, "grad_norm": 0.06971758604049683, "learning_rate": 1.4820530694029319e-05, "loss": 0.1428, "num_input_tokens_seen": 177303904, "step": 82160 }, { "epoch": 13.403752039151712, "grad_norm": 0.08824223279953003, "learning_rate": 1.4817280198044914e-05, "loss": 0.0267, "num_input_tokens_seen": 177314880, "step": 82165 }, { "epoch": 13.404567699836868, "grad_norm": 0.4345036745071411, "learning_rate": 1.481402990842625e-05, "loss": 0.0208, "num_input_tokens_seen": 177325440, "step": 82170 }, { "epoch": 13.405383360522023, "grad_norm": 0.4460865557193756, "learning_rate": 1.4810779825239186e-05, "loss": 0.0814, "num_input_tokens_seen": 177336896, "step": 82175 }, { "epoch": 13.406199021207177, "grad_norm": 0.08659283071756363, "learning_rate": 1.4807529948549598e-05, "loss": 0.1322, "num_input_tokens_seen": 177348288, "step": 82180 }, { "epoch": 13.407014681892333, "grad_norm": 0.17309582233428955, "learning_rate": 1.4804280278423342e-05, "loss": 0.0146, "num_input_tokens_seen": 177359104, "step": 82185 }, { "epoch": 13.407830342577487, "grad_norm": 0.43105676770210266, "learning_rate": 1.4801030814926282e-05, "loss": 0.0185, "num_input_tokens_seen": 177369952, "step": 82190 }, { "epoch": 13.408646003262643, "grad_norm": 0.4562886655330658, "learning_rate": 1.4797781558124274e-05, "loss": 0.0455, "num_input_tokens_seen": 177380864, "step": 82195 }, { "epoch": 13.409461663947798, "grad_norm": 0.04354051873087883, "learning_rate": 1.4794532508083162e-05, "loss": 0.0787, "num_input_tokens_seen": 177391136, "step": 82200 }, { "epoch": 13.410277324632952, "grad_norm": 2.378619432449341, "learning_rate": 1.4791283664868797e-05, "loss": 0.1537, "num_input_tokens_seen": 177401408, "step": 82205 }, { "epoch": 13.411092985318108, "grad_norm": 0.06082529574632645, "learning_rate": 1.4788035028547018e-05, "loss": 0.0324, "num_input_tokens_seen": 177412096, "step": 82210 }, { "epoch": 13.411908646003262, "grad_norm": 0.025591254234313965, "learning_rate": 1.478478659918366e-05, "loss": 0.0373, "num_input_tokens_seen": 177422976, "step": 82215 }, { "epoch": 13.412724306688418, "grad_norm": 0.3098827302455902, "learning_rate": 1.478153837684457e-05, "loss": 0.1463, "num_input_tokens_seen": 177434912, "step": 82220 }, { "epoch": 13.413539967373573, "grad_norm": 0.9852977395057678, "learning_rate": 1.477829036159556e-05, "loss": 0.1052, "num_input_tokens_seen": 177446080, "step": 82225 }, { "epoch": 13.414355628058727, "grad_norm": 1.8721487522125244, "learning_rate": 1.4775042553502468e-05, "loss": 0.0646, "num_input_tokens_seen": 177457152, "step": 82230 }, { "epoch": 13.415171288743883, "grad_norm": 0.05569358542561531, "learning_rate": 1.4771794952631105e-05, "loss": 0.1239, "num_input_tokens_seen": 177467264, "step": 82235 }, { "epoch": 13.415986949429037, "grad_norm": 0.05180950090289116, "learning_rate": 1.47685475590473e-05, "loss": 0.1702, "num_input_tokens_seen": 177478176, "step": 82240 }, { "epoch": 13.416802610114193, "grad_norm": 1.7936985492706299, "learning_rate": 1.4765300372816848e-05, "loss": 0.1071, "num_input_tokens_seen": 177489024, "step": 82245 }, { "epoch": 13.417618270799348, "grad_norm": 0.2732376456260681, "learning_rate": 1.4762053394005578e-05, "loss": 0.0796, "num_input_tokens_seen": 177499744, "step": 82250 }, { "epoch": 13.418433931484502, "grad_norm": 0.2280830591917038, "learning_rate": 1.4758806622679274e-05, "loss": 0.0565, "num_input_tokens_seen": 177510400, "step": 82255 }, { "epoch": 13.419249592169658, "grad_norm": 0.23134970664978027, "learning_rate": 1.4755560058903747e-05, "loss": 0.0313, "num_input_tokens_seen": 177520672, "step": 82260 }, { "epoch": 13.420065252854812, "grad_norm": 2.3165059089660645, "learning_rate": 1.4752313702744803e-05, "loss": 0.1573, "num_input_tokens_seen": 177531328, "step": 82265 }, { "epoch": 13.420880913539968, "grad_norm": 0.0522867850959301, "learning_rate": 1.4749067554268207e-05, "loss": 0.009, "num_input_tokens_seen": 177542912, "step": 82270 }, { "epoch": 13.421696574225122, "grad_norm": 1.4306764602661133, "learning_rate": 1.4745821613539773e-05, "loss": 0.0871, "num_input_tokens_seen": 177553376, "step": 82275 }, { "epoch": 13.422512234910277, "grad_norm": 0.12830482423305511, "learning_rate": 1.4742575880625264e-05, "loss": 0.1202, "num_input_tokens_seen": 177563072, "step": 82280 }, { "epoch": 13.423327895595433, "grad_norm": 1.283210039138794, "learning_rate": 1.4739330355590476e-05, "loss": 0.2543, "num_input_tokens_seen": 177574848, "step": 82285 }, { "epoch": 13.424143556280587, "grad_norm": 0.14521585404872894, "learning_rate": 1.4736085038501163e-05, "loss": 0.0655, "num_input_tokens_seen": 177585632, "step": 82290 }, { "epoch": 13.424959216965743, "grad_norm": 1.4898935556411743, "learning_rate": 1.473283992942312e-05, "loss": 0.2258, "num_input_tokens_seen": 177596256, "step": 82295 }, { "epoch": 13.425774877650896, "grad_norm": 1.6764880418777466, "learning_rate": 1.4729595028422089e-05, "loss": 0.2867, "num_input_tokens_seen": 177608352, "step": 82300 }, { "epoch": 13.426590538336052, "grad_norm": 1.067445158958435, "learning_rate": 1.4726350335563854e-05, "loss": 0.1015, "num_input_tokens_seen": 177619232, "step": 82305 }, { "epoch": 13.427406199021208, "grad_norm": 0.07336539030075073, "learning_rate": 1.4723105850914154e-05, "loss": 0.0616, "num_input_tokens_seen": 177629376, "step": 82310 }, { "epoch": 13.428221859706362, "grad_norm": 0.037429358810186386, "learning_rate": 1.4719861574538763e-05, "loss": 0.0837, "num_input_tokens_seen": 177639296, "step": 82315 }, { "epoch": 13.429037520391518, "grad_norm": 0.8121462464332581, "learning_rate": 1.4716617506503406e-05, "loss": 0.2087, "num_input_tokens_seen": 177651488, "step": 82320 }, { "epoch": 13.429853181076671, "grad_norm": 0.03787887096405029, "learning_rate": 1.4713373646873842e-05, "loss": 0.0213, "num_input_tokens_seen": 177663104, "step": 82325 }, { "epoch": 13.430668841761827, "grad_norm": 2.211507797241211, "learning_rate": 1.4710129995715816e-05, "loss": 0.1809, "num_input_tokens_seen": 177673728, "step": 82330 }, { "epoch": 13.431484502446983, "grad_norm": 0.2371242493391037, "learning_rate": 1.4706886553095055e-05, "loss": 0.0918, "num_input_tokens_seen": 177683968, "step": 82335 }, { "epoch": 13.432300163132137, "grad_norm": 0.05614335834980011, "learning_rate": 1.4703643319077298e-05, "loss": 0.1623, "num_input_tokens_seen": 177695744, "step": 82340 }, { "epoch": 13.433115823817293, "grad_norm": 0.6952659487724304, "learning_rate": 1.4700400293728272e-05, "loss": 0.0842, "num_input_tokens_seen": 177707168, "step": 82345 }, { "epoch": 13.433931484502446, "grad_norm": 3.374192953109741, "learning_rate": 1.4697157477113692e-05, "loss": 0.0661, "num_input_tokens_seen": 177718016, "step": 82350 }, { "epoch": 13.434747145187602, "grad_norm": 0.23701593279838562, "learning_rate": 1.4693914869299299e-05, "loss": 0.1328, "num_input_tokens_seen": 177726752, "step": 82355 }, { "epoch": 13.435562805872756, "grad_norm": 0.09230323880910873, "learning_rate": 1.4690672470350785e-05, "loss": 0.0519, "num_input_tokens_seen": 177736704, "step": 82360 }, { "epoch": 13.436378466557912, "grad_norm": 1.7805827856063843, "learning_rate": 1.4687430280333875e-05, "loss": 0.1824, "num_input_tokens_seen": 177748096, "step": 82365 }, { "epoch": 13.437194127243067, "grad_norm": 0.33267447352409363, "learning_rate": 1.4684188299314272e-05, "loss": 0.0511, "num_input_tokens_seen": 177758944, "step": 82370 }, { "epoch": 13.438009787928221, "grad_norm": 0.10680730640888214, "learning_rate": 1.4680946527357683e-05, "loss": 0.0226, "num_input_tokens_seen": 177767904, "step": 82375 }, { "epoch": 13.438825448613377, "grad_norm": 1.5522440671920776, "learning_rate": 1.4677704964529801e-05, "loss": 0.2356, "num_input_tokens_seen": 177779200, "step": 82380 }, { "epoch": 13.439641109298531, "grad_norm": 0.4102810323238373, "learning_rate": 1.467446361089632e-05, "loss": 0.0864, "num_input_tokens_seen": 177790016, "step": 82385 }, { "epoch": 13.440456769983687, "grad_norm": 0.1539396345615387, "learning_rate": 1.4671222466522938e-05, "loss": 0.0484, "num_input_tokens_seen": 177801088, "step": 82390 }, { "epoch": 13.441272430668842, "grad_norm": 1.3751821517944336, "learning_rate": 1.4667981531475331e-05, "loss": 0.0959, "num_input_tokens_seen": 177810592, "step": 82395 }, { "epoch": 13.442088091353996, "grad_norm": 1.482182264328003, "learning_rate": 1.4664740805819194e-05, "loss": 0.1549, "num_input_tokens_seen": 177822080, "step": 82400 }, { "epoch": 13.442903752039152, "grad_norm": 0.07144466042518616, "learning_rate": 1.4661500289620184e-05, "loss": 0.1148, "num_input_tokens_seen": 177832000, "step": 82405 }, { "epoch": 13.443719412724306, "grad_norm": 1.6657084226608276, "learning_rate": 1.4658259982943994e-05, "loss": 0.2393, "num_input_tokens_seen": 177842048, "step": 82410 }, { "epoch": 13.444535073409462, "grad_norm": 1.963560938835144, "learning_rate": 1.465501988585628e-05, "loss": 0.1659, "num_input_tokens_seen": 177852704, "step": 82415 }, { "epoch": 13.445350734094617, "grad_norm": 1.3772106170654297, "learning_rate": 1.4651779998422719e-05, "loss": 0.1693, "num_input_tokens_seen": 177863456, "step": 82420 }, { "epoch": 13.446166394779771, "grad_norm": 3.617952346801758, "learning_rate": 1.4648540320708954e-05, "loss": 0.0732, "num_input_tokens_seen": 177874624, "step": 82425 }, { "epoch": 13.446982055464927, "grad_norm": 1.4688138961791992, "learning_rate": 1.464530085278066e-05, "loss": 0.0704, "num_input_tokens_seen": 177885024, "step": 82430 }, { "epoch": 13.447797716150081, "grad_norm": 1.1767125129699707, "learning_rate": 1.464206159470347e-05, "loss": 0.0885, "num_input_tokens_seen": 177896160, "step": 82435 }, { "epoch": 13.448613376835237, "grad_norm": 0.07295159995555878, "learning_rate": 1.4638822546543057e-05, "loss": 0.0872, "num_input_tokens_seen": 177907712, "step": 82440 }, { "epoch": 13.449429037520392, "grad_norm": 0.13925930857658386, "learning_rate": 1.4635583708365036e-05, "loss": 0.1371, "num_input_tokens_seen": 177918912, "step": 82445 }, { "epoch": 13.450244698205546, "grad_norm": 1.1117119789123535, "learning_rate": 1.4632345080235066e-05, "loss": 0.15, "num_input_tokens_seen": 177931200, "step": 82450 }, { "epoch": 13.451060358890702, "grad_norm": 1.8146188259124756, "learning_rate": 1.4629106662218768e-05, "loss": 0.0903, "num_input_tokens_seen": 177942848, "step": 82455 }, { "epoch": 13.451876019575856, "grad_norm": 1.543258547782898, "learning_rate": 1.4625868454381791e-05, "loss": 0.1106, "num_input_tokens_seen": 177954048, "step": 82460 }, { "epoch": 13.452691680261012, "grad_norm": 0.17339403927326202, "learning_rate": 1.4622630456789737e-05, "loss": 0.0574, "num_input_tokens_seen": 177965056, "step": 82465 }, { "epoch": 13.453507340946166, "grad_norm": 0.030661752447485924, "learning_rate": 1.4619392669508244e-05, "loss": 0.0233, "num_input_tokens_seen": 177974976, "step": 82470 }, { "epoch": 13.454323001631321, "grad_norm": 0.42010393738746643, "learning_rate": 1.461615509260294e-05, "loss": 0.08, "num_input_tokens_seen": 177985088, "step": 82475 }, { "epoch": 13.455138662316477, "grad_norm": 0.03151889890432358, "learning_rate": 1.4612917726139411e-05, "loss": 0.1857, "num_input_tokens_seen": 177995872, "step": 82480 }, { "epoch": 13.455954323001631, "grad_norm": 1.878441333770752, "learning_rate": 1.4609680570183293e-05, "loss": 0.2255, "num_input_tokens_seen": 178006432, "step": 82485 }, { "epoch": 13.456769983686787, "grad_norm": 1.468940258026123, "learning_rate": 1.460644362480017e-05, "loss": 0.1424, "num_input_tokens_seen": 178015904, "step": 82490 }, { "epoch": 13.45758564437194, "grad_norm": 0.15430276095867157, "learning_rate": 1.4603206890055654e-05, "loss": 0.1046, "num_input_tokens_seen": 178027328, "step": 82495 }, { "epoch": 13.458401305057096, "grad_norm": 1.5407309532165527, "learning_rate": 1.4599970366015343e-05, "loss": 0.0905, "num_input_tokens_seen": 178037824, "step": 82500 }, { "epoch": 13.459216965742252, "grad_norm": 1.981309413909912, "learning_rate": 1.4596734052744826e-05, "loss": 0.1025, "num_input_tokens_seen": 178047936, "step": 82505 }, { "epoch": 13.460032626427406, "grad_norm": 1.0798389911651611, "learning_rate": 1.4593497950309688e-05, "loss": 0.0781, "num_input_tokens_seen": 178059040, "step": 82510 }, { "epoch": 13.460848287112562, "grad_norm": 0.3199831247329712, "learning_rate": 1.4590262058775517e-05, "loss": 0.0234, "num_input_tokens_seen": 178069696, "step": 82515 }, { "epoch": 13.461663947797716, "grad_norm": 0.07970549911260605, "learning_rate": 1.458702637820789e-05, "loss": 0.0159, "num_input_tokens_seen": 178080160, "step": 82520 }, { "epoch": 13.462479608482871, "grad_norm": 0.3277631103992462, "learning_rate": 1.4583790908672387e-05, "loss": 0.0415, "num_input_tokens_seen": 178090624, "step": 82525 }, { "epoch": 13.463295269168025, "grad_norm": 0.8229436874389648, "learning_rate": 1.4580555650234574e-05, "loss": 0.0965, "num_input_tokens_seen": 178102784, "step": 82530 }, { "epoch": 13.464110929853181, "grad_norm": 0.37098538875579834, "learning_rate": 1.4577320602960015e-05, "loss": 0.0331, "num_input_tokens_seen": 178114144, "step": 82535 }, { "epoch": 13.464926590538337, "grad_norm": 0.040930259972810745, "learning_rate": 1.457408576691428e-05, "loss": 0.1893, "num_input_tokens_seen": 178125984, "step": 82540 }, { "epoch": 13.46574225122349, "grad_norm": 0.16271744668483734, "learning_rate": 1.4570851142162922e-05, "loss": 0.2121, "num_input_tokens_seen": 178137600, "step": 82545 }, { "epoch": 13.466557911908646, "grad_norm": 0.08449087291955948, "learning_rate": 1.4567616728771496e-05, "loss": 0.02, "num_input_tokens_seen": 178149088, "step": 82550 }, { "epoch": 13.4673735725938, "grad_norm": 1.00300133228302, "learning_rate": 1.4564382526805553e-05, "loss": 0.1947, "num_input_tokens_seen": 178159968, "step": 82555 }, { "epoch": 13.468189233278956, "grad_norm": 0.10133452713489532, "learning_rate": 1.4561148536330632e-05, "loss": 0.0655, "num_input_tokens_seen": 178171008, "step": 82560 }, { "epoch": 13.469004893964112, "grad_norm": 1.9246591329574585, "learning_rate": 1.4557914757412282e-05, "loss": 0.1944, "num_input_tokens_seen": 178182112, "step": 82565 }, { "epoch": 13.469820554649266, "grad_norm": 2.322747230529785, "learning_rate": 1.4554681190116038e-05, "loss": 0.1924, "num_input_tokens_seen": 178193888, "step": 82570 }, { "epoch": 13.470636215334421, "grad_norm": 0.06973425298929214, "learning_rate": 1.4551447834507425e-05, "loss": 0.0489, "num_input_tokens_seen": 178204416, "step": 82575 }, { "epoch": 13.471451876019575, "grad_norm": 0.19987617433071136, "learning_rate": 1.454821469065197e-05, "loss": 0.0117, "num_input_tokens_seen": 178215552, "step": 82580 }, { "epoch": 13.47226753670473, "grad_norm": 0.05950986593961716, "learning_rate": 1.4544981758615215e-05, "loss": 0.0332, "num_input_tokens_seen": 178227072, "step": 82585 }, { "epoch": 13.473083197389887, "grad_norm": 1.2552069425582886, "learning_rate": 1.4541749038462666e-05, "loss": 0.1173, "num_input_tokens_seen": 178237760, "step": 82590 }, { "epoch": 13.47389885807504, "grad_norm": 0.7396317720413208, "learning_rate": 1.4538516530259839e-05, "loss": 0.181, "num_input_tokens_seen": 178248832, "step": 82595 }, { "epoch": 13.474714518760196, "grad_norm": 0.1317686140537262, "learning_rate": 1.4535284234072243e-05, "loss": 0.0119, "num_input_tokens_seen": 178260416, "step": 82600 }, { "epoch": 13.47553017944535, "grad_norm": 0.29973605275154114, "learning_rate": 1.4532052149965392e-05, "loss": 0.0621, "num_input_tokens_seen": 178271520, "step": 82605 }, { "epoch": 13.476345840130506, "grad_norm": 0.6882100105285645, "learning_rate": 1.4528820278004784e-05, "loss": 0.1287, "num_input_tokens_seen": 178282240, "step": 82610 }, { "epoch": 13.477161500815662, "grad_norm": 0.15676766633987427, "learning_rate": 1.4525588618255914e-05, "loss": 0.0959, "num_input_tokens_seen": 178292064, "step": 82615 }, { "epoch": 13.477977161500815, "grad_norm": 0.34013113379478455, "learning_rate": 1.4522357170784281e-05, "loss": 0.0324, "num_input_tokens_seen": 178303168, "step": 82620 }, { "epoch": 13.478792822185971, "grad_norm": 1.4938000440597534, "learning_rate": 1.451912593565537e-05, "loss": 0.1107, "num_input_tokens_seen": 178313248, "step": 82625 }, { "epoch": 13.479608482871125, "grad_norm": 0.09006288647651672, "learning_rate": 1.4515894912934668e-05, "loss": 0.1045, "num_input_tokens_seen": 178323168, "step": 82630 }, { "epoch": 13.48042414355628, "grad_norm": 0.04304490238428116, "learning_rate": 1.4512664102687656e-05, "loss": 0.1454, "num_input_tokens_seen": 178334432, "step": 82635 }, { "epoch": 13.481239804241435, "grad_norm": 0.08425887674093246, "learning_rate": 1.450943350497981e-05, "loss": 0.052, "num_input_tokens_seen": 178344800, "step": 82640 }, { "epoch": 13.48205546492659, "grad_norm": 0.14486894011497498, "learning_rate": 1.4506203119876604e-05, "loss": 0.0115, "num_input_tokens_seen": 178356320, "step": 82645 }, { "epoch": 13.482871125611746, "grad_norm": 0.044037312269210815, "learning_rate": 1.4502972947443502e-05, "loss": 0.0646, "num_input_tokens_seen": 178366912, "step": 82650 }, { "epoch": 13.4836867862969, "grad_norm": 0.1158161461353302, "learning_rate": 1.4499742987745971e-05, "loss": 0.0281, "num_input_tokens_seen": 178376128, "step": 82655 }, { "epoch": 13.484502446982056, "grad_norm": 0.9328917860984802, "learning_rate": 1.4496513240849468e-05, "loss": 0.0445, "num_input_tokens_seen": 178387296, "step": 82660 }, { "epoch": 13.48531810766721, "grad_norm": 0.7445340752601624, "learning_rate": 1.4493283706819449e-05, "loss": 0.0979, "num_input_tokens_seen": 178398592, "step": 82665 }, { "epoch": 13.486133768352365, "grad_norm": 0.5359448790550232, "learning_rate": 1.4490054385721363e-05, "loss": 0.0836, "num_input_tokens_seen": 178408320, "step": 82670 }, { "epoch": 13.486949429037521, "grad_norm": 1.0423568487167358, "learning_rate": 1.4486825277620658e-05, "loss": 0.0527, "num_input_tokens_seen": 178419392, "step": 82675 }, { "epoch": 13.487765089722675, "grad_norm": 0.14640377461910248, "learning_rate": 1.4483596382582775e-05, "loss": 0.149, "num_input_tokens_seen": 178430016, "step": 82680 }, { "epoch": 13.48858075040783, "grad_norm": 0.4495973289012909, "learning_rate": 1.4480367700673156e-05, "loss": 0.1176, "num_input_tokens_seen": 178439936, "step": 82685 }, { "epoch": 13.489396411092985, "grad_norm": 1.845611810684204, "learning_rate": 1.4477139231957219e-05, "loss": 0.1271, "num_input_tokens_seen": 178451552, "step": 82690 }, { "epoch": 13.49021207177814, "grad_norm": 0.6669424772262573, "learning_rate": 1.4473910976500423e-05, "loss": 0.0162, "num_input_tokens_seen": 178462272, "step": 82695 }, { "epoch": 13.491027732463296, "grad_norm": 0.24819274246692657, "learning_rate": 1.4470682934368157e-05, "loss": 0.0462, "num_input_tokens_seen": 178473888, "step": 82700 }, { "epoch": 13.49184339314845, "grad_norm": 0.08307311683893204, "learning_rate": 1.4467455105625876e-05, "loss": 0.0298, "num_input_tokens_seen": 178483616, "step": 82705 }, { "epoch": 13.492659053833606, "grad_norm": 0.2388790398836136, "learning_rate": 1.446422749033896e-05, "loss": 0.0455, "num_input_tokens_seen": 178494048, "step": 82710 }, { "epoch": 13.49347471451876, "grad_norm": 0.08585446327924728, "learning_rate": 1.4461000088572859e-05, "loss": 0.1247, "num_input_tokens_seen": 178505536, "step": 82715 }, { "epoch": 13.494290375203915, "grad_norm": 0.044057417660951614, "learning_rate": 1.445777290039294e-05, "loss": 0.0267, "num_input_tokens_seen": 178514688, "step": 82720 }, { "epoch": 13.49510603588907, "grad_norm": 0.026549631729722023, "learning_rate": 1.445454592586464e-05, "loss": 0.0354, "num_input_tokens_seen": 178525376, "step": 82725 }, { "epoch": 13.495921696574225, "grad_norm": 0.0564214326441288, "learning_rate": 1.4451319165053342e-05, "loss": 0.0051, "num_input_tokens_seen": 178536576, "step": 82730 }, { "epoch": 13.49673735725938, "grad_norm": 0.15797273814678192, "learning_rate": 1.4448092618024445e-05, "loss": 0.0083, "num_input_tokens_seen": 178546720, "step": 82735 }, { "epoch": 13.497553017944535, "grad_norm": 0.9495553970336914, "learning_rate": 1.4444866284843336e-05, "loss": 0.0808, "num_input_tokens_seen": 178557856, "step": 82740 }, { "epoch": 13.49836867862969, "grad_norm": 0.06762862950563431, "learning_rate": 1.4441640165575402e-05, "loss": 0.0617, "num_input_tokens_seen": 178568096, "step": 82745 }, { "epoch": 13.499184339314844, "grad_norm": 0.10547639429569244, "learning_rate": 1.4438414260286024e-05, "loss": 0.0682, "num_input_tokens_seen": 178579456, "step": 82750 }, { "epoch": 13.5, "grad_norm": 0.09201905131340027, "learning_rate": 1.443518856904058e-05, "loss": 0.0461, "num_input_tokens_seen": 178589920, "step": 82755 }, { "epoch": 13.500815660685156, "grad_norm": 1.4985374212265015, "learning_rate": 1.443196309190444e-05, "loss": 0.0441, "num_input_tokens_seen": 178601024, "step": 82760 }, { "epoch": 13.50163132137031, "grad_norm": 1.0055415630340576, "learning_rate": 1.4428737828942979e-05, "loss": 0.16, "num_input_tokens_seen": 178611712, "step": 82765 }, { "epoch": 13.502446982055465, "grad_norm": 1.9651753902435303, "learning_rate": 1.442551278022155e-05, "loss": 0.1592, "num_input_tokens_seen": 178622304, "step": 82770 }, { "epoch": 13.50326264274062, "grad_norm": 1.2999773025512695, "learning_rate": 1.4422287945805524e-05, "loss": 0.1025, "num_input_tokens_seen": 178632896, "step": 82775 }, { "epoch": 13.504078303425775, "grad_norm": 1.7529950141906738, "learning_rate": 1.4419063325760251e-05, "loss": 0.0809, "num_input_tokens_seen": 178643328, "step": 82780 }, { "epoch": 13.50489396411093, "grad_norm": 0.3564196825027466, "learning_rate": 1.4415838920151082e-05, "loss": 0.1569, "num_input_tokens_seen": 178653376, "step": 82785 }, { "epoch": 13.505709624796085, "grad_norm": 0.025015557184815407, "learning_rate": 1.4412614729043363e-05, "loss": 0.1139, "num_input_tokens_seen": 178663264, "step": 82790 }, { "epoch": 13.50652528548124, "grad_norm": 1.1143368482589722, "learning_rate": 1.440939075250243e-05, "loss": 0.1403, "num_input_tokens_seen": 178674272, "step": 82795 }, { "epoch": 13.507340946166394, "grad_norm": 0.05145640671253204, "learning_rate": 1.4406166990593647e-05, "loss": 0.1172, "num_input_tokens_seen": 178685632, "step": 82800 }, { "epoch": 13.50815660685155, "grad_norm": 1.3530348539352417, "learning_rate": 1.440294344338231e-05, "loss": 0.0404, "num_input_tokens_seen": 178695328, "step": 82805 }, { "epoch": 13.508972267536706, "grad_norm": 1.1826727390289307, "learning_rate": 1.4399720110933788e-05, "loss": 0.1833, "num_input_tokens_seen": 178705952, "step": 82810 }, { "epoch": 13.50978792822186, "grad_norm": 1.7139548063278198, "learning_rate": 1.4396496993313363e-05, "loss": 0.1071, "num_input_tokens_seen": 178716416, "step": 82815 }, { "epoch": 13.510603588907015, "grad_norm": 0.5253949761390686, "learning_rate": 1.43932740905864e-05, "loss": 0.0678, "num_input_tokens_seen": 178727936, "step": 82820 }, { "epoch": 13.51141924959217, "grad_norm": 0.1172512099146843, "learning_rate": 1.4390051402818172e-05, "loss": 0.0953, "num_input_tokens_seen": 178737760, "step": 82825 }, { "epoch": 13.512234910277325, "grad_norm": 0.05162058025598526, "learning_rate": 1.438682893007403e-05, "loss": 0.1202, "num_input_tokens_seen": 178748320, "step": 82830 }, { "epoch": 13.513050570962479, "grad_norm": 0.4654304087162018, "learning_rate": 1.4383606672419247e-05, "loss": 0.0543, "num_input_tokens_seen": 178758656, "step": 82835 }, { "epoch": 13.513866231647635, "grad_norm": 1.7564302682876587, "learning_rate": 1.4380384629919163e-05, "loss": 0.0721, "num_input_tokens_seen": 178770560, "step": 82840 }, { "epoch": 13.51468189233279, "grad_norm": 0.047109782695770264, "learning_rate": 1.4377162802639032e-05, "loss": 0.059, "num_input_tokens_seen": 178781536, "step": 82845 }, { "epoch": 13.515497553017944, "grad_norm": 2.104228973388672, "learning_rate": 1.4373941190644196e-05, "loss": 0.1365, "num_input_tokens_seen": 178793024, "step": 82850 }, { "epoch": 13.5163132137031, "grad_norm": 0.17499645054340363, "learning_rate": 1.4370719793999904e-05, "loss": 0.0155, "num_input_tokens_seen": 178804736, "step": 82855 }, { "epoch": 13.517128874388254, "grad_norm": 0.16524240374565125, "learning_rate": 1.4367498612771476e-05, "loss": 0.0317, "num_input_tokens_seen": 178816672, "step": 82860 }, { "epoch": 13.51794453507341, "grad_norm": 1.9029518365859985, "learning_rate": 1.436427764702416e-05, "loss": 0.3245, "num_input_tokens_seen": 178827072, "step": 82865 }, { "epoch": 13.518760195758565, "grad_norm": 2.506760597229004, "learning_rate": 1.4361056896823255e-05, "loss": 0.1688, "num_input_tokens_seen": 178837792, "step": 82870 }, { "epoch": 13.51957585644372, "grad_norm": 0.12220606952905655, "learning_rate": 1.4357836362234035e-05, "loss": 0.0349, "num_input_tokens_seen": 178846624, "step": 82875 }, { "epoch": 13.520391517128875, "grad_norm": 0.43039679527282715, "learning_rate": 1.4354616043321756e-05, "loss": 0.0304, "num_input_tokens_seen": 178857088, "step": 82880 }, { "epoch": 13.521207177814029, "grad_norm": 0.9881238341331482, "learning_rate": 1.4351395940151691e-05, "loss": 0.1041, "num_input_tokens_seen": 178867744, "step": 82885 }, { "epoch": 13.522022838499185, "grad_norm": 0.7911864519119263, "learning_rate": 1.4348176052789093e-05, "loss": 0.1319, "num_input_tokens_seen": 178877664, "step": 82890 }, { "epoch": 13.522838499184338, "grad_norm": 2.4429402351379395, "learning_rate": 1.434495638129922e-05, "loss": 0.1822, "num_input_tokens_seen": 178887680, "step": 82895 }, { "epoch": 13.523654159869494, "grad_norm": 1.624914526939392, "learning_rate": 1.4341736925747326e-05, "loss": 0.1447, "num_input_tokens_seen": 178898688, "step": 82900 }, { "epoch": 13.52446982055465, "grad_norm": 0.8278146386146545, "learning_rate": 1.4338517686198651e-05, "loss": 0.1632, "num_input_tokens_seen": 178908864, "step": 82905 }, { "epoch": 13.525285481239804, "grad_norm": 1.8249926567077637, "learning_rate": 1.4335298662718443e-05, "loss": 0.1122, "num_input_tokens_seen": 178919200, "step": 82910 }, { "epoch": 13.52610114192496, "grad_norm": 1.5163836479187012, "learning_rate": 1.4332079855371936e-05, "loss": 0.1516, "num_input_tokens_seen": 178929440, "step": 82915 }, { "epoch": 13.526916802610113, "grad_norm": 0.026381582021713257, "learning_rate": 1.4328861264224353e-05, "loss": 0.007, "num_input_tokens_seen": 178940064, "step": 82920 }, { "epoch": 13.52773246329527, "grad_norm": 1.8163877725601196, "learning_rate": 1.4325642889340954e-05, "loss": 0.1095, "num_input_tokens_seen": 178950912, "step": 82925 }, { "epoch": 13.528548123980425, "grad_norm": 0.06810686737298965, "learning_rate": 1.4322424730786924e-05, "loss": 0.0902, "num_input_tokens_seen": 178962752, "step": 82930 }, { "epoch": 13.529363784665579, "grad_norm": 1.4774360656738281, "learning_rate": 1.431920678862752e-05, "loss": 0.0682, "num_input_tokens_seen": 178974048, "step": 82935 }, { "epoch": 13.530179445350734, "grad_norm": 2.1094295978546143, "learning_rate": 1.4315989062927924e-05, "loss": 0.1516, "num_input_tokens_seen": 178983872, "step": 82940 }, { "epoch": 13.530995106035888, "grad_norm": 1.382082462310791, "learning_rate": 1.4312771553753378e-05, "loss": 0.3273, "num_input_tokens_seen": 178994848, "step": 82945 }, { "epoch": 13.531810766721044, "grad_norm": 1.8289320468902588, "learning_rate": 1.4309554261169059e-05, "loss": 0.2518, "num_input_tokens_seen": 179006528, "step": 82950 }, { "epoch": 13.5326264274062, "grad_norm": 1.811120867729187, "learning_rate": 1.4306337185240204e-05, "loss": 0.1199, "num_input_tokens_seen": 179016416, "step": 82955 }, { "epoch": 13.533442088091354, "grad_norm": 0.11947686225175858, "learning_rate": 1.4303120326031968e-05, "loss": 0.2324, "num_input_tokens_seen": 179027232, "step": 82960 }, { "epoch": 13.53425774877651, "grad_norm": 0.10391854494810104, "learning_rate": 1.4299903683609592e-05, "loss": 0.0422, "num_input_tokens_seen": 179039008, "step": 82965 }, { "epoch": 13.535073409461663, "grad_norm": 0.06327836215496063, "learning_rate": 1.4296687258038222e-05, "loss": 0.0342, "num_input_tokens_seen": 179050656, "step": 82970 }, { "epoch": 13.535889070146819, "grad_norm": 1.264397144317627, "learning_rate": 1.4293471049383082e-05, "loss": 0.0896, "num_input_tokens_seen": 179060416, "step": 82975 }, { "epoch": 13.536704730831975, "grad_norm": 1.1105581521987915, "learning_rate": 1.4290255057709311e-05, "loss": 0.0436, "num_input_tokens_seen": 179072320, "step": 82980 }, { "epoch": 13.537520391517129, "grad_norm": 0.08106829971075058, "learning_rate": 1.4287039283082132e-05, "loss": 0.1398, "num_input_tokens_seen": 179082144, "step": 82985 }, { "epoch": 13.538336052202284, "grad_norm": 0.533846914768219, "learning_rate": 1.428382372556667e-05, "loss": 0.2116, "num_input_tokens_seen": 179092896, "step": 82990 }, { "epoch": 13.539151712887438, "grad_norm": 0.5019603967666626, "learning_rate": 1.4280608385228133e-05, "loss": 0.2128, "num_input_tokens_seen": 179104672, "step": 82995 }, { "epoch": 13.539967373572594, "grad_norm": 3.05413556098938, "learning_rate": 1.427739326213165e-05, "loss": 0.1294, "num_input_tokens_seen": 179114208, "step": 83000 }, { "epoch": 13.540783034257748, "grad_norm": 0.16389991343021393, "learning_rate": 1.42741783563424e-05, "loss": 0.016, "num_input_tokens_seen": 179124768, "step": 83005 }, { "epoch": 13.541598694942904, "grad_norm": 1.8001872301101685, "learning_rate": 1.4270963667925535e-05, "loss": 0.1175, "num_input_tokens_seen": 179135104, "step": 83010 }, { "epoch": 13.54241435562806, "grad_norm": 0.2711174190044403, "learning_rate": 1.42677491969462e-05, "loss": 0.0597, "num_input_tokens_seen": 179146432, "step": 83015 }, { "epoch": 13.543230016313213, "grad_norm": 0.07829415053129196, "learning_rate": 1.4264534943469546e-05, "loss": 0.1799, "num_input_tokens_seen": 179156704, "step": 83020 }, { "epoch": 13.544045676998369, "grad_norm": 0.13273103535175323, "learning_rate": 1.426132090756071e-05, "loss": 0.1814, "num_input_tokens_seen": 179167040, "step": 83025 }, { "epoch": 13.544861337683523, "grad_norm": 0.052251219749450684, "learning_rate": 1.4258107089284827e-05, "loss": 0.1881, "num_input_tokens_seen": 179178912, "step": 83030 }, { "epoch": 13.545676998368679, "grad_norm": 0.12355323135852814, "learning_rate": 1.425489348870703e-05, "loss": 0.0399, "num_input_tokens_seen": 179189824, "step": 83035 }, { "epoch": 13.546492659053834, "grad_norm": 0.1825941801071167, "learning_rate": 1.4251680105892449e-05, "loss": 0.0772, "num_input_tokens_seen": 179199424, "step": 83040 }, { "epoch": 13.547308319738988, "grad_norm": 0.2114318609237671, "learning_rate": 1.4248466940906205e-05, "loss": 0.128, "num_input_tokens_seen": 179210560, "step": 83045 }, { "epoch": 13.548123980424144, "grad_norm": 0.9246374368667603, "learning_rate": 1.4245253993813419e-05, "loss": 0.0894, "num_input_tokens_seen": 179220384, "step": 83050 }, { "epoch": 13.548939641109298, "grad_norm": 1.896039366722107, "learning_rate": 1.4242041264679201e-05, "loss": 0.118, "num_input_tokens_seen": 179232416, "step": 83055 }, { "epoch": 13.549755301794454, "grad_norm": 1.738221526145935, "learning_rate": 1.4238828753568667e-05, "loss": 0.3723, "num_input_tokens_seen": 179242944, "step": 83060 }, { "epoch": 13.550570962479608, "grad_norm": 2.098820209503174, "learning_rate": 1.4235616460546911e-05, "loss": 0.2665, "num_input_tokens_seen": 179254464, "step": 83065 }, { "epoch": 13.551386623164763, "grad_norm": 0.09685193002223969, "learning_rate": 1.423240438567906e-05, "loss": 0.0222, "num_input_tokens_seen": 179265728, "step": 83070 }, { "epoch": 13.552202283849919, "grad_norm": 0.07129096984863281, "learning_rate": 1.4229192529030178e-05, "loss": 0.0771, "num_input_tokens_seen": 179276160, "step": 83075 }, { "epoch": 13.553017944535073, "grad_norm": 0.1586148887872696, "learning_rate": 1.4225980890665389e-05, "loss": 0.0202, "num_input_tokens_seen": 179287264, "step": 83080 }, { "epoch": 13.553833605220229, "grad_norm": 1.0995172262191772, "learning_rate": 1.4222769470649746e-05, "loss": 0.1124, "num_input_tokens_seen": 179297696, "step": 83085 }, { "epoch": 13.554649265905383, "grad_norm": 0.5935029983520508, "learning_rate": 1.4219558269048373e-05, "loss": 0.1216, "num_input_tokens_seen": 179307008, "step": 83090 }, { "epoch": 13.555464926590538, "grad_norm": 0.8599728941917419, "learning_rate": 1.4216347285926307e-05, "loss": 0.053, "num_input_tokens_seen": 179318816, "step": 83095 }, { "epoch": 13.556280587275694, "grad_norm": 1.245374083518982, "learning_rate": 1.4213136521348663e-05, "loss": 0.184, "num_input_tokens_seen": 179330208, "step": 83100 }, { "epoch": 13.557096247960848, "grad_norm": 0.6003170609474182, "learning_rate": 1.420992597538047e-05, "loss": 0.0861, "num_input_tokens_seen": 179340448, "step": 83105 }, { "epoch": 13.557911908646004, "grad_norm": 2.1484060287475586, "learning_rate": 1.4206715648086828e-05, "loss": 0.1489, "num_input_tokens_seen": 179350272, "step": 83110 }, { "epoch": 13.558727569331158, "grad_norm": 1.1759504079818726, "learning_rate": 1.4203505539532785e-05, "loss": 0.0599, "num_input_tokens_seen": 179360992, "step": 83115 }, { "epoch": 13.559543230016313, "grad_norm": 0.09330237656831741, "learning_rate": 1.4200295649783395e-05, "loss": 0.136, "num_input_tokens_seen": 179372000, "step": 83120 }, { "epoch": 13.560358890701469, "grad_norm": 2.0060479640960693, "learning_rate": 1.4197085978903718e-05, "loss": 0.087, "num_input_tokens_seen": 179384032, "step": 83125 }, { "epoch": 13.561174551386623, "grad_norm": 0.05672914907336235, "learning_rate": 1.41938765269588e-05, "loss": 0.0992, "num_input_tokens_seen": 179395072, "step": 83130 }, { "epoch": 13.561990212071779, "grad_norm": 0.09541556984186172, "learning_rate": 1.4190667294013677e-05, "loss": 0.0547, "num_input_tokens_seen": 179407392, "step": 83135 }, { "epoch": 13.562805872756933, "grad_norm": 1.9460413455963135, "learning_rate": 1.4187458280133397e-05, "loss": 0.1194, "num_input_tokens_seen": 179418528, "step": 83140 }, { "epoch": 13.563621533442088, "grad_norm": 1.1292519569396973, "learning_rate": 1.4184249485382992e-05, "loss": 0.107, "num_input_tokens_seen": 179429376, "step": 83145 }, { "epoch": 13.564437194127244, "grad_norm": 0.620526134967804, "learning_rate": 1.4181040909827492e-05, "loss": 0.0422, "num_input_tokens_seen": 179439936, "step": 83150 }, { "epoch": 13.565252854812398, "grad_norm": 0.08281121402978897, "learning_rate": 1.4177832553531922e-05, "loss": 0.0213, "num_input_tokens_seen": 179450336, "step": 83155 }, { "epoch": 13.566068515497554, "grad_norm": 1.5829015970230103, "learning_rate": 1.4174624416561302e-05, "loss": 0.1023, "num_input_tokens_seen": 179460544, "step": 83160 }, { "epoch": 13.566884176182707, "grad_norm": 1.607973337173462, "learning_rate": 1.4171416498980655e-05, "loss": 0.1525, "num_input_tokens_seen": 179470688, "step": 83165 }, { "epoch": 13.567699836867863, "grad_norm": 2.648639678955078, "learning_rate": 1.4168208800854987e-05, "loss": 0.1668, "num_input_tokens_seen": 179481920, "step": 83170 }, { "epoch": 13.568515497553017, "grad_norm": 0.08774513006210327, "learning_rate": 1.4165001322249311e-05, "loss": 0.1244, "num_input_tokens_seen": 179492288, "step": 83175 }, { "epoch": 13.569331158238173, "grad_norm": 0.7392784953117371, "learning_rate": 1.4161794063228628e-05, "loss": 0.0327, "num_input_tokens_seen": 179501568, "step": 83180 }, { "epoch": 13.570146818923329, "grad_norm": 0.2965083420276642, "learning_rate": 1.4158587023857936e-05, "loss": 0.122, "num_input_tokens_seen": 179513184, "step": 83185 }, { "epoch": 13.570962479608482, "grad_norm": 1.050952434539795, "learning_rate": 1.4155380204202232e-05, "loss": 0.0281, "num_input_tokens_seen": 179523712, "step": 83190 }, { "epoch": 13.571778140293638, "grad_norm": 0.11086911708116531, "learning_rate": 1.4152173604326504e-05, "loss": 0.0912, "num_input_tokens_seen": 179535264, "step": 83195 }, { "epoch": 13.572593800978792, "grad_norm": 2.121920108795166, "learning_rate": 1.4148967224295742e-05, "loss": 0.1086, "num_input_tokens_seen": 179547424, "step": 83200 }, { "epoch": 13.573409461663948, "grad_norm": 0.0758233293890953, "learning_rate": 1.4145761064174922e-05, "loss": 0.0162, "num_input_tokens_seen": 179558112, "step": 83205 }, { "epoch": 13.574225122349104, "grad_norm": 0.08260143548250198, "learning_rate": 1.4142555124029016e-05, "loss": 0.2112, "num_input_tokens_seen": 179569056, "step": 83210 }, { "epoch": 13.575040783034257, "grad_norm": 0.041219115257263184, "learning_rate": 1.4139349403923024e-05, "loss": 0.0094, "num_input_tokens_seen": 179579840, "step": 83215 }, { "epoch": 13.575856443719413, "grad_norm": 0.0461159311234951, "learning_rate": 1.4136143903921873e-05, "loss": 0.0755, "num_input_tokens_seen": 179590624, "step": 83220 }, { "epoch": 13.576672104404567, "grad_norm": 0.03063630498945713, "learning_rate": 1.4132938624090557e-05, "loss": 0.0223, "num_input_tokens_seen": 179601504, "step": 83225 }, { "epoch": 13.577487765089723, "grad_norm": 0.15292228758335114, "learning_rate": 1.412973356449403e-05, "loss": 0.014, "num_input_tokens_seen": 179611264, "step": 83230 }, { "epoch": 13.578303425774878, "grad_norm": 0.3812592625617981, "learning_rate": 1.4126528725197235e-05, "loss": 0.1474, "num_input_tokens_seen": 179621280, "step": 83235 }, { "epoch": 13.579119086460032, "grad_norm": 0.07936372607946396, "learning_rate": 1.4123324106265135e-05, "loss": 0.0351, "num_input_tokens_seen": 179632256, "step": 83240 }, { "epoch": 13.579934747145188, "grad_norm": 0.07794280350208282, "learning_rate": 1.412011970776267e-05, "loss": 0.1152, "num_input_tokens_seen": 179642464, "step": 83245 }, { "epoch": 13.580750407830342, "grad_norm": 1.6958884000778198, "learning_rate": 1.4116915529754776e-05, "loss": 0.1079, "num_input_tokens_seen": 179654976, "step": 83250 }, { "epoch": 13.581566068515498, "grad_norm": 0.9642257690429688, "learning_rate": 1.4113711572306398e-05, "loss": 0.0654, "num_input_tokens_seen": 179664608, "step": 83255 }, { "epoch": 13.582381729200652, "grad_norm": 2.088197708129883, "learning_rate": 1.4110507835482468e-05, "loss": 0.2907, "num_input_tokens_seen": 179674784, "step": 83260 }, { "epoch": 13.583197389885807, "grad_norm": 0.2701336443424225, "learning_rate": 1.4107304319347908e-05, "loss": 0.0177, "num_input_tokens_seen": 179684608, "step": 83265 }, { "epoch": 13.584013050570963, "grad_norm": 0.5416926145553589, "learning_rate": 1.4104101023967645e-05, "loss": 0.1208, "num_input_tokens_seen": 179694368, "step": 83270 }, { "epoch": 13.584828711256117, "grad_norm": 0.2669161558151245, "learning_rate": 1.4100897949406597e-05, "loss": 0.2415, "num_input_tokens_seen": 179703872, "step": 83275 }, { "epoch": 13.585644371941273, "grad_norm": 0.42429912090301514, "learning_rate": 1.409769509572968e-05, "loss": 0.0282, "num_input_tokens_seen": 179715872, "step": 83280 }, { "epoch": 13.586460032626427, "grad_norm": 0.12111078947782516, "learning_rate": 1.4094492463001802e-05, "loss": 0.0115, "num_input_tokens_seen": 179726976, "step": 83285 }, { "epoch": 13.587275693311582, "grad_norm": 0.44447755813598633, "learning_rate": 1.4091290051287865e-05, "loss": 0.0183, "num_input_tokens_seen": 179738016, "step": 83290 }, { "epoch": 13.588091353996738, "grad_norm": 0.040837571024894714, "learning_rate": 1.4088087860652777e-05, "loss": 0.1094, "num_input_tokens_seen": 179747520, "step": 83295 }, { "epoch": 13.588907014681892, "grad_norm": 0.49396002292633057, "learning_rate": 1.4084885891161431e-05, "loss": 0.0273, "num_input_tokens_seen": 179757696, "step": 83300 }, { "epoch": 13.589722675367048, "grad_norm": 0.16598036885261536, "learning_rate": 1.4081684142878721e-05, "loss": 0.019, "num_input_tokens_seen": 179768160, "step": 83305 }, { "epoch": 13.590538336052202, "grad_norm": 0.08612238615751266, "learning_rate": 1.4078482615869531e-05, "loss": 0.0335, "num_input_tokens_seen": 179780032, "step": 83310 }, { "epoch": 13.591353996737357, "grad_norm": 0.22158561646938324, "learning_rate": 1.4075281310198746e-05, "loss": 0.0754, "num_input_tokens_seen": 179790400, "step": 83315 }, { "epoch": 13.592169657422513, "grad_norm": 0.0836004689335823, "learning_rate": 1.4072080225931245e-05, "loss": 0.0499, "num_input_tokens_seen": 179800960, "step": 83320 }, { "epoch": 13.592985318107667, "grad_norm": 0.10789427161216736, "learning_rate": 1.4068879363131899e-05, "loss": 0.1226, "num_input_tokens_seen": 179811232, "step": 83325 }, { "epoch": 13.593800978792823, "grad_norm": 1.9183568954467773, "learning_rate": 1.4065678721865572e-05, "loss": 0.028, "num_input_tokens_seen": 179822464, "step": 83330 }, { "epoch": 13.594616639477977, "grad_norm": 0.04982323199510574, "learning_rate": 1.4062478302197152e-05, "loss": 0.1092, "num_input_tokens_seen": 179833472, "step": 83335 }, { "epoch": 13.595432300163132, "grad_norm": 0.48445603251457214, "learning_rate": 1.405927810419147e-05, "loss": 0.0816, "num_input_tokens_seen": 179843840, "step": 83340 }, { "epoch": 13.596247960848288, "grad_norm": 1.4253126382827759, "learning_rate": 1.4056078127913414e-05, "loss": 0.1628, "num_input_tokens_seen": 179855584, "step": 83345 }, { "epoch": 13.597063621533442, "grad_norm": 3.0619306564331055, "learning_rate": 1.4052878373427799e-05, "loss": 0.0968, "num_input_tokens_seen": 179866080, "step": 83350 }, { "epoch": 13.597879282218598, "grad_norm": 0.028874188661575317, "learning_rate": 1.4049678840799502e-05, "loss": 0.2629, "num_input_tokens_seen": 179877472, "step": 83355 }, { "epoch": 13.598694942903752, "grad_norm": 0.19851075112819672, "learning_rate": 1.4046479530093353e-05, "loss": 0.1365, "num_input_tokens_seen": 179887456, "step": 83360 }, { "epoch": 13.599510603588907, "grad_norm": 0.09663704037666321, "learning_rate": 1.4043280441374193e-05, "loss": 0.0574, "num_input_tokens_seen": 179899936, "step": 83365 }, { "epoch": 13.600326264274061, "grad_norm": 1.5624885559082031, "learning_rate": 1.4040081574706853e-05, "loss": 0.0442, "num_input_tokens_seen": 179911040, "step": 83370 }, { "epoch": 13.601141924959217, "grad_norm": 2.3184053897857666, "learning_rate": 1.4036882930156165e-05, "loss": 0.2008, "num_input_tokens_seen": 179922176, "step": 83375 }, { "epoch": 13.601957585644373, "grad_norm": 0.027053818106651306, "learning_rate": 1.4033684507786954e-05, "loss": 0.043, "num_input_tokens_seen": 179933440, "step": 83380 }, { "epoch": 13.602773246329527, "grad_norm": 0.2400713413953781, "learning_rate": 1.4030486307664034e-05, "loss": 0.1756, "num_input_tokens_seen": 179944128, "step": 83385 }, { "epoch": 13.603588907014682, "grad_norm": 0.2529054284095764, "learning_rate": 1.4027288329852228e-05, "loss": 0.1887, "num_input_tokens_seen": 179955520, "step": 83390 }, { "epoch": 13.604404567699836, "grad_norm": 0.9454895853996277, "learning_rate": 1.4024090574416343e-05, "loss": 0.1208, "num_input_tokens_seen": 179967520, "step": 83395 }, { "epoch": 13.605220228384992, "grad_norm": 1.150532841682434, "learning_rate": 1.4020893041421184e-05, "loss": 0.1808, "num_input_tokens_seen": 179979040, "step": 83400 }, { "epoch": 13.606035889070148, "grad_norm": 1.9233418703079224, "learning_rate": 1.4017695730931557e-05, "loss": 0.179, "num_input_tokens_seen": 179990624, "step": 83405 }, { "epoch": 13.606851549755302, "grad_norm": 1.6702003479003906, "learning_rate": 1.401449864301226e-05, "loss": 0.1729, "num_input_tokens_seen": 180002976, "step": 83410 }, { "epoch": 13.607667210440457, "grad_norm": 0.31115108728408813, "learning_rate": 1.4011301777728083e-05, "loss": 0.0582, "num_input_tokens_seen": 180014880, "step": 83415 }, { "epoch": 13.608482871125611, "grad_norm": 0.15671639144420624, "learning_rate": 1.4008105135143818e-05, "loss": 0.0629, "num_input_tokens_seen": 180025568, "step": 83420 }, { "epoch": 13.609298531810767, "grad_norm": 0.6373993158340454, "learning_rate": 1.400490871532424e-05, "loss": 0.0656, "num_input_tokens_seen": 180035616, "step": 83425 }, { "epoch": 13.61011419249592, "grad_norm": 0.30219215154647827, "learning_rate": 1.4001712518334143e-05, "loss": 0.0336, "num_input_tokens_seen": 180047136, "step": 83430 }, { "epoch": 13.610929853181077, "grad_norm": 0.10616283118724823, "learning_rate": 1.3998516544238278e-05, "loss": 0.1954, "num_input_tokens_seen": 180057952, "step": 83435 }, { "epoch": 13.611745513866232, "grad_norm": 0.7350236177444458, "learning_rate": 1.3995320793101453e-05, "loss": 0.0614, "num_input_tokens_seen": 180067680, "step": 83440 }, { "epoch": 13.612561174551386, "grad_norm": 2.669013738632202, "learning_rate": 1.3992125264988392e-05, "loss": 0.1061, "num_input_tokens_seen": 180079104, "step": 83445 }, { "epoch": 13.613376835236542, "grad_norm": 0.17100612819194794, "learning_rate": 1.3988929959963901e-05, "loss": 0.0439, "num_input_tokens_seen": 180090400, "step": 83450 }, { "epoch": 13.614192495921696, "grad_norm": 0.09592237323522568, "learning_rate": 1.398573487809269e-05, "loss": 0.1758, "num_input_tokens_seen": 180101504, "step": 83455 }, { "epoch": 13.615008156606851, "grad_norm": 1.9364994764328003, "learning_rate": 1.3982540019439555e-05, "loss": 0.1303, "num_input_tokens_seen": 180111488, "step": 83460 }, { "epoch": 13.615823817292007, "grad_norm": 0.5018119215965271, "learning_rate": 1.3979345384069204e-05, "loss": 0.0301, "num_input_tokens_seen": 180122880, "step": 83465 }, { "epoch": 13.616639477977161, "grad_norm": 2.7095556259155273, "learning_rate": 1.3976150972046415e-05, "loss": 0.1727, "num_input_tokens_seen": 180133504, "step": 83470 }, { "epoch": 13.617455138662317, "grad_norm": 0.8232817649841309, "learning_rate": 1.3972956783435897e-05, "loss": 0.1652, "num_input_tokens_seen": 180145120, "step": 83475 }, { "epoch": 13.61827079934747, "grad_norm": 0.8487611413002014, "learning_rate": 1.3969762818302415e-05, "loss": 0.1629, "num_input_tokens_seen": 180155904, "step": 83480 }, { "epoch": 13.619086460032626, "grad_norm": 0.45848649740219116, "learning_rate": 1.3966569076710664e-05, "loss": 0.0964, "num_input_tokens_seen": 180166112, "step": 83485 }, { "epoch": 13.619902120717782, "grad_norm": 1.916274070739746, "learning_rate": 1.3963375558725406e-05, "loss": 0.172, "num_input_tokens_seen": 180177824, "step": 83490 }, { "epoch": 13.620717781402936, "grad_norm": 0.031729165464639664, "learning_rate": 1.3960182264411326e-05, "loss": 0.1317, "num_input_tokens_seen": 180188800, "step": 83495 }, { "epoch": 13.621533442088092, "grad_norm": 0.09786845743656158, "learning_rate": 1.3956989193833165e-05, "loss": 0.0102, "num_input_tokens_seen": 180198944, "step": 83500 }, { "epoch": 13.622349102773246, "grad_norm": 0.04560495913028717, "learning_rate": 1.3953796347055625e-05, "loss": 0.2077, "num_input_tokens_seen": 180209440, "step": 83505 }, { "epoch": 13.623164763458401, "grad_norm": 0.04147855192422867, "learning_rate": 1.3950603724143418e-05, "loss": 0.1301, "num_input_tokens_seen": 180220288, "step": 83510 }, { "epoch": 13.623980424143557, "grad_norm": 0.12304338812828064, "learning_rate": 1.3947411325161247e-05, "loss": 0.1242, "num_input_tokens_seen": 180231552, "step": 83515 }, { "epoch": 13.624796084828711, "grad_norm": 0.37412694096565247, "learning_rate": 1.3944219150173803e-05, "loss": 0.1113, "num_input_tokens_seen": 180244384, "step": 83520 }, { "epoch": 13.625611745513867, "grad_norm": 0.7477430701255798, "learning_rate": 1.3941027199245781e-05, "loss": 0.1381, "num_input_tokens_seen": 180255936, "step": 83525 }, { "epoch": 13.62642740619902, "grad_norm": 0.5074164867401123, "learning_rate": 1.3937835472441876e-05, "loss": 0.1433, "num_input_tokens_seen": 180266368, "step": 83530 }, { "epoch": 13.627243066884176, "grad_norm": 0.08622585237026215, "learning_rate": 1.3934643969826766e-05, "loss": 0.0925, "num_input_tokens_seen": 180277376, "step": 83535 }, { "epoch": 13.62805872756933, "grad_norm": 0.10593428462743759, "learning_rate": 1.3931452691465135e-05, "loss": 0.0739, "num_input_tokens_seen": 180287232, "step": 83540 }, { "epoch": 13.628874388254486, "grad_norm": 0.2759089171886444, "learning_rate": 1.3928261637421658e-05, "loss": 0.112, "num_input_tokens_seen": 180297888, "step": 83545 }, { "epoch": 13.629690048939642, "grad_norm": 2.128878593444824, "learning_rate": 1.3925070807760993e-05, "loss": 0.182, "num_input_tokens_seen": 180309152, "step": 83550 }, { "epoch": 13.630505709624796, "grad_norm": 0.04654868319630623, "learning_rate": 1.3921880202547834e-05, "loss": 0.0557, "num_input_tokens_seen": 180319552, "step": 83555 }, { "epoch": 13.631321370309951, "grad_norm": 0.6864942908287048, "learning_rate": 1.3918689821846808e-05, "loss": 0.1345, "num_input_tokens_seen": 180328832, "step": 83560 }, { "epoch": 13.632137030995105, "grad_norm": 0.4074864685535431, "learning_rate": 1.3915499665722614e-05, "loss": 0.0796, "num_input_tokens_seen": 180338752, "step": 83565 }, { "epoch": 13.632952691680261, "grad_norm": 0.13197995722293854, "learning_rate": 1.3912309734239858e-05, "loss": 0.0601, "num_input_tokens_seen": 180349312, "step": 83570 }, { "epoch": 13.633768352365417, "grad_norm": 1.7798833847045898, "learning_rate": 1.3909120027463232e-05, "loss": 0.1293, "num_input_tokens_seen": 180360544, "step": 83575 }, { "epoch": 13.63458401305057, "grad_norm": 0.054678112268447876, "learning_rate": 1.3905930545457338e-05, "loss": 0.0313, "num_input_tokens_seen": 180371136, "step": 83580 }, { "epoch": 13.635399673735726, "grad_norm": 1.4746973514556885, "learning_rate": 1.3902741288286856e-05, "loss": 0.0318, "num_input_tokens_seen": 180382400, "step": 83585 }, { "epoch": 13.63621533442088, "grad_norm": 3.126415967941284, "learning_rate": 1.389955225601638e-05, "loss": 0.0767, "num_input_tokens_seen": 180393280, "step": 83590 }, { "epoch": 13.637030995106036, "grad_norm": 0.5453575253486633, "learning_rate": 1.3896363448710576e-05, "loss": 0.0215, "num_input_tokens_seen": 180403840, "step": 83595 }, { "epoch": 13.63784665579119, "grad_norm": 2.1405866146087646, "learning_rate": 1.3893174866434035e-05, "loss": 0.1594, "num_input_tokens_seen": 180415200, "step": 83600 }, { "epoch": 13.638662316476346, "grad_norm": 0.5893659591674805, "learning_rate": 1.3889986509251418e-05, "loss": 0.0256, "num_input_tokens_seen": 180426688, "step": 83605 }, { "epoch": 13.639477977161501, "grad_norm": 1.7385042905807495, "learning_rate": 1.3886798377227295e-05, "loss": 0.2312, "num_input_tokens_seen": 180436480, "step": 83610 }, { "epoch": 13.640293637846655, "grad_norm": 0.09643278270959854, "learning_rate": 1.3883610470426323e-05, "loss": 0.04, "num_input_tokens_seen": 180446976, "step": 83615 }, { "epoch": 13.641109298531811, "grad_norm": 0.029557501897215843, "learning_rate": 1.3880422788913067e-05, "loss": 0.0439, "num_input_tokens_seen": 180458656, "step": 83620 }, { "epoch": 13.641924959216965, "grad_norm": 1.5468591451644897, "learning_rate": 1.3877235332752168e-05, "loss": 0.063, "num_input_tokens_seen": 180468448, "step": 83625 }, { "epoch": 13.64274061990212, "grad_norm": 0.599174439907074, "learning_rate": 1.3874048102008186e-05, "loss": 0.0236, "num_input_tokens_seen": 180478784, "step": 83630 }, { "epoch": 13.643556280587276, "grad_norm": 0.17769049108028412, "learning_rate": 1.3870861096745751e-05, "loss": 0.0812, "num_input_tokens_seen": 180490496, "step": 83635 }, { "epoch": 13.64437194127243, "grad_norm": 0.11683005839586258, "learning_rate": 1.3867674317029416e-05, "loss": 0.1463, "num_input_tokens_seen": 180500416, "step": 83640 }, { "epoch": 13.645187601957586, "grad_norm": 0.5099381804466248, "learning_rate": 1.3864487762923794e-05, "loss": 0.1982, "num_input_tokens_seen": 180510720, "step": 83645 }, { "epoch": 13.64600326264274, "grad_norm": 2.306138515472412, "learning_rate": 1.3861301434493451e-05, "loss": 0.1741, "num_input_tokens_seen": 180522080, "step": 83650 }, { "epoch": 13.646818923327896, "grad_norm": 0.25886771082878113, "learning_rate": 1.3858115331802968e-05, "loss": 0.0849, "num_input_tokens_seen": 180532096, "step": 83655 }, { "epoch": 13.647634584013051, "grad_norm": 2.078927755355835, "learning_rate": 1.3854929454916913e-05, "loss": 0.1808, "num_input_tokens_seen": 180543264, "step": 83660 }, { "epoch": 13.648450244698205, "grad_norm": 1.6632754802703857, "learning_rate": 1.385174380389985e-05, "loss": 0.1794, "num_input_tokens_seen": 180553408, "step": 83665 }, { "epoch": 13.649265905383361, "grad_norm": 1.0174270868301392, "learning_rate": 1.384855837881634e-05, "loss": 0.0918, "num_input_tokens_seen": 180563520, "step": 83670 }, { "epoch": 13.650081566068515, "grad_norm": 0.03420085832476616, "learning_rate": 1.3845373179730941e-05, "loss": 0.0308, "num_input_tokens_seen": 180574528, "step": 83675 }, { "epoch": 13.65089722675367, "grad_norm": 0.11405736953020096, "learning_rate": 1.3842188206708206e-05, "loss": 0.0296, "num_input_tokens_seen": 180585536, "step": 83680 }, { "epoch": 13.651712887438826, "grad_norm": 0.12816770374774933, "learning_rate": 1.3839003459812683e-05, "loss": 0.2007, "num_input_tokens_seen": 180596480, "step": 83685 }, { "epoch": 13.65252854812398, "grad_norm": 1.649356722831726, "learning_rate": 1.3835818939108913e-05, "loss": 0.1046, "num_input_tokens_seen": 180607936, "step": 83690 }, { "epoch": 13.653344208809136, "grad_norm": 0.0699593797326088, "learning_rate": 1.3832634644661425e-05, "loss": 0.028, "num_input_tokens_seen": 180618720, "step": 83695 }, { "epoch": 13.65415986949429, "grad_norm": 0.13145558536052704, "learning_rate": 1.382945057653478e-05, "loss": 0.052, "num_input_tokens_seen": 180630144, "step": 83700 }, { "epoch": 13.654975530179446, "grad_norm": 1.8960764408111572, "learning_rate": 1.3826266734793474e-05, "loss": 0.5284, "num_input_tokens_seen": 180641888, "step": 83705 }, { "epoch": 13.655791190864601, "grad_norm": 0.181407630443573, "learning_rate": 1.3823083119502068e-05, "loss": 0.0613, "num_input_tokens_seen": 180653344, "step": 83710 }, { "epoch": 13.656606851549755, "grad_norm": 0.92678302526474, "learning_rate": 1.3819899730725039e-05, "loss": 0.0755, "num_input_tokens_seen": 180664480, "step": 83715 }, { "epoch": 13.65742251223491, "grad_norm": 0.04242353141307831, "learning_rate": 1.3816716568526944e-05, "loss": 0.0216, "num_input_tokens_seen": 180674944, "step": 83720 }, { "epoch": 13.658238172920065, "grad_norm": 1.6306480169296265, "learning_rate": 1.3813533632972256e-05, "loss": 0.0725, "num_input_tokens_seen": 180686688, "step": 83725 }, { "epoch": 13.65905383360522, "grad_norm": 0.21809732913970947, "learning_rate": 1.381035092412552e-05, "loss": 0.1191, "num_input_tokens_seen": 180696960, "step": 83730 }, { "epoch": 13.659869494290374, "grad_norm": 2.3590474128723145, "learning_rate": 1.3807168442051196e-05, "loss": 0.0762, "num_input_tokens_seen": 180706784, "step": 83735 }, { "epoch": 13.66068515497553, "grad_norm": 0.7177374958992004, "learning_rate": 1.3803986186813825e-05, "loss": 0.153, "num_input_tokens_seen": 180717184, "step": 83740 }, { "epoch": 13.661500815660686, "grad_norm": 0.03784840181469917, "learning_rate": 1.3800804158477856e-05, "loss": 0.2068, "num_input_tokens_seen": 180727360, "step": 83745 }, { "epoch": 13.66231647634584, "grad_norm": 3.3612284660339355, "learning_rate": 1.3797622357107808e-05, "loss": 0.2238, "num_input_tokens_seen": 180739040, "step": 83750 }, { "epoch": 13.663132137030995, "grad_norm": 0.051524873822927475, "learning_rate": 1.3794440782768153e-05, "loss": 0.2068, "num_input_tokens_seen": 180749024, "step": 83755 }, { "epoch": 13.66394779771615, "grad_norm": 0.7104042768478394, "learning_rate": 1.3791259435523373e-05, "loss": 0.2566, "num_input_tokens_seen": 180760224, "step": 83760 }, { "epoch": 13.664763458401305, "grad_norm": 0.29813212156295776, "learning_rate": 1.3788078315437938e-05, "loss": 0.0389, "num_input_tokens_seen": 180770560, "step": 83765 }, { "epoch": 13.66557911908646, "grad_norm": 0.38604310154914856, "learning_rate": 1.378489742257632e-05, "loss": 0.1123, "num_input_tokens_seen": 180781984, "step": 83770 }, { "epoch": 13.666394779771615, "grad_norm": 0.06008743867278099, "learning_rate": 1.3781716757002982e-05, "loss": 0.0497, "num_input_tokens_seen": 180792992, "step": 83775 }, { "epoch": 13.66721044045677, "grad_norm": 2.283287763595581, "learning_rate": 1.3778536318782387e-05, "loss": 0.0374, "num_input_tokens_seen": 180802976, "step": 83780 }, { "epoch": 13.668026101141924, "grad_norm": 1.9783482551574707, "learning_rate": 1.377535610797899e-05, "loss": 0.1073, "num_input_tokens_seen": 180813664, "step": 83785 }, { "epoch": 13.66884176182708, "grad_norm": 0.03158547729253769, "learning_rate": 1.3772176124657239e-05, "loss": 0.1848, "num_input_tokens_seen": 180823968, "step": 83790 }, { "epoch": 13.669657422512234, "grad_norm": 0.04622502624988556, "learning_rate": 1.3768996368881582e-05, "loss": 0.1178, "num_input_tokens_seen": 180834528, "step": 83795 }, { "epoch": 13.67047308319739, "grad_norm": 1.0949214696884155, "learning_rate": 1.3765816840716463e-05, "loss": 0.0752, "num_input_tokens_seen": 180846176, "step": 83800 }, { "epoch": 13.671288743882545, "grad_norm": 0.8256412744522095, "learning_rate": 1.3762637540226317e-05, "loss": 0.0453, "num_input_tokens_seen": 180855744, "step": 83805 }, { "epoch": 13.6721044045677, "grad_norm": 1.6645596027374268, "learning_rate": 1.3759458467475575e-05, "loss": 0.1541, "num_input_tokens_seen": 180867168, "step": 83810 }, { "epoch": 13.672920065252855, "grad_norm": 0.24647612869739532, "learning_rate": 1.3756279622528667e-05, "loss": 0.119, "num_input_tokens_seen": 180878048, "step": 83815 }, { "epoch": 13.673735725938009, "grad_norm": 2.23616886138916, "learning_rate": 1.375310100545002e-05, "loss": 0.0791, "num_input_tokens_seen": 180889440, "step": 83820 }, { "epoch": 13.674551386623165, "grad_norm": 0.1633257269859314, "learning_rate": 1.3749922616304044e-05, "loss": 0.0606, "num_input_tokens_seen": 180899584, "step": 83825 }, { "epoch": 13.67536704730832, "grad_norm": 2.368298292160034, "learning_rate": 1.3746744455155159e-05, "loss": 0.0538, "num_input_tokens_seen": 180910624, "step": 83830 }, { "epoch": 13.676182707993474, "grad_norm": 3.0420167446136475, "learning_rate": 1.3743566522067775e-05, "loss": 0.1227, "num_input_tokens_seen": 180920928, "step": 83835 }, { "epoch": 13.67699836867863, "grad_norm": 0.14176136255264282, "learning_rate": 1.3740388817106284e-05, "loss": 0.1516, "num_input_tokens_seen": 180930048, "step": 83840 }, { "epoch": 13.677814029363784, "grad_norm": 0.17290975153446198, "learning_rate": 1.3737211340335119e-05, "loss": 0.2078, "num_input_tokens_seen": 180942368, "step": 83845 }, { "epoch": 13.67862969004894, "grad_norm": 0.04696352034807205, "learning_rate": 1.3734034091818631e-05, "loss": 0.0996, "num_input_tokens_seen": 180954400, "step": 83850 }, { "epoch": 13.679445350734095, "grad_norm": 0.1625932902097702, "learning_rate": 1.3730857071621245e-05, "loss": 0.0499, "num_input_tokens_seen": 180966400, "step": 83855 }, { "epoch": 13.68026101141925, "grad_norm": 0.6318919062614441, "learning_rate": 1.372768027980734e-05, "loss": 0.0349, "num_input_tokens_seen": 180976480, "step": 83860 }, { "epoch": 13.681076672104405, "grad_norm": 2.576011896133423, "learning_rate": 1.3724503716441289e-05, "loss": 0.2331, "num_input_tokens_seen": 180987808, "step": 83865 }, { "epoch": 13.681892332789559, "grad_norm": 0.06248467415571213, "learning_rate": 1.372132738158748e-05, "loss": 0.0209, "num_input_tokens_seen": 180997088, "step": 83870 }, { "epoch": 13.682707993474715, "grad_norm": 1.166721224784851, "learning_rate": 1.3718151275310273e-05, "loss": 0.2374, "num_input_tokens_seen": 181008352, "step": 83875 }, { "epoch": 13.68352365415987, "grad_norm": 0.04023667052388191, "learning_rate": 1.3714975397674046e-05, "loss": 0.05, "num_input_tokens_seen": 181020256, "step": 83880 }, { "epoch": 13.684339314845024, "grad_norm": 0.37510421872138977, "learning_rate": 1.371179974874316e-05, "loss": 0.0236, "num_input_tokens_seen": 181031392, "step": 83885 }, { "epoch": 13.68515497553018, "grad_norm": 0.6177892684936523, "learning_rate": 1.370862432858197e-05, "loss": 0.1855, "num_input_tokens_seen": 181042304, "step": 83890 }, { "epoch": 13.685970636215334, "grad_norm": 0.10761089622974396, "learning_rate": 1.370544913725483e-05, "loss": 0.1177, "num_input_tokens_seen": 181052960, "step": 83895 }, { "epoch": 13.68678629690049, "grad_norm": 0.062286585569381714, "learning_rate": 1.3702274174826096e-05, "loss": 0.1404, "num_input_tokens_seen": 181064096, "step": 83900 }, { "epoch": 13.687601957585644, "grad_norm": 0.21399301290512085, "learning_rate": 1.3699099441360105e-05, "loss": 0.0474, "num_input_tokens_seen": 181074112, "step": 83905 }, { "epoch": 13.6884176182708, "grad_norm": 0.2943771481513977, "learning_rate": 1.3695924936921201e-05, "loss": 0.028, "num_input_tokens_seen": 181083520, "step": 83910 }, { "epoch": 13.689233278955955, "grad_norm": 0.4242311120033264, "learning_rate": 1.3692750661573717e-05, "loss": 0.2048, "num_input_tokens_seen": 181093984, "step": 83915 }, { "epoch": 13.690048939641109, "grad_norm": 0.2836724519729614, "learning_rate": 1.3689576615381989e-05, "loss": 0.1657, "num_input_tokens_seen": 181104096, "step": 83920 }, { "epoch": 13.690864600326265, "grad_norm": 0.06081240251660347, "learning_rate": 1.3686402798410336e-05, "loss": 0.051, "num_input_tokens_seen": 181114816, "step": 83925 }, { "epoch": 13.691680261011419, "grad_norm": 0.06021466851234436, "learning_rate": 1.368322921072308e-05, "loss": 0.0369, "num_input_tokens_seen": 181125664, "step": 83930 }, { "epoch": 13.692495921696574, "grad_norm": 0.013525022193789482, "learning_rate": 1.3680055852384547e-05, "loss": 0.0313, "num_input_tokens_seen": 181135136, "step": 83935 }, { "epoch": 13.69331158238173, "grad_norm": 0.22204671800136566, "learning_rate": 1.367688272345904e-05, "loss": 0.0229, "num_input_tokens_seen": 181147808, "step": 83940 }, { "epoch": 13.694127243066884, "grad_norm": 0.7516202330589294, "learning_rate": 1.3673709824010867e-05, "loss": 0.0484, "num_input_tokens_seen": 181159584, "step": 83945 }, { "epoch": 13.69494290375204, "grad_norm": 0.4277510941028595, "learning_rate": 1.3670537154104335e-05, "loss": 0.0205, "num_input_tokens_seen": 181170496, "step": 83950 }, { "epoch": 13.695758564437194, "grad_norm": 0.8131223917007446, "learning_rate": 1.3667364713803737e-05, "loss": 0.4194, "num_input_tokens_seen": 181181120, "step": 83955 }, { "epoch": 13.69657422512235, "grad_norm": 0.06049283966422081, "learning_rate": 1.3664192503173373e-05, "loss": 0.1957, "num_input_tokens_seen": 181192352, "step": 83960 }, { "epoch": 13.697389885807503, "grad_norm": 2.6771857738494873, "learning_rate": 1.3661020522277523e-05, "loss": 0.0927, "num_input_tokens_seen": 181202112, "step": 83965 }, { "epoch": 13.698205546492659, "grad_norm": 0.135065495967865, "learning_rate": 1.3657848771180474e-05, "loss": 0.0876, "num_input_tokens_seen": 181213760, "step": 83970 }, { "epoch": 13.699021207177815, "grad_norm": 1.349208950996399, "learning_rate": 1.3654677249946524e-05, "loss": 0.1434, "num_input_tokens_seen": 181223104, "step": 83975 }, { "epoch": 13.699836867862969, "grad_norm": 0.19282375276088715, "learning_rate": 1.3651505958639913e-05, "loss": 0.0573, "num_input_tokens_seen": 181232512, "step": 83980 }, { "epoch": 13.700652528548124, "grad_norm": 0.13377326726913452, "learning_rate": 1.3648334897324938e-05, "loss": 0.1168, "num_input_tokens_seen": 181243968, "step": 83985 }, { "epoch": 13.701468189233278, "grad_norm": 0.0522114597260952, "learning_rate": 1.364516406606586e-05, "loss": 0.0449, "num_input_tokens_seen": 181253856, "step": 83990 }, { "epoch": 13.702283849918434, "grad_norm": 3.565052032470703, "learning_rate": 1.3641993464926935e-05, "loss": 0.2968, "num_input_tokens_seen": 181264736, "step": 83995 }, { "epoch": 13.70309951060359, "grad_norm": 0.0181354321539402, "learning_rate": 1.3638823093972426e-05, "loss": 0.021, "num_input_tokens_seen": 181274976, "step": 84000 }, { "epoch": 13.703915171288743, "grad_norm": 1.7262934446334839, "learning_rate": 1.3635652953266572e-05, "loss": 0.1725, "num_input_tokens_seen": 181285344, "step": 84005 }, { "epoch": 13.7047308319739, "grad_norm": 1.0910422801971436, "learning_rate": 1.3632483042873634e-05, "loss": 0.1067, "num_input_tokens_seen": 181297024, "step": 84010 }, { "epoch": 13.705546492659053, "grad_norm": 0.852122962474823, "learning_rate": 1.3629313362857843e-05, "loss": 0.1374, "num_input_tokens_seen": 181308032, "step": 84015 }, { "epoch": 13.706362153344209, "grad_norm": 2.4557838439941406, "learning_rate": 1.3626143913283445e-05, "loss": 0.2281, "num_input_tokens_seen": 181319232, "step": 84020 }, { "epoch": 13.707177814029365, "grad_norm": 0.8140450716018677, "learning_rate": 1.3622974694214668e-05, "loss": 0.0512, "num_input_tokens_seen": 181329472, "step": 84025 }, { "epoch": 13.707993474714518, "grad_norm": 0.052282437682151794, "learning_rate": 1.361980570571574e-05, "loss": 0.0191, "num_input_tokens_seen": 181340448, "step": 84030 }, { "epoch": 13.708809135399674, "grad_norm": 0.032885320484638214, "learning_rate": 1.3616636947850886e-05, "loss": 0.3636, "num_input_tokens_seen": 181352064, "step": 84035 }, { "epoch": 13.709624796084828, "grad_norm": 2.029240131378174, "learning_rate": 1.3613468420684328e-05, "loss": 0.1939, "num_input_tokens_seen": 181363360, "step": 84040 }, { "epoch": 13.710440456769984, "grad_norm": 0.06011710315942764, "learning_rate": 1.3610300124280278e-05, "loss": 0.1047, "num_input_tokens_seen": 181374432, "step": 84045 }, { "epoch": 13.71125611745514, "grad_norm": 3.1607913970947266, "learning_rate": 1.3607132058702942e-05, "loss": 0.1196, "num_input_tokens_seen": 181385472, "step": 84050 }, { "epoch": 13.712071778140293, "grad_norm": 0.9663771390914917, "learning_rate": 1.360396422401653e-05, "loss": 0.0726, "num_input_tokens_seen": 181396896, "step": 84055 }, { "epoch": 13.71288743882545, "grad_norm": 0.15831948816776276, "learning_rate": 1.3600796620285239e-05, "loss": 0.025, "num_input_tokens_seen": 181406912, "step": 84060 }, { "epoch": 13.713703099510603, "grad_norm": 0.028069039806723595, "learning_rate": 1.3597629247573263e-05, "loss": 0.0794, "num_input_tokens_seen": 181417216, "step": 84065 }, { "epoch": 13.714518760195759, "grad_norm": 1.3576045036315918, "learning_rate": 1.3594462105944797e-05, "loss": 0.094, "num_input_tokens_seen": 181428384, "step": 84070 }, { "epoch": 13.715334420880914, "grad_norm": 0.6680601239204407, "learning_rate": 1.3591295195464016e-05, "loss": 0.1525, "num_input_tokens_seen": 181439168, "step": 84075 }, { "epoch": 13.716150081566068, "grad_norm": 0.3231727182865143, "learning_rate": 1.358812851619513e-05, "loss": 0.1675, "num_input_tokens_seen": 181449536, "step": 84080 }, { "epoch": 13.716965742251224, "grad_norm": 0.13751980662345886, "learning_rate": 1.3584962068202279e-05, "loss": 0.1544, "num_input_tokens_seen": 181459648, "step": 84085 }, { "epoch": 13.717781402936378, "grad_norm": 0.04256328567862511, "learning_rate": 1.3581795851549666e-05, "loss": 0.0941, "num_input_tokens_seen": 181469568, "step": 84090 }, { "epoch": 13.718597063621534, "grad_norm": 1.2345548868179321, "learning_rate": 1.3578629866301432e-05, "loss": 0.1653, "num_input_tokens_seen": 181480512, "step": 84095 }, { "epoch": 13.719412724306688, "grad_norm": 0.16411463916301727, "learning_rate": 1.3575464112521772e-05, "loss": 0.1166, "num_input_tokens_seen": 181491072, "step": 84100 }, { "epoch": 13.720228384991843, "grad_norm": 0.27144917845726013, "learning_rate": 1.3572298590274801e-05, "loss": 0.1607, "num_input_tokens_seen": 181503392, "step": 84105 }, { "epoch": 13.721044045676999, "grad_norm": 0.6397104263305664, "learning_rate": 1.356913329962472e-05, "loss": 0.0754, "num_input_tokens_seen": 181513888, "step": 84110 }, { "epoch": 13.721859706362153, "grad_norm": 2.1926088333129883, "learning_rate": 1.3565968240635632e-05, "loss": 0.1556, "num_input_tokens_seen": 181524416, "step": 84115 }, { "epoch": 13.722675367047309, "grad_norm": 0.20573341846466064, "learning_rate": 1.3562803413371722e-05, "loss": 0.0761, "num_input_tokens_seen": 181535584, "step": 84120 }, { "epoch": 13.723491027732463, "grad_norm": 0.998799741268158, "learning_rate": 1.3559638817897091e-05, "loss": 0.1794, "num_input_tokens_seen": 181546080, "step": 84125 }, { "epoch": 13.724306688417618, "grad_norm": 0.31531116366386414, "learning_rate": 1.3556474454275903e-05, "loss": 0.0156, "num_input_tokens_seen": 181557824, "step": 84130 }, { "epoch": 13.725122349102774, "grad_norm": 0.9548677802085876, "learning_rate": 1.3553310322572276e-05, "loss": 0.1946, "num_input_tokens_seen": 181567360, "step": 84135 }, { "epoch": 13.725938009787928, "grad_norm": 0.12964224815368652, "learning_rate": 1.3550146422850338e-05, "loss": 0.0089, "num_input_tokens_seen": 181578528, "step": 84140 }, { "epoch": 13.726753670473084, "grad_norm": 0.07400067150592804, "learning_rate": 1.354698275517421e-05, "loss": 0.0147, "num_input_tokens_seen": 181589568, "step": 84145 }, { "epoch": 13.727569331158238, "grad_norm": 0.39425894618034363, "learning_rate": 1.3543819319607997e-05, "loss": 0.1596, "num_input_tokens_seen": 181601248, "step": 84150 }, { "epoch": 13.728384991843393, "grad_norm": 0.661973237991333, "learning_rate": 1.3540656116215828e-05, "loss": 0.0457, "num_input_tokens_seen": 181611680, "step": 84155 }, { "epoch": 13.729200652528547, "grad_norm": 0.16973929107189178, "learning_rate": 1.3537493145061797e-05, "loss": 0.0639, "num_input_tokens_seen": 181622560, "step": 84160 }, { "epoch": 13.730016313213703, "grad_norm": 0.1409459412097931, "learning_rate": 1.3534330406210005e-05, "loss": 0.2068, "num_input_tokens_seen": 181633408, "step": 84165 }, { "epoch": 13.730831973898859, "grad_norm": 0.7135701179504395, "learning_rate": 1.3531167899724556e-05, "loss": 0.3447, "num_input_tokens_seen": 181644320, "step": 84170 }, { "epoch": 13.731647634584013, "grad_norm": 1.37894868850708, "learning_rate": 1.3528005625669538e-05, "loss": 0.1511, "num_input_tokens_seen": 181654688, "step": 84175 }, { "epoch": 13.732463295269168, "grad_norm": 0.269563764333725, "learning_rate": 1.3524843584109032e-05, "loss": 0.1054, "num_input_tokens_seen": 181663936, "step": 84180 }, { "epoch": 13.733278955954322, "grad_norm": 0.2232072800397873, "learning_rate": 1.3521681775107146e-05, "loss": 0.1659, "num_input_tokens_seen": 181673664, "step": 84185 }, { "epoch": 13.734094616639478, "grad_norm": 0.25219932198524475, "learning_rate": 1.3518520198727918e-05, "loss": 0.0432, "num_input_tokens_seen": 181684320, "step": 84190 }, { "epoch": 13.734910277324634, "grad_norm": 0.2528882622718811, "learning_rate": 1.3515358855035465e-05, "loss": 0.0209, "num_input_tokens_seen": 181695616, "step": 84195 }, { "epoch": 13.735725938009788, "grad_norm": 2.0146636962890625, "learning_rate": 1.3512197744093815e-05, "loss": 0.3026, "num_input_tokens_seen": 181704512, "step": 84200 }, { "epoch": 13.736541598694943, "grad_norm": 0.073723703622818, "learning_rate": 1.350903686596707e-05, "loss": 0.0691, "num_input_tokens_seen": 181715552, "step": 84205 }, { "epoch": 13.737357259380097, "grad_norm": 0.4033137261867523, "learning_rate": 1.3505876220719252e-05, "loss": 0.0263, "num_input_tokens_seen": 181726240, "step": 84210 }, { "epoch": 13.738172920065253, "grad_norm": 0.06443940848112106, "learning_rate": 1.3502715808414452e-05, "loss": 0.0519, "num_input_tokens_seen": 181737280, "step": 84215 }, { "epoch": 13.738988580750409, "grad_norm": 1.6541093587875366, "learning_rate": 1.3499555629116682e-05, "loss": 0.125, "num_input_tokens_seen": 181747968, "step": 84220 }, { "epoch": 13.739804241435563, "grad_norm": 0.037310317158699036, "learning_rate": 1.3496395682890025e-05, "loss": 0.0335, "num_input_tokens_seen": 181759616, "step": 84225 }, { "epoch": 13.740619902120718, "grad_norm": 0.7173768877983093, "learning_rate": 1.3493235969798485e-05, "loss": 0.2537, "num_input_tokens_seen": 181771168, "step": 84230 }, { "epoch": 13.741435562805872, "grad_norm": 0.13473834097385406, "learning_rate": 1.3490076489906134e-05, "loss": 0.0352, "num_input_tokens_seen": 181781568, "step": 84235 }, { "epoch": 13.742251223491028, "grad_norm": 0.04544651880860329, "learning_rate": 1.3486917243276964e-05, "loss": 0.0212, "num_input_tokens_seen": 181791424, "step": 84240 }, { "epoch": 13.743066884176184, "grad_norm": 2.891183614730835, "learning_rate": 1.3483758229975044e-05, "loss": 0.1942, "num_input_tokens_seen": 181801376, "step": 84245 }, { "epoch": 13.743882544861338, "grad_norm": 0.07748350501060486, "learning_rate": 1.3480599450064351e-05, "loss": 0.0892, "num_input_tokens_seen": 181812736, "step": 84250 }, { "epoch": 13.744698205546493, "grad_norm": 0.7380479574203491, "learning_rate": 1.3477440903608946e-05, "loss": 0.09, "num_input_tokens_seen": 181822784, "step": 84255 }, { "epoch": 13.745513866231647, "grad_norm": 0.19886134564876556, "learning_rate": 1.3474282590672793e-05, "loss": 0.08, "num_input_tokens_seen": 181833504, "step": 84260 }, { "epoch": 13.746329526916803, "grad_norm": 0.03857235983014107, "learning_rate": 1.3471124511319949e-05, "loss": 0.1011, "num_input_tokens_seen": 181844768, "step": 84265 }, { "epoch": 13.747145187601957, "grad_norm": 0.07466436177492142, "learning_rate": 1.3467966665614373e-05, "loss": 0.1156, "num_input_tokens_seen": 181855456, "step": 84270 }, { "epoch": 13.747960848287113, "grad_norm": 0.20236365497112274, "learning_rate": 1.3464809053620087e-05, "loss": 0.0237, "num_input_tokens_seen": 181867648, "step": 84275 }, { "epoch": 13.748776508972268, "grad_norm": 0.567420244216919, "learning_rate": 1.3461651675401083e-05, "loss": 0.1306, "num_input_tokens_seen": 181877472, "step": 84280 }, { "epoch": 13.749592169657422, "grad_norm": 0.059721022844314575, "learning_rate": 1.3458494531021343e-05, "loss": 0.0705, "num_input_tokens_seen": 181888224, "step": 84285 }, { "epoch": 13.750407830342578, "grad_norm": 0.37639060616493225, "learning_rate": 1.3455337620544856e-05, "loss": 0.1177, "num_input_tokens_seen": 181899616, "step": 84290 }, { "epoch": 13.751223491027732, "grad_norm": 0.1002596989274025, "learning_rate": 1.3452180944035597e-05, "loss": 0.1326, "num_input_tokens_seen": 181910624, "step": 84295 }, { "epoch": 13.752039151712887, "grad_norm": 0.5246875882148743, "learning_rate": 1.344902450155754e-05, "loss": 0.2361, "num_input_tokens_seen": 181920416, "step": 84300 }, { "epoch": 13.752854812398043, "grad_norm": 0.40701115131378174, "learning_rate": 1.3445868293174651e-05, "loss": 0.1052, "num_input_tokens_seen": 181931488, "step": 84305 }, { "epoch": 13.753670473083197, "grad_norm": 0.15933185815811157, "learning_rate": 1.34427123189509e-05, "loss": 0.1997, "num_input_tokens_seen": 181942272, "step": 84310 }, { "epoch": 13.754486133768353, "grad_norm": 0.4497774541378021, "learning_rate": 1.3439556578950249e-05, "loss": 0.0801, "num_input_tokens_seen": 181952448, "step": 84315 }, { "epoch": 13.755301794453507, "grad_norm": 0.7165252566337585, "learning_rate": 1.3436401073236649e-05, "loss": 0.0264, "num_input_tokens_seen": 181962976, "step": 84320 }, { "epoch": 13.756117455138662, "grad_norm": 2.76389479637146, "learning_rate": 1.343324580187404e-05, "loss": 0.2346, "num_input_tokens_seen": 181972672, "step": 84325 }, { "epoch": 13.756933115823816, "grad_norm": 0.8122569918632507, "learning_rate": 1.3430090764926394e-05, "loss": 0.0381, "num_input_tokens_seen": 181983616, "step": 84330 }, { "epoch": 13.757748776508972, "grad_norm": 0.3564693033695221, "learning_rate": 1.3426935962457621e-05, "loss": 0.0152, "num_input_tokens_seen": 181995296, "step": 84335 }, { "epoch": 13.758564437194128, "grad_norm": 0.16827793419361115, "learning_rate": 1.342378139453169e-05, "loss": 0.0966, "num_input_tokens_seen": 182006016, "step": 84340 }, { "epoch": 13.759380097879282, "grad_norm": 0.17516952753067017, "learning_rate": 1.3420627061212497e-05, "loss": 0.0246, "num_input_tokens_seen": 182016800, "step": 84345 }, { "epoch": 13.760195758564437, "grad_norm": 0.17151206731796265, "learning_rate": 1.3417472962564004e-05, "loss": 0.0402, "num_input_tokens_seen": 182027392, "step": 84350 }, { "epoch": 13.761011419249591, "grad_norm": 0.10595542192459106, "learning_rate": 1.3414319098650094e-05, "loss": 0.1136, "num_input_tokens_seen": 182039232, "step": 84355 }, { "epoch": 13.761827079934747, "grad_norm": 0.42075538635253906, "learning_rate": 1.3411165469534726e-05, "loss": 0.1088, "num_input_tokens_seen": 182050848, "step": 84360 }, { "epoch": 13.762642740619903, "grad_norm": 0.10338807851076126, "learning_rate": 1.3408012075281773e-05, "loss": 0.07, "num_input_tokens_seen": 182061088, "step": 84365 }, { "epoch": 13.763458401305057, "grad_norm": 0.055142972618341446, "learning_rate": 1.3404858915955176e-05, "loss": 0.1236, "num_input_tokens_seen": 182072064, "step": 84370 }, { "epoch": 13.764274061990212, "grad_norm": 0.07106640189886093, "learning_rate": 1.3401705991618808e-05, "loss": 0.0223, "num_input_tokens_seen": 182082272, "step": 84375 }, { "epoch": 13.765089722675366, "grad_norm": 0.33728572726249695, "learning_rate": 1.3398553302336597e-05, "loss": 0.0422, "num_input_tokens_seen": 182094176, "step": 84380 }, { "epoch": 13.765905383360522, "grad_norm": 0.13180823624134064, "learning_rate": 1.3395400848172407e-05, "loss": 0.0446, "num_input_tokens_seen": 182105600, "step": 84385 }, { "epoch": 13.766721044045678, "grad_norm": 0.11440286785364151, "learning_rate": 1.3392248629190146e-05, "loss": 0.0747, "num_input_tokens_seen": 182117120, "step": 84390 }, { "epoch": 13.767536704730832, "grad_norm": 0.15505903959274292, "learning_rate": 1.3389096645453691e-05, "loss": 0.204, "num_input_tokens_seen": 182128032, "step": 84395 }, { "epoch": 13.768352365415987, "grad_norm": 0.3583214282989502, "learning_rate": 1.3385944897026923e-05, "loss": 0.1283, "num_input_tokens_seen": 182139104, "step": 84400 }, { "epoch": 13.769168026101141, "grad_norm": 0.25044044852256775, "learning_rate": 1.3382793383973715e-05, "loss": 0.1081, "num_input_tokens_seen": 182149024, "step": 84405 }, { "epoch": 13.769983686786297, "grad_norm": 1.1738548278808594, "learning_rate": 1.3379642106357937e-05, "loss": 0.2408, "num_input_tokens_seen": 182160352, "step": 84410 }, { "epoch": 13.770799347471453, "grad_norm": 0.07412159442901611, "learning_rate": 1.3376491064243451e-05, "loss": 0.0448, "num_input_tokens_seen": 182172000, "step": 84415 }, { "epoch": 13.771615008156607, "grad_norm": 1.03252112865448, "learning_rate": 1.3373340257694122e-05, "loss": 0.0508, "num_input_tokens_seen": 182182624, "step": 84420 }, { "epoch": 13.772430668841762, "grad_norm": 1.8090856075286865, "learning_rate": 1.3370189686773805e-05, "loss": 0.0505, "num_input_tokens_seen": 182193152, "step": 84425 }, { "epoch": 13.773246329526916, "grad_norm": 1.6334552764892578, "learning_rate": 1.3367039351546343e-05, "loss": 0.0602, "num_input_tokens_seen": 182203456, "step": 84430 }, { "epoch": 13.774061990212072, "grad_norm": 0.0396743081510067, "learning_rate": 1.3363889252075585e-05, "loss": 0.1469, "num_input_tokens_seen": 182213728, "step": 84435 }, { "epoch": 13.774877650897226, "grad_norm": 0.6958248615264893, "learning_rate": 1.3360739388425375e-05, "loss": 0.0226, "num_input_tokens_seen": 182224832, "step": 84440 }, { "epoch": 13.775693311582382, "grad_norm": 1.2570252418518066, "learning_rate": 1.3357589760659548e-05, "loss": 0.1069, "num_input_tokens_seen": 182235168, "step": 84445 }, { "epoch": 13.776508972267537, "grad_norm": 1.5456982851028442, "learning_rate": 1.335444036884193e-05, "loss": 0.0369, "num_input_tokens_seen": 182246336, "step": 84450 }, { "epoch": 13.777324632952691, "grad_norm": 0.06542519479990005, "learning_rate": 1.3351291213036354e-05, "loss": 0.0974, "num_input_tokens_seen": 182257920, "step": 84455 }, { "epoch": 13.778140293637847, "grad_norm": 0.060193076729774475, "learning_rate": 1.3348142293306632e-05, "loss": 0.0811, "num_input_tokens_seen": 182267840, "step": 84460 }, { "epoch": 13.778955954323001, "grad_norm": 0.16956152021884918, "learning_rate": 1.3344993609716602e-05, "loss": 0.2578, "num_input_tokens_seen": 182279328, "step": 84465 }, { "epoch": 13.779771615008157, "grad_norm": 0.44466209411621094, "learning_rate": 1.3341845162330044e-05, "loss": 0.1452, "num_input_tokens_seen": 182291168, "step": 84470 }, { "epoch": 13.780587275693312, "grad_norm": 0.029037974774837494, "learning_rate": 1.3338696951210806e-05, "loss": 0.0208, "num_input_tokens_seen": 182302144, "step": 84475 }, { "epoch": 13.781402936378466, "grad_norm": 0.11354006826877594, "learning_rate": 1.3335548976422643e-05, "loss": 0.2086, "num_input_tokens_seen": 182313248, "step": 84480 }, { "epoch": 13.782218597063622, "grad_norm": 3.5630252361297607, "learning_rate": 1.3332401238029401e-05, "loss": 0.1756, "num_input_tokens_seen": 182323968, "step": 84485 }, { "epoch": 13.783034257748776, "grad_norm": 0.17680920660495758, "learning_rate": 1.3329253736094827e-05, "loss": 0.0465, "num_input_tokens_seen": 182334848, "step": 84490 }, { "epoch": 13.783849918433932, "grad_norm": 0.03315218910574913, "learning_rate": 1.3326106470682742e-05, "loss": 0.1569, "num_input_tokens_seen": 182346176, "step": 84495 }, { "epoch": 13.784665579119086, "grad_norm": 0.5229934453964233, "learning_rate": 1.332295944185692e-05, "loss": 0.1373, "num_input_tokens_seen": 182356832, "step": 84500 }, { "epoch": 13.785481239804241, "grad_norm": 0.3769393563270569, "learning_rate": 1.3319812649681137e-05, "loss": 0.0801, "num_input_tokens_seen": 182367200, "step": 84505 }, { "epoch": 13.786296900489397, "grad_norm": 0.046664535999298096, "learning_rate": 1.3316666094219169e-05, "loss": 0.055, "num_input_tokens_seen": 182377952, "step": 84510 }, { "epoch": 13.78711256117455, "grad_norm": 1.4839173555374146, "learning_rate": 1.3313519775534784e-05, "loss": 0.0417, "num_input_tokens_seen": 182388352, "step": 84515 }, { "epoch": 13.787928221859707, "grad_norm": 0.0753621980547905, "learning_rate": 1.3310373693691747e-05, "loss": 0.1085, "num_input_tokens_seen": 182399424, "step": 84520 }, { "epoch": 13.78874388254486, "grad_norm": 0.20980732142925262, "learning_rate": 1.3307227848753812e-05, "loss": 0.0162, "num_input_tokens_seen": 182411232, "step": 84525 }, { "epoch": 13.789559543230016, "grad_norm": 1.3353105783462524, "learning_rate": 1.3304082240784744e-05, "loss": 0.0778, "num_input_tokens_seen": 182422272, "step": 84530 }, { "epoch": 13.790375203915172, "grad_norm": 2.4389305114746094, "learning_rate": 1.3300936869848283e-05, "loss": 0.2373, "num_input_tokens_seen": 182433856, "step": 84535 }, { "epoch": 13.791190864600326, "grad_norm": 0.2703353464603424, "learning_rate": 1.3297791736008177e-05, "loss": 0.0119, "num_input_tokens_seen": 182444256, "step": 84540 }, { "epoch": 13.792006525285482, "grad_norm": 0.47467663884162903, "learning_rate": 1.3294646839328167e-05, "loss": 0.0641, "num_input_tokens_seen": 182455136, "step": 84545 }, { "epoch": 13.792822185970635, "grad_norm": 0.1510898619890213, "learning_rate": 1.329150217987199e-05, "loss": 0.0483, "num_input_tokens_seen": 182465344, "step": 84550 }, { "epoch": 13.793637846655791, "grad_norm": 0.12189261615276337, "learning_rate": 1.3288357757703374e-05, "loss": 0.0599, "num_input_tokens_seen": 182475968, "step": 84555 }, { "epoch": 13.794453507340947, "grad_norm": 0.7297714352607727, "learning_rate": 1.3285213572886046e-05, "loss": 0.0781, "num_input_tokens_seen": 182485440, "step": 84560 }, { "epoch": 13.7952691680261, "grad_norm": 0.9349077343940735, "learning_rate": 1.3282069625483726e-05, "loss": 0.0781, "num_input_tokens_seen": 182496000, "step": 84565 }, { "epoch": 13.796084828711257, "grad_norm": 0.18750832974910736, "learning_rate": 1.3278925915560128e-05, "loss": 0.0769, "num_input_tokens_seen": 182506336, "step": 84570 }, { "epoch": 13.79690048939641, "grad_norm": 1.2488301992416382, "learning_rate": 1.327578244317897e-05, "loss": 0.1015, "num_input_tokens_seen": 182517568, "step": 84575 }, { "epoch": 13.797716150081566, "grad_norm": 0.04359155520796776, "learning_rate": 1.3272639208403948e-05, "loss": 0.0072, "num_input_tokens_seen": 182528416, "step": 84580 }, { "epoch": 13.798531810766722, "grad_norm": 1.573868989944458, "learning_rate": 1.3269496211298774e-05, "loss": 0.0775, "num_input_tokens_seen": 182538432, "step": 84585 }, { "epoch": 13.799347471451876, "grad_norm": 0.07262402027845383, "learning_rate": 1.3266353451927138e-05, "loss": 0.1229, "num_input_tokens_seen": 182549632, "step": 84590 }, { "epoch": 13.800163132137031, "grad_norm": 0.024491582065820694, "learning_rate": 1.3263210930352737e-05, "loss": 0.0418, "num_input_tokens_seen": 182560768, "step": 84595 }, { "epoch": 13.800978792822185, "grad_norm": 0.17795446515083313, "learning_rate": 1.3260068646639246e-05, "loss": 0.0531, "num_input_tokens_seen": 182572704, "step": 84600 }, { "epoch": 13.801794453507341, "grad_norm": 1.7668135166168213, "learning_rate": 1.3256926600850364e-05, "loss": 0.2052, "num_input_tokens_seen": 182583232, "step": 84605 }, { "epoch": 13.802610114192497, "grad_norm": 1.2238733768463135, "learning_rate": 1.3253784793049765e-05, "loss": 0.1373, "num_input_tokens_seen": 182594144, "step": 84610 }, { "epoch": 13.80342577487765, "grad_norm": 0.07071459293365479, "learning_rate": 1.3250643223301115e-05, "loss": 0.1795, "num_input_tokens_seen": 182604672, "step": 84615 }, { "epoch": 13.804241435562806, "grad_norm": 1.040078043937683, "learning_rate": 1.3247501891668085e-05, "loss": 0.1688, "num_input_tokens_seen": 182615360, "step": 84620 }, { "epoch": 13.80505709624796, "grad_norm": 0.6765803694725037, "learning_rate": 1.3244360798214343e-05, "loss": 0.1505, "num_input_tokens_seen": 182626016, "step": 84625 }, { "epoch": 13.805872756933116, "grad_norm": 0.04986214265227318, "learning_rate": 1.3241219943003538e-05, "loss": 0.0684, "num_input_tokens_seen": 182636928, "step": 84630 }, { "epoch": 13.80668841761827, "grad_norm": 1.9132496118545532, "learning_rate": 1.3238079326099328e-05, "loss": 0.0581, "num_input_tokens_seen": 182647392, "step": 84635 }, { "epoch": 13.807504078303426, "grad_norm": 0.031770817935466766, "learning_rate": 1.3234938947565365e-05, "loss": 0.0817, "num_input_tokens_seen": 182658272, "step": 84640 }, { "epoch": 13.808319738988581, "grad_norm": 0.11587905883789062, "learning_rate": 1.3231798807465288e-05, "loss": 0.0599, "num_input_tokens_seen": 182668736, "step": 84645 }, { "epoch": 13.809135399673735, "grad_norm": 0.662243127822876, "learning_rate": 1.3228658905862734e-05, "loss": 0.1438, "num_input_tokens_seen": 182679552, "step": 84650 }, { "epoch": 13.809951060358891, "grad_norm": 0.0934557095170021, "learning_rate": 1.3225519242821343e-05, "loss": 0.0657, "num_input_tokens_seen": 182689568, "step": 84655 }, { "epoch": 13.810766721044045, "grad_norm": 0.7736760377883911, "learning_rate": 1.322237981840474e-05, "loss": 0.0567, "num_input_tokens_seen": 182699680, "step": 84660 }, { "epoch": 13.8115823817292, "grad_norm": 0.15705226361751556, "learning_rate": 1.3219240632676555e-05, "loss": 0.1161, "num_input_tokens_seen": 182710848, "step": 84665 }, { "epoch": 13.812398042414356, "grad_norm": 0.13688333332538605, "learning_rate": 1.32161016857004e-05, "loss": 0.1291, "num_input_tokens_seen": 182721792, "step": 84670 }, { "epoch": 13.81321370309951, "grad_norm": 2.5250353813171387, "learning_rate": 1.3212962977539894e-05, "loss": 0.1638, "num_input_tokens_seen": 182732384, "step": 84675 }, { "epoch": 13.814029363784666, "grad_norm": 1.050355076789856, "learning_rate": 1.3209824508258645e-05, "loss": 0.1747, "num_input_tokens_seen": 182743744, "step": 84680 }, { "epoch": 13.81484502446982, "grad_norm": 2.032104015350342, "learning_rate": 1.320668627792026e-05, "loss": 0.1337, "num_input_tokens_seen": 182754816, "step": 84685 }, { "epoch": 13.815660685154976, "grad_norm": 0.04766704887151718, "learning_rate": 1.320354828658834e-05, "loss": 0.0324, "num_input_tokens_seen": 182766752, "step": 84690 }, { "epoch": 13.81647634584013, "grad_norm": 1.0240675210952759, "learning_rate": 1.3200410534326476e-05, "loss": 0.2187, "num_input_tokens_seen": 182776832, "step": 84695 }, { "epoch": 13.817292006525285, "grad_norm": 0.05589909479022026, "learning_rate": 1.3197273021198264e-05, "loss": 0.0268, "num_input_tokens_seen": 182787360, "step": 84700 }, { "epoch": 13.818107667210441, "grad_norm": 0.17078936100006104, "learning_rate": 1.3194135747267286e-05, "loss": 0.0962, "num_input_tokens_seen": 182797248, "step": 84705 }, { "epoch": 13.818923327895595, "grad_norm": 1.8573651313781738, "learning_rate": 1.3190998712597125e-05, "loss": 0.1436, "num_input_tokens_seen": 182808416, "step": 84710 }, { "epoch": 13.81973898858075, "grad_norm": 0.24592836201190948, "learning_rate": 1.3187861917251343e-05, "loss": 0.0427, "num_input_tokens_seen": 182818912, "step": 84715 }, { "epoch": 13.820554649265905, "grad_norm": 0.07312993705272675, "learning_rate": 1.3184725361293546e-05, "loss": 0.0159, "num_input_tokens_seen": 182830368, "step": 84720 }, { "epoch": 13.82137030995106, "grad_norm": 3.165531635284424, "learning_rate": 1.3181589044787258e-05, "loss": 0.1492, "num_input_tokens_seen": 182841536, "step": 84725 }, { "epoch": 13.822185970636216, "grad_norm": 1.4045395851135254, "learning_rate": 1.317845296779608e-05, "loss": 0.1303, "num_input_tokens_seen": 182852288, "step": 84730 }, { "epoch": 13.82300163132137, "grad_norm": 0.3810580372810364, "learning_rate": 1.317531713038353e-05, "loss": 0.1237, "num_input_tokens_seen": 182862880, "step": 84735 }, { "epoch": 13.823817292006526, "grad_norm": 0.1394076943397522, "learning_rate": 1.3172181532613201e-05, "loss": 0.147, "num_input_tokens_seen": 182873792, "step": 84740 }, { "epoch": 13.82463295269168, "grad_norm": 2.004216432571411, "learning_rate": 1.3169046174548594e-05, "loss": 0.281, "num_input_tokens_seen": 182884000, "step": 84745 }, { "epoch": 13.825448613376835, "grad_norm": 2.07149076461792, "learning_rate": 1.3165911056253288e-05, "loss": 0.1439, "num_input_tokens_seen": 182894848, "step": 84750 }, { "epoch": 13.826264274061991, "grad_norm": 0.39017200469970703, "learning_rate": 1.3162776177790806e-05, "loss": 0.0207, "num_input_tokens_seen": 182903840, "step": 84755 }, { "epoch": 13.827079934747145, "grad_norm": 1.2509578466415405, "learning_rate": 1.3159641539224682e-05, "loss": 0.0383, "num_input_tokens_seen": 182913024, "step": 84760 }, { "epoch": 13.8278955954323, "grad_norm": 0.8746175169944763, "learning_rate": 1.3156507140618441e-05, "loss": 0.0421, "num_input_tokens_seen": 182924032, "step": 84765 }, { "epoch": 13.828711256117455, "grad_norm": 0.27837902307510376, "learning_rate": 1.3153372982035612e-05, "loss": 0.0371, "num_input_tokens_seen": 182934976, "step": 84770 }, { "epoch": 13.82952691680261, "grad_norm": 0.303927481174469, "learning_rate": 1.3150239063539702e-05, "loss": 0.0855, "num_input_tokens_seen": 182945376, "step": 84775 }, { "epoch": 13.830342577487766, "grad_norm": 0.03934401273727417, "learning_rate": 1.314710538519423e-05, "loss": 0.0044, "num_input_tokens_seen": 182957088, "step": 84780 }, { "epoch": 13.83115823817292, "grad_norm": 0.23327673971652985, "learning_rate": 1.3143971947062707e-05, "loss": 0.0356, "num_input_tokens_seen": 182968192, "step": 84785 }, { "epoch": 13.831973898858076, "grad_norm": 1.1037707328796387, "learning_rate": 1.3140838749208633e-05, "loss": 0.1914, "num_input_tokens_seen": 182978848, "step": 84790 }, { "epoch": 13.83278955954323, "grad_norm": 0.6809675693511963, "learning_rate": 1.3137705791695504e-05, "loss": 0.0895, "num_input_tokens_seen": 182989536, "step": 84795 }, { "epoch": 13.833605220228385, "grad_norm": 0.056752245873212814, "learning_rate": 1.313457307458682e-05, "loss": 0.0259, "num_input_tokens_seen": 182999168, "step": 84800 }, { "epoch": 13.83442088091354, "grad_norm": 0.10434938967227936, "learning_rate": 1.313144059794606e-05, "loss": 0.0534, "num_input_tokens_seen": 183009344, "step": 84805 }, { "epoch": 13.835236541598695, "grad_norm": 1.879400372505188, "learning_rate": 1.3128308361836713e-05, "loss": 0.1316, "num_input_tokens_seen": 183018880, "step": 84810 }, { "epoch": 13.83605220228385, "grad_norm": 0.6615368723869324, "learning_rate": 1.3125176366322261e-05, "loss": 0.1644, "num_input_tokens_seen": 183029952, "step": 84815 }, { "epoch": 13.836867862969005, "grad_norm": 0.13195708394050598, "learning_rate": 1.312204461146616e-05, "loss": 0.1065, "num_input_tokens_seen": 183039840, "step": 84820 }, { "epoch": 13.83768352365416, "grad_norm": 0.0384925901889801, "learning_rate": 1.3118913097331914e-05, "loss": 0.042, "num_input_tokens_seen": 183049568, "step": 84825 }, { "epoch": 13.838499184339314, "grad_norm": 0.12974752485752106, "learning_rate": 1.3115781823982948e-05, "loss": 0.0212, "num_input_tokens_seen": 183061344, "step": 84830 }, { "epoch": 13.83931484502447, "grad_norm": 1.9813095331192017, "learning_rate": 1.3112650791482762e-05, "loss": 0.2262, "num_input_tokens_seen": 183071360, "step": 84835 }, { "epoch": 13.840130505709626, "grad_norm": 1.1735025644302368, "learning_rate": 1.3109519999894765e-05, "loss": 0.1017, "num_input_tokens_seen": 183082560, "step": 84840 }, { "epoch": 13.84094616639478, "grad_norm": 0.030696317553520203, "learning_rate": 1.3106389449282447e-05, "loss": 0.102, "num_input_tokens_seen": 183092352, "step": 84845 }, { "epoch": 13.841761827079935, "grad_norm": 0.030400807037949562, "learning_rate": 1.3103259139709218e-05, "loss": 0.2391, "num_input_tokens_seen": 183102912, "step": 84850 }, { "epoch": 13.84257748776509, "grad_norm": 0.2788623571395874, "learning_rate": 1.3100129071238554e-05, "loss": 0.0875, "num_input_tokens_seen": 183113472, "step": 84855 }, { "epoch": 13.843393148450245, "grad_norm": 0.04495776817202568, "learning_rate": 1.309699924393385e-05, "loss": 0.1338, "num_input_tokens_seen": 183124768, "step": 84860 }, { "epoch": 13.844208809135399, "grad_norm": 0.1497819572687149, "learning_rate": 1.3093869657858576e-05, "loss": 0.0341, "num_input_tokens_seen": 183135872, "step": 84865 }, { "epoch": 13.845024469820554, "grad_norm": 1.8905786275863647, "learning_rate": 1.309074031307612e-05, "loss": 0.0931, "num_input_tokens_seen": 183146976, "step": 84870 }, { "epoch": 13.84584013050571, "grad_norm": 0.21144498884677887, "learning_rate": 1.3087611209649936e-05, "loss": 0.0795, "num_input_tokens_seen": 183156672, "step": 84875 }, { "epoch": 13.846655791190864, "grad_norm": 0.48416227102279663, "learning_rate": 1.3084482347643403e-05, "loss": 0.0316, "num_input_tokens_seen": 183166688, "step": 84880 }, { "epoch": 13.84747145187602, "grad_norm": 0.1550978422164917, "learning_rate": 1.3081353727119971e-05, "loss": 0.1304, "num_input_tokens_seen": 183177152, "step": 84885 }, { "epoch": 13.848287112561174, "grad_norm": 3.3827993869781494, "learning_rate": 1.3078225348143008e-05, "loss": 0.2524, "num_input_tokens_seen": 183188512, "step": 84890 }, { "epoch": 13.84910277324633, "grad_norm": 0.05120261386036873, "learning_rate": 1.3075097210775944e-05, "loss": 0.0136, "num_input_tokens_seen": 183199072, "step": 84895 }, { "epoch": 13.849918433931485, "grad_norm": 0.1512635499238968, "learning_rate": 1.3071969315082158e-05, "loss": 0.2516, "num_input_tokens_seen": 183209664, "step": 84900 }, { "epoch": 13.850734094616639, "grad_norm": 0.9502463936805725, "learning_rate": 1.3068841661125048e-05, "loss": 0.1572, "num_input_tokens_seen": 183221504, "step": 84905 }, { "epoch": 13.851549755301795, "grad_norm": 0.08786869049072266, "learning_rate": 1.3065714248967997e-05, "loss": 0.014, "num_input_tokens_seen": 183231296, "step": 84910 }, { "epoch": 13.852365415986949, "grad_norm": 0.4213089942932129, "learning_rate": 1.3062587078674387e-05, "loss": 0.0359, "num_input_tokens_seen": 183241728, "step": 84915 }, { "epoch": 13.853181076672104, "grad_norm": 0.6700032949447632, "learning_rate": 1.3059460150307592e-05, "loss": 0.1894, "num_input_tokens_seen": 183253696, "step": 84920 }, { "epoch": 13.85399673735726, "grad_norm": 0.575928270816803, "learning_rate": 1.3056333463930986e-05, "loss": 0.0376, "num_input_tokens_seen": 183263712, "step": 84925 }, { "epoch": 13.854812398042414, "grad_norm": 0.12656916677951813, "learning_rate": 1.3053207019607933e-05, "loss": 0.0654, "num_input_tokens_seen": 183275008, "step": 84930 }, { "epoch": 13.85562805872757, "grad_norm": 1.3085129261016846, "learning_rate": 1.3050080817401798e-05, "loss": 0.0599, "num_input_tokens_seen": 183285952, "step": 84935 }, { "epoch": 13.856443719412724, "grad_norm": 0.17395387589931488, "learning_rate": 1.304695485737593e-05, "loss": 0.0467, "num_input_tokens_seen": 183297024, "step": 84940 }, { "epoch": 13.85725938009788, "grad_norm": 2.526142120361328, "learning_rate": 1.304382913959368e-05, "loss": 0.1732, "num_input_tokens_seen": 183307936, "step": 84945 }, { "epoch": 13.858075040783035, "grad_norm": 2.2933313846588135, "learning_rate": 1.304070366411842e-05, "loss": 0.1723, "num_input_tokens_seen": 183318496, "step": 84950 }, { "epoch": 13.858890701468189, "grad_norm": 2.3298630714416504, "learning_rate": 1.3037578431013447e-05, "loss": 0.0744, "num_input_tokens_seen": 183328608, "step": 84955 }, { "epoch": 13.859706362153345, "grad_norm": 0.1639392226934433, "learning_rate": 1.3034453440342148e-05, "loss": 0.0225, "num_input_tokens_seen": 183340064, "step": 84960 }, { "epoch": 13.860522022838499, "grad_norm": 0.33642974495887756, "learning_rate": 1.3031328692167804e-05, "loss": 0.0758, "num_input_tokens_seen": 183350688, "step": 84965 }, { "epoch": 13.861337683523654, "grad_norm": 1.3412669897079468, "learning_rate": 1.3028204186553792e-05, "loss": 0.0866, "num_input_tokens_seen": 183360992, "step": 84970 }, { "epoch": 13.86215334420881, "grad_norm": 0.1331881284713745, "learning_rate": 1.3025079923563388e-05, "loss": 0.0692, "num_input_tokens_seen": 183371456, "step": 84975 }, { "epoch": 13.862969004893964, "grad_norm": 0.03962613269686699, "learning_rate": 1.302195590325995e-05, "loss": 0.0659, "num_input_tokens_seen": 183382240, "step": 84980 }, { "epoch": 13.86378466557912, "grad_norm": 0.20549742877483368, "learning_rate": 1.3018832125706751e-05, "loss": 0.0497, "num_input_tokens_seen": 183393152, "step": 84985 }, { "epoch": 13.864600326264274, "grad_norm": 2.528975248336792, "learning_rate": 1.3015708590967139e-05, "loss": 0.1702, "num_input_tokens_seen": 183403200, "step": 84990 }, { "epoch": 13.86541598694943, "grad_norm": 0.44736549258232117, "learning_rate": 1.3012585299104375e-05, "loss": 0.0971, "num_input_tokens_seen": 183414272, "step": 84995 }, { "epoch": 13.866231647634583, "grad_norm": 0.42480403184890747, "learning_rate": 1.3009462250181795e-05, "loss": 0.1677, "num_input_tokens_seen": 183425728, "step": 85000 }, { "epoch": 13.867047308319739, "grad_norm": 0.15291552245616913, "learning_rate": 1.3006339444262655e-05, "loss": 0.0625, "num_input_tokens_seen": 183437344, "step": 85005 }, { "epoch": 13.867862969004895, "grad_norm": 3.077866315841675, "learning_rate": 1.3003216881410282e-05, "loss": 0.1733, "num_input_tokens_seen": 183447680, "step": 85010 }, { "epoch": 13.868678629690049, "grad_norm": 1.088321566581726, "learning_rate": 1.300009456168792e-05, "loss": 0.084, "num_input_tokens_seen": 183459616, "step": 85015 }, { "epoch": 13.869494290375204, "grad_norm": 0.09156150370836258, "learning_rate": 1.299697248515888e-05, "loss": 0.0535, "num_input_tokens_seen": 183470560, "step": 85020 }, { "epoch": 13.870309951060358, "grad_norm": 1.9611246585845947, "learning_rate": 1.2993850651886403e-05, "loss": 0.3463, "num_input_tokens_seen": 183481664, "step": 85025 }, { "epoch": 13.871125611745514, "grad_norm": 0.10143929719924927, "learning_rate": 1.2990729061933782e-05, "loss": 0.0278, "num_input_tokens_seen": 183493472, "step": 85030 }, { "epoch": 13.87194127243067, "grad_norm": 0.7732589244842529, "learning_rate": 1.298760771536427e-05, "loss": 0.042, "num_input_tokens_seen": 183504352, "step": 85035 }, { "epoch": 13.872756933115824, "grad_norm": 2.4777495861053467, "learning_rate": 1.298448661224113e-05, "loss": 0.1455, "num_input_tokens_seen": 183514592, "step": 85040 }, { "epoch": 13.87357259380098, "grad_norm": 1.6891998052597046, "learning_rate": 1.2981365752627608e-05, "loss": 0.0874, "num_input_tokens_seen": 183526112, "step": 85045 }, { "epoch": 13.874388254486133, "grad_norm": 0.9503273963928223, "learning_rate": 1.2978245136586958e-05, "loss": 0.0527, "num_input_tokens_seen": 183535776, "step": 85050 }, { "epoch": 13.875203915171289, "grad_norm": 0.27188241481781006, "learning_rate": 1.297512476418242e-05, "loss": 0.0544, "num_input_tokens_seen": 183547552, "step": 85055 }, { "epoch": 13.876019575856443, "grad_norm": 0.11559107154607773, "learning_rate": 1.297200463547723e-05, "loss": 0.1414, "num_input_tokens_seen": 183557280, "step": 85060 }, { "epoch": 13.876835236541599, "grad_norm": 3.705284833908081, "learning_rate": 1.2968884750534632e-05, "loss": 0.3273, "num_input_tokens_seen": 183567360, "step": 85065 }, { "epoch": 13.877650897226754, "grad_norm": 0.055419206619262695, "learning_rate": 1.2965765109417844e-05, "loss": 0.2227, "num_input_tokens_seen": 183578432, "step": 85070 }, { "epoch": 13.878466557911908, "grad_norm": 0.5140016674995422, "learning_rate": 1.2962645712190091e-05, "loss": 0.1415, "num_input_tokens_seen": 183589536, "step": 85075 }, { "epoch": 13.879282218597064, "grad_norm": 0.08856625854969025, "learning_rate": 1.2959526558914592e-05, "loss": 0.0634, "num_input_tokens_seen": 183599904, "step": 85080 }, { "epoch": 13.880097879282218, "grad_norm": 0.03633768856525421, "learning_rate": 1.2956407649654564e-05, "loss": 0.0479, "num_input_tokens_seen": 183610240, "step": 85085 }, { "epoch": 13.880913539967374, "grad_norm": 0.611937403678894, "learning_rate": 1.2953288984473205e-05, "loss": 0.0345, "num_input_tokens_seen": 183622048, "step": 85090 }, { "epoch": 13.88172920065253, "grad_norm": 0.2710965573787689, "learning_rate": 1.2950170563433745e-05, "loss": 0.023, "num_input_tokens_seen": 183632960, "step": 85095 }, { "epoch": 13.882544861337683, "grad_norm": 0.5137221813201904, "learning_rate": 1.2947052386599346e-05, "loss": 0.2212, "num_input_tokens_seen": 183645408, "step": 85100 }, { "epoch": 13.883360522022839, "grad_norm": 0.10494086891412735, "learning_rate": 1.2943934454033236e-05, "loss": 0.0234, "num_input_tokens_seen": 183655200, "step": 85105 }, { "epoch": 13.884176182707993, "grad_norm": 0.07618985325098038, "learning_rate": 1.2940816765798575e-05, "loss": 0.0344, "num_input_tokens_seen": 183665568, "step": 85110 }, { "epoch": 13.884991843393149, "grad_norm": 0.07813215255737305, "learning_rate": 1.2937699321958574e-05, "loss": 0.12, "num_input_tokens_seen": 183677088, "step": 85115 }, { "epoch": 13.885807504078304, "grad_norm": 0.05550206080079079, "learning_rate": 1.2934582122576383e-05, "loss": 0.0274, "num_input_tokens_seen": 183687296, "step": 85120 }, { "epoch": 13.886623164763458, "grad_norm": 0.43137621879577637, "learning_rate": 1.2931465167715213e-05, "loss": 0.043, "num_input_tokens_seen": 183697280, "step": 85125 }, { "epoch": 13.887438825448614, "grad_norm": 0.10541153699159622, "learning_rate": 1.2928348457438186e-05, "loss": 0.0585, "num_input_tokens_seen": 183706112, "step": 85130 }, { "epoch": 13.888254486133768, "grad_norm": 0.3541242182254791, "learning_rate": 1.2925231991808504e-05, "loss": 0.1578, "num_input_tokens_seen": 183716416, "step": 85135 }, { "epoch": 13.889070146818923, "grad_norm": 0.0510697104036808, "learning_rate": 1.2922115770889314e-05, "loss": 0.0428, "num_input_tokens_seen": 183727008, "step": 85140 }, { "epoch": 13.88988580750408, "grad_norm": 0.03505512326955795, "learning_rate": 1.2918999794743769e-05, "loss": 0.1614, "num_input_tokens_seen": 183738720, "step": 85145 }, { "epoch": 13.890701468189233, "grad_norm": 0.32780134677886963, "learning_rate": 1.2915884063435018e-05, "loss": 0.1167, "num_input_tokens_seen": 183748832, "step": 85150 }, { "epoch": 13.891517128874389, "grad_norm": 0.21172061562538147, "learning_rate": 1.2912768577026208e-05, "loss": 0.0249, "num_input_tokens_seen": 183760448, "step": 85155 }, { "epoch": 13.892332789559543, "grad_norm": 2.1926839351654053, "learning_rate": 1.2909653335580477e-05, "loss": 0.1043, "num_input_tokens_seen": 183771104, "step": 85160 }, { "epoch": 13.893148450244698, "grad_norm": 0.06198621913790703, "learning_rate": 1.2906538339160957e-05, "loss": 0.1241, "num_input_tokens_seen": 183781408, "step": 85165 }, { "epoch": 13.893964110929852, "grad_norm": 1.6076096296310425, "learning_rate": 1.2903423587830781e-05, "loss": 0.1211, "num_input_tokens_seen": 183793024, "step": 85170 }, { "epoch": 13.894779771615008, "grad_norm": 1.1312354803085327, "learning_rate": 1.2900309081653072e-05, "loss": 0.2822, "num_input_tokens_seen": 183804192, "step": 85175 }, { "epoch": 13.895595432300164, "grad_norm": 0.2680586576461792, "learning_rate": 1.2897194820690947e-05, "loss": 0.0191, "num_input_tokens_seen": 183815904, "step": 85180 }, { "epoch": 13.896411092985318, "grad_norm": 0.5029542446136475, "learning_rate": 1.2894080805007524e-05, "loss": 0.1354, "num_input_tokens_seen": 183825376, "step": 85185 }, { "epoch": 13.897226753670473, "grad_norm": 2.222092390060425, "learning_rate": 1.289096703466591e-05, "loss": 0.2762, "num_input_tokens_seen": 183835872, "step": 85190 }, { "epoch": 13.898042414355627, "grad_norm": 0.07550002634525299, "learning_rate": 1.2887853509729209e-05, "loss": 0.1647, "num_input_tokens_seen": 183847584, "step": 85195 }, { "epoch": 13.898858075040783, "grad_norm": 0.028805768117308617, "learning_rate": 1.2884740230260525e-05, "loss": 0.0276, "num_input_tokens_seen": 183859072, "step": 85200 }, { "epoch": 13.899673735725939, "grad_norm": 0.6369123458862305, "learning_rate": 1.2881627196322948e-05, "loss": 0.1482, "num_input_tokens_seen": 183870912, "step": 85205 }, { "epoch": 13.900489396411093, "grad_norm": 4.135544300079346, "learning_rate": 1.2878514407979569e-05, "loss": 0.1054, "num_input_tokens_seen": 183882432, "step": 85210 }, { "epoch": 13.901305057096248, "grad_norm": 0.22738368809223175, "learning_rate": 1.2875401865293473e-05, "loss": 0.1683, "num_input_tokens_seen": 183892768, "step": 85215 }, { "epoch": 13.902120717781402, "grad_norm": 0.6498969793319702, "learning_rate": 1.2872289568327739e-05, "loss": 0.0694, "num_input_tokens_seen": 183904416, "step": 85220 }, { "epoch": 13.902936378466558, "grad_norm": 1.7098537683486938, "learning_rate": 1.2869177517145442e-05, "loss": 0.1264, "num_input_tokens_seen": 183915872, "step": 85225 }, { "epoch": 13.903752039151712, "grad_norm": 0.09859177470207214, "learning_rate": 1.2866065711809653e-05, "loss": 0.0575, "num_input_tokens_seen": 183926464, "step": 85230 }, { "epoch": 13.904567699836868, "grad_norm": 0.16392917931079865, "learning_rate": 1.2862954152383424e-05, "loss": 0.0173, "num_input_tokens_seen": 183937504, "step": 85235 }, { "epoch": 13.905383360522023, "grad_norm": 1.5602654218673706, "learning_rate": 1.2859842838929837e-05, "loss": 0.0919, "num_input_tokens_seen": 183947680, "step": 85240 }, { "epoch": 13.906199021207177, "grad_norm": 1.839542269706726, "learning_rate": 1.2856731771511935e-05, "loss": 0.0851, "num_input_tokens_seen": 183956736, "step": 85245 }, { "epoch": 13.907014681892333, "grad_norm": 3.2695839405059814, "learning_rate": 1.2853620950192768e-05, "loss": 0.222, "num_input_tokens_seen": 183967264, "step": 85250 }, { "epoch": 13.907830342577487, "grad_norm": 1.8702324628829956, "learning_rate": 1.2850510375035381e-05, "loss": 0.1473, "num_input_tokens_seen": 183976416, "step": 85255 }, { "epoch": 13.908646003262643, "grad_norm": 0.2845754623413086, "learning_rate": 1.2847400046102814e-05, "loss": 0.1227, "num_input_tokens_seen": 183988224, "step": 85260 }, { "epoch": 13.909461663947798, "grad_norm": 0.14639942348003387, "learning_rate": 1.2844289963458105e-05, "loss": 0.0849, "num_input_tokens_seen": 183999872, "step": 85265 }, { "epoch": 13.910277324632952, "grad_norm": 0.33433786034584045, "learning_rate": 1.2841180127164276e-05, "loss": 0.1299, "num_input_tokens_seen": 184011040, "step": 85270 }, { "epoch": 13.911092985318108, "grad_norm": 0.8659694194793701, "learning_rate": 1.2838070537284358e-05, "loss": 0.2398, "num_input_tokens_seen": 184022016, "step": 85275 }, { "epoch": 13.911908646003262, "grad_norm": 0.13906003534793854, "learning_rate": 1.2834961193881373e-05, "loss": 0.0669, "num_input_tokens_seen": 184031584, "step": 85280 }, { "epoch": 13.912724306688418, "grad_norm": 0.3398989737033844, "learning_rate": 1.2831852097018326e-05, "loss": 0.1007, "num_input_tokens_seen": 184042112, "step": 85285 }, { "epoch": 13.913539967373573, "grad_norm": 2.947000026702881, "learning_rate": 1.2828743246758235e-05, "loss": 0.262, "num_input_tokens_seen": 184052864, "step": 85290 }, { "epoch": 13.914355628058727, "grad_norm": 1.23024320602417, "learning_rate": 1.2825634643164103e-05, "loss": 0.0955, "num_input_tokens_seen": 184063232, "step": 85295 }, { "epoch": 13.915171288743883, "grad_norm": 0.3899560272693634, "learning_rate": 1.2822526286298928e-05, "loss": 0.2217, "num_input_tokens_seen": 184075296, "step": 85300 }, { "epoch": 13.915986949429037, "grad_norm": 0.051652941852808, "learning_rate": 1.2819418176225707e-05, "loss": 0.0497, "num_input_tokens_seen": 184085920, "step": 85305 }, { "epoch": 13.916802610114193, "grad_norm": 0.05121760815382004, "learning_rate": 1.281631031300743e-05, "loss": 0.0317, "num_input_tokens_seen": 184097856, "step": 85310 }, { "epoch": 13.917618270799348, "grad_norm": 1.2118695974349976, "learning_rate": 1.2813202696707078e-05, "loss": 0.0522, "num_input_tokens_seen": 184109472, "step": 85315 }, { "epoch": 13.918433931484502, "grad_norm": 2.3417818546295166, "learning_rate": 1.2810095327387634e-05, "loss": 0.0731, "num_input_tokens_seen": 184119392, "step": 85320 }, { "epoch": 13.919249592169658, "grad_norm": 0.670259416103363, "learning_rate": 1.2806988205112072e-05, "loss": 0.1194, "num_input_tokens_seen": 184129664, "step": 85325 }, { "epoch": 13.920065252854812, "grad_norm": 0.20139582455158234, "learning_rate": 1.2803881329943362e-05, "loss": 0.0162, "num_input_tokens_seen": 184140288, "step": 85330 }, { "epoch": 13.920880913539968, "grad_norm": 0.12392681837081909, "learning_rate": 1.2800774701944469e-05, "loss": 0.1553, "num_input_tokens_seen": 184150720, "step": 85335 }, { "epoch": 13.921696574225122, "grad_norm": 0.0923587903380394, "learning_rate": 1.2797668321178351e-05, "loss": 0.0102, "num_input_tokens_seen": 184161120, "step": 85340 }, { "epoch": 13.922512234910277, "grad_norm": 0.2337663769721985, "learning_rate": 1.279456218770797e-05, "loss": 0.1538, "num_input_tokens_seen": 184170976, "step": 85345 }, { "epoch": 13.923327895595433, "grad_norm": 2.690133571624756, "learning_rate": 1.2791456301596266e-05, "loss": 0.0571, "num_input_tokens_seen": 184181888, "step": 85350 }, { "epoch": 13.924143556280587, "grad_norm": 0.0685129165649414, "learning_rate": 1.2788350662906176e-05, "loss": 0.2144, "num_input_tokens_seen": 184192544, "step": 85355 }, { "epoch": 13.924959216965743, "grad_norm": 1.9574559926986694, "learning_rate": 1.2785245271700674e-05, "loss": 0.2998, "num_input_tokens_seen": 184203648, "step": 85360 }, { "epoch": 13.925774877650896, "grad_norm": 1.2910807132720947, "learning_rate": 1.2782140128042652e-05, "loss": 0.1294, "num_input_tokens_seen": 184214848, "step": 85365 }, { "epoch": 13.926590538336052, "grad_norm": 1.757611870765686, "learning_rate": 1.2779035231995079e-05, "loss": 0.077, "num_input_tokens_seen": 184226880, "step": 85370 }, { "epoch": 13.927406199021208, "grad_norm": 0.14563007652759552, "learning_rate": 1.2775930583620838e-05, "loss": 0.1618, "num_input_tokens_seen": 184238112, "step": 85375 }, { "epoch": 13.928221859706362, "grad_norm": 0.4590490758419037, "learning_rate": 1.2772826182982883e-05, "loss": 0.1233, "num_input_tokens_seen": 184248160, "step": 85380 }, { "epoch": 13.929037520391518, "grad_norm": 0.3903016746044159, "learning_rate": 1.2769722030144118e-05, "loss": 0.0773, "num_input_tokens_seen": 184258848, "step": 85385 }, { "epoch": 13.929853181076671, "grad_norm": 0.06921765208244324, "learning_rate": 1.2766618125167451e-05, "loss": 0.044, "num_input_tokens_seen": 184269952, "step": 85390 }, { "epoch": 13.930668841761827, "grad_norm": 1.190748929977417, "learning_rate": 1.2763514468115783e-05, "loss": 0.1015, "num_input_tokens_seen": 184279936, "step": 85395 }, { "epoch": 13.931484502446983, "grad_norm": 1.193642497062683, "learning_rate": 1.2760411059052024e-05, "loss": 0.1587, "num_input_tokens_seen": 184290464, "step": 85400 }, { "epoch": 13.932300163132137, "grad_norm": 1.8684566020965576, "learning_rate": 1.2757307898039055e-05, "loss": 0.2571, "num_input_tokens_seen": 184300672, "step": 85405 }, { "epoch": 13.933115823817293, "grad_norm": 0.5163094997406006, "learning_rate": 1.2754204985139779e-05, "loss": 0.0174, "num_input_tokens_seen": 184311584, "step": 85410 }, { "epoch": 13.933931484502446, "grad_norm": 1.110357642173767, "learning_rate": 1.275110232041707e-05, "loss": 0.2099, "num_input_tokens_seen": 184322432, "step": 85415 }, { "epoch": 13.934747145187602, "grad_norm": 0.3950801193714142, "learning_rate": 1.274799990393381e-05, "loss": 0.0774, "num_input_tokens_seen": 184334464, "step": 85420 }, { "epoch": 13.935562805872756, "grad_norm": 0.02339777536690235, "learning_rate": 1.2744897735752878e-05, "loss": 0.0386, "num_input_tokens_seen": 184344608, "step": 85425 }, { "epoch": 13.936378466557912, "grad_norm": 0.5351450443267822, "learning_rate": 1.2741795815937141e-05, "loss": 0.1031, "num_input_tokens_seen": 184354496, "step": 85430 }, { "epoch": 13.937194127243067, "grad_norm": 0.09877585619688034, "learning_rate": 1.2738694144549456e-05, "loss": 0.0216, "num_input_tokens_seen": 184366016, "step": 85435 }, { "epoch": 13.938009787928221, "grad_norm": 0.17151036858558655, "learning_rate": 1.2735592721652695e-05, "loss": 0.3651, "num_input_tokens_seen": 184375200, "step": 85440 }, { "epoch": 13.938825448613377, "grad_norm": 0.6461935043334961, "learning_rate": 1.2732491547309704e-05, "loss": 0.0983, "num_input_tokens_seen": 184385440, "step": 85445 }, { "epoch": 13.939641109298531, "grad_norm": 0.08458361774682999, "learning_rate": 1.2729390621583334e-05, "loss": 0.0488, "num_input_tokens_seen": 184396704, "step": 85450 }, { "epoch": 13.940456769983687, "grad_norm": 0.07831602543592453, "learning_rate": 1.272628994453643e-05, "loss": 0.0131, "num_input_tokens_seen": 184408160, "step": 85455 }, { "epoch": 13.941272430668842, "grad_norm": 0.3327011466026306, "learning_rate": 1.2723189516231821e-05, "loss": 0.0227, "num_input_tokens_seen": 184419008, "step": 85460 }, { "epoch": 13.942088091353996, "grad_norm": 0.9296494126319885, "learning_rate": 1.2720089336732368e-05, "loss": 0.1666, "num_input_tokens_seen": 184430016, "step": 85465 }, { "epoch": 13.942903752039152, "grad_norm": 0.42356163263320923, "learning_rate": 1.2716989406100865e-05, "loss": 0.2177, "num_input_tokens_seen": 184440224, "step": 85470 }, { "epoch": 13.943719412724306, "grad_norm": 0.35238850116729736, "learning_rate": 1.2713889724400175e-05, "loss": 0.0158, "num_input_tokens_seen": 184451104, "step": 85475 }, { "epoch": 13.944535073409462, "grad_norm": 0.04805886745452881, "learning_rate": 1.2710790291693076e-05, "loss": 0.0855, "num_input_tokens_seen": 184461888, "step": 85480 }, { "epoch": 13.945350734094617, "grad_norm": 1.8136975765228271, "learning_rate": 1.270769110804242e-05, "loss": 0.0558, "num_input_tokens_seen": 184472768, "step": 85485 }, { "epoch": 13.946166394779771, "grad_norm": 2.3848230838775635, "learning_rate": 1.2704592173510976e-05, "loss": 0.0989, "num_input_tokens_seen": 184482592, "step": 85490 }, { "epoch": 13.946982055464927, "grad_norm": 0.34427815675735474, "learning_rate": 1.2701493488161589e-05, "loss": 0.0499, "num_input_tokens_seen": 184494112, "step": 85495 }, { "epoch": 13.947797716150081, "grad_norm": 2.145376443862915, "learning_rate": 1.2698395052057022e-05, "loss": 0.1339, "num_input_tokens_seen": 184505120, "step": 85500 }, { "epoch": 13.948613376835237, "grad_norm": 0.15177211165428162, "learning_rate": 1.2695296865260103e-05, "loss": 0.213, "num_input_tokens_seen": 184516800, "step": 85505 }, { "epoch": 13.949429037520392, "grad_norm": 1.811677098274231, "learning_rate": 1.2692198927833581e-05, "loss": 0.37, "num_input_tokens_seen": 184528288, "step": 85510 }, { "epoch": 13.950244698205546, "grad_norm": 1.071068286895752, "learning_rate": 1.2689101239840284e-05, "loss": 0.0914, "num_input_tokens_seen": 184539552, "step": 85515 }, { "epoch": 13.951060358890702, "grad_norm": 1.2828487157821655, "learning_rate": 1.2686003801342947e-05, "loss": 0.1086, "num_input_tokens_seen": 184549984, "step": 85520 }, { "epoch": 13.951876019575856, "grad_norm": 0.10793675482273102, "learning_rate": 1.2682906612404375e-05, "loss": 0.0976, "num_input_tokens_seen": 184560608, "step": 85525 }, { "epoch": 13.952691680261012, "grad_norm": 0.14743801951408386, "learning_rate": 1.2679809673087323e-05, "loss": 0.0829, "num_input_tokens_seen": 184572352, "step": 85530 }, { "epoch": 13.953507340946166, "grad_norm": 0.4899848997592926, "learning_rate": 1.267671298345456e-05, "loss": 0.0447, "num_input_tokens_seen": 184582976, "step": 85535 }, { "epoch": 13.954323001631321, "grad_norm": 2.3624911308288574, "learning_rate": 1.2673616543568842e-05, "loss": 0.1756, "num_input_tokens_seen": 184592640, "step": 85540 }, { "epoch": 13.955138662316477, "grad_norm": 2.5890533924102783, "learning_rate": 1.267052035349292e-05, "loss": 0.4384, "num_input_tokens_seen": 184603936, "step": 85545 }, { "epoch": 13.955954323001631, "grad_norm": 1.3451790809631348, "learning_rate": 1.2667424413289548e-05, "loss": 0.0327, "num_input_tokens_seen": 184615520, "step": 85550 }, { "epoch": 13.956769983686787, "grad_norm": 0.04892313480377197, "learning_rate": 1.2664328723021459e-05, "loss": 0.1267, "num_input_tokens_seen": 184625920, "step": 85555 }, { "epoch": 13.95758564437194, "grad_norm": 1.472597360610962, "learning_rate": 1.26612332827514e-05, "loss": 0.131, "num_input_tokens_seen": 184637440, "step": 85560 }, { "epoch": 13.958401305057096, "grad_norm": 0.2788280248641968, "learning_rate": 1.2658138092542104e-05, "loss": 0.051, "num_input_tokens_seen": 184646816, "step": 85565 }, { "epoch": 13.959216965742252, "grad_norm": 0.3066985011100769, "learning_rate": 1.2655043152456292e-05, "loss": 0.0228, "num_input_tokens_seen": 184658016, "step": 85570 }, { "epoch": 13.960032626427406, "grad_norm": 1.3714067935943604, "learning_rate": 1.2651948462556684e-05, "loss": 0.2352, "num_input_tokens_seen": 184670496, "step": 85575 }, { "epoch": 13.960848287112562, "grad_norm": 1.198268175125122, "learning_rate": 1.2648854022906027e-05, "loss": 0.0571, "num_input_tokens_seen": 184680384, "step": 85580 }, { "epoch": 13.961663947797716, "grad_norm": 0.14260601997375488, "learning_rate": 1.264575983356699e-05, "loss": 0.081, "num_input_tokens_seen": 184692448, "step": 85585 }, { "epoch": 13.962479608482871, "grad_norm": 1.0409629344940186, "learning_rate": 1.2642665894602319e-05, "loss": 0.0352, "num_input_tokens_seen": 184702176, "step": 85590 }, { "epoch": 13.963295269168025, "grad_norm": 0.28155529499053955, "learning_rate": 1.2639572206074685e-05, "loss": 0.0425, "num_input_tokens_seen": 184712960, "step": 85595 }, { "epoch": 13.964110929853181, "grad_norm": 0.03909383341670036, "learning_rate": 1.2636478768046816e-05, "loss": 0.0793, "num_input_tokens_seen": 184723680, "step": 85600 }, { "epoch": 13.964926590538337, "grad_norm": 0.061837781220674515, "learning_rate": 1.263338558058137e-05, "loss": 0.0367, "num_input_tokens_seen": 184733504, "step": 85605 }, { "epoch": 13.96574225122349, "grad_norm": 0.013403070159256458, "learning_rate": 1.2630292643741076e-05, "loss": 0.0666, "num_input_tokens_seen": 184745024, "step": 85610 }, { "epoch": 13.966557911908646, "grad_norm": 0.08146060258150101, "learning_rate": 1.262719995758857e-05, "loss": 0.0207, "num_input_tokens_seen": 184756064, "step": 85615 }, { "epoch": 13.9673735725938, "grad_norm": 0.3439330756664276, "learning_rate": 1.2624107522186578e-05, "loss": 0.1603, "num_input_tokens_seen": 184765312, "step": 85620 }, { "epoch": 13.968189233278956, "grad_norm": 0.3593723475933075, "learning_rate": 1.2621015337597725e-05, "loss": 0.2062, "num_input_tokens_seen": 184776864, "step": 85625 }, { "epoch": 13.969004893964112, "grad_norm": 0.7409630417823792, "learning_rate": 1.2617923403884718e-05, "loss": 0.147, "num_input_tokens_seen": 184787360, "step": 85630 }, { "epoch": 13.969820554649266, "grad_norm": 0.16438458859920502, "learning_rate": 1.2614831721110181e-05, "loss": 0.0166, "num_input_tokens_seen": 184797056, "step": 85635 }, { "epoch": 13.970636215334421, "grad_norm": 2.0884108543395996, "learning_rate": 1.2611740289336813e-05, "loss": 0.0936, "num_input_tokens_seen": 184808832, "step": 85640 }, { "epoch": 13.971451876019575, "grad_norm": 0.05857826769351959, "learning_rate": 1.2608649108627224e-05, "loss": 0.0659, "num_input_tokens_seen": 184819936, "step": 85645 }, { "epoch": 13.97226753670473, "grad_norm": 0.3303782045841217, "learning_rate": 1.2605558179044097e-05, "loss": 0.0303, "num_input_tokens_seen": 184831008, "step": 85650 }, { "epoch": 13.973083197389887, "grad_norm": 0.22670626640319824, "learning_rate": 1.260246750065004e-05, "loss": 0.1773, "num_input_tokens_seen": 184842464, "step": 85655 }, { "epoch": 13.97389885807504, "grad_norm": 0.02048259973526001, "learning_rate": 1.2599377073507724e-05, "loss": 0.0101, "num_input_tokens_seen": 184854464, "step": 85660 }, { "epoch": 13.974714518760196, "grad_norm": 0.06534496694803238, "learning_rate": 1.2596286897679746e-05, "loss": 0.0127, "num_input_tokens_seen": 184864576, "step": 85665 }, { "epoch": 13.97553017944535, "grad_norm": 1.0659924745559692, "learning_rate": 1.2593196973228758e-05, "loss": 0.3107, "num_input_tokens_seen": 184875840, "step": 85670 }, { "epoch": 13.976345840130506, "grad_norm": 0.0830778107047081, "learning_rate": 1.2590107300217371e-05, "loss": 0.1457, "num_input_tokens_seen": 184887488, "step": 85675 }, { "epoch": 13.977161500815662, "grad_norm": 0.03735239803791046, "learning_rate": 1.2587017878708205e-05, "loss": 0.0329, "num_input_tokens_seen": 184898912, "step": 85680 }, { "epoch": 13.977977161500815, "grad_norm": 0.4034694731235504, "learning_rate": 1.2583928708763865e-05, "loss": 0.0573, "num_input_tokens_seen": 184910112, "step": 85685 }, { "epoch": 13.978792822185971, "grad_norm": 0.0543685145676136, "learning_rate": 1.2580839790446964e-05, "loss": 0.1214, "num_input_tokens_seen": 184920160, "step": 85690 }, { "epoch": 13.979608482871125, "grad_norm": 0.024290727451443672, "learning_rate": 1.2577751123820097e-05, "loss": 0.0292, "num_input_tokens_seen": 184930240, "step": 85695 }, { "epoch": 13.98042414355628, "grad_norm": 0.2843286097049713, "learning_rate": 1.2574662708945866e-05, "loss": 0.0858, "num_input_tokens_seen": 184942016, "step": 85700 }, { "epoch": 13.981239804241435, "grad_norm": 0.04632934182882309, "learning_rate": 1.2571574545886855e-05, "loss": 0.0129, "num_input_tokens_seen": 184952544, "step": 85705 }, { "epoch": 13.98205546492659, "grad_norm": 2.196769952774048, "learning_rate": 1.2568486634705653e-05, "loss": 0.1486, "num_input_tokens_seen": 184962624, "step": 85710 }, { "epoch": 13.982871125611746, "grad_norm": 0.4613611102104187, "learning_rate": 1.256539897546484e-05, "loss": 0.054, "num_input_tokens_seen": 184973472, "step": 85715 }, { "epoch": 13.9836867862969, "grad_norm": 0.579579770565033, "learning_rate": 1.256231156822698e-05, "loss": 0.0763, "num_input_tokens_seen": 184986400, "step": 85720 }, { "epoch": 13.984502446982056, "grad_norm": 0.03577524051070213, "learning_rate": 1.2559224413054677e-05, "loss": 0.006, "num_input_tokens_seen": 184997568, "step": 85725 }, { "epoch": 13.98531810766721, "grad_norm": 0.2277853935956955, "learning_rate": 1.2556137510010451e-05, "loss": 0.0718, "num_input_tokens_seen": 185007744, "step": 85730 }, { "epoch": 13.986133768352365, "grad_norm": 1.3331656455993652, "learning_rate": 1.2553050859156906e-05, "loss": 0.072, "num_input_tokens_seen": 185018976, "step": 85735 }, { "epoch": 13.986949429037521, "grad_norm": 0.526013970375061, "learning_rate": 1.2549964460556557e-05, "loss": 0.0251, "num_input_tokens_seen": 185029696, "step": 85740 }, { "epoch": 13.987765089722675, "grad_norm": 0.3444252610206604, "learning_rate": 1.2546878314271987e-05, "loss": 0.1855, "num_input_tokens_seen": 185040064, "step": 85745 }, { "epoch": 13.98858075040783, "grad_norm": 2.0820634365081787, "learning_rate": 1.254379242036571e-05, "loss": 0.1993, "num_input_tokens_seen": 185050784, "step": 85750 }, { "epoch": 13.989396411092985, "grad_norm": 1.8886513710021973, "learning_rate": 1.2540706778900302e-05, "loss": 0.0942, "num_input_tokens_seen": 185061984, "step": 85755 }, { "epoch": 13.99021207177814, "grad_norm": 1.7713929414749146, "learning_rate": 1.2537621389938254e-05, "loss": 0.107, "num_input_tokens_seen": 185072544, "step": 85760 }, { "epoch": 13.991027732463294, "grad_norm": 0.9860191941261292, "learning_rate": 1.253453625354214e-05, "loss": 0.0369, "num_input_tokens_seen": 185082976, "step": 85765 }, { "epoch": 13.99184339314845, "grad_norm": 1.638497233390808, "learning_rate": 1.2531451369774442e-05, "loss": 0.0767, "num_input_tokens_seen": 185094176, "step": 85770 }, { "epoch": 13.992659053833606, "grad_norm": 0.04326368495821953, "learning_rate": 1.2528366738697708e-05, "loss": 0.1238, "num_input_tokens_seen": 185104480, "step": 85775 }, { "epoch": 13.99347471451876, "grad_norm": 0.2623533308506012, "learning_rate": 1.2525282360374446e-05, "loss": 0.0291, "num_input_tokens_seen": 185115712, "step": 85780 }, { "epoch": 13.994290375203915, "grad_norm": 0.2134229689836502, "learning_rate": 1.252219823486716e-05, "loss": 0.012, "num_input_tokens_seen": 185128256, "step": 85785 }, { "epoch": 13.99510603588907, "grad_norm": 0.13283191621303558, "learning_rate": 1.2519114362238358e-05, "loss": 0.0274, "num_input_tokens_seen": 185138944, "step": 85790 }, { "epoch": 13.995921696574225, "grad_norm": 0.03510506823658943, "learning_rate": 1.2516030742550532e-05, "loss": 0.0484, "num_input_tokens_seen": 185148640, "step": 85795 }, { "epoch": 13.99673735725938, "grad_norm": 0.026962565258145332, "learning_rate": 1.2512947375866185e-05, "loss": 0.1019, "num_input_tokens_seen": 185158752, "step": 85800 }, { "epoch": 13.997553017944535, "grad_norm": 1.6613500118255615, "learning_rate": 1.25098642622478e-05, "loss": 0.0472, "num_input_tokens_seen": 185169312, "step": 85805 }, { "epoch": 13.99836867862969, "grad_norm": 0.26275381445884705, "learning_rate": 1.250678140175786e-05, "loss": 0.0223, "num_input_tokens_seen": 185179040, "step": 85810 }, { "epoch": 13.999184339314844, "grad_norm": 0.2991775870323181, "learning_rate": 1.2503698794458844e-05, "loss": 0.0306, "num_input_tokens_seen": 185190016, "step": 85815 }, { "epoch": 14.0, "grad_norm": 0.040648967027664185, "learning_rate": 1.2500616440413221e-05, "loss": 0.0615, "num_input_tokens_seen": 185199728, "step": 85820 }, { "epoch": 14.0, "eval_loss": 0.14694535732269287, "eval_runtime": 90.8469, "eval_samples_per_second": 29.996, "eval_steps_per_second": 7.507, "num_input_tokens_seen": 185199728, "step": 85820 }, { "epoch": 14.000815660685156, "grad_norm": 0.05534778907895088, "learning_rate": 1.2497534339683468e-05, "loss": 0.1015, "num_input_tokens_seen": 185211504, "step": 85825 }, { "epoch": 14.00163132137031, "grad_norm": 0.26753392815589905, "learning_rate": 1.2494452492332037e-05, "loss": 0.0791, "num_input_tokens_seen": 185222320, "step": 85830 }, { "epoch": 14.002446982055465, "grad_norm": 1.2269688844680786, "learning_rate": 1.2491370898421394e-05, "loss": 0.0635, "num_input_tokens_seen": 185231856, "step": 85835 }, { "epoch": 14.00326264274062, "grad_norm": 0.1324503868818283, "learning_rate": 1.2488289558013988e-05, "loss": 0.0305, "num_input_tokens_seen": 185244176, "step": 85840 }, { "epoch": 14.004078303425775, "grad_norm": 0.32276901602745056, "learning_rate": 1.2485208471172266e-05, "loss": 0.2722, "num_input_tokens_seen": 185255088, "step": 85845 }, { "epoch": 14.00489396411093, "grad_norm": 2.654189348220825, "learning_rate": 1.2482127637958669e-05, "loss": 0.2025, "num_input_tokens_seen": 185266064, "step": 85850 }, { "epoch": 14.005709624796085, "grad_norm": 0.33801397681236267, "learning_rate": 1.2479047058435636e-05, "loss": 0.0611, "num_input_tokens_seen": 185277456, "step": 85855 }, { "epoch": 14.00652528548124, "grad_norm": 2.1331727504730225, "learning_rate": 1.2475966732665597e-05, "loss": 0.0897, "num_input_tokens_seen": 185286864, "step": 85860 }, { "epoch": 14.007340946166394, "grad_norm": 0.07336900383234024, "learning_rate": 1.2472886660710975e-05, "loss": 0.0113, "num_input_tokens_seen": 185297904, "step": 85865 }, { "epoch": 14.00815660685155, "grad_norm": 0.06420835107564926, "learning_rate": 1.2469806842634212e-05, "loss": 0.0151, "num_input_tokens_seen": 185307504, "step": 85870 }, { "epoch": 14.008972267536704, "grad_norm": 0.6921957731246948, "learning_rate": 1.2466727278497688e-05, "loss": 0.0839, "num_input_tokens_seen": 185317872, "step": 85875 }, { "epoch": 14.00978792822186, "grad_norm": 0.05328761041164398, "learning_rate": 1.2463647968363851e-05, "loss": 0.0203, "num_input_tokens_seen": 185328464, "step": 85880 }, { "epoch": 14.010603588907015, "grad_norm": 1.8411418199539185, "learning_rate": 1.2460568912295087e-05, "loss": 0.124, "num_input_tokens_seen": 185338032, "step": 85885 }, { "epoch": 14.01141924959217, "grad_norm": 0.35950610041618347, "learning_rate": 1.2457490110353804e-05, "loss": 0.1633, "num_input_tokens_seen": 185348912, "step": 85890 }, { "epoch": 14.012234910277325, "grad_norm": 0.06737524271011353, "learning_rate": 1.2454411562602391e-05, "loss": 0.1667, "num_input_tokens_seen": 185360048, "step": 85895 }, { "epoch": 14.013050570962479, "grad_norm": 0.6880989074707031, "learning_rate": 1.2451333269103246e-05, "loss": 0.1495, "num_input_tokens_seen": 185370256, "step": 85900 }, { "epoch": 14.013866231647635, "grad_norm": 0.06986191868782043, "learning_rate": 1.2448255229918751e-05, "loss": 0.1092, "num_input_tokens_seen": 185380144, "step": 85905 }, { "epoch": 14.01468189233279, "grad_norm": 0.06013830378651619, "learning_rate": 1.2445177445111287e-05, "loss": 0.0315, "num_input_tokens_seen": 185389872, "step": 85910 }, { "epoch": 14.015497553017944, "grad_norm": 0.06842593848705292, "learning_rate": 1.244209991474323e-05, "loss": 0.1237, "num_input_tokens_seen": 185400848, "step": 85915 }, { "epoch": 14.0163132137031, "grad_norm": 1.732261300086975, "learning_rate": 1.2439022638876946e-05, "loss": 0.1689, "num_input_tokens_seen": 185412592, "step": 85920 }, { "epoch": 14.017128874388254, "grad_norm": 0.0470765121281147, "learning_rate": 1.2435945617574804e-05, "loss": 0.0463, "num_input_tokens_seen": 185422832, "step": 85925 }, { "epoch": 14.01794453507341, "grad_norm": 0.03157913684844971, "learning_rate": 1.2432868850899165e-05, "loss": 0.1143, "num_input_tokens_seen": 185433744, "step": 85930 }, { "epoch": 14.018760195758565, "grad_norm": 0.09128987044095993, "learning_rate": 1.242979233891238e-05, "loss": 0.2399, "num_input_tokens_seen": 185444816, "step": 85935 }, { "epoch": 14.01957585644372, "grad_norm": 1.2516766786575317, "learning_rate": 1.2426716081676798e-05, "loss": 0.2785, "num_input_tokens_seen": 185455408, "step": 85940 }, { "epoch": 14.020391517128875, "grad_norm": 0.04557940363883972, "learning_rate": 1.2423640079254767e-05, "loss": 0.0188, "num_input_tokens_seen": 185466672, "step": 85945 }, { "epoch": 14.021207177814029, "grad_norm": 0.39645805954933167, "learning_rate": 1.2420564331708623e-05, "loss": 0.0399, "num_input_tokens_seen": 185476912, "step": 85950 }, { "epoch": 14.022022838499185, "grad_norm": 0.17977164685726166, "learning_rate": 1.2417488839100702e-05, "loss": 0.0161, "num_input_tokens_seen": 185488112, "step": 85955 }, { "epoch": 14.022838499184338, "grad_norm": 0.03547930717468262, "learning_rate": 1.241441360149333e-05, "loss": 0.0613, "num_input_tokens_seen": 185498384, "step": 85960 }, { "epoch": 14.023654159869494, "grad_norm": 0.72148597240448, "learning_rate": 1.2411338618948835e-05, "loss": 0.1637, "num_input_tokens_seen": 185509456, "step": 85965 }, { "epoch": 14.02446982055465, "grad_norm": 0.8423939347267151, "learning_rate": 1.2408263891529531e-05, "loss": 0.067, "num_input_tokens_seen": 185521104, "step": 85970 }, { "epoch": 14.025285481239804, "grad_norm": 1.111926794052124, "learning_rate": 1.2405189419297734e-05, "loss": 0.0432, "num_input_tokens_seen": 185530608, "step": 85975 }, { "epoch": 14.02610114192496, "grad_norm": 0.24997617304325104, "learning_rate": 1.2402115202315748e-05, "loss": 0.0783, "num_input_tokens_seen": 185541328, "step": 85980 }, { "epoch": 14.026916802610113, "grad_norm": 3.095691204071045, "learning_rate": 1.2399041240645874e-05, "loss": 0.1235, "num_input_tokens_seen": 185551568, "step": 85985 }, { "epoch": 14.02773246329527, "grad_norm": 0.044675327837467194, "learning_rate": 1.2395967534350427e-05, "loss": 0.0393, "num_input_tokens_seen": 185562736, "step": 85990 }, { "epoch": 14.028548123980425, "grad_norm": 1.8568843603134155, "learning_rate": 1.2392894083491673e-05, "loss": 0.0722, "num_input_tokens_seen": 185573008, "step": 85995 }, { "epoch": 14.029363784665579, "grad_norm": 2.068256139755249, "learning_rate": 1.2389820888131934e-05, "loss": 0.2568, "num_input_tokens_seen": 185583088, "step": 86000 }, { "epoch": 14.030179445350734, "grad_norm": 0.2515891492366791, "learning_rate": 1.2386747948333453e-05, "loss": 0.078, "num_input_tokens_seen": 185593680, "step": 86005 }, { "epoch": 14.030995106035888, "grad_norm": 0.12663856148719788, "learning_rate": 1.2383675264158532e-05, "loss": 0.1493, "num_input_tokens_seen": 185603952, "step": 86010 }, { "epoch": 14.031810766721044, "grad_norm": 0.04938655346632004, "learning_rate": 1.2380602835669438e-05, "loss": 0.1385, "num_input_tokens_seen": 185613968, "step": 86015 }, { "epoch": 14.0326264274062, "grad_norm": 0.6365146636962891, "learning_rate": 1.2377530662928435e-05, "loss": 0.1506, "num_input_tokens_seen": 185624112, "step": 86020 }, { "epoch": 14.033442088091354, "grad_norm": 1.2023241519927979, "learning_rate": 1.2374458745997788e-05, "loss": 0.0442, "num_input_tokens_seen": 185633200, "step": 86025 }, { "epoch": 14.03425774877651, "grad_norm": 0.054083794355392456, "learning_rate": 1.237138708493975e-05, "loss": 0.0284, "num_input_tokens_seen": 185643664, "step": 86030 }, { "epoch": 14.035073409461663, "grad_norm": 0.0529869869351387, "learning_rate": 1.2368315679816576e-05, "loss": 0.1597, "num_input_tokens_seen": 185654000, "step": 86035 }, { "epoch": 14.035889070146819, "grad_norm": 0.950647234916687, "learning_rate": 1.2365244530690506e-05, "loss": 0.1715, "num_input_tokens_seen": 185664336, "step": 86040 }, { "epoch": 14.036704730831975, "grad_norm": 0.03830430656671524, "learning_rate": 1.2362173637623783e-05, "loss": 0.1578, "num_input_tokens_seen": 185675088, "step": 86045 }, { "epoch": 14.037520391517129, "grad_norm": 1.4660491943359375, "learning_rate": 1.2359103000678646e-05, "loss": 0.1577, "num_input_tokens_seen": 185685936, "step": 86050 }, { "epoch": 14.038336052202284, "grad_norm": 0.16433016955852509, "learning_rate": 1.2356032619917323e-05, "loss": 0.0204, "num_input_tokens_seen": 185697552, "step": 86055 }, { "epoch": 14.039151712887438, "grad_norm": 0.4348747432231903, "learning_rate": 1.2352962495402037e-05, "loss": 0.0931, "num_input_tokens_seen": 185708656, "step": 86060 }, { "epoch": 14.039967373572594, "grad_norm": 0.025978533551096916, "learning_rate": 1.2349892627195012e-05, "loss": 0.037, "num_input_tokens_seen": 185720080, "step": 86065 }, { "epoch": 14.040783034257748, "grad_norm": 0.022813959047198296, "learning_rate": 1.234682301535846e-05, "loss": 0.1151, "num_input_tokens_seen": 185731120, "step": 86070 }, { "epoch": 14.041598694942904, "grad_norm": 0.29134225845336914, "learning_rate": 1.2343753659954591e-05, "loss": 0.2007, "num_input_tokens_seen": 185741936, "step": 86075 }, { "epoch": 14.04241435562806, "grad_norm": 0.475230872631073, "learning_rate": 1.234068456104561e-05, "loss": 0.04, "num_input_tokens_seen": 185752592, "step": 86080 }, { "epoch": 14.043230016313213, "grad_norm": 0.16970589756965637, "learning_rate": 1.2337615718693715e-05, "loss": 0.057, "num_input_tokens_seen": 185763024, "step": 86085 }, { "epoch": 14.044045676998369, "grad_norm": 0.2361578345298767, "learning_rate": 1.2334547132961102e-05, "loss": 0.0494, "num_input_tokens_seen": 185774032, "step": 86090 }, { "epoch": 14.044861337683523, "grad_norm": 3.2296204566955566, "learning_rate": 1.2331478803909956e-05, "loss": 0.1497, "num_input_tokens_seen": 185785520, "step": 86095 }, { "epoch": 14.045676998368679, "grad_norm": 0.05967390537261963, "learning_rate": 1.2328410731602458e-05, "loss": 0.0137, "num_input_tokens_seen": 185795696, "step": 86100 }, { "epoch": 14.046492659053834, "grad_norm": 0.24819253385066986, "learning_rate": 1.2325342916100807e-05, "loss": 0.1058, "num_input_tokens_seen": 185806096, "step": 86105 }, { "epoch": 14.047308319738988, "grad_norm": 0.13448412716388702, "learning_rate": 1.2322275357467144e-05, "loss": 0.061, "num_input_tokens_seen": 185816688, "step": 86110 }, { "epoch": 14.048123980424144, "grad_norm": 2.3018572330474854, "learning_rate": 1.2319208055763673e-05, "loss": 0.1229, "num_input_tokens_seen": 185827408, "step": 86115 }, { "epoch": 14.048939641109298, "grad_norm": 0.16993531584739685, "learning_rate": 1.2316141011052516e-05, "loss": 0.1139, "num_input_tokens_seen": 185837712, "step": 86120 }, { "epoch": 14.049755301794454, "grad_norm": 0.12201175838708878, "learning_rate": 1.2313074223395873e-05, "loss": 0.0185, "num_input_tokens_seen": 185848336, "step": 86125 }, { "epoch": 14.05057096247961, "grad_norm": 0.08785099536180496, "learning_rate": 1.2310007692855854e-05, "loss": 0.0274, "num_input_tokens_seen": 185858096, "step": 86130 }, { "epoch": 14.051386623164763, "grad_norm": 2.198265552520752, "learning_rate": 1.230694141949465e-05, "loss": 0.1588, "num_input_tokens_seen": 185868528, "step": 86135 }, { "epoch": 14.052202283849919, "grad_norm": 1.4042457342147827, "learning_rate": 1.230387540337436e-05, "loss": 0.0773, "num_input_tokens_seen": 185879952, "step": 86140 }, { "epoch": 14.053017944535073, "grad_norm": 1.988681674003601, "learning_rate": 1.230080964455716e-05, "loss": 0.1201, "num_input_tokens_seen": 185891920, "step": 86145 }, { "epoch": 14.053833605220229, "grad_norm": 1.4778586626052856, "learning_rate": 1.2297744143105142e-05, "loss": 0.223, "num_input_tokens_seen": 185901936, "step": 86150 }, { "epoch": 14.054649265905383, "grad_norm": 0.16151189804077148, "learning_rate": 1.2294678899080465e-05, "loss": 0.1686, "num_input_tokens_seen": 185913104, "step": 86155 }, { "epoch": 14.055464926590538, "grad_norm": 0.1722130924463272, "learning_rate": 1.2291613912545235e-05, "loss": 0.0074, "num_input_tokens_seen": 185924496, "step": 86160 }, { "epoch": 14.056280587275694, "grad_norm": 2.168968677520752, "learning_rate": 1.2288549183561574e-05, "loss": 0.1481, "num_input_tokens_seen": 185936048, "step": 86165 }, { "epoch": 14.057096247960848, "grad_norm": 0.4028603136539459, "learning_rate": 1.2285484712191586e-05, "loss": 0.0383, "num_input_tokens_seen": 185947248, "step": 86170 }, { "epoch": 14.057911908646004, "grad_norm": 0.5724915862083435, "learning_rate": 1.2282420498497383e-05, "loss": 0.1364, "num_input_tokens_seen": 185958160, "step": 86175 }, { "epoch": 14.058727569331158, "grad_norm": 0.5200060606002808, "learning_rate": 1.227935654254106e-05, "loss": 0.1321, "num_input_tokens_seen": 185968816, "step": 86180 }, { "epoch": 14.059543230016313, "grad_norm": 1.3448700904846191, "learning_rate": 1.2276292844384715e-05, "loss": 0.0994, "num_input_tokens_seen": 185980144, "step": 86185 }, { "epoch": 14.060358890701469, "grad_norm": 0.3006497323513031, "learning_rate": 1.2273229404090436e-05, "loss": 0.0888, "num_input_tokens_seen": 185991696, "step": 86190 }, { "epoch": 14.061174551386623, "grad_norm": 2.2827510833740234, "learning_rate": 1.2270166221720308e-05, "loss": 0.1431, "num_input_tokens_seen": 186001520, "step": 86195 }, { "epoch": 14.061990212071779, "grad_norm": 0.1925009787082672, "learning_rate": 1.226710329733641e-05, "loss": 0.1119, "num_input_tokens_seen": 186011728, "step": 86200 }, { "epoch": 14.062805872756933, "grad_norm": 2.381734609603882, "learning_rate": 1.2264040631000808e-05, "loss": 0.3187, "num_input_tokens_seen": 186022832, "step": 86205 }, { "epoch": 14.063621533442088, "grad_norm": 2.1300039291381836, "learning_rate": 1.22609782227756e-05, "loss": 0.1185, "num_input_tokens_seen": 186033584, "step": 86210 }, { "epoch": 14.064437194127244, "grad_norm": 0.5799909234046936, "learning_rate": 1.2257916072722809e-05, "loss": 0.1568, "num_input_tokens_seen": 186044976, "step": 86215 }, { "epoch": 14.065252854812398, "grad_norm": 0.3046317398548126, "learning_rate": 1.2254854180904529e-05, "loss": 0.0617, "num_input_tokens_seen": 186056688, "step": 86220 }, { "epoch": 14.066068515497554, "grad_norm": 0.4698726236820221, "learning_rate": 1.2251792547382782e-05, "loss": 0.0374, "num_input_tokens_seen": 186068208, "step": 86225 }, { "epoch": 14.066884176182707, "grad_norm": 0.3113492429256439, "learning_rate": 1.2248731172219648e-05, "loss": 0.0155, "num_input_tokens_seen": 186080784, "step": 86230 }, { "epoch": 14.067699836867863, "grad_norm": 0.04595450684428215, "learning_rate": 1.2245670055477134e-05, "loss": 0.0174, "num_input_tokens_seen": 186092816, "step": 86235 }, { "epoch": 14.068515497553017, "grad_norm": 0.04255710542201996, "learning_rate": 1.224260919721732e-05, "loss": 0.0451, "num_input_tokens_seen": 186103856, "step": 86240 }, { "epoch": 14.069331158238173, "grad_norm": 0.09395133703947067, "learning_rate": 1.2239548597502191e-05, "loss": 0.0167, "num_input_tokens_seen": 186115664, "step": 86245 }, { "epoch": 14.070146818923329, "grad_norm": 0.1017075851559639, "learning_rate": 1.2236488256393816e-05, "loss": 0.034, "num_input_tokens_seen": 186125488, "step": 86250 }, { "epoch": 14.070962479608482, "grad_norm": 0.09208498150110245, "learning_rate": 1.2233428173954181e-05, "loss": 0.068, "num_input_tokens_seen": 186137424, "step": 86255 }, { "epoch": 14.071778140293638, "grad_norm": 0.8326020836830139, "learning_rate": 1.2230368350245338e-05, "loss": 0.156, "num_input_tokens_seen": 186147920, "step": 86260 }, { "epoch": 14.072593800978792, "grad_norm": 0.07726694643497467, "learning_rate": 1.2227308785329261e-05, "loss": 0.0353, "num_input_tokens_seen": 186158192, "step": 86265 }, { "epoch": 14.073409461663948, "grad_norm": 0.13169196248054504, "learning_rate": 1.2224249479267994e-05, "loss": 0.1095, "num_input_tokens_seen": 186168912, "step": 86270 }, { "epoch": 14.074225122349104, "grad_norm": 0.12611913681030273, "learning_rate": 1.2221190432123499e-05, "loss": 0.1149, "num_input_tokens_seen": 186179024, "step": 86275 }, { "epoch": 14.075040783034257, "grad_norm": 0.2476791888475418, "learning_rate": 1.2218131643957812e-05, "loss": 0.0389, "num_input_tokens_seen": 186189136, "step": 86280 }, { "epoch": 14.075856443719413, "grad_norm": 0.15177002549171448, "learning_rate": 1.2215073114832879e-05, "loss": 0.0228, "num_input_tokens_seen": 186200112, "step": 86285 }, { "epoch": 14.076672104404567, "grad_norm": 0.8638579249382019, "learning_rate": 1.2212014844810728e-05, "loss": 0.1031, "num_input_tokens_seen": 186209264, "step": 86290 }, { "epoch": 14.077487765089723, "grad_norm": 0.04598957672715187, "learning_rate": 1.2208956833953302e-05, "loss": 0.0558, "num_input_tokens_seen": 186220624, "step": 86295 }, { "epoch": 14.078303425774878, "grad_norm": 0.07466243207454681, "learning_rate": 1.2205899082322602e-05, "loss": 0.081, "num_input_tokens_seen": 186231536, "step": 86300 }, { "epoch": 14.079119086460032, "grad_norm": 0.04235163703560829, "learning_rate": 1.2202841589980587e-05, "loss": 0.1458, "num_input_tokens_seen": 186242192, "step": 86305 }, { "epoch": 14.079934747145188, "grad_norm": 0.06874396651983261, "learning_rate": 1.2199784356989221e-05, "loss": 0.0959, "num_input_tokens_seen": 186253904, "step": 86310 }, { "epoch": 14.080750407830342, "grad_norm": 0.19137123227119446, "learning_rate": 1.2196727383410463e-05, "loss": 0.0979, "num_input_tokens_seen": 186266000, "step": 86315 }, { "epoch": 14.081566068515498, "grad_norm": 0.09747853875160217, "learning_rate": 1.2193670669306268e-05, "loss": 0.1756, "num_input_tokens_seen": 186275760, "step": 86320 }, { "epoch": 14.082381729200652, "grad_norm": 0.3939719796180725, "learning_rate": 1.2190614214738583e-05, "loss": 0.1427, "num_input_tokens_seen": 186285936, "step": 86325 }, { "epoch": 14.083197389885807, "grad_norm": 0.15568220615386963, "learning_rate": 1.218755801976935e-05, "loss": 0.0107, "num_input_tokens_seen": 186296144, "step": 86330 }, { "epoch": 14.084013050570963, "grad_norm": 1.2860736846923828, "learning_rate": 1.2184502084460505e-05, "loss": 0.2691, "num_input_tokens_seen": 186307792, "step": 86335 }, { "epoch": 14.084828711256117, "grad_norm": 0.6645366549491882, "learning_rate": 1.2181446408873987e-05, "loss": 0.0637, "num_input_tokens_seen": 186319760, "step": 86340 }, { "epoch": 14.085644371941273, "grad_norm": 0.15707840025424957, "learning_rate": 1.2178390993071717e-05, "loss": 0.0536, "num_input_tokens_seen": 186331248, "step": 86345 }, { "epoch": 14.086460032626427, "grad_norm": 0.031009603291749954, "learning_rate": 1.2175335837115609e-05, "loss": 0.0203, "num_input_tokens_seen": 186341392, "step": 86350 }, { "epoch": 14.087275693311582, "grad_norm": 0.07661430537700653, "learning_rate": 1.217228094106761e-05, "loss": 0.0242, "num_input_tokens_seen": 186352528, "step": 86355 }, { "epoch": 14.088091353996738, "grad_norm": 0.21711395680904388, "learning_rate": 1.216922630498959e-05, "loss": 0.1348, "num_input_tokens_seen": 186364816, "step": 86360 }, { "epoch": 14.088907014681892, "grad_norm": 1.9280811548233032, "learning_rate": 1.2166171928943496e-05, "loss": 0.3468, "num_input_tokens_seen": 186376304, "step": 86365 }, { "epoch": 14.089722675367048, "grad_norm": 0.12963992357254028, "learning_rate": 1.216311781299119e-05, "loss": 0.0504, "num_input_tokens_seen": 186387152, "step": 86370 }, { "epoch": 14.090538336052202, "grad_norm": 0.25804126262664795, "learning_rate": 1.2160063957194606e-05, "loss": 0.0804, "num_input_tokens_seen": 186398096, "step": 86375 }, { "epoch": 14.091353996737357, "grad_norm": 2.998006582260132, "learning_rate": 1.2157010361615592e-05, "loss": 0.1127, "num_input_tokens_seen": 186408720, "step": 86380 }, { "epoch": 14.092169657422513, "grad_norm": 0.1990167796611786, "learning_rate": 1.2153957026316079e-05, "loss": 0.0225, "num_input_tokens_seen": 186420528, "step": 86385 }, { "epoch": 14.092985318107667, "grad_norm": 0.0760483369231224, "learning_rate": 1.21509039513579e-05, "loss": 0.0738, "num_input_tokens_seen": 186430512, "step": 86390 }, { "epoch": 14.093800978792823, "grad_norm": 2.8302161693573, "learning_rate": 1.2147851136802977e-05, "loss": 0.2816, "num_input_tokens_seen": 186442224, "step": 86395 }, { "epoch": 14.094616639477977, "grad_norm": 0.08145179599523544, "learning_rate": 1.2144798582713132e-05, "loss": 0.0102, "num_input_tokens_seen": 186451888, "step": 86400 }, { "epoch": 14.095432300163132, "grad_norm": 0.15571938455104828, "learning_rate": 1.2141746289150272e-05, "loss": 0.098, "num_input_tokens_seen": 186462576, "step": 86405 }, { "epoch": 14.096247960848286, "grad_norm": 0.6361894011497498, "learning_rate": 1.2138694256176214e-05, "loss": 0.1425, "num_input_tokens_seen": 186472144, "step": 86410 }, { "epoch": 14.097063621533442, "grad_norm": 0.16864241659641266, "learning_rate": 1.2135642483852844e-05, "loss": 0.0417, "num_input_tokens_seen": 186483696, "step": 86415 }, { "epoch": 14.097879282218598, "grad_norm": 0.07793223112821579, "learning_rate": 1.2132590972241997e-05, "loss": 0.0538, "num_input_tokens_seen": 186495120, "step": 86420 }, { "epoch": 14.098694942903752, "grad_norm": 2.4249861240386963, "learning_rate": 1.212953972140552e-05, "loss": 0.0766, "num_input_tokens_seen": 186505520, "step": 86425 }, { "epoch": 14.099510603588907, "grad_norm": 2.378903388977051, "learning_rate": 1.2126488731405247e-05, "loss": 0.324, "num_input_tokens_seen": 186516944, "step": 86430 }, { "epoch": 14.100326264274061, "grad_norm": 0.3186088502407074, "learning_rate": 1.2123438002303012e-05, "loss": 0.1188, "num_input_tokens_seen": 186526224, "step": 86435 }, { "epoch": 14.101141924959217, "grad_norm": 0.08994770050048828, "learning_rate": 1.2120387534160638e-05, "loss": 0.0172, "num_input_tokens_seen": 186535792, "step": 86440 }, { "epoch": 14.101957585644373, "grad_norm": 1.2480225563049316, "learning_rate": 1.211733732703995e-05, "loss": 0.192, "num_input_tokens_seen": 186546096, "step": 86445 }, { "epoch": 14.102773246329527, "grad_norm": 0.5002581477165222, "learning_rate": 1.2114287381002768e-05, "loss": 0.0542, "num_input_tokens_seen": 186557616, "step": 86450 }, { "epoch": 14.103588907014682, "grad_norm": 0.031930118799209595, "learning_rate": 1.2111237696110895e-05, "loss": 0.1108, "num_input_tokens_seen": 186568368, "step": 86455 }, { "epoch": 14.104404567699836, "grad_norm": 0.5889196395874023, "learning_rate": 1.210818827242614e-05, "loss": 0.2168, "num_input_tokens_seen": 186578640, "step": 86460 }, { "epoch": 14.105220228384992, "grad_norm": 0.5459994673728943, "learning_rate": 1.2105139110010309e-05, "loss": 0.1588, "num_input_tokens_seen": 186588752, "step": 86465 }, { "epoch": 14.106035889070148, "grad_norm": 0.11660180985927582, "learning_rate": 1.2102090208925188e-05, "loss": 0.0555, "num_input_tokens_seen": 186599504, "step": 86470 }, { "epoch": 14.106851549755302, "grad_norm": 0.8909913897514343, "learning_rate": 1.2099041569232572e-05, "loss": 0.1382, "num_input_tokens_seen": 186610096, "step": 86475 }, { "epoch": 14.107667210440457, "grad_norm": 1.1620404720306396, "learning_rate": 1.2095993190994246e-05, "loss": 0.0184, "num_input_tokens_seen": 186620464, "step": 86480 }, { "epoch": 14.108482871125611, "grad_norm": 0.045560285449028015, "learning_rate": 1.2092945074271986e-05, "loss": 0.1288, "num_input_tokens_seen": 186632208, "step": 86485 }, { "epoch": 14.109298531810767, "grad_norm": 0.17658840119838715, "learning_rate": 1.2089897219127567e-05, "loss": 0.0259, "num_input_tokens_seen": 186643344, "step": 86490 }, { "epoch": 14.11011419249592, "grad_norm": 1.4651566743850708, "learning_rate": 1.2086849625622751e-05, "loss": 0.079, "num_input_tokens_seen": 186655248, "step": 86495 }, { "epoch": 14.110929853181077, "grad_norm": 0.0274799894541502, "learning_rate": 1.2083802293819327e-05, "loss": 0.0588, "num_input_tokens_seen": 186665776, "step": 86500 }, { "epoch": 14.111745513866232, "grad_norm": 0.02783040702342987, "learning_rate": 1.2080755223779017e-05, "loss": 0.0146, "num_input_tokens_seen": 186675024, "step": 86505 }, { "epoch": 14.112561174551386, "grad_norm": 0.5181410908699036, "learning_rate": 1.207770841556361e-05, "loss": 0.1743, "num_input_tokens_seen": 186685648, "step": 86510 }, { "epoch": 14.113376835236542, "grad_norm": 0.5838316679000854, "learning_rate": 1.2074661869234816e-05, "loss": 0.0674, "num_input_tokens_seen": 186697424, "step": 86515 }, { "epoch": 14.114192495921696, "grad_norm": 0.0403931625187397, "learning_rate": 1.2071615584854404e-05, "loss": 0.1711, "num_input_tokens_seen": 186707760, "step": 86520 }, { "epoch": 14.115008156606851, "grad_norm": 0.08949896693229675, "learning_rate": 1.2068569562484106e-05, "loss": 0.0224, "num_input_tokens_seen": 186717392, "step": 86525 }, { "epoch": 14.115823817292007, "grad_norm": 0.8094066977500916, "learning_rate": 1.2065523802185647e-05, "loss": 0.189, "num_input_tokens_seen": 186728496, "step": 86530 }, { "epoch": 14.116639477977161, "grad_norm": 1.191061019897461, "learning_rate": 1.206247830402076e-05, "loss": 0.1222, "num_input_tokens_seen": 186740592, "step": 86535 }, { "epoch": 14.117455138662317, "grad_norm": 0.06904874742031097, "learning_rate": 1.205943306805116e-05, "loss": 0.2401, "num_input_tokens_seen": 186751152, "step": 86540 }, { "epoch": 14.11827079934747, "grad_norm": 0.5437315702438354, "learning_rate": 1.2056388094338566e-05, "loss": 0.1236, "num_input_tokens_seen": 186762064, "step": 86545 }, { "epoch": 14.119086460032626, "grad_norm": 0.11863268911838531, "learning_rate": 1.205334338294469e-05, "loss": 0.1327, "num_input_tokens_seen": 186773808, "step": 86550 }, { "epoch": 14.119902120717782, "grad_norm": 0.04884420707821846, "learning_rate": 1.2050298933931231e-05, "loss": 0.0829, "num_input_tokens_seen": 186783792, "step": 86555 }, { "epoch": 14.120717781402936, "grad_norm": 0.037149395793676376, "learning_rate": 1.2047254747359893e-05, "loss": 0.1575, "num_input_tokens_seen": 186793072, "step": 86560 }, { "epoch": 14.121533442088092, "grad_norm": 2.4466989040374756, "learning_rate": 1.204421082329237e-05, "loss": 0.0863, "num_input_tokens_seen": 186803632, "step": 86565 }, { "epoch": 14.122349102773246, "grad_norm": 1.368826150894165, "learning_rate": 1.2041167161790348e-05, "loss": 0.1306, "num_input_tokens_seen": 186815536, "step": 86570 }, { "epoch": 14.123164763458401, "grad_norm": 0.877902090549469, "learning_rate": 1.2038123762915515e-05, "loss": 0.0443, "num_input_tokens_seen": 186826384, "step": 86575 }, { "epoch": 14.123980424143557, "grad_norm": 1.2561945915222168, "learning_rate": 1.2035080626729547e-05, "loss": 0.0599, "num_input_tokens_seen": 186837616, "step": 86580 }, { "epoch": 14.124796084828711, "grad_norm": 1.6790515184402466, "learning_rate": 1.2032037753294117e-05, "loss": 0.1407, "num_input_tokens_seen": 186847632, "step": 86585 }, { "epoch": 14.125611745513867, "grad_norm": 1.7376115322113037, "learning_rate": 1.2028995142670893e-05, "loss": 0.1204, "num_input_tokens_seen": 186858384, "step": 86590 }, { "epoch": 14.12642740619902, "grad_norm": 0.8102809190750122, "learning_rate": 1.2025952794921539e-05, "loss": 0.1018, "num_input_tokens_seen": 186868624, "step": 86595 }, { "epoch": 14.127243066884176, "grad_norm": 0.05206513777375221, "learning_rate": 1.2022910710107707e-05, "loss": 0.0961, "num_input_tokens_seen": 186879056, "step": 86600 }, { "epoch": 14.12805872756933, "grad_norm": 0.05985773354768753, "learning_rate": 1.2019868888291055e-05, "loss": 0.0123, "num_input_tokens_seen": 186889904, "step": 86605 }, { "epoch": 14.128874388254486, "grad_norm": 0.06402549892663956, "learning_rate": 1.2016827329533225e-05, "loss": 0.0815, "num_input_tokens_seen": 186899696, "step": 86610 }, { "epoch": 14.129690048939642, "grad_norm": 2.2946856021881104, "learning_rate": 1.201378603389586e-05, "loss": 0.2388, "num_input_tokens_seen": 186910384, "step": 86615 }, { "epoch": 14.130505709624796, "grad_norm": 0.051668960601091385, "learning_rate": 1.2010745001440593e-05, "loss": 0.0196, "num_input_tokens_seen": 186921392, "step": 86620 }, { "epoch": 14.131321370309951, "grad_norm": 0.037168294191360474, "learning_rate": 1.200770423222905e-05, "loss": 0.0537, "num_input_tokens_seen": 186932880, "step": 86625 }, { "epoch": 14.132137030995105, "grad_norm": 0.4775707423686981, "learning_rate": 1.200466372632287e-05, "loss": 0.2041, "num_input_tokens_seen": 186942064, "step": 86630 }, { "epoch": 14.132952691680261, "grad_norm": 0.08870147913694382, "learning_rate": 1.2001623483783667e-05, "loss": 0.0165, "num_input_tokens_seen": 186951408, "step": 86635 }, { "epoch": 14.133768352365417, "grad_norm": 0.0885474905371666, "learning_rate": 1.1998583504673052e-05, "loss": 0.0103, "num_input_tokens_seen": 186963760, "step": 86640 }, { "epoch": 14.13458401305057, "grad_norm": 0.5829237699508667, "learning_rate": 1.1995543789052637e-05, "loss": 0.0775, "num_input_tokens_seen": 186974544, "step": 86645 }, { "epoch": 14.135399673735726, "grad_norm": 0.12936869263648987, "learning_rate": 1.1992504336984025e-05, "loss": 0.0863, "num_input_tokens_seen": 186985328, "step": 86650 }, { "epoch": 14.13621533442088, "grad_norm": 1.4203609228134155, "learning_rate": 1.1989465148528813e-05, "loss": 0.1064, "num_input_tokens_seen": 186995504, "step": 86655 }, { "epoch": 14.137030995106036, "grad_norm": 0.8769461512565613, "learning_rate": 1.1986426223748595e-05, "loss": 0.0398, "num_input_tokens_seen": 187006544, "step": 86660 }, { "epoch": 14.137846655791192, "grad_norm": 0.02797192893922329, "learning_rate": 1.1983387562704958e-05, "loss": 0.2484, "num_input_tokens_seen": 187017328, "step": 86665 }, { "epoch": 14.138662316476346, "grad_norm": 0.15694886445999146, "learning_rate": 1.1980349165459487e-05, "loss": 0.0232, "num_input_tokens_seen": 187028464, "step": 86670 }, { "epoch": 14.139477977161501, "grad_norm": 0.43000081181526184, "learning_rate": 1.1977311032073754e-05, "loss": 0.1091, "num_input_tokens_seen": 187038736, "step": 86675 }, { "epoch": 14.140293637846655, "grad_norm": 0.32923364639282227, "learning_rate": 1.1974273162609334e-05, "loss": 0.1091, "num_input_tokens_seen": 187050000, "step": 86680 }, { "epoch": 14.141109298531811, "grad_norm": 1.455224633216858, "learning_rate": 1.1971235557127794e-05, "loss": 0.2721, "num_input_tokens_seen": 187061040, "step": 86685 }, { "epoch": 14.141924959216965, "grad_norm": 0.5278072953224182, "learning_rate": 1.1968198215690694e-05, "loss": 0.0521, "num_input_tokens_seen": 187071728, "step": 86690 }, { "epoch": 14.14274061990212, "grad_norm": 0.1008075624704361, "learning_rate": 1.1965161138359588e-05, "loss": 0.0426, "num_input_tokens_seen": 187083440, "step": 86695 }, { "epoch": 14.143556280587276, "grad_norm": 1.2073266506195068, "learning_rate": 1.1962124325196026e-05, "loss": 0.0324, "num_input_tokens_seen": 187093872, "step": 86700 }, { "epoch": 14.14437194127243, "grad_norm": 0.03197600319981575, "learning_rate": 1.1959087776261554e-05, "loss": 0.046, "num_input_tokens_seen": 187104208, "step": 86705 }, { "epoch": 14.145187601957586, "grad_norm": 0.8690775036811829, "learning_rate": 1.1956051491617714e-05, "loss": 0.1869, "num_input_tokens_seen": 187114960, "step": 86710 }, { "epoch": 14.14600326264274, "grad_norm": 0.3726750314235687, "learning_rate": 1.1953015471326037e-05, "loss": 0.0464, "num_input_tokens_seen": 187125200, "step": 86715 }, { "epoch": 14.146818923327896, "grad_norm": 1.913069725036621, "learning_rate": 1.1949979715448054e-05, "loss": 0.1863, "num_input_tokens_seen": 187135248, "step": 86720 }, { "epoch": 14.147634584013051, "grad_norm": 0.398299902677536, "learning_rate": 1.1946944224045286e-05, "loss": 0.1258, "num_input_tokens_seen": 187145904, "step": 86725 }, { "epoch": 14.148450244698205, "grad_norm": 0.5584713816642761, "learning_rate": 1.1943908997179244e-05, "loss": 0.0459, "num_input_tokens_seen": 187155504, "step": 86730 }, { "epoch": 14.149265905383361, "grad_norm": 0.6124641299247742, "learning_rate": 1.1940874034911464e-05, "loss": 0.1859, "num_input_tokens_seen": 187167408, "step": 86735 }, { "epoch": 14.150081566068515, "grad_norm": 0.8595650792121887, "learning_rate": 1.1937839337303425e-05, "loss": 0.0471, "num_input_tokens_seen": 187178064, "step": 86740 }, { "epoch": 14.15089722675367, "grad_norm": 3.231276750564575, "learning_rate": 1.1934804904416657e-05, "loss": 0.2339, "num_input_tokens_seen": 187188976, "step": 86745 }, { "epoch": 14.151712887438826, "grad_norm": 0.44735631346702576, "learning_rate": 1.1931770736312625e-05, "loss": 0.0158, "num_input_tokens_seen": 187199312, "step": 86750 }, { "epoch": 14.15252854812398, "grad_norm": 1.2700337171554565, "learning_rate": 1.1928736833052856e-05, "loss": 0.0429, "num_input_tokens_seen": 187208816, "step": 86755 }, { "epoch": 14.153344208809136, "grad_norm": 4.237532138824463, "learning_rate": 1.1925703194698798e-05, "loss": 0.1412, "num_input_tokens_seen": 187218672, "step": 86760 }, { "epoch": 14.15415986949429, "grad_norm": 0.056564077734947205, "learning_rate": 1.1922669821311969e-05, "loss": 0.046, "num_input_tokens_seen": 187230096, "step": 86765 }, { "epoch": 14.154975530179446, "grad_norm": 1.9281471967697144, "learning_rate": 1.1919636712953809e-05, "loss": 0.1677, "num_input_tokens_seen": 187241264, "step": 86770 }, { "epoch": 14.1557911908646, "grad_norm": 0.8558056950569153, "learning_rate": 1.191660386968581e-05, "loss": 0.1046, "num_input_tokens_seen": 187250192, "step": 86775 }, { "epoch": 14.156606851549755, "grad_norm": 0.06863926351070404, "learning_rate": 1.1913571291569436e-05, "loss": 0.1586, "num_input_tokens_seen": 187260720, "step": 86780 }, { "epoch": 14.15742251223491, "grad_norm": 0.12287648767232895, "learning_rate": 1.191053897866614e-05, "loss": 0.0891, "num_input_tokens_seen": 187270096, "step": 86785 }, { "epoch": 14.158238172920065, "grad_norm": 0.694342315196991, "learning_rate": 1.1907506931037374e-05, "loss": 0.1507, "num_input_tokens_seen": 187280368, "step": 86790 }, { "epoch": 14.15905383360522, "grad_norm": 2.1371777057647705, "learning_rate": 1.190447514874459e-05, "loss": 0.1854, "num_input_tokens_seen": 187291056, "step": 86795 }, { "epoch": 14.159869494290374, "grad_norm": 0.12055977433919907, "learning_rate": 1.1901443631849231e-05, "loss": 0.1087, "num_input_tokens_seen": 187303184, "step": 86800 }, { "epoch": 14.16068515497553, "grad_norm": 0.24973231554031372, "learning_rate": 1.1898412380412733e-05, "loss": 0.1031, "num_input_tokens_seen": 187314896, "step": 86805 }, { "epoch": 14.161500815660686, "grad_norm": 0.10042745620012283, "learning_rate": 1.189538139449653e-05, "loss": 0.0487, "num_input_tokens_seen": 187325712, "step": 86810 }, { "epoch": 14.16231647634584, "grad_norm": 0.029203496873378754, "learning_rate": 1.1892350674162045e-05, "loss": 0.1198, "num_input_tokens_seen": 187337136, "step": 86815 }, { "epoch": 14.163132137030995, "grad_norm": 0.2731606662273407, "learning_rate": 1.1889320219470703e-05, "loss": 0.0155, "num_input_tokens_seen": 187349008, "step": 86820 }, { "epoch": 14.16394779771615, "grad_norm": 2.222336530685425, "learning_rate": 1.1886290030483919e-05, "loss": 0.0797, "num_input_tokens_seen": 187359376, "step": 86825 }, { "epoch": 14.164763458401305, "grad_norm": 0.9201524257659912, "learning_rate": 1.1883260107263103e-05, "loss": 0.1691, "num_input_tokens_seen": 187371632, "step": 86830 }, { "epoch": 14.16557911908646, "grad_norm": 0.05470957234501839, "learning_rate": 1.1880230449869661e-05, "loss": 0.0238, "num_input_tokens_seen": 187381520, "step": 86835 }, { "epoch": 14.166394779771615, "grad_norm": 0.09356965869665146, "learning_rate": 1.1877201058364993e-05, "loss": 0.0836, "num_input_tokens_seen": 187392336, "step": 86840 }, { "epoch": 14.16721044045677, "grad_norm": 0.052264995872974396, "learning_rate": 1.1874171932810483e-05, "loss": 0.0677, "num_input_tokens_seen": 187403504, "step": 86845 }, { "epoch": 14.168026101141924, "grad_norm": 0.09481468051671982, "learning_rate": 1.1871143073267548e-05, "loss": 0.1453, "num_input_tokens_seen": 187413776, "step": 86850 }, { "epoch": 14.16884176182708, "grad_norm": 0.028099246323108673, "learning_rate": 1.1868114479797538e-05, "loss": 0.0176, "num_input_tokens_seen": 187425104, "step": 86855 }, { "epoch": 14.169657422512234, "grad_norm": 1.5633093118667603, "learning_rate": 1.1865086152461862e-05, "loss": 0.1353, "num_input_tokens_seen": 187435952, "step": 86860 }, { "epoch": 14.17047308319739, "grad_norm": 0.055218975991010666, "learning_rate": 1.1862058091321862e-05, "loss": 0.111, "num_input_tokens_seen": 187446736, "step": 86865 }, { "epoch": 14.171288743882545, "grad_norm": 0.06654094904661179, "learning_rate": 1.185903029643894e-05, "loss": 0.0095, "num_input_tokens_seen": 187458128, "step": 86870 }, { "epoch": 14.1721044045677, "grad_norm": 1.203651785850525, "learning_rate": 1.1856002767874418e-05, "loss": 0.2237, "num_input_tokens_seen": 187469104, "step": 86875 }, { "epoch": 14.172920065252855, "grad_norm": 1.3908907175064087, "learning_rate": 1.1852975505689696e-05, "loss": 0.1391, "num_input_tokens_seen": 187480528, "step": 86880 }, { "epoch": 14.173735725938009, "grad_norm": 0.5504226088523865, "learning_rate": 1.1849948509946082e-05, "loss": 0.0345, "num_input_tokens_seen": 187491920, "step": 86885 }, { "epoch": 14.174551386623165, "grad_norm": 0.49372726678848267, "learning_rate": 1.1846921780704964e-05, "loss": 0.0945, "num_input_tokens_seen": 187502512, "step": 86890 }, { "epoch": 14.17536704730832, "grad_norm": 0.11704455316066742, "learning_rate": 1.1843895318027646e-05, "loss": 0.2615, "num_input_tokens_seen": 187514480, "step": 86895 }, { "epoch": 14.176182707993474, "grad_norm": 0.026629364117980003, "learning_rate": 1.1840869121975493e-05, "loss": 0.0651, "num_input_tokens_seen": 187524752, "step": 86900 }, { "epoch": 14.17699836867863, "grad_norm": 0.9135497808456421, "learning_rate": 1.1837843192609805e-05, "loss": 0.2167, "num_input_tokens_seen": 187533904, "step": 86905 }, { "epoch": 14.177814029363784, "grad_norm": 3.0955939292907715, "learning_rate": 1.183481752999194e-05, "loss": 0.232, "num_input_tokens_seen": 187545264, "step": 86910 }, { "epoch": 14.17862969004894, "grad_norm": 0.043845292180776596, "learning_rate": 1.1831792134183179e-05, "loss": 0.0472, "num_input_tokens_seen": 187556944, "step": 86915 }, { "epoch": 14.179445350734095, "grad_norm": 0.4474588632583618, "learning_rate": 1.1828767005244868e-05, "loss": 0.068, "num_input_tokens_seen": 187567728, "step": 86920 }, { "epoch": 14.18026101141925, "grad_norm": 0.5907017588615417, "learning_rate": 1.18257421432383e-05, "loss": 0.0824, "num_input_tokens_seen": 187577712, "step": 86925 }, { "epoch": 14.181076672104405, "grad_norm": 1.4749470949172974, "learning_rate": 1.182271754822478e-05, "loss": 0.0794, "num_input_tokens_seen": 187588048, "step": 86930 }, { "epoch": 14.181892332789559, "grad_norm": 0.04630337655544281, "learning_rate": 1.1819693220265604e-05, "loss": 0.1913, "num_input_tokens_seen": 187598832, "step": 86935 }, { "epoch": 14.182707993474715, "grad_norm": 0.27685144543647766, "learning_rate": 1.181666915942207e-05, "loss": 0.0693, "num_input_tokens_seen": 187610608, "step": 86940 }, { "epoch": 14.18352365415987, "grad_norm": 0.3191351294517517, "learning_rate": 1.1813645365755455e-05, "loss": 0.011, "num_input_tokens_seen": 187623024, "step": 86945 }, { "epoch": 14.184339314845024, "grad_norm": 0.048423003405332565, "learning_rate": 1.1810621839327049e-05, "loss": 0.2592, "num_input_tokens_seen": 187634800, "step": 86950 }, { "epoch": 14.18515497553018, "grad_norm": 2.400428056716919, "learning_rate": 1.180759858019812e-05, "loss": 0.2899, "num_input_tokens_seen": 187646480, "step": 86955 }, { "epoch": 14.185970636215334, "grad_norm": 0.06811077892780304, "learning_rate": 1.1804575588429942e-05, "loss": 0.1761, "num_input_tokens_seen": 187655408, "step": 86960 }, { "epoch": 14.18678629690049, "grad_norm": 0.8399774432182312, "learning_rate": 1.180155286408378e-05, "loss": 0.0761, "num_input_tokens_seen": 187665840, "step": 86965 }, { "epoch": 14.187601957585644, "grad_norm": 0.16359001398086548, "learning_rate": 1.1798530407220884e-05, "loss": 0.1979, "num_input_tokens_seen": 187676144, "step": 86970 }, { "epoch": 14.1884176182708, "grad_norm": 0.030730005353689194, "learning_rate": 1.1795508217902535e-05, "loss": 0.1233, "num_input_tokens_seen": 187686896, "step": 86975 }, { "epoch": 14.189233278955955, "grad_norm": 0.1647055447101593, "learning_rate": 1.1792486296189945e-05, "loss": 0.1692, "num_input_tokens_seen": 187698160, "step": 86980 }, { "epoch": 14.190048939641109, "grad_norm": 0.5589426159858704, "learning_rate": 1.1789464642144391e-05, "loss": 0.0743, "num_input_tokens_seen": 187708272, "step": 86985 }, { "epoch": 14.190864600326265, "grad_norm": 1.7327722311019897, "learning_rate": 1.1786443255827078e-05, "loss": 0.0549, "num_input_tokens_seen": 187720848, "step": 86990 }, { "epoch": 14.191680261011419, "grad_norm": 0.7830374836921692, "learning_rate": 1.1783422137299274e-05, "loss": 0.0544, "num_input_tokens_seen": 187730384, "step": 86995 }, { "epoch": 14.192495921696574, "grad_norm": 0.6283591389656067, "learning_rate": 1.1780401286622167e-05, "loss": 0.0754, "num_input_tokens_seen": 187742256, "step": 87000 }, { "epoch": 14.19331158238173, "grad_norm": 0.08744557201862335, "learning_rate": 1.1777380703857019e-05, "loss": 0.0455, "num_input_tokens_seen": 187752336, "step": 87005 }, { "epoch": 14.194127243066884, "grad_norm": 0.1035538762807846, "learning_rate": 1.1774360389065003e-05, "loss": 0.1137, "num_input_tokens_seen": 187762192, "step": 87010 }, { "epoch": 14.19494290375204, "grad_norm": 0.3061774671077728, "learning_rate": 1.1771340342307374e-05, "loss": 0.0908, "num_input_tokens_seen": 187773040, "step": 87015 }, { "epoch": 14.195758564437194, "grad_norm": 2.955080032348633, "learning_rate": 1.1768320563645293e-05, "loss": 0.1318, "num_input_tokens_seen": 187783376, "step": 87020 }, { "epoch": 14.19657422512235, "grad_norm": 0.025551388040184975, "learning_rate": 1.1765301053140001e-05, "loss": 0.0717, "num_input_tokens_seen": 187794768, "step": 87025 }, { "epoch": 14.197389885807505, "grad_norm": 0.14519861340522766, "learning_rate": 1.1762281810852654e-05, "loss": 0.0303, "num_input_tokens_seen": 187805680, "step": 87030 }, { "epoch": 14.198205546492659, "grad_norm": 2.098924160003662, "learning_rate": 1.1759262836844478e-05, "loss": 0.3326, "num_input_tokens_seen": 187816784, "step": 87035 }, { "epoch": 14.199021207177815, "grad_norm": 0.24041208624839783, "learning_rate": 1.175624413117662e-05, "loss": 0.048, "num_input_tokens_seen": 187827888, "step": 87040 }, { "epoch": 14.199836867862969, "grad_norm": 0.10116774588823318, "learning_rate": 1.1753225693910295e-05, "loss": 0.0634, "num_input_tokens_seen": 187838736, "step": 87045 }, { "epoch": 14.200652528548124, "grad_norm": 0.13137881457805634, "learning_rate": 1.1750207525106635e-05, "loss": 0.0234, "num_input_tokens_seen": 187848560, "step": 87050 }, { "epoch": 14.201468189233278, "grad_norm": 0.9265087842941284, "learning_rate": 1.1747189624826838e-05, "loss": 0.0959, "num_input_tokens_seen": 187859472, "step": 87055 }, { "epoch": 14.202283849918434, "grad_norm": 1.0638507604599, "learning_rate": 1.1744171993132055e-05, "loss": 0.0676, "num_input_tokens_seen": 187869840, "step": 87060 }, { "epoch": 14.20309951060359, "grad_norm": 0.9360893964767456, "learning_rate": 1.1741154630083445e-05, "loss": 0.0253, "num_input_tokens_seen": 187880368, "step": 87065 }, { "epoch": 14.203915171288743, "grad_norm": 0.15844029188156128, "learning_rate": 1.1738137535742156e-05, "loss": 0.1013, "num_input_tokens_seen": 187892368, "step": 87070 }, { "epoch": 14.2047308319739, "grad_norm": 0.1921718418598175, "learning_rate": 1.1735120710169332e-05, "loss": 0.011, "num_input_tokens_seen": 187903344, "step": 87075 }, { "epoch": 14.205546492659053, "grad_norm": 0.09419961273670197, "learning_rate": 1.1732104153426115e-05, "loss": 0.066, "num_input_tokens_seen": 187914672, "step": 87080 }, { "epoch": 14.206362153344209, "grad_norm": 1.3079745769500732, "learning_rate": 1.172908786557364e-05, "loss": 0.0985, "num_input_tokens_seen": 187925424, "step": 87085 }, { "epoch": 14.207177814029365, "grad_norm": 0.15376435220241547, "learning_rate": 1.1726071846673035e-05, "loss": 0.0206, "num_input_tokens_seen": 187936400, "step": 87090 }, { "epoch": 14.207993474714518, "grad_norm": 0.19435736536979675, "learning_rate": 1.1723056096785421e-05, "loss": 0.0867, "num_input_tokens_seen": 187945904, "step": 87095 }, { "epoch": 14.208809135399674, "grad_norm": 0.04346093162894249, "learning_rate": 1.172004061597192e-05, "loss": 0.1176, "num_input_tokens_seen": 187956880, "step": 87100 }, { "epoch": 14.209624796084828, "grad_norm": 0.35820430517196655, "learning_rate": 1.1717025404293642e-05, "loss": 0.2147, "num_input_tokens_seen": 187967792, "step": 87105 }, { "epoch": 14.210440456769984, "grad_norm": 0.1069689467549324, "learning_rate": 1.1714010461811693e-05, "loss": 0.0688, "num_input_tokens_seen": 187979248, "step": 87110 }, { "epoch": 14.21125611745514, "grad_norm": 0.05387469008564949, "learning_rate": 1.171099578858717e-05, "loss": 0.0302, "num_input_tokens_seen": 187990480, "step": 87115 }, { "epoch": 14.212071778140293, "grad_norm": 0.8803828954696655, "learning_rate": 1.1707981384681194e-05, "loss": 0.0756, "num_input_tokens_seen": 188000752, "step": 87120 }, { "epoch": 14.21288743882545, "grad_norm": 0.09102623909711838, "learning_rate": 1.1704967250154817e-05, "loss": 0.0465, "num_input_tokens_seen": 188010992, "step": 87125 }, { "epoch": 14.213703099510603, "grad_norm": 0.07908423244953156, "learning_rate": 1.1701953385069164e-05, "loss": 0.1967, "num_input_tokens_seen": 188022448, "step": 87130 }, { "epoch": 14.214518760195759, "grad_norm": 0.05380617082118988, "learning_rate": 1.1698939789485277e-05, "loss": 0.074, "num_input_tokens_seen": 188031728, "step": 87135 }, { "epoch": 14.215334420880913, "grad_norm": 0.04203703999519348, "learning_rate": 1.1695926463464269e-05, "loss": 0.0916, "num_input_tokens_seen": 188041808, "step": 87140 }, { "epoch": 14.216150081566068, "grad_norm": 1.8643728494644165, "learning_rate": 1.1692913407067167e-05, "loss": 0.0814, "num_input_tokens_seen": 188052496, "step": 87145 }, { "epoch": 14.216965742251224, "grad_norm": 1.1083483695983887, "learning_rate": 1.1689900620355077e-05, "loss": 0.1055, "num_input_tokens_seen": 188064752, "step": 87150 }, { "epoch": 14.217781402936378, "grad_norm": 1.6768397092819214, "learning_rate": 1.1686888103389015e-05, "loss": 0.1607, "num_input_tokens_seen": 188075952, "step": 87155 }, { "epoch": 14.218597063621534, "grad_norm": 1.6417733430862427, "learning_rate": 1.1683875856230067e-05, "loss": 0.164, "num_input_tokens_seen": 188087024, "step": 87160 }, { "epoch": 14.219412724306688, "grad_norm": 0.2563534677028656, "learning_rate": 1.1680863878939263e-05, "loss": 0.0682, "num_input_tokens_seen": 188095568, "step": 87165 }, { "epoch": 14.220228384991843, "grad_norm": 2.8059306144714355, "learning_rate": 1.1677852171577653e-05, "loss": 0.179, "num_input_tokens_seen": 188106864, "step": 87170 }, { "epoch": 14.221044045676999, "grad_norm": 0.13801209628582, "learning_rate": 1.1674840734206268e-05, "loss": 0.128, "num_input_tokens_seen": 188117712, "step": 87175 }, { "epoch": 14.221859706362153, "grad_norm": 2.2987468242645264, "learning_rate": 1.167182956688614e-05, "loss": 0.1269, "num_input_tokens_seen": 188129136, "step": 87180 }, { "epoch": 14.222675367047309, "grad_norm": 0.08064496517181396, "learning_rate": 1.1668818669678294e-05, "loss": 0.2128, "num_input_tokens_seen": 188139696, "step": 87185 }, { "epoch": 14.223491027732463, "grad_norm": 0.026395121589303017, "learning_rate": 1.1665808042643748e-05, "loss": 0.0099, "num_input_tokens_seen": 188150768, "step": 87190 }, { "epoch": 14.224306688417618, "grad_norm": 1.2061985731124878, "learning_rate": 1.1662797685843519e-05, "loss": 0.1572, "num_input_tokens_seen": 188162768, "step": 87195 }, { "epoch": 14.225122349102774, "grad_norm": 0.28138405084609985, "learning_rate": 1.1659787599338612e-05, "loss": 0.0714, "num_input_tokens_seen": 188173520, "step": 87200 }, { "epoch": 14.225938009787928, "grad_norm": 0.6792731881141663, "learning_rate": 1.1656777783190035e-05, "loss": 0.0194, "num_input_tokens_seen": 188184976, "step": 87205 }, { "epoch": 14.226753670473084, "grad_norm": 1.2590155601501465, "learning_rate": 1.1653768237458784e-05, "loss": 0.0776, "num_input_tokens_seen": 188196208, "step": 87210 }, { "epoch": 14.227569331158238, "grad_norm": 0.0881214290857315, "learning_rate": 1.1650758962205849e-05, "loss": 0.0984, "num_input_tokens_seen": 188206992, "step": 87215 }, { "epoch": 14.228384991843393, "grad_norm": 0.31343692541122437, "learning_rate": 1.1647749957492218e-05, "loss": 0.0698, "num_input_tokens_seen": 188219024, "step": 87220 }, { "epoch": 14.229200652528547, "grad_norm": 0.02345779538154602, "learning_rate": 1.1644741223378874e-05, "loss": 0.1323, "num_input_tokens_seen": 188230288, "step": 87225 }, { "epoch": 14.230016313213703, "grad_norm": 1.395693302154541, "learning_rate": 1.1641732759926791e-05, "loss": 0.1582, "num_input_tokens_seen": 188241008, "step": 87230 }, { "epoch": 14.230831973898859, "grad_norm": 0.5901260375976562, "learning_rate": 1.1638724567196938e-05, "loss": 0.0278, "num_input_tokens_seen": 188252304, "step": 87235 }, { "epoch": 14.231647634584013, "grad_norm": 3.1192002296447754, "learning_rate": 1.1635716645250283e-05, "loss": 0.1624, "num_input_tokens_seen": 188263696, "step": 87240 }, { "epoch": 14.232463295269168, "grad_norm": 0.1911095380783081, "learning_rate": 1.1632708994147784e-05, "loss": 0.0333, "num_input_tokens_seen": 188273200, "step": 87245 }, { "epoch": 14.233278955954322, "grad_norm": 1.9610693454742432, "learning_rate": 1.1629701613950394e-05, "loss": 0.1301, "num_input_tokens_seen": 188284144, "step": 87250 }, { "epoch": 14.234094616639478, "grad_norm": 1.7104164361953735, "learning_rate": 1.1626694504719066e-05, "loss": 0.1215, "num_input_tokens_seen": 188294480, "step": 87255 }, { "epoch": 14.234910277324634, "grad_norm": 0.31524449586868286, "learning_rate": 1.1623687666514727e-05, "loss": 0.0294, "num_input_tokens_seen": 188305008, "step": 87260 }, { "epoch": 14.235725938009788, "grad_norm": 0.09831525385379791, "learning_rate": 1.1620681099398334e-05, "loss": 0.0975, "num_input_tokens_seen": 188315824, "step": 87265 }, { "epoch": 14.236541598694943, "grad_norm": 0.06944860517978668, "learning_rate": 1.1617674803430814e-05, "loss": 0.0291, "num_input_tokens_seen": 188327920, "step": 87270 }, { "epoch": 14.237357259380097, "grad_norm": 2.189547538757324, "learning_rate": 1.1614668778673087e-05, "loss": 0.2071, "num_input_tokens_seen": 188339760, "step": 87275 }, { "epoch": 14.238172920065253, "grad_norm": 0.20670953392982483, "learning_rate": 1.161166302518608e-05, "loss": 0.0561, "num_input_tokens_seen": 188351952, "step": 87280 }, { "epoch": 14.238988580750409, "grad_norm": 2.7629692554473877, "learning_rate": 1.160865754303071e-05, "loss": 0.1452, "num_input_tokens_seen": 188363024, "step": 87285 }, { "epoch": 14.239804241435563, "grad_norm": 0.2756122052669525, "learning_rate": 1.160565233226788e-05, "loss": 0.1079, "num_input_tokens_seen": 188374832, "step": 87290 }, { "epoch": 14.240619902120718, "grad_norm": 0.03483796864748001, "learning_rate": 1.1602647392958496e-05, "loss": 0.136, "num_input_tokens_seen": 188386224, "step": 87295 }, { "epoch": 14.241435562805872, "grad_norm": 0.7341668605804443, "learning_rate": 1.159964272516346e-05, "loss": 0.2829, "num_input_tokens_seen": 188397456, "step": 87300 }, { "epoch": 14.242251223491028, "grad_norm": 0.5114835500717163, "learning_rate": 1.1596638328943662e-05, "loss": 0.0568, "num_input_tokens_seen": 188407408, "step": 87305 }, { "epoch": 14.243066884176184, "grad_norm": 1.0044664144515991, "learning_rate": 1.1593634204359993e-05, "loss": 0.0656, "num_input_tokens_seen": 188417680, "step": 87310 }, { "epoch": 14.243882544861338, "grad_norm": 0.6095999479293823, "learning_rate": 1.1590630351473336e-05, "loss": 0.0919, "num_input_tokens_seen": 188427504, "step": 87315 }, { "epoch": 14.244698205546493, "grad_norm": 0.0599321648478508, "learning_rate": 1.1587626770344561e-05, "loss": 0.1246, "num_input_tokens_seen": 188438352, "step": 87320 }, { "epoch": 14.245513866231647, "grad_norm": 0.15533947944641113, "learning_rate": 1.158462346103455e-05, "loss": 0.0214, "num_input_tokens_seen": 188448208, "step": 87325 }, { "epoch": 14.246329526916803, "grad_norm": 0.3497253656387329, "learning_rate": 1.158162042360416e-05, "loss": 0.0395, "num_input_tokens_seen": 188459504, "step": 87330 }, { "epoch": 14.247145187601957, "grad_norm": 0.02206227369606495, "learning_rate": 1.1578617658114255e-05, "loss": 0.1244, "num_input_tokens_seen": 188469744, "step": 87335 }, { "epoch": 14.247960848287113, "grad_norm": 0.13487526774406433, "learning_rate": 1.1575615164625689e-05, "loss": 0.0098, "num_input_tokens_seen": 188480752, "step": 87340 }, { "epoch": 14.248776508972268, "grad_norm": 2.084351062774658, "learning_rate": 1.157261294319931e-05, "loss": 0.2214, "num_input_tokens_seen": 188491248, "step": 87345 }, { "epoch": 14.249592169657422, "grad_norm": 0.2078065425157547, "learning_rate": 1.1569610993895966e-05, "loss": 0.1402, "num_input_tokens_seen": 188502448, "step": 87350 }, { "epoch": 14.250407830342578, "grad_norm": 0.24589914083480835, "learning_rate": 1.1566609316776492e-05, "loss": 0.1204, "num_input_tokens_seen": 188515024, "step": 87355 }, { "epoch": 14.251223491027732, "grad_norm": 0.6315385103225708, "learning_rate": 1.1563607911901722e-05, "loss": 0.0606, "num_input_tokens_seen": 188525168, "step": 87360 }, { "epoch": 14.252039151712887, "grad_norm": 0.24967820942401886, "learning_rate": 1.1560606779332484e-05, "loss": 0.1217, "num_input_tokens_seen": 188536592, "step": 87365 }, { "epoch": 14.252854812398043, "grad_norm": 2.1827940940856934, "learning_rate": 1.1557605919129588e-05, "loss": 0.2804, "num_input_tokens_seen": 188547440, "step": 87370 }, { "epoch": 14.253670473083197, "grad_norm": 0.25967615842819214, "learning_rate": 1.155460533135388e-05, "loss": 0.1965, "num_input_tokens_seen": 188557424, "step": 87375 }, { "epoch": 14.254486133768353, "grad_norm": 0.16619275510311127, "learning_rate": 1.1551605016066133e-05, "loss": 0.077, "num_input_tokens_seen": 188567408, "step": 87380 }, { "epoch": 14.255301794453507, "grad_norm": 2.8754045963287354, "learning_rate": 1.154860497332719e-05, "loss": 0.1073, "num_input_tokens_seen": 188578288, "step": 87385 }, { "epoch": 14.256117455138662, "grad_norm": 0.023671070113778114, "learning_rate": 1.1545605203197812e-05, "loss": 0.0195, "num_input_tokens_seen": 188588464, "step": 87390 }, { "epoch": 14.256933115823816, "grad_norm": 0.023505790159106255, "learning_rate": 1.1542605705738831e-05, "loss": 0.1308, "num_input_tokens_seen": 188598736, "step": 87395 }, { "epoch": 14.257748776508972, "grad_norm": 0.1395178884267807, "learning_rate": 1.1539606481011e-05, "loss": 0.1345, "num_input_tokens_seen": 188609040, "step": 87400 }, { "epoch": 14.258564437194128, "grad_norm": 0.10392814874649048, "learning_rate": 1.1536607529075127e-05, "loss": 0.0909, "num_input_tokens_seen": 188620848, "step": 87405 }, { "epoch": 14.259380097879282, "grad_norm": 0.0469166599214077, "learning_rate": 1.1533608849991983e-05, "loss": 0.0087, "num_input_tokens_seen": 188632112, "step": 87410 }, { "epoch": 14.260195758564437, "grad_norm": 0.20792347192764282, "learning_rate": 1.153061044382234e-05, "loss": 0.0283, "num_input_tokens_seen": 188641008, "step": 87415 }, { "epoch": 14.261011419249591, "grad_norm": 2.0350818634033203, "learning_rate": 1.1527612310626962e-05, "loss": 0.0806, "num_input_tokens_seen": 188651792, "step": 87420 }, { "epoch": 14.261827079934747, "grad_norm": 0.5045346617698669, "learning_rate": 1.1524614450466612e-05, "loss": 0.0259, "num_input_tokens_seen": 188662768, "step": 87425 }, { "epoch": 14.262642740619903, "grad_norm": 0.24476122856140137, "learning_rate": 1.1521616863402044e-05, "loss": 0.0094, "num_input_tokens_seen": 188673456, "step": 87430 }, { "epoch": 14.263458401305057, "grad_norm": 0.1107553169131279, "learning_rate": 1.1518619549494009e-05, "loss": 0.104, "num_input_tokens_seen": 188684560, "step": 87435 }, { "epoch": 14.264274061990212, "grad_norm": 1.8714251518249512, "learning_rate": 1.1515622508803253e-05, "loss": 0.0885, "num_input_tokens_seen": 188694768, "step": 87440 }, { "epoch": 14.265089722675366, "grad_norm": 1.445448875427246, "learning_rate": 1.1512625741390512e-05, "loss": 0.17, "num_input_tokens_seen": 188703856, "step": 87445 }, { "epoch": 14.265905383360522, "grad_norm": 1.374929666519165, "learning_rate": 1.1509629247316519e-05, "loss": 0.1077, "num_input_tokens_seen": 188714800, "step": 87450 }, { "epoch": 14.266721044045678, "grad_norm": 1.7470216751098633, "learning_rate": 1.1506633026642005e-05, "loss": 0.0746, "num_input_tokens_seen": 188724848, "step": 87455 }, { "epoch": 14.267536704730832, "grad_norm": 1.6636532545089722, "learning_rate": 1.150363707942769e-05, "loss": 0.0514, "num_input_tokens_seen": 188735344, "step": 87460 }, { "epoch": 14.268352365415987, "grad_norm": 0.19133757054805756, "learning_rate": 1.150064140573429e-05, "loss": 0.0145, "num_input_tokens_seen": 188748208, "step": 87465 }, { "epoch": 14.269168026101141, "grad_norm": 1.7168391942977905, "learning_rate": 1.1497646005622517e-05, "loss": 0.1827, "num_input_tokens_seen": 188758576, "step": 87470 }, { "epoch": 14.269983686786297, "grad_norm": 2.907360315322876, "learning_rate": 1.1494650879153077e-05, "loss": 0.1155, "num_input_tokens_seen": 188769616, "step": 87475 }, { "epoch": 14.270799347471453, "grad_norm": 0.9137846827507019, "learning_rate": 1.1491656026386669e-05, "loss": 0.1727, "num_input_tokens_seen": 188781008, "step": 87480 }, { "epoch": 14.271615008156607, "grad_norm": 1.0560816526412964, "learning_rate": 1.148866144738398e-05, "loss": 0.0305, "num_input_tokens_seen": 188792368, "step": 87485 }, { "epoch": 14.272430668841762, "grad_norm": 0.0883934497833252, "learning_rate": 1.1485667142205724e-05, "loss": 0.1224, "num_input_tokens_seen": 188804016, "step": 87490 }, { "epoch": 14.273246329526916, "grad_norm": 0.18922223150730133, "learning_rate": 1.1482673110912551e-05, "loss": 0.2484, "num_input_tokens_seen": 188815280, "step": 87495 }, { "epoch": 14.274061990212072, "grad_norm": 3.2747416496276855, "learning_rate": 1.1479679353565173e-05, "loss": 0.0887, "num_input_tokens_seen": 188825904, "step": 87500 }, { "epoch": 14.274877650897226, "grad_norm": 0.041979748755693436, "learning_rate": 1.1476685870224227e-05, "loss": 0.3085, "num_input_tokens_seen": 188837200, "step": 87505 }, { "epoch": 14.275693311582382, "grad_norm": 0.1366555243730545, "learning_rate": 1.1473692660950413e-05, "loss": 0.1061, "num_input_tokens_seen": 188846832, "step": 87510 }, { "epoch": 14.276508972267537, "grad_norm": 1.3380593061447144, "learning_rate": 1.1470699725804363e-05, "loss": 0.1246, "num_input_tokens_seen": 188857936, "step": 87515 }, { "epoch": 14.277324632952691, "grad_norm": 0.07593437284231186, "learning_rate": 1.1467707064846762e-05, "loss": 0.0207, "num_input_tokens_seen": 188869296, "step": 87520 }, { "epoch": 14.278140293637847, "grad_norm": 1.772386908531189, "learning_rate": 1.1464714678138225e-05, "loss": 0.1906, "num_input_tokens_seen": 188880784, "step": 87525 }, { "epoch": 14.278955954323001, "grad_norm": 1.056544542312622, "learning_rate": 1.1461722565739432e-05, "loss": 0.1112, "num_input_tokens_seen": 188891984, "step": 87530 }, { "epoch": 14.279771615008157, "grad_norm": 1.4863495826721191, "learning_rate": 1.1458730727710992e-05, "loss": 0.1425, "num_input_tokens_seen": 188904208, "step": 87535 }, { "epoch": 14.280587275693312, "grad_norm": 0.22639231383800507, "learning_rate": 1.1455739164113568e-05, "loss": 0.1924, "num_input_tokens_seen": 188915792, "step": 87540 }, { "epoch": 14.281402936378466, "grad_norm": 2.3446967601776123, "learning_rate": 1.1452747875007755e-05, "loss": 0.0652, "num_input_tokens_seen": 188926320, "step": 87545 }, { "epoch": 14.282218597063622, "grad_norm": 0.5677906274795532, "learning_rate": 1.14497568604542e-05, "loss": 0.0209, "num_input_tokens_seen": 188938576, "step": 87550 }, { "epoch": 14.283034257748776, "grad_norm": 0.5592125058174133, "learning_rate": 1.1446766120513514e-05, "loss": 0.349, "num_input_tokens_seen": 188949744, "step": 87555 }, { "epoch": 14.283849918433932, "grad_norm": 3.053297996520996, "learning_rate": 1.1443775655246303e-05, "loss": 0.2757, "num_input_tokens_seen": 188959440, "step": 87560 }, { "epoch": 14.284665579119087, "grad_norm": 0.9799112677574158, "learning_rate": 1.1440785464713178e-05, "loss": 0.058, "num_input_tokens_seen": 188971440, "step": 87565 }, { "epoch": 14.285481239804241, "grad_norm": 0.21629633009433746, "learning_rate": 1.1437795548974737e-05, "loss": 0.0894, "num_input_tokens_seen": 188982960, "step": 87570 }, { "epoch": 14.286296900489397, "grad_norm": 0.8528348803520203, "learning_rate": 1.1434805908091573e-05, "loss": 0.0968, "num_input_tokens_seen": 188993072, "step": 87575 }, { "epoch": 14.28711256117455, "grad_norm": 0.6970805525779724, "learning_rate": 1.1431816542124275e-05, "loss": 0.2018, "num_input_tokens_seen": 189003920, "step": 87580 }, { "epoch": 14.287928221859707, "grad_norm": 0.15056130290031433, "learning_rate": 1.1428827451133427e-05, "loss": 0.0841, "num_input_tokens_seen": 189014640, "step": 87585 }, { "epoch": 14.28874388254486, "grad_norm": 0.06304659694433212, "learning_rate": 1.142583863517961e-05, "loss": 0.0397, "num_input_tokens_seen": 189024240, "step": 87590 }, { "epoch": 14.289559543230016, "grad_norm": 0.05613046884536743, "learning_rate": 1.142285009432339e-05, "loss": 0.0098, "num_input_tokens_seen": 189035216, "step": 87595 }, { "epoch": 14.290375203915172, "grad_norm": 0.448764830827713, "learning_rate": 1.1419861828625331e-05, "loss": 0.1281, "num_input_tokens_seen": 189045968, "step": 87600 }, { "epoch": 14.291190864600326, "grad_norm": 1.4941915273666382, "learning_rate": 1.1416873838146013e-05, "loss": 0.1649, "num_input_tokens_seen": 189056720, "step": 87605 }, { "epoch": 14.292006525285482, "grad_norm": 1.505867600440979, "learning_rate": 1.1413886122945963e-05, "loss": 0.0814, "num_input_tokens_seen": 189066672, "step": 87610 }, { "epoch": 14.292822185970635, "grad_norm": 0.9577232003211975, "learning_rate": 1.1410898683085765e-05, "loss": 0.1809, "num_input_tokens_seen": 189078096, "step": 87615 }, { "epoch": 14.293637846655791, "grad_norm": 0.041911475360393524, "learning_rate": 1.1407911518625924e-05, "loss": 0.1735, "num_input_tokens_seen": 189088880, "step": 87620 }, { "epoch": 14.294453507340947, "grad_norm": 1.109623670578003, "learning_rate": 1.1404924629627017e-05, "loss": 0.0817, "num_input_tokens_seen": 189099152, "step": 87625 }, { "epoch": 14.2952691680261, "grad_norm": 0.16849882900714874, "learning_rate": 1.1401938016149544e-05, "loss": 0.151, "num_input_tokens_seen": 189110864, "step": 87630 }, { "epoch": 14.296084828711257, "grad_norm": 0.08031546324491501, "learning_rate": 1.1398951678254063e-05, "loss": 0.0471, "num_input_tokens_seen": 189122736, "step": 87635 }, { "epoch": 14.29690048939641, "grad_norm": 0.08319980651140213, "learning_rate": 1.1395965616001062e-05, "loss": 0.03, "num_input_tokens_seen": 189133648, "step": 87640 }, { "epoch": 14.297716150081566, "grad_norm": 2.3812191486358643, "learning_rate": 1.1392979829451095e-05, "loss": 0.1269, "num_input_tokens_seen": 189144080, "step": 87645 }, { "epoch": 14.298531810766722, "grad_norm": 0.0967959314584732, "learning_rate": 1.1389994318664631e-05, "loss": 0.0144, "num_input_tokens_seen": 189155056, "step": 87650 }, { "epoch": 14.299347471451876, "grad_norm": 0.9309567213058472, "learning_rate": 1.1387009083702221e-05, "loss": 0.0698, "num_input_tokens_seen": 189166288, "step": 87655 }, { "epoch": 14.300163132137031, "grad_norm": 0.7258473634719849, "learning_rate": 1.1384024124624324e-05, "loss": 0.0298, "num_input_tokens_seen": 189177040, "step": 87660 }, { "epoch": 14.300978792822185, "grad_norm": 2.0313665866851807, "learning_rate": 1.1381039441491465e-05, "loss": 0.0592, "num_input_tokens_seen": 189188080, "step": 87665 }, { "epoch": 14.301794453507341, "grad_norm": 0.09189745038747787, "learning_rate": 1.1378055034364102e-05, "loss": 0.0467, "num_input_tokens_seen": 189199600, "step": 87670 }, { "epoch": 14.302610114192497, "grad_norm": 2.1845710277557373, "learning_rate": 1.1375070903302754e-05, "loss": 0.1466, "num_input_tokens_seen": 189208912, "step": 87675 }, { "epoch": 14.30342577487765, "grad_norm": 0.07345576584339142, "learning_rate": 1.1372087048367858e-05, "loss": 0.1283, "num_input_tokens_seen": 189219888, "step": 87680 }, { "epoch": 14.304241435562806, "grad_norm": 0.09042699635028839, "learning_rate": 1.1369103469619927e-05, "loss": 0.1575, "num_input_tokens_seen": 189229776, "step": 87685 }, { "epoch": 14.30505709624796, "grad_norm": 0.6866011023521423, "learning_rate": 1.1366120167119385e-05, "loss": 0.1497, "num_input_tokens_seen": 189241040, "step": 87690 }, { "epoch": 14.305872756933116, "grad_norm": 0.8089296221733093, "learning_rate": 1.1363137140926725e-05, "loss": 0.0754, "num_input_tokens_seen": 189251504, "step": 87695 }, { "epoch": 14.30668841761827, "grad_norm": 1.8058562278747559, "learning_rate": 1.136015439110239e-05, "loss": 0.1664, "num_input_tokens_seen": 189261808, "step": 87700 }, { "epoch": 14.307504078303426, "grad_norm": 1.8970983028411865, "learning_rate": 1.135717191770683e-05, "loss": 0.1883, "num_input_tokens_seen": 189273264, "step": 87705 }, { "epoch": 14.308319738988581, "grad_norm": 0.2843134105205536, "learning_rate": 1.1354189720800486e-05, "loss": 0.0461, "num_input_tokens_seen": 189285360, "step": 87710 }, { "epoch": 14.309135399673735, "grad_norm": 0.5257636904716492, "learning_rate": 1.13512078004438e-05, "loss": 0.0326, "num_input_tokens_seen": 189297040, "step": 87715 }, { "epoch": 14.309951060358891, "grad_norm": 0.03334127739071846, "learning_rate": 1.1348226156697203e-05, "loss": 0.0706, "num_input_tokens_seen": 189307664, "step": 87720 }, { "epoch": 14.310766721044045, "grad_norm": 2.031189441680908, "learning_rate": 1.134524478962112e-05, "loss": 0.083, "num_input_tokens_seen": 189317296, "step": 87725 }, { "epoch": 14.3115823817292, "grad_norm": 0.6846861243247986, "learning_rate": 1.1342263699275976e-05, "loss": 0.0925, "num_input_tokens_seen": 189327408, "step": 87730 }, { "epoch": 14.312398042414356, "grad_norm": 0.14247310161590576, "learning_rate": 1.1339282885722182e-05, "loss": 0.0514, "num_input_tokens_seen": 189336656, "step": 87735 }, { "epoch": 14.31321370309951, "grad_norm": 0.039365995675325394, "learning_rate": 1.1336302349020154e-05, "loss": 0.0498, "num_input_tokens_seen": 189346992, "step": 87740 }, { "epoch": 14.314029363784666, "grad_norm": 0.2040693759918213, "learning_rate": 1.1333322089230278e-05, "loss": 0.0795, "num_input_tokens_seen": 189356240, "step": 87745 }, { "epoch": 14.31484502446982, "grad_norm": 1.399673581123352, "learning_rate": 1.1330342106412989e-05, "loss": 0.1199, "num_input_tokens_seen": 189366192, "step": 87750 }, { "epoch": 14.315660685154976, "grad_norm": 0.34125807881355286, "learning_rate": 1.1327362400628638e-05, "loss": 0.0272, "num_input_tokens_seen": 189376496, "step": 87755 }, { "epoch": 14.31647634584013, "grad_norm": 0.386199027299881, "learning_rate": 1.1324382971937652e-05, "loss": 0.053, "num_input_tokens_seen": 189388016, "step": 87760 }, { "epoch": 14.317292006525285, "grad_norm": 0.04152941331267357, "learning_rate": 1.1321403820400378e-05, "loss": 0.0594, "num_input_tokens_seen": 189399472, "step": 87765 }, { "epoch": 14.318107667210441, "grad_norm": 1.9899139404296875, "learning_rate": 1.1318424946077225e-05, "loss": 0.0433, "num_input_tokens_seen": 189410640, "step": 87770 }, { "epoch": 14.318923327895595, "grad_norm": 0.05400635302066803, "learning_rate": 1.1315446349028528e-05, "loss": 0.191, "num_input_tokens_seen": 189422320, "step": 87775 }, { "epoch": 14.31973898858075, "grad_norm": 0.02246786653995514, "learning_rate": 1.1312468029314688e-05, "loss": 0.0116, "num_input_tokens_seen": 189432336, "step": 87780 }, { "epoch": 14.320554649265905, "grad_norm": 0.24386583268642426, "learning_rate": 1.1309489986996027e-05, "loss": 0.0712, "num_input_tokens_seen": 189443792, "step": 87785 }, { "epoch": 14.32137030995106, "grad_norm": 0.02947998233139515, "learning_rate": 1.1306512222132942e-05, "loss": 0.0301, "num_input_tokens_seen": 189453904, "step": 87790 }, { "epoch": 14.322185970636216, "grad_norm": 0.09610529243946075, "learning_rate": 1.1303534734785736e-05, "loss": 0.086, "num_input_tokens_seen": 189462992, "step": 87795 }, { "epoch": 14.32300163132137, "grad_norm": 0.4604610204696655, "learning_rate": 1.1300557525014787e-05, "loss": 0.0207, "num_input_tokens_seen": 189473584, "step": 87800 }, { "epoch": 14.323817292006526, "grad_norm": 0.7766291499137878, "learning_rate": 1.1297580592880417e-05, "loss": 0.0357, "num_input_tokens_seen": 189484464, "step": 87805 }, { "epoch": 14.32463295269168, "grad_norm": 1.654012680053711, "learning_rate": 1.1294603938442957e-05, "loss": 0.0482, "num_input_tokens_seen": 189495280, "step": 87810 }, { "epoch": 14.325448613376835, "grad_norm": 0.052875030785799026, "learning_rate": 1.1291627561762738e-05, "loss": 0.0781, "num_input_tokens_seen": 189506992, "step": 87815 }, { "epoch": 14.326264274061991, "grad_norm": 0.587163507938385, "learning_rate": 1.1288651462900077e-05, "loss": 0.1417, "num_input_tokens_seen": 189517808, "step": 87820 }, { "epoch": 14.327079934747145, "grad_norm": 1.3815536499023438, "learning_rate": 1.1285675641915285e-05, "loss": 0.1886, "num_input_tokens_seen": 189528688, "step": 87825 }, { "epoch": 14.3278955954323, "grad_norm": 0.012742740102112293, "learning_rate": 1.1282700098868675e-05, "loss": 0.0188, "num_input_tokens_seen": 189539600, "step": 87830 }, { "epoch": 14.328711256117455, "grad_norm": 1.2591890096664429, "learning_rate": 1.1279724833820552e-05, "loss": 0.0566, "num_input_tokens_seen": 189551152, "step": 87835 }, { "epoch": 14.32952691680261, "grad_norm": 1.6475437879562378, "learning_rate": 1.1276749846831205e-05, "loss": 0.225, "num_input_tokens_seen": 189562672, "step": 87840 }, { "epoch": 14.330342577487766, "grad_norm": 0.1605922132730484, "learning_rate": 1.1273775137960935e-05, "loss": 0.0172, "num_input_tokens_seen": 189574320, "step": 87845 }, { "epoch": 14.33115823817292, "grad_norm": 0.2513539493083954, "learning_rate": 1.1270800707270026e-05, "loss": 0.0522, "num_input_tokens_seen": 189586352, "step": 87850 }, { "epoch": 14.331973898858076, "grad_norm": 1.3098729848861694, "learning_rate": 1.1267826554818756e-05, "loss": 0.2223, "num_input_tokens_seen": 189598704, "step": 87855 }, { "epoch": 14.33278955954323, "grad_norm": 0.10032311081886292, "learning_rate": 1.12648526806674e-05, "loss": 0.2586, "num_input_tokens_seen": 189608784, "step": 87860 }, { "epoch": 14.333605220228385, "grad_norm": 0.11203352361917496, "learning_rate": 1.126187908487623e-05, "loss": 0.118, "num_input_tokens_seen": 189620240, "step": 87865 }, { "epoch": 14.33442088091354, "grad_norm": 0.8451818823814392, "learning_rate": 1.1258905767505509e-05, "loss": 0.1605, "num_input_tokens_seen": 189630640, "step": 87870 }, { "epoch": 14.335236541598695, "grad_norm": 0.7008575797080994, "learning_rate": 1.1255932728615496e-05, "loss": 0.0823, "num_input_tokens_seen": 189640880, "step": 87875 }, { "epoch": 14.33605220228385, "grad_norm": 0.023689275607466698, "learning_rate": 1.1252959968266439e-05, "loss": 0.1129, "num_input_tokens_seen": 189651632, "step": 87880 }, { "epoch": 14.336867862969005, "grad_norm": 0.19067351520061493, "learning_rate": 1.1249987486518588e-05, "loss": 0.0149, "num_input_tokens_seen": 189661392, "step": 87885 }, { "epoch": 14.33768352365416, "grad_norm": 0.15383394062519073, "learning_rate": 1.1247015283432175e-05, "loss": 0.0246, "num_input_tokens_seen": 189672688, "step": 87890 }, { "epoch": 14.338499184339314, "grad_norm": 2.607696294784546, "learning_rate": 1.124404335906746e-05, "loss": 0.0851, "num_input_tokens_seen": 189684080, "step": 87895 }, { "epoch": 14.33931484502447, "grad_norm": 0.04602544382214546, "learning_rate": 1.1241071713484641e-05, "loss": 0.1659, "num_input_tokens_seen": 189695184, "step": 87900 }, { "epoch": 14.340130505709626, "grad_norm": 0.12117324769496918, "learning_rate": 1.1238100346743969e-05, "loss": 0.017, "num_input_tokens_seen": 189706640, "step": 87905 }, { "epoch": 14.34094616639478, "grad_norm": 0.2911335825920105, "learning_rate": 1.1235129258905649e-05, "loss": 0.2751, "num_input_tokens_seen": 189718512, "step": 87910 }, { "epoch": 14.341761827079935, "grad_norm": 0.2059975564479828, "learning_rate": 1.1232158450029898e-05, "loss": 0.0703, "num_input_tokens_seen": 189728944, "step": 87915 }, { "epoch": 14.34257748776509, "grad_norm": 0.050428684800863266, "learning_rate": 1.122918792017692e-05, "loss": 0.1002, "num_input_tokens_seen": 189738960, "step": 87920 }, { "epoch": 14.343393148450245, "grad_norm": 1.2196511030197144, "learning_rate": 1.1226217669406922e-05, "loss": 0.16, "num_input_tokens_seen": 189750224, "step": 87925 }, { "epoch": 14.3442088091354, "grad_norm": 0.28679728507995605, "learning_rate": 1.1223247697780092e-05, "loss": 0.0488, "num_input_tokens_seen": 189760592, "step": 87930 }, { "epoch": 14.345024469820554, "grad_norm": 0.0809866264462471, "learning_rate": 1.1220278005356628e-05, "loss": 0.0127, "num_input_tokens_seen": 189771248, "step": 87935 }, { "epoch": 14.34584013050571, "grad_norm": 0.2610401213169098, "learning_rate": 1.1217308592196709e-05, "loss": 0.0114, "num_input_tokens_seen": 189781456, "step": 87940 }, { "epoch": 14.346655791190864, "grad_norm": 3.0333549976348877, "learning_rate": 1.1214339458360514e-05, "loss": 0.0634, "num_input_tokens_seen": 189792048, "step": 87945 }, { "epoch": 14.34747145187602, "grad_norm": 0.8951427340507507, "learning_rate": 1.1211370603908222e-05, "loss": 0.0824, "num_input_tokens_seen": 189802832, "step": 87950 }, { "epoch": 14.348287112561174, "grad_norm": 1.9165377616882324, "learning_rate": 1.1208402028899995e-05, "loss": 0.1318, "num_input_tokens_seen": 189814416, "step": 87955 }, { "epoch": 14.34910277324633, "grad_norm": 0.8329323530197144, "learning_rate": 1.1205433733395998e-05, "loss": 0.0915, "num_input_tokens_seen": 189825456, "step": 87960 }, { "epoch": 14.349918433931485, "grad_norm": 2.713345527648926, "learning_rate": 1.1202465717456384e-05, "loss": 0.1106, "num_input_tokens_seen": 189836944, "step": 87965 }, { "epoch": 14.350734094616639, "grad_norm": 0.04314349964261055, "learning_rate": 1.1199497981141307e-05, "loss": 0.0721, "num_input_tokens_seen": 189848272, "step": 87970 }, { "epoch": 14.351549755301795, "grad_norm": 0.059946730732917786, "learning_rate": 1.1196530524510911e-05, "loss": 0.1054, "num_input_tokens_seen": 189857776, "step": 87975 }, { "epoch": 14.352365415986949, "grad_norm": 0.05648485943675041, "learning_rate": 1.1193563347625335e-05, "loss": 0.1759, "num_input_tokens_seen": 189866960, "step": 87980 }, { "epoch": 14.353181076672104, "grad_norm": 0.19989614188671112, "learning_rate": 1.119059645054471e-05, "loss": 0.0618, "num_input_tokens_seen": 189876720, "step": 87985 }, { "epoch": 14.35399673735726, "grad_norm": 1.0403039455413818, "learning_rate": 1.118762983332917e-05, "loss": 0.0371, "num_input_tokens_seen": 189886096, "step": 87990 }, { "epoch": 14.354812398042414, "grad_norm": 0.028551217168569565, "learning_rate": 1.118466349603883e-05, "loss": 0.1218, "num_input_tokens_seen": 189898064, "step": 87995 }, { "epoch": 14.35562805872757, "grad_norm": 0.13279499113559723, "learning_rate": 1.1181697438733812e-05, "loss": 0.1519, "num_input_tokens_seen": 189908912, "step": 88000 }, { "epoch": 14.356443719412724, "grad_norm": 1.9243426322937012, "learning_rate": 1.1178731661474226e-05, "loss": 0.1645, "num_input_tokens_seen": 189919216, "step": 88005 }, { "epoch": 14.35725938009788, "grad_norm": 1.7060832977294922, "learning_rate": 1.1175766164320168e-05, "loss": 0.1051, "num_input_tokens_seen": 189929712, "step": 88010 }, { "epoch": 14.358075040783035, "grad_norm": 0.2559331953525543, "learning_rate": 1.1172800947331761e-05, "loss": 0.2461, "num_input_tokens_seen": 189939280, "step": 88015 }, { "epoch": 14.358890701468189, "grad_norm": 0.18137681484222412, "learning_rate": 1.1169836010569068e-05, "loss": 0.166, "num_input_tokens_seen": 189949840, "step": 88020 }, { "epoch": 14.359706362153345, "grad_norm": 0.8844108581542969, "learning_rate": 1.1166871354092209e-05, "loss": 0.0708, "num_input_tokens_seen": 189960912, "step": 88025 }, { "epoch": 14.360522022838499, "grad_norm": 1.8277029991149902, "learning_rate": 1.1163906977961235e-05, "loss": 0.129, "num_input_tokens_seen": 189970736, "step": 88030 }, { "epoch": 14.361337683523654, "grad_norm": 0.04023445025086403, "learning_rate": 1.1160942882236246e-05, "loss": 0.1014, "num_input_tokens_seen": 189981968, "step": 88035 }, { "epoch": 14.362153344208808, "grad_norm": 0.2561589479446411, "learning_rate": 1.1157979066977306e-05, "loss": 0.1249, "num_input_tokens_seen": 189992464, "step": 88040 }, { "epoch": 14.362969004893964, "grad_norm": 0.07334679365158081, "learning_rate": 1.115501553224448e-05, "loss": 0.2107, "num_input_tokens_seen": 190002896, "step": 88045 }, { "epoch": 14.36378466557912, "grad_norm": 1.7084935903549194, "learning_rate": 1.115205227809783e-05, "loss": 0.1567, "num_input_tokens_seen": 190014128, "step": 88050 }, { "epoch": 14.364600326264274, "grad_norm": 0.044416576623916626, "learning_rate": 1.1149089304597405e-05, "loss": 0.0106, "num_input_tokens_seen": 190024464, "step": 88055 }, { "epoch": 14.36541598694943, "grad_norm": 1.1495898962020874, "learning_rate": 1.1146126611803256e-05, "loss": 0.2644, "num_input_tokens_seen": 190034320, "step": 88060 }, { "epoch": 14.366231647634583, "grad_norm": 2.5539824962615967, "learning_rate": 1.114316419977543e-05, "loss": 0.124, "num_input_tokens_seen": 190044464, "step": 88065 }, { "epoch": 14.367047308319739, "grad_norm": 0.06124775856733322, "learning_rate": 1.1140202068573957e-05, "loss": 0.0378, "num_input_tokens_seen": 190056016, "step": 88070 }, { "epoch": 14.367862969004895, "grad_norm": 0.4453234374523163, "learning_rate": 1.1137240218258871e-05, "loss": 0.0267, "num_input_tokens_seen": 190066928, "step": 88075 }, { "epoch": 14.368678629690049, "grad_norm": 1.5367786884307861, "learning_rate": 1.1134278648890198e-05, "loss": 0.2292, "num_input_tokens_seen": 190078320, "step": 88080 }, { "epoch": 14.369494290375204, "grad_norm": 0.3868180215358734, "learning_rate": 1.1131317360527959e-05, "loss": 0.0281, "num_input_tokens_seen": 190089136, "step": 88085 }, { "epoch": 14.370309951060358, "grad_norm": 0.028118766844272614, "learning_rate": 1.112835635323217e-05, "loss": 0.0179, "num_input_tokens_seen": 190100400, "step": 88090 }, { "epoch": 14.371125611745514, "grad_norm": 0.034331656992435455, "learning_rate": 1.1125395627062834e-05, "loss": 0.0355, "num_input_tokens_seen": 190111088, "step": 88095 }, { "epoch": 14.37194127243067, "grad_norm": 0.06463780254125595, "learning_rate": 1.1122435182079957e-05, "loss": 0.0185, "num_input_tokens_seen": 190121008, "step": 88100 }, { "epoch": 14.372756933115824, "grad_norm": 0.06841545552015305, "learning_rate": 1.1119475018343536e-05, "loss": 0.0573, "num_input_tokens_seen": 190130896, "step": 88105 }, { "epoch": 14.37357259380098, "grad_norm": 0.3281592130661011, "learning_rate": 1.1116515135913564e-05, "loss": 0.0192, "num_input_tokens_seen": 190140624, "step": 88110 }, { "epoch": 14.374388254486133, "grad_norm": 0.08925920724868774, "learning_rate": 1.1113555534850015e-05, "loss": 0.063, "num_input_tokens_seen": 190150320, "step": 88115 }, { "epoch": 14.375203915171289, "grad_norm": 1.746050238609314, "learning_rate": 1.1110596215212895e-05, "loss": 0.0991, "num_input_tokens_seen": 190162640, "step": 88120 }, { "epoch": 14.376019575856443, "grad_norm": 1.0980967283248901, "learning_rate": 1.1107637177062147e-05, "loss": 0.098, "num_input_tokens_seen": 190173744, "step": 88125 }, { "epoch": 14.376835236541599, "grad_norm": 0.563107430934906, "learning_rate": 1.1104678420457775e-05, "loss": 0.0781, "num_input_tokens_seen": 190183792, "step": 88130 }, { "epoch": 14.377650897226754, "grad_norm": 2.439629316329956, "learning_rate": 1.1101719945459701e-05, "loss": 0.0646, "num_input_tokens_seen": 190194352, "step": 88135 }, { "epoch": 14.378466557911908, "grad_norm": 1.6032172441482544, "learning_rate": 1.1098761752127923e-05, "loss": 0.1313, "num_input_tokens_seen": 190205104, "step": 88140 }, { "epoch": 14.379282218597064, "grad_norm": 0.5179510116577148, "learning_rate": 1.1095803840522357e-05, "loss": 0.0156, "num_input_tokens_seen": 190216464, "step": 88145 }, { "epoch": 14.380097879282218, "grad_norm": 0.1627456694841385, "learning_rate": 1.1092846210702982e-05, "loss": 0.0713, "num_input_tokens_seen": 190227248, "step": 88150 }, { "epoch": 14.380913539967374, "grad_norm": 0.022920086979866028, "learning_rate": 1.10898888627297e-05, "loss": 0.0479, "num_input_tokens_seen": 190238032, "step": 88155 }, { "epoch": 14.38172920065253, "grad_norm": 1.8858391046524048, "learning_rate": 1.1086931796662487e-05, "loss": 0.3316, "num_input_tokens_seen": 190248272, "step": 88160 }, { "epoch": 14.382544861337683, "grad_norm": 0.6034469604492188, "learning_rate": 1.1083975012561232e-05, "loss": 0.0884, "num_input_tokens_seen": 190259664, "step": 88165 }, { "epoch": 14.383360522022839, "grad_norm": 0.15738597512245178, "learning_rate": 1.1081018510485897e-05, "loss": 0.0145, "num_input_tokens_seen": 190271088, "step": 88170 }, { "epoch": 14.384176182707993, "grad_norm": 0.7040307521820068, "learning_rate": 1.1078062290496357e-05, "loss": 0.2644, "num_input_tokens_seen": 190281200, "step": 88175 }, { "epoch": 14.384991843393149, "grad_norm": 0.044406089931726456, "learning_rate": 1.1075106352652559e-05, "loss": 0.1367, "num_input_tokens_seen": 190291472, "step": 88180 }, { "epoch": 14.385807504078304, "grad_norm": 1.8915510177612305, "learning_rate": 1.1072150697014397e-05, "loss": 0.1607, "num_input_tokens_seen": 190302448, "step": 88185 }, { "epoch": 14.386623164763458, "grad_norm": 0.09883049130439758, "learning_rate": 1.1069195323641765e-05, "loss": 0.2111, "num_input_tokens_seen": 190314288, "step": 88190 }, { "epoch": 14.387438825448614, "grad_norm": 0.8341259956359863, "learning_rate": 1.1066240232594567e-05, "loss": 0.1284, "num_input_tokens_seen": 190325296, "step": 88195 }, { "epoch": 14.388254486133768, "grad_norm": 0.08562219887971878, "learning_rate": 1.1063285423932684e-05, "loss": 0.0161, "num_input_tokens_seen": 190333776, "step": 88200 }, { "epoch": 14.389070146818923, "grad_norm": 0.15616664290428162, "learning_rate": 1.1060330897716e-05, "loss": 0.1538, "num_input_tokens_seen": 190344944, "step": 88205 }, { "epoch": 14.38988580750408, "grad_norm": 0.23006604611873627, "learning_rate": 1.1057376654004397e-05, "loss": 0.0195, "num_input_tokens_seen": 190355792, "step": 88210 }, { "epoch": 14.390701468189233, "grad_norm": 1.010451078414917, "learning_rate": 1.1054422692857744e-05, "loss": 0.1245, "num_input_tokens_seen": 190366640, "step": 88215 }, { "epoch": 14.391517128874389, "grad_norm": 0.30971759557724, "learning_rate": 1.1051469014335908e-05, "loss": 0.0499, "num_input_tokens_seen": 190377584, "step": 88220 }, { "epoch": 14.392332789559543, "grad_norm": 0.2393929660320282, "learning_rate": 1.1048515618498746e-05, "loss": 0.0342, "num_input_tokens_seen": 190388368, "step": 88225 }, { "epoch": 14.393148450244698, "grad_norm": 2.0446696281433105, "learning_rate": 1.1045562505406102e-05, "loss": 0.0737, "num_input_tokens_seen": 190398832, "step": 88230 }, { "epoch": 14.393964110929852, "grad_norm": 2.467860221862793, "learning_rate": 1.1042609675117854e-05, "loss": 0.1116, "num_input_tokens_seen": 190408720, "step": 88235 }, { "epoch": 14.394779771615008, "grad_norm": 1.2624579668045044, "learning_rate": 1.1039657127693811e-05, "loss": 0.0201, "num_input_tokens_seen": 190418832, "step": 88240 }, { "epoch": 14.395595432300164, "grad_norm": 0.05510345473885536, "learning_rate": 1.1036704863193844e-05, "loss": 0.1005, "num_input_tokens_seen": 190429776, "step": 88245 }, { "epoch": 14.396411092985318, "grad_norm": 0.21933341026306152, "learning_rate": 1.103375288167775e-05, "loss": 0.0698, "num_input_tokens_seen": 190439824, "step": 88250 }, { "epoch": 14.397226753670473, "grad_norm": 0.09859275072813034, "learning_rate": 1.1030801183205389e-05, "loss": 0.017, "num_input_tokens_seen": 190448880, "step": 88255 }, { "epoch": 14.398042414355627, "grad_norm": 0.263450562953949, "learning_rate": 1.1027849767836546e-05, "loss": 0.1627, "num_input_tokens_seen": 190460976, "step": 88260 }, { "epoch": 14.398858075040783, "grad_norm": 0.06678730994462967, "learning_rate": 1.1024898635631067e-05, "loss": 0.1008, "num_input_tokens_seen": 190472752, "step": 88265 }, { "epoch": 14.399673735725939, "grad_norm": 0.8710905313491821, "learning_rate": 1.1021947786648731e-05, "loss": 0.1357, "num_input_tokens_seen": 190482480, "step": 88270 }, { "epoch": 14.400489396411093, "grad_norm": 2.035626173019409, "learning_rate": 1.101899722094937e-05, "loss": 0.0756, "num_input_tokens_seen": 190493680, "step": 88275 }, { "epoch": 14.401305057096248, "grad_norm": 0.8756612539291382, "learning_rate": 1.1016046938592754e-05, "loss": 0.0776, "num_input_tokens_seen": 190502768, "step": 88280 }, { "epoch": 14.402120717781402, "grad_norm": 1.2189106941223145, "learning_rate": 1.10130969396387e-05, "loss": 0.0807, "num_input_tokens_seen": 190514000, "step": 88285 }, { "epoch": 14.402936378466558, "grad_norm": 1.3619263172149658, "learning_rate": 1.1010147224146963e-05, "loss": 0.2478, "num_input_tokens_seen": 190524688, "step": 88290 }, { "epoch": 14.403752039151712, "grad_norm": 1.292646884918213, "learning_rate": 1.100719779217736e-05, "loss": 0.0938, "num_input_tokens_seen": 190534960, "step": 88295 }, { "epoch": 14.404567699836868, "grad_norm": 0.6480607390403748, "learning_rate": 1.1004248643789624e-05, "loss": 0.0253, "num_input_tokens_seen": 190546352, "step": 88300 }, { "epoch": 14.405383360522023, "grad_norm": 0.11759582161903381, "learning_rate": 1.1001299779043564e-05, "loss": 0.087, "num_input_tokens_seen": 190557456, "step": 88305 }, { "epoch": 14.406199021207177, "grad_norm": 0.051991239190101624, "learning_rate": 1.0998351197998904e-05, "loss": 0.0083, "num_input_tokens_seen": 190567952, "step": 88310 }, { "epoch": 14.407014681892333, "grad_norm": 0.09759538620710373, "learning_rate": 1.0995402900715438e-05, "loss": 0.135, "num_input_tokens_seen": 190577776, "step": 88315 }, { "epoch": 14.407830342577487, "grad_norm": 0.03948996588587761, "learning_rate": 1.0992454887252878e-05, "loss": 0.1368, "num_input_tokens_seen": 190586800, "step": 88320 }, { "epoch": 14.408646003262643, "grad_norm": 1.1758373975753784, "learning_rate": 1.0989507157671e-05, "loss": 0.0371, "num_input_tokens_seen": 190597840, "step": 88325 }, { "epoch": 14.409461663947798, "grad_norm": 0.18045610189437866, "learning_rate": 1.0986559712029535e-05, "loss": 0.0498, "num_input_tokens_seen": 190607696, "step": 88330 }, { "epoch": 14.410277324632952, "grad_norm": 2.6537132263183594, "learning_rate": 1.0983612550388211e-05, "loss": 0.0966, "num_input_tokens_seen": 190618608, "step": 88335 }, { "epoch": 14.411092985318108, "grad_norm": 0.08854503929615021, "learning_rate": 1.0980665672806761e-05, "loss": 0.0231, "num_input_tokens_seen": 190628048, "step": 88340 }, { "epoch": 14.411908646003262, "grad_norm": 0.6021605730056763, "learning_rate": 1.0977719079344909e-05, "loss": 0.0376, "num_input_tokens_seen": 190640400, "step": 88345 }, { "epoch": 14.412724306688418, "grad_norm": 1.4415912628173828, "learning_rate": 1.0974772770062366e-05, "loss": 0.0364, "num_input_tokens_seen": 190649648, "step": 88350 }, { "epoch": 14.413539967373573, "grad_norm": 0.14262448251247406, "learning_rate": 1.0971826745018845e-05, "loss": 0.106, "num_input_tokens_seen": 190660816, "step": 88355 }, { "epoch": 14.414355628058727, "grad_norm": 2.2158477306365967, "learning_rate": 1.0968881004274051e-05, "loss": 0.1077, "num_input_tokens_seen": 190669648, "step": 88360 }, { "epoch": 14.415171288743883, "grad_norm": 0.3873877227306366, "learning_rate": 1.0965935547887682e-05, "loss": 0.1858, "num_input_tokens_seen": 190680112, "step": 88365 }, { "epoch": 14.415986949429037, "grad_norm": 0.08175353705883026, "learning_rate": 1.0962990375919435e-05, "loss": 0.0761, "num_input_tokens_seen": 190692112, "step": 88370 }, { "epoch": 14.416802610114193, "grad_norm": 1.1337119340896606, "learning_rate": 1.0960045488428986e-05, "loss": 0.0758, "num_input_tokens_seen": 190702160, "step": 88375 }, { "epoch": 14.417618270799348, "grad_norm": 0.08980966359376907, "learning_rate": 1.0957100885476043e-05, "loss": 0.0319, "num_input_tokens_seen": 190711216, "step": 88380 }, { "epoch": 14.418433931484502, "grad_norm": 0.05298595130443573, "learning_rate": 1.0954156567120246e-05, "loss": 0.185, "num_input_tokens_seen": 190722544, "step": 88385 }, { "epoch": 14.419249592169658, "grad_norm": 0.2867009937763214, "learning_rate": 1.0951212533421305e-05, "loss": 0.1225, "num_input_tokens_seen": 190732496, "step": 88390 }, { "epoch": 14.420065252854812, "grad_norm": 1.0937665700912476, "learning_rate": 1.0948268784438845e-05, "loss": 0.308, "num_input_tokens_seen": 190743280, "step": 88395 }, { "epoch": 14.420880913539968, "grad_norm": 2.0037412643432617, "learning_rate": 1.0945325320232563e-05, "loss": 0.0831, "num_input_tokens_seen": 190753520, "step": 88400 }, { "epoch": 14.421696574225122, "grad_norm": 0.03274097666144371, "learning_rate": 1.0942382140862073e-05, "loss": 0.0791, "num_input_tokens_seen": 190764080, "step": 88405 }, { "epoch": 14.422512234910277, "grad_norm": 2.2074954509735107, "learning_rate": 1.0939439246387062e-05, "loss": 0.2206, "num_input_tokens_seen": 190775120, "step": 88410 }, { "epoch": 14.423327895595433, "grad_norm": 0.08273173123598099, "learning_rate": 1.0936496636867133e-05, "loss": 0.1537, "num_input_tokens_seen": 190785488, "step": 88415 }, { "epoch": 14.424143556280587, "grad_norm": 1.1296741962432861, "learning_rate": 1.093355431236196e-05, "loss": 0.0804, "num_input_tokens_seen": 190796400, "step": 88420 }, { "epoch": 14.424959216965743, "grad_norm": 0.6973153948783875, "learning_rate": 1.0930612272931132e-05, "loss": 0.0506, "num_input_tokens_seen": 190807280, "step": 88425 }, { "epoch": 14.425774877650896, "grad_norm": 1.7619338035583496, "learning_rate": 1.0927670518634315e-05, "loss": 0.0997, "num_input_tokens_seen": 190817296, "step": 88430 }, { "epoch": 14.426590538336052, "grad_norm": 0.22020867466926575, "learning_rate": 1.0924729049531088e-05, "loss": 0.077, "num_input_tokens_seen": 190826864, "step": 88435 }, { "epoch": 14.427406199021208, "grad_norm": 0.18362674117088318, "learning_rate": 1.0921787865681093e-05, "loss": 0.0272, "num_input_tokens_seen": 190837264, "step": 88440 }, { "epoch": 14.428221859706362, "grad_norm": 0.2536686360836029, "learning_rate": 1.0918846967143925e-05, "loss": 0.0743, "num_input_tokens_seen": 190847920, "step": 88445 }, { "epoch": 14.429037520391518, "grad_norm": 0.1536472886800766, "learning_rate": 1.0915906353979188e-05, "loss": 0.1359, "num_input_tokens_seen": 190858544, "step": 88450 }, { "epoch": 14.429853181076671, "grad_norm": 0.057218726724386215, "learning_rate": 1.0912966026246474e-05, "loss": 0.1138, "num_input_tokens_seen": 190869168, "step": 88455 }, { "epoch": 14.430668841761827, "grad_norm": 0.2226141393184662, "learning_rate": 1.0910025984005375e-05, "loss": 0.0659, "num_input_tokens_seen": 190880976, "step": 88460 }, { "epoch": 14.431484502446983, "grad_norm": 1.1996670961380005, "learning_rate": 1.0907086227315475e-05, "loss": 0.0899, "num_input_tokens_seen": 190892240, "step": 88465 }, { "epoch": 14.432300163132137, "grad_norm": 0.15151506662368774, "learning_rate": 1.090414675623635e-05, "loss": 0.1499, "num_input_tokens_seen": 190904656, "step": 88470 }, { "epoch": 14.433115823817293, "grad_norm": 0.2736560106277466, "learning_rate": 1.090120757082757e-05, "loss": 0.0159, "num_input_tokens_seen": 190915344, "step": 88475 }, { "epoch": 14.433931484502446, "grad_norm": 0.08137614279985428, "learning_rate": 1.0898268671148707e-05, "loss": 0.0852, "num_input_tokens_seen": 190925936, "step": 88480 }, { "epoch": 14.434747145187602, "grad_norm": 0.7226479649543762, "learning_rate": 1.0895330057259318e-05, "loss": 0.0632, "num_input_tokens_seen": 190936432, "step": 88485 }, { "epoch": 14.435562805872756, "grad_norm": 0.11744891107082367, "learning_rate": 1.089239172921896e-05, "loss": 0.0451, "num_input_tokens_seen": 190946960, "step": 88490 }, { "epoch": 14.436378466557912, "grad_norm": 0.0775940790772438, "learning_rate": 1.0889453687087178e-05, "loss": 0.1354, "num_input_tokens_seen": 190956656, "step": 88495 }, { "epoch": 14.437194127243067, "grad_norm": 0.05195237323641777, "learning_rate": 1.0886515930923518e-05, "loss": 0.0296, "num_input_tokens_seen": 190968592, "step": 88500 }, { "epoch": 14.438009787928221, "grad_norm": 0.03103417530655861, "learning_rate": 1.0883578460787516e-05, "loss": 0.129, "num_input_tokens_seen": 190979856, "step": 88505 }, { "epoch": 14.438825448613377, "grad_norm": 0.7496840953826904, "learning_rate": 1.0880641276738707e-05, "loss": 0.1181, "num_input_tokens_seen": 190990064, "step": 88510 }, { "epoch": 14.439641109298531, "grad_norm": 0.38651764392852783, "learning_rate": 1.0877704378836614e-05, "loss": 0.0415, "num_input_tokens_seen": 191000432, "step": 88515 }, { "epoch": 14.440456769983687, "grad_norm": 1.2694823741912842, "learning_rate": 1.0874767767140745e-05, "loss": 0.0304, "num_input_tokens_seen": 191011920, "step": 88520 }, { "epoch": 14.441272430668842, "grad_norm": 1.072320818901062, "learning_rate": 1.0871831441710648e-05, "loss": 0.123, "num_input_tokens_seen": 191022096, "step": 88525 }, { "epoch": 14.442088091353996, "grad_norm": 0.14448091387748718, "learning_rate": 1.0868895402605788e-05, "loss": 0.0306, "num_input_tokens_seen": 191032784, "step": 88530 }, { "epoch": 14.442903752039152, "grad_norm": 2.0163843631744385, "learning_rate": 1.0865959649885712e-05, "loss": 0.0951, "num_input_tokens_seen": 191042640, "step": 88535 }, { "epoch": 14.443719412724306, "grad_norm": 1.4978277683258057, "learning_rate": 1.0863024183609873e-05, "loss": 0.0811, "num_input_tokens_seen": 191054512, "step": 88540 }, { "epoch": 14.444535073409462, "grad_norm": 0.10471928864717484, "learning_rate": 1.0860089003837792e-05, "loss": 0.1427, "num_input_tokens_seen": 191064624, "step": 88545 }, { "epoch": 14.445350734094617, "grad_norm": 0.0579492561519146, "learning_rate": 1.0857154110628945e-05, "loss": 0.087, "num_input_tokens_seen": 191076720, "step": 88550 }, { "epoch": 14.446166394779771, "grad_norm": 0.19907209277153015, "learning_rate": 1.0854219504042812e-05, "loss": 0.1293, "num_input_tokens_seen": 191086960, "step": 88555 }, { "epoch": 14.446982055464927, "grad_norm": 0.775287926197052, "learning_rate": 1.0851285184138865e-05, "loss": 0.1198, "num_input_tokens_seen": 191098384, "step": 88560 }, { "epoch": 14.447797716150081, "grad_norm": 0.12956561148166656, "learning_rate": 1.0848351150976574e-05, "loss": 0.061, "num_input_tokens_seen": 191109776, "step": 88565 }, { "epoch": 14.448613376835237, "grad_norm": 0.47348424792289734, "learning_rate": 1.0845417404615399e-05, "loss": 0.1553, "num_input_tokens_seen": 191119888, "step": 88570 }, { "epoch": 14.449429037520392, "grad_norm": 0.9624220132827759, "learning_rate": 1.0842483945114799e-05, "loss": 0.026, "num_input_tokens_seen": 191130032, "step": 88575 }, { "epoch": 14.450244698205546, "grad_norm": 0.572845995426178, "learning_rate": 1.0839550772534218e-05, "loss": 0.1389, "num_input_tokens_seen": 191140176, "step": 88580 }, { "epoch": 14.451060358890702, "grad_norm": 2.4472217559814453, "learning_rate": 1.0836617886933107e-05, "loss": 0.0602, "num_input_tokens_seen": 191151024, "step": 88585 }, { "epoch": 14.451876019575856, "grad_norm": 1.2643567323684692, "learning_rate": 1.0833685288370901e-05, "loss": 0.0352, "num_input_tokens_seen": 191162416, "step": 88590 }, { "epoch": 14.452691680261012, "grad_norm": 2.2206997871398926, "learning_rate": 1.0830752976907032e-05, "loss": 0.1644, "num_input_tokens_seen": 191173648, "step": 88595 }, { "epoch": 14.453507340946166, "grad_norm": 0.7129035592079163, "learning_rate": 1.0827820952600931e-05, "loss": 0.182, "num_input_tokens_seen": 191184560, "step": 88600 }, { "epoch": 14.454323001631321, "grad_norm": 0.4032510817050934, "learning_rate": 1.0824889215512018e-05, "loss": 0.0412, "num_input_tokens_seen": 191195696, "step": 88605 }, { "epoch": 14.455138662316477, "grad_norm": 0.07528819888830185, "learning_rate": 1.0821957765699705e-05, "loss": 0.3402, "num_input_tokens_seen": 191207536, "step": 88610 }, { "epoch": 14.455954323001631, "grad_norm": 0.06357011944055557, "learning_rate": 1.0819026603223406e-05, "loss": 0.2077, "num_input_tokens_seen": 191218992, "step": 88615 }, { "epoch": 14.456769983686787, "grad_norm": 0.023533852770924568, "learning_rate": 1.0816095728142523e-05, "loss": 0.0606, "num_input_tokens_seen": 191230448, "step": 88620 }, { "epoch": 14.45758564437194, "grad_norm": 1.3321828842163086, "learning_rate": 1.0813165140516451e-05, "loss": 0.154, "num_input_tokens_seen": 191241040, "step": 88625 }, { "epoch": 14.458401305057096, "grad_norm": 0.18072570860385895, "learning_rate": 1.0810234840404587e-05, "loss": 0.0742, "num_input_tokens_seen": 191251792, "step": 88630 }, { "epoch": 14.459216965742252, "grad_norm": 1.1983230113983154, "learning_rate": 1.0807304827866316e-05, "loss": 0.2614, "num_input_tokens_seen": 191262480, "step": 88635 }, { "epoch": 14.460032626427406, "grad_norm": 0.37506407499313354, "learning_rate": 1.0804375102961015e-05, "loss": 0.0789, "num_input_tokens_seen": 191274320, "step": 88640 }, { "epoch": 14.460848287112562, "grad_norm": 0.13788774609565735, "learning_rate": 1.0801445665748062e-05, "loss": 0.0202, "num_input_tokens_seen": 191285616, "step": 88645 }, { "epoch": 14.461663947797716, "grad_norm": 1.6126937866210938, "learning_rate": 1.0798516516286816e-05, "loss": 0.0876, "num_input_tokens_seen": 191296208, "step": 88650 }, { "epoch": 14.462479608482871, "grad_norm": 0.09185248613357544, "learning_rate": 1.0795587654636657e-05, "loss": 0.0766, "num_input_tokens_seen": 191306288, "step": 88655 }, { "epoch": 14.463295269168025, "grad_norm": 0.24153150618076324, "learning_rate": 1.0792659080856934e-05, "loss": 0.0206, "num_input_tokens_seen": 191316560, "step": 88660 }, { "epoch": 14.464110929853181, "grad_norm": 0.23867231607437134, "learning_rate": 1.0789730795006998e-05, "loss": 0.0738, "num_input_tokens_seen": 191326096, "step": 88665 }, { "epoch": 14.464926590538337, "grad_norm": 1.3176124095916748, "learning_rate": 1.0786802797146195e-05, "loss": 0.1524, "num_input_tokens_seen": 191335824, "step": 88670 }, { "epoch": 14.46574225122349, "grad_norm": 0.43719372153282166, "learning_rate": 1.078387508733386e-05, "loss": 0.0271, "num_input_tokens_seen": 191346192, "step": 88675 }, { "epoch": 14.466557911908646, "grad_norm": 0.6197569370269775, "learning_rate": 1.0780947665629338e-05, "loss": 0.0512, "num_input_tokens_seen": 191355248, "step": 88680 }, { "epoch": 14.4673735725938, "grad_norm": 2.4576728343963623, "learning_rate": 1.0778020532091946e-05, "loss": 0.1124, "num_input_tokens_seen": 191365872, "step": 88685 }, { "epoch": 14.468189233278956, "grad_norm": 0.05667810142040253, "learning_rate": 1.0775093686781007e-05, "loss": 0.0325, "num_input_tokens_seen": 191377296, "step": 88690 }, { "epoch": 14.469004893964112, "grad_norm": 1.3913302421569824, "learning_rate": 1.0772167129755845e-05, "loss": 0.3554, "num_input_tokens_seen": 191388432, "step": 88695 }, { "epoch": 14.469820554649266, "grad_norm": 0.0727018192410469, "learning_rate": 1.076924086107576e-05, "loss": 0.0807, "num_input_tokens_seen": 191397552, "step": 88700 }, { "epoch": 14.470636215334421, "grad_norm": 0.24026881158351898, "learning_rate": 1.0766314880800065e-05, "loss": 0.0604, "num_input_tokens_seen": 191408688, "step": 88705 }, { "epoch": 14.471451876019575, "grad_norm": 0.2270943522453308, "learning_rate": 1.0763389188988057e-05, "loss": 0.0511, "num_input_tokens_seen": 191420560, "step": 88710 }, { "epoch": 14.47226753670473, "grad_norm": 0.46571969985961914, "learning_rate": 1.0760463785699026e-05, "loss": 0.0715, "num_input_tokens_seen": 191431120, "step": 88715 }, { "epoch": 14.473083197389887, "grad_norm": 0.027888596057891846, "learning_rate": 1.0757538670992262e-05, "loss": 0.0977, "num_input_tokens_seen": 191441008, "step": 88720 }, { "epoch": 14.47389885807504, "grad_norm": 0.12062864005565643, "learning_rate": 1.0754613844927041e-05, "loss": 0.1118, "num_input_tokens_seen": 191451312, "step": 88725 }, { "epoch": 14.474714518760196, "grad_norm": 0.04622755944728851, "learning_rate": 1.0751689307562646e-05, "loss": 0.0269, "num_input_tokens_seen": 191461232, "step": 88730 }, { "epoch": 14.47553017944535, "grad_norm": 1.5928000211715698, "learning_rate": 1.074876505895834e-05, "loss": 0.202, "num_input_tokens_seen": 191472176, "step": 88735 }, { "epoch": 14.476345840130506, "grad_norm": 1.5895527601242065, "learning_rate": 1.0745841099173389e-05, "loss": 0.1129, "num_input_tokens_seen": 191482832, "step": 88740 }, { "epoch": 14.477161500815662, "grad_norm": 0.11790475994348526, "learning_rate": 1.0742917428267053e-05, "loss": 0.0892, "num_input_tokens_seen": 191494256, "step": 88745 }, { "epoch": 14.477977161500815, "grad_norm": 0.0656248927116394, "learning_rate": 1.073999404629858e-05, "loss": 0.1335, "num_input_tokens_seen": 191505840, "step": 88750 }, { "epoch": 14.478792822185971, "grad_norm": 0.1136728972196579, "learning_rate": 1.0737070953327207e-05, "loss": 0.0469, "num_input_tokens_seen": 191517296, "step": 88755 }, { "epoch": 14.479608482871125, "grad_norm": 0.09285721182823181, "learning_rate": 1.0734148149412206e-05, "loss": 0.1095, "num_input_tokens_seen": 191526608, "step": 88760 }, { "epoch": 14.48042414355628, "grad_norm": 1.1203292608261108, "learning_rate": 1.0731225634612774e-05, "loss": 0.1401, "num_input_tokens_seen": 191537552, "step": 88765 }, { "epoch": 14.481239804241435, "grad_norm": 0.06813137978315353, "learning_rate": 1.072830340898817e-05, "loss": 0.1299, "num_input_tokens_seen": 191548016, "step": 88770 }, { "epoch": 14.48205546492659, "grad_norm": 0.09174424409866333, "learning_rate": 1.0725381472597585e-05, "loss": 0.179, "num_input_tokens_seen": 191558864, "step": 88775 }, { "epoch": 14.482871125611746, "grad_norm": 0.17491364479064941, "learning_rate": 1.0722459825500273e-05, "loss": 0.0641, "num_input_tokens_seen": 191569584, "step": 88780 }, { "epoch": 14.4836867862969, "grad_norm": 0.04350396245718002, "learning_rate": 1.0719538467755405e-05, "loss": 0.1173, "num_input_tokens_seen": 191579888, "step": 88785 }, { "epoch": 14.484502446982056, "grad_norm": 0.0880415067076683, "learning_rate": 1.0716617399422225e-05, "loss": 0.0099, "num_input_tokens_seen": 191589776, "step": 88790 }, { "epoch": 14.48531810766721, "grad_norm": 0.40317782759666443, "learning_rate": 1.0713696620559893e-05, "loss": 0.0237, "num_input_tokens_seen": 191600496, "step": 88795 }, { "epoch": 14.486133768352365, "grad_norm": 2.399949312210083, "learning_rate": 1.0710776131227635e-05, "loss": 0.118, "num_input_tokens_seen": 191610960, "step": 88800 }, { "epoch": 14.486949429037521, "grad_norm": 1.3617748022079468, "learning_rate": 1.0707855931484622e-05, "loss": 0.0579, "num_input_tokens_seen": 191621296, "step": 88805 }, { "epoch": 14.487765089722675, "grad_norm": 0.13136202096939087, "learning_rate": 1.0704936021390042e-05, "loss": 0.0564, "num_input_tokens_seen": 191632240, "step": 88810 }, { "epoch": 14.48858075040783, "grad_norm": 0.08579355478286743, "learning_rate": 1.0702016401003067e-05, "loss": 0.0985, "num_input_tokens_seen": 191643696, "step": 88815 }, { "epoch": 14.489396411092985, "grad_norm": 0.017676372081041336, "learning_rate": 1.0699097070382869e-05, "loss": 0.1094, "num_input_tokens_seen": 191655632, "step": 88820 }, { "epoch": 14.49021207177814, "grad_norm": 0.1840900480747223, "learning_rate": 1.069617802958861e-05, "loss": 0.1298, "num_input_tokens_seen": 191666576, "step": 88825 }, { "epoch": 14.491027732463296, "grad_norm": 0.9987712502479553, "learning_rate": 1.069325927867945e-05, "loss": 0.0616, "num_input_tokens_seen": 191676240, "step": 88830 }, { "epoch": 14.49184339314845, "grad_norm": 1.9127397537231445, "learning_rate": 1.069034081771454e-05, "loss": 0.1456, "num_input_tokens_seen": 191687088, "step": 88835 }, { "epoch": 14.492659053833606, "grad_norm": 0.03466719761490822, "learning_rate": 1.0687422646753024e-05, "loss": 0.0683, "num_input_tokens_seen": 191697424, "step": 88840 }, { "epoch": 14.49347471451876, "grad_norm": 1.8059638738632202, "learning_rate": 1.0684504765854048e-05, "loss": 0.0718, "num_input_tokens_seen": 191708624, "step": 88845 }, { "epoch": 14.494290375203915, "grad_norm": 0.6734533905982971, "learning_rate": 1.068158717507674e-05, "loss": 0.0782, "num_input_tokens_seen": 191718992, "step": 88850 }, { "epoch": 14.49510603588907, "grad_norm": 0.3531411588191986, "learning_rate": 1.0678669874480235e-05, "loss": 0.0238, "num_input_tokens_seen": 191730128, "step": 88855 }, { "epoch": 14.495921696574225, "grad_norm": 0.20588833093643188, "learning_rate": 1.0675752864123642e-05, "loss": 0.0136, "num_input_tokens_seen": 191741488, "step": 88860 }, { "epoch": 14.49673735725938, "grad_norm": 0.3831038475036621, "learning_rate": 1.0672836144066109e-05, "loss": 0.1048, "num_input_tokens_seen": 191753456, "step": 88865 }, { "epoch": 14.497553017944535, "grad_norm": 1.0574649572372437, "learning_rate": 1.0669919714366708e-05, "loss": 0.2029, "num_input_tokens_seen": 191764592, "step": 88870 }, { "epoch": 14.49836867862969, "grad_norm": 0.31238049268722534, "learning_rate": 1.0667003575084581e-05, "loss": 0.1304, "num_input_tokens_seen": 191774224, "step": 88875 }, { "epoch": 14.499184339314844, "grad_norm": 2.0358853340148926, "learning_rate": 1.066408772627879e-05, "loss": 0.1824, "num_input_tokens_seen": 191784912, "step": 88880 }, { "epoch": 14.5, "grad_norm": 2.085911989212036, "learning_rate": 1.0661172168008466e-05, "loss": 0.1661, "num_input_tokens_seen": 191795568, "step": 88885 }, { "epoch": 14.500815660685156, "grad_norm": 0.6714189648628235, "learning_rate": 1.065825690033266e-05, "loss": 0.0216, "num_input_tokens_seen": 191806768, "step": 88890 }, { "epoch": 14.50163132137031, "grad_norm": 0.273412823677063, "learning_rate": 1.0655341923310492e-05, "loss": 0.0629, "num_input_tokens_seen": 191818352, "step": 88895 }, { "epoch": 14.502446982055465, "grad_norm": 0.5214383602142334, "learning_rate": 1.0652427237000998e-05, "loss": 0.0558, "num_input_tokens_seen": 191828048, "step": 88900 }, { "epoch": 14.50326264274062, "grad_norm": 1.7478500604629517, "learning_rate": 1.0649512841463284e-05, "loss": 0.0679, "num_input_tokens_seen": 191838512, "step": 88905 }, { "epoch": 14.504078303425775, "grad_norm": 1.636260747909546, "learning_rate": 1.064659873675638e-05, "loss": 0.2907, "num_input_tokens_seen": 191847856, "step": 88910 }, { "epoch": 14.50489396411093, "grad_norm": 0.23860034346580505, "learning_rate": 1.0643684922939379e-05, "loss": 0.015, "num_input_tokens_seen": 191858960, "step": 88915 }, { "epoch": 14.505709624796085, "grad_norm": 2.6204833984375, "learning_rate": 1.0640771400071298e-05, "loss": 0.0687, "num_input_tokens_seen": 191870064, "step": 88920 }, { "epoch": 14.50652528548124, "grad_norm": 1.200788974761963, "learning_rate": 1.0637858168211218e-05, "loss": 0.0969, "num_input_tokens_seen": 191882160, "step": 88925 }, { "epoch": 14.507340946166394, "grad_norm": 0.2684347927570343, "learning_rate": 1.0634945227418145e-05, "loss": 0.2035, "num_input_tokens_seen": 191893072, "step": 88930 }, { "epoch": 14.50815660685155, "grad_norm": 1.5654654502868652, "learning_rate": 1.0632032577751146e-05, "loss": 0.1957, "num_input_tokens_seen": 191905072, "step": 88935 }, { "epoch": 14.508972267536706, "grad_norm": 2.3757450580596924, "learning_rate": 1.0629120219269218e-05, "loss": 0.1821, "num_input_tokens_seen": 191916336, "step": 88940 }, { "epoch": 14.50978792822186, "grad_norm": 1.783246397972107, "learning_rate": 1.0626208152031406e-05, "loss": 0.3268, "num_input_tokens_seen": 191927376, "step": 88945 }, { "epoch": 14.510603588907015, "grad_norm": 0.1100776344537735, "learning_rate": 1.0623296376096723e-05, "loss": 0.0322, "num_input_tokens_seen": 191938832, "step": 88950 }, { "epoch": 14.51141924959217, "grad_norm": 1.5664167404174805, "learning_rate": 1.0620384891524177e-05, "loss": 0.1297, "num_input_tokens_seen": 191950896, "step": 88955 }, { "epoch": 14.512234910277325, "grad_norm": 0.02634492702782154, "learning_rate": 1.061747369837277e-05, "loss": 0.1738, "num_input_tokens_seen": 191961680, "step": 88960 }, { "epoch": 14.513050570962479, "grad_norm": 0.37315258383750916, "learning_rate": 1.0614562796701507e-05, "loss": 0.0812, "num_input_tokens_seen": 191972464, "step": 88965 }, { "epoch": 14.513866231647635, "grad_norm": 0.6315574645996094, "learning_rate": 1.0611652186569377e-05, "loss": 0.1638, "num_input_tokens_seen": 191983376, "step": 88970 }, { "epoch": 14.51468189233279, "grad_norm": 0.8678814172744751, "learning_rate": 1.0608741868035369e-05, "loss": 0.1593, "num_input_tokens_seen": 191993936, "step": 88975 }, { "epoch": 14.515497553017944, "grad_norm": 1.529679775238037, "learning_rate": 1.0605831841158461e-05, "loss": 0.2512, "num_input_tokens_seen": 192005584, "step": 88980 }, { "epoch": 14.5163132137031, "grad_norm": 0.05015569180250168, "learning_rate": 1.0602922105997633e-05, "loss": 0.1234, "num_input_tokens_seen": 192016624, "step": 88985 }, { "epoch": 14.517128874388254, "grad_norm": 0.08300250768661499, "learning_rate": 1.0600012662611855e-05, "loss": 0.1732, "num_input_tokens_seen": 192027184, "step": 88990 }, { "epoch": 14.51794453507341, "grad_norm": 0.09480538219213486, "learning_rate": 1.0597103511060077e-05, "loss": 0.0358, "num_input_tokens_seen": 192037328, "step": 88995 }, { "epoch": 14.518760195758565, "grad_norm": 1.8212177753448486, "learning_rate": 1.0594194651401284e-05, "loss": 0.2521, "num_input_tokens_seen": 192048272, "step": 89000 }, { "epoch": 14.51957585644372, "grad_norm": 0.08602575957775116, "learning_rate": 1.0591286083694397e-05, "loss": 0.0736, "num_input_tokens_seen": 192058448, "step": 89005 }, { "epoch": 14.520391517128875, "grad_norm": 0.1530790627002716, "learning_rate": 1.0588377807998393e-05, "loss": 0.0746, "num_input_tokens_seen": 192069232, "step": 89010 }, { "epoch": 14.521207177814029, "grad_norm": 2.256591796875, "learning_rate": 1.0585469824372174e-05, "loss": 0.0775, "num_input_tokens_seen": 192080240, "step": 89015 }, { "epoch": 14.522022838499185, "grad_norm": 0.2811744213104248, "learning_rate": 1.0582562132874716e-05, "loss": 0.0593, "num_input_tokens_seen": 192091184, "step": 89020 }, { "epoch": 14.522838499184338, "grad_norm": 0.03718385100364685, "learning_rate": 1.0579654733564909e-05, "loss": 0.0519, "num_input_tokens_seen": 192101872, "step": 89025 }, { "epoch": 14.523654159869494, "grad_norm": 0.17409753799438477, "learning_rate": 1.0576747626501709e-05, "loss": 0.0244, "num_input_tokens_seen": 192113232, "step": 89030 }, { "epoch": 14.52446982055465, "grad_norm": 0.16301663219928741, "learning_rate": 1.0573840811744e-05, "loss": 0.1387, "num_input_tokens_seen": 192122800, "step": 89035 }, { "epoch": 14.525285481239804, "grad_norm": 2.241257429122925, "learning_rate": 1.057093428935072e-05, "loss": 0.2057, "num_input_tokens_seen": 192132944, "step": 89040 }, { "epoch": 14.52610114192496, "grad_norm": 0.38927581906318665, "learning_rate": 1.0568028059380746e-05, "loss": 0.0771, "num_input_tokens_seen": 192144112, "step": 89045 }, { "epoch": 14.526916802610113, "grad_norm": 0.07744275033473969, "learning_rate": 1.0565122121893012e-05, "loss": 0.0622, "num_input_tokens_seen": 192154960, "step": 89050 }, { "epoch": 14.52773246329527, "grad_norm": 2.027508020401001, "learning_rate": 1.056221647694637e-05, "loss": 0.3557, "num_input_tokens_seen": 192165712, "step": 89055 }, { "epoch": 14.528548123980425, "grad_norm": 0.02762249857187271, "learning_rate": 1.0559311124599747e-05, "loss": 0.0626, "num_input_tokens_seen": 192176944, "step": 89060 }, { "epoch": 14.529363784665579, "grad_norm": 1.9913508892059326, "learning_rate": 1.0556406064911983e-05, "loss": 0.1588, "num_input_tokens_seen": 192188016, "step": 89065 }, { "epoch": 14.530179445350734, "grad_norm": 0.019977441057562828, "learning_rate": 1.055350129794199e-05, "loss": 0.0982, "num_input_tokens_seen": 192199760, "step": 89070 }, { "epoch": 14.530995106035888, "grad_norm": 1.0105907917022705, "learning_rate": 1.0550596823748604e-05, "loss": 0.2036, "num_input_tokens_seen": 192211312, "step": 89075 }, { "epoch": 14.531810766721044, "grad_norm": 0.181655153632164, "learning_rate": 1.0547692642390714e-05, "loss": 0.1396, "num_input_tokens_seen": 192221104, "step": 89080 }, { "epoch": 14.5326264274062, "grad_norm": 0.10790586471557617, "learning_rate": 1.0544788753927162e-05, "loss": 0.0203, "num_input_tokens_seen": 192230768, "step": 89085 }, { "epoch": 14.533442088091354, "grad_norm": 0.9612332582473755, "learning_rate": 1.0541885158416808e-05, "loss": 0.1374, "num_input_tokens_seen": 192241808, "step": 89090 }, { "epoch": 14.53425774877651, "grad_norm": 0.09007513523101807, "learning_rate": 1.0538981855918492e-05, "loss": 0.1069, "num_input_tokens_seen": 192252464, "step": 89095 }, { "epoch": 14.535073409461663, "grad_norm": 0.24874728918075562, "learning_rate": 1.0536078846491052e-05, "loss": 0.0183, "num_input_tokens_seen": 192262544, "step": 89100 }, { "epoch": 14.535889070146819, "grad_norm": 2.022826910018921, "learning_rate": 1.0533176130193323e-05, "loss": 0.1779, "num_input_tokens_seen": 192273840, "step": 89105 }, { "epoch": 14.536704730831975, "grad_norm": 0.6832176446914673, "learning_rate": 1.0530273707084135e-05, "loss": 0.0188, "num_input_tokens_seen": 192284880, "step": 89110 }, { "epoch": 14.537520391517129, "grad_norm": 0.2961387038230896, "learning_rate": 1.0527371577222304e-05, "loss": 0.1323, "num_input_tokens_seen": 192295920, "step": 89115 }, { "epoch": 14.538336052202284, "grad_norm": 0.07541229575872421, "learning_rate": 1.0524469740666649e-05, "loss": 0.1359, "num_input_tokens_seen": 192307216, "step": 89120 }, { "epoch": 14.539151712887438, "grad_norm": 1.2156165838241577, "learning_rate": 1.0521568197475979e-05, "loss": 0.0654, "num_input_tokens_seen": 192318448, "step": 89125 }, { "epoch": 14.539967373572594, "grad_norm": 0.031569626182317734, "learning_rate": 1.0518666947709093e-05, "loss": 0.0294, "num_input_tokens_seen": 192329424, "step": 89130 }, { "epoch": 14.540783034257748, "grad_norm": 0.21495066583156586, "learning_rate": 1.0515765991424795e-05, "loss": 0.0203, "num_input_tokens_seen": 192341456, "step": 89135 }, { "epoch": 14.541598694942904, "grad_norm": 2.146066665649414, "learning_rate": 1.0512865328681865e-05, "loss": 0.1455, "num_input_tokens_seen": 192352592, "step": 89140 }, { "epoch": 14.54241435562806, "grad_norm": 1.155896544456482, "learning_rate": 1.0509964959539115e-05, "loss": 0.1301, "num_input_tokens_seen": 192363312, "step": 89145 }, { "epoch": 14.543230016313213, "grad_norm": 0.01829114928841591, "learning_rate": 1.0507064884055287e-05, "loss": 0.1122, "num_input_tokens_seen": 192374288, "step": 89150 }, { "epoch": 14.544045676998369, "grad_norm": 0.09576860070228577, "learning_rate": 1.0504165102289196e-05, "loss": 0.0102, "num_input_tokens_seen": 192384976, "step": 89155 }, { "epoch": 14.544861337683523, "grad_norm": 1.0700641870498657, "learning_rate": 1.0501265614299569e-05, "loss": 0.2511, "num_input_tokens_seen": 192395152, "step": 89160 }, { "epoch": 14.545676998368679, "grad_norm": 0.9115756750106812, "learning_rate": 1.0498366420145203e-05, "loss": 0.1463, "num_input_tokens_seen": 192406864, "step": 89165 }, { "epoch": 14.546492659053834, "grad_norm": 0.05048998072743416, "learning_rate": 1.0495467519884824e-05, "loss": 0.0112, "num_input_tokens_seen": 192418192, "step": 89170 }, { "epoch": 14.547308319738988, "grad_norm": 0.22115084528923035, "learning_rate": 1.0492568913577212e-05, "loss": 0.0163, "num_input_tokens_seen": 192429232, "step": 89175 }, { "epoch": 14.548123980424144, "grad_norm": 0.4808785319328308, "learning_rate": 1.0489670601281076e-05, "loss": 0.1816, "num_input_tokens_seen": 192439632, "step": 89180 }, { "epoch": 14.548939641109298, "grad_norm": 1.31231689453125, "learning_rate": 1.0486772583055184e-05, "loss": 0.0585, "num_input_tokens_seen": 192451024, "step": 89185 }, { "epoch": 14.549755301794454, "grad_norm": 2.712219476699829, "learning_rate": 1.0483874858958257e-05, "loss": 0.2159, "num_input_tokens_seen": 192462096, "step": 89190 }, { "epoch": 14.550570962479608, "grad_norm": 0.7089307904243469, "learning_rate": 1.0480977429049019e-05, "loss": 0.2379, "num_input_tokens_seen": 192473040, "step": 89195 }, { "epoch": 14.551386623164763, "grad_norm": 0.5800131559371948, "learning_rate": 1.0478080293386195e-05, "loss": 0.1021, "num_input_tokens_seen": 192483568, "step": 89200 }, { "epoch": 14.552202283849919, "grad_norm": 0.7648651599884033, "learning_rate": 1.0475183452028491e-05, "loss": 0.0706, "num_input_tokens_seen": 192493680, "step": 89205 }, { "epoch": 14.553017944535073, "grad_norm": 0.9234951734542847, "learning_rate": 1.0472286905034626e-05, "loss": 0.0461, "num_input_tokens_seen": 192504592, "step": 89210 }, { "epoch": 14.553833605220229, "grad_norm": 2.054124355316162, "learning_rate": 1.0469390652463294e-05, "loss": 0.0782, "num_input_tokens_seen": 192514960, "step": 89215 }, { "epoch": 14.554649265905383, "grad_norm": 1.576511025428772, "learning_rate": 1.0466494694373194e-05, "loss": 0.0871, "num_input_tokens_seen": 192525808, "step": 89220 }, { "epoch": 14.555464926590538, "grad_norm": 0.12724018096923828, "learning_rate": 1.0463599030823015e-05, "loss": 0.1454, "num_input_tokens_seen": 192536976, "step": 89225 }, { "epoch": 14.556280587275694, "grad_norm": 0.020168771967291832, "learning_rate": 1.0460703661871446e-05, "loss": 0.1003, "num_input_tokens_seen": 192548816, "step": 89230 }, { "epoch": 14.557096247960848, "grad_norm": 1.5577815771102905, "learning_rate": 1.0457808587577159e-05, "loss": 0.1383, "num_input_tokens_seen": 192559888, "step": 89235 }, { "epoch": 14.557911908646004, "grad_norm": 0.22248536348342896, "learning_rate": 1.0454913807998829e-05, "loss": 0.0821, "num_input_tokens_seen": 192571248, "step": 89240 }, { "epoch": 14.558727569331158, "grad_norm": 0.2160450518131256, "learning_rate": 1.0452019323195122e-05, "loss": 0.0347, "num_input_tokens_seen": 192581840, "step": 89245 }, { "epoch": 14.559543230016313, "grad_norm": 0.5628939270973206, "learning_rate": 1.04491251332247e-05, "loss": 0.1149, "num_input_tokens_seen": 192591952, "step": 89250 }, { "epoch": 14.560358890701469, "grad_norm": 0.2974378168582916, "learning_rate": 1.0446231238146215e-05, "loss": 0.0499, "num_input_tokens_seen": 192602800, "step": 89255 }, { "epoch": 14.561174551386623, "grad_norm": 0.7782292366027832, "learning_rate": 1.044333763801832e-05, "loss": 0.0171, "num_input_tokens_seen": 192613360, "step": 89260 }, { "epoch": 14.561990212071779, "grad_norm": 0.054160453379154205, "learning_rate": 1.0440444332899652e-05, "loss": 0.0095, "num_input_tokens_seen": 192623952, "step": 89265 }, { "epoch": 14.562805872756933, "grad_norm": 2.466844320297241, "learning_rate": 1.0437551322848848e-05, "loss": 0.0744, "num_input_tokens_seen": 192635344, "step": 89270 }, { "epoch": 14.563621533442088, "grad_norm": 0.5396175384521484, "learning_rate": 1.043465860792454e-05, "loss": 0.0195, "num_input_tokens_seen": 192646736, "step": 89275 }, { "epoch": 14.564437194127244, "grad_norm": 0.3158602714538574, "learning_rate": 1.0431766188185357e-05, "loss": 0.0203, "num_input_tokens_seen": 192657200, "step": 89280 }, { "epoch": 14.565252854812398, "grad_norm": 0.6506655216217041, "learning_rate": 1.04288740636899e-05, "loss": 0.0941, "num_input_tokens_seen": 192667984, "step": 89285 }, { "epoch": 14.566068515497554, "grad_norm": 0.05768663436174393, "learning_rate": 1.0425982234496806e-05, "loss": 0.0992, "num_input_tokens_seen": 192680336, "step": 89290 }, { "epoch": 14.566884176182707, "grad_norm": 1.2833367586135864, "learning_rate": 1.042309070066467e-05, "loss": 0.0925, "num_input_tokens_seen": 192691728, "step": 89295 }, { "epoch": 14.567699836867863, "grad_norm": 0.07436201721429825, "learning_rate": 1.0420199462252095e-05, "loss": 0.0656, "num_input_tokens_seen": 192703408, "step": 89300 }, { "epoch": 14.568515497553017, "grad_norm": 0.5869126319885254, "learning_rate": 1.041730851931767e-05, "loss": 0.0238, "num_input_tokens_seen": 192713264, "step": 89305 }, { "epoch": 14.569331158238173, "grad_norm": 1.2862493991851807, "learning_rate": 1.041441787191999e-05, "loss": 0.0968, "num_input_tokens_seen": 192723568, "step": 89310 }, { "epoch": 14.570146818923329, "grad_norm": 1.2742362022399902, "learning_rate": 1.0411527520117634e-05, "loss": 0.1041, "num_input_tokens_seen": 192732464, "step": 89315 }, { "epoch": 14.570962479608482, "grad_norm": 0.5483509302139282, "learning_rate": 1.0408637463969182e-05, "loss": 0.1077, "num_input_tokens_seen": 192744112, "step": 89320 }, { "epoch": 14.571778140293638, "grad_norm": 1.222884178161621, "learning_rate": 1.0405747703533197e-05, "loss": 0.2179, "num_input_tokens_seen": 192755600, "step": 89325 }, { "epoch": 14.572593800978792, "grad_norm": 1.6922296285629272, "learning_rate": 1.0402858238868255e-05, "loss": 0.1789, "num_input_tokens_seen": 192766640, "step": 89330 }, { "epoch": 14.573409461663948, "grad_norm": 0.09209597110748291, "learning_rate": 1.0399969070032903e-05, "loss": 0.0608, "num_input_tokens_seen": 192777968, "step": 89335 }, { "epoch": 14.574225122349104, "grad_norm": 0.31601014733314514, "learning_rate": 1.0397080197085701e-05, "loss": 0.0247, "num_input_tokens_seen": 192788784, "step": 89340 }, { "epoch": 14.575040783034257, "grad_norm": 2.6741998195648193, "learning_rate": 1.0394191620085197e-05, "loss": 0.2093, "num_input_tokens_seen": 192800880, "step": 89345 }, { "epoch": 14.575856443719413, "grad_norm": 1.2069456577301025, "learning_rate": 1.0391303339089925e-05, "loss": 0.0998, "num_input_tokens_seen": 192811632, "step": 89350 }, { "epoch": 14.576672104404567, "grad_norm": 0.29920318722724915, "learning_rate": 1.0388415354158426e-05, "loss": 0.0686, "num_input_tokens_seen": 192822928, "step": 89355 }, { "epoch": 14.577487765089723, "grad_norm": 0.05566592514514923, "learning_rate": 1.0385527665349223e-05, "loss": 0.0852, "num_input_tokens_seen": 192833680, "step": 89360 }, { "epoch": 14.578303425774878, "grad_norm": 1.3098098039627075, "learning_rate": 1.0382640272720845e-05, "loss": 0.1795, "num_input_tokens_seen": 192844016, "step": 89365 }, { "epoch": 14.579119086460032, "grad_norm": 0.0980444848537445, "learning_rate": 1.0379753176331802e-05, "loss": 0.0722, "num_input_tokens_seen": 192854864, "step": 89370 }, { "epoch": 14.579934747145188, "grad_norm": 1.4554049968719482, "learning_rate": 1.037686637624061e-05, "loss": 0.2407, "num_input_tokens_seen": 192865808, "step": 89375 }, { "epoch": 14.580750407830342, "grad_norm": 0.09630529582500458, "learning_rate": 1.0373979872505771e-05, "loss": 0.0749, "num_input_tokens_seen": 192876880, "step": 89380 }, { "epoch": 14.581566068515498, "grad_norm": 0.08178719133138657, "learning_rate": 1.0371093665185786e-05, "loss": 0.1849, "num_input_tokens_seen": 192887728, "step": 89385 }, { "epoch": 14.582381729200652, "grad_norm": 1.6903209686279297, "learning_rate": 1.0368207754339143e-05, "loss": 0.144, "num_input_tokens_seen": 192900400, "step": 89390 }, { "epoch": 14.583197389885807, "grad_norm": 0.8760883212089539, "learning_rate": 1.0365322140024325e-05, "loss": 0.1449, "num_input_tokens_seen": 192912336, "step": 89395 }, { "epoch": 14.584013050570963, "grad_norm": 0.04388391971588135, "learning_rate": 1.0362436822299834e-05, "loss": 0.0509, "num_input_tokens_seen": 192922960, "step": 89400 }, { "epoch": 14.584828711256117, "grad_norm": 0.06370198726654053, "learning_rate": 1.0359551801224115e-05, "loss": 0.0462, "num_input_tokens_seen": 192933872, "step": 89405 }, { "epoch": 14.585644371941273, "grad_norm": 0.13391995429992676, "learning_rate": 1.035666707685567e-05, "loss": 0.1119, "num_input_tokens_seen": 192945232, "step": 89410 }, { "epoch": 14.586460032626427, "grad_norm": 1.1919289827346802, "learning_rate": 1.0353782649252922e-05, "loss": 0.1663, "num_input_tokens_seen": 192954992, "step": 89415 }, { "epoch": 14.587275693311582, "grad_norm": 0.19164232909679413, "learning_rate": 1.035089851847437e-05, "loss": 0.1567, "num_input_tokens_seen": 192965072, "step": 89420 }, { "epoch": 14.588091353996738, "grad_norm": 0.05402286350727081, "learning_rate": 1.0348014684578423e-05, "loss": 0.0784, "num_input_tokens_seen": 192974576, "step": 89425 }, { "epoch": 14.588907014681892, "grad_norm": 1.9977526664733887, "learning_rate": 1.0345131147623555e-05, "loss": 0.0988, "num_input_tokens_seen": 192985424, "step": 89430 }, { "epoch": 14.589722675367048, "grad_norm": 2.1434295177459717, "learning_rate": 1.0342247907668195e-05, "loss": 0.0493, "num_input_tokens_seen": 192996624, "step": 89435 }, { "epoch": 14.590538336052202, "grad_norm": 1.4287467002868652, "learning_rate": 1.0339364964770775e-05, "loss": 0.1194, "num_input_tokens_seen": 193005648, "step": 89440 }, { "epoch": 14.591353996737357, "grad_norm": 0.28374212980270386, "learning_rate": 1.0336482318989727e-05, "loss": 0.0657, "num_input_tokens_seen": 193017392, "step": 89445 }, { "epoch": 14.592169657422513, "grad_norm": 0.9274686574935913, "learning_rate": 1.0333599970383461e-05, "loss": 0.0895, "num_input_tokens_seen": 193029040, "step": 89450 }, { "epoch": 14.592985318107667, "grad_norm": 1.7358052730560303, "learning_rate": 1.03307179190104e-05, "loss": 0.1216, "num_input_tokens_seen": 193040144, "step": 89455 }, { "epoch": 14.593800978792823, "grad_norm": 0.12365169078111649, "learning_rate": 1.032783616492895e-05, "loss": 0.0692, "num_input_tokens_seen": 193050704, "step": 89460 }, { "epoch": 14.594616639477977, "grad_norm": 0.2607755661010742, "learning_rate": 1.0324954708197513e-05, "loss": 0.0663, "num_input_tokens_seen": 193061648, "step": 89465 }, { "epoch": 14.595432300163132, "grad_norm": 0.23223643004894257, "learning_rate": 1.0322073548874484e-05, "loss": 0.0643, "num_input_tokens_seen": 193071312, "step": 89470 }, { "epoch": 14.596247960848288, "grad_norm": 0.18374000489711761, "learning_rate": 1.0319192687018257e-05, "loss": 0.0314, "num_input_tokens_seen": 193081648, "step": 89475 }, { "epoch": 14.597063621533442, "grad_norm": 0.39162477850914, "learning_rate": 1.0316312122687213e-05, "loss": 0.0369, "num_input_tokens_seen": 193092144, "step": 89480 }, { "epoch": 14.597879282218598, "grad_norm": 3.4633591175079346, "learning_rate": 1.031343185593973e-05, "loss": 0.0578, "num_input_tokens_seen": 193103344, "step": 89485 }, { "epoch": 14.598694942903752, "grad_norm": 0.03604632243514061, "learning_rate": 1.0310551886834182e-05, "loss": 0.0889, "num_input_tokens_seen": 193113712, "step": 89490 }, { "epoch": 14.599510603588907, "grad_norm": 0.10890353471040726, "learning_rate": 1.0307672215428937e-05, "loss": 0.0604, "num_input_tokens_seen": 193124752, "step": 89495 }, { "epoch": 14.600326264274061, "grad_norm": 1.0257718563079834, "learning_rate": 1.0304792841782342e-05, "loss": 0.1791, "num_input_tokens_seen": 193135888, "step": 89500 }, { "epoch": 14.601141924959217, "grad_norm": 2.0738613605499268, "learning_rate": 1.0301913765952784e-05, "loss": 0.1388, "num_input_tokens_seen": 193146960, "step": 89505 }, { "epoch": 14.601957585644373, "grad_norm": 0.06246068328619003, "learning_rate": 1.0299034987998568e-05, "loss": 0.0847, "num_input_tokens_seen": 193156304, "step": 89510 }, { "epoch": 14.602773246329527, "grad_norm": 0.2865910530090332, "learning_rate": 1.0296156507978075e-05, "loss": 0.1118, "num_input_tokens_seen": 193165808, "step": 89515 }, { "epoch": 14.603588907014682, "grad_norm": 0.11336840689182281, "learning_rate": 1.0293278325949606e-05, "loss": 0.1441, "num_input_tokens_seen": 193175632, "step": 89520 }, { "epoch": 14.604404567699836, "grad_norm": 0.07920786738395691, "learning_rate": 1.0290400441971528e-05, "loss": 0.0386, "num_input_tokens_seen": 193187792, "step": 89525 }, { "epoch": 14.605220228384992, "grad_norm": 0.13415326178073883, "learning_rate": 1.0287522856102127e-05, "loss": 0.1137, "num_input_tokens_seen": 193198192, "step": 89530 }, { "epoch": 14.606035889070148, "grad_norm": 0.8578530550003052, "learning_rate": 1.0284645568399756e-05, "loss": 0.2011, "num_input_tokens_seen": 193208368, "step": 89535 }, { "epoch": 14.606851549755302, "grad_norm": 0.07556406408548355, "learning_rate": 1.0281768578922696e-05, "loss": 0.0286, "num_input_tokens_seen": 193219440, "step": 89540 }, { "epoch": 14.607667210440457, "grad_norm": 0.04847618192434311, "learning_rate": 1.027889188772928e-05, "loss": 0.022, "num_input_tokens_seen": 193230512, "step": 89545 }, { "epoch": 14.608482871125611, "grad_norm": 2.6777446269989014, "learning_rate": 1.027601549487778e-05, "loss": 0.1451, "num_input_tokens_seen": 193240784, "step": 89550 }, { "epoch": 14.609298531810767, "grad_norm": 0.03383222967386246, "learning_rate": 1.0273139400426523e-05, "loss": 0.0094, "num_input_tokens_seen": 193250160, "step": 89555 }, { "epoch": 14.61011419249592, "grad_norm": 0.6189484596252441, "learning_rate": 1.027026360443376e-05, "loss": 0.0683, "num_input_tokens_seen": 193260080, "step": 89560 }, { "epoch": 14.610929853181077, "grad_norm": 0.23819303512573242, "learning_rate": 1.026738810695781e-05, "loss": 0.0457, "num_input_tokens_seen": 193269936, "step": 89565 }, { "epoch": 14.611745513866232, "grad_norm": 0.061261970549821854, "learning_rate": 1.0264512908056908e-05, "loss": 0.056, "num_input_tokens_seen": 193280272, "step": 89570 }, { "epoch": 14.612561174551386, "grad_norm": 2.606640338897705, "learning_rate": 1.0261638007789356e-05, "loss": 0.0925, "num_input_tokens_seen": 193291984, "step": 89575 }, { "epoch": 14.613376835236542, "grad_norm": 0.0479840524494648, "learning_rate": 1.025876340621341e-05, "loss": 0.1224, "num_input_tokens_seen": 193303760, "step": 89580 }, { "epoch": 14.614192495921696, "grad_norm": 0.21238838136196136, "learning_rate": 1.025588910338732e-05, "loss": 0.1318, "num_input_tokens_seen": 193314224, "step": 89585 }, { "epoch": 14.615008156606851, "grad_norm": 0.7326000928878784, "learning_rate": 1.0253015099369343e-05, "loss": 0.1411, "num_input_tokens_seen": 193324848, "step": 89590 }, { "epoch": 14.615823817292007, "grad_norm": 1.136375904083252, "learning_rate": 1.0250141394217722e-05, "loss": 0.1582, "num_input_tokens_seen": 193335920, "step": 89595 }, { "epoch": 14.616639477977161, "grad_norm": 1.5118097066879272, "learning_rate": 1.02472679879907e-05, "loss": 0.1013, "num_input_tokens_seen": 193346960, "step": 89600 }, { "epoch": 14.617455138662317, "grad_norm": 0.09038794785737991, "learning_rate": 1.0244394880746505e-05, "loss": 0.1038, "num_input_tokens_seen": 193359568, "step": 89605 }, { "epoch": 14.61827079934747, "grad_norm": 0.08369489014148712, "learning_rate": 1.0241522072543366e-05, "loss": 0.06, "num_input_tokens_seen": 193369840, "step": 89610 }, { "epoch": 14.619086460032626, "grad_norm": 0.09310515224933624, "learning_rate": 1.0238649563439506e-05, "loss": 0.0075, "num_input_tokens_seen": 193379376, "step": 89615 }, { "epoch": 14.619902120717782, "grad_norm": 0.10636681318283081, "learning_rate": 1.0235777353493137e-05, "loss": 0.0814, "num_input_tokens_seen": 193389584, "step": 89620 }, { "epoch": 14.620717781402936, "grad_norm": 0.0756598562002182, "learning_rate": 1.0232905442762463e-05, "loss": 0.0386, "num_input_tokens_seen": 193400464, "step": 89625 }, { "epoch": 14.621533442088092, "grad_norm": 0.03285858407616615, "learning_rate": 1.0230033831305711e-05, "loss": 0.0319, "num_input_tokens_seen": 193411792, "step": 89630 }, { "epoch": 14.622349102773246, "grad_norm": 0.6051205396652222, "learning_rate": 1.022716251918104e-05, "loss": 0.0273, "num_input_tokens_seen": 193422576, "step": 89635 }, { "epoch": 14.623164763458401, "grad_norm": 0.06647537648677826, "learning_rate": 1.0224291506446681e-05, "loss": 0.0587, "num_input_tokens_seen": 193433872, "step": 89640 }, { "epoch": 14.623980424143557, "grad_norm": 1.2309526205062866, "learning_rate": 1.0221420793160783e-05, "loss": 0.1195, "num_input_tokens_seen": 193444272, "step": 89645 }, { "epoch": 14.624796084828711, "grad_norm": 1.179840326309204, "learning_rate": 1.0218550379381555e-05, "loss": 0.1169, "num_input_tokens_seen": 193456080, "step": 89650 }, { "epoch": 14.625611745513867, "grad_norm": 0.6301822662353516, "learning_rate": 1.021568026516714e-05, "loss": 0.2017, "num_input_tokens_seen": 193467888, "step": 89655 }, { "epoch": 14.62642740619902, "grad_norm": 1.6734673976898193, "learning_rate": 1.0212810450575736e-05, "loss": 0.0424, "num_input_tokens_seen": 193478000, "step": 89660 }, { "epoch": 14.627243066884176, "grad_norm": 0.2783251404762268, "learning_rate": 1.0209940935665472e-05, "loss": 0.2077, "num_input_tokens_seen": 193487984, "step": 89665 }, { "epoch": 14.62805872756933, "grad_norm": 4.781391620635986, "learning_rate": 1.0207071720494533e-05, "loss": 0.301, "num_input_tokens_seen": 193498256, "step": 89670 }, { "epoch": 14.628874388254486, "grad_norm": 1.075749397277832, "learning_rate": 1.0204202805121032e-05, "loss": 0.1092, "num_input_tokens_seen": 193508432, "step": 89675 }, { "epoch": 14.629690048939642, "grad_norm": 1.1193658113479614, "learning_rate": 1.0201334189603151e-05, "loss": 0.1606, "num_input_tokens_seen": 193518064, "step": 89680 }, { "epoch": 14.630505709624796, "grad_norm": 0.059780675917863846, "learning_rate": 1.0198465873998988e-05, "loss": 0.0108, "num_input_tokens_seen": 193529136, "step": 89685 }, { "epoch": 14.631321370309951, "grad_norm": 0.32118988037109375, "learning_rate": 1.019559785836671e-05, "loss": 0.0108, "num_input_tokens_seen": 193538640, "step": 89690 }, { "epoch": 14.632137030995105, "grad_norm": 0.08386247605085373, "learning_rate": 1.0192730142764403e-05, "loss": 0.0492, "num_input_tokens_seen": 193549488, "step": 89695 }, { "epoch": 14.632952691680261, "grad_norm": 0.031927689909935, "learning_rate": 1.0189862727250222e-05, "loss": 0.0465, "num_input_tokens_seen": 193559024, "step": 89700 }, { "epoch": 14.633768352365417, "grad_norm": 0.7832997441291809, "learning_rate": 1.0186995611882244e-05, "loss": 0.1472, "num_input_tokens_seen": 193570352, "step": 89705 }, { "epoch": 14.63458401305057, "grad_norm": 0.03941537067294121, "learning_rate": 1.0184128796718605e-05, "loss": 0.1131, "num_input_tokens_seen": 193582096, "step": 89710 }, { "epoch": 14.635399673735726, "grad_norm": 1.1857045888900757, "learning_rate": 1.018126228181738e-05, "loss": 0.0596, "num_input_tokens_seen": 193592112, "step": 89715 }, { "epoch": 14.63621533442088, "grad_norm": 0.613910973072052, "learning_rate": 1.0178396067236679e-05, "loss": 0.0553, "num_input_tokens_seen": 193603632, "step": 89720 }, { "epoch": 14.637030995106036, "grad_norm": 0.18755654990673065, "learning_rate": 1.0175530153034584e-05, "loss": 0.2879, "num_input_tokens_seen": 193614736, "step": 89725 }, { "epoch": 14.63784665579119, "grad_norm": 0.06044655293226242, "learning_rate": 1.0172664539269177e-05, "loss": 0.092, "num_input_tokens_seen": 193624656, "step": 89730 }, { "epoch": 14.638662316476346, "grad_norm": 0.04466542229056358, "learning_rate": 1.0169799225998534e-05, "loss": 0.0518, "num_input_tokens_seen": 193634960, "step": 89735 }, { "epoch": 14.639477977161501, "grad_norm": 0.17101508378982544, "learning_rate": 1.0166934213280723e-05, "loss": 0.2425, "num_input_tokens_seen": 193646576, "step": 89740 }, { "epoch": 14.640293637846655, "grad_norm": 0.06858876347541809, "learning_rate": 1.0164069501173806e-05, "loss": 0.0735, "num_input_tokens_seen": 193657136, "step": 89745 }, { "epoch": 14.641109298531811, "grad_norm": 0.2731941044330597, "learning_rate": 1.0161205089735842e-05, "loss": 0.045, "num_input_tokens_seen": 193668944, "step": 89750 }, { "epoch": 14.641924959216965, "grad_norm": 1.7672947645187378, "learning_rate": 1.015834097902488e-05, "loss": 0.2665, "num_input_tokens_seen": 193679952, "step": 89755 }, { "epoch": 14.64274061990212, "grad_norm": 0.09531202167272568, "learning_rate": 1.0155477169098967e-05, "loss": 0.1499, "num_input_tokens_seen": 193691856, "step": 89760 }, { "epoch": 14.643556280587276, "grad_norm": 0.2451341152191162, "learning_rate": 1.0152613660016142e-05, "loss": 0.0548, "num_input_tokens_seen": 193701968, "step": 89765 }, { "epoch": 14.64437194127243, "grad_norm": 2.6068005561828613, "learning_rate": 1.0149750451834427e-05, "loss": 0.1405, "num_input_tokens_seen": 193712528, "step": 89770 }, { "epoch": 14.645187601957586, "grad_norm": 0.0531691312789917, "learning_rate": 1.0146887544611874e-05, "loss": 0.1472, "num_input_tokens_seen": 193723664, "step": 89775 }, { "epoch": 14.64600326264274, "grad_norm": 2.1134188175201416, "learning_rate": 1.0144024938406471e-05, "loss": 0.1191, "num_input_tokens_seen": 193734576, "step": 89780 }, { "epoch": 14.646818923327896, "grad_norm": 1.3211568593978882, "learning_rate": 1.0141162633276268e-05, "loss": 0.0509, "num_input_tokens_seen": 193746192, "step": 89785 }, { "epoch": 14.647634584013051, "grad_norm": 0.32786527276039124, "learning_rate": 1.0138300629279234e-05, "loss": 0.101, "num_input_tokens_seen": 193755120, "step": 89790 }, { "epoch": 14.648450244698205, "grad_norm": 0.12658867239952087, "learning_rate": 1.013543892647341e-05, "loss": 0.0145, "num_input_tokens_seen": 193764880, "step": 89795 }, { "epoch": 14.649265905383361, "grad_norm": 0.46488678455352783, "learning_rate": 1.0132577524916755e-05, "loss": 0.0328, "num_input_tokens_seen": 193776304, "step": 89800 }, { "epoch": 14.650081566068515, "grad_norm": 0.06790382415056229, "learning_rate": 1.0129716424667296e-05, "loss": 0.0835, "num_input_tokens_seen": 193786544, "step": 89805 }, { "epoch": 14.65089722675367, "grad_norm": 0.3187323212623596, "learning_rate": 1.0126855625782978e-05, "loss": 0.0393, "num_input_tokens_seen": 193798000, "step": 89810 }, { "epoch": 14.651712887438826, "grad_norm": 0.3866710960865021, "learning_rate": 1.0123995128321817e-05, "loss": 0.1746, "num_input_tokens_seen": 193810032, "step": 89815 }, { "epoch": 14.65252854812398, "grad_norm": 0.11401918530464172, "learning_rate": 1.012113493234175e-05, "loss": 0.0278, "num_input_tokens_seen": 193820240, "step": 89820 }, { "epoch": 14.653344208809136, "grad_norm": 1.683719277381897, "learning_rate": 1.0118275037900768e-05, "loss": 0.1663, "num_input_tokens_seen": 193830480, "step": 89825 }, { "epoch": 14.65415986949429, "grad_norm": 1.0895808935165405, "learning_rate": 1.0115415445056819e-05, "loss": 0.0437, "num_input_tokens_seen": 193841168, "step": 89830 }, { "epoch": 14.654975530179446, "grad_norm": 0.13823868334293365, "learning_rate": 1.0112556153867858e-05, "loss": 0.0932, "num_input_tokens_seen": 193852752, "step": 89835 }, { "epoch": 14.655791190864601, "grad_norm": 1.0084877014160156, "learning_rate": 1.0109697164391835e-05, "loss": 0.0648, "num_input_tokens_seen": 193863824, "step": 89840 }, { "epoch": 14.656606851549755, "grad_norm": 0.1465008407831192, "learning_rate": 1.0106838476686687e-05, "loss": 0.0609, "num_input_tokens_seen": 193874640, "step": 89845 }, { "epoch": 14.65742251223491, "grad_norm": 1.5465130805969238, "learning_rate": 1.0103980090810352e-05, "loss": 0.1056, "num_input_tokens_seen": 193885520, "step": 89850 }, { "epoch": 14.658238172920065, "grad_norm": 0.4909383952617645, "learning_rate": 1.0101122006820757e-05, "loss": 0.1115, "num_input_tokens_seen": 193897424, "step": 89855 }, { "epoch": 14.65905383360522, "grad_norm": 0.03346407786011696, "learning_rate": 1.0098264224775825e-05, "loss": 0.0551, "num_input_tokens_seen": 193909264, "step": 89860 }, { "epoch": 14.659869494290374, "grad_norm": 1.4959962368011475, "learning_rate": 1.0095406744733471e-05, "loss": 0.035, "num_input_tokens_seen": 193918480, "step": 89865 }, { "epoch": 14.66068515497553, "grad_norm": 0.7311946749687195, "learning_rate": 1.0092549566751608e-05, "loss": 0.0273, "num_input_tokens_seen": 193929520, "step": 89870 }, { "epoch": 14.661500815660686, "grad_norm": 1.7826814651489258, "learning_rate": 1.0089692690888142e-05, "loss": 0.0467, "num_input_tokens_seen": 193940880, "step": 89875 }, { "epoch": 14.66231647634584, "grad_norm": 1.032880425453186, "learning_rate": 1.0086836117200966e-05, "loss": 0.0342, "num_input_tokens_seen": 193952560, "step": 89880 }, { "epoch": 14.663132137030995, "grad_norm": 0.0491739846765995, "learning_rate": 1.0083979845747974e-05, "loss": 0.0082, "num_input_tokens_seen": 193964208, "step": 89885 }, { "epoch": 14.66394779771615, "grad_norm": 1.8257348537445068, "learning_rate": 1.0081123876587054e-05, "loss": 0.0593, "num_input_tokens_seen": 193974160, "step": 89890 }, { "epoch": 14.664763458401305, "grad_norm": 1.6731607913970947, "learning_rate": 1.0078268209776084e-05, "loss": 0.2045, "num_input_tokens_seen": 193985520, "step": 89895 }, { "epoch": 14.66557911908646, "grad_norm": 1.6516740322113037, "learning_rate": 1.0075412845372942e-05, "loss": 0.2649, "num_input_tokens_seen": 193997008, "step": 89900 }, { "epoch": 14.666394779771615, "grad_norm": 0.4329712390899658, "learning_rate": 1.0072557783435487e-05, "loss": 0.1769, "num_input_tokens_seen": 194007472, "step": 89905 }, { "epoch": 14.66721044045677, "grad_norm": 2.2740705013275146, "learning_rate": 1.0069703024021588e-05, "loss": 0.2218, "num_input_tokens_seen": 194019888, "step": 89910 }, { "epoch": 14.668026101141924, "grad_norm": 0.6454143524169922, "learning_rate": 1.0066848567189089e-05, "loss": 0.0702, "num_input_tokens_seen": 194029968, "step": 89915 }, { "epoch": 14.66884176182708, "grad_norm": 1.744791030883789, "learning_rate": 1.0063994412995864e-05, "loss": 0.1421, "num_input_tokens_seen": 194040432, "step": 89920 }, { "epoch": 14.669657422512234, "grad_norm": 1.4893560409545898, "learning_rate": 1.0061140561499722e-05, "loss": 0.1315, "num_input_tokens_seen": 194050256, "step": 89925 }, { "epoch": 14.67047308319739, "grad_norm": 2.0984816551208496, "learning_rate": 1.0058287012758528e-05, "loss": 0.1252, "num_input_tokens_seen": 194061104, "step": 89930 }, { "epoch": 14.671288743882545, "grad_norm": 0.50266432762146, "learning_rate": 1.0055433766830102e-05, "loss": 0.049, "num_input_tokens_seen": 194071184, "step": 89935 }, { "epoch": 14.6721044045677, "grad_norm": 0.2682243585586548, "learning_rate": 1.0052580823772267e-05, "loss": 0.0541, "num_input_tokens_seen": 194082160, "step": 89940 }, { "epoch": 14.672920065252855, "grad_norm": 0.24877232313156128, "learning_rate": 1.0049728183642848e-05, "loss": 0.0485, "num_input_tokens_seen": 194094096, "step": 89945 }, { "epoch": 14.673735725938009, "grad_norm": 0.1993735134601593, "learning_rate": 1.004687584649965e-05, "loss": 0.0612, "num_input_tokens_seen": 194105424, "step": 89950 }, { "epoch": 14.674551386623165, "grad_norm": 1.5230742692947388, "learning_rate": 1.0044023812400483e-05, "loss": 0.184, "num_input_tokens_seen": 194115536, "step": 89955 }, { "epoch": 14.67536704730832, "grad_norm": 2.2013778686523438, "learning_rate": 1.0041172081403147e-05, "loss": 0.1801, "num_input_tokens_seen": 194127856, "step": 89960 }, { "epoch": 14.676182707993474, "grad_norm": 1.9327865839004517, "learning_rate": 1.0038320653565436e-05, "loss": 0.1511, "num_input_tokens_seen": 194138480, "step": 89965 }, { "epoch": 14.67699836867863, "grad_norm": 0.03761490806937218, "learning_rate": 1.0035469528945137e-05, "loss": 0.1576, "num_input_tokens_seen": 194148848, "step": 89970 }, { "epoch": 14.677814029363784, "grad_norm": 0.2080868035554886, "learning_rate": 1.003261870760003e-05, "loss": 0.1252, "num_input_tokens_seen": 194161424, "step": 89975 }, { "epoch": 14.67862969004894, "grad_norm": 0.06311897933483124, "learning_rate": 1.0029768189587896e-05, "loss": 0.0178, "num_input_tokens_seen": 194173872, "step": 89980 }, { "epoch": 14.679445350734095, "grad_norm": 1.049058198928833, "learning_rate": 1.0026917974966497e-05, "loss": 0.1928, "num_input_tokens_seen": 194184560, "step": 89985 }, { "epoch": 14.68026101141925, "grad_norm": 0.01391794253140688, "learning_rate": 1.00240680637936e-05, "loss": 0.1449, "num_input_tokens_seen": 194194800, "step": 89990 }, { "epoch": 14.681076672104405, "grad_norm": 0.03577511012554169, "learning_rate": 1.0021218456126965e-05, "loss": 0.0477, "num_input_tokens_seen": 194205072, "step": 89995 }, { "epoch": 14.681892332789559, "grad_norm": 1.2131931781768799, "learning_rate": 1.0018369152024337e-05, "loss": 0.0932, "num_input_tokens_seen": 194215216, "step": 90000 }, { "epoch": 14.682707993474715, "grad_norm": 0.4040205776691437, "learning_rate": 1.0015520151543467e-05, "loss": 0.1091, "num_input_tokens_seen": 194225584, "step": 90005 }, { "epoch": 14.68352365415987, "grad_norm": 0.04048456996679306, "learning_rate": 1.0012671454742086e-05, "loss": 0.0303, "num_input_tokens_seen": 194236528, "step": 90010 }, { "epoch": 14.684339314845024, "grad_norm": 2.0450799465179443, "learning_rate": 1.0009823061677934e-05, "loss": 0.0341, "num_input_tokens_seen": 194247344, "step": 90015 }, { "epoch": 14.68515497553018, "grad_norm": 0.113827645778656, "learning_rate": 1.0006974972408734e-05, "loss": 0.048, "num_input_tokens_seen": 194257776, "step": 90020 }, { "epoch": 14.685970636215334, "grad_norm": 0.05435694381594658, "learning_rate": 1.0004127186992205e-05, "loss": 0.0954, "num_input_tokens_seen": 194269264, "step": 90025 }, { "epoch": 14.68678629690049, "grad_norm": 0.07268025726079941, "learning_rate": 1.000127970548606e-05, "loss": 0.0228, "num_input_tokens_seen": 194279600, "step": 90030 }, { "epoch": 14.687601957585644, "grad_norm": 1.9492801427841187, "learning_rate": 9.998432527948004e-06, "loss": 0.165, "num_input_tokens_seen": 194289808, "step": 90035 }, { "epoch": 14.6884176182708, "grad_norm": 0.7568288445472717, "learning_rate": 9.99558565443576e-06, "loss": 0.0716, "num_input_tokens_seen": 194301520, "step": 90040 }, { "epoch": 14.689233278955955, "grad_norm": 0.05184999480843544, "learning_rate": 9.992739085006988e-06, "loss": 0.0352, "num_input_tokens_seen": 194311280, "step": 90045 }, { "epoch": 14.690048939641109, "grad_norm": 0.9124466180801392, "learning_rate": 9.989892819719418e-06, "loss": 0.0218, "num_input_tokens_seen": 194322608, "step": 90050 }, { "epoch": 14.690864600326265, "grad_norm": 0.05822140350937843, "learning_rate": 9.987046858630692e-06, "loss": 0.2374, "num_input_tokens_seen": 194332592, "step": 90055 }, { "epoch": 14.691680261011419, "grad_norm": 0.17139074206352234, "learning_rate": 9.984201201798515e-06, "loss": 0.063, "num_input_tokens_seen": 194343472, "step": 90060 }, { "epoch": 14.692495921696574, "grad_norm": 0.46778324246406555, "learning_rate": 9.981355849280548e-06, "loss": 0.1261, "num_input_tokens_seen": 194354032, "step": 90065 }, { "epoch": 14.69331158238173, "grad_norm": 2.0257601737976074, "learning_rate": 9.978510801134461e-06, "loss": 0.1356, "num_input_tokens_seen": 194363952, "step": 90070 }, { "epoch": 14.694127243066884, "grad_norm": 0.38334429264068604, "learning_rate": 9.975666057417904e-06, "loss": 0.0441, "num_input_tokens_seen": 194374928, "step": 90075 }, { "epoch": 14.69494290375204, "grad_norm": 0.18975763022899628, "learning_rate": 9.972821618188538e-06, "loss": 0.0383, "num_input_tokens_seen": 194385584, "step": 90080 }, { "epoch": 14.695758564437194, "grad_norm": 0.11519847065210342, "learning_rate": 9.969977483504004e-06, "loss": 0.1303, "num_input_tokens_seen": 194395600, "step": 90085 }, { "epoch": 14.69657422512235, "grad_norm": 0.2798280417919159, "learning_rate": 9.967133653421943e-06, "loss": 0.0754, "num_input_tokens_seen": 194407952, "step": 90090 }, { "epoch": 14.697389885807503, "grad_norm": 1.031166672706604, "learning_rate": 9.96429012799999e-06, "loss": 0.0429, "num_input_tokens_seen": 194419024, "step": 90095 }, { "epoch": 14.698205546492659, "grad_norm": 0.029984116554260254, "learning_rate": 9.96144690729577e-06, "loss": 0.1026, "num_input_tokens_seen": 194431152, "step": 90100 }, { "epoch": 14.699021207177815, "grad_norm": 1.5240968465805054, "learning_rate": 9.958603991366907e-06, "loss": 0.0795, "num_input_tokens_seen": 194441200, "step": 90105 }, { "epoch": 14.699836867862969, "grad_norm": 0.02194315753877163, "learning_rate": 9.955761380271014e-06, "loss": 0.0972, "num_input_tokens_seen": 194452528, "step": 90110 }, { "epoch": 14.700652528548124, "grad_norm": 0.1873011291027069, "learning_rate": 9.952919074065706e-06, "loss": 0.1831, "num_input_tokens_seen": 194463760, "step": 90115 }, { "epoch": 14.701468189233278, "grad_norm": 1.940280556678772, "learning_rate": 9.950077072808577e-06, "loss": 0.1893, "num_input_tokens_seen": 194473840, "step": 90120 }, { "epoch": 14.702283849918434, "grad_norm": 1.516684889793396, "learning_rate": 9.947235376557229e-06, "loss": 0.058, "num_input_tokens_seen": 194485168, "step": 90125 }, { "epoch": 14.70309951060359, "grad_norm": 1.0840551853179932, "learning_rate": 9.944393985369255e-06, "loss": 0.1601, "num_input_tokens_seen": 194495984, "step": 90130 }, { "epoch": 14.703915171288743, "grad_norm": 0.080783411860466, "learning_rate": 9.941552899302234e-06, "loss": 0.1901, "num_input_tokens_seen": 194506704, "step": 90135 }, { "epoch": 14.7047308319739, "grad_norm": 0.026059500873088837, "learning_rate": 9.938712118413737e-06, "loss": 0.0054, "num_input_tokens_seen": 194517584, "step": 90140 }, { "epoch": 14.705546492659053, "grad_norm": 2.4760751724243164, "learning_rate": 9.935871642761363e-06, "loss": 0.3509, "num_input_tokens_seen": 194529392, "step": 90145 }, { "epoch": 14.706362153344209, "grad_norm": 0.11086868494749069, "learning_rate": 9.933031472402646e-06, "loss": 0.039, "num_input_tokens_seen": 194540240, "step": 90150 }, { "epoch": 14.707177814029365, "grad_norm": 0.8916731476783752, "learning_rate": 9.930191607395172e-06, "loss": 0.0718, "num_input_tokens_seen": 194551664, "step": 90155 }, { "epoch": 14.707993474714518, "grad_norm": 1.574699878692627, "learning_rate": 9.927352047796471e-06, "loss": 0.1307, "num_input_tokens_seen": 194562896, "step": 90160 }, { "epoch": 14.708809135399674, "grad_norm": 0.07144486159086227, "learning_rate": 9.924512793664114e-06, "loss": 0.0331, "num_input_tokens_seen": 194573200, "step": 90165 }, { "epoch": 14.709624796084828, "grad_norm": 1.7052044868469238, "learning_rate": 9.921673845055618e-06, "loss": 0.1433, "num_input_tokens_seen": 194583760, "step": 90170 }, { "epoch": 14.710440456769984, "grad_norm": 0.5777065753936768, "learning_rate": 9.918835202028542e-06, "loss": 0.0554, "num_input_tokens_seen": 194594704, "step": 90175 }, { "epoch": 14.71125611745514, "grad_norm": 0.32539018988609314, "learning_rate": 9.915996864640387e-06, "loss": 0.1765, "num_input_tokens_seen": 194605488, "step": 90180 }, { "epoch": 14.712071778140293, "grad_norm": 0.08714134246110916, "learning_rate": 9.91315883294871e-06, "loss": 0.0326, "num_input_tokens_seen": 194616624, "step": 90185 }, { "epoch": 14.71288743882545, "grad_norm": 0.5781254768371582, "learning_rate": 9.91032110701099e-06, "loss": 0.1367, "num_input_tokens_seen": 194626896, "step": 90190 }, { "epoch": 14.713703099510603, "grad_norm": 0.2897871732711792, "learning_rate": 9.907483686884772e-06, "loss": 0.1632, "num_input_tokens_seen": 194638576, "step": 90195 }, { "epoch": 14.714518760195759, "grad_norm": 0.038825515657663345, "learning_rate": 9.904646572627524e-06, "loss": 0.1453, "num_input_tokens_seen": 194649424, "step": 90200 }, { "epoch": 14.715334420880914, "grad_norm": 0.10252441465854645, "learning_rate": 9.901809764296773e-06, "loss": 0.0172, "num_input_tokens_seen": 194660432, "step": 90205 }, { "epoch": 14.716150081566068, "grad_norm": 0.09730606526136398, "learning_rate": 9.898973261950003e-06, "loss": 0.0173, "num_input_tokens_seen": 194670768, "step": 90210 }, { "epoch": 14.716965742251224, "grad_norm": 1.496357798576355, "learning_rate": 9.896137065644696e-06, "loss": 0.1138, "num_input_tokens_seen": 194681456, "step": 90215 }, { "epoch": 14.717781402936378, "grad_norm": 0.5960182547569275, "learning_rate": 9.89330117543833e-06, "loss": 0.189, "num_input_tokens_seen": 194693296, "step": 90220 }, { "epoch": 14.718597063621534, "grad_norm": 2.7487659454345703, "learning_rate": 9.890465591388382e-06, "loss": 0.2352, "num_input_tokens_seen": 194703952, "step": 90225 }, { "epoch": 14.719412724306688, "grad_norm": 0.05197209119796753, "learning_rate": 9.887630313552316e-06, "loss": 0.0224, "num_input_tokens_seen": 194715728, "step": 90230 }, { "epoch": 14.720228384991843, "grad_norm": 0.3382674753665924, "learning_rate": 9.884795341987591e-06, "loss": 0.0436, "num_input_tokens_seen": 194726640, "step": 90235 }, { "epoch": 14.721044045676999, "grad_norm": 0.3323324918746948, "learning_rate": 9.881960676751668e-06, "loss": 0.2332, "num_input_tokens_seen": 194737680, "step": 90240 }, { "epoch": 14.721859706362153, "grad_norm": 0.9970471858978271, "learning_rate": 9.879126317901985e-06, "loss": 0.0456, "num_input_tokens_seen": 194746768, "step": 90245 }, { "epoch": 14.722675367047309, "grad_norm": 0.049253664910793304, "learning_rate": 9.876292265495993e-06, "loss": 0.0893, "num_input_tokens_seen": 194756880, "step": 90250 }, { "epoch": 14.723491027732463, "grad_norm": 0.20462188124656677, "learning_rate": 9.873458519591114e-06, "loss": 0.0343, "num_input_tokens_seen": 194767440, "step": 90255 }, { "epoch": 14.724306688417618, "grad_norm": 0.023739483207464218, "learning_rate": 9.870625080244805e-06, "loss": 0.1889, "num_input_tokens_seen": 194778352, "step": 90260 }, { "epoch": 14.725122349102774, "grad_norm": 0.0858636200428009, "learning_rate": 9.867791947514454e-06, "loss": 0.2198, "num_input_tokens_seen": 194789104, "step": 90265 }, { "epoch": 14.725938009787928, "grad_norm": 0.29490140080451965, "learning_rate": 9.864959121457515e-06, "loss": 0.2631, "num_input_tokens_seen": 194799760, "step": 90270 }, { "epoch": 14.726753670473084, "grad_norm": 0.3556298613548279, "learning_rate": 9.862126602131358e-06, "loss": 0.0505, "num_input_tokens_seen": 194810416, "step": 90275 }, { "epoch": 14.727569331158238, "grad_norm": 1.916601538658142, "learning_rate": 9.859294389593432e-06, "loss": 0.0792, "num_input_tokens_seen": 194820880, "step": 90280 }, { "epoch": 14.728384991843393, "grad_norm": 0.2890927493572235, "learning_rate": 9.85646248390109e-06, "loss": 0.0074, "num_input_tokens_seen": 194831344, "step": 90285 }, { "epoch": 14.729200652528547, "grad_norm": 1.4797269105911255, "learning_rate": 9.853630885111768e-06, "loss": 0.0499, "num_input_tokens_seen": 194842352, "step": 90290 }, { "epoch": 14.730016313213703, "grad_norm": 0.060539305210113525, "learning_rate": 9.850799593282811e-06, "loss": 0.1736, "num_input_tokens_seen": 194853968, "step": 90295 }, { "epoch": 14.730831973898859, "grad_norm": 2.26326060295105, "learning_rate": 9.847968608471636e-06, "loss": 0.1335, "num_input_tokens_seen": 194864912, "step": 90300 }, { "epoch": 14.731647634584013, "grad_norm": 0.176497682929039, "learning_rate": 9.845137930735584e-06, "loss": 0.0398, "num_input_tokens_seen": 194875824, "step": 90305 }, { "epoch": 14.732463295269168, "grad_norm": 0.05015682429075241, "learning_rate": 9.842307560132053e-06, "loss": 0.0851, "num_input_tokens_seen": 194885936, "step": 90310 }, { "epoch": 14.733278955954322, "grad_norm": 1.5673123598098755, "learning_rate": 9.839477496718369e-06, "loss": 0.1577, "num_input_tokens_seen": 194895824, "step": 90315 }, { "epoch": 14.734094616639478, "grad_norm": 0.044195789843797684, "learning_rate": 9.836647740551924e-06, "loss": 0.1056, "num_input_tokens_seen": 194907184, "step": 90320 }, { "epoch": 14.734910277324634, "grad_norm": 0.3308340013027191, "learning_rate": 9.833818291690034e-06, "loss": 0.0273, "num_input_tokens_seen": 194916272, "step": 90325 }, { "epoch": 14.735725938009788, "grad_norm": 0.3413068950176239, "learning_rate": 9.830989150190071e-06, "loss": 0.225, "num_input_tokens_seen": 194927536, "step": 90330 }, { "epoch": 14.736541598694943, "grad_norm": 0.44019100069999695, "learning_rate": 9.828160316109339e-06, "loss": 0.1195, "num_input_tokens_seen": 194937456, "step": 90335 }, { "epoch": 14.737357259380097, "grad_norm": 0.17275379598140717, "learning_rate": 9.825331789505202e-06, "loss": 0.0372, "num_input_tokens_seen": 194948368, "step": 90340 }, { "epoch": 14.738172920065253, "grad_norm": 0.5335655808448792, "learning_rate": 9.82250357043495e-06, "loss": 0.0564, "num_input_tokens_seen": 194958768, "step": 90345 }, { "epoch": 14.738988580750409, "grad_norm": 1.7599035501480103, "learning_rate": 9.819675658955926e-06, "loss": 0.0726, "num_input_tokens_seen": 194969808, "step": 90350 }, { "epoch": 14.739804241435563, "grad_norm": 0.09513645619153976, "learning_rate": 9.816848055125433e-06, "loss": 0.2047, "num_input_tokens_seen": 194981424, "step": 90355 }, { "epoch": 14.740619902120718, "grad_norm": 0.32746464014053345, "learning_rate": 9.814020759000775e-06, "loss": 0.0996, "num_input_tokens_seen": 194991856, "step": 90360 }, { "epoch": 14.741435562805872, "grad_norm": 0.0685427263379097, "learning_rate": 9.811193770639248e-06, "loss": 0.0265, "num_input_tokens_seen": 195003408, "step": 90365 }, { "epoch": 14.742251223491028, "grad_norm": 0.024046029895544052, "learning_rate": 9.80836709009815e-06, "loss": 0.0833, "num_input_tokens_seen": 195014416, "step": 90370 }, { "epoch": 14.743066884176184, "grad_norm": 0.19308125972747803, "learning_rate": 9.805540717434766e-06, "loss": 0.0446, "num_input_tokens_seen": 195024528, "step": 90375 }, { "epoch": 14.743882544861338, "grad_norm": 1.750599980354309, "learning_rate": 9.802714652706374e-06, "loss": 0.0626, "num_input_tokens_seen": 195035248, "step": 90380 }, { "epoch": 14.744698205546493, "grad_norm": 1.27582848072052, "learning_rate": 9.79988889597025e-06, "loss": 0.0996, "num_input_tokens_seen": 195046640, "step": 90385 }, { "epoch": 14.745513866231647, "grad_norm": 0.050055306404829025, "learning_rate": 9.797063447283658e-06, "loss": 0.0119, "num_input_tokens_seen": 195058128, "step": 90390 }, { "epoch": 14.746329526916803, "grad_norm": 0.28824278712272644, "learning_rate": 9.794238306703862e-06, "loss": 0.1397, "num_input_tokens_seen": 195070640, "step": 90395 }, { "epoch": 14.747145187601957, "grad_norm": 0.038790322840213776, "learning_rate": 9.791413474288108e-06, "loss": 0.0366, "num_input_tokens_seen": 195080560, "step": 90400 }, { "epoch": 14.747960848287113, "grad_norm": 0.02781159244477749, "learning_rate": 9.788588950093672e-06, "loss": 0.0371, "num_input_tokens_seen": 195092464, "step": 90405 }, { "epoch": 14.748776508972268, "grad_norm": 0.1454310268163681, "learning_rate": 9.78576473417776e-06, "loss": 0.1234, "num_input_tokens_seen": 195103664, "step": 90410 }, { "epoch": 14.749592169657422, "grad_norm": 0.11967311054468155, "learning_rate": 9.782940826597642e-06, "loss": 0.0438, "num_input_tokens_seen": 195114352, "step": 90415 }, { "epoch": 14.750407830342578, "grad_norm": 1.4774469137191772, "learning_rate": 9.780117227410518e-06, "loss": 0.1934, "num_input_tokens_seen": 195125488, "step": 90420 }, { "epoch": 14.751223491027732, "grad_norm": 0.7326188087463379, "learning_rate": 9.77729393667364e-06, "loss": 0.1717, "num_input_tokens_seen": 195135312, "step": 90425 }, { "epoch": 14.752039151712887, "grad_norm": 0.10006565600633621, "learning_rate": 9.774470954444198e-06, "loss": 0.0715, "num_input_tokens_seen": 195145744, "step": 90430 }, { "epoch": 14.752854812398043, "grad_norm": 0.047861576080322266, "learning_rate": 9.771648280779432e-06, "loss": 0.0604, "num_input_tokens_seen": 195156368, "step": 90435 }, { "epoch": 14.753670473083197, "grad_norm": 0.47488871216773987, "learning_rate": 9.768825915736515e-06, "loss": 0.031, "num_input_tokens_seen": 195166960, "step": 90440 }, { "epoch": 14.754486133768353, "grad_norm": 1.8875157833099365, "learning_rate": 9.766003859372683e-06, "loss": 0.0488, "num_input_tokens_seen": 195178928, "step": 90445 }, { "epoch": 14.755301794453507, "grad_norm": 0.06108153238892555, "learning_rate": 9.763182111745087e-06, "loss": 0.0733, "num_input_tokens_seen": 195189776, "step": 90450 }, { "epoch": 14.756117455138662, "grad_norm": 0.054916515946388245, "learning_rate": 9.760360672910954e-06, "loss": 0.2233, "num_input_tokens_seen": 195200560, "step": 90455 }, { "epoch": 14.756933115823816, "grad_norm": 0.06595690548419952, "learning_rate": 9.757539542927427e-06, "loss": 0.1027, "num_input_tokens_seen": 195211632, "step": 90460 }, { "epoch": 14.757748776508972, "grad_norm": 0.3403244614601135, "learning_rate": 9.754718721851707e-06, "loss": 0.1397, "num_input_tokens_seen": 195222224, "step": 90465 }, { "epoch": 14.758564437194128, "grad_norm": 0.1290663331747055, "learning_rate": 9.751898209740953e-06, "loss": 0.0323, "num_input_tokens_seen": 195234064, "step": 90470 }, { "epoch": 14.759380097879282, "grad_norm": 1.6301302909851074, "learning_rate": 9.749078006652323e-06, "loss": 0.2348, "num_input_tokens_seen": 195243248, "step": 90475 }, { "epoch": 14.760195758564437, "grad_norm": 0.3985257148742676, "learning_rate": 9.746258112642975e-06, "loss": 0.0571, "num_input_tokens_seen": 195253328, "step": 90480 }, { "epoch": 14.761011419249591, "grad_norm": 0.1930883824825287, "learning_rate": 9.74343852777006e-06, "loss": 0.0212, "num_input_tokens_seen": 195263408, "step": 90485 }, { "epoch": 14.761827079934747, "grad_norm": 0.37195366621017456, "learning_rate": 9.740619252090715e-06, "loss": 0.0329, "num_input_tokens_seen": 195275056, "step": 90490 }, { "epoch": 14.762642740619903, "grad_norm": 0.04459868371486664, "learning_rate": 9.73780028566208e-06, "loss": 0.0103, "num_input_tokens_seen": 195285584, "step": 90495 }, { "epoch": 14.763458401305057, "grad_norm": 0.07875563204288483, "learning_rate": 9.734981628541282e-06, "loss": 0.0567, "num_input_tokens_seen": 195296656, "step": 90500 }, { "epoch": 14.764274061990212, "grad_norm": 1.333548903465271, "learning_rate": 9.732163280785447e-06, "loss": 0.1118, "num_input_tokens_seen": 195307216, "step": 90505 }, { "epoch": 14.765089722675366, "grad_norm": 0.10675936192274094, "learning_rate": 9.72934524245169e-06, "loss": 0.0139, "num_input_tokens_seen": 195317776, "step": 90510 }, { "epoch": 14.765905383360522, "grad_norm": 0.0694665014743805, "learning_rate": 9.726527513597128e-06, "loss": 0.0486, "num_input_tokens_seen": 195328688, "step": 90515 }, { "epoch": 14.766721044045678, "grad_norm": 0.22397080063819885, "learning_rate": 9.723710094278857e-06, "loss": 0.0404, "num_input_tokens_seen": 195339952, "step": 90520 }, { "epoch": 14.767536704730832, "grad_norm": 0.4384707510471344, "learning_rate": 9.720892984553984e-06, "loss": 0.0365, "num_input_tokens_seen": 195350960, "step": 90525 }, { "epoch": 14.768352365415987, "grad_norm": 0.16904985904693604, "learning_rate": 9.718076184479597e-06, "loss": 0.0487, "num_input_tokens_seen": 195361776, "step": 90530 }, { "epoch": 14.769168026101141, "grad_norm": 1.4972773790359497, "learning_rate": 9.71525969411278e-06, "loss": 0.3178, "num_input_tokens_seen": 195373456, "step": 90535 }, { "epoch": 14.769983686786297, "grad_norm": 0.052481457591056824, "learning_rate": 9.712443513510619e-06, "loss": 0.0457, "num_input_tokens_seen": 195384816, "step": 90540 }, { "epoch": 14.770799347471453, "grad_norm": 1.5459145307540894, "learning_rate": 9.709627642730174e-06, "loss": 0.0413, "num_input_tokens_seen": 195395632, "step": 90545 }, { "epoch": 14.771615008156607, "grad_norm": 0.8458924889564514, "learning_rate": 9.706812081828537e-06, "loss": 0.0469, "num_input_tokens_seen": 195407248, "step": 90550 }, { "epoch": 14.772430668841762, "grad_norm": 1.1330666542053223, "learning_rate": 9.703996830862739e-06, "loss": 0.1192, "num_input_tokens_seen": 195418896, "step": 90555 }, { "epoch": 14.773246329526916, "grad_norm": 0.0507214181125164, "learning_rate": 9.701181889889866e-06, "loss": 0.0195, "num_input_tokens_seen": 195429328, "step": 90560 }, { "epoch": 14.774061990212072, "grad_norm": 2.641493082046509, "learning_rate": 9.698367258966934e-06, "loss": 0.0593, "num_input_tokens_seen": 195440368, "step": 90565 }, { "epoch": 14.774877650897226, "grad_norm": 1.9056077003479004, "learning_rate": 9.695552938151007e-06, "loss": 0.0403, "num_input_tokens_seen": 195451216, "step": 90570 }, { "epoch": 14.775693311582382, "grad_norm": 0.017588026821613312, "learning_rate": 9.692738927499117e-06, "loss": 0.022, "num_input_tokens_seen": 195462640, "step": 90575 }, { "epoch": 14.776508972267537, "grad_norm": 1.8814773559570312, "learning_rate": 9.689925227068292e-06, "loss": 0.1781, "num_input_tokens_seen": 195473456, "step": 90580 }, { "epoch": 14.777324632952691, "grad_norm": 1.8979463577270508, "learning_rate": 9.687111836915553e-06, "loss": 0.2092, "num_input_tokens_seen": 195484112, "step": 90585 }, { "epoch": 14.778140293637847, "grad_norm": 0.798103392124176, "learning_rate": 9.684298757097917e-06, "loss": 0.1689, "num_input_tokens_seen": 195493776, "step": 90590 }, { "epoch": 14.778955954323001, "grad_norm": 0.44200262427330017, "learning_rate": 9.681485987672398e-06, "loss": 0.0306, "num_input_tokens_seen": 195504240, "step": 90595 }, { "epoch": 14.779771615008157, "grad_norm": 1.279030203819275, "learning_rate": 9.678673528695998e-06, "loss": 0.234, "num_input_tokens_seen": 195514416, "step": 90600 }, { "epoch": 14.780587275693312, "grad_norm": 0.11510976403951645, "learning_rate": 9.675861380225714e-06, "loss": 0.1205, "num_input_tokens_seen": 195524400, "step": 90605 }, { "epoch": 14.781402936378466, "grad_norm": 1.8220185041427612, "learning_rate": 9.673049542318541e-06, "loss": 0.0887, "num_input_tokens_seen": 195535280, "step": 90610 }, { "epoch": 14.782218597063622, "grad_norm": 2.9797017574310303, "learning_rate": 9.670238015031464e-06, "loss": 0.1519, "num_input_tokens_seen": 195546832, "step": 90615 }, { "epoch": 14.783034257748776, "grad_norm": 2.4711623191833496, "learning_rate": 9.667426798421458e-06, "loss": 0.2687, "num_input_tokens_seen": 195556688, "step": 90620 }, { "epoch": 14.783849918433932, "grad_norm": 1.036359190940857, "learning_rate": 9.664615892545498e-06, "loss": 0.1031, "num_input_tokens_seen": 195566000, "step": 90625 }, { "epoch": 14.784665579119086, "grad_norm": 0.3081223666667938, "learning_rate": 9.661805297460554e-06, "loss": 0.0618, "num_input_tokens_seen": 195576208, "step": 90630 }, { "epoch": 14.785481239804241, "grad_norm": 1.4671319723129272, "learning_rate": 9.658995013223582e-06, "loss": 0.239, "num_input_tokens_seen": 195587344, "step": 90635 }, { "epoch": 14.786296900489397, "grad_norm": 0.4516872465610504, "learning_rate": 9.656185039891538e-06, "loss": 0.0558, "num_input_tokens_seen": 195597904, "step": 90640 }, { "epoch": 14.78711256117455, "grad_norm": 1.147781491279602, "learning_rate": 9.65337537752137e-06, "loss": 0.0378, "num_input_tokens_seen": 195609168, "step": 90645 }, { "epoch": 14.787928221859707, "grad_norm": 0.052979130297899246, "learning_rate": 9.650566026170015e-06, "loss": 0.3075, "num_input_tokens_seen": 195619088, "step": 90650 }, { "epoch": 14.78874388254486, "grad_norm": 2.2944767475128174, "learning_rate": 9.647756985894416e-06, "loss": 0.141, "num_input_tokens_seen": 195630512, "step": 90655 }, { "epoch": 14.789559543230016, "grad_norm": 0.11997149139642715, "learning_rate": 9.644948256751496e-06, "loss": 0.039, "num_input_tokens_seen": 195642512, "step": 90660 }, { "epoch": 14.790375203915172, "grad_norm": 0.08587153255939484, "learning_rate": 9.642139838798178e-06, "loss": 0.0832, "num_input_tokens_seen": 195653936, "step": 90665 }, { "epoch": 14.791190864600326, "grad_norm": 0.16695889830589294, "learning_rate": 9.63933173209138e-06, "loss": 0.0312, "num_input_tokens_seen": 195664784, "step": 90670 }, { "epoch": 14.792006525285482, "grad_norm": 0.048122189939022064, "learning_rate": 9.636523936688003e-06, "loss": 0.0719, "num_input_tokens_seen": 195675888, "step": 90675 }, { "epoch": 14.792822185970635, "grad_norm": 0.06447569280862808, "learning_rate": 9.633716452644972e-06, "loss": 0.1338, "num_input_tokens_seen": 195686768, "step": 90680 }, { "epoch": 14.793637846655791, "grad_norm": 2.061121940612793, "learning_rate": 9.630909280019158e-06, "loss": 0.0627, "num_input_tokens_seen": 195698544, "step": 90685 }, { "epoch": 14.794453507340947, "grad_norm": 0.05911646783351898, "learning_rate": 9.628102418867468e-06, "loss": 0.0348, "num_input_tokens_seen": 195707408, "step": 90690 }, { "epoch": 14.7952691680261, "grad_norm": 0.7511160373687744, "learning_rate": 9.625295869246787e-06, "loss": 0.1255, "num_input_tokens_seen": 195718352, "step": 90695 }, { "epoch": 14.796084828711257, "grad_norm": 1.7858800888061523, "learning_rate": 9.622489631213988e-06, "loss": 0.2156, "num_input_tokens_seen": 195729680, "step": 90700 }, { "epoch": 14.79690048939641, "grad_norm": 0.031874846667051315, "learning_rate": 9.619683704825947e-06, "loss": 0.1031, "num_input_tokens_seen": 195739344, "step": 90705 }, { "epoch": 14.797716150081566, "grad_norm": 1.2819037437438965, "learning_rate": 9.616878090139523e-06, "loss": 0.0697, "num_input_tokens_seen": 195749744, "step": 90710 }, { "epoch": 14.798531810766722, "grad_norm": 0.07047276198863983, "learning_rate": 9.61407278721158e-06, "loss": 0.1125, "num_input_tokens_seen": 195760720, "step": 90715 }, { "epoch": 14.799347471451876, "grad_norm": 1.0750287771224976, "learning_rate": 9.611267796098971e-06, "loss": 0.0708, "num_input_tokens_seen": 195772176, "step": 90720 }, { "epoch": 14.800163132137031, "grad_norm": 0.09714877605438232, "learning_rate": 9.608463116858542e-06, "loss": 0.0098, "num_input_tokens_seen": 195783568, "step": 90725 }, { "epoch": 14.800978792822185, "grad_norm": 0.711365818977356, "learning_rate": 9.605658749547136e-06, "loss": 0.0246, "num_input_tokens_seen": 195794896, "step": 90730 }, { "epoch": 14.801794453507341, "grad_norm": 1.6934120655059814, "learning_rate": 9.602854694221583e-06, "loss": 0.0995, "num_input_tokens_seen": 195804112, "step": 90735 }, { "epoch": 14.802610114192497, "grad_norm": 0.0428379625082016, "learning_rate": 9.600050950938714e-06, "loss": 0.0941, "num_input_tokens_seen": 195815056, "step": 90740 }, { "epoch": 14.80342577487765, "grad_norm": 0.49112898111343384, "learning_rate": 9.597247519755344e-06, "loss": 0.031, "num_input_tokens_seen": 195826032, "step": 90745 }, { "epoch": 14.804241435562806, "grad_norm": 0.15584935247898102, "learning_rate": 9.594444400728297e-06, "loss": 0.211, "num_input_tokens_seen": 195836528, "step": 90750 }, { "epoch": 14.80505709624796, "grad_norm": 0.3011326491832733, "learning_rate": 9.591641593914375e-06, "loss": 0.0879, "num_input_tokens_seen": 195845808, "step": 90755 }, { "epoch": 14.805872756933116, "grad_norm": 1.6843446493148804, "learning_rate": 9.588839099370384e-06, "loss": 0.2, "num_input_tokens_seen": 195856976, "step": 90760 }, { "epoch": 14.80668841761827, "grad_norm": 0.7618675827980042, "learning_rate": 9.58603691715312e-06, "loss": 0.0164, "num_input_tokens_seen": 195866704, "step": 90765 }, { "epoch": 14.807504078303426, "grad_norm": 3.6908607482910156, "learning_rate": 9.58323504731937e-06, "loss": 0.2831, "num_input_tokens_seen": 195878544, "step": 90770 }, { "epoch": 14.808319738988581, "grad_norm": 0.09741906821727753, "learning_rate": 9.580433489925923e-06, "loss": 0.0354, "num_input_tokens_seen": 195890576, "step": 90775 }, { "epoch": 14.809135399673735, "grad_norm": 0.6019733548164368, "learning_rate": 9.577632245029539e-06, "loss": 0.0349, "num_input_tokens_seen": 195901008, "step": 90780 }, { "epoch": 14.809951060358891, "grad_norm": 1.4235790967941284, "learning_rate": 9.574831312687022e-06, "loss": 0.2443, "num_input_tokens_seen": 195912304, "step": 90785 }, { "epoch": 14.810766721044045, "grad_norm": 0.089118093252182, "learning_rate": 9.572030692955097e-06, "loss": 0.0157, "num_input_tokens_seen": 195923568, "step": 90790 }, { "epoch": 14.8115823817292, "grad_norm": 0.1370527595281601, "learning_rate": 9.569230385890562e-06, "loss": 0.0414, "num_input_tokens_seen": 195933776, "step": 90795 }, { "epoch": 14.812398042414356, "grad_norm": 1.466810941696167, "learning_rate": 9.56643039155013e-06, "loss": 0.1127, "num_input_tokens_seen": 195943632, "step": 90800 }, { "epoch": 14.81321370309951, "grad_norm": 0.20940786600112915, "learning_rate": 9.563630709990584e-06, "loss": 0.0395, "num_input_tokens_seen": 195954672, "step": 90805 }, { "epoch": 14.814029363784666, "grad_norm": 1.2898504734039307, "learning_rate": 9.560831341268626e-06, "loss": 0.4234, "num_input_tokens_seen": 195965136, "step": 90810 }, { "epoch": 14.81484502446982, "grad_norm": 0.02613084949553013, "learning_rate": 9.558032285441021e-06, "loss": 0.024, "num_input_tokens_seen": 195975152, "step": 90815 }, { "epoch": 14.815660685154976, "grad_norm": 0.10206091403961182, "learning_rate": 9.555233542564467e-06, "loss": 0.0185, "num_input_tokens_seen": 195986160, "step": 90820 }, { "epoch": 14.81647634584013, "grad_norm": 0.040584370493888855, "learning_rate": 9.552435112695707e-06, "loss": 0.1488, "num_input_tokens_seen": 195996592, "step": 90825 }, { "epoch": 14.817292006525285, "grad_norm": 1.8767298460006714, "learning_rate": 9.549636995891445e-06, "loss": 0.1985, "num_input_tokens_seen": 196007280, "step": 90830 }, { "epoch": 14.818107667210441, "grad_norm": 0.029642615467309952, "learning_rate": 9.54683919220839e-06, "loss": 0.0662, "num_input_tokens_seen": 196018704, "step": 90835 }, { "epoch": 14.818923327895595, "grad_norm": 2.1285927295684814, "learning_rate": 9.544041701703243e-06, "loss": 0.0702, "num_input_tokens_seen": 196029456, "step": 90840 }, { "epoch": 14.81973898858075, "grad_norm": 0.07724615186452866, "learning_rate": 9.541244524432697e-06, "loss": 0.0066, "num_input_tokens_seen": 196039440, "step": 90845 }, { "epoch": 14.820554649265905, "grad_norm": 0.06292799115180969, "learning_rate": 9.538447660453443e-06, "loss": 0.2519, "num_input_tokens_seen": 196049840, "step": 90850 }, { "epoch": 14.82137030995106, "grad_norm": 0.06038770079612732, "learning_rate": 9.53565110982216e-06, "loss": 0.0154, "num_input_tokens_seen": 196060272, "step": 90855 }, { "epoch": 14.822185970636216, "grad_norm": 0.09815896302461624, "learning_rate": 9.532854872595526e-06, "loss": 0.0768, "num_input_tokens_seen": 196072656, "step": 90860 }, { "epoch": 14.82300163132137, "grad_norm": 0.07031294703483582, "learning_rate": 9.53005894883021e-06, "loss": 0.0866, "num_input_tokens_seen": 196081904, "step": 90865 }, { "epoch": 14.823817292006526, "grad_norm": 2.1530539989471436, "learning_rate": 9.527263338582872e-06, "loss": 0.1478, "num_input_tokens_seen": 196092432, "step": 90870 }, { "epoch": 14.82463295269168, "grad_norm": 2.000904083251953, "learning_rate": 9.524468041910173e-06, "loss": 0.1087, "num_input_tokens_seen": 196103920, "step": 90875 }, { "epoch": 14.825448613376835, "grad_norm": 1.8186016082763672, "learning_rate": 9.521673058868763e-06, "loss": 0.0951, "num_input_tokens_seen": 196114896, "step": 90880 }, { "epoch": 14.826264274061991, "grad_norm": 0.038216207176446915, "learning_rate": 9.518878389515273e-06, "loss": 0.1471, "num_input_tokens_seen": 196125744, "step": 90885 }, { "epoch": 14.827079934747145, "grad_norm": 1.0462828874588013, "learning_rate": 9.51608403390637e-06, "loss": 0.0513, "num_input_tokens_seen": 196133968, "step": 90890 }, { "epoch": 14.8278955954323, "grad_norm": 0.03650662302970886, "learning_rate": 9.513289992098648e-06, "loss": 0.062, "num_input_tokens_seen": 196145040, "step": 90895 }, { "epoch": 14.828711256117455, "grad_norm": 0.8526515364646912, "learning_rate": 9.510496264148768e-06, "loss": 0.0956, "num_input_tokens_seen": 196156112, "step": 90900 }, { "epoch": 14.82952691680261, "grad_norm": 0.07769694179296494, "learning_rate": 9.507702850113314e-06, "loss": 0.1394, "num_input_tokens_seen": 196168048, "step": 90905 }, { "epoch": 14.830342577487766, "grad_norm": 0.02803598716855049, "learning_rate": 9.504909750048934e-06, "loss": 0.0664, "num_input_tokens_seen": 196179568, "step": 90910 }, { "epoch": 14.83115823817292, "grad_norm": 1.3999135494232178, "learning_rate": 9.502116964012197e-06, "loss": 0.1124, "num_input_tokens_seen": 196188848, "step": 90915 }, { "epoch": 14.831973898858076, "grad_norm": 1.75883948802948, "learning_rate": 9.499324492059736e-06, "loss": 0.1685, "num_input_tokens_seen": 196198096, "step": 90920 }, { "epoch": 14.83278955954323, "grad_norm": 0.32830914855003357, "learning_rate": 9.496532334248112e-06, "loss": 0.0958, "num_input_tokens_seen": 196208848, "step": 90925 }, { "epoch": 14.833605220228385, "grad_norm": 0.18586182594299316, "learning_rate": 9.493740490633946e-06, "loss": 0.0227, "num_input_tokens_seen": 196218960, "step": 90930 }, { "epoch": 14.83442088091354, "grad_norm": 1.7753913402557373, "learning_rate": 9.490948961273782e-06, "loss": 0.1703, "num_input_tokens_seen": 196230288, "step": 90935 }, { "epoch": 14.835236541598695, "grad_norm": 2.316033124923706, "learning_rate": 9.488157746224226e-06, "loss": 0.0901, "num_input_tokens_seen": 196239888, "step": 90940 }, { "epoch": 14.83605220228385, "grad_norm": 1.737522006034851, "learning_rate": 9.485366845541818e-06, "loss": 0.247, "num_input_tokens_seen": 196251024, "step": 90945 }, { "epoch": 14.836867862969005, "grad_norm": 0.7994163036346436, "learning_rate": 9.482576259283149e-06, "loss": 0.1047, "num_input_tokens_seen": 196260528, "step": 90950 }, { "epoch": 14.83768352365416, "grad_norm": 0.9630771279335022, "learning_rate": 9.479785987504741e-06, "loss": 0.0612, "num_input_tokens_seen": 196270800, "step": 90955 }, { "epoch": 14.838499184339314, "grad_norm": 1.523050308227539, "learning_rate": 9.476996030263174e-06, "loss": 0.1705, "num_input_tokens_seen": 196280816, "step": 90960 }, { "epoch": 14.83931484502447, "grad_norm": 0.21847763657569885, "learning_rate": 9.47420638761496e-06, "loss": 0.134, "num_input_tokens_seen": 196291248, "step": 90965 }, { "epoch": 14.840130505709626, "grad_norm": 2.050658702850342, "learning_rate": 9.471417059616656e-06, "loss": 0.1515, "num_input_tokens_seen": 196302160, "step": 90970 }, { "epoch": 14.84094616639478, "grad_norm": 0.0430014543235302, "learning_rate": 9.468628046324785e-06, "loss": 0.0105, "num_input_tokens_seen": 196312304, "step": 90975 }, { "epoch": 14.841761827079935, "grad_norm": 0.042230408638715744, "learning_rate": 9.465839347795872e-06, "loss": 0.0549, "num_input_tokens_seen": 196324048, "step": 90980 }, { "epoch": 14.84257748776509, "grad_norm": 0.2538880705833435, "learning_rate": 9.463050964086428e-06, "loss": 0.0205, "num_input_tokens_seen": 196334384, "step": 90985 }, { "epoch": 14.843393148450245, "grad_norm": 0.07020044326782227, "learning_rate": 9.460262895252967e-06, "loss": 0.2032, "num_input_tokens_seen": 196345808, "step": 90990 }, { "epoch": 14.844208809135399, "grad_norm": 0.23827135562896729, "learning_rate": 9.457475141351993e-06, "loss": 0.1029, "num_input_tokens_seen": 196356464, "step": 90995 }, { "epoch": 14.845024469820554, "grad_norm": 0.7101358771324158, "learning_rate": 9.454687702439999e-06, "loss": 0.1372, "num_input_tokens_seen": 196368112, "step": 91000 }, { "epoch": 14.84584013050571, "grad_norm": 0.21770909428596497, "learning_rate": 9.451900578573483e-06, "loss": 0.2467, "num_input_tokens_seen": 196378352, "step": 91005 }, { "epoch": 14.846655791190864, "grad_norm": 0.7384656071662903, "learning_rate": 9.449113769808926e-06, "loss": 0.1576, "num_input_tokens_seen": 196389040, "step": 91010 }, { "epoch": 14.84747145187602, "grad_norm": 0.04143141582608223, "learning_rate": 9.446327276202804e-06, "loss": 0.2134, "num_input_tokens_seen": 196400144, "step": 91015 }, { "epoch": 14.848287112561174, "grad_norm": 1.1583516597747803, "learning_rate": 9.443541097811584e-06, "loss": 0.1056, "num_input_tokens_seen": 196412176, "step": 91020 }, { "epoch": 14.84910277324633, "grad_norm": 1.75734543800354, "learning_rate": 9.440755234691754e-06, "loss": 0.1265, "num_input_tokens_seen": 196423984, "step": 91025 }, { "epoch": 14.849918433931485, "grad_norm": 0.8085241317749023, "learning_rate": 9.437969686899743e-06, "loss": 0.1334, "num_input_tokens_seen": 196433968, "step": 91030 }, { "epoch": 14.850734094616639, "grad_norm": 1.8538414239883423, "learning_rate": 9.435184454492033e-06, "loss": 0.1352, "num_input_tokens_seen": 196446096, "step": 91035 }, { "epoch": 14.851549755301795, "grad_norm": 1.300907850265503, "learning_rate": 9.432399537525038e-06, "loss": 0.1799, "num_input_tokens_seen": 196456560, "step": 91040 }, { "epoch": 14.852365415986949, "grad_norm": 0.06854525953531265, "learning_rate": 9.429614936055234e-06, "loss": 0.1974, "num_input_tokens_seen": 196466544, "step": 91045 }, { "epoch": 14.853181076672104, "grad_norm": 2.0389208793640137, "learning_rate": 9.426830650139019e-06, "loss": 0.186, "num_input_tokens_seen": 196477360, "step": 91050 }, { "epoch": 14.85399673735726, "grad_norm": 1.1279466152191162, "learning_rate": 9.42404667983285e-06, "loss": 0.0357, "num_input_tokens_seen": 196488944, "step": 91055 }, { "epoch": 14.854812398042414, "grad_norm": 3.1685543060302734, "learning_rate": 9.42126302519312e-06, "loss": 0.1886, "num_input_tokens_seen": 196499056, "step": 91060 }, { "epoch": 14.85562805872757, "grad_norm": 0.05336257815361023, "learning_rate": 9.418479686276277e-06, "loss": 0.0079, "num_input_tokens_seen": 196509680, "step": 91065 }, { "epoch": 14.856443719412724, "grad_norm": 0.0872844010591507, "learning_rate": 9.41569666313869e-06, "loss": 0.1003, "num_input_tokens_seen": 196519856, "step": 91070 }, { "epoch": 14.85725938009788, "grad_norm": 0.26742056012153625, "learning_rate": 9.412913955836797e-06, "loss": 0.0361, "num_input_tokens_seen": 196531088, "step": 91075 }, { "epoch": 14.858075040783035, "grad_norm": 0.05180556699633598, "learning_rate": 9.41013156442696e-06, "loss": 0.0155, "num_input_tokens_seen": 196542384, "step": 91080 }, { "epoch": 14.858890701468189, "grad_norm": 0.19862040877342224, "learning_rate": 9.407349488965603e-06, "loss": 0.0519, "num_input_tokens_seen": 196553648, "step": 91085 }, { "epoch": 14.859706362153345, "grad_norm": 0.026572177186608315, "learning_rate": 9.40456772950907e-06, "loss": 0.0429, "num_input_tokens_seen": 196564624, "step": 91090 }, { "epoch": 14.860522022838499, "grad_norm": 0.3021896481513977, "learning_rate": 9.401786286113773e-06, "loss": 0.0326, "num_input_tokens_seen": 196575984, "step": 91095 }, { "epoch": 14.861337683523654, "grad_norm": 0.40558820962905884, "learning_rate": 9.39900515883605e-06, "loss": 0.0972, "num_input_tokens_seen": 196586352, "step": 91100 }, { "epoch": 14.86215334420881, "grad_norm": 0.12717744708061218, "learning_rate": 9.396224347732286e-06, "loss": 0.0222, "num_input_tokens_seen": 196598224, "step": 91105 }, { "epoch": 14.862969004893964, "grad_norm": 0.1004662960767746, "learning_rate": 9.39344385285883e-06, "loss": 0.0265, "num_input_tokens_seen": 196609840, "step": 91110 }, { "epoch": 14.86378466557912, "grad_norm": 0.22808204591274261, "learning_rate": 9.390663674272033e-06, "loss": 0.1281, "num_input_tokens_seen": 196620176, "step": 91115 }, { "epoch": 14.864600326264274, "grad_norm": 0.1578352004289627, "learning_rate": 9.387883812028237e-06, "loss": 0.0108, "num_input_tokens_seen": 196631952, "step": 91120 }, { "epoch": 14.86541598694943, "grad_norm": 3.3937976360321045, "learning_rate": 9.38510426618378e-06, "loss": 0.1537, "num_input_tokens_seen": 196642704, "step": 91125 }, { "epoch": 14.866231647634583, "grad_norm": 0.48935991525650024, "learning_rate": 9.382325036794998e-06, "loss": 0.0634, "num_input_tokens_seen": 196654544, "step": 91130 }, { "epoch": 14.867047308319739, "grad_norm": 0.7913491129875183, "learning_rate": 9.37954612391821e-06, "loss": 0.0494, "num_input_tokens_seen": 196664656, "step": 91135 }, { "epoch": 14.867862969004895, "grad_norm": 0.07452775537967682, "learning_rate": 9.376767527609734e-06, "loss": 0.1204, "num_input_tokens_seen": 196676400, "step": 91140 }, { "epoch": 14.868678629690049, "grad_norm": 0.3194156289100647, "learning_rate": 9.373989247925885e-06, "loss": 0.0803, "num_input_tokens_seen": 196687600, "step": 91145 }, { "epoch": 14.869494290375204, "grad_norm": 2.7094855308532715, "learning_rate": 9.371211284922967e-06, "loss": 0.1991, "num_input_tokens_seen": 196697424, "step": 91150 }, { "epoch": 14.870309951060358, "grad_norm": 0.63172447681427, "learning_rate": 9.368433638657279e-06, "loss": 0.0153, "num_input_tokens_seen": 196708496, "step": 91155 }, { "epoch": 14.871125611745514, "grad_norm": 2.1963744163513184, "learning_rate": 9.365656309185114e-06, "loss": 0.1103, "num_input_tokens_seen": 196717872, "step": 91160 }, { "epoch": 14.87194127243067, "grad_norm": 1.609771490097046, "learning_rate": 9.362879296562746e-06, "loss": 0.1651, "num_input_tokens_seen": 196728944, "step": 91165 }, { "epoch": 14.872756933115824, "grad_norm": 3.3534367084503174, "learning_rate": 9.360102600846488e-06, "loss": 0.372, "num_input_tokens_seen": 196739152, "step": 91170 }, { "epoch": 14.87357259380098, "grad_norm": 0.935451328754425, "learning_rate": 9.35732622209257e-06, "loss": 0.0344, "num_input_tokens_seen": 196749712, "step": 91175 }, { "epoch": 14.874388254486133, "grad_norm": 0.5112099051475525, "learning_rate": 9.354550160357298e-06, "loss": 0.1886, "num_input_tokens_seen": 196760560, "step": 91180 }, { "epoch": 14.875203915171289, "grad_norm": 1.0398277044296265, "learning_rate": 9.351774415696899e-06, "loss": 0.2991, "num_input_tokens_seen": 196770448, "step": 91185 }, { "epoch": 14.876019575856443, "grad_norm": 0.7994265556335449, "learning_rate": 9.34899898816766e-06, "loss": 0.1191, "num_input_tokens_seen": 196781968, "step": 91190 }, { "epoch": 14.876835236541599, "grad_norm": 0.03851635754108429, "learning_rate": 9.346223877825792e-06, "loss": 0.026, "num_input_tokens_seen": 196792848, "step": 91195 }, { "epoch": 14.877650897226754, "grad_norm": 0.39432066679000854, "learning_rate": 9.343449084727574e-06, "loss": 0.0179, "num_input_tokens_seen": 196804464, "step": 91200 }, { "epoch": 14.878466557911908, "grad_norm": 0.5261027812957764, "learning_rate": 9.340674608929203e-06, "loss": 0.1518, "num_input_tokens_seen": 196814512, "step": 91205 }, { "epoch": 14.879282218597064, "grad_norm": 1.7162965536117554, "learning_rate": 9.337900450486939e-06, "loss": 0.1161, "num_input_tokens_seen": 196823728, "step": 91210 }, { "epoch": 14.880097879282218, "grad_norm": 2.3480234146118164, "learning_rate": 9.335126609456988e-06, "loss": 0.0653, "num_input_tokens_seen": 196833584, "step": 91215 }, { "epoch": 14.880913539967374, "grad_norm": 1.6221131086349487, "learning_rate": 9.332353085895567e-06, "loss": 0.1464, "num_input_tokens_seen": 196844944, "step": 91220 }, { "epoch": 14.88172920065253, "grad_norm": 0.16607049107551575, "learning_rate": 9.329579879858891e-06, "loss": 0.188, "num_input_tokens_seen": 196856464, "step": 91225 }, { "epoch": 14.882544861337683, "grad_norm": 0.17424069344997406, "learning_rate": 9.326806991403155e-06, "loss": 0.0538, "num_input_tokens_seen": 196866800, "step": 91230 }, { "epoch": 14.883360522022839, "grad_norm": 0.04626993089914322, "learning_rate": 9.32403442058456e-06, "loss": 0.0166, "num_input_tokens_seen": 196878224, "step": 91235 }, { "epoch": 14.884176182707993, "grad_norm": 1.1039379835128784, "learning_rate": 9.32126216745929e-06, "loss": 0.0638, "num_input_tokens_seen": 196889136, "step": 91240 }, { "epoch": 14.884991843393149, "grad_norm": 0.30927833914756775, "learning_rate": 9.318490232083538e-06, "loss": 0.1453, "num_input_tokens_seen": 196900112, "step": 91245 }, { "epoch": 14.885807504078304, "grad_norm": 1.9032191038131714, "learning_rate": 9.315718614513471e-06, "loss": 0.1654, "num_input_tokens_seen": 196911856, "step": 91250 }, { "epoch": 14.886623164763458, "grad_norm": 1.2203922271728516, "learning_rate": 9.312947314805265e-06, "loss": 0.0582, "num_input_tokens_seen": 196922896, "step": 91255 }, { "epoch": 14.887438825448614, "grad_norm": 0.453462153673172, "learning_rate": 9.310176333015081e-06, "loss": 0.0336, "num_input_tokens_seen": 196934032, "step": 91260 }, { "epoch": 14.888254486133768, "grad_norm": 0.5488674640655518, "learning_rate": 9.307405669199077e-06, "loss": 0.2303, "num_input_tokens_seen": 196944688, "step": 91265 }, { "epoch": 14.889070146818923, "grad_norm": 1.0578663349151611, "learning_rate": 9.304635323413405e-06, "loss": 0.1597, "num_input_tokens_seen": 196954416, "step": 91270 }, { "epoch": 14.88988580750408, "grad_norm": 0.05722516030073166, "learning_rate": 9.30186529571421e-06, "loss": 0.1737, "num_input_tokens_seen": 196966224, "step": 91275 }, { "epoch": 14.890701468189233, "grad_norm": 0.4502408802509308, "learning_rate": 9.299095586157628e-06, "loss": 0.0941, "num_input_tokens_seen": 196976176, "step": 91280 }, { "epoch": 14.891517128874389, "grad_norm": 2.9197375774383545, "learning_rate": 9.296326194799792e-06, "loss": 0.3129, "num_input_tokens_seen": 196987216, "step": 91285 }, { "epoch": 14.892332789559543, "grad_norm": 0.18514996767044067, "learning_rate": 9.293557121696828e-06, "loss": 0.0885, "num_input_tokens_seen": 196997552, "step": 91290 }, { "epoch": 14.893148450244698, "grad_norm": 0.7949892282485962, "learning_rate": 9.290788366904854e-06, "loss": 0.0214, "num_input_tokens_seen": 197007984, "step": 91295 }, { "epoch": 14.893964110929852, "grad_norm": 0.7051142454147339, "learning_rate": 9.288019930479982e-06, "loss": 0.0458, "num_input_tokens_seen": 197018960, "step": 91300 }, { "epoch": 14.894779771615008, "grad_norm": 0.8549915552139282, "learning_rate": 9.285251812478318e-06, "loss": 0.1578, "num_input_tokens_seen": 197030544, "step": 91305 }, { "epoch": 14.895595432300164, "grad_norm": 1.814203143119812, "learning_rate": 9.282484012955953e-06, "loss": 0.111, "num_input_tokens_seen": 197040240, "step": 91310 }, { "epoch": 14.896411092985318, "grad_norm": 1.7869681119918823, "learning_rate": 9.279716531968999e-06, "loss": 0.1739, "num_input_tokens_seen": 197051152, "step": 91315 }, { "epoch": 14.897226753670473, "grad_norm": 1.741600751876831, "learning_rate": 9.27694936957353e-06, "loss": 0.0984, "num_input_tokens_seen": 197062384, "step": 91320 }, { "epoch": 14.898042414355627, "grad_norm": 0.19729550182819366, "learning_rate": 9.274182525825629e-06, "loss": 0.0359, "num_input_tokens_seen": 197073424, "step": 91325 }, { "epoch": 14.898858075040783, "grad_norm": 0.3704219162464142, "learning_rate": 9.271416000781365e-06, "loss": 0.0753, "num_input_tokens_seen": 197085136, "step": 91330 }, { "epoch": 14.899673735725939, "grad_norm": 0.4011433720588684, "learning_rate": 9.268649794496815e-06, "loss": 0.2314, "num_input_tokens_seen": 197096336, "step": 91335 }, { "epoch": 14.900489396411093, "grad_norm": 0.043291520327329636, "learning_rate": 9.26588390702803e-06, "loss": 0.3312, "num_input_tokens_seen": 197107568, "step": 91340 }, { "epoch": 14.901305057096248, "grad_norm": 2.0600695610046387, "learning_rate": 9.263118338431067e-06, "loss": 0.0622, "num_input_tokens_seen": 197118672, "step": 91345 }, { "epoch": 14.902120717781402, "grad_norm": 1.054983377456665, "learning_rate": 9.260353088761979e-06, "loss": 0.0545, "num_input_tokens_seen": 197129168, "step": 91350 }, { "epoch": 14.902936378466558, "grad_norm": 0.42627963423728943, "learning_rate": 9.257588158076798e-06, "loss": 0.0176, "num_input_tokens_seen": 197138992, "step": 91355 }, { "epoch": 14.903752039151712, "grad_norm": 0.3644421100616455, "learning_rate": 9.254823546431565e-06, "loss": 0.0152, "num_input_tokens_seen": 197149904, "step": 91360 }, { "epoch": 14.904567699836868, "grad_norm": 0.08440376818180084, "learning_rate": 9.252059253882309e-06, "loss": 0.0291, "num_input_tokens_seen": 197160144, "step": 91365 }, { "epoch": 14.905383360522023, "grad_norm": 0.4518904685974121, "learning_rate": 9.249295280485048e-06, "loss": 0.0277, "num_input_tokens_seen": 197171280, "step": 91370 }, { "epoch": 14.906199021207177, "grad_norm": 0.2786153554916382, "learning_rate": 9.246531626295799e-06, "loss": 0.216, "num_input_tokens_seen": 197182256, "step": 91375 }, { "epoch": 14.907014681892333, "grad_norm": 0.22879859805107117, "learning_rate": 9.243768291370572e-06, "loss": 0.0188, "num_input_tokens_seen": 197192784, "step": 91380 }, { "epoch": 14.907830342577487, "grad_norm": 1.3565465211868286, "learning_rate": 9.241005275765368e-06, "loss": 0.0484, "num_input_tokens_seen": 197201488, "step": 91385 }, { "epoch": 14.908646003262643, "grad_norm": 0.14406217634677887, "learning_rate": 9.238242579536186e-06, "loss": 0.0944, "num_input_tokens_seen": 197211984, "step": 91390 }, { "epoch": 14.909461663947798, "grad_norm": 0.6544556617736816, "learning_rate": 9.235480202739012e-06, "loss": 0.0247, "num_input_tokens_seen": 197223536, "step": 91395 }, { "epoch": 14.910277324632952, "grad_norm": 0.13121776282787323, "learning_rate": 9.232718145429831e-06, "loss": 0.0304, "num_input_tokens_seen": 197234768, "step": 91400 }, { "epoch": 14.911092985318108, "grad_norm": 2.0844600200653076, "learning_rate": 9.229956407664617e-06, "loss": 0.1352, "num_input_tokens_seen": 197246256, "step": 91405 }, { "epoch": 14.911908646003262, "grad_norm": 3.7149760723114014, "learning_rate": 9.227194989499344e-06, "loss": 0.0988, "num_input_tokens_seen": 197257648, "step": 91410 }, { "epoch": 14.912724306688418, "grad_norm": 0.07044438272714615, "learning_rate": 9.224433890989972e-06, "loss": 0.115, "num_input_tokens_seen": 197267888, "step": 91415 }, { "epoch": 14.913539967373573, "grad_norm": 0.056529805064201355, "learning_rate": 9.221673112192453e-06, "loss": 0.1506, "num_input_tokens_seen": 197278352, "step": 91420 }, { "epoch": 14.914355628058727, "grad_norm": 0.02235196903347969, "learning_rate": 9.218912653162762e-06, "loss": 0.1992, "num_input_tokens_seen": 197288528, "step": 91425 }, { "epoch": 14.915171288743883, "grad_norm": 0.13518118858337402, "learning_rate": 9.216152513956808e-06, "loss": 0.0907, "num_input_tokens_seen": 197300080, "step": 91430 }, { "epoch": 14.915986949429037, "grad_norm": 0.04086988419294357, "learning_rate": 9.213392694630563e-06, "loss": 0.135, "num_input_tokens_seen": 197311120, "step": 91435 }, { "epoch": 14.916802610114193, "grad_norm": 0.11773006618022919, "learning_rate": 9.210633195239924e-06, "loss": 0.0509, "num_input_tokens_seen": 197321520, "step": 91440 }, { "epoch": 14.917618270799348, "grad_norm": 0.023200327530503273, "learning_rate": 9.207874015840854e-06, "loss": 0.0554, "num_input_tokens_seen": 197332528, "step": 91445 }, { "epoch": 14.918433931484502, "grad_norm": 0.1751655638217926, "learning_rate": 9.20511515648923e-06, "loss": 0.2021, "num_input_tokens_seen": 197343248, "step": 91450 }, { "epoch": 14.919249592169658, "grad_norm": 0.08185034245252609, "learning_rate": 9.202356617240996e-06, "loss": 0.0399, "num_input_tokens_seen": 197354832, "step": 91455 }, { "epoch": 14.920065252854812, "grad_norm": 0.9318708777427673, "learning_rate": 9.199598398152044e-06, "loss": 0.0509, "num_input_tokens_seen": 197365200, "step": 91460 }, { "epoch": 14.920880913539968, "grad_norm": 1.129384994506836, "learning_rate": 9.196840499278276e-06, "loss": 0.066, "num_input_tokens_seen": 197375440, "step": 91465 }, { "epoch": 14.921696574225122, "grad_norm": 1.0218168497085571, "learning_rate": 9.194082920675582e-06, "loss": 0.1258, "num_input_tokens_seen": 197386064, "step": 91470 }, { "epoch": 14.922512234910277, "grad_norm": 0.12162081897258759, "learning_rate": 9.19132566239985e-06, "loss": 0.0464, "num_input_tokens_seen": 197397296, "step": 91475 }, { "epoch": 14.923327895595433, "grad_norm": 0.08864709734916687, "learning_rate": 9.188568724506954e-06, "loss": 0.112, "num_input_tokens_seen": 197408144, "step": 91480 }, { "epoch": 14.924143556280587, "grad_norm": 0.1341601461172104, "learning_rate": 9.185812107052776e-06, "loss": 0.1723, "num_input_tokens_seen": 197419216, "step": 91485 }, { "epoch": 14.924959216965743, "grad_norm": 0.03138437122106552, "learning_rate": 9.183055810093175e-06, "loss": 0.0442, "num_input_tokens_seen": 197429392, "step": 91490 }, { "epoch": 14.925774877650896, "grad_norm": 0.5400905013084412, "learning_rate": 9.180299833684014e-06, "loss": 0.0423, "num_input_tokens_seen": 197440048, "step": 91495 }, { "epoch": 14.926590538336052, "grad_norm": 0.015277559868991375, "learning_rate": 9.177544177881146e-06, "loss": 0.041, "num_input_tokens_seen": 197451664, "step": 91500 }, { "epoch": 14.927406199021208, "grad_norm": 0.12087363004684448, "learning_rate": 9.174788842740418e-06, "loss": 0.0639, "num_input_tokens_seen": 197463856, "step": 91505 }, { "epoch": 14.928221859706362, "grad_norm": 0.09822788834571838, "learning_rate": 9.17203382831767e-06, "loss": 0.1072, "num_input_tokens_seen": 197474480, "step": 91510 }, { "epoch": 14.929037520391518, "grad_norm": 2.5625998973846436, "learning_rate": 9.169279134668731e-06, "loss": 0.0401, "num_input_tokens_seen": 197486320, "step": 91515 }, { "epoch": 14.929853181076671, "grad_norm": 0.08588498085737228, "learning_rate": 9.166524761849437e-06, "loss": 0.0492, "num_input_tokens_seen": 197496880, "step": 91520 }, { "epoch": 14.930668841761827, "grad_norm": 0.6440420746803284, "learning_rate": 9.163770709915597e-06, "loss": 0.0525, "num_input_tokens_seen": 197506384, "step": 91525 }, { "epoch": 14.931484502446983, "grad_norm": 0.08204182982444763, "learning_rate": 9.161016978923048e-06, "loss": 0.0148, "num_input_tokens_seen": 197516400, "step": 91530 }, { "epoch": 14.932300163132137, "grad_norm": 0.03260667622089386, "learning_rate": 9.158263568927569e-06, "loss": 0.0186, "num_input_tokens_seen": 197527632, "step": 91535 }, { "epoch": 14.933115823817293, "grad_norm": 1.6238439083099365, "learning_rate": 9.15551047998499e-06, "loss": 0.2031, "num_input_tokens_seen": 197538736, "step": 91540 }, { "epoch": 14.933931484502446, "grad_norm": 0.057525865733623505, "learning_rate": 9.152757712151076e-06, "loss": 0.0656, "num_input_tokens_seen": 197550448, "step": 91545 }, { "epoch": 14.934747145187602, "grad_norm": 2.1300299167633057, "learning_rate": 9.150005265481646e-06, "loss": 0.1384, "num_input_tokens_seen": 197561712, "step": 91550 }, { "epoch": 14.935562805872756, "grad_norm": 2.071467161178589, "learning_rate": 9.14725314003245e-06, "loss": 0.1326, "num_input_tokens_seen": 197572176, "step": 91555 }, { "epoch": 14.936378466557912, "grad_norm": 0.0259434524923563, "learning_rate": 9.144501335859295e-06, "loss": 0.1244, "num_input_tokens_seen": 197583152, "step": 91560 }, { "epoch": 14.937194127243067, "grad_norm": 2.96541166305542, "learning_rate": 9.14174985301792e-06, "loss": 0.2632, "num_input_tokens_seen": 197594256, "step": 91565 }, { "epoch": 14.938009787928221, "grad_norm": 0.4827748239040375, "learning_rate": 9.138998691564119e-06, "loss": 0.0214, "num_input_tokens_seen": 197604848, "step": 91570 }, { "epoch": 14.938825448613377, "grad_norm": 0.06788767874240875, "learning_rate": 9.136247851553612e-06, "loss": 0.0714, "num_input_tokens_seen": 197615056, "step": 91575 }, { "epoch": 14.939641109298531, "grad_norm": 2.1265320777893066, "learning_rate": 9.133497333042184e-06, "loss": 0.111, "num_input_tokens_seen": 197626448, "step": 91580 }, { "epoch": 14.940456769983687, "grad_norm": 0.3776397109031677, "learning_rate": 9.130747136085546e-06, "loss": 0.0353, "num_input_tokens_seen": 197637296, "step": 91585 }, { "epoch": 14.941272430668842, "grad_norm": 0.038873340934515, "learning_rate": 9.127997260739462e-06, "loss": 0.0165, "num_input_tokens_seen": 197648368, "step": 91590 }, { "epoch": 14.942088091353996, "grad_norm": 0.08825980126857758, "learning_rate": 9.125247707059637e-06, "loss": 0.085, "num_input_tokens_seen": 197658448, "step": 91595 }, { "epoch": 14.942903752039152, "grad_norm": 0.15469224750995636, "learning_rate": 9.122498475101809e-06, "loss": 0.0472, "num_input_tokens_seen": 197669456, "step": 91600 }, { "epoch": 14.943719412724306, "grad_norm": 0.1731540411710739, "learning_rate": 9.119749564921695e-06, "loss": 0.0179, "num_input_tokens_seen": 197680656, "step": 91605 }, { "epoch": 14.944535073409462, "grad_norm": 0.961382269859314, "learning_rate": 9.117000976575e-06, "loss": 0.0988, "num_input_tokens_seen": 197691664, "step": 91610 }, { "epoch": 14.945350734094617, "grad_norm": 1.8944779634475708, "learning_rate": 9.11425271011743e-06, "loss": 0.1779, "num_input_tokens_seen": 197702928, "step": 91615 }, { "epoch": 14.946166394779771, "grad_norm": 1.3223252296447754, "learning_rate": 9.111504765604678e-06, "loss": 0.0522, "num_input_tokens_seen": 197713840, "step": 91620 }, { "epoch": 14.946982055464927, "grad_norm": 0.6469585299491882, "learning_rate": 9.108757143092441e-06, "loss": 0.0789, "num_input_tokens_seen": 197724656, "step": 91625 }, { "epoch": 14.947797716150081, "grad_norm": 0.973530650138855, "learning_rate": 9.106009842636401e-06, "loss": 0.0972, "num_input_tokens_seen": 197735344, "step": 91630 }, { "epoch": 14.948613376835237, "grad_norm": 0.7852373719215393, "learning_rate": 9.10326286429223e-06, "loss": 0.1566, "num_input_tokens_seen": 197747344, "step": 91635 }, { "epoch": 14.949429037520392, "grad_norm": 0.18383574485778809, "learning_rate": 9.100516208115608e-06, "loss": 0.049, "num_input_tokens_seen": 197758832, "step": 91640 }, { "epoch": 14.950244698205546, "grad_norm": 0.5536973476409912, "learning_rate": 9.097769874162193e-06, "loss": 0.0527, "num_input_tokens_seen": 197769968, "step": 91645 }, { "epoch": 14.951060358890702, "grad_norm": 0.2178250551223755, "learning_rate": 9.095023862487637e-06, "loss": 0.184, "num_input_tokens_seen": 197781200, "step": 91650 }, { "epoch": 14.951876019575856, "grad_norm": 0.03126955404877663, "learning_rate": 9.092278173147616e-06, "loss": 0.0278, "num_input_tokens_seen": 197791952, "step": 91655 }, { "epoch": 14.952691680261012, "grad_norm": 1.7852375507354736, "learning_rate": 9.089532806197742e-06, "loss": 0.0413, "num_input_tokens_seen": 197802320, "step": 91660 }, { "epoch": 14.953507340946166, "grad_norm": 1.2837525606155396, "learning_rate": 9.086787761693685e-06, "loss": 0.1153, "num_input_tokens_seen": 197812752, "step": 91665 }, { "epoch": 14.954323001631321, "grad_norm": 0.14763955771923065, "learning_rate": 9.084043039691045e-06, "loss": 0.0198, "num_input_tokens_seen": 197824304, "step": 91670 }, { "epoch": 14.955138662316477, "grad_norm": 1.2229104042053223, "learning_rate": 9.081298640245483e-06, "loss": 0.1609, "num_input_tokens_seen": 197834896, "step": 91675 }, { "epoch": 14.955954323001631, "grad_norm": 0.30480366945266724, "learning_rate": 9.078554563412578e-06, "loss": 0.0279, "num_input_tokens_seen": 197845744, "step": 91680 }, { "epoch": 14.956769983686787, "grad_norm": 0.04094839096069336, "learning_rate": 9.075810809247981e-06, "loss": 0.2115, "num_input_tokens_seen": 197855760, "step": 91685 }, { "epoch": 14.95758564437194, "grad_norm": 1.309165120124817, "learning_rate": 9.073067377807262e-06, "loss": 0.1064, "num_input_tokens_seen": 197867024, "step": 91690 }, { "epoch": 14.958401305057096, "grad_norm": 0.1962224245071411, "learning_rate": 9.070324269146055e-06, "loss": 0.0161, "num_input_tokens_seen": 197878416, "step": 91695 }, { "epoch": 14.959216965742252, "grad_norm": 0.23563885688781738, "learning_rate": 9.067581483319918e-06, "loss": 0.0126, "num_input_tokens_seen": 197888208, "step": 91700 }, { "epoch": 14.960032626427406, "grad_norm": 2.619328022003174, "learning_rate": 9.06483902038447e-06, "loss": 0.1236, "num_input_tokens_seen": 197899408, "step": 91705 }, { "epoch": 14.960848287112562, "grad_norm": 0.1380815953016281, "learning_rate": 9.062096880395257e-06, "loss": 0.0361, "num_input_tokens_seen": 197910672, "step": 91710 }, { "epoch": 14.961663947797716, "grad_norm": 0.7314483523368835, "learning_rate": 9.059355063407887e-06, "loss": 0.0741, "num_input_tokens_seen": 197921488, "step": 91715 }, { "epoch": 14.962479608482871, "grad_norm": 0.11088979989290237, "learning_rate": 9.056613569477892e-06, "loss": 0.0347, "num_input_tokens_seen": 197932624, "step": 91720 }, { "epoch": 14.963295269168025, "grad_norm": 0.0692131444811821, "learning_rate": 9.053872398660864e-06, "loss": 0.1313, "num_input_tokens_seen": 197942032, "step": 91725 }, { "epoch": 14.964110929853181, "grad_norm": 0.33792755007743835, "learning_rate": 9.051131551012325e-06, "loss": 0.0547, "num_input_tokens_seen": 197952848, "step": 91730 }, { "epoch": 14.964926590538337, "grad_norm": 2.120476007461548, "learning_rate": 9.048391026587858e-06, "loss": 0.0694, "num_input_tokens_seen": 197962736, "step": 91735 }, { "epoch": 14.96574225122349, "grad_norm": 0.009948201477527618, "learning_rate": 9.04565082544296e-06, "loss": 0.0256, "num_input_tokens_seen": 197973840, "step": 91740 }, { "epoch": 14.966557911908646, "grad_norm": 0.11407876759767532, "learning_rate": 9.0429109476332e-06, "loss": 0.0156, "num_input_tokens_seen": 197985040, "step": 91745 }, { "epoch": 14.9673735725938, "grad_norm": 0.06657262146472931, "learning_rate": 9.040171393214091e-06, "loss": 0.0262, "num_input_tokens_seen": 197995760, "step": 91750 }, { "epoch": 14.968189233278956, "grad_norm": 0.39431333541870117, "learning_rate": 9.037432162241158e-06, "loss": 0.0607, "num_input_tokens_seen": 198006992, "step": 91755 }, { "epoch": 14.969004893964112, "grad_norm": 0.11249488592147827, "learning_rate": 9.03469325476991e-06, "loss": 0.0327, "num_input_tokens_seen": 198016336, "step": 91760 }, { "epoch": 14.969820554649266, "grad_norm": 0.19820551574230194, "learning_rate": 9.031954670855858e-06, "loss": 0.2164, "num_input_tokens_seen": 198027920, "step": 91765 }, { "epoch": 14.970636215334421, "grad_norm": 2.2522435188293457, "learning_rate": 9.0292164105545e-06, "loss": 0.088, "num_input_tokens_seen": 198039024, "step": 91770 }, { "epoch": 14.971451876019575, "grad_norm": 0.10814490914344788, "learning_rate": 9.026478473921331e-06, "loss": 0.0235, "num_input_tokens_seen": 198049296, "step": 91775 }, { "epoch": 14.97226753670473, "grad_norm": 0.2149236649274826, "learning_rate": 9.02374086101184e-06, "loss": 0.2126, "num_input_tokens_seen": 198060912, "step": 91780 }, { "epoch": 14.973083197389887, "grad_norm": 0.0793696790933609, "learning_rate": 9.021003571881508e-06, "loss": 0.045, "num_input_tokens_seen": 198072240, "step": 91785 }, { "epoch": 14.97389885807504, "grad_norm": 0.12172917276620865, "learning_rate": 9.01826660658581e-06, "loss": 0.0168, "num_input_tokens_seen": 198084112, "step": 91790 }, { "epoch": 14.974714518760196, "grad_norm": 0.4575462341308594, "learning_rate": 9.015529965180206e-06, "loss": 0.0745, "num_input_tokens_seen": 198094096, "step": 91795 }, { "epoch": 14.97553017944535, "grad_norm": 0.8141267895698547, "learning_rate": 9.01279364772018e-06, "loss": 0.1608, "num_input_tokens_seen": 198104144, "step": 91800 }, { "epoch": 14.976345840130506, "grad_norm": 0.9651632308959961, "learning_rate": 9.010057654261157e-06, "loss": 0.2193, "num_input_tokens_seen": 198114256, "step": 91805 }, { "epoch": 14.977161500815662, "grad_norm": 2.1230971813201904, "learning_rate": 9.007321984858616e-06, "loss": 0.1429, "num_input_tokens_seen": 198124944, "step": 91810 }, { "epoch": 14.977977161500815, "grad_norm": 0.2960517406463623, "learning_rate": 9.004586639567967e-06, "loss": 0.0763, "num_input_tokens_seen": 198135472, "step": 91815 }, { "epoch": 14.978792822185971, "grad_norm": 0.2326757162809372, "learning_rate": 9.001851618444676e-06, "loss": 0.0769, "num_input_tokens_seen": 198145744, "step": 91820 }, { "epoch": 14.979608482871125, "grad_norm": 2.1547346115112305, "learning_rate": 8.999116921544142e-06, "loss": 0.1048, "num_input_tokens_seen": 198156304, "step": 91825 }, { "epoch": 14.98042414355628, "grad_norm": 0.3613075315952301, "learning_rate": 8.996382548921819e-06, "loss": 0.0277, "num_input_tokens_seen": 198166736, "step": 91830 }, { "epoch": 14.981239804241435, "grad_norm": 1.5724973678588867, "learning_rate": 8.993648500633087e-06, "loss": 0.0917, "num_input_tokens_seen": 198178000, "step": 91835 }, { "epoch": 14.98205546492659, "grad_norm": 1.367422342300415, "learning_rate": 8.990914776733392e-06, "loss": 0.1368, "num_input_tokens_seen": 198188720, "step": 91840 }, { "epoch": 14.982871125611746, "grad_norm": 0.03809471055865288, "learning_rate": 8.988181377278102e-06, "loss": 0.0144, "num_input_tokens_seen": 198199184, "step": 91845 }, { "epoch": 14.9836867862969, "grad_norm": 1.8023756742477417, "learning_rate": 8.985448302322635e-06, "loss": 0.2408, "num_input_tokens_seen": 198211184, "step": 91850 }, { "epoch": 14.984502446982056, "grad_norm": 0.029592564329504967, "learning_rate": 8.982715551922376e-06, "loss": 0.0986, "num_input_tokens_seen": 198223312, "step": 91855 }, { "epoch": 14.98531810766721, "grad_norm": 0.6758826375007629, "learning_rate": 8.979983126132705e-06, "loss": 0.0869, "num_input_tokens_seen": 198233968, "step": 91860 }, { "epoch": 14.986133768352365, "grad_norm": 0.11506232619285583, "learning_rate": 8.977251025009e-06, "loss": 0.1356, "num_input_tokens_seen": 198244240, "step": 91865 }, { "epoch": 14.986949429037521, "grad_norm": 1.6006944179534912, "learning_rate": 8.974519248606627e-06, "loss": 0.0903, "num_input_tokens_seen": 198255280, "step": 91870 }, { "epoch": 14.987765089722675, "grad_norm": 2.1803786754608154, "learning_rate": 8.971787796980954e-06, "loss": 0.2192, "num_input_tokens_seen": 198267120, "step": 91875 }, { "epoch": 14.98858075040783, "grad_norm": 2.1035358905792236, "learning_rate": 8.969056670187331e-06, "loss": 0.2003, "num_input_tokens_seen": 198278608, "step": 91880 }, { "epoch": 14.989396411092985, "grad_norm": 0.32600757479667664, "learning_rate": 8.966325868281114e-06, "loss": 0.0911, "num_input_tokens_seen": 198288976, "step": 91885 }, { "epoch": 14.99021207177814, "grad_norm": 1.862060785293579, "learning_rate": 8.963595391317642e-06, "loss": 0.1558, "num_input_tokens_seen": 198298224, "step": 91890 }, { "epoch": 14.991027732463294, "grad_norm": 0.16578468680381775, "learning_rate": 8.960865239352253e-06, "loss": 0.1239, "num_input_tokens_seen": 198308208, "step": 91895 }, { "epoch": 14.99184339314845, "grad_norm": 0.08540076017379761, "learning_rate": 8.95813541244028e-06, "loss": 0.0378, "num_input_tokens_seen": 198320016, "step": 91900 }, { "epoch": 14.992659053833606, "grad_norm": 0.021808292716741562, "learning_rate": 8.95540591063704e-06, "loss": 0.2204, "num_input_tokens_seen": 198330448, "step": 91905 }, { "epoch": 14.99347471451876, "grad_norm": 0.04965856671333313, "learning_rate": 8.952676733997855e-06, "loss": 0.0956, "num_input_tokens_seen": 198342384, "step": 91910 }, { "epoch": 14.994290375203915, "grad_norm": 0.14955705404281616, "learning_rate": 8.949947882578033e-06, "loss": 0.0954, "num_input_tokens_seen": 198353328, "step": 91915 }, { "epoch": 14.99510603588907, "grad_norm": 0.23988276720046997, "learning_rate": 8.947219356432875e-06, "loss": 0.072, "num_input_tokens_seen": 198363888, "step": 91920 }, { "epoch": 14.995921696574225, "grad_norm": 1.2341357469558716, "learning_rate": 8.944491155617687e-06, "loss": 0.0442, "num_input_tokens_seen": 198374672, "step": 91925 }, { "epoch": 14.99673735725938, "grad_norm": 2.0343456268310547, "learning_rate": 8.94176328018775e-06, "loss": 0.2375, "num_input_tokens_seen": 198385808, "step": 91930 }, { "epoch": 14.997553017944535, "grad_norm": 1.0324198007583618, "learning_rate": 8.939035730198353e-06, "loss": 0.0813, "num_input_tokens_seen": 198394960, "step": 91935 }, { "epoch": 14.99836867862969, "grad_norm": 1.728775143623352, "learning_rate": 8.93630850570476e-06, "loss": 0.0449, "num_input_tokens_seen": 198406256, "step": 91940 }, { "epoch": 14.999184339314844, "grad_norm": 0.12971574068069458, "learning_rate": 8.933581606762274e-06, "loss": 0.1338, "num_input_tokens_seen": 198417840, "step": 91945 }, { "epoch": 15.0, "grad_norm": 0.7232276797294617, "learning_rate": 8.930855033426119e-06, "loss": 0.0885, "num_input_tokens_seen": 198426688, "step": 91950 }, { "epoch": 15.0, "eval_loss": 0.14255428314208984, "eval_runtime": 90.8724, "eval_samples_per_second": 29.987, "eval_steps_per_second": 7.505, "num_input_tokens_seen": 198426688, "step": 91950 }, { "epoch": 15.000815660685156, "grad_norm": 0.43136996030807495, "learning_rate": 8.928128785751582e-06, "loss": 0.0576, "num_input_tokens_seen": 198438688, "step": 91955 }, { "epoch": 15.00163132137031, "grad_norm": 0.02675270102918148, "learning_rate": 8.925402863793903e-06, "loss": 0.0653, "num_input_tokens_seen": 198448480, "step": 91960 }, { "epoch": 15.002446982055465, "grad_norm": 0.034023087471723557, "learning_rate": 8.922677267608329e-06, "loss": 0.1419, "num_input_tokens_seen": 198460320, "step": 91965 }, { "epoch": 15.00326264274062, "grad_norm": 0.013165961019694805, "learning_rate": 8.919951997250092e-06, "loss": 0.0595, "num_input_tokens_seen": 198471712, "step": 91970 }, { "epoch": 15.004078303425775, "grad_norm": 1.8845645189285278, "learning_rate": 8.917227052774429e-06, "loss": 0.0941, "num_input_tokens_seen": 198482432, "step": 91975 }, { "epoch": 15.00489396411093, "grad_norm": 0.7422608733177185, "learning_rate": 8.914502434236563e-06, "loss": 0.115, "num_input_tokens_seen": 198493664, "step": 91980 }, { "epoch": 15.005709624796085, "grad_norm": 1.741431474685669, "learning_rate": 8.91177814169171e-06, "loss": 0.0929, "num_input_tokens_seen": 198505536, "step": 91985 }, { "epoch": 15.00652528548124, "grad_norm": 0.05043235048651695, "learning_rate": 8.90905417519508e-06, "loss": 0.1524, "num_input_tokens_seen": 198516416, "step": 91990 }, { "epoch": 15.007340946166394, "grad_norm": 0.03446223586797714, "learning_rate": 8.906330534801885e-06, "loss": 0.0285, "num_input_tokens_seen": 198528224, "step": 91995 }, { "epoch": 15.00815660685155, "grad_norm": 0.2036130428314209, "learning_rate": 8.903607220567315e-06, "loss": 0.0974, "num_input_tokens_seen": 198540096, "step": 92000 }, { "epoch": 15.008972267536704, "grad_norm": 1.6928893327713013, "learning_rate": 8.900884232546564e-06, "loss": 0.1043, "num_input_tokens_seen": 198550496, "step": 92005 }, { "epoch": 15.00978792822186, "grad_norm": 0.18337956070899963, "learning_rate": 8.89816157079482e-06, "loss": 0.0264, "num_input_tokens_seen": 198560512, "step": 92010 }, { "epoch": 15.010603588907015, "grad_norm": 0.3473834693431854, "learning_rate": 8.895439235367254e-06, "loss": 0.0174, "num_input_tokens_seen": 198570880, "step": 92015 }, { "epoch": 15.01141924959217, "grad_norm": 0.0353742353618145, "learning_rate": 8.892717226319045e-06, "loss": 0.0428, "num_input_tokens_seen": 198581024, "step": 92020 }, { "epoch": 15.012234910277325, "grad_norm": 0.0661345049738884, "learning_rate": 8.889995543705354e-06, "loss": 0.0378, "num_input_tokens_seen": 198592416, "step": 92025 }, { "epoch": 15.013050570962479, "grad_norm": 2.7707481384277344, "learning_rate": 8.88727418758134e-06, "loss": 0.1013, "num_input_tokens_seen": 198601888, "step": 92030 }, { "epoch": 15.013866231647635, "grad_norm": 0.20962601900100708, "learning_rate": 8.884553158002158e-06, "loss": 0.0151, "num_input_tokens_seen": 198612064, "step": 92035 }, { "epoch": 15.01468189233279, "grad_norm": 0.06448809057474136, "learning_rate": 8.881832455022946e-06, "loss": 0.2667, "num_input_tokens_seen": 198623072, "step": 92040 }, { "epoch": 15.015497553017944, "grad_norm": 0.02853485941886902, "learning_rate": 8.879112078698848e-06, "loss": 0.1021, "num_input_tokens_seen": 198635616, "step": 92045 }, { "epoch": 15.0163132137031, "grad_norm": 0.9363676309585571, "learning_rate": 8.876392029084996e-06, "loss": 0.0932, "num_input_tokens_seen": 198645952, "step": 92050 }, { "epoch": 15.017128874388254, "grad_norm": 0.15563035011291504, "learning_rate": 8.873672306236511e-06, "loss": 0.1134, "num_input_tokens_seen": 198655616, "step": 92055 }, { "epoch": 15.01794453507341, "grad_norm": 1.6845275163650513, "learning_rate": 8.870952910208507e-06, "loss": 0.2643, "num_input_tokens_seen": 198665472, "step": 92060 }, { "epoch": 15.018760195758565, "grad_norm": 0.8266997337341309, "learning_rate": 8.868233841056123e-06, "loss": 0.0291, "num_input_tokens_seen": 198677504, "step": 92065 }, { "epoch": 15.01957585644372, "grad_norm": 0.14265094697475433, "learning_rate": 8.865515098834423e-06, "loss": 0.1132, "num_input_tokens_seen": 198688288, "step": 92070 }, { "epoch": 15.020391517128875, "grad_norm": 1.121847152709961, "learning_rate": 8.862796683598548e-06, "loss": 0.1499, "num_input_tokens_seen": 198697792, "step": 92075 }, { "epoch": 15.021207177814029, "grad_norm": 0.19561155140399933, "learning_rate": 8.860078595403553e-06, "loss": 0.0143, "num_input_tokens_seen": 198708864, "step": 92080 }, { "epoch": 15.022022838499185, "grad_norm": 0.09218788146972656, "learning_rate": 8.857360834304549e-06, "loss": 0.1805, "num_input_tokens_seen": 198720448, "step": 92085 }, { "epoch": 15.022838499184338, "grad_norm": 0.024808194488286972, "learning_rate": 8.854643400356601e-06, "loss": 0.2155, "num_input_tokens_seen": 198732256, "step": 92090 }, { "epoch": 15.023654159869494, "grad_norm": 0.5734131336212158, "learning_rate": 8.851926293614793e-06, "loss": 0.0252, "num_input_tokens_seen": 198743392, "step": 92095 }, { "epoch": 15.02446982055465, "grad_norm": 0.017521729692816734, "learning_rate": 8.849209514134179e-06, "loss": 0.2563, "num_input_tokens_seen": 198753376, "step": 92100 }, { "epoch": 15.025285481239804, "grad_norm": 0.03434504568576813, "learning_rate": 8.846493061969827e-06, "loss": 0.22, "num_input_tokens_seen": 198763040, "step": 92105 }, { "epoch": 15.02610114192496, "grad_norm": 0.3958403468132019, "learning_rate": 8.843776937176781e-06, "loss": 0.0433, "num_input_tokens_seen": 198774080, "step": 92110 }, { "epoch": 15.026916802610113, "grad_norm": 0.10783713310956955, "learning_rate": 8.841061139810095e-06, "loss": 0.0719, "num_input_tokens_seen": 198785216, "step": 92115 }, { "epoch": 15.02773246329527, "grad_norm": 0.08604633808135986, "learning_rate": 8.838345669924802e-06, "loss": 0.0547, "num_input_tokens_seen": 198797440, "step": 92120 }, { "epoch": 15.028548123980425, "grad_norm": 0.30771544575691223, "learning_rate": 8.835630527575936e-06, "loss": 0.0386, "num_input_tokens_seen": 198808608, "step": 92125 }, { "epoch": 15.029363784665579, "grad_norm": 3.079420804977417, "learning_rate": 8.832915712818526e-06, "loss": 0.0445, "num_input_tokens_seen": 198821056, "step": 92130 }, { "epoch": 15.030179445350734, "grad_norm": 0.38273367285728455, "learning_rate": 8.830201225707585e-06, "loss": 0.0392, "num_input_tokens_seen": 198831232, "step": 92135 }, { "epoch": 15.030995106035888, "grad_norm": 0.042750708758831024, "learning_rate": 8.827487066298132e-06, "loss": 0.0234, "num_input_tokens_seen": 198842016, "step": 92140 }, { "epoch": 15.031810766721044, "grad_norm": 0.09217987209558487, "learning_rate": 8.824773234645167e-06, "loss": 0.171, "num_input_tokens_seen": 198853824, "step": 92145 }, { "epoch": 15.0326264274062, "grad_norm": 0.07632531225681305, "learning_rate": 8.822059730803694e-06, "loss": 0.0528, "num_input_tokens_seen": 198864160, "step": 92150 }, { "epoch": 15.033442088091354, "grad_norm": 0.026400871574878693, "learning_rate": 8.819346554828706e-06, "loss": 0.0384, "num_input_tokens_seen": 198873888, "step": 92155 }, { "epoch": 15.03425774877651, "grad_norm": 1.9405494928359985, "learning_rate": 8.816633706775182e-06, "loss": 0.2381, "num_input_tokens_seen": 198883712, "step": 92160 }, { "epoch": 15.035073409461663, "grad_norm": 0.8179096579551697, "learning_rate": 8.8139211866981e-06, "loss": 0.1046, "num_input_tokens_seen": 198893024, "step": 92165 }, { "epoch": 15.035889070146819, "grad_norm": 0.6544163823127747, "learning_rate": 8.811208994652457e-06, "loss": 0.079, "num_input_tokens_seen": 198903264, "step": 92170 }, { "epoch": 15.036704730831975, "grad_norm": 1.842462420463562, "learning_rate": 8.80849713069318e-06, "loss": 0.1484, "num_input_tokens_seen": 198914464, "step": 92175 }, { "epoch": 15.037520391517129, "grad_norm": 1.8006292581558228, "learning_rate": 8.805785594875268e-06, "loss": 0.0702, "num_input_tokens_seen": 198924800, "step": 92180 }, { "epoch": 15.038336052202284, "grad_norm": 0.13098689913749695, "learning_rate": 8.803074387253637e-06, "loss": 0.0142, "num_input_tokens_seen": 198934464, "step": 92185 }, { "epoch": 15.039151712887438, "grad_norm": 0.08185547590255737, "learning_rate": 8.80036350788327e-06, "loss": 0.1894, "num_input_tokens_seen": 198944896, "step": 92190 }, { "epoch": 15.039967373572594, "grad_norm": 0.043985214084386826, "learning_rate": 8.797652956819069e-06, "loss": 0.0092, "num_input_tokens_seen": 198956064, "step": 92195 }, { "epoch": 15.040783034257748, "grad_norm": 0.11662273108959198, "learning_rate": 8.794942734116003e-06, "loss": 0.06, "num_input_tokens_seen": 198966720, "step": 92200 }, { "epoch": 15.041598694942904, "grad_norm": 0.06299004703760147, "learning_rate": 8.792232839828959e-06, "loss": 0.0556, "num_input_tokens_seen": 198977248, "step": 92205 }, { "epoch": 15.04241435562806, "grad_norm": 3.1886236667633057, "learning_rate": 8.789523274012898e-06, "loss": 0.2234, "num_input_tokens_seen": 198988224, "step": 92210 }, { "epoch": 15.043230016313213, "grad_norm": 0.1368352770805359, "learning_rate": 8.786814036722695e-06, "loss": 0.0532, "num_input_tokens_seen": 198998272, "step": 92215 }, { "epoch": 15.044045676998369, "grad_norm": 1.4888579845428467, "learning_rate": 8.78410512801329e-06, "loss": 0.0341, "num_input_tokens_seen": 199009504, "step": 92220 }, { "epoch": 15.044861337683523, "grad_norm": 1.9677006006240845, "learning_rate": 8.78139654793955e-06, "loss": 0.1561, "num_input_tokens_seen": 199021792, "step": 92225 }, { "epoch": 15.045676998368679, "grad_norm": 0.7602152824401855, "learning_rate": 8.778688296556389e-06, "loss": 0.0439, "num_input_tokens_seen": 199032096, "step": 92230 }, { "epoch": 15.046492659053834, "grad_norm": 0.333181768655777, "learning_rate": 8.77598037391869e-06, "loss": 0.0945, "num_input_tokens_seen": 199043392, "step": 92235 }, { "epoch": 15.047308319738988, "grad_norm": 2.1256601810455322, "learning_rate": 8.773272780081332e-06, "loss": 0.1794, "num_input_tokens_seen": 199054016, "step": 92240 }, { "epoch": 15.048123980424144, "grad_norm": 0.09362179785966873, "learning_rate": 8.770565515099183e-06, "loss": 0.0146, "num_input_tokens_seen": 199065760, "step": 92245 }, { "epoch": 15.048939641109298, "grad_norm": 0.4911782443523407, "learning_rate": 8.767858579027113e-06, "loss": 0.1545, "num_input_tokens_seen": 199075808, "step": 92250 }, { "epoch": 15.049755301794454, "grad_norm": 0.021337101235985756, "learning_rate": 8.765151971919983e-06, "loss": 0.0425, "num_input_tokens_seen": 199085632, "step": 92255 }, { "epoch": 15.05057096247961, "grad_norm": 0.054523393511772156, "learning_rate": 8.762445693832642e-06, "loss": 0.0445, "num_input_tokens_seen": 199096160, "step": 92260 }, { "epoch": 15.051386623164763, "grad_norm": 2.4770140647888184, "learning_rate": 8.759739744819938e-06, "loss": 0.0587, "num_input_tokens_seen": 199107360, "step": 92265 }, { "epoch": 15.052202283849919, "grad_norm": 2.1797585487365723, "learning_rate": 8.757034124936712e-06, "loss": 0.2185, "num_input_tokens_seen": 199118720, "step": 92270 }, { "epoch": 15.053017944535073, "grad_norm": 0.42531993985176086, "learning_rate": 8.754328834237793e-06, "loss": 0.0376, "num_input_tokens_seen": 199129568, "step": 92275 }, { "epoch": 15.053833605220229, "grad_norm": 0.13415862619876862, "learning_rate": 8.751623872778001e-06, "loss": 0.005, "num_input_tokens_seen": 199141216, "step": 92280 }, { "epoch": 15.054649265905383, "grad_norm": 3.068183183670044, "learning_rate": 8.74891924061218e-06, "loss": 0.2011, "num_input_tokens_seen": 199151040, "step": 92285 }, { "epoch": 15.055464926590538, "grad_norm": 0.2716802954673767, "learning_rate": 8.74621493779511e-06, "loss": 0.0131, "num_input_tokens_seen": 199161632, "step": 92290 }, { "epoch": 15.056280587275694, "grad_norm": 1.7079718112945557, "learning_rate": 8.743510964381631e-06, "loss": 0.1969, "num_input_tokens_seen": 199171616, "step": 92295 }, { "epoch": 15.057096247960848, "grad_norm": 0.33647117018699646, "learning_rate": 8.740807320426511e-06, "loss": 0.0437, "num_input_tokens_seen": 199182720, "step": 92300 }, { "epoch": 15.057911908646004, "grad_norm": 0.06251490116119385, "learning_rate": 8.73810400598457e-06, "loss": 0.2283, "num_input_tokens_seen": 199193728, "step": 92305 }, { "epoch": 15.058727569331158, "grad_norm": 0.5657868981361389, "learning_rate": 8.735401021110565e-06, "loss": 0.177, "num_input_tokens_seen": 199205120, "step": 92310 }, { "epoch": 15.059543230016313, "grad_norm": 0.07989414036273956, "learning_rate": 8.73269836585931e-06, "loss": 0.0881, "num_input_tokens_seen": 199214528, "step": 92315 }, { "epoch": 15.060358890701469, "grad_norm": 0.12123610824346542, "learning_rate": 8.72999604028554e-06, "loss": 0.1305, "num_input_tokens_seen": 199224864, "step": 92320 }, { "epoch": 15.061174551386623, "grad_norm": 0.09368965029716492, "learning_rate": 8.72729404444406e-06, "loss": 0.0284, "num_input_tokens_seen": 199235296, "step": 92325 }, { "epoch": 15.061990212071779, "grad_norm": 0.4037320017814636, "learning_rate": 8.72459237838959e-06, "loss": 0.097, "num_input_tokens_seen": 199246144, "step": 92330 }, { "epoch": 15.062805872756933, "grad_norm": 1.870983600616455, "learning_rate": 8.721891042176919e-06, "loss": 0.1018, "num_input_tokens_seen": 199256544, "step": 92335 }, { "epoch": 15.063621533442088, "grad_norm": 0.09481240808963776, "learning_rate": 8.719190035860761e-06, "loss": 0.0694, "num_input_tokens_seen": 199266240, "step": 92340 }, { "epoch": 15.064437194127244, "grad_norm": 0.020580563694238663, "learning_rate": 8.716489359495886e-06, "loss": 0.0085, "num_input_tokens_seen": 199276800, "step": 92345 }, { "epoch": 15.065252854812398, "grad_norm": 0.600220799446106, "learning_rate": 8.713789013136994e-06, "loss": 0.308, "num_input_tokens_seen": 199287232, "step": 92350 }, { "epoch": 15.066068515497554, "grad_norm": 0.036933135241270065, "learning_rate": 8.711088996838848e-06, "loss": 0.113, "num_input_tokens_seen": 199296160, "step": 92355 }, { "epoch": 15.066884176182707, "grad_norm": 0.12936773896217346, "learning_rate": 8.70838931065613e-06, "loss": 0.0664, "num_input_tokens_seen": 199306176, "step": 92360 }, { "epoch": 15.067699836867863, "grad_norm": 0.6761808395385742, "learning_rate": 8.705689954643584e-06, "loss": 0.0871, "num_input_tokens_seen": 199315648, "step": 92365 }, { "epoch": 15.068515497553017, "grad_norm": 0.7457600235939026, "learning_rate": 8.702990928855887e-06, "loss": 0.026, "num_input_tokens_seen": 199325632, "step": 92370 }, { "epoch": 15.069331158238173, "grad_norm": 0.05799029767513275, "learning_rate": 8.70029223334776e-06, "loss": 0.0499, "num_input_tokens_seen": 199336384, "step": 92375 }, { "epoch": 15.070146818923329, "grad_norm": 2.670107126235962, "learning_rate": 8.697593868173889e-06, "loss": 0.0944, "num_input_tokens_seen": 199348864, "step": 92380 }, { "epoch": 15.070962479608482, "grad_norm": 0.059850145131349564, "learning_rate": 8.694895833388961e-06, "loss": 0.1352, "num_input_tokens_seen": 199358880, "step": 92385 }, { "epoch": 15.071778140293638, "grad_norm": 1.2063533067703247, "learning_rate": 8.692198129047652e-06, "loss": 0.1472, "num_input_tokens_seen": 199369984, "step": 92390 }, { "epoch": 15.072593800978792, "grad_norm": 0.11982103437185287, "learning_rate": 8.689500755204635e-06, "loss": 0.2606, "num_input_tokens_seen": 199380224, "step": 92395 }, { "epoch": 15.073409461663948, "grad_norm": 0.04872119799256325, "learning_rate": 8.68680371191458e-06, "loss": 0.0929, "num_input_tokens_seen": 199390464, "step": 92400 }, { "epoch": 15.074225122349104, "grad_norm": 0.13861197233200073, "learning_rate": 8.684106999232139e-06, "loss": 0.0361, "num_input_tokens_seen": 199401408, "step": 92405 }, { "epoch": 15.075040783034257, "grad_norm": 0.0825018659234047, "learning_rate": 8.681410617211969e-06, "loss": 0.0713, "num_input_tokens_seen": 199411872, "step": 92410 }, { "epoch": 15.075856443719413, "grad_norm": 1.229884147644043, "learning_rate": 8.678714565908716e-06, "loss": 0.1281, "num_input_tokens_seen": 199422976, "step": 92415 }, { "epoch": 15.076672104404567, "grad_norm": 0.2284487932920456, "learning_rate": 8.676018845377015e-06, "loss": 0.0238, "num_input_tokens_seen": 199434752, "step": 92420 }, { "epoch": 15.077487765089723, "grad_norm": 0.7142766714096069, "learning_rate": 8.673323455671493e-06, "loss": 0.0121, "num_input_tokens_seen": 199446272, "step": 92425 }, { "epoch": 15.078303425774878, "grad_norm": 1.297961950302124, "learning_rate": 8.6706283968468e-06, "loss": 0.1079, "num_input_tokens_seen": 199457376, "step": 92430 }, { "epoch": 15.079119086460032, "grad_norm": 1.5877974033355713, "learning_rate": 8.66793366895752e-06, "loss": 0.1012, "num_input_tokens_seen": 199469120, "step": 92435 }, { "epoch": 15.079934747145188, "grad_norm": 0.15036219358444214, "learning_rate": 8.6652392720583e-06, "loss": 0.0069, "num_input_tokens_seen": 199481024, "step": 92440 }, { "epoch": 15.080750407830342, "grad_norm": 0.04055211320519447, "learning_rate": 8.662545206203712e-06, "loss": 0.1448, "num_input_tokens_seen": 199492512, "step": 92445 }, { "epoch": 15.081566068515498, "grad_norm": 3.2670810222625732, "learning_rate": 8.659851471448388e-06, "loss": 0.2543, "num_input_tokens_seen": 199502944, "step": 92450 }, { "epoch": 15.082381729200652, "grad_norm": 0.1660722941160202, "learning_rate": 8.657158067846888e-06, "loss": 0.111, "num_input_tokens_seen": 199512544, "step": 92455 }, { "epoch": 15.083197389885807, "grad_norm": 0.07006672769784927, "learning_rate": 8.654464995453826e-06, "loss": 0.1548, "num_input_tokens_seen": 199523232, "step": 92460 }, { "epoch": 15.084013050570963, "grad_norm": 0.4115035831928253, "learning_rate": 8.65177225432375e-06, "loss": 0.1407, "num_input_tokens_seen": 199533984, "step": 92465 }, { "epoch": 15.084828711256117, "grad_norm": 0.1550241857767105, "learning_rate": 8.649079844511265e-06, "loss": 0.1007, "num_input_tokens_seen": 199544512, "step": 92470 }, { "epoch": 15.085644371941273, "grad_norm": 3.1791388988494873, "learning_rate": 8.646387766070904e-06, "loss": 0.1803, "num_input_tokens_seen": 199556192, "step": 92475 }, { "epoch": 15.086460032626427, "grad_norm": 0.06394894421100616, "learning_rate": 8.643696019057258e-06, "loss": 0.0263, "num_input_tokens_seen": 199567584, "step": 92480 }, { "epoch": 15.087275693311582, "grad_norm": 0.07590051740407944, "learning_rate": 8.641004603524844e-06, "loss": 0.0808, "num_input_tokens_seen": 199579168, "step": 92485 }, { "epoch": 15.088091353996738, "grad_norm": 0.7468376159667969, "learning_rate": 8.638313519528233e-06, "loss": 0.0455, "num_input_tokens_seen": 199589248, "step": 92490 }, { "epoch": 15.088907014681892, "grad_norm": 0.4173591434955597, "learning_rate": 8.635622767121956e-06, "loss": 0.0961, "num_input_tokens_seen": 199601024, "step": 92495 }, { "epoch": 15.089722675367048, "grad_norm": 0.8663695454597473, "learning_rate": 8.632932346360541e-06, "loss": 0.0299, "num_input_tokens_seen": 199611168, "step": 92500 }, { "epoch": 15.090538336052202, "grad_norm": 0.057807303965091705, "learning_rate": 8.630242257298518e-06, "loss": 0.029, "num_input_tokens_seen": 199621664, "step": 92505 }, { "epoch": 15.091353996737357, "grad_norm": 0.10161195695400238, "learning_rate": 8.627552499990398e-06, "loss": 0.0153, "num_input_tokens_seen": 199632640, "step": 92510 }, { "epoch": 15.092169657422513, "grad_norm": 0.06417280435562134, "learning_rate": 8.6248630744907e-06, "loss": 0.0129, "num_input_tokens_seen": 199644800, "step": 92515 }, { "epoch": 15.092985318107667, "grad_norm": 1.508374810218811, "learning_rate": 8.622173980853926e-06, "loss": 0.0917, "num_input_tokens_seen": 199655264, "step": 92520 }, { "epoch": 15.093800978792823, "grad_norm": 0.2386251538991928, "learning_rate": 8.61948521913457e-06, "loss": 0.0327, "num_input_tokens_seen": 199667136, "step": 92525 }, { "epoch": 15.094616639477977, "grad_norm": 1.7347520589828491, "learning_rate": 8.616796789387127e-06, "loss": 0.0452, "num_input_tokens_seen": 199678048, "step": 92530 }, { "epoch": 15.095432300163132, "grad_norm": 0.021040502935647964, "learning_rate": 8.614108691666081e-06, "loss": 0.0868, "num_input_tokens_seen": 199688608, "step": 92535 }, { "epoch": 15.096247960848286, "grad_norm": 0.4503108561038971, "learning_rate": 8.61142092602591e-06, "loss": 0.0363, "num_input_tokens_seen": 199699008, "step": 92540 }, { "epoch": 15.097063621533442, "grad_norm": 0.16132158041000366, "learning_rate": 8.608733492521085e-06, "loss": 0.0329, "num_input_tokens_seen": 199710144, "step": 92545 }, { "epoch": 15.097879282218598, "grad_norm": 0.06532105803489685, "learning_rate": 8.606046391206066e-06, "loss": 0.0074, "num_input_tokens_seen": 199721024, "step": 92550 }, { "epoch": 15.098694942903752, "grad_norm": 0.04592899978160858, "learning_rate": 8.603359622135318e-06, "loss": 0.0139, "num_input_tokens_seen": 199731936, "step": 92555 }, { "epoch": 15.099510603588907, "grad_norm": 0.5481767058372498, "learning_rate": 8.600673185363286e-06, "loss": 0.1182, "num_input_tokens_seen": 199744640, "step": 92560 }, { "epoch": 15.100326264274061, "grad_norm": 0.032116156071424484, "learning_rate": 8.597987080944417e-06, "loss": 0.0634, "num_input_tokens_seen": 199756064, "step": 92565 }, { "epoch": 15.101141924959217, "grad_norm": 0.026677696034312248, "learning_rate": 8.595301308933138e-06, "loss": 0.0119, "num_input_tokens_seen": 199766432, "step": 92570 }, { "epoch": 15.101957585644373, "grad_norm": 0.033700283616781235, "learning_rate": 8.592615869383905e-06, "loss": 0.0071, "num_input_tokens_seen": 199777472, "step": 92575 }, { "epoch": 15.102773246329527, "grad_norm": 0.023920845240354538, "learning_rate": 8.589930762351109e-06, "loss": 0.0141, "num_input_tokens_seen": 199789184, "step": 92580 }, { "epoch": 15.103588907014682, "grad_norm": 0.04357529804110527, "learning_rate": 8.5872459878892e-06, "loss": 0.132, "num_input_tokens_seen": 199799776, "step": 92585 }, { "epoch": 15.104404567699836, "grad_norm": 1.9423882961273193, "learning_rate": 8.584561546052558e-06, "loss": 0.1063, "num_input_tokens_seen": 199811424, "step": 92590 }, { "epoch": 15.105220228384992, "grad_norm": 4.247231960296631, "learning_rate": 8.581877436895605e-06, "loss": 0.2559, "num_input_tokens_seen": 199822784, "step": 92595 }, { "epoch": 15.106035889070148, "grad_norm": 0.08504944294691086, "learning_rate": 8.579193660472736e-06, "loss": 0.0136, "num_input_tokens_seen": 199833728, "step": 92600 }, { "epoch": 15.106851549755302, "grad_norm": 2.13301682472229, "learning_rate": 8.57651021683834e-06, "loss": 0.0409, "num_input_tokens_seen": 199845344, "step": 92605 }, { "epoch": 15.107667210440457, "grad_norm": 0.10576315969228745, "learning_rate": 8.573827106046797e-06, "loss": 0.0467, "num_input_tokens_seen": 199856960, "step": 92610 }, { "epoch": 15.108482871125611, "grad_norm": 0.8015965223312378, "learning_rate": 8.571144328152489e-06, "loss": 0.2047, "num_input_tokens_seen": 199867840, "step": 92615 }, { "epoch": 15.109298531810767, "grad_norm": 0.8916072845458984, "learning_rate": 8.56846188320978e-06, "loss": 0.0467, "num_input_tokens_seen": 199878912, "step": 92620 }, { "epoch": 15.11011419249592, "grad_norm": 0.16672134399414062, "learning_rate": 8.565779771273036e-06, "loss": 0.0113, "num_input_tokens_seen": 199890080, "step": 92625 }, { "epoch": 15.110929853181077, "grad_norm": 2.3078691959381104, "learning_rate": 8.563097992396613e-06, "loss": 0.2941, "num_input_tokens_seen": 199900800, "step": 92630 }, { "epoch": 15.111745513866232, "grad_norm": 0.15048536658287048, "learning_rate": 8.560416546634861e-06, "loss": 0.237, "num_input_tokens_seen": 199912608, "step": 92635 }, { "epoch": 15.112561174551386, "grad_norm": 0.0448380745947361, "learning_rate": 8.557735434042125e-06, "loss": 0.0614, "num_input_tokens_seen": 199923488, "step": 92640 }, { "epoch": 15.113376835236542, "grad_norm": 0.07251064479351044, "learning_rate": 8.555054654672734e-06, "loss": 0.0783, "num_input_tokens_seen": 199934592, "step": 92645 }, { "epoch": 15.114192495921696, "grad_norm": 0.26372599601745605, "learning_rate": 8.552374208581027e-06, "loss": 0.1068, "num_input_tokens_seen": 199945312, "step": 92650 }, { "epoch": 15.115008156606851, "grad_norm": 1.4160269498825073, "learning_rate": 8.54969409582132e-06, "loss": 0.0462, "num_input_tokens_seen": 199956352, "step": 92655 }, { "epoch": 15.115823817292007, "grad_norm": 0.04223400354385376, "learning_rate": 8.547014316447933e-06, "loss": 0.1064, "num_input_tokens_seen": 199967168, "step": 92660 }, { "epoch": 15.116639477977161, "grad_norm": 0.24252083897590637, "learning_rate": 8.544334870515169e-06, "loss": 0.227, "num_input_tokens_seen": 199977440, "step": 92665 }, { "epoch": 15.117455138662317, "grad_norm": 0.0899273008108139, "learning_rate": 8.541655758077336e-06, "loss": 0.0287, "num_input_tokens_seen": 199988512, "step": 92670 }, { "epoch": 15.11827079934747, "grad_norm": 1.6075409650802612, "learning_rate": 8.538976979188729e-06, "loss": 0.1485, "num_input_tokens_seen": 199999296, "step": 92675 }, { "epoch": 15.119086460032626, "grad_norm": 0.19744008779525757, "learning_rate": 8.536298533903636e-06, "loss": 0.0169, "num_input_tokens_seen": 200009856, "step": 92680 }, { "epoch": 15.119902120717782, "grad_norm": 0.07533872872591019, "learning_rate": 8.533620422276337e-06, "loss": 0.1612, "num_input_tokens_seen": 200021184, "step": 92685 }, { "epoch": 15.120717781402936, "grad_norm": 0.056142617017030716, "learning_rate": 8.530942644361109e-06, "loss": 0.1067, "num_input_tokens_seen": 200031616, "step": 92690 }, { "epoch": 15.121533442088092, "grad_norm": 0.08786074072122574, "learning_rate": 8.528265200212224e-06, "loss": 0.0439, "num_input_tokens_seen": 200040928, "step": 92695 }, { "epoch": 15.122349102773246, "grad_norm": 0.13386274874210358, "learning_rate": 8.525588089883928e-06, "loss": 0.1315, "num_input_tokens_seen": 200051776, "step": 92700 }, { "epoch": 15.123164763458401, "grad_norm": 0.05693232640624046, "learning_rate": 8.52291131343051e-06, "loss": 0.0344, "num_input_tokens_seen": 200063232, "step": 92705 }, { "epoch": 15.123980424143557, "grad_norm": 0.8274138569831848, "learning_rate": 8.520234870906178e-06, "loss": 0.1405, "num_input_tokens_seen": 200072160, "step": 92710 }, { "epoch": 15.124796084828711, "grad_norm": 0.6117650270462036, "learning_rate": 8.517558762365199e-06, "loss": 0.1008, "num_input_tokens_seen": 200081888, "step": 92715 }, { "epoch": 15.125611745513867, "grad_norm": 0.08697900921106339, "learning_rate": 8.514882987861803e-06, "loss": 0.2005, "num_input_tokens_seen": 200092864, "step": 92720 }, { "epoch": 15.12642740619902, "grad_norm": 2.471712350845337, "learning_rate": 8.512207547450215e-06, "loss": 0.1296, "num_input_tokens_seen": 200102976, "step": 92725 }, { "epoch": 15.127243066884176, "grad_norm": 0.20956847071647644, "learning_rate": 8.509532441184659e-06, "loss": 0.0453, "num_input_tokens_seen": 200112064, "step": 92730 }, { "epoch": 15.12805872756933, "grad_norm": 2.049619674682617, "learning_rate": 8.506857669119347e-06, "loss": 0.2562, "num_input_tokens_seen": 200122656, "step": 92735 }, { "epoch": 15.128874388254486, "grad_norm": 2.403059482574463, "learning_rate": 8.504183231308488e-06, "loss": 0.1954, "num_input_tokens_seen": 200133856, "step": 92740 }, { "epoch": 15.129690048939642, "grad_norm": 3.2699108123779297, "learning_rate": 8.501509127806282e-06, "loss": 0.1047, "num_input_tokens_seen": 200144608, "step": 92745 }, { "epoch": 15.130505709624796, "grad_norm": 2.2224807739257812, "learning_rate": 8.498835358666923e-06, "loss": 0.1794, "num_input_tokens_seen": 200156256, "step": 92750 }, { "epoch": 15.131321370309951, "grad_norm": 0.635276734828949, "learning_rate": 8.496161923944598e-06, "loss": 0.3149, "num_input_tokens_seen": 200167008, "step": 92755 }, { "epoch": 15.132137030995105, "grad_norm": 0.3737504184246063, "learning_rate": 8.493488823693488e-06, "loss": 0.084, "num_input_tokens_seen": 200177248, "step": 92760 }, { "epoch": 15.132952691680261, "grad_norm": 0.054488249123096466, "learning_rate": 8.490816057967768e-06, "loss": 0.1096, "num_input_tokens_seen": 200186624, "step": 92765 }, { "epoch": 15.133768352365417, "grad_norm": 1.0265213251113892, "learning_rate": 8.488143626821605e-06, "loss": 0.2939, "num_input_tokens_seen": 200197152, "step": 92770 }, { "epoch": 15.13458401305057, "grad_norm": 0.038107309490442276, "learning_rate": 8.485471530309155e-06, "loss": 0.0717, "num_input_tokens_seen": 200206976, "step": 92775 }, { "epoch": 15.135399673735726, "grad_norm": 0.03966990485787392, "learning_rate": 8.482799768484579e-06, "loss": 0.0182, "num_input_tokens_seen": 200216320, "step": 92780 }, { "epoch": 15.13621533442088, "grad_norm": 0.11068496108055115, "learning_rate": 8.480128341402017e-06, "loss": 0.0377, "num_input_tokens_seen": 200227200, "step": 92785 }, { "epoch": 15.137030995106036, "grad_norm": 1.1000268459320068, "learning_rate": 8.477457249115609e-06, "loss": 0.1629, "num_input_tokens_seen": 200237664, "step": 92790 }, { "epoch": 15.137846655791192, "grad_norm": 2.1883738040924072, "learning_rate": 8.474786491679492e-06, "loss": 0.4903, "num_input_tokens_seen": 200249568, "step": 92795 }, { "epoch": 15.138662316476346, "grad_norm": 0.30749911069869995, "learning_rate": 8.472116069147789e-06, "loss": 0.1917, "num_input_tokens_seen": 200260448, "step": 92800 }, { "epoch": 15.139477977161501, "grad_norm": 0.03160521015524864, "learning_rate": 8.469445981574617e-06, "loss": 0.11, "num_input_tokens_seen": 200271008, "step": 92805 }, { "epoch": 15.140293637846655, "grad_norm": 2.5992884635925293, "learning_rate": 8.466776229014107e-06, "loss": 0.079, "num_input_tokens_seen": 200281152, "step": 92810 }, { "epoch": 15.141109298531811, "grad_norm": 0.19792218506336212, "learning_rate": 8.464106811520331e-06, "loss": 0.1775, "num_input_tokens_seen": 200293152, "step": 92815 }, { "epoch": 15.141924959216965, "grad_norm": 0.4409952163696289, "learning_rate": 8.461437729147428e-06, "loss": 0.0175, "num_input_tokens_seen": 200303552, "step": 92820 }, { "epoch": 15.14274061990212, "grad_norm": 0.3114285171031952, "learning_rate": 8.458768981949453e-06, "loss": 0.0289, "num_input_tokens_seen": 200315136, "step": 92825 }, { "epoch": 15.143556280587276, "grad_norm": 0.43007171154022217, "learning_rate": 8.456100569980524e-06, "loss": 0.1543, "num_input_tokens_seen": 200326080, "step": 92830 }, { "epoch": 15.14437194127243, "grad_norm": 2.4483509063720703, "learning_rate": 8.453432493294689e-06, "loss": 0.0838, "num_input_tokens_seen": 200336992, "step": 92835 }, { "epoch": 15.145187601957586, "grad_norm": 0.09173155575990677, "learning_rate": 8.45076475194605e-06, "loss": 0.1542, "num_input_tokens_seen": 200348608, "step": 92840 }, { "epoch": 15.14600326264274, "grad_norm": 1.3904435634613037, "learning_rate": 8.448097345988642e-06, "loss": 0.0506, "num_input_tokens_seen": 200359040, "step": 92845 }, { "epoch": 15.146818923327896, "grad_norm": 0.08782998472452164, "learning_rate": 8.445430275476545e-06, "loss": 0.1711, "num_input_tokens_seen": 200369568, "step": 92850 }, { "epoch": 15.147634584013051, "grad_norm": 0.07543416321277618, "learning_rate": 8.442763540463807e-06, "loss": 0.0336, "num_input_tokens_seen": 200380864, "step": 92855 }, { "epoch": 15.148450244698205, "grad_norm": 2.471426486968994, "learning_rate": 8.440097141004471e-06, "loss": 0.2026, "num_input_tokens_seen": 200392736, "step": 92860 }, { "epoch": 15.149265905383361, "grad_norm": 0.7529128193855286, "learning_rate": 8.437431077152575e-06, "loss": 0.0442, "num_input_tokens_seen": 200403872, "step": 92865 }, { "epoch": 15.150081566068515, "grad_norm": 0.09149358421564102, "learning_rate": 8.434765348962145e-06, "loss": 0.0413, "num_input_tokens_seen": 200413248, "step": 92870 }, { "epoch": 15.15089722675367, "grad_norm": 0.11263912171125412, "learning_rate": 8.432099956487213e-06, "loss": 0.0377, "num_input_tokens_seen": 200424896, "step": 92875 }, { "epoch": 15.151712887438826, "grad_norm": 1.5046006441116333, "learning_rate": 8.429434899781794e-06, "loss": 0.0357, "num_input_tokens_seen": 200435872, "step": 92880 }, { "epoch": 15.15252854812398, "grad_norm": 0.07444117218255997, "learning_rate": 8.426770178899896e-06, "loss": 0.0239, "num_input_tokens_seen": 200447648, "step": 92885 }, { "epoch": 15.153344208809136, "grad_norm": 0.7906726002693176, "learning_rate": 8.424105793895523e-06, "loss": 0.0776, "num_input_tokens_seen": 200457536, "step": 92890 }, { "epoch": 15.15415986949429, "grad_norm": 1.9720429182052612, "learning_rate": 8.421441744822678e-06, "loss": 0.0677, "num_input_tokens_seen": 200467392, "step": 92895 }, { "epoch": 15.154975530179446, "grad_norm": 0.03068646602332592, "learning_rate": 8.418778031735344e-06, "loss": 0.0437, "num_input_tokens_seen": 200479264, "step": 92900 }, { "epoch": 15.1557911908646, "grad_norm": 1.7307535409927368, "learning_rate": 8.41611465468751e-06, "loss": 0.1116, "num_input_tokens_seen": 200490624, "step": 92905 }, { "epoch": 15.156606851549755, "grad_norm": 0.2774203419685364, "learning_rate": 8.413451613733142e-06, "loss": 0.06, "num_input_tokens_seen": 200500736, "step": 92910 }, { "epoch": 15.15742251223491, "grad_norm": 0.0802542194724083, "learning_rate": 8.410788908926237e-06, "loss": 0.0223, "num_input_tokens_seen": 200511968, "step": 92915 }, { "epoch": 15.158238172920065, "grad_norm": 2.4320452213287354, "learning_rate": 8.40812654032072e-06, "loss": 0.3884, "num_input_tokens_seen": 200522240, "step": 92920 }, { "epoch": 15.15905383360522, "grad_norm": 0.13700400292873383, "learning_rate": 8.405464507970587e-06, "loss": 0.1471, "num_input_tokens_seen": 200533632, "step": 92925 }, { "epoch": 15.159869494290374, "grad_norm": 0.12295174598693848, "learning_rate": 8.402802811929746e-06, "loss": 0.0874, "num_input_tokens_seen": 200545024, "step": 92930 }, { "epoch": 15.16068515497553, "grad_norm": 0.09042268246412277, "learning_rate": 8.40014145225218e-06, "loss": 0.0875, "num_input_tokens_seen": 200557152, "step": 92935 }, { "epoch": 15.161500815660686, "grad_norm": 0.46618494391441345, "learning_rate": 8.397480428991792e-06, "loss": 0.0127, "num_input_tokens_seen": 200566944, "step": 92940 }, { "epoch": 15.16231647634584, "grad_norm": 0.04110940545797348, "learning_rate": 8.394819742202539e-06, "loss": 0.1667, "num_input_tokens_seen": 200577184, "step": 92945 }, { "epoch": 15.163132137030995, "grad_norm": 2.1518139839172363, "learning_rate": 8.392159391938311e-06, "loss": 0.0842, "num_input_tokens_seen": 200588160, "step": 92950 }, { "epoch": 15.16394779771615, "grad_norm": 0.04850734770298004, "learning_rate": 8.38949937825306e-06, "loss": 0.0669, "num_input_tokens_seen": 200598272, "step": 92955 }, { "epoch": 15.164763458401305, "grad_norm": 0.038461729884147644, "learning_rate": 8.386839701200661e-06, "loss": 0.0695, "num_input_tokens_seen": 200609088, "step": 92960 }, { "epoch": 15.16557911908646, "grad_norm": 0.10952873528003693, "learning_rate": 8.384180360835045e-06, "loss": 0.0981, "num_input_tokens_seen": 200619200, "step": 92965 }, { "epoch": 15.166394779771615, "grad_norm": 1.3777332305908203, "learning_rate": 8.381521357210076e-06, "loss": 0.0546, "num_input_tokens_seen": 200630048, "step": 92970 }, { "epoch": 15.16721044045677, "grad_norm": 2.7055814266204834, "learning_rate": 8.378862690379677e-06, "loss": 0.2909, "num_input_tokens_seen": 200641696, "step": 92975 }, { "epoch": 15.168026101141924, "grad_norm": 0.6870363354682922, "learning_rate": 8.376204360397693e-06, "loss": 0.045, "num_input_tokens_seen": 200652928, "step": 92980 }, { "epoch": 15.16884176182708, "grad_norm": 0.07185802608728409, "learning_rate": 8.373546367318033e-06, "loss": 0.045, "num_input_tokens_seen": 200664608, "step": 92985 }, { "epoch": 15.169657422512234, "grad_norm": 0.9642469882965088, "learning_rate": 8.37088871119453e-06, "loss": 0.0472, "num_input_tokens_seen": 200675008, "step": 92990 }, { "epoch": 15.17047308319739, "grad_norm": 2.075066566467285, "learning_rate": 8.368231392081074e-06, "loss": 0.1259, "num_input_tokens_seen": 200685728, "step": 92995 }, { "epoch": 15.171288743882545, "grad_norm": 0.07984857261180878, "learning_rate": 8.365574410031506e-06, "loss": 0.0162, "num_input_tokens_seen": 200696608, "step": 93000 }, { "epoch": 15.1721044045677, "grad_norm": 0.1526004672050476, "learning_rate": 8.362917765099676e-06, "loss": 0.0585, "num_input_tokens_seen": 200706848, "step": 93005 }, { "epoch": 15.172920065252855, "grad_norm": 0.7874409556388855, "learning_rate": 8.360261457339421e-06, "loss": 0.0994, "num_input_tokens_seen": 200719232, "step": 93010 }, { "epoch": 15.173735725938009, "grad_norm": 1.8240258693695068, "learning_rate": 8.35760548680458e-06, "loss": 0.0752, "num_input_tokens_seen": 200729920, "step": 93015 }, { "epoch": 15.174551386623165, "grad_norm": 0.733109712600708, "learning_rate": 8.354949853548974e-06, "loss": 0.1016, "num_input_tokens_seen": 200740640, "step": 93020 }, { "epoch": 15.17536704730832, "grad_norm": 0.12396856397390366, "learning_rate": 8.352294557626424e-06, "loss": 0.1304, "num_input_tokens_seen": 200751136, "step": 93025 }, { "epoch": 15.176182707993474, "grad_norm": 0.6800150275230408, "learning_rate": 8.349639599090748e-06, "loss": 0.0672, "num_input_tokens_seen": 200762208, "step": 93030 }, { "epoch": 15.17699836867863, "grad_norm": 0.16372588276863098, "learning_rate": 8.346984977995741e-06, "loss": 0.1294, "num_input_tokens_seen": 200773824, "step": 93035 }, { "epoch": 15.177814029363784, "grad_norm": 1.7258232831954956, "learning_rate": 8.344330694395216e-06, "loss": 0.0439, "num_input_tokens_seen": 200784736, "step": 93040 }, { "epoch": 15.17862969004894, "grad_norm": 2.499194383621216, "learning_rate": 8.341676748342948e-06, "loss": 0.2182, "num_input_tokens_seen": 200794784, "step": 93045 }, { "epoch": 15.179445350734095, "grad_norm": 1.60490882396698, "learning_rate": 8.339023139892748e-06, "loss": 0.0889, "num_input_tokens_seen": 200805440, "step": 93050 }, { "epoch": 15.18026101141925, "grad_norm": 1.2286943197250366, "learning_rate": 8.336369869098368e-06, "loss": 0.1482, "num_input_tokens_seen": 200817024, "step": 93055 }, { "epoch": 15.181076672104405, "grad_norm": 1.7543094158172607, "learning_rate": 8.333716936013606e-06, "loss": 0.0793, "num_input_tokens_seen": 200827648, "step": 93060 }, { "epoch": 15.181892332789559, "grad_norm": 0.36710843443870544, "learning_rate": 8.331064340692193e-06, "loss": 0.0319, "num_input_tokens_seen": 200838496, "step": 93065 }, { "epoch": 15.182707993474715, "grad_norm": 0.19415238499641418, "learning_rate": 8.328412083187928e-06, "loss": 0.0139, "num_input_tokens_seen": 200849408, "step": 93070 }, { "epoch": 15.18352365415987, "grad_norm": 2.8196988105773926, "learning_rate": 8.325760163554525e-06, "loss": 0.1193, "num_input_tokens_seen": 200861152, "step": 93075 }, { "epoch": 15.184339314845024, "grad_norm": 0.13194863498210907, "learning_rate": 8.32310858184576e-06, "loss": 0.1983, "num_input_tokens_seen": 200871872, "step": 93080 }, { "epoch": 15.18515497553018, "grad_norm": 1.7078725099563599, "learning_rate": 8.320457338115339e-06, "loss": 0.1788, "num_input_tokens_seen": 200881856, "step": 93085 }, { "epoch": 15.185970636215334, "grad_norm": 0.4579978585243225, "learning_rate": 8.317806432417025e-06, "loss": 0.0499, "num_input_tokens_seen": 200892448, "step": 93090 }, { "epoch": 15.18678629690049, "grad_norm": 1.5888516902923584, "learning_rate": 8.315155864804513e-06, "loss": 0.0448, "num_input_tokens_seen": 200903904, "step": 93095 }, { "epoch": 15.187601957585644, "grad_norm": 1.8591763973236084, "learning_rate": 8.312505635331549e-06, "loss": 0.1619, "num_input_tokens_seen": 200913248, "step": 93100 }, { "epoch": 15.1884176182708, "grad_norm": 2.0941121578216553, "learning_rate": 8.309855744051812e-06, "loss": 0.0816, "num_input_tokens_seen": 200924832, "step": 93105 }, { "epoch": 15.189233278955955, "grad_norm": 0.28590261936187744, "learning_rate": 8.307206191019035e-06, "loss": 0.0561, "num_input_tokens_seen": 200935584, "step": 93110 }, { "epoch": 15.190048939641109, "grad_norm": 0.4240911602973938, "learning_rate": 8.304556976286887e-06, "loss": 0.0214, "num_input_tokens_seen": 200945312, "step": 93115 }, { "epoch": 15.190864600326265, "grad_norm": 0.20476756989955902, "learning_rate": 8.301908099909086e-06, "loss": 0.2407, "num_input_tokens_seen": 200957216, "step": 93120 }, { "epoch": 15.191680261011419, "grad_norm": 1.1902490854263306, "learning_rate": 8.299259561939285e-06, "loss": 0.0459, "num_input_tokens_seen": 200968288, "step": 93125 }, { "epoch": 15.192495921696574, "grad_norm": 0.32141971588134766, "learning_rate": 8.296611362431184e-06, "loss": 0.1304, "num_input_tokens_seen": 200979680, "step": 93130 }, { "epoch": 15.19331158238173, "grad_norm": 1.0625429153442383, "learning_rate": 8.293963501438446e-06, "loss": 0.1636, "num_input_tokens_seen": 200989536, "step": 93135 }, { "epoch": 15.194127243066884, "grad_norm": 0.09394583851099014, "learning_rate": 8.291315979014727e-06, "loss": 0.0377, "num_input_tokens_seen": 201000256, "step": 93140 }, { "epoch": 15.19494290375204, "grad_norm": 1.1272562742233276, "learning_rate": 8.288668795213686e-06, "loss": 0.0834, "num_input_tokens_seen": 201011360, "step": 93145 }, { "epoch": 15.195758564437194, "grad_norm": 1.0789071321487427, "learning_rate": 8.286021950088973e-06, "loss": 0.0655, "num_input_tokens_seen": 201021600, "step": 93150 }, { "epoch": 15.19657422512235, "grad_norm": 1.8954402208328247, "learning_rate": 8.283375443694231e-06, "loss": 0.1211, "num_input_tokens_seen": 201032320, "step": 93155 }, { "epoch": 15.197389885807505, "grad_norm": 0.35812753438949585, "learning_rate": 8.280729276083087e-06, "loss": 0.0106, "num_input_tokens_seen": 201043456, "step": 93160 }, { "epoch": 15.198205546492659, "grad_norm": 0.3541788160800934, "learning_rate": 8.278083447309177e-06, "loss": 0.125, "num_input_tokens_seen": 201054432, "step": 93165 }, { "epoch": 15.199021207177815, "grad_norm": 0.32200682163238525, "learning_rate": 8.27543795742612e-06, "loss": 0.1023, "num_input_tokens_seen": 201066112, "step": 93170 }, { "epoch": 15.199836867862969, "grad_norm": 0.1626053750514984, "learning_rate": 8.272792806487526e-06, "loss": 0.0489, "num_input_tokens_seen": 201076608, "step": 93175 }, { "epoch": 15.200652528548124, "grad_norm": 0.05128943547606468, "learning_rate": 8.270147994547009e-06, "loss": 0.0328, "num_input_tokens_seen": 201087488, "step": 93180 }, { "epoch": 15.201468189233278, "grad_norm": 0.5029922723770142, "learning_rate": 8.267503521658165e-06, "loss": 0.14, "num_input_tokens_seen": 201098208, "step": 93185 }, { "epoch": 15.202283849918434, "grad_norm": 0.31109321117401123, "learning_rate": 8.26485938787458e-06, "loss": 0.2317, "num_input_tokens_seen": 201109504, "step": 93190 }, { "epoch": 15.20309951060359, "grad_norm": 0.021584611386060715, "learning_rate": 8.262215593249867e-06, "loss": 0.0158, "num_input_tokens_seen": 201118720, "step": 93195 }, { "epoch": 15.203915171288743, "grad_norm": 0.46753308176994324, "learning_rate": 8.259572137837571e-06, "loss": 0.0205, "num_input_tokens_seen": 201129024, "step": 93200 }, { "epoch": 15.2047308319739, "grad_norm": 0.16633452475070953, "learning_rate": 8.2569290216913e-06, "loss": 0.1004, "num_input_tokens_seen": 201140608, "step": 93205 }, { "epoch": 15.205546492659053, "grad_norm": 0.016767816618084908, "learning_rate": 8.254286244864587e-06, "loss": 0.0126, "num_input_tokens_seen": 201150944, "step": 93210 }, { "epoch": 15.206362153344209, "grad_norm": 0.16689717769622803, "learning_rate": 8.251643807411021e-06, "loss": 0.1239, "num_input_tokens_seen": 201162400, "step": 93215 }, { "epoch": 15.207177814029365, "grad_norm": 0.05510816350579262, "learning_rate": 8.249001709384125e-06, "loss": 0.0349, "num_input_tokens_seen": 201173088, "step": 93220 }, { "epoch": 15.207993474714518, "grad_norm": 0.1724807471036911, "learning_rate": 8.246359950837473e-06, "loss": 0.018, "num_input_tokens_seen": 201183808, "step": 93225 }, { "epoch": 15.208809135399674, "grad_norm": 2.6851155757904053, "learning_rate": 8.243718531824575e-06, "loss": 0.1551, "num_input_tokens_seen": 201194400, "step": 93230 }, { "epoch": 15.209624796084828, "grad_norm": 0.36895519495010376, "learning_rate": 8.241077452398988e-06, "loss": 0.2549, "num_input_tokens_seen": 201204640, "step": 93235 }, { "epoch": 15.210440456769984, "grad_norm": 0.49094319343566895, "learning_rate": 8.238436712614225e-06, "loss": 0.2378, "num_input_tokens_seen": 201216000, "step": 93240 }, { "epoch": 15.21125611745514, "grad_norm": 2.0564136505126953, "learning_rate": 8.235796312523808e-06, "loss": 0.042, "num_input_tokens_seen": 201227360, "step": 93245 }, { "epoch": 15.212071778140293, "grad_norm": 0.6626622676849365, "learning_rate": 8.233156252181243e-06, "loss": 0.0587, "num_input_tokens_seen": 201238624, "step": 93250 }, { "epoch": 15.21288743882545, "grad_norm": 1.2494466304779053, "learning_rate": 8.230516531640037e-06, "loss": 0.0714, "num_input_tokens_seen": 201249632, "step": 93255 }, { "epoch": 15.213703099510603, "grad_norm": 0.3650432527065277, "learning_rate": 8.227877150953689e-06, "loss": 0.1358, "num_input_tokens_seen": 201259872, "step": 93260 }, { "epoch": 15.214518760195759, "grad_norm": 0.1411105841398239, "learning_rate": 8.225238110175684e-06, "loss": 0.1717, "num_input_tokens_seen": 201270272, "step": 93265 }, { "epoch": 15.215334420880913, "grad_norm": 1.5588955879211426, "learning_rate": 8.222599409359513e-06, "loss": 0.1609, "num_input_tokens_seen": 201280960, "step": 93270 }, { "epoch": 15.216150081566068, "grad_norm": 2.7356255054473877, "learning_rate": 8.219961048558646e-06, "loss": 0.153, "num_input_tokens_seen": 201291488, "step": 93275 }, { "epoch": 15.216965742251224, "grad_norm": 0.8605077862739563, "learning_rate": 8.217323027826557e-06, "loss": 0.0291, "num_input_tokens_seen": 201302304, "step": 93280 }, { "epoch": 15.217781402936378, "grad_norm": 0.5074530243873596, "learning_rate": 8.214685347216705e-06, "loss": 0.0324, "num_input_tokens_seen": 201313760, "step": 93285 }, { "epoch": 15.218597063621534, "grad_norm": 0.748077392578125, "learning_rate": 8.21204800678255e-06, "loss": 0.0464, "num_input_tokens_seen": 201324384, "step": 93290 }, { "epoch": 15.219412724306688, "grad_norm": 1.8817216157913208, "learning_rate": 8.20941100657754e-06, "loss": 0.1052, "num_input_tokens_seen": 201336000, "step": 93295 }, { "epoch": 15.220228384991843, "grad_norm": 0.06329457461833954, "learning_rate": 8.206774346655113e-06, "loss": 0.1468, "num_input_tokens_seen": 201346752, "step": 93300 }, { "epoch": 15.221044045676999, "grad_norm": 1.3528074026107788, "learning_rate": 8.204138027068709e-06, "loss": 0.0437, "num_input_tokens_seen": 201356352, "step": 93305 }, { "epoch": 15.221859706362153, "grad_norm": 0.9209125638008118, "learning_rate": 8.201502047871756e-06, "loss": 0.09, "num_input_tokens_seen": 201367840, "step": 93310 }, { "epoch": 15.222675367047309, "grad_norm": 0.11325350403785706, "learning_rate": 8.198866409117673e-06, "loss": 0.0485, "num_input_tokens_seen": 201378400, "step": 93315 }, { "epoch": 15.223491027732463, "grad_norm": 2.1768100261688232, "learning_rate": 8.196231110859875e-06, "loss": 0.2698, "num_input_tokens_seen": 201388672, "step": 93320 }, { "epoch": 15.224306688417618, "grad_norm": 0.7465216517448425, "learning_rate": 8.193596153151774e-06, "loss": 0.1821, "num_input_tokens_seen": 201399648, "step": 93325 }, { "epoch": 15.225122349102774, "grad_norm": 1.1550551652908325, "learning_rate": 8.190961536046766e-06, "loss": 0.1652, "num_input_tokens_seen": 201410208, "step": 93330 }, { "epoch": 15.225938009787928, "grad_norm": 0.20001102983951569, "learning_rate": 8.188327259598239e-06, "loss": 0.0178, "num_input_tokens_seen": 201420512, "step": 93335 }, { "epoch": 15.226753670473084, "grad_norm": 0.23444047570228577, "learning_rate": 8.185693323859593e-06, "loss": 0.0218, "num_input_tokens_seen": 201430944, "step": 93340 }, { "epoch": 15.227569331158238, "grad_norm": 0.1701628565788269, "learning_rate": 8.183059728884205e-06, "loss": 0.0192, "num_input_tokens_seen": 201440544, "step": 93345 }, { "epoch": 15.228384991843393, "grad_norm": 1.349522590637207, "learning_rate": 8.180426474725441e-06, "loss": 0.2538, "num_input_tokens_seen": 201451232, "step": 93350 }, { "epoch": 15.229200652528547, "grad_norm": 0.10051938146352768, "learning_rate": 8.177793561436676e-06, "loss": 0.1709, "num_input_tokens_seen": 201461984, "step": 93355 }, { "epoch": 15.230016313213703, "grad_norm": 0.6923608183860779, "learning_rate": 8.175160989071262e-06, "loss": 0.0574, "num_input_tokens_seen": 201473504, "step": 93360 }, { "epoch": 15.230831973898859, "grad_norm": 0.7865335941314697, "learning_rate": 8.172528757682557e-06, "loss": 0.0634, "num_input_tokens_seen": 201484480, "step": 93365 }, { "epoch": 15.231647634584013, "grad_norm": 0.014535106718540192, "learning_rate": 8.169896867323903e-06, "loss": 0.1788, "num_input_tokens_seen": 201495104, "step": 93370 }, { "epoch": 15.232463295269168, "grad_norm": 0.8612583875656128, "learning_rate": 8.167265318048639e-06, "loss": 0.2587, "num_input_tokens_seen": 201506784, "step": 93375 }, { "epoch": 15.233278955954322, "grad_norm": 0.9130905866622925, "learning_rate": 8.164634109910097e-06, "loss": 0.2087, "num_input_tokens_seen": 201517952, "step": 93380 }, { "epoch": 15.234094616639478, "grad_norm": 1.3640906810760498, "learning_rate": 8.162003242961601e-06, "loss": 0.2597, "num_input_tokens_seen": 201528800, "step": 93385 }, { "epoch": 15.234910277324634, "grad_norm": 0.07140232622623444, "learning_rate": 8.159372717256472e-06, "loss": 0.0188, "num_input_tokens_seen": 201539840, "step": 93390 }, { "epoch": 15.235725938009788, "grad_norm": 0.7636833190917969, "learning_rate": 8.15674253284802e-06, "loss": 0.1587, "num_input_tokens_seen": 201550816, "step": 93395 }, { "epoch": 15.236541598694943, "grad_norm": 0.01892511546611786, "learning_rate": 8.154112689789544e-06, "loss": 0.1371, "num_input_tokens_seen": 201560288, "step": 93400 }, { "epoch": 15.237357259380097, "grad_norm": 1.7242431640625, "learning_rate": 8.151483188134346e-06, "loss": 0.1653, "num_input_tokens_seen": 201571264, "step": 93405 }, { "epoch": 15.238172920065253, "grad_norm": 0.9428847432136536, "learning_rate": 8.148854027935716e-06, "loss": 0.1486, "num_input_tokens_seen": 201582368, "step": 93410 }, { "epoch": 15.238988580750409, "grad_norm": 0.10551971942186356, "learning_rate": 8.146225209246935e-06, "loss": 0.0099, "num_input_tokens_seen": 201595392, "step": 93415 }, { "epoch": 15.239804241435563, "grad_norm": 0.040755145251750946, "learning_rate": 8.143596732121281e-06, "loss": 0.0175, "num_input_tokens_seen": 201605568, "step": 93420 }, { "epoch": 15.240619902120718, "grad_norm": 0.7913221120834351, "learning_rate": 8.140968596612025e-06, "loss": 0.0393, "num_input_tokens_seen": 201616000, "step": 93425 }, { "epoch": 15.241435562805872, "grad_norm": 1.908500075340271, "learning_rate": 8.138340802772426e-06, "loss": 0.0639, "num_input_tokens_seen": 201626880, "step": 93430 }, { "epoch": 15.242251223491028, "grad_norm": 0.0752779021859169, "learning_rate": 8.135713350655741e-06, "loss": 0.0569, "num_input_tokens_seen": 201637888, "step": 93435 }, { "epoch": 15.243066884176184, "grad_norm": 0.8823435306549072, "learning_rate": 8.13308624031522e-06, "loss": 0.1196, "num_input_tokens_seen": 201648960, "step": 93440 }, { "epoch": 15.243882544861338, "grad_norm": 0.4566670358181, "learning_rate": 8.130459471804095e-06, "loss": 0.1901, "num_input_tokens_seen": 201660544, "step": 93445 }, { "epoch": 15.244698205546493, "grad_norm": 1.0050410032272339, "learning_rate": 8.127833045175624e-06, "loss": 0.0293, "num_input_tokens_seen": 201672416, "step": 93450 }, { "epoch": 15.245513866231647, "grad_norm": 0.19991064071655273, "learning_rate": 8.125206960483007e-06, "loss": 0.1826, "num_input_tokens_seen": 201683040, "step": 93455 }, { "epoch": 15.246329526916803, "grad_norm": 0.3217131495475769, "learning_rate": 8.12258121777949e-06, "loss": 0.0293, "num_input_tokens_seen": 201695104, "step": 93460 }, { "epoch": 15.247145187601957, "grad_norm": 1.13261079788208, "learning_rate": 8.119955817118263e-06, "loss": 0.1069, "num_input_tokens_seen": 201704928, "step": 93465 }, { "epoch": 15.247960848287113, "grad_norm": 1.9528589248657227, "learning_rate": 8.117330758552558e-06, "loss": 0.0582, "num_input_tokens_seen": 201716000, "step": 93470 }, { "epoch": 15.248776508972268, "grad_norm": 0.4908440411090851, "learning_rate": 8.114706042135547e-06, "loss": 0.0457, "num_input_tokens_seen": 201726016, "step": 93475 }, { "epoch": 15.249592169657422, "grad_norm": 0.05734090879559517, "learning_rate": 8.112081667920449e-06, "loss": 0.1052, "num_input_tokens_seen": 201736032, "step": 93480 }, { "epoch": 15.250407830342578, "grad_norm": 0.1775030791759491, "learning_rate": 8.109457635960438e-06, "loss": 0.2863, "num_input_tokens_seen": 201746592, "step": 93485 }, { "epoch": 15.251223491027732, "grad_norm": 0.03957965597510338, "learning_rate": 8.106833946308697e-06, "loss": 0.1108, "num_input_tokens_seen": 201756768, "step": 93490 }, { "epoch": 15.252039151712887, "grad_norm": 1.0981523990631104, "learning_rate": 8.104210599018394e-06, "loss": 0.0251, "num_input_tokens_seen": 201768672, "step": 93495 }, { "epoch": 15.252854812398043, "grad_norm": 1.1685606241226196, "learning_rate": 8.101587594142699e-06, "loss": 0.0751, "num_input_tokens_seen": 201779200, "step": 93500 }, { "epoch": 15.253670473083197, "grad_norm": 0.9955950975418091, "learning_rate": 8.098964931734768e-06, "loss": 0.0225, "num_input_tokens_seen": 201790016, "step": 93505 }, { "epoch": 15.254486133768353, "grad_norm": 0.5093445777893066, "learning_rate": 8.096342611847755e-06, "loss": 0.0676, "num_input_tokens_seen": 201800736, "step": 93510 }, { "epoch": 15.255301794453507, "grad_norm": 0.04979129508137703, "learning_rate": 8.093720634534798e-06, "loss": 0.1737, "num_input_tokens_seen": 201811808, "step": 93515 }, { "epoch": 15.256117455138662, "grad_norm": 0.449137419462204, "learning_rate": 8.091098999849042e-06, "loss": 0.1253, "num_input_tokens_seen": 201822624, "step": 93520 }, { "epoch": 15.256933115823816, "grad_norm": 0.031817417591810226, "learning_rate": 8.088477707843617e-06, "loss": 0.0698, "num_input_tokens_seen": 201833696, "step": 93525 }, { "epoch": 15.257748776508972, "grad_norm": 0.6715468168258667, "learning_rate": 8.085856758571645e-06, "loss": 0.0186, "num_input_tokens_seen": 201844640, "step": 93530 }, { "epoch": 15.258564437194128, "grad_norm": 0.062332045286893845, "learning_rate": 8.08323615208624e-06, "loss": 0.0422, "num_input_tokens_seen": 201855520, "step": 93535 }, { "epoch": 15.259380097879282, "grad_norm": 0.22679485380649567, "learning_rate": 8.080615888440515e-06, "loss": 0.0685, "num_input_tokens_seen": 201864352, "step": 93540 }, { "epoch": 15.260195758564437, "grad_norm": 1.137393593788147, "learning_rate": 8.077995967687574e-06, "loss": 0.0356, "num_input_tokens_seen": 201874752, "step": 93545 }, { "epoch": 15.261011419249591, "grad_norm": 0.04081837460398674, "learning_rate": 8.075376389880504e-06, "loss": 0.1232, "num_input_tokens_seen": 201885664, "step": 93550 }, { "epoch": 15.261827079934747, "grad_norm": 0.040859147906303406, "learning_rate": 8.07275715507242e-06, "loss": 0.0226, "num_input_tokens_seen": 201898112, "step": 93555 }, { "epoch": 15.262642740619903, "grad_norm": 0.06338027864694595, "learning_rate": 8.070138263316365e-06, "loss": 0.1862, "num_input_tokens_seen": 201909312, "step": 93560 }, { "epoch": 15.263458401305057, "grad_norm": 1.6942170858383179, "learning_rate": 8.067519714665456e-06, "loss": 0.0586, "num_input_tokens_seen": 201921216, "step": 93565 }, { "epoch": 15.264274061990212, "grad_norm": 1.2319467067718506, "learning_rate": 8.06490150917272e-06, "loss": 0.1115, "num_input_tokens_seen": 201931104, "step": 93570 }, { "epoch": 15.265089722675366, "grad_norm": 0.09372848272323608, "learning_rate": 8.062283646891258e-06, "loss": 0.0211, "num_input_tokens_seen": 201940928, "step": 93575 }, { "epoch": 15.265905383360522, "grad_norm": 0.04013992100954056, "learning_rate": 8.059666127874088e-06, "loss": 0.1147, "num_input_tokens_seen": 201952352, "step": 93580 }, { "epoch": 15.266721044045678, "grad_norm": 0.12383421510457993, "learning_rate": 8.05704895217429e-06, "loss": 0.0241, "num_input_tokens_seen": 201963552, "step": 93585 }, { "epoch": 15.267536704730832, "grad_norm": 0.1731344610452652, "learning_rate": 8.054432119844874e-06, "loss": 0.0439, "num_input_tokens_seen": 201974240, "step": 93590 }, { "epoch": 15.268352365415987, "grad_norm": 2.1631882190704346, "learning_rate": 8.0518156309389e-06, "loss": 0.266, "num_input_tokens_seen": 201983328, "step": 93595 }, { "epoch": 15.269168026101141, "grad_norm": 0.8343313932418823, "learning_rate": 8.04919948550937e-06, "loss": 0.0299, "num_input_tokens_seen": 201993824, "step": 93600 }, { "epoch": 15.269983686786297, "grad_norm": 0.026301903650164604, "learning_rate": 8.046583683609333e-06, "loss": 0.0506, "num_input_tokens_seen": 202004384, "step": 93605 }, { "epoch": 15.270799347471453, "grad_norm": 1.8828397989273071, "learning_rate": 8.043968225291765e-06, "loss": 0.1326, "num_input_tokens_seen": 202014592, "step": 93610 }, { "epoch": 15.271615008156607, "grad_norm": 0.08804808557033539, "learning_rate": 8.04135311060971e-06, "loss": 0.0509, "num_input_tokens_seen": 202026336, "step": 93615 }, { "epoch": 15.272430668841762, "grad_norm": 0.6667740345001221, "learning_rate": 8.038738339616131e-06, "loss": 0.1943, "num_input_tokens_seen": 202037760, "step": 93620 }, { "epoch": 15.273246329526916, "grad_norm": 0.3765020966529846, "learning_rate": 8.036123912364043e-06, "loss": 0.2468, "num_input_tokens_seen": 202048576, "step": 93625 }, { "epoch": 15.274061990212072, "grad_norm": 0.38480913639068604, "learning_rate": 8.033509828906427e-06, "loss": 0.0309, "num_input_tokens_seen": 202059104, "step": 93630 }, { "epoch": 15.274877650897226, "grad_norm": 1.0503863096237183, "learning_rate": 8.030896089296255e-06, "loss": 0.1499, "num_input_tokens_seen": 202070688, "step": 93635 }, { "epoch": 15.275693311582382, "grad_norm": 0.0832475870847702, "learning_rate": 8.028282693586503e-06, "loss": 0.0098, "num_input_tokens_seen": 202080352, "step": 93640 }, { "epoch": 15.276508972267537, "grad_norm": 0.1355476975440979, "learning_rate": 8.02566964183013e-06, "loss": 0.0213, "num_input_tokens_seen": 202089440, "step": 93645 }, { "epoch": 15.277324632952691, "grad_norm": 0.5260346531867981, "learning_rate": 8.023056934080098e-06, "loss": 0.1213, "num_input_tokens_seen": 202099840, "step": 93650 }, { "epoch": 15.278140293637847, "grad_norm": 0.04090237617492676, "learning_rate": 8.020444570389351e-06, "loss": 0.0467, "num_input_tokens_seen": 202111040, "step": 93655 }, { "epoch": 15.278955954323001, "grad_norm": 0.042570166289806366, "learning_rate": 8.017832550810835e-06, "loss": 0.0784, "num_input_tokens_seen": 202121760, "step": 93660 }, { "epoch": 15.279771615008157, "grad_norm": 1.8687690496444702, "learning_rate": 8.01522087539749e-06, "loss": 0.1118, "num_input_tokens_seen": 202132352, "step": 93665 }, { "epoch": 15.280587275693312, "grad_norm": 1.6435567140579224, "learning_rate": 8.012609544202234e-06, "loss": 0.1659, "num_input_tokens_seen": 202143840, "step": 93670 }, { "epoch": 15.281402936378466, "grad_norm": 0.03656979650259018, "learning_rate": 8.009998557277993e-06, "loss": 0.1814, "num_input_tokens_seen": 202154720, "step": 93675 }, { "epoch": 15.282218597063622, "grad_norm": 1.4559531211853027, "learning_rate": 8.0073879146777e-06, "loss": 0.0366, "num_input_tokens_seen": 202165536, "step": 93680 }, { "epoch": 15.283034257748776, "grad_norm": 1.1193851232528687, "learning_rate": 8.004777616454228e-06, "loss": 0.0445, "num_input_tokens_seen": 202176128, "step": 93685 }, { "epoch": 15.283849918433932, "grad_norm": 0.6306520700454712, "learning_rate": 8.002167662660518e-06, "loss": 0.0156, "num_input_tokens_seen": 202187072, "step": 93690 }, { "epoch": 15.284665579119087, "grad_norm": 0.9564418792724609, "learning_rate": 7.999558053349424e-06, "loss": 0.0652, "num_input_tokens_seen": 202198144, "step": 93695 }, { "epoch": 15.285481239804241, "grad_norm": 2.120044231414795, "learning_rate": 7.996948788573872e-06, "loss": 0.1532, "num_input_tokens_seen": 202208576, "step": 93700 }, { "epoch": 15.286296900489397, "grad_norm": 2.8260104656219482, "learning_rate": 7.994339868386704e-06, "loss": 0.1629, "num_input_tokens_seen": 202219232, "step": 93705 }, { "epoch": 15.28711256117455, "grad_norm": 1.8093574047088623, "learning_rate": 7.991731292840828e-06, "loss": 0.1303, "num_input_tokens_seen": 202229024, "step": 93710 }, { "epoch": 15.287928221859707, "grad_norm": 0.21369624137878418, "learning_rate": 7.989123061989079e-06, "loss": 0.0663, "num_input_tokens_seen": 202239040, "step": 93715 }, { "epoch": 15.28874388254486, "grad_norm": 0.05841187760233879, "learning_rate": 7.986515175884347e-06, "loss": 0.0201, "num_input_tokens_seen": 202250400, "step": 93720 }, { "epoch": 15.289559543230016, "grad_norm": 0.03251504898071289, "learning_rate": 7.983907634579451e-06, "loss": 0.1236, "num_input_tokens_seen": 202260960, "step": 93725 }, { "epoch": 15.290375203915172, "grad_norm": 0.06156063452363014, "learning_rate": 7.981300438127271e-06, "loss": 0.0525, "num_input_tokens_seen": 202272448, "step": 93730 }, { "epoch": 15.291190864600326, "grad_norm": 0.6542724370956421, "learning_rate": 7.97869358658061e-06, "loss": 0.0217, "num_input_tokens_seen": 202282336, "step": 93735 }, { "epoch": 15.292006525285482, "grad_norm": 1.9235308170318604, "learning_rate": 7.976087079992334e-06, "loss": 0.0615, "num_input_tokens_seen": 202293888, "step": 93740 }, { "epoch": 15.292822185970635, "grad_norm": 0.06705636531114578, "learning_rate": 7.97348091841523e-06, "loss": 0.1146, "num_input_tokens_seen": 202303040, "step": 93745 }, { "epoch": 15.293637846655791, "grad_norm": 1.5425236225128174, "learning_rate": 7.970875101902153e-06, "loss": 0.3012, "num_input_tokens_seen": 202313536, "step": 93750 }, { "epoch": 15.294453507340947, "grad_norm": 0.4024849534034729, "learning_rate": 7.968269630505881e-06, "loss": 0.2322, "num_input_tokens_seen": 202324480, "step": 93755 }, { "epoch": 15.2952691680261, "grad_norm": 2.3990581035614014, "learning_rate": 7.965664504279238e-06, "loss": 0.1362, "num_input_tokens_seen": 202334176, "step": 93760 }, { "epoch": 15.296084828711257, "grad_norm": 3.7376809120178223, "learning_rate": 7.963059723275013e-06, "loss": 0.2157, "num_input_tokens_seen": 202344480, "step": 93765 }, { "epoch": 15.29690048939641, "grad_norm": 0.12240627408027649, "learning_rate": 7.960455287545996e-06, "loss": 0.1081, "num_input_tokens_seen": 202355712, "step": 93770 }, { "epoch": 15.297716150081566, "grad_norm": 0.4162333011627197, "learning_rate": 7.957851197144967e-06, "loss": 0.1277, "num_input_tokens_seen": 202366880, "step": 93775 }, { "epoch": 15.298531810766722, "grad_norm": 0.04704819992184639, "learning_rate": 7.955247452124706e-06, "loss": 0.0729, "num_input_tokens_seen": 202377984, "step": 93780 }, { "epoch": 15.299347471451876, "grad_norm": 1.1374845504760742, "learning_rate": 7.952644052537975e-06, "loss": 0.2221, "num_input_tokens_seen": 202387328, "step": 93785 }, { "epoch": 15.300163132137031, "grad_norm": 3.9410080909729004, "learning_rate": 7.950040998437542e-06, "loss": 0.342, "num_input_tokens_seen": 202398304, "step": 93790 }, { "epoch": 15.300978792822185, "grad_norm": 0.46201837062835693, "learning_rate": 7.947438289876155e-06, "loss": 0.0952, "num_input_tokens_seen": 202408000, "step": 93795 }, { "epoch": 15.301794453507341, "grad_norm": 1.4726004600524902, "learning_rate": 7.944835926906563e-06, "loss": 0.2018, "num_input_tokens_seen": 202419424, "step": 93800 }, { "epoch": 15.302610114192497, "grad_norm": 0.021458493545651436, "learning_rate": 7.94223390958151e-06, "loss": 0.0616, "num_input_tokens_seen": 202430912, "step": 93805 }, { "epoch": 15.30342577487765, "grad_norm": 0.32504144310951233, "learning_rate": 7.939632237953724e-06, "loss": 0.0183, "num_input_tokens_seen": 202442368, "step": 93810 }, { "epoch": 15.304241435562806, "grad_norm": 0.027017204090952873, "learning_rate": 7.937030912075932e-06, "loss": 0.072, "num_input_tokens_seen": 202453376, "step": 93815 }, { "epoch": 15.30505709624796, "grad_norm": 0.7550625801086426, "learning_rate": 7.934429932000847e-06, "loss": 0.0553, "num_input_tokens_seen": 202462816, "step": 93820 }, { "epoch": 15.305872756933116, "grad_norm": 0.5387651324272156, "learning_rate": 7.931829297781207e-06, "loss": 0.0282, "num_input_tokens_seen": 202473856, "step": 93825 }, { "epoch": 15.30668841761827, "grad_norm": 0.16244959831237793, "learning_rate": 7.929229009469682e-06, "loss": 0.076, "num_input_tokens_seen": 202486304, "step": 93830 }, { "epoch": 15.307504078303426, "grad_norm": 2.1695456504821777, "learning_rate": 7.926629067119005e-06, "loss": 0.2387, "num_input_tokens_seen": 202496800, "step": 93835 }, { "epoch": 15.308319738988581, "grad_norm": 0.3255048394203186, "learning_rate": 7.92402947078183e-06, "loss": 0.074, "num_input_tokens_seen": 202507648, "step": 93840 }, { "epoch": 15.309135399673735, "grad_norm": 2.8094542026519775, "learning_rate": 7.921430220510876e-06, "loss": 0.2446, "num_input_tokens_seen": 202518592, "step": 93845 }, { "epoch": 15.309951060358891, "grad_norm": 0.15739905834197998, "learning_rate": 7.918831316358789e-06, "loss": 0.1941, "num_input_tokens_seen": 202528928, "step": 93850 }, { "epoch": 15.310766721044045, "grad_norm": 0.2576697766780853, "learning_rate": 7.916232758378269e-06, "loss": 0.1083, "num_input_tokens_seen": 202538720, "step": 93855 }, { "epoch": 15.3115823817292, "grad_norm": 0.1732560247182846, "learning_rate": 7.913634546621951e-06, "loss": 0.1214, "num_input_tokens_seen": 202549696, "step": 93860 }, { "epoch": 15.312398042414356, "grad_norm": 2.4885454177856445, "learning_rate": 7.91103668114252e-06, "loss": 0.0761, "num_input_tokens_seen": 202559520, "step": 93865 }, { "epoch": 15.31321370309951, "grad_norm": 1.1137075424194336, "learning_rate": 7.908439161992592e-06, "loss": 0.1545, "num_input_tokens_seen": 202568832, "step": 93870 }, { "epoch": 15.314029363784666, "grad_norm": 0.31081804633140564, "learning_rate": 7.905841989224836e-06, "loss": 0.1267, "num_input_tokens_seen": 202578912, "step": 93875 }, { "epoch": 15.31484502446982, "grad_norm": 0.336312472820282, "learning_rate": 7.903245162891875e-06, "loss": 0.0983, "num_input_tokens_seen": 202589632, "step": 93880 }, { "epoch": 15.315660685154976, "grad_norm": 0.9979838132858276, "learning_rate": 7.900648683046344e-06, "loss": 0.0643, "num_input_tokens_seen": 202600256, "step": 93885 }, { "epoch": 15.31647634584013, "grad_norm": 0.03358616307377815, "learning_rate": 7.898052549740856e-06, "loss": 0.0437, "num_input_tokens_seen": 202610976, "step": 93890 }, { "epoch": 15.317292006525285, "grad_norm": 0.07552336156368256, "learning_rate": 7.89545676302803e-06, "loss": 0.092, "num_input_tokens_seen": 202621600, "step": 93895 }, { "epoch": 15.318107667210441, "grad_norm": 0.14049026370048523, "learning_rate": 7.892861322960471e-06, "loss": 0.0353, "num_input_tokens_seen": 202633120, "step": 93900 }, { "epoch": 15.318923327895595, "grad_norm": 2.263604164123535, "learning_rate": 7.890266229590779e-06, "loss": 0.0473, "num_input_tokens_seen": 202643904, "step": 93905 }, { "epoch": 15.31973898858075, "grad_norm": 1.0618757009506226, "learning_rate": 7.887671482971549e-06, "loss": 0.1093, "num_input_tokens_seen": 202654944, "step": 93910 }, { "epoch": 15.320554649265905, "grad_norm": 1.382005214691162, "learning_rate": 7.885077083155363e-06, "loss": 0.0303, "num_input_tokens_seen": 202665120, "step": 93915 }, { "epoch": 15.32137030995106, "grad_norm": 3.1932132244110107, "learning_rate": 7.882483030194801e-06, "loss": 0.1024, "num_input_tokens_seen": 202676640, "step": 93920 }, { "epoch": 15.322185970636216, "grad_norm": 0.24342751502990723, "learning_rate": 7.879889324142437e-06, "loss": 0.0299, "num_input_tokens_seen": 202687744, "step": 93925 }, { "epoch": 15.32300163132137, "grad_norm": 0.12768927216529846, "learning_rate": 7.877295965050832e-06, "loss": 0.0111, "num_input_tokens_seen": 202698208, "step": 93930 }, { "epoch": 15.323817292006526, "grad_norm": 1.8287526369094849, "learning_rate": 7.874702952972549e-06, "loss": 0.1193, "num_input_tokens_seen": 202708576, "step": 93935 }, { "epoch": 15.32463295269168, "grad_norm": 1.7594454288482666, "learning_rate": 7.872110287960132e-06, "loss": 0.2017, "num_input_tokens_seen": 202718816, "step": 93940 }, { "epoch": 15.325448613376835, "grad_norm": 0.5058555006980896, "learning_rate": 7.86951797006613e-06, "loss": 0.1414, "num_input_tokens_seen": 202728608, "step": 93945 }, { "epoch": 15.326264274061991, "grad_norm": 0.18342670798301697, "learning_rate": 7.86692599934308e-06, "loss": 0.0214, "num_input_tokens_seen": 202738912, "step": 93950 }, { "epoch": 15.327079934747145, "grad_norm": 0.975877583026886, "learning_rate": 7.864334375843508e-06, "loss": 0.0184, "num_input_tokens_seen": 202748736, "step": 93955 }, { "epoch": 15.3278955954323, "grad_norm": 0.04284770414233208, "learning_rate": 7.861743099619939e-06, "loss": 0.2748, "num_input_tokens_seen": 202760416, "step": 93960 }, { "epoch": 15.328711256117455, "grad_norm": 2.4068734645843506, "learning_rate": 7.859152170724879e-06, "loss": 0.2049, "num_input_tokens_seen": 202771456, "step": 93965 }, { "epoch": 15.32952691680261, "grad_norm": 1.9719090461730957, "learning_rate": 7.856561589210858e-06, "loss": 0.1252, "num_input_tokens_seen": 202782880, "step": 93970 }, { "epoch": 15.330342577487766, "grad_norm": 0.5599901676177979, "learning_rate": 7.853971355130353e-06, "loss": 0.1164, "num_input_tokens_seen": 202793824, "step": 93975 }, { "epoch": 15.33115823817292, "grad_norm": 0.03288135305047035, "learning_rate": 7.851381468535874e-06, "loss": 0.0183, "num_input_tokens_seen": 202803808, "step": 93980 }, { "epoch": 15.331973898858076, "grad_norm": 0.05484289675951004, "learning_rate": 7.848791929479906e-06, "loss": 0.053, "num_input_tokens_seen": 202813664, "step": 93985 }, { "epoch": 15.33278955954323, "grad_norm": 0.5195763111114502, "learning_rate": 7.846202738014926e-06, "loss": 0.1695, "num_input_tokens_seen": 202824576, "step": 93990 }, { "epoch": 15.333605220228385, "grad_norm": 0.04690530523657799, "learning_rate": 7.84361389419341e-06, "loss": 0.0154, "num_input_tokens_seen": 202835104, "step": 93995 }, { "epoch": 15.33442088091354, "grad_norm": 0.037223100662231445, "learning_rate": 7.841025398067823e-06, "loss": 0.0742, "num_input_tokens_seen": 202844704, "step": 94000 }, { "epoch": 15.335236541598695, "grad_norm": 0.024426298215985298, "learning_rate": 7.838437249690624e-06, "loss": 0.0322, "num_input_tokens_seen": 202855584, "step": 94005 }, { "epoch": 15.33605220228385, "grad_norm": 0.021885927766561508, "learning_rate": 7.835849449114266e-06, "loss": 0.0419, "num_input_tokens_seen": 202865248, "step": 94010 }, { "epoch": 15.336867862969005, "grad_norm": 1.4396501779556274, "learning_rate": 7.833261996391192e-06, "loss": 0.1566, "num_input_tokens_seen": 202875936, "step": 94015 }, { "epoch": 15.33768352365416, "grad_norm": 1.2765296697616577, "learning_rate": 7.83067489157384e-06, "loss": 0.0869, "num_input_tokens_seen": 202887456, "step": 94020 }, { "epoch": 15.338499184339314, "grad_norm": 0.1859128475189209, "learning_rate": 7.828088134714642e-06, "loss": 0.0182, "num_input_tokens_seen": 202898240, "step": 94025 }, { "epoch": 15.33931484502447, "grad_norm": 0.6956703662872314, "learning_rate": 7.825501725866025e-06, "loss": 0.0527, "num_input_tokens_seen": 202909824, "step": 94030 }, { "epoch": 15.340130505709626, "grad_norm": 0.05798427388072014, "learning_rate": 7.822915665080403e-06, "loss": 0.0062, "num_input_tokens_seen": 202920032, "step": 94035 }, { "epoch": 15.34094616639478, "grad_norm": 0.1100151538848877, "learning_rate": 7.820329952410183e-06, "loss": 0.0468, "num_input_tokens_seen": 202931456, "step": 94040 }, { "epoch": 15.341761827079935, "grad_norm": 0.1274537444114685, "learning_rate": 7.81774458790777e-06, "loss": 0.0098, "num_input_tokens_seen": 202940640, "step": 94045 }, { "epoch": 15.34257748776509, "grad_norm": 0.02626030705869198, "learning_rate": 7.81515957162556e-06, "loss": 0.0324, "num_input_tokens_seen": 202951360, "step": 94050 }, { "epoch": 15.343393148450245, "grad_norm": 1.8629871606826782, "learning_rate": 7.812574903615947e-06, "loss": 0.2353, "num_input_tokens_seen": 202962400, "step": 94055 }, { "epoch": 15.3442088091354, "grad_norm": 0.7912946939468384, "learning_rate": 7.809990583931301e-06, "loss": 0.0986, "num_input_tokens_seen": 202973568, "step": 94060 }, { "epoch": 15.345024469820554, "grad_norm": 0.06882259994745255, "learning_rate": 7.807406612624008e-06, "loss": 0.1593, "num_input_tokens_seen": 202984896, "step": 94065 }, { "epoch": 15.34584013050571, "grad_norm": 1.7155359983444214, "learning_rate": 7.804822989746427e-06, "loss": 0.1823, "num_input_tokens_seen": 202995072, "step": 94070 }, { "epoch": 15.346655791190864, "grad_norm": 0.05668947845697403, "learning_rate": 7.802239715350923e-06, "loss": 0.0457, "num_input_tokens_seen": 203006528, "step": 94075 }, { "epoch": 15.34747145187602, "grad_norm": 1.960279941558838, "learning_rate": 7.799656789489848e-06, "loss": 0.0739, "num_input_tokens_seen": 203016320, "step": 94080 }, { "epoch": 15.348287112561174, "grad_norm": 0.4500526189804077, "learning_rate": 7.797074212215539e-06, "loss": 0.0984, "num_input_tokens_seen": 203026272, "step": 94085 }, { "epoch": 15.34910277324633, "grad_norm": 0.14613409340381622, "learning_rate": 7.794491983580362e-06, "loss": 0.0898, "num_input_tokens_seen": 203036736, "step": 94090 }, { "epoch": 15.349918433931485, "grad_norm": 0.9285050630569458, "learning_rate": 7.791910103636615e-06, "loss": 0.1435, "num_input_tokens_seen": 203046048, "step": 94095 }, { "epoch": 15.350734094616639, "grad_norm": 2.8872156143188477, "learning_rate": 7.789328572436658e-06, "loss": 0.1376, "num_input_tokens_seen": 203053952, "step": 94100 }, { "epoch": 15.351549755301795, "grad_norm": 1.6978923082351685, "learning_rate": 7.786747390032773e-06, "loss": 0.1775, "num_input_tokens_seen": 203064768, "step": 94105 }, { "epoch": 15.352365415986949, "grad_norm": 3.5766732692718506, "learning_rate": 7.784166556477296e-06, "loss": 0.3425, "num_input_tokens_seen": 203075712, "step": 94110 }, { "epoch": 15.353181076672104, "grad_norm": 1.0555015802383423, "learning_rate": 7.781586071822523e-06, "loss": 0.0688, "num_input_tokens_seen": 203087168, "step": 94115 }, { "epoch": 15.35399673735726, "grad_norm": 0.042643602937459946, "learning_rate": 7.779005936120754e-06, "loss": 0.0297, "num_input_tokens_seen": 203096384, "step": 94120 }, { "epoch": 15.354812398042414, "grad_norm": 0.04915270209312439, "learning_rate": 7.776426149424274e-06, "loss": 0.0615, "num_input_tokens_seen": 203107328, "step": 94125 }, { "epoch": 15.35562805872757, "grad_norm": 0.41733211278915405, "learning_rate": 7.773846711785368e-06, "loss": 0.1511, "num_input_tokens_seen": 203117792, "step": 94130 }, { "epoch": 15.356443719412724, "grad_norm": 0.022288791835308075, "learning_rate": 7.771267623256312e-06, "loss": 0.1131, "num_input_tokens_seen": 203128288, "step": 94135 }, { "epoch": 15.35725938009788, "grad_norm": 0.7855232954025269, "learning_rate": 7.768688883889371e-06, "loss": 0.1658, "num_input_tokens_seen": 203138560, "step": 94140 }, { "epoch": 15.358075040783035, "grad_norm": 0.12024757266044617, "learning_rate": 7.766110493736814e-06, "loss": 0.1295, "num_input_tokens_seen": 203150208, "step": 94145 }, { "epoch": 15.358890701468189, "grad_norm": 3.0398356914520264, "learning_rate": 7.763532452850888e-06, "loss": 0.2442, "num_input_tokens_seen": 203161152, "step": 94150 }, { "epoch": 15.359706362153345, "grad_norm": 1.992666244506836, "learning_rate": 7.760954761283843e-06, "loss": 0.1461, "num_input_tokens_seen": 203171264, "step": 94155 }, { "epoch": 15.360522022838499, "grad_norm": 0.19472570717334747, "learning_rate": 7.75837741908792e-06, "loss": 0.1643, "num_input_tokens_seen": 203183360, "step": 94160 }, { "epoch": 15.361337683523654, "grad_norm": 0.668389081954956, "learning_rate": 7.75580042631535e-06, "loss": 0.2828, "num_input_tokens_seen": 203195168, "step": 94165 }, { "epoch": 15.362153344208808, "grad_norm": 1.8465197086334229, "learning_rate": 7.75322378301836e-06, "loss": 0.0901, "num_input_tokens_seen": 203206240, "step": 94170 }, { "epoch": 15.362969004893964, "grad_norm": 1.2444998025894165, "learning_rate": 7.750647489249171e-06, "loss": 0.0933, "num_input_tokens_seen": 203217088, "step": 94175 }, { "epoch": 15.36378466557912, "grad_norm": 1.370518445968628, "learning_rate": 7.74807154505999e-06, "loss": 0.1003, "num_input_tokens_seen": 203227648, "step": 94180 }, { "epoch": 15.364600326264274, "grad_norm": 1.4023284912109375, "learning_rate": 7.745495950503026e-06, "loss": 0.0732, "num_input_tokens_seen": 203238240, "step": 94185 }, { "epoch": 15.36541598694943, "grad_norm": 0.06631337106227875, "learning_rate": 7.742920705630468e-06, "loss": 0.1304, "num_input_tokens_seen": 203249312, "step": 94190 }, { "epoch": 15.366231647634583, "grad_norm": 0.3744456171989441, "learning_rate": 7.740345810494532e-06, "loss": 0.0543, "num_input_tokens_seen": 203260320, "step": 94195 }, { "epoch": 15.367047308319739, "grad_norm": 1.8851869106292725, "learning_rate": 7.737771265147365e-06, "loss": 0.0652, "num_input_tokens_seen": 203270528, "step": 94200 }, { "epoch": 15.367862969004895, "grad_norm": 2.7867510318756104, "learning_rate": 7.735197069641179e-06, "loss": 0.1113, "num_input_tokens_seen": 203282272, "step": 94205 }, { "epoch": 15.368678629690049, "grad_norm": 0.02936440147459507, "learning_rate": 7.73262322402811e-06, "loss": 0.1142, "num_input_tokens_seen": 203292672, "step": 94210 }, { "epoch": 15.369494290375204, "grad_norm": 0.12252302467823029, "learning_rate": 7.730049728360353e-06, "loss": 0.158, "num_input_tokens_seen": 203303520, "step": 94215 }, { "epoch": 15.370309951060358, "grad_norm": 0.37744617462158203, "learning_rate": 7.72747658269003e-06, "loss": 0.1897, "num_input_tokens_seen": 203315232, "step": 94220 }, { "epoch": 15.371125611745514, "grad_norm": 0.12073379755020142, "learning_rate": 7.724903787069321e-06, "loss": 0.0159, "num_input_tokens_seen": 203326464, "step": 94225 }, { "epoch": 15.37194127243067, "grad_norm": 1.3599039316177368, "learning_rate": 7.722331341550337e-06, "loss": 0.153, "num_input_tokens_seen": 203338688, "step": 94230 }, { "epoch": 15.372756933115824, "grad_norm": 0.03146765008568764, "learning_rate": 7.719759246185244e-06, "loss": 0.0451, "num_input_tokens_seen": 203350272, "step": 94235 }, { "epoch": 15.37357259380098, "grad_norm": 0.09334086626768112, "learning_rate": 7.717187501026132e-06, "loss": 0.0145, "num_input_tokens_seen": 203361504, "step": 94240 }, { "epoch": 15.374388254486133, "grad_norm": 0.14847727119922638, "learning_rate": 7.714616106125159e-06, "loss": 0.0234, "num_input_tokens_seen": 203371424, "step": 94245 }, { "epoch": 15.375203915171289, "grad_norm": 0.0484575554728508, "learning_rate": 7.7120450615344e-06, "loss": 0.1374, "num_input_tokens_seen": 203382368, "step": 94250 }, { "epoch": 15.376019575856443, "grad_norm": 0.30304792523384094, "learning_rate": 7.709474367305988e-06, "loss": 0.0248, "num_input_tokens_seen": 203392544, "step": 94255 }, { "epoch": 15.376835236541599, "grad_norm": 0.2607335150241852, "learning_rate": 7.706904023492015e-06, "loss": 0.0217, "num_input_tokens_seen": 203403072, "step": 94260 }, { "epoch": 15.377650897226754, "grad_norm": 0.8034794330596924, "learning_rate": 7.704334030144566e-06, "loss": 0.1029, "num_input_tokens_seen": 203412640, "step": 94265 }, { "epoch": 15.378466557911908, "grad_norm": 0.10942478477954865, "learning_rate": 7.701764387315732e-06, "loss": 0.0225, "num_input_tokens_seen": 203422784, "step": 94270 }, { "epoch": 15.379282218597064, "grad_norm": 0.3210750222206116, "learning_rate": 7.699195095057587e-06, "loss": 0.1797, "num_input_tokens_seen": 203434880, "step": 94275 }, { "epoch": 15.380097879282218, "grad_norm": 0.319130003452301, "learning_rate": 7.696626153422201e-06, "loss": 0.0697, "num_input_tokens_seen": 203446144, "step": 94280 }, { "epoch": 15.380913539967374, "grad_norm": 0.8654014468193054, "learning_rate": 7.694057562461634e-06, "loss": 0.017, "num_input_tokens_seen": 203455712, "step": 94285 }, { "epoch": 15.38172920065253, "grad_norm": 0.21156463027000427, "learning_rate": 7.69148932222795e-06, "loss": 0.0296, "num_input_tokens_seen": 203466112, "step": 94290 }, { "epoch": 15.382544861337683, "grad_norm": 0.6509072780609131, "learning_rate": 7.688921432773186e-06, "loss": 0.0312, "num_input_tokens_seen": 203477408, "step": 94295 }, { "epoch": 15.383360522022839, "grad_norm": 1.974358320236206, "learning_rate": 7.686353894149395e-06, "loss": 0.1015, "num_input_tokens_seen": 203487264, "step": 94300 }, { "epoch": 15.384176182707993, "grad_norm": 3.8957629203796387, "learning_rate": 7.683786706408594e-06, "loss": 0.103, "num_input_tokens_seen": 203498112, "step": 94305 }, { "epoch": 15.384991843393149, "grad_norm": 0.6297956705093384, "learning_rate": 7.68121986960284e-06, "loss": 0.1203, "num_input_tokens_seen": 203509184, "step": 94310 }, { "epoch": 15.385807504078304, "grad_norm": 0.07739764451980591, "learning_rate": 7.678653383784121e-06, "loss": 0.0335, "num_input_tokens_seen": 203519616, "step": 94315 }, { "epoch": 15.386623164763458, "grad_norm": 0.031314726918935776, "learning_rate": 7.676087249004482e-06, "loss": 0.013, "num_input_tokens_seen": 203530272, "step": 94320 }, { "epoch": 15.387438825448614, "grad_norm": 0.07058876007795334, "learning_rate": 7.673521465315894e-06, "loss": 0.0089, "num_input_tokens_seen": 203540992, "step": 94325 }, { "epoch": 15.388254486133768, "grad_norm": 0.04433596879243851, "learning_rate": 7.670956032770391e-06, "loss": 0.3074, "num_input_tokens_seen": 203549824, "step": 94330 }, { "epoch": 15.389070146818923, "grad_norm": 1.2638201713562012, "learning_rate": 7.66839095141993e-06, "loss": 0.0924, "num_input_tokens_seen": 203561024, "step": 94335 }, { "epoch": 15.38988580750408, "grad_norm": 0.07505420595407486, "learning_rate": 7.665826221316527e-06, "loss": 0.0281, "num_input_tokens_seen": 203571712, "step": 94340 }, { "epoch": 15.390701468189233, "grad_norm": 1.0255405902862549, "learning_rate": 7.663261842512132e-06, "loss": 0.0162, "num_input_tokens_seen": 203582432, "step": 94345 }, { "epoch": 15.391517128874389, "grad_norm": 0.6070581078529358, "learning_rate": 7.660697815058746e-06, "loss": 0.1382, "num_input_tokens_seen": 203593632, "step": 94350 }, { "epoch": 15.392332789559543, "grad_norm": 2.027162551879883, "learning_rate": 7.658134139008297e-06, "loss": 0.1359, "num_input_tokens_seen": 203605216, "step": 94355 }, { "epoch": 15.393148450244698, "grad_norm": 0.019891073927283287, "learning_rate": 7.655570814412776e-06, "loss": 0.0582, "num_input_tokens_seen": 203616192, "step": 94360 }, { "epoch": 15.393964110929852, "grad_norm": 0.10211513936519623, "learning_rate": 7.6530078413241e-06, "loss": 0.0285, "num_input_tokens_seen": 203627072, "step": 94365 }, { "epoch": 15.394779771615008, "grad_norm": 0.06675302982330322, "learning_rate": 7.650445219794244e-06, "loss": 0.0214, "num_input_tokens_seen": 203638400, "step": 94370 }, { "epoch": 15.395595432300164, "grad_norm": 0.5440732836723328, "learning_rate": 7.647882949875108e-06, "loss": 0.1368, "num_input_tokens_seen": 203649088, "step": 94375 }, { "epoch": 15.396411092985318, "grad_norm": 0.3610718250274658, "learning_rate": 7.645321031618653e-06, "loss": 0.0186, "num_input_tokens_seen": 203659744, "step": 94380 }, { "epoch": 15.397226753670473, "grad_norm": 1.6164851188659668, "learning_rate": 7.642759465076765e-06, "loss": 0.0984, "num_input_tokens_seen": 203670816, "step": 94385 }, { "epoch": 15.398042414355627, "grad_norm": 0.1489362120628357, "learning_rate": 7.640198250301392e-06, "loss": 0.023, "num_input_tokens_seen": 203681984, "step": 94390 }, { "epoch": 15.398858075040783, "grad_norm": 0.07627218961715698, "learning_rate": 7.63763738734441e-06, "loss": 0.0485, "num_input_tokens_seen": 203692096, "step": 94395 }, { "epoch": 15.399673735725939, "grad_norm": 0.07162856310606003, "learning_rate": 7.635076876257737e-06, "loss": 0.108, "num_input_tokens_seen": 203704064, "step": 94400 }, { "epoch": 15.400489396411093, "grad_norm": 1.3977330923080444, "learning_rate": 7.632516717093264e-06, "loss": 0.2769, "num_input_tokens_seen": 203715104, "step": 94405 }, { "epoch": 15.401305057096248, "grad_norm": 4.995927333831787, "learning_rate": 7.62995690990287e-06, "loss": 0.2402, "num_input_tokens_seen": 203726496, "step": 94410 }, { "epoch": 15.402120717781402, "grad_norm": 0.12277690321207047, "learning_rate": 7.627397454738436e-06, "loss": 0.0984, "num_input_tokens_seen": 203737504, "step": 94415 }, { "epoch": 15.402936378466558, "grad_norm": 0.5039557814598083, "learning_rate": 7.6248383516518276e-06, "loss": 0.0279, "num_input_tokens_seen": 203747648, "step": 94420 }, { "epoch": 15.403752039151712, "grad_norm": 0.07965628057718277, "learning_rate": 7.622279600694915e-06, "loss": 0.02, "num_input_tokens_seen": 203759936, "step": 94425 }, { "epoch": 15.404567699836868, "grad_norm": 0.07952059060335159, "learning_rate": 7.619721201919553e-06, "loss": 0.0617, "num_input_tokens_seen": 203771104, "step": 94430 }, { "epoch": 15.405383360522023, "grad_norm": 0.08176849782466888, "learning_rate": 7.617163155377585e-06, "loss": 0.0686, "num_input_tokens_seen": 203782240, "step": 94435 }, { "epoch": 15.406199021207177, "grad_norm": 0.8024484515190125, "learning_rate": 7.614605461120861e-06, "loss": 0.1815, "num_input_tokens_seen": 203793120, "step": 94440 }, { "epoch": 15.407014681892333, "grad_norm": 0.16235676407814026, "learning_rate": 7.612048119201209e-06, "loss": 0.0289, "num_input_tokens_seen": 203803904, "step": 94445 }, { "epoch": 15.407830342577487, "grad_norm": 2.875694751739502, "learning_rate": 7.609491129670454e-06, "loss": 0.0671, "num_input_tokens_seen": 203815840, "step": 94450 }, { "epoch": 15.408646003262643, "grad_norm": 1.8804138898849487, "learning_rate": 7.60693449258044e-06, "loss": 0.0664, "num_input_tokens_seen": 203826688, "step": 94455 }, { "epoch": 15.409461663947798, "grad_norm": 3.0093162059783936, "learning_rate": 7.604378207982946e-06, "loss": 0.0907, "num_input_tokens_seen": 203836736, "step": 94460 }, { "epoch": 15.410277324632952, "grad_norm": 0.18430037796497345, "learning_rate": 7.6018222759298095e-06, "loss": 0.0916, "num_input_tokens_seen": 203847360, "step": 94465 }, { "epoch": 15.411092985318108, "grad_norm": 0.1368967741727829, "learning_rate": 7.599266696472801e-06, "loss": 0.0112, "num_input_tokens_seen": 203856960, "step": 94470 }, { "epoch": 15.411908646003262, "grad_norm": 0.6011936068534851, "learning_rate": 7.596711469663745e-06, "loss": 0.04, "num_input_tokens_seen": 203868640, "step": 94475 }, { "epoch": 15.412724306688418, "grad_norm": 0.0346473753452301, "learning_rate": 7.594156595554392e-06, "loss": 0.1604, "num_input_tokens_seen": 203879872, "step": 94480 }, { "epoch": 15.413539967373573, "grad_norm": 0.250388503074646, "learning_rate": 7.591602074196552e-06, "loss": 0.187, "num_input_tokens_seen": 203891200, "step": 94485 }, { "epoch": 15.414355628058727, "grad_norm": 1.2578659057617188, "learning_rate": 7.589047905641963e-06, "loss": 0.0761, "num_input_tokens_seen": 203903264, "step": 94490 }, { "epoch": 15.415171288743883, "grad_norm": 1.4776437282562256, "learning_rate": 7.586494089942423e-06, "loss": 0.1186, "num_input_tokens_seen": 203914624, "step": 94495 }, { "epoch": 15.415986949429037, "grad_norm": 0.06374702602624893, "learning_rate": 7.583940627149655e-06, "loss": 0.0224, "num_input_tokens_seen": 203924640, "step": 94500 }, { "epoch": 15.416802610114193, "grad_norm": 0.21025891602039337, "learning_rate": 7.58138751731543e-06, "loss": 0.1056, "num_input_tokens_seen": 203935168, "step": 94505 }, { "epoch": 15.417618270799348, "grad_norm": 0.06995384395122528, "learning_rate": 7.5788347604914875e-06, "loss": 0.1105, "num_input_tokens_seen": 203945600, "step": 94510 }, { "epoch": 15.418433931484502, "grad_norm": 1.7346792221069336, "learning_rate": 7.576282356729555e-06, "loss": 0.1513, "num_input_tokens_seen": 203957088, "step": 94515 }, { "epoch": 15.419249592169658, "grad_norm": 2.835704803466797, "learning_rate": 7.573730306081367e-06, "loss": 0.1682, "num_input_tokens_seen": 203967616, "step": 94520 }, { "epoch": 15.420065252854812, "grad_norm": 0.03225664794445038, "learning_rate": 7.571178608598639e-06, "loss": 0.0087, "num_input_tokens_seen": 203979744, "step": 94525 }, { "epoch": 15.420880913539968, "grad_norm": 0.02739996463060379, "learning_rate": 7.568627264333089e-06, "loss": 0.086, "num_input_tokens_seen": 203990560, "step": 94530 }, { "epoch": 15.421696574225122, "grad_norm": 0.03600094094872475, "learning_rate": 7.5660762733364195e-06, "loss": 0.0218, "num_input_tokens_seen": 204001472, "step": 94535 }, { "epoch": 15.422512234910277, "grad_norm": 0.3599514365196228, "learning_rate": 7.563525635660332e-06, "loss": 0.0137, "num_input_tokens_seen": 204011968, "step": 94540 }, { "epoch": 15.423327895595433, "grad_norm": 0.09245704859495163, "learning_rate": 7.560975351356514e-06, "loss": 0.0662, "num_input_tokens_seen": 204021472, "step": 94545 }, { "epoch": 15.424143556280587, "grad_norm": 0.08670218288898468, "learning_rate": 7.558425420476656e-06, "loss": 0.0524, "num_input_tokens_seen": 204032928, "step": 94550 }, { "epoch": 15.424959216965743, "grad_norm": 0.6072961091995239, "learning_rate": 7.555875843072433e-06, "loss": 0.0387, "num_input_tokens_seen": 204043456, "step": 94555 }, { "epoch": 15.425774877650896, "grad_norm": 2.3429665565490723, "learning_rate": 7.553326619195514e-06, "loss": 0.1367, "num_input_tokens_seen": 204053984, "step": 94560 }, { "epoch": 15.426590538336052, "grad_norm": 1.4125440120697021, "learning_rate": 7.550777748897564e-06, "loss": 0.2224, "num_input_tokens_seen": 204064032, "step": 94565 }, { "epoch": 15.427406199021208, "grad_norm": 3.114572763442993, "learning_rate": 7.5482292322302415e-06, "loss": 0.0606, "num_input_tokens_seen": 204073632, "step": 94570 }, { "epoch": 15.428221859706362, "grad_norm": 0.036961693316698074, "learning_rate": 7.545681069245189e-06, "loss": 0.0278, "num_input_tokens_seen": 204085696, "step": 94575 }, { "epoch": 15.429037520391518, "grad_norm": 0.11251851916313171, "learning_rate": 7.543133259994054e-06, "loss": 0.0064, "num_input_tokens_seen": 204096992, "step": 94580 }, { "epoch": 15.429853181076671, "grad_norm": 0.3401830494403839, "learning_rate": 7.540585804528469e-06, "loss": 0.0158, "num_input_tokens_seen": 204107008, "step": 94585 }, { "epoch": 15.430668841761827, "grad_norm": 2.2017805576324463, "learning_rate": 7.538038702900063e-06, "loss": 0.0391, "num_input_tokens_seen": 204118208, "step": 94590 }, { "epoch": 15.431484502446983, "grad_norm": 1.8908203840255737, "learning_rate": 7.535491955160446e-06, "loss": 0.0596, "num_input_tokens_seen": 204128448, "step": 94595 }, { "epoch": 15.432300163132137, "grad_norm": 0.031184211373329163, "learning_rate": 7.532945561361254e-06, "loss": 0.0115, "num_input_tokens_seen": 204138752, "step": 94600 }, { "epoch": 15.433115823817293, "grad_norm": 0.2623136043548584, "learning_rate": 7.530399521554063e-06, "loss": 0.0239, "num_input_tokens_seen": 204149632, "step": 94605 }, { "epoch": 15.433931484502446, "grad_norm": 1.7049802541732788, "learning_rate": 7.527853835790505e-06, "loss": 0.0809, "num_input_tokens_seen": 204160576, "step": 94610 }, { "epoch": 15.434747145187602, "grad_norm": 1.0440603494644165, "learning_rate": 7.525308504122136e-06, "loss": 0.0743, "num_input_tokens_seen": 204171744, "step": 94615 }, { "epoch": 15.435562805872756, "grad_norm": 1.1398097276687622, "learning_rate": 7.522763526600568e-06, "loss": 0.1203, "num_input_tokens_seen": 204182432, "step": 94620 }, { "epoch": 15.436378466557912, "grad_norm": 1.2465487718582153, "learning_rate": 7.520218903277368e-06, "loss": 0.0568, "num_input_tokens_seen": 204193920, "step": 94625 }, { "epoch": 15.437194127243067, "grad_norm": 0.06114432215690613, "learning_rate": 7.517674634204108e-06, "loss": 0.0897, "num_input_tokens_seen": 204203456, "step": 94630 }, { "epoch": 15.438009787928221, "grad_norm": 0.9561399221420288, "learning_rate": 7.515130719432348e-06, "loss": 0.1745, "num_input_tokens_seen": 204213632, "step": 94635 }, { "epoch": 15.438825448613377, "grad_norm": 0.03595302626490593, "learning_rate": 7.512587159013645e-06, "loss": 0.1174, "num_input_tokens_seen": 204221184, "step": 94640 }, { "epoch": 15.439641109298531, "grad_norm": 0.3130040764808655, "learning_rate": 7.510043952999548e-06, "loss": 0.2291, "num_input_tokens_seen": 204232640, "step": 94645 }, { "epoch": 15.440456769983687, "grad_norm": 0.8017404675483704, "learning_rate": 7.507501101441597e-06, "loss": 0.123, "num_input_tokens_seen": 204243072, "step": 94650 }, { "epoch": 15.441272430668842, "grad_norm": 1.9125956296920776, "learning_rate": 7.5049586043913295e-06, "loss": 0.0559, "num_input_tokens_seen": 204255104, "step": 94655 }, { "epoch": 15.442088091353996, "grad_norm": 1.117713451385498, "learning_rate": 7.502416461900266e-06, "loss": 0.1399, "num_input_tokens_seen": 204266368, "step": 94660 }, { "epoch": 15.442903752039152, "grad_norm": 0.06038676202297211, "learning_rate": 7.499874674019933e-06, "loss": 0.0158, "num_input_tokens_seen": 204278208, "step": 94665 }, { "epoch": 15.443719412724306, "grad_norm": 0.27204740047454834, "learning_rate": 7.49733324080184e-06, "loss": 0.1511, "num_input_tokens_seen": 204287424, "step": 94670 }, { "epoch": 15.444535073409462, "grad_norm": 0.15638574957847595, "learning_rate": 7.494792162297492e-06, "loss": 0.0512, "num_input_tokens_seen": 204298368, "step": 94675 }, { "epoch": 15.445350734094617, "grad_norm": 0.17824800312519073, "learning_rate": 7.492251438558387e-06, "loss": 0.2002, "num_input_tokens_seen": 204309088, "step": 94680 }, { "epoch": 15.446166394779771, "grad_norm": 4.204033851623535, "learning_rate": 7.489711069636018e-06, "loss": 0.1944, "num_input_tokens_seen": 204318624, "step": 94685 }, { "epoch": 15.446982055464927, "grad_norm": 0.276488721370697, "learning_rate": 7.487171055581868e-06, "loss": 0.0515, "num_input_tokens_seen": 204328064, "step": 94690 }, { "epoch": 15.447797716150081, "grad_norm": 2.9393365383148193, "learning_rate": 7.4846313964474115e-06, "loss": 0.1904, "num_input_tokens_seen": 204339264, "step": 94695 }, { "epoch": 15.448613376835237, "grad_norm": 0.43562793731689453, "learning_rate": 7.482092092284121e-06, "loss": 0.0888, "num_input_tokens_seen": 204349280, "step": 94700 }, { "epoch": 15.449429037520392, "grad_norm": 0.9073593616485596, "learning_rate": 7.4795531431434555e-06, "loss": 0.082, "num_input_tokens_seen": 204359232, "step": 94705 }, { "epoch": 15.450244698205546, "grad_norm": 0.4000988006591797, "learning_rate": 7.477014549076875e-06, "loss": 0.0937, "num_input_tokens_seen": 204370432, "step": 94710 }, { "epoch": 15.451060358890702, "grad_norm": 0.09060932695865631, "learning_rate": 7.474476310135823e-06, "loss": 0.0146, "num_input_tokens_seen": 204380160, "step": 94715 }, { "epoch": 15.451876019575856, "grad_norm": 0.04716481640934944, "learning_rate": 7.471938426371739e-06, "loss": 0.1531, "num_input_tokens_seen": 204391168, "step": 94720 }, { "epoch": 15.452691680261012, "grad_norm": 2.8583977222442627, "learning_rate": 7.469400897836051e-06, "loss": 0.0835, "num_input_tokens_seen": 204401184, "step": 94725 }, { "epoch": 15.453507340946166, "grad_norm": 0.17064760625362396, "learning_rate": 7.466863724580211e-06, "loss": 0.0118, "num_input_tokens_seen": 204410656, "step": 94730 }, { "epoch": 15.454323001631321, "grad_norm": 1.4438562393188477, "learning_rate": 7.4643269066556025e-06, "loss": 0.1629, "num_input_tokens_seen": 204421248, "step": 94735 }, { "epoch": 15.455138662316477, "grad_norm": 1.2052956819534302, "learning_rate": 7.461790444113664e-06, "loss": 0.1593, "num_input_tokens_seen": 204430976, "step": 94740 }, { "epoch": 15.455954323001631, "grad_norm": 0.7530904412269592, "learning_rate": 7.459254337005792e-06, "loss": 0.1247, "num_input_tokens_seen": 204440000, "step": 94745 }, { "epoch": 15.456769983686787, "grad_norm": 1.2566328048706055, "learning_rate": 7.456718585383379e-06, "loss": 0.1186, "num_input_tokens_seen": 204450016, "step": 94750 }, { "epoch": 15.45758564437194, "grad_norm": 1.3461087942123413, "learning_rate": 7.454183189297822e-06, "loss": 0.1988, "num_input_tokens_seen": 204460288, "step": 94755 }, { "epoch": 15.458401305057096, "grad_norm": 2.4240031242370605, "learning_rate": 7.4516481488005e-06, "loss": 0.1112, "num_input_tokens_seen": 204471072, "step": 94760 }, { "epoch": 15.459216965742252, "grad_norm": 0.3539712727069855, "learning_rate": 7.4491134639427915e-06, "loss": 0.0406, "num_input_tokens_seen": 204482496, "step": 94765 }, { "epoch": 15.460032626427406, "grad_norm": 0.10713297873735428, "learning_rate": 7.446579134776063e-06, "loss": 0.0824, "num_input_tokens_seen": 204493248, "step": 94770 }, { "epoch": 15.460848287112562, "grad_norm": 0.056442275643348694, "learning_rate": 7.444045161351676e-06, "loss": 0.1669, "num_input_tokens_seen": 204503904, "step": 94775 }, { "epoch": 15.461663947797716, "grad_norm": 0.10968957841396332, "learning_rate": 7.441511543720983e-06, "loss": 0.1284, "num_input_tokens_seen": 204515456, "step": 94780 }, { "epoch": 15.462479608482871, "grad_norm": 0.07131891697645187, "learning_rate": 7.438978281935335e-06, "loss": 0.0155, "num_input_tokens_seen": 204525440, "step": 94785 }, { "epoch": 15.463295269168025, "grad_norm": 0.4280443787574768, "learning_rate": 7.436445376046073e-06, "loss": 0.1392, "num_input_tokens_seen": 204536448, "step": 94790 }, { "epoch": 15.464110929853181, "grad_norm": 0.06312686949968338, "learning_rate": 7.433912826104522e-06, "loss": 0.2686, "num_input_tokens_seen": 204548064, "step": 94795 }, { "epoch": 15.464926590538337, "grad_norm": 0.21393340826034546, "learning_rate": 7.4313806321620134e-06, "loss": 0.0782, "num_input_tokens_seen": 204557760, "step": 94800 }, { "epoch": 15.46574225122349, "grad_norm": 0.04433055222034454, "learning_rate": 7.4288487942698635e-06, "loss": 0.1456, "num_input_tokens_seen": 204567552, "step": 94805 }, { "epoch": 15.466557911908646, "grad_norm": 0.36003902554512024, "learning_rate": 7.426317312479383e-06, "loss": 0.0247, "num_input_tokens_seen": 204578944, "step": 94810 }, { "epoch": 15.4673735725938, "grad_norm": 0.19128026068210602, "learning_rate": 7.423786186841874e-06, "loss": 0.0816, "num_input_tokens_seen": 204589888, "step": 94815 }, { "epoch": 15.468189233278956, "grad_norm": 0.23422981798648834, "learning_rate": 7.4212554174086374e-06, "loss": 0.0784, "num_input_tokens_seen": 204599872, "step": 94820 }, { "epoch": 15.469004893964112, "grad_norm": 0.3310288190841675, "learning_rate": 7.418725004230962e-06, "loss": 0.0287, "num_input_tokens_seen": 204611744, "step": 94825 }, { "epoch": 15.469820554649266, "grad_norm": 1.0273520946502686, "learning_rate": 7.416194947360117e-06, "loss": 0.1621, "num_input_tokens_seen": 204622400, "step": 94830 }, { "epoch": 15.470636215334421, "grad_norm": 0.7257596850395203, "learning_rate": 7.413665246847404e-06, "loss": 0.0489, "num_input_tokens_seen": 204633664, "step": 94835 }, { "epoch": 15.471451876019575, "grad_norm": 0.5646163821220398, "learning_rate": 7.41113590274406e-06, "loss": 0.2087, "num_input_tokens_seen": 204644544, "step": 94840 }, { "epoch": 15.47226753670473, "grad_norm": 0.10486026108264923, "learning_rate": 7.408606915101374e-06, "loss": 0.1223, "num_input_tokens_seen": 204655808, "step": 94845 }, { "epoch": 15.473083197389887, "grad_norm": 0.061979252845048904, "learning_rate": 7.406078283970572e-06, "loss": 0.1583, "num_input_tokens_seen": 204666240, "step": 94850 }, { "epoch": 15.47389885807504, "grad_norm": 0.01607995107769966, "learning_rate": 7.403550009402927e-06, "loss": 0.0263, "num_input_tokens_seen": 204677440, "step": 94855 }, { "epoch": 15.474714518760196, "grad_norm": 0.27531832456588745, "learning_rate": 7.401022091449647e-06, "loss": 0.0143, "num_input_tokens_seen": 204689024, "step": 94860 }, { "epoch": 15.47553017944535, "grad_norm": 0.574613094329834, "learning_rate": 7.398494530161998e-06, "loss": 0.0622, "num_input_tokens_seen": 204698528, "step": 94865 }, { "epoch": 15.476345840130506, "grad_norm": 4.646021366119385, "learning_rate": 7.395967325591169e-06, "loss": 0.3426, "num_input_tokens_seen": 204710112, "step": 94870 }, { "epoch": 15.477161500815662, "grad_norm": 1.8915979862213135, "learning_rate": 7.39344047778841e-06, "loss": 0.1372, "num_input_tokens_seen": 204720416, "step": 94875 }, { "epoch": 15.477977161500815, "grad_norm": 0.3100608289241791, "learning_rate": 7.390913986804901e-06, "loss": 0.0442, "num_input_tokens_seen": 204731424, "step": 94880 }, { "epoch": 15.478792822185971, "grad_norm": 0.16957338154315948, "learning_rate": 7.3883878526918665e-06, "loss": 0.2258, "num_input_tokens_seen": 204744096, "step": 94885 }, { "epoch": 15.479608482871125, "grad_norm": 0.7457824945449829, "learning_rate": 7.385862075500494e-06, "loss": 0.0529, "num_input_tokens_seen": 204755200, "step": 94890 }, { "epoch": 15.48042414355628, "grad_norm": 0.050178103148937225, "learning_rate": 7.383336655281972e-06, "loss": 0.0587, "num_input_tokens_seen": 204766944, "step": 94895 }, { "epoch": 15.481239804241435, "grad_norm": 1.3023301362991333, "learning_rate": 7.38081159208748e-06, "loss": 0.0491, "num_input_tokens_seen": 204777408, "step": 94900 }, { "epoch": 15.48205546492659, "grad_norm": 0.033782102167606354, "learning_rate": 7.378286885968192e-06, "loss": 0.0295, "num_input_tokens_seen": 204789024, "step": 94905 }, { "epoch": 15.482871125611746, "grad_norm": 0.041081368923187256, "learning_rate": 7.375762536975276e-06, "loss": 0.0323, "num_input_tokens_seen": 204800736, "step": 94910 }, { "epoch": 15.4836867862969, "grad_norm": 0.11439462006092072, "learning_rate": 7.373238545159891e-06, "loss": 0.0107, "num_input_tokens_seen": 204811648, "step": 94915 }, { "epoch": 15.484502446982056, "grad_norm": 1.8646430969238281, "learning_rate": 7.370714910573187e-06, "loss": 0.0817, "num_input_tokens_seen": 204821664, "step": 94920 }, { "epoch": 15.48531810766721, "grad_norm": 1.1837847232818604, "learning_rate": 7.368191633266311e-06, "loss": 0.188, "num_input_tokens_seen": 204832928, "step": 94925 }, { "epoch": 15.486133768352365, "grad_norm": 0.12159795314073563, "learning_rate": 7.365668713290397e-06, "loss": 0.1382, "num_input_tokens_seen": 204843488, "step": 94930 }, { "epoch": 15.486949429037521, "grad_norm": 0.3154357075691223, "learning_rate": 7.36314615069657e-06, "loss": 0.095, "num_input_tokens_seen": 204854496, "step": 94935 }, { "epoch": 15.487765089722675, "grad_norm": 0.5987045764923096, "learning_rate": 7.360623945535977e-06, "loss": 0.2522, "num_input_tokens_seen": 204865536, "step": 94940 }, { "epoch": 15.48858075040783, "grad_norm": 0.06452810019254684, "learning_rate": 7.358102097859698e-06, "loss": 0.0536, "num_input_tokens_seen": 204875648, "step": 94945 }, { "epoch": 15.489396411092985, "grad_norm": 0.7933687567710876, "learning_rate": 7.3555806077188785e-06, "loss": 0.0435, "num_input_tokens_seen": 204885888, "step": 94950 }, { "epoch": 15.49021207177814, "grad_norm": 0.27597978711128235, "learning_rate": 7.3530594751645866e-06, "loss": 0.0944, "num_input_tokens_seen": 204897440, "step": 94955 }, { "epoch": 15.491027732463296, "grad_norm": 0.4442203938961029, "learning_rate": 7.350538700247947e-06, "loss": 0.1054, "num_input_tokens_seen": 204908480, "step": 94960 }, { "epoch": 15.49184339314845, "grad_norm": 0.11926909536123276, "learning_rate": 7.348018283020014e-06, "loss": 0.0607, "num_input_tokens_seen": 204920736, "step": 94965 }, { "epoch": 15.492659053833606, "grad_norm": 0.08271844685077667, "learning_rate": 7.345498223531899e-06, "loss": 0.177, "num_input_tokens_seen": 204931136, "step": 94970 }, { "epoch": 15.49347471451876, "grad_norm": 0.3038446605205536, "learning_rate": 7.342978521834643e-06, "loss": 0.1231, "num_input_tokens_seen": 204940352, "step": 94975 }, { "epoch": 15.494290375203915, "grad_norm": 0.16161957383155823, "learning_rate": 7.340459177979342e-06, "loss": 0.2467, "num_input_tokens_seen": 204950720, "step": 94980 }, { "epoch": 15.49510603588907, "grad_norm": 0.3323361873626709, "learning_rate": 7.3379401920170234e-06, "loss": 0.0277, "num_input_tokens_seen": 204960768, "step": 94985 }, { "epoch": 15.495921696574225, "grad_norm": 0.459970086812973, "learning_rate": 7.335421563998771e-06, "loss": 0.0953, "num_input_tokens_seen": 204971968, "step": 94990 }, { "epoch": 15.49673735725938, "grad_norm": 0.6076539158821106, "learning_rate": 7.33290329397559e-06, "loss": 0.0917, "num_input_tokens_seen": 204982688, "step": 94995 }, { "epoch": 15.497553017944535, "grad_norm": 3.206008195877075, "learning_rate": 7.330385381998553e-06, "loss": 0.0697, "num_input_tokens_seen": 204993088, "step": 95000 }, { "epoch": 15.49836867862969, "grad_norm": 0.47863703966140747, "learning_rate": 7.327867828118656e-06, "loss": 0.0951, "num_input_tokens_seen": 205003872, "step": 95005 }, { "epoch": 15.499184339314844, "grad_norm": 0.09304982423782349, "learning_rate": 7.325350632386954e-06, "loss": 0.0136, "num_input_tokens_seen": 205014144, "step": 95010 }, { "epoch": 15.5, "grad_norm": 0.02959452010691166, "learning_rate": 7.322833794854425e-06, "loss": 0.0711, "num_input_tokens_seen": 205025440, "step": 95015 }, { "epoch": 15.500815660685156, "grad_norm": 0.0707961842417717, "learning_rate": 7.320317315572103e-06, "loss": 0.0802, "num_input_tokens_seen": 205036480, "step": 95020 }, { "epoch": 15.50163132137031, "grad_norm": 0.05062038078904152, "learning_rate": 7.317801194590979e-06, "loss": 0.017, "num_input_tokens_seen": 205047488, "step": 95025 }, { "epoch": 15.502446982055465, "grad_norm": 0.01823515258729458, "learning_rate": 7.315285431962043e-06, "loss": 0.0887, "num_input_tokens_seen": 205057536, "step": 95030 }, { "epoch": 15.50326264274062, "grad_norm": 1.9185638427734375, "learning_rate": 7.3127700277362846e-06, "loss": 0.0615, "num_input_tokens_seen": 205068448, "step": 95035 }, { "epoch": 15.504078303425775, "grad_norm": 0.042440105229616165, "learning_rate": 7.310254981964682e-06, "loss": 0.0043, "num_input_tokens_seen": 205079648, "step": 95040 }, { "epoch": 15.50489396411093, "grad_norm": 0.6661184430122375, "learning_rate": 7.3077402946981984e-06, "loss": 0.0222, "num_input_tokens_seen": 205089792, "step": 95045 }, { "epoch": 15.505709624796085, "grad_norm": 0.07847024500370026, "learning_rate": 7.305225965987806e-06, "loss": 0.0733, "num_input_tokens_seen": 205100096, "step": 95050 }, { "epoch": 15.50652528548124, "grad_norm": 0.06257498264312744, "learning_rate": 7.302711995884454e-06, "loss": 0.0213, "num_input_tokens_seen": 205110400, "step": 95055 }, { "epoch": 15.507340946166394, "grad_norm": 0.22917841374874115, "learning_rate": 7.300198384439094e-06, "loss": 0.0783, "num_input_tokens_seen": 205121344, "step": 95060 }, { "epoch": 15.50815660685155, "grad_norm": 0.6827455759048462, "learning_rate": 7.29768513170267e-06, "loss": 0.0205, "num_input_tokens_seen": 205132928, "step": 95065 }, { "epoch": 15.508972267536706, "grad_norm": 0.15498778223991394, "learning_rate": 7.295172237726111e-06, "loss": 0.1708, "num_input_tokens_seen": 205142976, "step": 95070 }, { "epoch": 15.50978792822186, "grad_norm": 0.07478391379117966, "learning_rate": 7.292659702560348e-06, "loss": 0.0191, "num_input_tokens_seen": 205153152, "step": 95075 }, { "epoch": 15.510603588907015, "grad_norm": 0.5949718952178955, "learning_rate": 7.2901475262562915e-06, "loss": 0.1509, "num_input_tokens_seen": 205163200, "step": 95080 }, { "epoch": 15.51141924959217, "grad_norm": 0.08556891232728958, "learning_rate": 7.287635708864876e-06, "loss": 0.0133, "num_input_tokens_seen": 205173888, "step": 95085 }, { "epoch": 15.512234910277325, "grad_norm": 0.21420539915561676, "learning_rate": 7.285124250436981e-06, "loss": 0.053, "num_input_tokens_seen": 205184320, "step": 95090 }, { "epoch": 15.513050570962479, "grad_norm": 1.9074592590332031, "learning_rate": 7.282613151023529e-06, "loss": 0.0856, "num_input_tokens_seen": 205195104, "step": 95095 }, { "epoch": 15.513866231647635, "grad_norm": 0.01729525253176689, "learning_rate": 7.280102410675383e-06, "loss": 0.1365, "num_input_tokens_seen": 205205824, "step": 95100 }, { "epoch": 15.51468189233279, "grad_norm": 0.04219873994588852, "learning_rate": 7.277592029443453e-06, "loss": 0.0267, "num_input_tokens_seen": 205217184, "step": 95105 }, { "epoch": 15.515497553017944, "grad_norm": 2.121901035308838, "learning_rate": 7.275082007378592e-06, "loss": 0.1384, "num_input_tokens_seen": 205228256, "step": 95110 }, { "epoch": 15.5163132137031, "grad_norm": 0.5698716044425964, "learning_rate": 7.272572344531692e-06, "loss": 0.021, "num_input_tokens_seen": 205239648, "step": 95115 }, { "epoch": 15.517128874388254, "grad_norm": 0.0999903604388237, "learning_rate": 7.2700630409535895e-06, "loss": 0.0156, "num_input_tokens_seen": 205250784, "step": 95120 }, { "epoch": 15.51794453507341, "grad_norm": 0.026760421693325043, "learning_rate": 7.267554096695164e-06, "loss": 0.1025, "num_input_tokens_seen": 205260704, "step": 95125 }, { "epoch": 15.518760195758565, "grad_norm": 1.1470942497253418, "learning_rate": 7.265045511807236e-06, "loss": 0.0534, "num_input_tokens_seen": 205270112, "step": 95130 }, { "epoch": 15.51957585644372, "grad_norm": 0.21335944533348083, "learning_rate": 7.262537286340676e-06, "loss": 0.1237, "num_input_tokens_seen": 205280928, "step": 95135 }, { "epoch": 15.520391517128875, "grad_norm": 2.2542362213134766, "learning_rate": 7.260029420346282e-06, "loss": 0.0762, "num_input_tokens_seen": 205291488, "step": 95140 }, { "epoch": 15.521207177814029, "grad_norm": 0.05260752886533737, "learning_rate": 7.257521913874907e-06, "loss": 0.0292, "num_input_tokens_seen": 205301024, "step": 95145 }, { "epoch": 15.522022838499185, "grad_norm": 0.8378501534461975, "learning_rate": 7.255014766977355e-06, "loss": 0.1211, "num_input_tokens_seen": 205313632, "step": 95150 }, { "epoch": 15.522838499184338, "grad_norm": 0.6308209896087646, "learning_rate": 7.252507979704442e-06, "loss": 0.0125, "num_input_tokens_seen": 205324832, "step": 95155 }, { "epoch": 15.523654159869494, "grad_norm": 1.7657296657562256, "learning_rate": 7.2500015521069665e-06, "loss": 0.2463, "num_input_tokens_seen": 205336160, "step": 95160 }, { "epoch": 15.52446982055465, "grad_norm": 1.027320146560669, "learning_rate": 7.247495484235731e-06, "loss": 0.2476, "num_input_tokens_seen": 205347648, "step": 95165 }, { "epoch": 15.525285481239804, "grad_norm": 1.5440258979797363, "learning_rate": 7.244989776141517e-06, "loss": 0.1561, "num_input_tokens_seen": 205359392, "step": 95170 }, { "epoch": 15.52610114192496, "grad_norm": 0.6396998763084412, "learning_rate": 7.24248442787511e-06, "loss": 0.0456, "num_input_tokens_seen": 205368800, "step": 95175 }, { "epoch": 15.526916802610113, "grad_norm": 0.11473571509122849, "learning_rate": 7.239979439487283e-06, "loss": 0.1707, "num_input_tokens_seen": 205379744, "step": 95180 }, { "epoch": 15.52773246329527, "grad_norm": 1.273838996887207, "learning_rate": 7.2374748110288025e-06, "loss": 0.0623, "num_input_tokens_seen": 205389344, "step": 95185 }, { "epoch": 15.528548123980425, "grad_norm": 0.07012274116277695, "learning_rate": 7.2349705425504285e-06, "loss": 0.1468, "num_input_tokens_seen": 205399264, "step": 95190 }, { "epoch": 15.529363784665579, "grad_norm": 1.9210858345031738, "learning_rate": 7.232466634102913e-06, "loss": 0.1009, "num_input_tokens_seen": 205410496, "step": 95195 }, { "epoch": 15.530179445350734, "grad_norm": 0.059729788452386856, "learning_rate": 7.229963085737002e-06, "loss": 0.044, "num_input_tokens_seen": 205420320, "step": 95200 }, { "epoch": 15.530995106035888, "grad_norm": 3.084362268447876, "learning_rate": 7.22745989750343e-06, "loss": 0.1493, "num_input_tokens_seen": 205430944, "step": 95205 }, { "epoch": 15.531810766721044, "grad_norm": 0.018916860222816467, "learning_rate": 7.224957069452931e-06, "loss": 0.0604, "num_input_tokens_seen": 205441344, "step": 95210 }, { "epoch": 15.5326264274062, "grad_norm": 2.864347219467163, "learning_rate": 7.222454601636219e-06, "loss": 0.1938, "num_input_tokens_seen": 205452544, "step": 95215 }, { "epoch": 15.533442088091354, "grad_norm": 0.455462783575058, "learning_rate": 7.219952494104029e-06, "loss": 0.0664, "num_input_tokens_seen": 205463648, "step": 95220 }, { "epoch": 15.53425774877651, "grad_norm": 1.3012133836746216, "learning_rate": 7.217450746907045e-06, "loss": 0.0953, "num_input_tokens_seen": 205475296, "step": 95225 }, { "epoch": 15.535073409461663, "grad_norm": 0.17491543292999268, "learning_rate": 7.2149493600959964e-06, "loss": 0.0367, "num_input_tokens_seen": 205485888, "step": 95230 }, { "epoch": 15.535889070146819, "grad_norm": 0.15771061182022095, "learning_rate": 7.212448333721541e-06, "loss": 0.1603, "num_input_tokens_seen": 205496288, "step": 95235 }, { "epoch": 15.536704730831975, "grad_norm": 0.815313458442688, "learning_rate": 7.209947667834405e-06, "loss": 0.0201, "num_input_tokens_seen": 205507104, "step": 95240 }, { "epoch": 15.537520391517129, "grad_norm": 1.81254243850708, "learning_rate": 7.207447362485232e-06, "loss": 0.2747, "num_input_tokens_seen": 205518208, "step": 95245 }, { "epoch": 15.538336052202284, "grad_norm": 1.4619140625, "learning_rate": 7.204947417724722e-06, "loss": 0.2946, "num_input_tokens_seen": 205526688, "step": 95250 }, { "epoch": 15.539151712887438, "grad_norm": 0.12508675456047058, "learning_rate": 7.202447833603515e-06, "loss": 0.066, "num_input_tokens_seen": 205536928, "step": 95255 }, { "epoch": 15.539967373572594, "grad_norm": 0.04720006138086319, "learning_rate": 7.1999486101722884e-06, "loss": 0.2007, "num_input_tokens_seen": 205547744, "step": 95260 }, { "epoch": 15.540783034257748, "grad_norm": 0.05503140762448311, "learning_rate": 7.197449747481682e-06, "loss": 0.1945, "num_input_tokens_seen": 205559296, "step": 95265 }, { "epoch": 15.541598694942904, "grad_norm": 0.013068304397165775, "learning_rate": 7.19495124558234e-06, "loss": 0.2283, "num_input_tokens_seen": 205570112, "step": 95270 }, { "epoch": 15.54241435562806, "grad_norm": 0.3216266334056854, "learning_rate": 7.1924531045249015e-06, "loss": 0.0437, "num_input_tokens_seen": 205579424, "step": 95275 }, { "epoch": 15.543230016313213, "grad_norm": 1.2166887521743774, "learning_rate": 7.18995532435999e-06, "loss": 0.2257, "num_input_tokens_seen": 205591520, "step": 95280 }, { "epoch": 15.544045676998369, "grad_norm": 0.4099194407463074, "learning_rate": 7.187457905138226e-06, "loss": 0.1602, "num_input_tokens_seen": 205603328, "step": 95285 }, { "epoch": 15.544861337683523, "grad_norm": 1.041157841682434, "learning_rate": 7.1849608469102244e-06, "loss": 0.0706, "num_input_tokens_seen": 205614848, "step": 95290 }, { "epoch": 15.545676998368679, "grad_norm": 2.6370906829833984, "learning_rate": 7.182464149726592e-06, "loss": 0.1815, "num_input_tokens_seen": 205626432, "step": 95295 }, { "epoch": 15.546492659053834, "grad_norm": 0.9942541122436523, "learning_rate": 7.1799678136379275e-06, "loss": 0.1377, "num_input_tokens_seen": 205637536, "step": 95300 }, { "epoch": 15.547308319738988, "grad_norm": 0.026382924988865852, "learning_rate": 7.177471838694821e-06, "loss": 0.0102, "num_input_tokens_seen": 205647648, "step": 95305 }, { "epoch": 15.548123980424144, "grad_norm": 0.04102293401956558, "learning_rate": 7.174976224947858e-06, "loss": 0.0106, "num_input_tokens_seen": 205658784, "step": 95310 }, { "epoch": 15.548939641109298, "grad_norm": 0.01444252673536539, "learning_rate": 7.172480972447612e-06, "loss": 0.0783, "num_input_tokens_seen": 205669216, "step": 95315 }, { "epoch": 15.549755301794454, "grad_norm": 0.16830895841121674, "learning_rate": 7.169986081244659e-06, "loss": 0.1353, "num_input_tokens_seen": 205680288, "step": 95320 }, { "epoch": 15.550570962479608, "grad_norm": 1.4969621896743774, "learning_rate": 7.167491551389555e-06, "loss": 0.2703, "num_input_tokens_seen": 205691296, "step": 95325 }, { "epoch": 15.551386623164763, "grad_norm": 2.8648018836975098, "learning_rate": 7.164997382932856e-06, "loss": 0.1891, "num_input_tokens_seen": 205703040, "step": 95330 }, { "epoch": 15.552202283849919, "grad_norm": 1.7359057664871216, "learning_rate": 7.162503575925109e-06, "loss": 0.1699, "num_input_tokens_seen": 205713664, "step": 95335 }, { "epoch": 15.553017944535073, "grad_norm": 1.1072685718536377, "learning_rate": 7.160010130416858e-06, "loss": 0.2859, "num_input_tokens_seen": 205723840, "step": 95340 }, { "epoch": 15.553833605220229, "grad_norm": 1.3430496454238892, "learning_rate": 7.1575170464586335e-06, "loss": 0.0875, "num_input_tokens_seen": 205734272, "step": 95345 }, { "epoch": 15.554649265905383, "grad_norm": 0.23561951518058777, "learning_rate": 7.155024324100959e-06, "loss": 0.1643, "num_input_tokens_seen": 205743872, "step": 95350 }, { "epoch": 15.555464926590538, "grad_norm": 2.6487178802490234, "learning_rate": 7.1525319633943545e-06, "loss": 0.2617, "num_input_tokens_seen": 205754912, "step": 95355 }, { "epoch": 15.556280587275694, "grad_norm": 0.04093337431550026, "learning_rate": 7.150039964389324e-06, "loss": 0.0794, "num_input_tokens_seen": 205765440, "step": 95360 }, { "epoch": 15.557096247960848, "grad_norm": 1.3447133302688599, "learning_rate": 7.147548327136386e-06, "loss": 0.191, "num_input_tokens_seen": 205776544, "step": 95365 }, { "epoch": 15.557911908646004, "grad_norm": 0.21201536059379578, "learning_rate": 7.1450570516860295e-06, "loss": 0.0317, "num_input_tokens_seen": 205787584, "step": 95370 }, { "epoch": 15.558727569331158, "grad_norm": 0.015119098126888275, "learning_rate": 7.142566138088741e-06, "loss": 0.1069, "num_input_tokens_seen": 205797504, "step": 95375 }, { "epoch": 15.559543230016313, "grad_norm": 0.07104844599962234, "learning_rate": 7.1400755863950034e-06, "loss": 0.0395, "num_input_tokens_seen": 205808384, "step": 95380 }, { "epoch": 15.560358890701469, "grad_norm": 0.024509748443961143, "learning_rate": 7.137585396655291e-06, "loss": 0.1001, "num_input_tokens_seen": 205819168, "step": 95385 }, { "epoch": 15.561174551386623, "grad_norm": 1.9149043560028076, "learning_rate": 7.135095568920072e-06, "loss": 0.1692, "num_input_tokens_seen": 205829792, "step": 95390 }, { "epoch": 15.561990212071779, "grad_norm": 1.6864879131317139, "learning_rate": 7.1326061032398026e-06, "loss": 0.0868, "num_input_tokens_seen": 205839488, "step": 95395 }, { "epoch": 15.562805872756933, "grad_norm": 0.04272652417421341, "learning_rate": 7.130116999664938e-06, "loss": 0.113, "num_input_tokens_seen": 205850400, "step": 95400 }, { "epoch": 15.563621533442088, "grad_norm": 1.2176439762115479, "learning_rate": 7.127628258245922e-06, "loss": 0.1207, "num_input_tokens_seen": 205861216, "step": 95405 }, { "epoch": 15.564437194127244, "grad_norm": 0.15714417397975922, "learning_rate": 7.125139879033194e-06, "loss": 0.161, "num_input_tokens_seen": 205872128, "step": 95410 }, { "epoch": 15.565252854812398, "grad_norm": 1.3267455101013184, "learning_rate": 7.122651862077179e-06, "loss": 0.142, "num_input_tokens_seen": 205883648, "step": 95415 }, { "epoch": 15.566068515497554, "grad_norm": 0.3757335841655731, "learning_rate": 7.120164207428303e-06, "loss": 0.0915, "num_input_tokens_seen": 205895616, "step": 95420 }, { "epoch": 15.566884176182707, "grad_norm": 0.3102898895740509, "learning_rate": 7.117676915136984e-06, "loss": 0.0669, "num_input_tokens_seen": 205906624, "step": 95425 }, { "epoch": 15.567699836867863, "grad_norm": 0.06887758523225784, "learning_rate": 7.115189985253626e-06, "loss": 0.0232, "num_input_tokens_seen": 205916320, "step": 95430 }, { "epoch": 15.568515497553017, "grad_norm": 0.3676247298717499, "learning_rate": 7.112703417828631e-06, "loss": 0.2701, "num_input_tokens_seen": 205928064, "step": 95435 }, { "epoch": 15.569331158238173, "grad_norm": 0.6841440200805664, "learning_rate": 7.110217212912393e-06, "loss": 0.1452, "num_input_tokens_seen": 205938496, "step": 95440 }, { "epoch": 15.570146818923329, "grad_norm": 0.04568805918097496, "learning_rate": 7.1077313705552985e-06, "loss": 0.1163, "num_input_tokens_seen": 205949440, "step": 95445 }, { "epoch": 15.570962479608482, "grad_norm": 0.060766275972127914, "learning_rate": 7.105245890807727e-06, "loss": 0.0132, "num_input_tokens_seen": 205961792, "step": 95450 }, { "epoch": 15.571778140293638, "grad_norm": 1.999355673789978, "learning_rate": 7.1027607737200465e-06, "loss": 0.1185, "num_input_tokens_seen": 205972960, "step": 95455 }, { "epoch": 15.572593800978792, "grad_norm": 1.013197898864746, "learning_rate": 7.100276019342625e-06, "loss": 0.0561, "num_input_tokens_seen": 205984192, "step": 95460 }, { "epoch": 15.573409461663948, "grad_norm": 0.13754037022590637, "learning_rate": 7.097791627725817e-06, "loss": 0.034, "num_input_tokens_seen": 205995424, "step": 95465 }, { "epoch": 15.574225122349104, "grad_norm": 4.14033317565918, "learning_rate": 7.0953075989199645e-06, "loss": 0.1682, "num_input_tokens_seen": 206007552, "step": 95470 }, { "epoch": 15.575040783034257, "grad_norm": 0.2934022843837738, "learning_rate": 7.092823932975431e-06, "loss": 0.0259, "num_input_tokens_seen": 206018048, "step": 95475 }, { "epoch": 15.575856443719413, "grad_norm": 0.10781611502170563, "learning_rate": 7.090340629942524e-06, "loss": 0.0313, "num_input_tokens_seen": 206028992, "step": 95480 }, { "epoch": 15.576672104404567, "grad_norm": 0.24521706998348236, "learning_rate": 7.087857689871599e-06, "loss": 0.0674, "num_input_tokens_seen": 206039584, "step": 95485 }, { "epoch": 15.577487765089723, "grad_norm": 0.12613573670387268, "learning_rate": 7.085375112812944e-06, "loss": 0.1366, "num_input_tokens_seen": 206050624, "step": 95490 }, { "epoch": 15.578303425774878, "grad_norm": 0.09123612940311432, "learning_rate": 7.082892898816906e-06, "loss": 0.1128, "num_input_tokens_seen": 206062624, "step": 95495 }, { "epoch": 15.579119086460032, "grad_norm": 0.05150054767727852, "learning_rate": 7.080411047933758e-06, "loss": 0.0163, "num_input_tokens_seen": 206073728, "step": 95500 }, { "epoch": 15.579934747145188, "grad_norm": 1.2752094268798828, "learning_rate": 7.077929560213817e-06, "loss": 0.0662, "num_input_tokens_seen": 206084256, "step": 95505 }, { "epoch": 15.580750407830342, "grad_norm": 0.02691786363720894, "learning_rate": 7.075448435707371e-06, "loss": 0.0396, "num_input_tokens_seen": 206095040, "step": 95510 }, { "epoch": 15.581566068515498, "grad_norm": 0.31265681982040405, "learning_rate": 7.072967674464704e-06, "loss": 0.0293, "num_input_tokens_seen": 206105216, "step": 95515 }, { "epoch": 15.582381729200652, "grad_norm": 1.4012494087219238, "learning_rate": 7.070487276536087e-06, "loss": 0.1073, "num_input_tokens_seen": 206116768, "step": 95520 }, { "epoch": 15.583197389885807, "grad_norm": 0.03032671846449375, "learning_rate": 7.0680072419717884e-06, "loss": 0.0121, "num_input_tokens_seen": 206128608, "step": 95525 }, { "epoch": 15.584013050570963, "grad_norm": 0.08071320503950119, "learning_rate": 7.065527570822075e-06, "loss": 0.1313, "num_input_tokens_seen": 206139744, "step": 95530 }, { "epoch": 15.584828711256117, "grad_norm": 0.2286117672920227, "learning_rate": 7.063048263137195e-06, "loss": 0.034, "num_input_tokens_seen": 206151168, "step": 95535 }, { "epoch": 15.585644371941273, "grad_norm": 0.21096573770046234, "learning_rate": 7.060569318967397e-06, "loss": 0.1054, "num_input_tokens_seen": 206162400, "step": 95540 }, { "epoch": 15.586460032626427, "grad_norm": 0.17841407656669617, "learning_rate": 7.058090738362918e-06, "loss": 0.0439, "num_input_tokens_seen": 206173984, "step": 95545 }, { "epoch": 15.587275693311582, "grad_norm": 0.7184702157974243, "learning_rate": 7.0556125213739945e-06, "loss": 0.0789, "num_input_tokens_seen": 206183488, "step": 95550 }, { "epoch": 15.588091353996738, "grad_norm": 0.05651906877756119, "learning_rate": 7.053134668050845e-06, "loss": 0.1067, "num_input_tokens_seen": 206193120, "step": 95555 }, { "epoch": 15.588907014681892, "grad_norm": 0.6686608195304871, "learning_rate": 7.05065717844369e-06, "loss": 0.1003, "num_input_tokens_seen": 206204576, "step": 95560 }, { "epoch": 15.589722675367048, "grad_norm": 1.7910308837890625, "learning_rate": 7.048180052602737e-06, "loss": 0.1447, "num_input_tokens_seen": 206214752, "step": 95565 }, { "epoch": 15.590538336052202, "grad_norm": 1.5915234088897705, "learning_rate": 7.045703290578187e-06, "loss": 0.1191, "num_input_tokens_seen": 206226528, "step": 95570 }, { "epoch": 15.591353996737357, "grad_norm": 0.11740688234567642, "learning_rate": 7.043226892420232e-06, "loss": 0.2177, "num_input_tokens_seen": 206238912, "step": 95575 }, { "epoch": 15.592169657422513, "grad_norm": 2.532559871673584, "learning_rate": 7.040750858179076e-06, "loss": 0.0837, "num_input_tokens_seen": 206250624, "step": 95580 }, { "epoch": 15.592985318107667, "grad_norm": 0.06860779225826263, "learning_rate": 7.038275187904875e-06, "loss": 0.0781, "num_input_tokens_seen": 206262240, "step": 95585 }, { "epoch": 15.593800978792823, "grad_norm": 1.6308536529541016, "learning_rate": 7.0357998816478256e-06, "loss": 0.1163, "num_input_tokens_seen": 206271456, "step": 95590 }, { "epoch": 15.594616639477977, "grad_norm": 0.2406580150127411, "learning_rate": 7.033324939458066e-06, "loss": 0.093, "num_input_tokens_seen": 206282272, "step": 95595 }, { "epoch": 15.595432300163132, "grad_norm": 0.07271069288253784, "learning_rate": 7.030850361385785e-06, "loss": 0.0669, "num_input_tokens_seen": 206293152, "step": 95600 }, { "epoch": 15.596247960848288, "grad_norm": 0.06457407027482986, "learning_rate": 7.0283761474811e-06, "loss": 0.016, "num_input_tokens_seen": 206303072, "step": 95605 }, { "epoch": 15.597063621533442, "grad_norm": 0.03777627274394035, "learning_rate": 7.0259022977941875e-06, "loss": 0.1441, "num_input_tokens_seen": 206314304, "step": 95610 }, { "epoch": 15.597879282218598, "grad_norm": 1.7652194499969482, "learning_rate": 7.023428812375152e-06, "loss": 0.0844, "num_input_tokens_seen": 206324672, "step": 95615 }, { "epoch": 15.598694942903752, "grad_norm": 1.142081379890442, "learning_rate": 7.020955691274153e-06, "loss": 0.0536, "num_input_tokens_seen": 206335360, "step": 95620 }, { "epoch": 15.599510603588907, "grad_norm": 0.11416281759738922, "learning_rate": 7.018482934541276e-06, "loss": 0.0383, "num_input_tokens_seen": 206347776, "step": 95625 }, { "epoch": 15.600326264274061, "grad_norm": 2.077545404434204, "learning_rate": 7.0160105422266725e-06, "loss": 0.0775, "num_input_tokens_seen": 206358880, "step": 95630 }, { "epoch": 15.601141924959217, "grad_norm": 0.4186468720436096, "learning_rate": 7.0135385143804106e-06, "loss": 0.0695, "num_input_tokens_seen": 206369920, "step": 95635 }, { "epoch": 15.601957585644373, "grad_norm": 0.11087266355752945, "learning_rate": 7.011066851052625e-06, "loss": 0.0828, "num_input_tokens_seen": 206379776, "step": 95640 }, { "epoch": 15.602773246329527, "grad_norm": 0.10337500274181366, "learning_rate": 7.008595552293376e-06, "loss": 0.1011, "num_input_tokens_seen": 206390496, "step": 95645 }, { "epoch": 15.603588907014682, "grad_norm": 2.35565447807312, "learning_rate": 7.006124618152771e-06, "loss": 0.1014, "num_input_tokens_seen": 206403104, "step": 95650 }, { "epoch": 15.604404567699836, "grad_norm": 1.7595157623291016, "learning_rate": 7.0036540486808744e-06, "loss": 0.2488, "num_input_tokens_seen": 206414368, "step": 95655 }, { "epoch": 15.605220228384992, "grad_norm": 0.5892563462257385, "learning_rate": 7.0011838439277585e-06, "loss": 0.0314, "num_input_tokens_seen": 206425248, "step": 95660 }, { "epoch": 15.606035889070148, "grad_norm": 0.054449163377285004, "learning_rate": 6.998714003943488e-06, "loss": 0.1275, "num_input_tokens_seen": 206435008, "step": 95665 }, { "epoch": 15.606851549755302, "grad_norm": 0.18049561977386475, "learning_rate": 6.996244528778112e-06, "loss": 0.1869, "num_input_tokens_seen": 206446048, "step": 95670 }, { "epoch": 15.607667210440457, "grad_norm": 0.6489474773406982, "learning_rate": 6.993775418481679e-06, "loss": 0.0355, "num_input_tokens_seen": 206458144, "step": 95675 }, { "epoch": 15.608482871125611, "grad_norm": 0.060449931770563126, "learning_rate": 6.991306673104231e-06, "loss": 0.0937, "num_input_tokens_seen": 206468320, "step": 95680 }, { "epoch": 15.609298531810767, "grad_norm": 0.1322067826986313, "learning_rate": 6.988838292695798e-06, "loss": 0.0145, "num_input_tokens_seen": 206478720, "step": 95685 }, { "epoch": 15.61011419249592, "grad_norm": 1.597001075744629, "learning_rate": 6.9863702773064085e-06, "loss": 0.1125, "num_input_tokens_seen": 206489248, "step": 95690 }, { "epoch": 15.610929853181077, "grad_norm": 2.6856062412261963, "learning_rate": 6.983902626986074e-06, "loss": 0.0533, "num_input_tokens_seen": 206499040, "step": 95695 }, { "epoch": 15.611745513866232, "grad_norm": 0.9044954776763916, "learning_rate": 6.981435341784803e-06, "loss": 0.0407, "num_input_tokens_seen": 206510560, "step": 95700 }, { "epoch": 15.612561174551386, "grad_norm": 0.17301513254642487, "learning_rate": 6.9789684217526185e-06, "loss": 0.0188, "num_input_tokens_seen": 206521408, "step": 95705 }, { "epoch": 15.613376835236542, "grad_norm": 0.09313564747571945, "learning_rate": 6.976501866939483e-06, "loss": 0.056, "num_input_tokens_seen": 206532000, "step": 95710 }, { "epoch": 15.614192495921696, "grad_norm": 0.7002674341201782, "learning_rate": 6.9740356773954155e-06, "loss": 0.2797, "num_input_tokens_seen": 206543744, "step": 95715 }, { "epoch": 15.615008156606851, "grad_norm": 1.9741551876068115, "learning_rate": 6.971569853170371e-06, "loss": 0.1545, "num_input_tokens_seen": 206554144, "step": 95720 }, { "epoch": 15.615823817292007, "grad_norm": 2.625239133834839, "learning_rate": 6.969104394314346e-06, "loss": 0.12, "num_input_tokens_seen": 206563936, "step": 95725 }, { "epoch": 15.616639477977161, "grad_norm": 0.2391899824142456, "learning_rate": 6.9666393008772815e-06, "loss": 0.054, "num_input_tokens_seen": 206575840, "step": 95730 }, { "epoch": 15.617455138662317, "grad_norm": 0.3431456983089447, "learning_rate": 6.964174572909163e-06, "loss": 0.0787, "num_input_tokens_seen": 206586720, "step": 95735 }, { "epoch": 15.61827079934747, "grad_norm": 0.8286740779876709, "learning_rate": 6.961710210459913e-06, "loss": 0.0325, "num_input_tokens_seen": 206595456, "step": 95740 }, { "epoch": 15.619086460032626, "grad_norm": 0.0691496878862381, "learning_rate": 6.959246213579501e-06, "loss": 0.0112, "num_input_tokens_seen": 206606016, "step": 95745 }, { "epoch": 15.619902120717782, "grad_norm": 0.09469333291053772, "learning_rate": 6.956782582317836e-06, "loss": 0.0222, "num_input_tokens_seen": 206617184, "step": 95750 }, { "epoch": 15.620717781402936, "grad_norm": 0.03210965543985367, "learning_rate": 6.954319316724878e-06, "loss": 0.0122, "num_input_tokens_seen": 206629440, "step": 95755 }, { "epoch": 15.621533442088092, "grad_norm": 0.07149900496006012, "learning_rate": 6.9518564168505165e-06, "loss": 0.0818, "num_input_tokens_seen": 206640608, "step": 95760 }, { "epoch": 15.622349102773246, "grad_norm": 1.7735955715179443, "learning_rate": 6.949393882744696e-06, "loss": 0.106, "num_input_tokens_seen": 206651424, "step": 95765 }, { "epoch": 15.623164763458401, "grad_norm": 0.3785000443458557, "learning_rate": 6.946931714457292e-06, "loss": 0.1659, "num_input_tokens_seen": 206662752, "step": 95770 }, { "epoch": 15.623980424143557, "grad_norm": 0.2710239589214325, "learning_rate": 6.944469912038232e-06, "loss": 0.1137, "num_input_tokens_seen": 206673056, "step": 95775 }, { "epoch": 15.624796084828711, "grad_norm": 0.060016948729753494, "learning_rate": 6.942008475537379e-06, "loss": 0.0975, "num_input_tokens_seen": 206684032, "step": 95780 }, { "epoch": 15.625611745513867, "grad_norm": 0.2424556165933609, "learning_rate": 6.939547405004643e-06, "loss": 0.1262, "num_input_tokens_seen": 206695456, "step": 95785 }, { "epoch": 15.62642740619902, "grad_norm": 0.04851452261209488, "learning_rate": 6.937086700489886e-06, "loss": 0.1049, "num_input_tokens_seen": 206705760, "step": 95790 }, { "epoch": 15.627243066884176, "grad_norm": 0.12769772112369537, "learning_rate": 6.934626362042987e-06, "loss": 0.0137, "num_input_tokens_seen": 206716928, "step": 95795 }, { "epoch": 15.62805872756933, "grad_norm": 0.08482397347688675, "learning_rate": 6.932166389713798e-06, "loss": 0.0415, "num_input_tokens_seen": 206727424, "step": 95800 }, { "epoch": 15.628874388254486, "grad_norm": 1.0318530797958374, "learning_rate": 6.929706783552179e-06, "loss": 0.0386, "num_input_tokens_seen": 206739104, "step": 95805 }, { "epoch": 15.629690048939642, "grad_norm": 0.07999173551797867, "learning_rate": 6.927247543607976e-06, "loss": 0.0077, "num_input_tokens_seen": 206750112, "step": 95810 }, { "epoch": 15.630505709624796, "grad_norm": 1.3868110179901123, "learning_rate": 6.924788669931029e-06, "loss": 0.1946, "num_input_tokens_seen": 206761600, "step": 95815 }, { "epoch": 15.631321370309951, "grad_norm": 0.013523521833121777, "learning_rate": 6.922330162571167e-06, "loss": 0.1946, "num_input_tokens_seen": 206772704, "step": 95820 }, { "epoch": 15.632137030995105, "grad_norm": 0.056981462985277176, "learning_rate": 6.919872021578219e-06, "loss": 0.1568, "num_input_tokens_seen": 206783360, "step": 95825 }, { "epoch": 15.632952691680261, "grad_norm": 0.07050596177577972, "learning_rate": 6.917414247002002e-06, "loss": 0.1256, "num_input_tokens_seen": 206794720, "step": 95830 }, { "epoch": 15.633768352365417, "grad_norm": 0.3466874361038208, "learning_rate": 6.914956838892325e-06, "loss": 0.1201, "num_input_tokens_seen": 206804576, "step": 95835 }, { "epoch": 15.63458401305057, "grad_norm": 0.17542627453804016, "learning_rate": 6.912499797298988e-06, "loss": 0.0144, "num_input_tokens_seen": 206815680, "step": 95840 }, { "epoch": 15.635399673735726, "grad_norm": 0.16118957102298737, "learning_rate": 6.910043122271781e-06, "loss": 0.1091, "num_input_tokens_seen": 206826688, "step": 95845 }, { "epoch": 15.63621533442088, "grad_norm": 0.06314952671527863, "learning_rate": 6.907586813860514e-06, "loss": 0.332, "num_input_tokens_seen": 206837952, "step": 95850 }, { "epoch": 15.637030995106036, "grad_norm": 1.7842342853546143, "learning_rate": 6.905130872114937e-06, "loss": 0.1567, "num_input_tokens_seen": 206849184, "step": 95855 }, { "epoch": 15.63784665579119, "grad_norm": 0.0388079471886158, "learning_rate": 6.902675297084854e-06, "loss": 0.0408, "num_input_tokens_seen": 206860160, "step": 95860 }, { "epoch": 15.638662316476346, "grad_norm": 0.052230559289455414, "learning_rate": 6.90022008882e-06, "loss": 0.0309, "num_input_tokens_seen": 206871232, "step": 95865 }, { "epoch": 15.639477977161501, "grad_norm": 0.3171374201774597, "learning_rate": 6.897765247370161e-06, "loss": 0.015, "num_input_tokens_seen": 206880928, "step": 95870 }, { "epoch": 15.640293637846655, "grad_norm": 2.4467239379882812, "learning_rate": 6.895310772785058e-06, "loss": 0.2718, "num_input_tokens_seen": 206891392, "step": 95875 }, { "epoch": 15.641109298531811, "grad_norm": 1.288378119468689, "learning_rate": 6.892856665114464e-06, "loss": 0.0491, "num_input_tokens_seen": 206901856, "step": 95880 }, { "epoch": 15.641924959216965, "grad_norm": 0.03932071849703789, "learning_rate": 6.890402924408088e-06, "loss": 0.0398, "num_input_tokens_seen": 206913440, "step": 95885 }, { "epoch": 15.64274061990212, "grad_norm": 0.759288489818573, "learning_rate": 6.887949550715675e-06, "loss": 0.0158, "num_input_tokens_seen": 206924608, "step": 95890 }, { "epoch": 15.643556280587276, "grad_norm": 0.08121147006750107, "learning_rate": 6.885496544086942e-06, "loss": 0.0706, "num_input_tokens_seen": 206934880, "step": 95895 }, { "epoch": 15.64437194127243, "grad_norm": 0.16592809557914734, "learning_rate": 6.883043904571601e-06, "loss": 0.0246, "num_input_tokens_seen": 206946240, "step": 95900 }, { "epoch": 15.645187601957586, "grad_norm": 1.6026403903961182, "learning_rate": 6.88059163221936e-06, "loss": 0.1355, "num_input_tokens_seen": 206956416, "step": 95905 }, { "epoch": 15.64600326264274, "grad_norm": 0.30791687965393066, "learning_rate": 6.878139727079913e-06, "loss": 0.1853, "num_input_tokens_seen": 206967008, "step": 95910 }, { "epoch": 15.646818923327896, "grad_norm": 0.0439537838101387, "learning_rate": 6.875688189202955e-06, "loss": 0.0523, "num_input_tokens_seen": 206979456, "step": 95915 }, { "epoch": 15.647634584013051, "grad_norm": 0.031286969780921936, "learning_rate": 6.873237018638168e-06, "loss": 0.0382, "num_input_tokens_seen": 206989856, "step": 95920 }, { "epoch": 15.648450244698205, "grad_norm": 1.4881852865219116, "learning_rate": 6.870786215435229e-06, "loss": 0.0303, "num_input_tokens_seen": 207000224, "step": 95925 }, { "epoch": 15.649265905383361, "grad_norm": 0.029875652864575386, "learning_rate": 6.8683357796438054e-06, "loss": 0.2387, "num_input_tokens_seen": 207011264, "step": 95930 }, { "epoch": 15.650081566068515, "grad_norm": 0.5377532243728638, "learning_rate": 6.865885711313561e-06, "loss": 0.1329, "num_input_tokens_seen": 207021344, "step": 95935 }, { "epoch": 15.65089722675367, "grad_norm": 0.07542270421981812, "learning_rate": 6.863436010494146e-06, "loss": 0.018, "num_input_tokens_seen": 207032320, "step": 95940 }, { "epoch": 15.651712887438826, "grad_norm": 0.41973617672920227, "learning_rate": 6.8609866772352065e-06, "loss": 0.1363, "num_input_tokens_seen": 207042848, "step": 95945 }, { "epoch": 15.65252854812398, "grad_norm": 1.2623295783996582, "learning_rate": 6.858537711586383e-06, "loss": 0.1393, "num_input_tokens_seen": 207053344, "step": 95950 }, { "epoch": 15.653344208809136, "grad_norm": 0.07325929403305054, "learning_rate": 6.856089113597308e-06, "loss": 0.0339, "num_input_tokens_seen": 207063584, "step": 95955 }, { "epoch": 15.65415986949429, "grad_norm": 0.029979972168803215, "learning_rate": 6.853640883317603e-06, "loss": 0.1002, "num_input_tokens_seen": 207075008, "step": 95960 }, { "epoch": 15.654975530179446, "grad_norm": 0.025374719873070717, "learning_rate": 6.851193020796886e-06, "loss": 0.1372, "num_input_tokens_seen": 207086560, "step": 95965 }, { "epoch": 15.655791190864601, "grad_norm": 0.06139308214187622, "learning_rate": 6.848745526084768e-06, "loss": 0.0183, "num_input_tokens_seen": 207097024, "step": 95970 }, { "epoch": 15.656606851549755, "grad_norm": 1.5289338827133179, "learning_rate": 6.846298399230847e-06, "loss": 0.1933, "num_input_tokens_seen": 207107904, "step": 95975 }, { "epoch": 15.65742251223491, "grad_norm": 0.08444491773843765, "learning_rate": 6.843851640284718e-06, "loss": 0.0936, "num_input_tokens_seen": 207119328, "step": 95980 }, { "epoch": 15.658238172920065, "grad_norm": 3.1189846992492676, "learning_rate": 6.84140524929597e-06, "loss": 0.1791, "num_input_tokens_seen": 207129504, "step": 95985 }, { "epoch": 15.65905383360522, "grad_norm": 0.09756597131490707, "learning_rate": 6.838959226314171e-06, "loss": 0.035, "num_input_tokens_seen": 207139616, "step": 95990 }, { "epoch": 15.659869494290374, "grad_norm": 0.051502399146556854, "learning_rate": 6.836513571388919e-06, "loss": 0.0051, "num_input_tokens_seen": 207149600, "step": 95995 }, { "epoch": 15.66068515497553, "grad_norm": 0.08141589909791946, "learning_rate": 6.834068284569745e-06, "loss": 0.0742, "num_input_tokens_seen": 207160192, "step": 96000 }, { "epoch": 15.661500815660686, "grad_norm": 1.536562204360962, "learning_rate": 6.831623365906231e-06, "loss": 0.0407, "num_input_tokens_seen": 207170912, "step": 96005 }, { "epoch": 15.66231647634584, "grad_norm": 1.1265164613723755, "learning_rate": 6.8291788154479175e-06, "loss": 0.0746, "num_input_tokens_seen": 207180896, "step": 96010 }, { "epoch": 15.663132137030995, "grad_norm": 2.117661952972412, "learning_rate": 6.826734633244344e-06, "loss": 0.1486, "num_input_tokens_seen": 207192672, "step": 96015 }, { "epoch": 15.66394779771615, "grad_norm": 0.14761894941329956, "learning_rate": 6.82429081934505e-06, "loss": 0.0627, "num_input_tokens_seen": 207202976, "step": 96020 }, { "epoch": 15.664763458401305, "grad_norm": 0.06310168653726578, "learning_rate": 6.821847373799561e-06, "loss": 0.0304, "num_input_tokens_seen": 207213344, "step": 96025 }, { "epoch": 15.66557911908646, "grad_norm": 0.07678576558828354, "learning_rate": 6.819404296657395e-06, "loss": 0.0688, "num_input_tokens_seen": 207224224, "step": 96030 }, { "epoch": 15.666394779771615, "grad_norm": 0.4023083448410034, "learning_rate": 6.816961587968063e-06, "loss": 0.2053, "num_input_tokens_seen": 207233984, "step": 96035 }, { "epoch": 15.66721044045677, "grad_norm": 0.03747589513659477, "learning_rate": 6.814519247781073e-06, "loss": 0.1446, "num_input_tokens_seen": 207245216, "step": 96040 }, { "epoch": 15.668026101141924, "grad_norm": 0.14102207124233246, "learning_rate": 6.812077276145917e-06, "loss": 0.142, "num_input_tokens_seen": 207256000, "step": 96045 }, { "epoch": 15.66884176182708, "grad_norm": 0.07216119021177292, "learning_rate": 6.809635673112091e-06, "loss": 0.0075, "num_input_tokens_seen": 207265792, "step": 96050 }, { "epoch": 15.669657422512234, "grad_norm": 2.560182809829712, "learning_rate": 6.807194438729075e-06, "loss": 0.0653, "num_input_tokens_seen": 207277760, "step": 96055 }, { "epoch": 15.67047308319739, "grad_norm": 0.1471065729856491, "learning_rate": 6.804753573046341e-06, "loss": 0.0291, "num_input_tokens_seen": 207288608, "step": 96060 }, { "epoch": 15.671288743882545, "grad_norm": 0.8954663872718811, "learning_rate": 6.802313076113359e-06, "loss": 0.2012, "num_input_tokens_seen": 207298656, "step": 96065 }, { "epoch": 15.6721044045677, "grad_norm": 0.722160279750824, "learning_rate": 6.799872947979588e-06, "loss": 0.1223, "num_input_tokens_seen": 207310016, "step": 96070 }, { "epoch": 15.672920065252855, "grad_norm": 0.03252672404050827, "learning_rate": 6.79743318869448e-06, "loss": 0.0523, "num_input_tokens_seen": 207321312, "step": 96075 }, { "epoch": 15.673735725938009, "grad_norm": 0.10630763322114944, "learning_rate": 6.7949937983074805e-06, "loss": 0.0518, "num_input_tokens_seen": 207331424, "step": 96080 }, { "epoch": 15.674551386623165, "grad_norm": 0.05444115772843361, "learning_rate": 6.792554776868026e-06, "loss": 0.0105, "num_input_tokens_seen": 207341216, "step": 96085 }, { "epoch": 15.67536704730832, "grad_norm": 0.04038574919104576, "learning_rate": 6.790116124425547e-06, "loss": 0.0834, "num_input_tokens_seen": 207352192, "step": 96090 }, { "epoch": 15.676182707993474, "grad_norm": 0.061812445521354675, "learning_rate": 6.787677841029466e-06, "loss": 0.0608, "num_input_tokens_seen": 207363328, "step": 96095 }, { "epoch": 15.67699836867863, "grad_norm": 0.04943094402551651, "learning_rate": 6.785239926729195e-06, "loss": 0.015, "num_input_tokens_seen": 207372960, "step": 96100 }, { "epoch": 15.677814029363784, "grad_norm": 0.9168797135353088, "learning_rate": 6.782802381574147e-06, "loss": 0.1024, "num_input_tokens_seen": 207384192, "step": 96105 }, { "epoch": 15.67862969004894, "grad_norm": 0.030813556164503098, "learning_rate": 6.780365205613709e-06, "loss": 0.0212, "num_input_tokens_seen": 207395200, "step": 96110 }, { "epoch": 15.679445350734095, "grad_norm": 1.4170368909835815, "learning_rate": 6.777928398897302e-06, "loss": 0.0366, "num_input_tokens_seen": 207406368, "step": 96115 }, { "epoch": 15.68026101141925, "grad_norm": 0.7027127742767334, "learning_rate": 6.775491961474273e-06, "loss": 0.0149, "num_input_tokens_seen": 207415040, "step": 96120 }, { "epoch": 15.681076672104405, "grad_norm": 0.45718052983283997, "learning_rate": 6.773055893394037e-06, "loss": 0.1321, "num_input_tokens_seen": 207425216, "step": 96125 }, { "epoch": 15.681892332789559, "grad_norm": 0.05236769840121269, "learning_rate": 6.7706201947059285e-06, "loss": 0.0208, "num_input_tokens_seen": 207436960, "step": 96130 }, { "epoch": 15.682707993474715, "grad_norm": 0.27322646975517273, "learning_rate": 6.7681848654593344e-06, "loss": 0.0312, "num_input_tokens_seen": 207447936, "step": 96135 }, { "epoch": 15.68352365415987, "grad_norm": 0.2872917056083679, "learning_rate": 6.765749905703605e-06, "loss": 0.1264, "num_input_tokens_seen": 207459712, "step": 96140 }, { "epoch": 15.684339314845024, "grad_norm": 1.852634310722351, "learning_rate": 6.7633153154880865e-06, "loss": 0.1105, "num_input_tokens_seen": 207471552, "step": 96145 }, { "epoch": 15.68515497553018, "grad_norm": 0.08442869782447815, "learning_rate": 6.7608810948621145e-06, "loss": 0.1483, "num_input_tokens_seen": 207483008, "step": 96150 }, { "epoch": 15.685970636215334, "grad_norm": 0.8479976654052734, "learning_rate": 6.7584472438750264e-06, "loss": 0.0237, "num_input_tokens_seen": 207494016, "step": 96155 }, { "epoch": 15.68678629690049, "grad_norm": 0.106073297560215, "learning_rate": 6.756013762576147e-06, "loss": 0.0067, "num_input_tokens_seen": 207503776, "step": 96160 }, { "epoch": 15.687601957585644, "grad_norm": 0.5464537143707275, "learning_rate": 6.753580651014793e-06, "loss": 0.1491, "num_input_tokens_seen": 207513568, "step": 96165 }, { "epoch": 15.6884176182708, "grad_norm": 1.287602424621582, "learning_rate": 6.751147909240274e-06, "loss": 0.117, "num_input_tokens_seen": 207525792, "step": 96170 }, { "epoch": 15.689233278955955, "grad_norm": 0.21122890710830688, "learning_rate": 6.748715537301892e-06, "loss": 0.0264, "num_input_tokens_seen": 207536320, "step": 96175 }, { "epoch": 15.690048939641109, "grad_norm": 0.14406061172485352, "learning_rate": 6.746283535248943e-06, "loss": 0.2126, "num_input_tokens_seen": 207547296, "step": 96180 }, { "epoch": 15.690864600326265, "grad_norm": 0.1825273483991623, "learning_rate": 6.7438519031307154e-06, "loss": 0.079, "num_input_tokens_seen": 207556736, "step": 96185 }, { "epoch": 15.691680261011419, "grad_norm": 0.03588827699422836, "learning_rate": 6.7414206409964875e-06, "loss": 0.0278, "num_input_tokens_seen": 207568768, "step": 96190 }, { "epoch": 15.692495921696574, "grad_norm": 0.32660841941833496, "learning_rate": 6.738989748895533e-06, "loss": 0.1017, "num_input_tokens_seen": 207579712, "step": 96195 }, { "epoch": 15.69331158238173, "grad_norm": 0.041411392390728, "learning_rate": 6.736559226877118e-06, "loss": 0.0285, "num_input_tokens_seen": 207590432, "step": 96200 }, { "epoch": 15.694127243066884, "grad_norm": 2.0445477962493896, "learning_rate": 6.7341290749905e-06, "loss": 0.1024, "num_input_tokens_seen": 207599904, "step": 96205 }, { "epoch": 15.69494290375204, "grad_norm": 0.011001142673194408, "learning_rate": 6.731699293284927e-06, "loss": 0.0296, "num_input_tokens_seen": 207610816, "step": 96210 }, { "epoch": 15.695758564437194, "grad_norm": 0.03586055710911751, "learning_rate": 6.729269881809635e-06, "loss": 0.0081, "num_input_tokens_seen": 207621632, "step": 96215 }, { "epoch": 15.69657422512235, "grad_norm": 0.21622136235237122, "learning_rate": 6.7268408406138825e-06, "loss": 0.1002, "num_input_tokens_seen": 207632224, "step": 96220 }, { "epoch": 15.697389885807503, "grad_norm": 0.25372615456581116, "learning_rate": 6.724412169746866e-06, "loss": 0.0942, "num_input_tokens_seen": 207643328, "step": 96225 }, { "epoch": 15.698205546492659, "grad_norm": 0.09035154432058334, "learning_rate": 6.721983869257836e-06, "loss": 0.07, "num_input_tokens_seen": 207654176, "step": 96230 }, { "epoch": 15.699021207177815, "grad_norm": 0.22502410411834717, "learning_rate": 6.719555939195976e-06, "loss": 0.196, "num_input_tokens_seen": 207664704, "step": 96235 }, { "epoch": 15.699836867862969, "grad_norm": 2.0209481716156006, "learning_rate": 6.717128379610519e-06, "loss": 0.1256, "num_input_tokens_seen": 207675200, "step": 96240 }, { "epoch": 15.700652528548124, "grad_norm": 0.04222431033849716, "learning_rate": 6.714701190550635e-06, "loss": 0.0354, "num_input_tokens_seen": 207686432, "step": 96245 }, { "epoch": 15.701468189233278, "grad_norm": 1.6168123483657837, "learning_rate": 6.712274372065544e-06, "loss": 0.1753, "num_input_tokens_seen": 207696320, "step": 96250 }, { "epoch": 15.702283849918434, "grad_norm": 0.5673998594284058, "learning_rate": 6.709847924204396e-06, "loss": 0.0239, "num_input_tokens_seen": 207707840, "step": 96255 }, { "epoch": 15.70309951060359, "grad_norm": 0.048694148659706116, "learning_rate": 6.707421847016396e-06, "loss": 0.0262, "num_input_tokens_seen": 207717920, "step": 96260 }, { "epoch": 15.703915171288743, "grad_norm": 1.1306623220443726, "learning_rate": 6.704996140550684e-06, "loss": 0.0769, "num_input_tokens_seen": 207728800, "step": 96265 }, { "epoch": 15.7047308319739, "grad_norm": 0.12203960865736008, "learning_rate": 6.702570804856451e-06, "loss": 0.11, "num_input_tokens_seen": 207739424, "step": 96270 }, { "epoch": 15.705546492659053, "grad_norm": 0.11801248043775558, "learning_rate": 6.700145839982816e-06, "loss": 0.0332, "num_input_tokens_seen": 207749568, "step": 96275 }, { "epoch": 15.706362153344209, "grad_norm": 0.12895843386650085, "learning_rate": 6.697721245978949e-06, "loss": 0.0538, "num_input_tokens_seen": 207759776, "step": 96280 }, { "epoch": 15.707177814029365, "grad_norm": 0.8071635961532593, "learning_rate": 6.6952970228939786e-06, "loss": 0.0973, "num_input_tokens_seen": 207771488, "step": 96285 }, { "epoch": 15.707993474714518, "grad_norm": 0.02809225581586361, "learning_rate": 6.692873170777034e-06, "loss": 0.0227, "num_input_tokens_seen": 207782112, "step": 96290 }, { "epoch": 15.708809135399674, "grad_norm": 0.08604870736598969, "learning_rate": 6.690449689677239e-06, "loss": 0.1654, "num_input_tokens_seen": 207792640, "step": 96295 }, { "epoch": 15.709624796084828, "grad_norm": 1.5102721452713013, "learning_rate": 6.6880265796437104e-06, "loss": 0.0485, "num_input_tokens_seen": 207802720, "step": 96300 }, { "epoch": 15.710440456769984, "grad_norm": 1.2420626878738403, "learning_rate": 6.685603840725552e-06, "loss": 0.14, "num_input_tokens_seen": 207813696, "step": 96305 }, { "epoch": 15.71125611745514, "grad_norm": 0.346991628408432, "learning_rate": 6.6831814729718665e-06, "loss": 0.0115, "num_input_tokens_seen": 207823840, "step": 96310 }, { "epoch": 15.712071778140293, "grad_norm": 0.31837475299835205, "learning_rate": 6.680759476431744e-06, "loss": 0.0443, "num_input_tokens_seen": 207835712, "step": 96315 }, { "epoch": 15.71288743882545, "grad_norm": 0.022898374125361443, "learning_rate": 6.67833785115427e-06, "loss": 0.061, "num_input_tokens_seen": 207847264, "step": 96320 }, { "epoch": 15.713703099510603, "grad_norm": 0.44500932097435, "learning_rate": 6.675916597188522e-06, "loss": 0.0615, "num_input_tokens_seen": 207857984, "step": 96325 }, { "epoch": 15.714518760195759, "grad_norm": 0.5353744626045227, "learning_rate": 6.673495714583563e-06, "loss": 0.1592, "num_input_tokens_seen": 207869568, "step": 96330 }, { "epoch": 15.715334420880914, "grad_norm": 1.5349643230438232, "learning_rate": 6.671075203388477e-06, "loss": 0.131, "num_input_tokens_seen": 207880576, "step": 96335 }, { "epoch": 15.716150081566068, "grad_norm": 0.21891340613365173, "learning_rate": 6.668655063652288e-06, "loss": 0.3047, "num_input_tokens_seen": 207891808, "step": 96340 }, { "epoch": 15.716965742251224, "grad_norm": 0.7197676301002502, "learning_rate": 6.666235295424075e-06, "loss": 0.1794, "num_input_tokens_seen": 207903456, "step": 96345 }, { "epoch": 15.717781402936378, "grad_norm": 0.16754686832427979, "learning_rate": 6.663815898752848e-06, "loss": 0.0677, "num_input_tokens_seen": 207913344, "step": 96350 }, { "epoch": 15.718597063621534, "grad_norm": 1.536994457244873, "learning_rate": 6.661396873687667e-06, "loss": 0.15, "num_input_tokens_seen": 207923456, "step": 96355 }, { "epoch": 15.719412724306688, "grad_norm": 0.3550632894039154, "learning_rate": 6.6589782202775256e-06, "loss": 0.3241, "num_input_tokens_seen": 207935328, "step": 96360 }, { "epoch": 15.720228384991843, "grad_norm": 0.16521304845809937, "learning_rate": 6.656559938571475e-06, "loss": 0.1221, "num_input_tokens_seen": 207945440, "step": 96365 }, { "epoch": 15.721044045676999, "grad_norm": 2.4299662113189697, "learning_rate": 6.654142028618493e-06, "loss": 0.2443, "num_input_tokens_seen": 207957024, "step": 96370 }, { "epoch": 15.721859706362153, "grad_norm": 0.23720146715641022, "learning_rate": 6.651724490467609e-06, "loss": 0.0743, "num_input_tokens_seen": 207967488, "step": 96375 }, { "epoch": 15.722675367047309, "grad_norm": 1.7126522064208984, "learning_rate": 6.649307324167792e-06, "loss": 0.0725, "num_input_tokens_seen": 207978400, "step": 96380 }, { "epoch": 15.723491027732463, "grad_norm": 0.5440821051597595, "learning_rate": 6.6468905297680544e-06, "loss": 0.0139, "num_input_tokens_seen": 207987744, "step": 96385 }, { "epoch": 15.724306688417618, "grad_norm": 0.6436014771461487, "learning_rate": 6.64447410731735e-06, "loss": 0.0393, "num_input_tokens_seen": 207998464, "step": 96390 }, { "epoch": 15.725122349102774, "grad_norm": 0.5591453909873962, "learning_rate": 6.642058056864678e-06, "loss": 0.1544, "num_input_tokens_seen": 208009440, "step": 96395 }, { "epoch": 15.725938009787928, "grad_norm": 0.06088123843073845, "learning_rate": 6.639642378458971e-06, "loss": 0.1281, "num_input_tokens_seen": 208020896, "step": 96400 }, { "epoch": 15.726753670473084, "grad_norm": 1.8495457172393799, "learning_rate": 6.637227072149219e-06, "loss": 0.1162, "num_input_tokens_seen": 208032160, "step": 96405 }, { "epoch": 15.727569331158238, "grad_norm": 0.31099483370780945, "learning_rate": 6.634812137984342e-06, "loss": 0.1576, "num_input_tokens_seen": 208042656, "step": 96410 }, { "epoch": 15.728384991843393, "grad_norm": 0.11802861094474792, "learning_rate": 6.632397576013308e-06, "loss": 0.0235, "num_input_tokens_seen": 208052640, "step": 96415 }, { "epoch": 15.729200652528547, "grad_norm": 1.2920771837234497, "learning_rate": 6.629983386285024e-06, "loss": 0.0358, "num_input_tokens_seen": 208064096, "step": 96420 }, { "epoch": 15.730016313213703, "grad_norm": 0.026208357885479927, "learning_rate": 6.627569568848438e-06, "loss": 0.0433, "num_input_tokens_seen": 208075040, "step": 96425 }, { "epoch": 15.730831973898859, "grad_norm": 0.5875062346458435, "learning_rate": 6.62515612375246e-06, "loss": 0.0388, "num_input_tokens_seen": 208086656, "step": 96430 }, { "epoch": 15.731647634584013, "grad_norm": 0.08252692967653275, "learning_rate": 6.622743051046005e-06, "loss": 0.017, "num_input_tokens_seen": 208097536, "step": 96435 }, { "epoch": 15.732463295269168, "grad_norm": 0.6523786783218384, "learning_rate": 6.620330350777975e-06, "loss": 0.0504, "num_input_tokens_seen": 208109568, "step": 96440 }, { "epoch": 15.733278955954322, "grad_norm": 2.14416241645813, "learning_rate": 6.617918022997266e-06, "loss": 0.2278, "num_input_tokens_seen": 208119808, "step": 96445 }, { "epoch": 15.734094616639478, "grad_norm": 1.4591374397277832, "learning_rate": 6.615506067752769e-06, "loss": 0.1362, "num_input_tokens_seen": 208129632, "step": 96450 }, { "epoch": 15.734910277324634, "grad_norm": 0.23036235570907593, "learning_rate": 6.613094485093363e-06, "loss": 0.0706, "num_input_tokens_seen": 208139968, "step": 96455 }, { "epoch": 15.735725938009788, "grad_norm": 0.6155000925064087, "learning_rate": 6.610683275067925e-06, "loss": 0.1557, "num_input_tokens_seen": 208151232, "step": 96460 }, { "epoch": 15.736541598694943, "grad_norm": 0.11475612968206406, "learning_rate": 6.608272437725316e-06, "loss": 0.0356, "num_input_tokens_seen": 208163200, "step": 96465 }, { "epoch": 15.737357259380097, "grad_norm": 0.15299931168556213, "learning_rate": 6.605861973114399e-06, "loss": 0.2363, "num_input_tokens_seen": 208175136, "step": 96470 }, { "epoch": 15.738172920065253, "grad_norm": 1.0274499654769897, "learning_rate": 6.603451881284014e-06, "loss": 0.0594, "num_input_tokens_seen": 208184960, "step": 96475 }, { "epoch": 15.738988580750409, "grad_norm": 1.863862156867981, "learning_rate": 6.601042162283033e-06, "loss": 0.119, "num_input_tokens_seen": 208195840, "step": 96480 }, { "epoch": 15.739804241435563, "grad_norm": 0.06022351235151291, "learning_rate": 6.598632816160255e-06, "loss": 0.0176, "num_input_tokens_seen": 208205888, "step": 96485 }, { "epoch": 15.740619902120718, "grad_norm": 0.18447434902191162, "learning_rate": 6.596223842964544e-06, "loss": 0.1155, "num_input_tokens_seen": 208215744, "step": 96490 }, { "epoch": 15.741435562805872, "grad_norm": 0.056086454540491104, "learning_rate": 6.593815242744689e-06, "loss": 0.0727, "num_input_tokens_seen": 208226784, "step": 96495 }, { "epoch": 15.742251223491028, "grad_norm": 0.050046585500240326, "learning_rate": 6.59140701554953e-06, "loss": 0.1058, "num_input_tokens_seen": 208238304, "step": 96500 }, { "epoch": 15.743066884176184, "grad_norm": 0.04422622546553612, "learning_rate": 6.588999161427847e-06, "loss": 0.017, "num_input_tokens_seen": 208249408, "step": 96505 }, { "epoch": 15.743882544861338, "grad_norm": 0.4930354654788971, "learning_rate": 6.586591680428469e-06, "loss": 0.3522, "num_input_tokens_seen": 208260736, "step": 96510 }, { "epoch": 15.744698205546493, "grad_norm": 1.7277326583862305, "learning_rate": 6.5841845726001524e-06, "loss": 0.0771, "num_input_tokens_seen": 208270336, "step": 96515 }, { "epoch": 15.745513866231647, "grad_norm": 0.061512093991041183, "learning_rate": 6.581777837991715e-06, "loss": 0.0287, "num_input_tokens_seen": 208281440, "step": 96520 }, { "epoch": 15.746329526916803, "grad_norm": 0.3546236455440521, "learning_rate": 6.579371476651896e-06, "loss": 0.0169, "num_input_tokens_seen": 208292128, "step": 96525 }, { "epoch": 15.747145187601957, "grad_norm": 1.3922761678695679, "learning_rate": 6.576965488629494e-06, "loss": 0.0937, "num_input_tokens_seen": 208303296, "step": 96530 }, { "epoch": 15.747960848287113, "grad_norm": 0.9658174514770508, "learning_rate": 6.5745598739732546e-06, "loss": 0.0667, "num_input_tokens_seen": 208314496, "step": 96535 }, { "epoch": 15.748776508972268, "grad_norm": 0.07434158772230148, "learning_rate": 6.572154632731936e-06, "loss": 0.1534, "num_input_tokens_seen": 208326464, "step": 96540 }, { "epoch": 15.749592169657422, "grad_norm": 0.21078817546367645, "learning_rate": 6.5697497649542805e-06, "loss": 0.0174, "num_input_tokens_seen": 208337056, "step": 96545 }, { "epoch": 15.750407830342578, "grad_norm": 0.20623816549777985, "learning_rate": 6.5673452706890265e-06, "loss": 0.0529, "num_input_tokens_seen": 208348192, "step": 96550 }, { "epoch": 15.751223491027732, "grad_norm": 0.04078781604766846, "learning_rate": 6.564941149984902e-06, "loss": 0.0996, "num_input_tokens_seen": 208359424, "step": 96555 }, { "epoch": 15.752039151712887, "grad_norm": 1.7086865901947021, "learning_rate": 6.562537402890634e-06, "loss": 0.1788, "num_input_tokens_seen": 208369472, "step": 96560 }, { "epoch": 15.752854812398043, "grad_norm": 2.247742176055908, "learning_rate": 6.560134029454934e-06, "loss": 0.0754, "num_input_tokens_seen": 208379552, "step": 96565 }, { "epoch": 15.753670473083197, "grad_norm": 0.042580533772706985, "learning_rate": 6.55773102972651e-06, "loss": 0.0154, "num_input_tokens_seen": 208390816, "step": 96570 }, { "epoch": 15.754486133768353, "grad_norm": 0.05205710604786873, "learning_rate": 6.555328403754063e-06, "loss": 0.0852, "num_input_tokens_seen": 208400704, "step": 96575 }, { "epoch": 15.755301794453507, "grad_norm": 3.4882149696350098, "learning_rate": 6.552926151586286e-06, "loss": 0.2302, "num_input_tokens_seen": 208410976, "step": 96580 }, { "epoch": 15.756117455138662, "grad_norm": 0.5175674557685852, "learning_rate": 6.55052427327186e-06, "loss": 0.0227, "num_input_tokens_seen": 208421824, "step": 96585 }, { "epoch": 15.756933115823816, "grad_norm": 0.17215052247047424, "learning_rate": 6.548122768859466e-06, "loss": 0.0675, "num_input_tokens_seen": 208432480, "step": 96590 }, { "epoch": 15.757748776508972, "grad_norm": 0.14586885273456573, "learning_rate": 6.545721638397772e-06, "loss": 0.0944, "num_input_tokens_seen": 208443360, "step": 96595 }, { "epoch": 15.758564437194128, "grad_norm": 0.049115411937236786, "learning_rate": 6.543320881935439e-06, "loss": 0.0265, "num_input_tokens_seen": 208453056, "step": 96600 }, { "epoch": 15.759380097879282, "grad_norm": 0.24696697294712067, "learning_rate": 6.540920499521122e-06, "loss": 0.0362, "num_input_tokens_seen": 208462560, "step": 96605 }, { "epoch": 15.760195758564437, "grad_norm": 0.5062825679779053, "learning_rate": 6.53852049120347e-06, "loss": 0.0274, "num_input_tokens_seen": 208474304, "step": 96610 }, { "epoch": 15.761011419249591, "grad_norm": 0.06646963208913803, "learning_rate": 6.53612085703112e-06, "loss": 0.2094, "num_input_tokens_seen": 208484896, "step": 96615 }, { "epoch": 15.761827079934747, "grad_norm": 0.7819457054138184, "learning_rate": 6.533721597052694e-06, "loss": 0.1398, "num_input_tokens_seen": 208496000, "step": 96620 }, { "epoch": 15.762642740619903, "grad_norm": 0.04205426201224327, "learning_rate": 6.531322711316842e-06, "loss": 0.1045, "num_input_tokens_seen": 208506688, "step": 96625 }, { "epoch": 15.763458401305057, "grad_norm": 5.0023016929626465, "learning_rate": 6.52892419987215e-06, "loss": 0.1248, "num_input_tokens_seen": 208517056, "step": 96630 }, { "epoch": 15.764274061990212, "grad_norm": 3.786830425262451, "learning_rate": 6.526526062767249e-06, "loss": 0.1707, "num_input_tokens_seen": 208526688, "step": 96635 }, { "epoch": 15.765089722675366, "grad_norm": 0.4489212930202484, "learning_rate": 6.524128300050733e-06, "loss": 0.1859, "num_input_tokens_seen": 208537792, "step": 96640 }, { "epoch": 15.765905383360522, "grad_norm": 0.03327278420329094, "learning_rate": 6.521730911771193e-06, "loss": 0.2437, "num_input_tokens_seen": 208547616, "step": 96645 }, { "epoch": 15.766721044045678, "grad_norm": 1.4360417127609253, "learning_rate": 6.51933389797722e-06, "loss": 0.1327, "num_input_tokens_seen": 208558880, "step": 96650 }, { "epoch": 15.767536704730832, "grad_norm": 0.12230093777179718, "learning_rate": 6.51693725871739e-06, "loss": 0.1061, "num_input_tokens_seen": 208570464, "step": 96655 }, { "epoch": 15.768352365415987, "grad_norm": 1.2628668546676636, "learning_rate": 6.514540994040272e-06, "loss": 0.2208, "num_input_tokens_seen": 208581920, "step": 96660 }, { "epoch": 15.769168026101141, "grad_norm": 0.1928972601890564, "learning_rate": 6.5121451039944325e-06, "loss": 0.1337, "num_input_tokens_seen": 208593504, "step": 96665 }, { "epoch": 15.769983686786297, "grad_norm": 0.04769292101264, "learning_rate": 6.509749588628425e-06, "loss": 0.0114, "num_input_tokens_seen": 208604384, "step": 96670 }, { "epoch": 15.770799347471453, "grad_norm": 0.14404769241809845, "learning_rate": 6.507354447990799e-06, "loss": 0.1053, "num_input_tokens_seen": 208614272, "step": 96675 }, { "epoch": 15.771615008156607, "grad_norm": 0.28973788022994995, "learning_rate": 6.504959682130096e-06, "loss": 0.1481, "num_input_tokens_seen": 208624704, "step": 96680 }, { "epoch": 15.772430668841762, "grad_norm": 0.08305352926254272, "learning_rate": 6.502565291094845e-06, "loss": 0.0867, "num_input_tokens_seen": 208634752, "step": 96685 }, { "epoch": 15.773246329526916, "grad_norm": 2.4456725120544434, "learning_rate": 6.500171274933575e-06, "loss": 0.288, "num_input_tokens_seen": 208645696, "step": 96690 }, { "epoch": 15.774061990212072, "grad_norm": 0.30630797147750854, "learning_rate": 6.4977776336948045e-06, "loss": 0.0167, "num_input_tokens_seen": 208656224, "step": 96695 }, { "epoch": 15.774877650897226, "grad_norm": 0.24298420548439026, "learning_rate": 6.49538436742704e-06, "loss": 0.0182, "num_input_tokens_seen": 208665984, "step": 96700 }, { "epoch": 15.775693311582382, "grad_norm": 0.0917285829782486, "learning_rate": 6.4929914761787895e-06, "loss": 0.0377, "num_input_tokens_seen": 208676768, "step": 96705 }, { "epoch": 15.776508972267537, "grad_norm": 0.24684648215770721, "learning_rate": 6.4905989599985395e-06, "loss": 0.0098, "num_input_tokens_seen": 208686496, "step": 96710 }, { "epoch": 15.777324632952691, "grad_norm": 0.37874820828437805, "learning_rate": 6.488206818934786e-06, "loss": 0.0938, "num_input_tokens_seen": 208698656, "step": 96715 }, { "epoch": 15.778140293637847, "grad_norm": 0.39745908975601196, "learning_rate": 6.485815053036004e-06, "loss": 0.0136, "num_input_tokens_seen": 208708064, "step": 96720 }, { "epoch": 15.778955954323001, "grad_norm": 2.2521824836730957, "learning_rate": 6.483423662350668e-06, "loss": 0.1292, "num_input_tokens_seen": 208717888, "step": 96725 }, { "epoch": 15.779771615008157, "grad_norm": 0.04887053370475769, "learning_rate": 6.4810326469272405e-06, "loss": 0.0923, "num_input_tokens_seen": 208728128, "step": 96730 }, { "epoch": 15.780587275693312, "grad_norm": 3.2855613231658936, "learning_rate": 6.4786420068141785e-06, "loss": 0.0913, "num_input_tokens_seen": 208738592, "step": 96735 }, { "epoch": 15.781402936378466, "grad_norm": 3.9584169387817383, "learning_rate": 6.4762517420599325e-06, "loss": 0.2436, "num_input_tokens_seen": 208748608, "step": 96740 }, { "epoch": 15.782218597063622, "grad_norm": 0.07526355236768723, "learning_rate": 6.473861852712945e-06, "loss": 0.0802, "num_input_tokens_seen": 208758784, "step": 96745 }, { "epoch": 15.783034257748776, "grad_norm": 2.405780792236328, "learning_rate": 6.471472338821641e-06, "loss": 0.3159, "num_input_tokens_seen": 208768832, "step": 96750 }, { "epoch": 15.783849918433932, "grad_norm": 0.3430122137069702, "learning_rate": 6.46908320043447e-06, "loss": 0.0246, "num_input_tokens_seen": 208779456, "step": 96755 }, { "epoch": 15.784665579119086, "grad_norm": 0.0335947722196579, "learning_rate": 6.466694437599821e-06, "loss": 0.0166, "num_input_tokens_seen": 208790784, "step": 96760 }, { "epoch": 15.785481239804241, "grad_norm": 1.1830483675003052, "learning_rate": 6.464306050366128e-06, "loss": 0.0869, "num_input_tokens_seen": 208801536, "step": 96765 }, { "epoch": 15.786296900489397, "grad_norm": 0.7821608781814575, "learning_rate": 6.461918038781789e-06, "loss": 0.1924, "num_input_tokens_seen": 208811360, "step": 96770 }, { "epoch": 15.78711256117455, "grad_norm": 0.07539485394954681, "learning_rate": 6.459530402895195e-06, "loss": 0.1013, "num_input_tokens_seen": 208823424, "step": 96775 }, { "epoch": 15.787928221859707, "grad_norm": 1.1962710618972778, "learning_rate": 6.45714314275474e-06, "loss": 0.1356, "num_input_tokens_seen": 208834368, "step": 96780 }, { "epoch": 15.78874388254486, "grad_norm": 0.7341863512992859, "learning_rate": 6.454756258408801e-06, "loss": 0.0406, "num_input_tokens_seen": 208845472, "step": 96785 }, { "epoch": 15.789559543230016, "grad_norm": 0.01445277500897646, "learning_rate": 6.452369749905754e-06, "loss": 0.0162, "num_input_tokens_seen": 208854688, "step": 96790 }, { "epoch": 15.790375203915172, "grad_norm": 0.046769678592681885, "learning_rate": 6.449983617293961e-06, "loss": 0.0541, "num_input_tokens_seen": 208864544, "step": 96795 }, { "epoch": 15.791190864600326, "grad_norm": 1.9804850816726685, "learning_rate": 6.447597860621784e-06, "loss": 0.1598, "num_input_tokens_seen": 208875744, "step": 96800 }, { "epoch": 15.792006525285482, "grad_norm": 0.042212896049022675, "learning_rate": 6.445212479937574e-06, "loss": 0.0184, "num_input_tokens_seen": 208886848, "step": 96805 }, { "epoch": 15.792822185970635, "grad_norm": 0.40559425950050354, "learning_rate": 6.44282747528967e-06, "loss": 0.0967, "num_input_tokens_seen": 208896672, "step": 96810 }, { "epoch": 15.793637846655791, "grad_norm": 0.4243473410606384, "learning_rate": 6.440442846726408e-06, "loss": 0.1429, "num_input_tokens_seen": 208906528, "step": 96815 }, { "epoch": 15.794453507340947, "grad_norm": 0.03297404199838638, "learning_rate": 6.438058594296115e-06, "loss": 0.0423, "num_input_tokens_seen": 208917120, "step": 96820 }, { "epoch": 15.7952691680261, "grad_norm": 0.12351193279027939, "learning_rate": 6.435674718047114e-06, "loss": 0.1129, "num_input_tokens_seen": 208927808, "step": 96825 }, { "epoch": 15.796084828711257, "grad_norm": 0.0929345116019249, "learning_rate": 6.433291218027715e-06, "loss": 0.0856, "num_input_tokens_seen": 208937952, "step": 96830 }, { "epoch": 15.79690048939641, "grad_norm": 0.33920302987098694, "learning_rate": 6.430908094286223e-06, "loss": 0.0235, "num_input_tokens_seen": 208949344, "step": 96835 }, { "epoch": 15.797716150081566, "grad_norm": 0.11273786425590515, "learning_rate": 6.428525346870934e-06, "loss": 0.0098, "num_input_tokens_seen": 208960064, "step": 96840 }, { "epoch": 15.798531810766722, "grad_norm": 0.05500125512480736, "learning_rate": 6.426142975830138e-06, "loss": 0.0059, "num_input_tokens_seen": 208971008, "step": 96845 }, { "epoch": 15.799347471451876, "grad_norm": 0.19792690873146057, "learning_rate": 6.423760981212118e-06, "loss": 0.0115, "num_input_tokens_seen": 208981696, "step": 96850 }, { "epoch": 15.800163132137031, "grad_norm": 2.6386256217956543, "learning_rate": 6.421379363065142e-06, "loss": 0.0962, "num_input_tokens_seen": 208993088, "step": 96855 }, { "epoch": 15.800978792822185, "grad_norm": 0.07028928399085999, "learning_rate": 6.418998121437494e-06, "loss": 0.0484, "num_input_tokens_seen": 209003712, "step": 96860 }, { "epoch": 15.801794453507341, "grad_norm": 1.0626733303070068, "learning_rate": 6.416617256377408e-06, "loss": 0.0846, "num_input_tokens_seen": 209014240, "step": 96865 }, { "epoch": 15.802610114192497, "grad_norm": 0.6572602987289429, "learning_rate": 6.41423676793316e-06, "loss": 0.0328, "num_input_tokens_seen": 209024480, "step": 96870 }, { "epoch": 15.80342577487765, "grad_norm": 1.4318801164627075, "learning_rate": 6.411856656152971e-06, "loss": 0.2173, "num_input_tokens_seen": 209036352, "step": 96875 }, { "epoch": 15.804241435562806, "grad_norm": 2.026888847351074, "learning_rate": 6.409476921085098e-06, "loss": 0.2998, "num_input_tokens_seen": 209046048, "step": 96880 }, { "epoch": 15.80505709624796, "grad_norm": 1.6841387748718262, "learning_rate": 6.407097562777744e-06, "loss": 0.24, "num_input_tokens_seen": 209057120, "step": 96885 }, { "epoch": 15.805872756933116, "grad_norm": 1.7592735290527344, "learning_rate": 6.404718581279162e-06, "loss": 0.1567, "num_input_tokens_seen": 209068256, "step": 96890 }, { "epoch": 15.80668841761827, "grad_norm": 0.7199226021766663, "learning_rate": 6.402339976637534e-06, "loss": 0.0335, "num_input_tokens_seen": 209080192, "step": 96895 }, { "epoch": 15.807504078303426, "grad_norm": 0.8189366459846497, "learning_rate": 6.3999617489010926e-06, "loss": 0.0518, "num_input_tokens_seen": 209091936, "step": 96900 }, { "epoch": 15.808319738988581, "grad_norm": 0.4503825306892395, "learning_rate": 6.397583898118006e-06, "loss": 0.122, "num_input_tokens_seen": 209104160, "step": 96905 }, { "epoch": 15.809135399673735, "grad_norm": 0.41207513213157654, "learning_rate": 6.395206424336489e-06, "loss": 0.0363, "num_input_tokens_seen": 209115136, "step": 96910 }, { "epoch": 15.809951060358891, "grad_norm": 1.7485954761505127, "learning_rate": 6.392829327604716e-06, "loss": 0.1272, "num_input_tokens_seen": 209126464, "step": 96915 }, { "epoch": 15.810766721044045, "grad_norm": 0.26291799545288086, "learning_rate": 6.390452607970862e-06, "loss": 0.0185, "num_input_tokens_seen": 209135808, "step": 96920 }, { "epoch": 15.8115823817292, "grad_norm": 0.012604249641299248, "learning_rate": 6.388076265483092e-06, "loss": 0.0829, "num_input_tokens_seen": 209144480, "step": 96925 }, { "epoch": 15.812398042414356, "grad_norm": 0.0412202887237072, "learning_rate": 6.3857003001895706e-06, "loss": 0.0645, "num_input_tokens_seen": 209155008, "step": 96930 }, { "epoch": 15.81321370309951, "grad_norm": 0.21714939177036285, "learning_rate": 6.383324712138444e-06, "loss": 0.0057, "num_input_tokens_seen": 209167136, "step": 96935 }, { "epoch": 15.814029363784666, "grad_norm": 0.6043254137039185, "learning_rate": 6.380949501377859e-06, "loss": 0.1967, "num_input_tokens_seen": 209179232, "step": 96940 }, { "epoch": 15.81484502446982, "grad_norm": 1.3350571393966675, "learning_rate": 6.378574667955953e-06, "loss": 0.2702, "num_input_tokens_seen": 209191264, "step": 96945 }, { "epoch": 15.815660685154976, "grad_norm": 0.04663282632827759, "learning_rate": 6.3762002119208545e-06, "loss": 0.1131, "num_input_tokens_seen": 209202176, "step": 96950 }, { "epoch": 15.81647634584013, "grad_norm": 0.601621687412262, "learning_rate": 6.373826133320685e-06, "loss": 0.0604, "num_input_tokens_seen": 209214208, "step": 96955 }, { "epoch": 15.817292006525285, "grad_norm": 0.23409290611743927, "learning_rate": 6.371452432203548e-06, "loss": 0.0142, "num_input_tokens_seen": 209225664, "step": 96960 }, { "epoch": 15.818107667210441, "grad_norm": 1.7098464965820312, "learning_rate": 6.369079108617576e-06, "loss": 0.2149, "num_input_tokens_seen": 209236608, "step": 96965 }, { "epoch": 15.818923327895595, "grad_norm": 0.5528028011322021, "learning_rate": 6.366706162610836e-06, "loss": 0.0893, "num_input_tokens_seen": 209247328, "step": 96970 }, { "epoch": 15.81973898858075, "grad_norm": 0.04549062252044678, "learning_rate": 6.364333594231447e-06, "loss": 0.0647, "num_input_tokens_seen": 209258304, "step": 96975 }, { "epoch": 15.820554649265905, "grad_norm": 1.0130897760391235, "learning_rate": 6.361961403527467e-06, "loss": 0.042, "num_input_tokens_seen": 209269056, "step": 96980 }, { "epoch": 15.82137030995106, "grad_norm": 0.04078308492898941, "learning_rate": 6.359589590546997e-06, "loss": 0.0432, "num_input_tokens_seen": 209280160, "step": 96985 }, { "epoch": 15.822185970636216, "grad_norm": 0.26193657517433167, "learning_rate": 6.3572181553380735e-06, "loss": 0.0281, "num_input_tokens_seen": 209291392, "step": 96990 }, { "epoch": 15.82300163132137, "grad_norm": 0.15862488746643066, "learning_rate": 6.3548470979487936e-06, "loss": 0.0522, "num_input_tokens_seen": 209302112, "step": 96995 }, { "epoch": 15.823817292006526, "grad_norm": 0.05739428848028183, "learning_rate": 6.352476418427173e-06, "loss": 0.0603, "num_input_tokens_seen": 209312576, "step": 97000 }, { "epoch": 15.82463295269168, "grad_norm": 1.262532114982605, "learning_rate": 6.350106116821289e-06, "loss": 0.1628, "num_input_tokens_seen": 209323936, "step": 97005 }, { "epoch": 15.825448613376835, "grad_norm": 0.10709060728549957, "learning_rate": 6.347736193179146e-06, "loss": 0.0333, "num_input_tokens_seen": 209335616, "step": 97010 }, { "epoch": 15.826264274061991, "grad_norm": 1.4953075647354126, "learning_rate": 6.345366647548809e-06, "loss": 0.2141, "num_input_tokens_seen": 209345856, "step": 97015 }, { "epoch": 15.827079934747145, "grad_norm": 0.022369345650076866, "learning_rate": 6.342997479978266e-06, "loss": 0.0405, "num_input_tokens_seen": 209356096, "step": 97020 }, { "epoch": 15.8278955954323, "grad_norm": 0.7257959246635437, "learning_rate": 6.340628690515562e-06, "loss": 0.0283, "num_input_tokens_seen": 209367104, "step": 97025 }, { "epoch": 15.828711256117455, "grad_norm": 1.9376702308654785, "learning_rate": 6.3382602792086746e-06, "loss": 0.2228, "num_input_tokens_seen": 209377472, "step": 97030 }, { "epoch": 15.82952691680261, "grad_norm": 0.1420000046491623, "learning_rate": 6.335892246105632e-06, "loss": 0.3835, "num_input_tokens_seen": 209388448, "step": 97035 }, { "epoch": 15.830342577487766, "grad_norm": 2.4961912631988525, "learning_rate": 6.333524591254395e-06, "loss": 0.2001, "num_input_tokens_seen": 209399136, "step": 97040 }, { "epoch": 15.83115823817292, "grad_norm": 0.25443127751350403, "learning_rate": 6.331157314702968e-06, "loss": 0.0382, "num_input_tokens_seen": 209409952, "step": 97045 }, { "epoch": 15.831973898858076, "grad_norm": 0.08281148225069046, "learning_rate": 6.328790416499322e-06, "loss": 0.0822, "num_input_tokens_seen": 209421632, "step": 97050 }, { "epoch": 15.83278955954323, "grad_norm": 1.6674115657806396, "learning_rate": 6.326423896691425e-06, "loss": 0.0859, "num_input_tokens_seen": 209432000, "step": 97055 }, { "epoch": 15.833605220228385, "grad_norm": 0.04405874386429787, "learning_rate": 6.324057755327237e-06, "loss": 0.0186, "num_input_tokens_seen": 209443232, "step": 97060 }, { "epoch": 15.83442088091354, "grad_norm": 0.7928524017333984, "learning_rate": 6.3216919924547116e-06, "loss": 0.1808, "num_input_tokens_seen": 209452192, "step": 97065 }, { "epoch": 15.835236541598695, "grad_norm": 0.03356797248125076, "learning_rate": 6.3193266081217895e-06, "loss": 0.0721, "num_input_tokens_seen": 209463488, "step": 97070 }, { "epoch": 15.83605220228385, "grad_norm": 0.4960254430770874, "learning_rate": 6.316961602376415e-06, "loss": 0.1821, "num_input_tokens_seen": 209474464, "step": 97075 }, { "epoch": 15.836867862969005, "grad_norm": 0.06949711591005325, "learning_rate": 6.314596975266515e-06, "loss": 0.0689, "num_input_tokens_seen": 209484896, "step": 97080 }, { "epoch": 15.83768352365416, "grad_norm": 2.898427963256836, "learning_rate": 6.312232726840009e-06, "loss": 0.1112, "num_input_tokens_seen": 209497280, "step": 97085 }, { "epoch": 15.838499184339314, "grad_norm": 0.27597561478614807, "learning_rate": 6.309868857144816e-06, "loss": 0.1772, "num_input_tokens_seen": 209508736, "step": 97090 }, { "epoch": 15.83931484502447, "grad_norm": 2.5314011573791504, "learning_rate": 6.307505366228838e-06, "loss": 0.0952, "num_input_tokens_seen": 209519808, "step": 97095 }, { "epoch": 15.840130505709626, "grad_norm": 2.084460496902466, "learning_rate": 6.305142254139978e-06, "loss": 0.2473, "num_input_tokens_seen": 209530592, "step": 97100 }, { "epoch": 15.84094616639478, "grad_norm": 2.852790355682373, "learning_rate": 6.302779520926122e-06, "loss": 0.2208, "num_input_tokens_seen": 209542592, "step": 97105 }, { "epoch": 15.841761827079935, "grad_norm": 0.5926704406738281, "learning_rate": 6.300417166635169e-06, "loss": 0.125, "num_input_tokens_seen": 209552416, "step": 97110 }, { "epoch": 15.84257748776509, "grad_norm": 0.20814764499664307, "learning_rate": 6.298055191314972e-06, "loss": 0.0462, "num_input_tokens_seen": 209562784, "step": 97115 }, { "epoch": 15.843393148450245, "grad_norm": 0.2235545516014099, "learning_rate": 6.295693595013424e-06, "loss": 0.0575, "num_input_tokens_seen": 209572896, "step": 97120 }, { "epoch": 15.844208809135399, "grad_norm": 0.03952698037028313, "learning_rate": 6.29333237777836e-06, "loss": 0.0031, "num_input_tokens_seen": 209583616, "step": 97125 }, { "epoch": 15.845024469820554, "grad_norm": 0.9112827181816101, "learning_rate": 6.2909715396576616e-06, "loss": 0.0419, "num_input_tokens_seen": 209593824, "step": 97130 }, { "epoch": 15.84584013050571, "grad_norm": 0.3895535171031952, "learning_rate": 6.288611080699144e-06, "loss": 0.0866, "num_input_tokens_seen": 209605120, "step": 97135 }, { "epoch": 15.846655791190864, "grad_norm": 0.07719000428915024, "learning_rate": 6.286251000950675e-06, "loss": 0.1163, "num_input_tokens_seen": 209617376, "step": 97140 }, { "epoch": 15.84747145187602, "grad_norm": 0.04402337223291397, "learning_rate": 6.283891300460054e-06, "loss": 0.3065, "num_input_tokens_seen": 209627328, "step": 97145 }, { "epoch": 15.848287112561174, "grad_norm": 0.07272496074438095, "learning_rate": 6.281531979275135e-06, "loss": 0.1389, "num_input_tokens_seen": 209638016, "step": 97150 }, { "epoch": 15.84910277324633, "grad_norm": 0.04790802299976349, "learning_rate": 6.2791730374437e-06, "loss": 0.0169, "num_input_tokens_seen": 209649536, "step": 97155 }, { "epoch": 15.849918433931485, "grad_norm": 0.07613583654165268, "learning_rate": 6.276814475013587e-06, "loss": 0.0259, "num_input_tokens_seen": 209660576, "step": 97160 }, { "epoch": 15.850734094616639, "grad_norm": 0.21951566636562347, "learning_rate": 6.2744562920325685e-06, "loss": 0.0115, "num_input_tokens_seen": 209671808, "step": 97165 }, { "epoch": 15.851549755301795, "grad_norm": 2.75046968460083, "learning_rate": 6.272098488548453e-06, "loss": 0.2877, "num_input_tokens_seen": 209682080, "step": 97170 }, { "epoch": 15.852365415986949, "grad_norm": 0.380449116230011, "learning_rate": 6.269741064609022e-06, "loss": 0.011, "num_input_tokens_seen": 209692768, "step": 97175 }, { "epoch": 15.853181076672104, "grad_norm": 0.41444575786590576, "learning_rate": 6.267384020262049e-06, "loss": 0.1148, "num_input_tokens_seen": 209703232, "step": 97180 }, { "epoch": 15.85399673735726, "grad_norm": 0.09464960545301437, "learning_rate": 6.2650273555553e-06, "loss": 0.0128, "num_input_tokens_seen": 209713984, "step": 97185 }, { "epoch": 15.854812398042414, "grad_norm": 0.8461070656776428, "learning_rate": 6.262671070536541e-06, "loss": 0.0278, "num_input_tokens_seen": 209723264, "step": 97190 }, { "epoch": 15.85562805872757, "grad_norm": 1.1069321632385254, "learning_rate": 6.260315165253522e-06, "loss": 0.1494, "num_input_tokens_seen": 209733504, "step": 97195 }, { "epoch": 15.856443719412724, "grad_norm": 4.068737030029297, "learning_rate": 6.2579596397539905e-06, "loss": 0.1752, "num_input_tokens_seen": 209744352, "step": 97200 }, { "epoch": 15.85725938009788, "grad_norm": 0.02090681903064251, "learning_rate": 6.255604494085682e-06, "loss": 0.1857, "num_input_tokens_seen": 209755776, "step": 97205 }, { "epoch": 15.858075040783035, "grad_norm": 2.921119451522827, "learning_rate": 6.253249728296326e-06, "loss": 0.3291, "num_input_tokens_seen": 209766592, "step": 97210 }, { "epoch": 15.858890701468189, "grad_norm": 2.094118118286133, "learning_rate": 6.250895342433646e-06, "loss": 0.2041, "num_input_tokens_seen": 209777760, "step": 97215 }, { "epoch": 15.859706362153345, "grad_norm": 0.015205679461359978, "learning_rate": 6.24854133654536e-06, "loss": 0.0248, "num_input_tokens_seen": 209788736, "step": 97220 }, { "epoch": 15.860522022838499, "grad_norm": 0.46478623151779175, "learning_rate": 6.246187710679169e-06, "loss": 0.0899, "num_input_tokens_seen": 209800480, "step": 97225 }, { "epoch": 15.861337683523654, "grad_norm": 0.08425075560808182, "learning_rate": 6.243834464882778e-06, "loss": 0.0083, "num_input_tokens_seen": 209811872, "step": 97230 }, { "epoch": 15.86215334420881, "grad_norm": 0.12696577608585358, "learning_rate": 6.241481599203872e-06, "loss": 0.0222, "num_input_tokens_seen": 209822848, "step": 97235 }, { "epoch": 15.862969004893964, "grad_norm": 0.03067401982843876, "learning_rate": 6.239129113690131e-06, "loss": 0.0235, "num_input_tokens_seen": 209835232, "step": 97240 }, { "epoch": 15.86378466557912, "grad_norm": 1.2912116050720215, "learning_rate": 6.2367770083892554e-06, "loss": 0.0234, "num_input_tokens_seen": 209845504, "step": 97245 }, { "epoch": 15.864600326264274, "grad_norm": 0.9022355079650879, "learning_rate": 6.23442528334888e-06, "loss": 0.0408, "num_input_tokens_seen": 209855936, "step": 97250 }, { "epoch": 15.86541598694943, "grad_norm": 0.28338122367858887, "learning_rate": 6.232073938616698e-06, "loss": 0.0237, "num_input_tokens_seen": 209866208, "step": 97255 }, { "epoch": 15.866231647634583, "grad_norm": 0.32572484016418457, "learning_rate": 6.2297229742403316e-06, "loss": 0.1217, "num_input_tokens_seen": 209877440, "step": 97260 }, { "epoch": 15.867047308319739, "grad_norm": 1.4461889266967773, "learning_rate": 6.227372390267456e-06, "loss": 0.0627, "num_input_tokens_seen": 209888480, "step": 97265 }, { "epoch": 15.867862969004895, "grad_norm": 0.028626004233956337, "learning_rate": 6.225022186745677e-06, "loss": 0.1403, "num_input_tokens_seen": 209898144, "step": 97270 }, { "epoch": 15.868678629690049, "grad_norm": 0.14881466329097748, "learning_rate": 6.222672363722651e-06, "loss": 0.1427, "num_input_tokens_seen": 209908832, "step": 97275 }, { "epoch": 15.869494290375204, "grad_norm": 0.08293037116527557, "learning_rate": 6.2203229212459885e-06, "loss": 0.2002, "num_input_tokens_seen": 209919712, "step": 97280 }, { "epoch": 15.870309951060358, "grad_norm": 0.6419994831085205, "learning_rate": 6.217973859363305e-06, "loss": 0.0997, "num_input_tokens_seen": 209929792, "step": 97285 }, { "epoch": 15.871125611745514, "grad_norm": 0.07277079671621323, "learning_rate": 6.2156251781222115e-06, "loss": 0.0124, "num_input_tokens_seen": 209941088, "step": 97290 }, { "epoch": 15.87194127243067, "grad_norm": 0.44717201590538025, "learning_rate": 6.213276877570301e-06, "loss": 0.1102, "num_input_tokens_seen": 209952288, "step": 97295 }, { "epoch": 15.872756933115824, "grad_norm": 2.403918504714966, "learning_rate": 6.210928957755166e-06, "loss": 0.0811, "num_input_tokens_seen": 209963584, "step": 97300 }, { "epoch": 15.87357259380098, "grad_norm": 1.987762451171875, "learning_rate": 6.208581418724394e-06, "loss": 0.1212, "num_input_tokens_seen": 209975200, "step": 97305 }, { "epoch": 15.874388254486133, "grad_norm": 0.38078010082244873, "learning_rate": 6.206234260525556e-06, "loss": 0.1027, "num_input_tokens_seen": 209984672, "step": 97310 }, { "epoch": 15.875203915171289, "grad_norm": 2.186192750930786, "learning_rate": 6.2038874832062225e-06, "loss": 0.2845, "num_input_tokens_seen": 209996064, "step": 97315 }, { "epoch": 15.876019575856443, "grad_norm": 1.658766746520996, "learning_rate": 6.2015410868139555e-06, "loss": 0.3939, "num_input_tokens_seen": 210006720, "step": 97320 }, { "epoch": 15.876835236541599, "grad_norm": 0.08002731204032898, "learning_rate": 6.199195071396305e-06, "loss": 0.1383, "num_input_tokens_seen": 210018112, "step": 97325 }, { "epoch": 15.877650897226754, "grad_norm": 2.300516128540039, "learning_rate": 6.196849437000815e-06, "loss": 0.106, "num_input_tokens_seen": 210029728, "step": 97330 }, { "epoch": 15.878466557911908, "grad_norm": 0.22824853658676147, "learning_rate": 6.194504183675026e-06, "loss": 0.0516, "num_input_tokens_seen": 210040832, "step": 97335 }, { "epoch": 15.879282218597064, "grad_norm": 0.26472482085227966, "learning_rate": 6.192159311466467e-06, "loss": 0.0647, "num_input_tokens_seen": 210050848, "step": 97340 }, { "epoch": 15.880097879282218, "grad_norm": 1.0812252759933472, "learning_rate": 6.189814820422657e-06, "loss": 0.0781, "num_input_tokens_seen": 210062080, "step": 97345 }, { "epoch": 15.880913539967374, "grad_norm": 0.11516387015581131, "learning_rate": 6.187470710591114e-06, "loss": 0.3516, "num_input_tokens_seen": 210072608, "step": 97350 }, { "epoch": 15.88172920065253, "grad_norm": 0.9337526559829712, "learning_rate": 6.185126982019343e-06, "loss": 0.0324, "num_input_tokens_seen": 210082592, "step": 97355 }, { "epoch": 15.882544861337683, "grad_norm": 0.0745106115937233, "learning_rate": 6.182783634754841e-06, "loss": 0.0326, "num_input_tokens_seen": 210093568, "step": 97360 }, { "epoch": 15.883360522022839, "grad_norm": 0.6284683346748352, "learning_rate": 6.1804406688451e-06, "loss": 0.0485, "num_input_tokens_seen": 210105760, "step": 97365 }, { "epoch": 15.884176182707993, "grad_norm": 0.04033944383263588, "learning_rate": 6.178098084337602e-06, "loss": 0.122, "num_input_tokens_seen": 210117696, "step": 97370 }, { "epoch": 15.884991843393149, "grad_norm": 0.10743427276611328, "learning_rate": 6.175755881279826e-06, "loss": 0.1496, "num_input_tokens_seen": 210129280, "step": 97375 }, { "epoch": 15.885807504078304, "grad_norm": 0.8130269646644592, "learning_rate": 6.173414059719235e-06, "loss": 0.0514, "num_input_tokens_seen": 210140576, "step": 97380 }, { "epoch": 15.886623164763458, "grad_norm": 1.5041639804840088, "learning_rate": 6.1710726197032855e-06, "loss": 0.3793, "num_input_tokens_seen": 210149856, "step": 97385 }, { "epoch": 15.887438825448614, "grad_norm": 0.5655065178871155, "learning_rate": 6.1687315612794425e-06, "loss": 0.0321, "num_input_tokens_seen": 210158880, "step": 97390 }, { "epoch": 15.888254486133768, "grad_norm": 1.6878536939620972, "learning_rate": 6.166390884495144e-06, "loss": 0.1007, "num_input_tokens_seen": 210169152, "step": 97395 }, { "epoch": 15.889070146818923, "grad_norm": 0.13200503587722778, "learning_rate": 6.164050589397824e-06, "loss": 0.0311, "num_input_tokens_seen": 210180480, "step": 97400 }, { "epoch": 15.88988580750408, "grad_norm": 0.8255677819252014, "learning_rate": 6.1617106760349155e-06, "loss": 0.0202, "num_input_tokens_seen": 210191712, "step": 97405 }, { "epoch": 15.890701468189233, "grad_norm": 0.6000947952270508, "learning_rate": 6.1593711444538405e-06, "loss": 0.0288, "num_input_tokens_seen": 210203680, "step": 97410 }, { "epoch": 15.891517128874389, "grad_norm": 0.355180025100708, "learning_rate": 6.157031994702006e-06, "loss": 0.0206, "num_input_tokens_seen": 210214080, "step": 97415 }, { "epoch": 15.892332789559543, "grad_norm": 0.2113536298274994, "learning_rate": 6.154693226826824e-06, "loss": 0.0573, "num_input_tokens_seen": 210225440, "step": 97420 }, { "epoch": 15.893148450244698, "grad_norm": 0.35020700097084045, "learning_rate": 6.152354840875693e-06, "loss": 0.0177, "num_input_tokens_seen": 210236256, "step": 97425 }, { "epoch": 15.893964110929852, "grad_norm": 0.17176464200019836, "learning_rate": 6.150016836896e-06, "loss": 0.1757, "num_input_tokens_seen": 210247104, "step": 97430 }, { "epoch": 15.894779771615008, "grad_norm": 0.1981736570596695, "learning_rate": 6.147679214935126e-06, "loss": 0.1343, "num_input_tokens_seen": 210258016, "step": 97435 }, { "epoch": 15.895595432300164, "grad_norm": 0.3539956212043762, "learning_rate": 6.14534197504045e-06, "loss": 0.0541, "num_input_tokens_seen": 210267680, "step": 97440 }, { "epoch": 15.896411092985318, "grad_norm": 0.11682064086198807, "learning_rate": 6.1430051172593404e-06, "loss": 0.0902, "num_input_tokens_seen": 210278240, "step": 97445 }, { "epoch": 15.897226753670473, "grad_norm": 0.9596481919288635, "learning_rate": 6.1406686416391495e-06, "loss": 0.1345, "num_input_tokens_seen": 210289152, "step": 97450 }, { "epoch": 15.898042414355627, "grad_norm": 1.445441722869873, "learning_rate": 6.138332548227235e-06, "loss": 0.0422, "num_input_tokens_seen": 210300320, "step": 97455 }, { "epoch": 15.898858075040783, "grad_norm": 1.3727365732192993, "learning_rate": 6.1359968370709395e-06, "loss": 0.2687, "num_input_tokens_seen": 210311968, "step": 97460 }, { "epoch": 15.899673735725939, "grad_norm": 1.6210196018218994, "learning_rate": 6.1336615082176e-06, "loss": 0.1012, "num_input_tokens_seen": 210320960, "step": 97465 }, { "epoch": 15.900489396411093, "grad_norm": 0.21585524082183838, "learning_rate": 6.131326561714543e-06, "loss": 0.0321, "num_input_tokens_seen": 210329696, "step": 97470 }, { "epoch": 15.901305057096248, "grad_norm": 0.5068474411964417, "learning_rate": 6.12899199760909e-06, "loss": 0.1113, "num_input_tokens_seen": 210338720, "step": 97475 }, { "epoch": 15.902120717781402, "grad_norm": 0.08086924999952316, "learning_rate": 6.1266578159485525e-06, "loss": 0.0918, "num_input_tokens_seen": 210349024, "step": 97480 }, { "epoch": 15.902936378466558, "grad_norm": 2.060539722442627, "learning_rate": 6.124324016780237e-06, "loss": 0.2944, "num_input_tokens_seen": 210361664, "step": 97485 }, { "epoch": 15.903752039151712, "grad_norm": 0.04621759057044983, "learning_rate": 6.1219906001514425e-06, "loss": 0.031, "num_input_tokens_seen": 210373184, "step": 97490 }, { "epoch": 15.904567699836868, "grad_norm": 1.9700924158096313, "learning_rate": 6.119657566109449e-06, "loss": 0.1532, "num_input_tokens_seen": 210383872, "step": 97495 }, { "epoch": 15.905383360522023, "grad_norm": 0.3633527159690857, "learning_rate": 6.117324914701558e-06, "loss": 0.0369, "num_input_tokens_seen": 210394592, "step": 97500 }, { "epoch": 15.906199021207177, "grad_norm": 1.0398733615875244, "learning_rate": 6.114992645975021e-06, "loss": 0.2869, "num_input_tokens_seen": 210405664, "step": 97505 }, { "epoch": 15.907014681892333, "grad_norm": 0.043509162962436676, "learning_rate": 6.112660759977129e-06, "loss": 0.0569, "num_input_tokens_seen": 210417632, "step": 97510 }, { "epoch": 15.907830342577487, "grad_norm": 0.016786275431513786, "learning_rate": 6.110329256755112e-06, "loss": 0.0329, "num_input_tokens_seen": 210426080, "step": 97515 }, { "epoch": 15.908646003262643, "grad_norm": 0.014380096457898617, "learning_rate": 6.107998136356249e-06, "loss": 0.0169, "num_input_tokens_seen": 210435232, "step": 97520 }, { "epoch": 15.909461663947798, "grad_norm": 3.1797947883605957, "learning_rate": 6.105667398827758e-06, "loss": 0.1433, "num_input_tokens_seen": 210446464, "step": 97525 }, { "epoch": 15.910277324632952, "grad_norm": 0.8388198018074036, "learning_rate": 6.103337044216892e-06, "loss": 0.0977, "num_input_tokens_seen": 210456928, "step": 97530 }, { "epoch": 15.911092985318108, "grad_norm": 0.053974539041519165, "learning_rate": 6.101007072570875e-06, "loss": 0.085, "num_input_tokens_seen": 210467584, "step": 97535 }, { "epoch": 15.911908646003262, "grad_norm": 1.087342619895935, "learning_rate": 6.098677483936924e-06, "loss": 0.0351, "num_input_tokens_seen": 210477984, "step": 97540 }, { "epoch": 15.912724306688418, "grad_norm": 2.99851131439209, "learning_rate": 6.096348278362249e-06, "loss": 0.1171, "num_input_tokens_seen": 210489472, "step": 97545 }, { "epoch": 15.913539967373573, "grad_norm": 1.6136466264724731, "learning_rate": 6.09401945589406e-06, "loss": 0.0952, "num_input_tokens_seen": 210500672, "step": 97550 }, { "epoch": 15.914355628058727, "grad_norm": 0.0911383107304573, "learning_rate": 6.091691016579551e-06, "loss": 0.1713, "num_input_tokens_seen": 210511968, "step": 97555 }, { "epoch": 15.915171288743883, "grad_norm": 0.19117693603038788, "learning_rate": 6.089362960465908e-06, "loss": 0.199, "num_input_tokens_seen": 210522816, "step": 97560 }, { "epoch": 15.915986949429037, "grad_norm": 0.04610038176178932, "learning_rate": 6.087035287600315e-06, "loss": 0.0235, "num_input_tokens_seen": 210533536, "step": 97565 }, { "epoch": 15.916802610114193, "grad_norm": 0.08274099975824356, "learning_rate": 6.084707998029945e-06, "loss": 0.0543, "num_input_tokens_seen": 210544544, "step": 97570 }, { "epoch": 15.917618270799348, "grad_norm": 1.7877353429794312, "learning_rate": 6.0823810918019646e-06, "loss": 0.12, "num_input_tokens_seen": 210555328, "step": 97575 }, { "epoch": 15.918433931484502, "grad_norm": 0.2697582542896271, "learning_rate": 6.0800545689635304e-06, "loss": 0.0302, "num_input_tokens_seen": 210567072, "step": 97580 }, { "epoch": 15.919249592169658, "grad_norm": 0.2863010764122009, "learning_rate": 6.077728429561791e-06, "loss": 0.0938, "num_input_tokens_seen": 210576384, "step": 97585 }, { "epoch": 15.920065252854812, "grad_norm": 0.0755690261721611, "learning_rate": 6.0754026736438875e-06, "loss": 0.0738, "num_input_tokens_seen": 210587712, "step": 97590 }, { "epoch": 15.920880913539968, "grad_norm": 0.02033342979848385, "learning_rate": 6.073077301256958e-06, "loss": 0.0751, "num_input_tokens_seen": 210598944, "step": 97595 }, { "epoch": 15.921696574225122, "grad_norm": 0.24151748418807983, "learning_rate": 6.070752312448122e-06, "loss": 0.0162, "num_input_tokens_seen": 210610400, "step": 97600 }, { "epoch": 15.922512234910277, "grad_norm": 1.9556043148040771, "learning_rate": 6.068427707264515e-06, "loss": 0.3467, "num_input_tokens_seen": 210621856, "step": 97605 }, { "epoch": 15.923327895595433, "grad_norm": 0.02753504551947117, "learning_rate": 6.066103485753224e-06, "loss": 0.0893, "num_input_tokens_seen": 210632384, "step": 97610 }, { "epoch": 15.924143556280587, "grad_norm": 1.3726171255111694, "learning_rate": 6.063779647961381e-06, "loss": 0.1011, "num_input_tokens_seen": 210642496, "step": 97615 }, { "epoch": 15.924959216965743, "grad_norm": 0.27291297912597656, "learning_rate": 6.0614561939360495e-06, "loss": 0.0373, "num_input_tokens_seen": 210653376, "step": 97620 }, { "epoch": 15.925774877650896, "grad_norm": 0.9512603878974915, "learning_rate": 6.0591331237243505e-06, "loss": 0.1435, "num_input_tokens_seen": 210662720, "step": 97625 }, { "epoch": 15.926590538336052, "grad_norm": 0.06450697779655457, "learning_rate": 6.056810437373328e-06, "loss": 0.1455, "num_input_tokens_seen": 210673952, "step": 97630 }, { "epoch": 15.927406199021208, "grad_norm": 0.05984062701463699, "learning_rate": 6.054488134930092e-06, "loss": 0.1157, "num_input_tokens_seen": 210684928, "step": 97635 }, { "epoch": 15.928221859706362, "grad_norm": 0.039469677954912186, "learning_rate": 6.052166216441671e-06, "loss": 0.0427, "num_input_tokens_seen": 210695424, "step": 97640 }, { "epoch": 15.929037520391518, "grad_norm": 0.8980590105056763, "learning_rate": 6.049844681955155e-06, "loss": 0.0725, "num_input_tokens_seen": 210705888, "step": 97645 }, { "epoch": 15.929853181076671, "grad_norm": 0.027362752705812454, "learning_rate": 6.04752353151756e-06, "loss": 0.0466, "num_input_tokens_seen": 210716512, "step": 97650 }, { "epoch": 15.930668841761827, "grad_norm": 0.17425449192523956, "learning_rate": 6.045202765175958e-06, "loss": 0.1071, "num_input_tokens_seen": 210726304, "step": 97655 }, { "epoch": 15.931484502446983, "grad_norm": 0.3483542799949646, "learning_rate": 6.042882382977355e-06, "loss": 0.0591, "num_input_tokens_seen": 210737632, "step": 97660 }, { "epoch": 15.932300163132137, "grad_norm": 0.02479896880686283, "learning_rate": 6.040562384968803e-06, "loss": 0.1802, "num_input_tokens_seen": 210747168, "step": 97665 }, { "epoch": 15.933115823817293, "grad_norm": 0.020732320845127106, "learning_rate": 6.038242771197292e-06, "loss": 0.0799, "num_input_tokens_seen": 210758240, "step": 97670 }, { "epoch": 15.933931484502446, "grad_norm": 0.16790011525154114, "learning_rate": 6.035923541709853e-06, "loss": 0.019, "num_input_tokens_seen": 210768640, "step": 97675 }, { "epoch": 15.934747145187602, "grad_norm": 0.8559532761573792, "learning_rate": 6.0336046965534795e-06, "loss": 0.1219, "num_input_tokens_seen": 210778880, "step": 97680 }, { "epoch": 15.935562805872756, "grad_norm": 1.9274075031280518, "learning_rate": 6.0312862357751685e-06, "loss": 0.2428, "num_input_tokens_seen": 210788832, "step": 97685 }, { "epoch": 15.936378466557912, "grad_norm": 0.10198651999235153, "learning_rate": 6.0289681594219075e-06, "loss": 0.0112, "num_input_tokens_seen": 210800800, "step": 97690 }, { "epoch": 15.937194127243067, "grad_norm": 0.19860674440860748, "learning_rate": 6.026650467540671e-06, "loss": 0.0339, "num_input_tokens_seen": 210811712, "step": 97695 }, { "epoch": 15.938009787928221, "grad_norm": 0.17612490057945251, "learning_rate": 6.0243331601784335e-06, "loss": 0.0744, "num_input_tokens_seen": 210823008, "step": 97700 }, { "epoch": 15.938825448613377, "grad_norm": 0.9680519104003906, "learning_rate": 6.0220162373821555e-06, "loss": 0.142, "num_input_tokens_seen": 210834624, "step": 97705 }, { "epoch": 15.939641109298531, "grad_norm": 0.5664297938346863, "learning_rate": 6.019699699198794e-06, "loss": 0.193, "num_input_tokens_seen": 210845536, "step": 97710 }, { "epoch": 15.940456769983687, "grad_norm": 0.07755699753761292, "learning_rate": 6.0173835456752954e-06, "loss": 0.0403, "num_input_tokens_seen": 210855136, "step": 97715 }, { "epoch": 15.941272430668842, "grad_norm": 0.11953932046890259, "learning_rate": 6.015067776858602e-06, "loss": 0.0991, "num_input_tokens_seen": 210867104, "step": 97720 }, { "epoch": 15.942088091353996, "grad_norm": 0.039394643157720566, "learning_rate": 6.012752392795637e-06, "loss": 0.1276, "num_input_tokens_seen": 210876640, "step": 97725 }, { "epoch": 15.942903752039152, "grad_norm": 0.17172189056873322, "learning_rate": 6.010437393533344e-06, "loss": 0.0076, "num_input_tokens_seen": 210886304, "step": 97730 }, { "epoch": 15.943719412724306, "grad_norm": 1.602323055267334, "learning_rate": 6.008122779118616e-06, "loss": 0.1684, "num_input_tokens_seen": 210897408, "step": 97735 }, { "epoch": 15.944535073409462, "grad_norm": 0.03356589004397392, "learning_rate": 6.005808549598385e-06, "loss": 0.2265, "num_input_tokens_seen": 210908256, "step": 97740 }, { "epoch": 15.945350734094617, "grad_norm": 0.04549603909254074, "learning_rate": 6.0034947050195274e-06, "loss": 0.0832, "num_input_tokens_seen": 210919360, "step": 97745 }, { "epoch": 15.946166394779771, "grad_norm": 0.30011221766471863, "learning_rate": 6.001181245428961e-06, "loss": 0.1272, "num_input_tokens_seen": 210928864, "step": 97750 }, { "epoch": 15.946982055464927, "grad_norm": 1.0984609127044678, "learning_rate": 5.998868170873542e-06, "loss": 0.0483, "num_input_tokens_seen": 210940160, "step": 97755 }, { "epoch": 15.947797716150081, "grad_norm": 0.2274194061756134, "learning_rate": 5.996555481400182e-06, "loss": 0.2898, "num_input_tokens_seen": 210952192, "step": 97760 }, { "epoch": 15.948613376835237, "grad_norm": 0.25980186462402344, "learning_rate": 5.9942431770557165e-06, "loss": 0.1298, "num_input_tokens_seen": 210962752, "step": 97765 }, { "epoch": 15.949429037520392, "grad_norm": 0.20523661375045776, "learning_rate": 5.99193125788704e-06, "loss": 0.0878, "num_input_tokens_seen": 210973408, "step": 97770 }, { "epoch": 15.950244698205546, "grad_norm": 0.19915246963500977, "learning_rate": 5.989619723940973e-06, "loss": 0.3675, "num_input_tokens_seen": 210984608, "step": 97775 }, { "epoch": 15.951060358890702, "grad_norm": 0.07334663718938828, "learning_rate": 5.987308575264394e-06, "loss": 0.0962, "num_input_tokens_seen": 210994656, "step": 97780 }, { "epoch": 15.951876019575856, "grad_norm": 0.20871946215629578, "learning_rate": 5.984997811904111e-06, "loss": 0.1869, "num_input_tokens_seen": 211006016, "step": 97785 }, { "epoch": 15.952691680261012, "grad_norm": 0.075911745429039, "learning_rate": 5.982687433906986e-06, "loss": 0.0908, "num_input_tokens_seen": 211017472, "step": 97790 }, { "epoch": 15.953507340946166, "grad_norm": 0.07040104269981384, "learning_rate": 5.980377441319809e-06, "loss": 0.0956, "num_input_tokens_seen": 211026784, "step": 97795 }, { "epoch": 15.954323001631321, "grad_norm": 1.6800645589828491, "learning_rate": 5.978067834189427e-06, "loss": 0.2599, "num_input_tokens_seen": 211037504, "step": 97800 }, { "epoch": 15.955138662316477, "grad_norm": 0.05297226831316948, "learning_rate": 5.975758612562615e-06, "loss": 0.2003, "num_input_tokens_seen": 211048544, "step": 97805 }, { "epoch": 15.955954323001631, "grad_norm": 2.4845945835113525, "learning_rate": 5.973449776486198e-06, "loss": 0.26, "num_input_tokens_seen": 211060352, "step": 97810 }, { "epoch": 15.956769983686787, "grad_norm": 0.05184090510010719, "learning_rate": 5.971141326006957e-06, "loss": 0.1274, "num_input_tokens_seen": 211071584, "step": 97815 }, { "epoch": 15.95758564437194, "grad_norm": 2.096705436706543, "learning_rate": 5.968833261171677e-06, "loss": 0.1142, "num_input_tokens_seen": 211082208, "step": 97820 }, { "epoch": 15.958401305057096, "grad_norm": 1.5400172472000122, "learning_rate": 5.966525582027136e-06, "loss": 0.0967, "num_input_tokens_seen": 211092384, "step": 97825 }, { "epoch": 15.959216965742252, "grad_norm": 0.4880678355693817, "learning_rate": 5.964218288620097e-06, "loss": 0.0171, "num_input_tokens_seen": 211104000, "step": 97830 }, { "epoch": 15.960032626427406, "grad_norm": 0.12821753323078156, "learning_rate": 5.9619113809973235e-06, "loss": 0.0989, "num_input_tokens_seen": 211114304, "step": 97835 }, { "epoch": 15.960848287112562, "grad_norm": 0.13201944530010223, "learning_rate": 5.959604859205567e-06, "loss": 0.0983, "num_input_tokens_seen": 211125664, "step": 97840 }, { "epoch": 15.961663947797716, "grad_norm": 1.1102272272109985, "learning_rate": 5.9572987232915736e-06, "loss": 0.193, "num_input_tokens_seen": 211135584, "step": 97845 }, { "epoch": 15.962479608482871, "grad_norm": 0.32581111788749695, "learning_rate": 5.954992973302079e-06, "loss": 0.162, "num_input_tokens_seen": 211146592, "step": 97850 }, { "epoch": 15.963295269168025, "grad_norm": 0.0871662124991417, "learning_rate": 5.952687609283811e-06, "loss": 0.2186, "num_input_tokens_seen": 211156576, "step": 97855 }, { "epoch": 15.964110929853181, "grad_norm": 1.4501874446868896, "learning_rate": 5.9503826312834935e-06, "loss": 0.1921, "num_input_tokens_seen": 211167712, "step": 97860 }, { "epoch": 15.964926590538337, "grad_norm": 2.0079004764556885, "learning_rate": 5.948078039347838e-06, "loss": 0.3546, "num_input_tokens_seen": 211178464, "step": 97865 }, { "epoch": 15.96574225122349, "grad_norm": 0.06060008704662323, "learning_rate": 5.945773833523541e-06, "loss": 0.1944, "num_input_tokens_seen": 211189568, "step": 97870 }, { "epoch": 15.966557911908646, "grad_norm": 0.05501696467399597, "learning_rate": 5.943470013857325e-06, "loss": 0.0192, "num_input_tokens_seen": 211199360, "step": 97875 }, { "epoch": 15.9673735725938, "grad_norm": 0.16891130805015564, "learning_rate": 5.9411665803958484e-06, "loss": 0.1112, "num_input_tokens_seen": 211209760, "step": 97880 }, { "epoch": 15.968189233278956, "grad_norm": 3.7714955806732178, "learning_rate": 5.9388635331858235e-06, "loss": 0.141, "num_input_tokens_seen": 211219904, "step": 97885 }, { "epoch": 15.969004893964112, "grad_norm": 0.6985742449760437, "learning_rate": 5.936560872273897e-06, "loss": 0.0223, "num_input_tokens_seen": 211230400, "step": 97890 }, { "epoch": 15.969820554649266, "grad_norm": 0.07261386513710022, "learning_rate": 5.934258597706762e-06, "loss": 0.2037, "num_input_tokens_seen": 211241440, "step": 97895 }, { "epoch": 15.970636215334421, "grad_norm": 0.36932477355003357, "learning_rate": 5.931956709531048e-06, "loss": 0.1175, "num_input_tokens_seen": 211253600, "step": 97900 }, { "epoch": 15.971451876019575, "grad_norm": 0.07301934063434601, "learning_rate": 5.929655207793436e-06, "loss": 0.1053, "num_input_tokens_seen": 211265280, "step": 97905 }, { "epoch": 15.97226753670473, "grad_norm": 1.646138072013855, "learning_rate": 5.927354092540538e-06, "loss": 0.1949, "num_input_tokens_seen": 211275520, "step": 97910 }, { "epoch": 15.973083197389887, "grad_norm": 0.043841682374477386, "learning_rate": 5.925053363819014e-06, "loss": 0.061, "num_input_tokens_seen": 211286496, "step": 97915 }, { "epoch": 15.97389885807504, "grad_norm": 0.6225994825363159, "learning_rate": 5.922753021675478e-06, "loss": 0.0585, "num_input_tokens_seen": 211295808, "step": 97920 }, { "epoch": 15.974714518760196, "grad_norm": 0.2951430380344391, "learning_rate": 5.920453066156559e-06, "loss": 0.0624, "num_input_tokens_seen": 211307168, "step": 97925 }, { "epoch": 15.97553017944535, "grad_norm": 0.06414981186389923, "learning_rate": 5.918153497308859e-06, "loss": 0.1352, "num_input_tokens_seen": 211317120, "step": 97930 }, { "epoch": 15.976345840130506, "grad_norm": 0.40500110387802124, "learning_rate": 5.915854315178984e-06, "loss": 0.0989, "num_input_tokens_seen": 211327328, "step": 97935 }, { "epoch": 15.977161500815662, "grad_norm": 1.021669626235962, "learning_rate": 5.913555519813535e-06, "loss": 0.1902, "num_input_tokens_seen": 211336608, "step": 97940 }, { "epoch": 15.977977161500815, "grad_norm": 0.15575279295444489, "learning_rate": 5.911257111259094e-06, "loss": 0.0283, "num_input_tokens_seen": 211348448, "step": 97945 }, { "epoch": 15.978792822185971, "grad_norm": 1.019717812538147, "learning_rate": 5.908959089562243e-06, "loss": 0.0792, "num_input_tokens_seen": 211359104, "step": 97950 }, { "epoch": 15.979608482871125, "grad_norm": 2.3132383823394775, "learning_rate": 5.906661454769557e-06, "loss": 0.1909, "num_input_tokens_seen": 211369760, "step": 97955 }, { "epoch": 15.98042414355628, "grad_norm": 3.5709757804870605, "learning_rate": 5.904364206927596e-06, "loss": 0.3431, "num_input_tokens_seen": 211379584, "step": 97960 }, { "epoch": 15.981239804241435, "grad_norm": 0.11202750355005264, "learning_rate": 5.902067346082921e-06, "loss": 0.0672, "num_input_tokens_seen": 211390400, "step": 97965 }, { "epoch": 15.98205546492659, "grad_norm": 0.1225212812423706, "learning_rate": 5.899770872282076e-06, "loss": 0.0556, "num_input_tokens_seen": 211400000, "step": 97970 }, { "epoch": 15.982871125611746, "grad_norm": 1.000683069229126, "learning_rate": 5.897474785571608e-06, "loss": 0.0625, "num_input_tokens_seen": 211412288, "step": 97975 }, { "epoch": 15.9836867862969, "grad_norm": 0.13050878047943115, "learning_rate": 5.895179085998043e-06, "loss": 0.0875, "num_input_tokens_seen": 211423744, "step": 97980 }, { "epoch": 15.984502446982056, "grad_norm": 0.2231704294681549, "learning_rate": 5.892883773607913e-06, "loss": 0.2401, "num_input_tokens_seen": 211435168, "step": 97985 }, { "epoch": 15.98531810766721, "grad_norm": 4.502804756164551, "learning_rate": 5.890588848447731e-06, "loss": 0.3595, "num_input_tokens_seen": 211446016, "step": 97990 }, { "epoch": 15.986133768352365, "grad_norm": 0.423526793718338, "learning_rate": 5.888294310564008e-06, "loss": 0.1051, "num_input_tokens_seen": 211456512, "step": 97995 }, { "epoch": 15.986949429037521, "grad_norm": 1.6627463102340698, "learning_rate": 5.8860001600032465e-06, "loss": 0.0523, "num_input_tokens_seen": 211467808, "step": 98000 }, { "epoch": 15.987765089722675, "grad_norm": 0.13773740828037262, "learning_rate": 5.883706396811939e-06, "loss": 0.0364, "num_input_tokens_seen": 211478624, "step": 98005 }, { "epoch": 15.98858075040783, "grad_norm": 0.47543269395828247, "learning_rate": 5.881413021036572e-06, "loss": 0.0154, "num_input_tokens_seen": 211488960, "step": 98010 }, { "epoch": 15.989396411092985, "grad_norm": 0.09512569010257721, "learning_rate": 5.879120032723618e-06, "loss": 0.0472, "num_input_tokens_seen": 211499360, "step": 98015 }, { "epoch": 15.99021207177814, "grad_norm": 0.29719775915145874, "learning_rate": 5.876827431919557e-06, "loss": 0.1113, "num_input_tokens_seen": 211510368, "step": 98020 }, { "epoch": 15.991027732463294, "grad_norm": 0.1290861815214157, "learning_rate": 5.874535218670852e-06, "loss": 0.1835, "num_input_tokens_seen": 211522208, "step": 98025 }, { "epoch": 15.99184339314845, "grad_norm": 1.0830755233764648, "learning_rate": 5.872243393023949e-06, "loss": 0.1523, "num_input_tokens_seen": 211532800, "step": 98030 }, { "epoch": 15.992659053833606, "grad_norm": 0.16132228076457977, "learning_rate": 5.869951955025302e-06, "loss": 0.088, "num_input_tokens_seen": 211544000, "step": 98035 }, { "epoch": 15.99347471451876, "grad_norm": 0.1291530281305313, "learning_rate": 5.8676609047213445e-06, "loss": 0.0449, "num_input_tokens_seen": 211555008, "step": 98040 }, { "epoch": 15.994290375203915, "grad_norm": 2.2180464267730713, "learning_rate": 5.86537024215851e-06, "loss": 0.1621, "num_input_tokens_seen": 211565152, "step": 98045 }, { "epoch": 15.99510603588907, "grad_norm": 0.42731836438179016, "learning_rate": 5.863079967383223e-06, "loss": 0.0164, "num_input_tokens_seen": 211576064, "step": 98050 }, { "epoch": 15.995921696574225, "grad_norm": 0.03256814181804657, "learning_rate": 5.8607900804418965e-06, "loss": 0.0211, "num_input_tokens_seen": 211588000, "step": 98055 }, { "epoch": 15.99673735725938, "grad_norm": 1.8890413045883179, "learning_rate": 5.8585005813809406e-06, "loss": 0.076, "num_input_tokens_seen": 211599008, "step": 98060 }, { "epoch": 15.997553017944535, "grad_norm": 0.054498303681612015, "learning_rate": 5.85621147024675e-06, "loss": 0.1638, "num_input_tokens_seen": 211609120, "step": 98065 }, { "epoch": 15.99836867862969, "grad_norm": 2.0145459175109863, "learning_rate": 5.853922747085722e-06, "loss": 0.241, "num_input_tokens_seen": 211620608, "step": 98070 }, { "epoch": 15.999184339314844, "grad_norm": 1.671681523323059, "learning_rate": 5.851634411944237e-06, "loss": 0.1199, "num_input_tokens_seen": 211631296, "step": 98075 }, { "epoch": 16.0, "grad_norm": 4.665553092956543, "learning_rate": 5.849346464868674e-06, "loss": 0.1168, "num_input_tokens_seen": 211640976, "step": 98080 }, { "epoch": 16.0, "eval_loss": 0.14113585650920868, "eval_runtime": 91.1058, "eval_samples_per_second": 29.91, "eval_steps_per_second": 7.486, "num_input_tokens_seen": 211640976, "step": 98080 }, { "epoch": 16.000815660685156, "grad_norm": 0.0773867815732956, "learning_rate": 5.8470589059053975e-06, "loss": 0.1693, "num_input_tokens_seen": 211651504, "step": 98085 }, { "epoch": 16.00163132137031, "grad_norm": 0.05135141685605049, "learning_rate": 5.844771735100768e-06, "loss": 0.0384, "num_input_tokens_seen": 211661616, "step": 98090 }, { "epoch": 16.002446982055464, "grad_norm": 0.37798503041267395, "learning_rate": 5.842484952501143e-06, "loss": 0.1515, "num_input_tokens_seen": 211671632, "step": 98095 }, { "epoch": 16.00326264274062, "grad_norm": 0.034022506326436996, "learning_rate": 5.8401985581528594e-06, "loss": 0.0774, "num_input_tokens_seen": 211682800, "step": 98100 }, { "epoch": 16.004078303425775, "grad_norm": 0.1629764884710312, "learning_rate": 5.837912552102262e-06, "loss": 0.0168, "num_input_tokens_seen": 211693296, "step": 98105 }, { "epoch": 16.00489396411093, "grad_norm": 0.03525311127305031, "learning_rate": 5.835626934395671e-06, "loss": 0.2075, "num_input_tokens_seen": 211705680, "step": 98110 }, { "epoch": 16.005709624796086, "grad_norm": 0.04033324867486954, "learning_rate": 5.833341705079415e-06, "loss": 0.1264, "num_input_tokens_seen": 211717104, "step": 98115 }, { "epoch": 16.00652528548124, "grad_norm": 0.979023277759552, "learning_rate": 5.831056864199805e-06, "loss": 0.2945, "num_input_tokens_seen": 211728432, "step": 98120 }, { "epoch": 16.007340946166394, "grad_norm": 1.4643489122390747, "learning_rate": 5.828772411803143e-06, "loss": 0.1386, "num_input_tokens_seen": 211740208, "step": 98125 }, { "epoch": 16.00815660685155, "grad_norm": 0.18819259107112885, "learning_rate": 5.826488347935729e-06, "loss": 0.0344, "num_input_tokens_seen": 211751312, "step": 98130 }, { "epoch": 16.008972267536706, "grad_norm": 0.30825451016426086, "learning_rate": 5.824204672643846e-06, "loss": 0.0816, "num_input_tokens_seen": 211761392, "step": 98135 }, { "epoch": 16.00978792822186, "grad_norm": 0.34793102741241455, "learning_rate": 5.821921385973794e-06, "loss": 0.1918, "num_input_tokens_seen": 211772624, "step": 98140 }, { "epoch": 16.010603588907014, "grad_norm": 1.1160814762115479, "learning_rate": 5.819638487971821e-06, "loss": 0.0317, "num_input_tokens_seen": 211783536, "step": 98145 }, { "epoch": 16.01141924959217, "grad_norm": 1.5953714847564697, "learning_rate": 5.817355978684219e-06, "loss": 0.072, "num_input_tokens_seen": 211793424, "step": 98150 }, { "epoch": 16.012234910277325, "grad_norm": 0.8603345155715942, "learning_rate": 5.8150738581572214e-06, "loss": 0.0457, "num_input_tokens_seen": 211804208, "step": 98155 }, { "epoch": 16.01305057096248, "grad_norm": 1.218016266822815, "learning_rate": 5.8127921264370945e-06, "loss": 0.044, "num_input_tokens_seen": 211814992, "step": 98160 }, { "epoch": 16.013866231647636, "grad_norm": 0.05100460350513458, "learning_rate": 5.810510783570078e-06, "loss": 0.2057, "num_input_tokens_seen": 211825488, "step": 98165 }, { "epoch": 16.01468189233279, "grad_norm": 0.4290968179702759, "learning_rate": 5.808229829602402e-06, "loss": 0.0597, "num_input_tokens_seen": 211837264, "step": 98170 }, { "epoch": 16.015497553017944, "grad_norm": 0.45397332310676575, "learning_rate": 5.805949264580296e-06, "loss": 0.0187, "num_input_tokens_seen": 211848080, "step": 98175 }, { "epoch": 16.0163132137031, "grad_norm": 0.8262665867805481, "learning_rate": 5.8036690885499765e-06, "loss": 0.1274, "num_input_tokens_seen": 211858992, "step": 98180 }, { "epoch": 16.017128874388256, "grad_norm": 1.3573384284973145, "learning_rate": 5.801389301557655e-06, "loss": 0.374, "num_input_tokens_seen": 211869552, "step": 98185 }, { "epoch": 16.017944535073408, "grad_norm": 0.14448383450508118, "learning_rate": 5.799109903649538e-06, "loss": 0.1541, "num_input_tokens_seen": 211880048, "step": 98190 }, { "epoch": 16.018760195758563, "grad_norm": 1.5370908975601196, "learning_rate": 5.7968308948718135e-06, "loss": 0.1333, "num_input_tokens_seen": 211891696, "step": 98195 }, { "epoch": 16.01957585644372, "grad_norm": 0.42971619963645935, "learning_rate": 5.794552275270673e-06, "loss": 0.1075, "num_input_tokens_seen": 211901392, "step": 98200 }, { "epoch": 16.020391517128875, "grad_norm": 0.05868193879723549, "learning_rate": 5.792274044892293e-06, "loss": 0.089, "num_input_tokens_seen": 211912240, "step": 98205 }, { "epoch": 16.02120717781403, "grad_norm": 1.215986728668213, "learning_rate": 5.789996203782849e-06, "loss": 0.072, "num_input_tokens_seen": 211923888, "step": 98210 }, { "epoch": 16.022022838499183, "grad_norm": 0.06695995479822159, "learning_rate": 5.7877187519885e-06, "loss": 0.0855, "num_input_tokens_seen": 211935056, "step": 98215 }, { "epoch": 16.02283849918434, "grad_norm": 0.8873407244682312, "learning_rate": 5.785441689555402e-06, "loss": 0.251, "num_input_tokens_seen": 211946512, "step": 98220 }, { "epoch": 16.023654159869494, "grad_norm": 0.2874654531478882, "learning_rate": 5.783165016529704e-06, "loss": 0.052, "num_input_tokens_seen": 211958640, "step": 98225 }, { "epoch": 16.02446982055465, "grad_norm": 0.0710519552230835, "learning_rate": 5.7808887329575455e-06, "loss": 0.1648, "num_input_tokens_seen": 211967632, "step": 98230 }, { "epoch": 16.025285481239806, "grad_norm": 0.42087045311927795, "learning_rate": 5.778612838885058e-06, "loss": 0.1333, "num_input_tokens_seen": 211978928, "step": 98235 }, { "epoch": 16.026101141924958, "grad_norm": 0.03352763131260872, "learning_rate": 5.776337334358361e-06, "loss": 0.3533, "num_input_tokens_seen": 211989232, "step": 98240 }, { "epoch": 16.026916802610113, "grad_norm": 0.08127294480800629, "learning_rate": 5.774062219423587e-06, "loss": 0.0951, "num_input_tokens_seen": 211998576, "step": 98245 }, { "epoch": 16.02773246329527, "grad_norm": 0.08226804435253143, "learning_rate": 5.771787494126815e-06, "loss": 0.0778, "num_input_tokens_seen": 212009552, "step": 98250 }, { "epoch": 16.028548123980425, "grad_norm": 0.1927916407585144, "learning_rate": 5.769513158514181e-06, "loss": 0.1204, "num_input_tokens_seen": 212019472, "step": 98255 }, { "epoch": 16.02936378466558, "grad_norm": 0.9551305174827576, "learning_rate": 5.767239212631742e-06, "loss": 0.114, "num_input_tokens_seen": 212030000, "step": 98260 }, { "epoch": 16.030179445350733, "grad_norm": 1.5766087770462036, "learning_rate": 5.764965656525612e-06, "loss": 0.1052, "num_input_tokens_seen": 212041072, "step": 98265 }, { "epoch": 16.03099510603589, "grad_norm": 0.1135251596570015, "learning_rate": 5.762692490241842e-06, "loss": 0.0116, "num_input_tokens_seen": 212052144, "step": 98270 }, { "epoch": 16.031810766721044, "grad_norm": 0.049009956419467926, "learning_rate": 5.760419713826526e-06, "loss": 0.0761, "num_input_tokens_seen": 212063664, "step": 98275 }, { "epoch": 16.0326264274062, "grad_norm": 0.05565542355179787, "learning_rate": 5.758147327325697e-06, "loss": 0.3369, "num_input_tokens_seen": 212073968, "step": 98280 }, { "epoch": 16.033442088091356, "grad_norm": 0.24189697206020355, "learning_rate": 5.755875330785437e-06, "loss": 0.0887, "num_input_tokens_seen": 212083376, "step": 98285 }, { "epoch": 16.034257748776508, "grad_norm": 1.8275691270828247, "learning_rate": 5.753603724251763e-06, "loss": 0.0975, "num_input_tokens_seen": 212094000, "step": 98290 }, { "epoch": 16.035073409461663, "grad_norm": 0.024644741788506508, "learning_rate": 5.7513325077707404e-06, "loss": 0.1961, "num_input_tokens_seen": 212104720, "step": 98295 }, { "epoch": 16.03588907014682, "grad_norm": 2.0163891315460205, "learning_rate": 5.749061681388365e-06, "loss": 0.1438, "num_input_tokens_seen": 212115504, "step": 98300 }, { "epoch": 16.036704730831975, "grad_norm": 0.07103001326322556, "learning_rate": 5.746791245150687e-06, "loss": 0.0068, "num_input_tokens_seen": 212125776, "step": 98305 }, { "epoch": 16.03752039151713, "grad_norm": 0.16232319176197052, "learning_rate": 5.744521199103708e-06, "loss": 0.041, "num_input_tokens_seen": 212136752, "step": 98310 }, { "epoch": 16.038336052202283, "grad_norm": 2.0402143001556396, "learning_rate": 5.7422515432934335e-06, "loss": 0.1512, "num_input_tokens_seen": 212146480, "step": 98315 }, { "epoch": 16.03915171288744, "grad_norm": 1.4587270021438599, "learning_rate": 5.739982277765863e-06, "loss": 0.1447, "num_input_tokens_seen": 212157232, "step": 98320 }, { "epoch": 16.039967373572594, "grad_norm": 0.15369930863380432, "learning_rate": 5.737713402566983e-06, "loss": 0.0495, "num_input_tokens_seen": 212168272, "step": 98325 }, { "epoch": 16.04078303425775, "grad_norm": 1.8353466987609863, "learning_rate": 5.735444917742777e-06, "loss": 0.1124, "num_input_tokens_seen": 212178128, "step": 98330 }, { "epoch": 16.041598694942905, "grad_norm": 0.30407074093818665, "learning_rate": 5.7331768233392205e-06, "loss": 0.0416, "num_input_tokens_seen": 212189264, "step": 98335 }, { "epoch": 16.042414355628058, "grad_norm": 1.3963325023651123, "learning_rate": 5.730909119402275e-06, "loss": 0.0709, "num_input_tokens_seen": 212199632, "step": 98340 }, { "epoch": 16.043230016313213, "grad_norm": 0.08656591176986694, "learning_rate": 5.7286418059779025e-06, "loss": 0.0593, "num_input_tokens_seen": 212209232, "step": 98345 }, { "epoch": 16.04404567699837, "grad_norm": 0.35004252195358276, "learning_rate": 5.726374883112051e-06, "loss": 0.0201, "num_input_tokens_seen": 212219472, "step": 98350 }, { "epoch": 16.044861337683525, "grad_norm": 0.803512692451477, "learning_rate": 5.724108350850654e-06, "loss": 0.0264, "num_input_tokens_seen": 212230096, "step": 98355 }, { "epoch": 16.045676998368677, "grad_norm": 1.6476422548294067, "learning_rate": 5.721842209239672e-06, "loss": 0.2297, "num_input_tokens_seen": 212240752, "step": 98360 }, { "epoch": 16.046492659053833, "grad_norm": 0.16549080610275269, "learning_rate": 5.719576458324996e-06, "loss": 0.0394, "num_input_tokens_seen": 212251952, "step": 98365 }, { "epoch": 16.04730831973899, "grad_norm": 0.0755259320139885, "learning_rate": 5.717311098152578e-06, "loss": 0.0203, "num_input_tokens_seen": 212263248, "step": 98370 }, { "epoch": 16.048123980424144, "grad_norm": 1.7269680500030518, "learning_rate": 5.715046128768298e-06, "loss": 0.1988, "num_input_tokens_seen": 212272208, "step": 98375 }, { "epoch": 16.0489396411093, "grad_norm": 1.4975489377975464, "learning_rate": 5.712781550218088e-06, "loss": 0.1914, "num_input_tokens_seen": 212283984, "step": 98380 }, { "epoch": 16.049755301794452, "grad_norm": 0.11359717696905136, "learning_rate": 5.7105173625478125e-06, "loss": 0.0728, "num_input_tokens_seen": 212295504, "step": 98385 }, { "epoch": 16.050570962479608, "grad_norm": 0.017864318564534187, "learning_rate": 5.708253565803387e-06, "loss": 0.0576, "num_input_tokens_seen": 212307280, "step": 98390 }, { "epoch": 16.051386623164763, "grad_norm": 0.08957549929618835, "learning_rate": 5.705990160030661e-06, "loss": 0.0093, "num_input_tokens_seen": 212317552, "step": 98395 }, { "epoch": 16.05220228384992, "grad_norm": 0.07787090539932251, "learning_rate": 5.703727145275536e-06, "loss": 0.0226, "num_input_tokens_seen": 212328016, "step": 98400 }, { "epoch": 16.053017944535075, "grad_norm": 0.08228199928998947, "learning_rate": 5.701464521583846e-06, "loss": 0.0105, "num_input_tokens_seen": 212338416, "step": 98405 }, { "epoch": 16.053833605220227, "grad_norm": 0.061369433999061584, "learning_rate": 5.6992022890014736e-06, "loss": 0.0711, "num_input_tokens_seen": 212349168, "step": 98410 }, { "epoch": 16.054649265905383, "grad_norm": 1.2326613664627075, "learning_rate": 5.696940447574237e-06, "loss": 0.1209, "num_input_tokens_seen": 212359728, "step": 98415 }, { "epoch": 16.05546492659054, "grad_norm": 0.06629981100559235, "learning_rate": 5.694678997348002e-06, "loss": 0.0421, "num_input_tokens_seen": 212369712, "step": 98420 }, { "epoch": 16.056280587275694, "grad_norm": 1.8855825662612915, "learning_rate": 5.6924179383685775e-06, "loss": 0.1735, "num_input_tokens_seen": 212381264, "step": 98425 }, { "epoch": 16.05709624796085, "grad_norm": 2.3796942234039307, "learning_rate": 5.690157270681806e-06, "loss": 0.1172, "num_input_tokens_seen": 212391280, "step": 98430 }, { "epoch": 16.057911908646002, "grad_norm": 0.023327557370066643, "learning_rate": 5.687896994333483e-06, "loss": 0.035, "num_input_tokens_seen": 212402704, "step": 98435 }, { "epoch": 16.058727569331158, "grad_norm": 0.25033706426620483, "learning_rate": 5.68563710936944e-06, "loss": 0.0174, "num_input_tokens_seen": 212414640, "step": 98440 }, { "epoch": 16.059543230016313, "grad_norm": 0.9658394455909729, "learning_rate": 5.68337761583545e-06, "loss": 0.0417, "num_input_tokens_seen": 212425488, "step": 98445 }, { "epoch": 16.06035889070147, "grad_norm": 0.09342655539512634, "learning_rate": 5.681118513777322e-06, "loss": 0.0334, "num_input_tokens_seen": 212436080, "step": 98450 }, { "epoch": 16.061174551386625, "grad_norm": 0.05798693373799324, "learning_rate": 5.678859803240838e-06, "loss": 0.0121, "num_input_tokens_seen": 212447664, "step": 98455 }, { "epoch": 16.061990212071777, "grad_norm": 0.13944198191165924, "learning_rate": 5.676601484271768e-06, "loss": 0.1056, "num_input_tokens_seen": 212457840, "step": 98460 }, { "epoch": 16.062805872756933, "grad_norm": 0.19871045649051666, "learning_rate": 5.674343556915884e-06, "loss": 0.0603, "num_input_tokens_seen": 212466640, "step": 98465 }, { "epoch": 16.063621533442088, "grad_norm": 0.027146119624376297, "learning_rate": 5.672086021218945e-06, "loss": 0.1718, "num_input_tokens_seen": 212476016, "step": 98470 }, { "epoch": 16.064437194127244, "grad_norm": 1.856387734413147, "learning_rate": 5.669828877226699e-06, "loss": 0.1938, "num_input_tokens_seen": 212486832, "step": 98475 }, { "epoch": 16.0652528548124, "grad_norm": 1.598600149154663, "learning_rate": 5.667572124984896e-06, "loss": 0.1265, "num_input_tokens_seen": 212497968, "step": 98480 }, { "epoch": 16.06606851549755, "grad_norm": 0.5640532374382019, "learning_rate": 5.665315764539267e-06, "loss": 0.0515, "num_input_tokens_seen": 212508592, "step": 98485 }, { "epoch": 16.066884176182707, "grad_norm": 0.05116024985909462, "learning_rate": 5.663059795935544e-06, "loss": 0.0383, "num_input_tokens_seen": 212519152, "step": 98490 }, { "epoch": 16.067699836867863, "grad_norm": 0.17608913779258728, "learning_rate": 5.660804219219443e-06, "loss": 0.0541, "num_input_tokens_seen": 212529296, "step": 98495 }, { "epoch": 16.06851549755302, "grad_norm": 1.8919223546981812, "learning_rate": 5.658549034436669e-06, "loss": 0.1671, "num_input_tokens_seen": 212540144, "step": 98500 }, { "epoch": 16.069331158238175, "grad_norm": 1.3859888315200806, "learning_rate": 5.656294241632951e-06, "loss": 0.146, "num_input_tokens_seen": 212550896, "step": 98505 }, { "epoch": 16.070146818923327, "grad_norm": 0.07215534150600433, "learning_rate": 5.654039840853956e-06, "loss": 0.2511, "num_input_tokens_seen": 212561168, "step": 98510 }, { "epoch": 16.070962479608482, "grad_norm": 3.1911301612854004, "learning_rate": 5.651785832145401e-06, "loss": 0.2122, "num_input_tokens_seen": 212572592, "step": 98515 }, { "epoch": 16.071778140293638, "grad_norm": 0.9870429635047913, "learning_rate": 5.649532215552936e-06, "loss": 0.0437, "num_input_tokens_seen": 212583408, "step": 98520 }, { "epoch": 16.072593800978794, "grad_norm": 2.885213613510132, "learning_rate": 5.64727899112226e-06, "loss": 0.2884, "num_input_tokens_seen": 212593968, "step": 98525 }, { "epoch": 16.07340946166395, "grad_norm": 0.30600085854530334, "learning_rate": 5.645026158899014e-06, "loss": 0.0154, "num_input_tokens_seen": 212603312, "step": 98530 }, { "epoch": 16.0742251223491, "grad_norm": 1.379952311515808, "learning_rate": 5.642773718928882e-06, "loss": 0.0811, "num_input_tokens_seen": 212614672, "step": 98535 }, { "epoch": 16.075040783034257, "grad_norm": 0.12483180314302444, "learning_rate": 5.6405216712574785e-06, "loss": 0.0222, "num_input_tokens_seen": 212625456, "step": 98540 }, { "epoch": 16.075856443719413, "grad_norm": 0.1345081925392151, "learning_rate": 5.63827001593048e-06, "loss": 0.0974, "num_input_tokens_seen": 212634928, "step": 98545 }, { "epoch": 16.07667210440457, "grad_norm": 0.3358532190322876, "learning_rate": 5.6360187529934885e-06, "loss": 0.0884, "num_input_tokens_seen": 212644048, "step": 98550 }, { "epoch": 16.07748776508972, "grad_norm": 0.06358176469802856, "learning_rate": 5.633767882492147e-06, "loss": 0.0247, "num_input_tokens_seen": 212655696, "step": 98555 }, { "epoch": 16.078303425774877, "grad_norm": 1.2775866985321045, "learning_rate": 5.631517404472067e-06, "loss": 0.0668, "num_input_tokens_seen": 212666544, "step": 98560 }, { "epoch": 16.079119086460032, "grad_norm": 0.1376284509897232, "learning_rate": 5.629267318978859e-06, "loss": 0.0364, "num_input_tokens_seen": 212677712, "step": 98565 }, { "epoch": 16.079934747145188, "grad_norm": 0.028649913147091866, "learning_rate": 5.627017626058121e-06, "loss": 0.1466, "num_input_tokens_seen": 212687664, "step": 98570 }, { "epoch": 16.080750407830344, "grad_norm": 1.2239816188812256, "learning_rate": 5.624768325755448e-06, "loss": 0.0753, "num_input_tokens_seen": 212697776, "step": 98575 }, { "epoch": 16.081566068515496, "grad_norm": 0.2194957733154297, "learning_rate": 5.622519418116423e-06, "loss": 0.2205, "num_input_tokens_seen": 212709840, "step": 98580 }, { "epoch": 16.08238172920065, "grad_norm": 0.8046718239784241, "learning_rate": 5.620270903186625e-06, "loss": 0.0811, "num_input_tokens_seen": 212721424, "step": 98585 }, { "epoch": 16.083197389885807, "grad_norm": 2.7579033374786377, "learning_rate": 5.618022781011623e-06, "loss": 0.1394, "num_input_tokens_seen": 212733168, "step": 98590 }, { "epoch": 16.084013050570963, "grad_norm": 1.1651725769042969, "learning_rate": 5.615775051636976e-06, "loss": 0.0826, "num_input_tokens_seen": 212744016, "step": 98595 }, { "epoch": 16.08482871125612, "grad_norm": 0.5041465163230896, "learning_rate": 5.6135277151082385e-06, "loss": 0.0666, "num_input_tokens_seen": 212754128, "step": 98600 }, { "epoch": 16.08564437194127, "grad_norm": 0.059083759784698486, "learning_rate": 5.6112807714709535e-06, "loss": 0.1491, "num_input_tokens_seen": 212765040, "step": 98605 }, { "epoch": 16.086460032626427, "grad_norm": 0.13260020315647125, "learning_rate": 5.609034220770662e-06, "loss": 0.1314, "num_input_tokens_seen": 212774416, "step": 98610 }, { "epoch": 16.087275693311582, "grad_norm": 0.3846474587917328, "learning_rate": 5.606788063052889e-06, "loss": 0.158, "num_input_tokens_seen": 212784496, "step": 98615 }, { "epoch": 16.088091353996738, "grad_norm": 0.3354516923427582, "learning_rate": 5.6045422983631605e-06, "loss": 0.0926, "num_input_tokens_seen": 212795728, "step": 98620 }, { "epoch": 16.088907014681894, "grad_norm": 0.0855412483215332, "learning_rate": 5.602296926746989e-06, "loss": 0.0797, "num_input_tokens_seen": 212806128, "step": 98625 }, { "epoch": 16.089722675367046, "grad_norm": 0.3386366367340088, "learning_rate": 5.600051948249874e-06, "loss": 0.0569, "num_input_tokens_seen": 212816976, "step": 98630 }, { "epoch": 16.0905383360522, "grad_norm": 0.4775177240371704, "learning_rate": 5.5978073629173174e-06, "loss": 0.1439, "num_input_tokens_seen": 212827984, "step": 98635 }, { "epoch": 16.091353996737357, "grad_norm": 2.223965883255005, "learning_rate": 5.595563170794809e-06, "loss": 0.1235, "num_input_tokens_seen": 212838192, "step": 98640 }, { "epoch": 16.092169657422513, "grad_norm": 2.143097400665283, "learning_rate": 5.593319371927822e-06, "loss": 0.0613, "num_input_tokens_seen": 212847920, "step": 98645 }, { "epoch": 16.09298531810767, "grad_norm": 1.6720389127731323, "learning_rate": 5.5910759663618525e-06, "loss": 0.0659, "num_input_tokens_seen": 212858480, "step": 98650 }, { "epoch": 16.09380097879282, "grad_norm": 0.0687410831451416, "learning_rate": 5.588832954142334e-06, "loss": 0.0401, "num_input_tokens_seen": 212869360, "step": 98655 }, { "epoch": 16.094616639477977, "grad_norm": 0.5412418842315674, "learning_rate": 5.586590335314748e-06, "loss": 0.125, "num_input_tokens_seen": 212880816, "step": 98660 }, { "epoch": 16.095432300163132, "grad_norm": 0.13561123609542847, "learning_rate": 5.584348109924539e-06, "loss": 0.0139, "num_input_tokens_seen": 212891152, "step": 98665 }, { "epoch": 16.096247960848288, "grad_norm": 0.1979801505804062, "learning_rate": 5.5821062780171435e-06, "loss": 0.0786, "num_input_tokens_seen": 212900560, "step": 98670 }, { "epoch": 16.097063621533444, "grad_norm": 1.1007721424102783, "learning_rate": 5.579864839637999e-06, "loss": 0.1126, "num_input_tokens_seen": 212911472, "step": 98675 }, { "epoch": 16.097879282218596, "grad_norm": 0.5720046162605286, "learning_rate": 5.577623794832529e-06, "loss": 0.2347, "num_input_tokens_seen": 212922416, "step": 98680 }, { "epoch": 16.09869494290375, "grad_norm": 1.0143851041793823, "learning_rate": 5.575383143646151e-06, "loss": 0.1223, "num_input_tokens_seen": 212934160, "step": 98685 }, { "epoch": 16.099510603588907, "grad_norm": 0.9290143251419067, "learning_rate": 5.5731428861242775e-06, "loss": 0.2463, "num_input_tokens_seen": 212945936, "step": 98690 }, { "epoch": 16.100326264274063, "grad_norm": 1.783669114112854, "learning_rate": 5.5709030223123075e-06, "loss": 0.0738, "num_input_tokens_seen": 212956848, "step": 98695 }, { "epoch": 16.10114192495922, "grad_norm": 0.06398912519216537, "learning_rate": 5.568663552255635e-06, "loss": 0.0231, "num_input_tokens_seen": 212967760, "step": 98700 }, { "epoch": 16.10195758564437, "grad_norm": 0.09099388122558594, "learning_rate": 5.5664244759996434e-06, "loss": 0.1373, "num_input_tokens_seen": 212978224, "step": 98705 }, { "epoch": 16.102773246329527, "grad_norm": 2.275449514389038, "learning_rate": 5.564185793589716e-06, "loss": 0.1506, "num_input_tokens_seen": 212988720, "step": 98710 }, { "epoch": 16.103588907014682, "grad_norm": 1.3355578184127808, "learning_rate": 5.561947505071219e-06, "loss": 0.0521, "num_input_tokens_seen": 212999952, "step": 98715 }, { "epoch": 16.104404567699838, "grad_norm": 0.6304321885108948, "learning_rate": 5.559709610489517e-06, "loss": 0.0413, "num_input_tokens_seen": 213011088, "step": 98720 }, { "epoch": 16.10522022838499, "grad_norm": 1.646744966506958, "learning_rate": 5.557472109889958e-06, "loss": 0.0262, "num_input_tokens_seen": 213022480, "step": 98725 }, { "epoch": 16.106035889070146, "grad_norm": 0.19437673687934875, "learning_rate": 5.55523500331789e-06, "loss": 0.0599, "num_input_tokens_seen": 213033040, "step": 98730 }, { "epoch": 16.1068515497553, "grad_norm": 1.4571030139923096, "learning_rate": 5.552998290818656e-06, "loss": 0.1095, "num_input_tokens_seen": 213044336, "step": 98735 }, { "epoch": 16.107667210440457, "grad_norm": 0.15444320440292358, "learning_rate": 5.550761972437582e-06, "loss": 0.0245, "num_input_tokens_seen": 213053456, "step": 98740 }, { "epoch": 16.108482871125613, "grad_norm": 0.19539068639278412, "learning_rate": 5.548526048219987e-06, "loss": 0.022, "num_input_tokens_seen": 213064752, "step": 98745 }, { "epoch": 16.109298531810765, "grad_norm": 0.363610178232193, "learning_rate": 5.546290518211189e-06, "loss": 0.0462, "num_input_tokens_seen": 213076080, "step": 98750 }, { "epoch": 16.11011419249592, "grad_norm": 0.6949738264083862, "learning_rate": 5.5440553824564925e-06, "loss": 0.0339, "num_input_tokens_seen": 213085264, "step": 98755 }, { "epoch": 16.110929853181077, "grad_norm": 0.7010151743888855, "learning_rate": 5.541820641001194e-06, "loss": 0.3714, "num_input_tokens_seen": 213095760, "step": 98760 }, { "epoch": 16.111745513866232, "grad_norm": 0.11270341277122498, "learning_rate": 5.539586293890586e-06, "loss": 0.0179, "num_input_tokens_seen": 213106384, "step": 98765 }, { "epoch": 16.112561174551388, "grad_norm": 0.11163002252578735, "learning_rate": 5.537352341169949e-06, "loss": 0.198, "num_input_tokens_seen": 213117936, "step": 98770 }, { "epoch": 16.11337683523654, "grad_norm": 0.08848673850297928, "learning_rate": 5.535118782884549e-06, "loss": 0.0789, "num_input_tokens_seen": 213128240, "step": 98775 }, { "epoch": 16.114192495921696, "grad_norm": 0.5614897608757019, "learning_rate": 5.532885619079675e-06, "loss": 0.02, "num_input_tokens_seen": 213140240, "step": 98780 }, { "epoch": 16.11500815660685, "grad_norm": 0.17527692019939423, "learning_rate": 5.530652849800555e-06, "loss": 0.1128, "num_input_tokens_seen": 213150288, "step": 98785 }, { "epoch": 16.115823817292007, "grad_norm": 3.33335280418396, "learning_rate": 5.5284204750924606e-06, "loss": 0.1625, "num_input_tokens_seen": 213161456, "step": 98790 }, { "epoch": 16.116639477977163, "grad_norm": 0.0818326473236084, "learning_rate": 5.5261884950006285e-06, "loss": 0.0187, "num_input_tokens_seen": 213173040, "step": 98795 }, { "epoch": 16.117455138662315, "grad_norm": 0.04095581918954849, "learning_rate": 5.523956909570288e-06, "loss": 0.0129, "num_input_tokens_seen": 213184656, "step": 98800 }, { "epoch": 16.11827079934747, "grad_norm": 0.012931139208376408, "learning_rate": 5.521725718846671e-06, "loss": 0.0949, "num_input_tokens_seen": 213194224, "step": 98805 }, { "epoch": 16.119086460032626, "grad_norm": 0.6436774134635925, "learning_rate": 5.51949492287499e-06, "loss": 0.0853, "num_input_tokens_seen": 213205392, "step": 98810 }, { "epoch": 16.119902120717782, "grad_norm": 0.22924308478832245, "learning_rate": 5.517264521700457e-06, "loss": 0.1316, "num_input_tokens_seen": 213216496, "step": 98815 }, { "epoch": 16.120717781402938, "grad_norm": 0.1410619616508484, "learning_rate": 5.515034515368275e-06, "loss": 0.052, "num_input_tokens_seen": 213227920, "step": 98820 }, { "epoch": 16.12153344208809, "grad_norm": 0.0794786661863327, "learning_rate": 5.512804903923635e-06, "loss": 0.1379, "num_input_tokens_seen": 213237584, "step": 98825 }, { "epoch": 16.122349102773246, "grad_norm": 1.5165114402770996, "learning_rate": 5.510575687411729e-06, "loss": 0.0738, "num_input_tokens_seen": 213248336, "step": 98830 }, { "epoch": 16.1231647634584, "grad_norm": 0.13712362945079803, "learning_rate": 5.508346865877728e-06, "loss": 0.0291, "num_input_tokens_seen": 213258736, "step": 98835 }, { "epoch": 16.123980424143557, "grad_norm": 0.02381555736064911, "learning_rate": 5.506118439366803e-06, "loss": 0.0265, "num_input_tokens_seen": 213270768, "step": 98840 }, { "epoch": 16.124796084828713, "grad_norm": 0.03675463795661926, "learning_rate": 5.50389040792412e-06, "loss": 0.0369, "num_input_tokens_seen": 213282576, "step": 98845 }, { "epoch": 16.125611745513865, "grad_norm": 2.494063138961792, "learning_rate": 5.501662771594831e-06, "loss": 0.2222, "num_input_tokens_seen": 213291568, "step": 98850 }, { "epoch": 16.12642740619902, "grad_norm": 0.18474511802196503, "learning_rate": 5.49943553042408e-06, "loss": 0.0708, "num_input_tokens_seen": 213303056, "step": 98855 }, { "epoch": 16.127243066884176, "grad_norm": 2.1791067123413086, "learning_rate": 5.497208684457006e-06, "loss": 0.1771, "num_input_tokens_seen": 213314160, "step": 98860 }, { "epoch": 16.128058727569332, "grad_norm": 0.11067349463701248, "learning_rate": 5.494982233738741e-06, "loss": 0.0429, "num_input_tokens_seen": 213325232, "step": 98865 }, { "epoch": 16.128874388254488, "grad_norm": 1.3505080938339233, "learning_rate": 5.492756178314404e-06, "loss": 0.1269, "num_input_tokens_seen": 213336976, "step": 98870 }, { "epoch": 16.12969004893964, "grad_norm": 0.5033813714981079, "learning_rate": 5.490530518229109e-06, "loss": 0.0338, "num_input_tokens_seen": 213346672, "step": 98875 }, { "epoch": 16.130505709624796, "grad_norm": 2.433887243270874, "learning_rate": 5.488305253527953e-06, "loss": 0.14, "num_input_tokens_seen": 213358128, "step": 98880 }, { "epoch": 16.13132137030995, "grad_norm": 0.1618814468383789, "learning_rate": 5.486080384256062e-06, "loss": 0.1394, "num_input_tokens_seen": 213368112, "step": 98885 }, { "epoch": 16.132137030995107, "grad_norm": 0.02534686028957367, "learning_rate": 5.483855910458491e-06, "loss": 0.0433, "num_input_tokens_seen": 213378608, "step": 98890 }, { "epoch": 16.13295269168026, "grad_norm": 0.07478151470422745, "learning_rate": 5.481631832180353e-06, "loss": 0.0838, "num_input_tokens_seen": 213387856, "step": 98895 }, { "epoch": 16.133768352365415, "grad_norm": 0.07286366820335388, "learning_rate": 5.479408149466692e-06, "loss": 0.1122, "num_input_tokens_seen": 213398832, "step": 98900 }, { "epoch": 16.13458401305057, "grad_norm": 1.5027772188186646, "learning_rate": 5.477184862362602e-06, "loss": 0.1295, "num_input_tokens_seen": 213409488, "step": 98905 }, { "epoch": 16.135399673735726, "grad_norm": 0.08146897703409195, "learning_rate": 5.474961970913114e-06, "loss": 0.0286, "num_input_tokens_seen": 213419984, "step": 98910 }, { "epoch": 16.136215334420882, "grad_norm": 0.027600159868597984, "learning_rate": 5.472739475163305e-06, "loss": 0.2464, "num_input_tokens_seen": 213430960, "step": 98915 }, { "epoch": 16.137030995106034, "grad_norm": 0.019864169880747795, "learning_rate": 5.470517375158188e-06, "loss": 0.1038, "num_input_tokens_seen": 213440656, "step": 98920 }, { "epoch": 16.13784665579119, "grad_norm": 0.03184859827160835, "learning_rate": 5.468295670942824e-06, "loss": 0.0166, "num_input_tokens_seen": 213451376, "step": 98925 }, { "epoch": 16.138662316476346, "grad_norm": 0.6648868918418884, "learning_rate": 5.466074362562215e-06, "loss": 0.013, "num_input_tokens_seen": 213462864, "step": 98930 }, { "epoch": 16.1394779771615, "grad_norm": 0.054014552384614944, "learning_rate": 5.463853450061396e-06, "loss": 0.0557, "num_input_tokens_seen": 213474576, "step": 98935 }, { "epoch": 16.140293637846657, "grad_norm": 0.24138899147510529, "learning_rate": 5.46163293348537e-06, "loss": 0.1513, "num_input_tokens_seen": 213485488, "step": 98940 }, { "epoch": 16.14110929853181, "grad_norm": 2.0772275924682617, "learning_rate": 5.4594128128791364e-06, "loss": 0.1136, "num_input_tokens_seen": 213495984, "step": 98945 }, { "epoch": 16.141924959216965, "grad_norm": 0.07937964797019958, "learning_rate": 5.457193088287693e-06, "loss": 0.0847, "num_input_tokens_seen": 213507600, "step": 98950 }, { "epoch": 16.14274061990212, "grad_norm": 1.4285684823989868, "learning_rate": 5.454973759756024e-06, "loss": 0.1005, "num_input_tokens_seen": 213518064, "step": 98955 }, { "epoch": 16.143556280587276, "grad_norm": 0.7510390281677246, "learning_rate": 5.452754827329104e-06, "loss": 0.1555, "num_input_tokens_seen": 213528656, "step": 98960 }, { "epoch": 16.144371941272432, "grad_norm": 0.08668909221887589, "learning_rate": 5.450536291051908e-06, "loss": 0.0274, "num_input_tokens_seen": 213537648, "step": 98965 }, { "epoch": 16.145187601957584, "grad_norm": 1.5770328044891357, "learning_rate": 5.448318150969392e-06, "loss": 0.0549, "num_input_tokens_seen": 213549488, "step": 98970 }, { "epoch": 16.14600326264274, "grad_norm": 1.0337929725646973, "learning_rate": 5.446100407126512e-06, "loss": 0.0913, "num_input_tokens_seen": 213558704, "step": 98975 }, { "epoch": 16.146818923327896, "grad_norm": 0.5418010950088501, "learning_rate": 5.4438830595682136e-06, "loss": 0.1775, "num_input_tokens_seen": 213569584, "step": 98980 }, { "epoch": 16.14763458401305, "grad_norm": 0.13512757420539856, "learning_rate": 5.4416661083394275e-06, "loss": 0.0206, "num_input_tokens_seen": 213580048, "step": 98985 }, { "epoch": 16.148450244698207, "grad_norm": 0.037330906838178635, "learning_rate": 5.439449553485101e-06, "loss": 0.176, "num_input_tokens_seen": 213590768, "step": 98990 }, { "epoch": 16.14926590538336, "grad_norm": 1.6277064085006714, "learning_rate": 5.4372333950501345e-06, "loss": 0.1667, "num_input_tokens_seen": 213599824, "step": 98995 }, { "epoch": 16.150081566068515, "grad_norm": 0.2572150230407715, "learning_rate": 5.435017633079459e-06, "loss": 0.0991, "num_input_tokens_seen": 213610352, "step": 99000 }, { "epoch": 16.15089722675367, "grad_norm": 0.19340580701828003, "learning_rate": 5.432802267617959e-06, "loss": 0.1236, "num_input_tokens_seen": 213621744, "step": 99005 }, { "epoch": 16.151712887438826, "grad_norm": 1.9269708395004272, "learning_rate": 5.43058729871056e-06, "loss": 0.1956, "num_input_tokens_seen": 213632272, "step": 99010 }, { "epoch": 16.152528548123982, "grad_norm": 0.8962540626525879, "learning_rate": 5.428372726402117e-06, "loss": 0.1358, "num_input_tokens_seen": 213641808, "step": 99015 }, { "epoch": 16.153344208809134, "grad_norm": 0.06999015063047409, "learning_rate": 5.426158550737545e-06, "loss": 0.1057, "num_input_tokens_seen": 213651760, "step": 99020 }, { "epoch": 16.15415986949429, "grad_norm": 0.3096623420715332, "learning_rate": 5.423944771761688e-06, "loss": 0.0121, "num_input_tokens_seen": 213662608, "step": 99025 }, { "epoch": 16.154975530179446, "grad_norm": 6.329553127288818, "learning_rate": 5.421731389519438e-06, "loss": 0.1768, "num_input_tokens_seen": 213673232, "step": 99030 }, { "epoch": 16.1557911908646, "grad_norm": 1.9393221139907837, "learning_rate": 5.419518404055624e-06, "loss": 0.1584, "num_input_tokens_seen": 213683792, "step": 99035 }, { "epoch": 16.156606851549757, "grad_norm": 0.09724196046590805, "learning_rate": 5.417305815415122e-06, "loss": 0.0386, "num_input_tokens_seen": 213694224, "step": 99040 }, { "epoch": 16.15742251223491, "grad_norm": 0.09870327264070511, "learning_rate": 5.415093623642745e-06, "loss": 0.0429, "num_input_tokens_seen": 213704592, "step": 99045 }, { "epoch": 16.158238172920065, "grad_norm": 0.9723864793777466, "learning_rate": 5.412881828783359e-06, "loss": 0.0248, "num_input_tokens_seen": 213716144, "step": 99050 }, { "epoch": 16.15905383360522, "grad_norm": 0.026302125304937363, "learning_rate": 5.410670430881754e-06, "loss": 0.0149, "num_input_tokens_seen": 213726640, "step": 99055 }, { "epoch": 16.159869494290376, "grad_norm": 0.018130941316485405, "learning_rate": 5.408459429982776e-06, "loss": 0.0746, "num_input_tokens_seen": 213738320, "step": 99060 }, { "epoch": 16.160685154975532, "grad_norm": 0.06838533282279968, "learning_rate": 5.40624882613121e-06, "loss": 0.2887, "num_input_tokens_seen": 213750320, "step": 99065 }, { "epoch": 16.161500815660684, "grad_norm": 0.08375386893749237, "learning_rate": 5.404038619371882e-06, "loss": 0.0377, "num_input_tokens_seen": 213760368, "step": 99070 }, { "epoch": 16.16231647634584, "grad_norm": 0.05195176601409912, "learning_rate": 5.401828809749554e-06, "loss": 0.0759, "num_input_tokens_seen": 213771536, "step": 99075 }, { "epoch": 16.163132137030995, "grad_norm": 0.12018278986215591, "learning_rate": 5.399619397309039e-06, "loss": 0.1007, "num_input_tokens_seen": 213782288, "step": 99080 }, { "epoch": 16.16394779771615, "grad_norm": 0.03939139097929001, "learning_rate": 5.397410382095097e-06, "loss": 0.0092, "num_input_tokens_seen": 213792208, "step": 99085 }, { "epoch": 16.164763458401303, "grad_norm": 0.24635763466358185, "learning_rate": 5.395201764152502e-06, "loss": 0.0297, "num_input_tokens_seen": 213804144, "step": 99090 }, { "epoch": 16.16557911908646, "grad_norm": 0.0647096335887909, "learning_rate": 5.392993543526015e-06, "loss": 0.1432, "num_input_tokens_seen": 213814032, "step": 99095 }, { "epoch": 16.166394779771615, "grad_norm": 0.11625116318464279, "learning_rate": 5.390785720260388e-06, "loss": 0.0078, "num_input_tokens_seen": 213823824, "step": 99100 }, { "epoch": 16.16721044045677, "grad_norm": 0.18215346336364746, "learning_rate": 5.388578294400362e-06, "loss": 0.0279, "num_input_tokens_seen": 213834224, "step": 99105 }, { "epoch": 16.168026101141926, "grad_norm": 0.43936651945114136, "learning_rate": 5.386371265990678e-06, "loss": 0.0964, "num_input_tokens_seen": 213844240, "step": 99110 }, { "epoch": 16.16884176182708, "grad_norm": 0.08461743593215942, "learning_rate": 5.384164635076061e-06, "loss": 0.0117, "num_input_tokens_seen": 213854896, "step": 99115 }, { "epoch": 16.169657422512234, "grad_norm": 0.06416762620210648, "learning_rate": 5.381958401701231e-06, "loss": 0.1427, "num_input_tokens_seen": 213866640, "step": 99120 }, { "epoch": 16.17047308319739, "grad_norm": 0.08776602894067764, "learning_rate": 5.379752565910903e-06, "loss": 0.4142, "num_input_tokens_seen": 213878192, "step": 99125 }, { "epoch": 16.171288743882545, "grad_norm": 0.35231080651283264, "learning_rate": 5.377547127749772e-06, "loss": 0.035, "num_input_tokens_seen": 213888752, "step": 99130 }, { "epoch": 16.1721044045677, "grad_norm": 0.1495053768157959, "learning_rate": 5.3753420872625556e-06, "loss": 0.0333, "num_input_tokens_seen": 213899600, "step": 99135 }, { "epoch": 16.172920065252853, "grad_norm": 1.13886296749115, "learning_rate": 5.373137444493912e-06, "loss": 0.1493, "num_input_tokens_seen": 213911152, "step": 99140 }, { "epoch": 16.17373572593801, "grad_norm": 1.4407057762145996, "learning_rate": 5.370933199488551e-06, "loss": 0.1687, "num_input_tokens_seen": 213922960, "step": 99145 }, { "epoch": 16.174551386623165, "grad_norm": 0.3514598309993744, "learning_rate": 5.368729352291116e-06, "loss": 0.0233, "num_input_tokens_seen": 213933136, "step": 99150 }, { "epoch": 16.17536704730832, "grad_norm": 2.269033432006836, "learning_rate": 5.366525902946301e-06, "loss": 0.0703, "num_input_tokens_seen": 213943472, "step": 99155 }, { "epoch": 16.176182707993476, "grad_norm": 0.11724711954593658, "learning_rate": 5.364322851498729e-06, "loss": 0.0996, "num_input_tokens_seen": 213953584, "step": 99160 }, { "epoch": 16.17699836867863, "grad_norm": 0.45960766077041626, "learning_rate": 5.3621201979930786e-06, "loss": 0.1387, "num_input_tokens_seen": 213964176, "step": 99165 }, { "epoch": 16.177814029363784, "grad_norm": 0.0762137919664383, "learning_rate": 5.35991794247396e-06, "loss": 0.0073, "num_input_tokens_seen": 213974320, "step": 99170 }, { "epoch": 16.17862969004894, "grad_norm": 0.5656600594520569, "learning_rate": 5.3577160849860335e-06, "loss": 0.1628, "num_input_tokens_seen": 213984976, "step": 99175 }, { "epoch": 16.179445350734095, "grad_norm": 2.614422082901001, "learning_rate": 5.355514625573896e-06, "loss": 0.2452, "num_input_tokens_seen": 213996624, "step": 99180 }, { "epoch": 16.18026101141925, "grad_norm": 0.46891239285469055, "learning_rate": 5.353313564282189e-06, "loss": 0.1743, "num_input_tokens_seen": 214005808, "step": 99185 }, { "epoch": 16.181076672104403, "grad_norm": 0.04224373400211334, "learning_rate": 5.351112901155492e-06, "loss": 0.1038, "num_input_tokens_seen": 214016720, "step": 99190 }, { "epoch": 16.18189233278956, "grad_norm": 0.18491236865520477, "learning_rate": 5.348912636238423e-06, "loss": 0.0201, "num_input_tokens_seen": 214028272, "step": 99195 }, { "epoch": 16.182707993474715, "grad_norm": 1.2098257541656494, "learning_rate": 5.346712769575571e-06, "loss": 0.0467, "num_input_tokens_seen": 214038896, "step": 99200 }, { "epoch": 16.18352365415987, "grad_norm": 0.039774034172296524, "learning_rate": 5.344513301211518e-06, "loss": 0.0694, "num_input_tokens_seen": 214050224, "step": 99205 }, { "epoch": 16.184339314845026, "grad_norm": 0.024049891158938408, "learning_rate": 5.342314231190837e-06, "loss": 0.0326, "num_input_tokens_seen": 214059408, "step": 99210 }, { "epoch": 16.18515497553018, "grad_norm": 1.0352935791015625, "learning_rate": 5.340115559558092e-06, "loss": 0.095, "num_input_tokens_seen": 214071216, "step": 99215 }, { "epoch": 16.185970636215334, "grad_norm": 1.6924489736557007, "learning_rate": 5.337917286357849e-06, "loss": 0.1678, "num_input_tokens_seen": 214080784, "step": 99220 }, { "epoch": 16.18678629690049, "grad_norm": 0.6818826198577881, "learning_rate": 5.335719411634657e-06, "loss": 0.139, "num_input_tokens_seen": 214091984, "step": 99225 }, { "epoch": 16.187601957585645, "grad_norm": 0.9161953330039978, "learning_rate": 5.333521935433053e-06, "loss": 0.1305, "num_input_tokens_seen": 214103632, "step": 99230 }, { "epoch": 16.1884176182708, "grad_norm": 0.03904346004128456, "learning_rate": 5.331324857797579e-06, "loss": 0.0272, "num_input_tokens_seen": 214111920, "step": 99235 }, { "epoch": 16.189233278955953, "grad_norm": 0.2437405288219452, "learning_rate": 5.329128178772755e-06, "loss": 0.0104, "num_input_tokens_seen": 214120048, "step": 99240 }, { "epoch": 16.19004893964111, "grad_norm": 1.6447832584381104, "learning_rate": 5.3269318984031064e-06, "loss": 0.0633, "num_input_tokens_seen": 214129296, "step": 99245 }, { "epoch": 16.190864600326265, "grad_norm": 0.04099424183368683, "learning_rate": 5.324736016733137e-06, "loss": 0.2277, "num_input_tokens_seen": 214138832, "step": 99250 }, { "epoch": 16.19168026101142, "grad_norm": 2.0867135524749756, "learning_rate": 5.322540533807352e-06, "loss": 0.1324, "num_input_tokens_seen": 214149840, "step": 99255 }, { "epoch": 16.192495921696572, "grad_norm": 0.14162029325962067, "learning_rate": 5.320345449670247e-06, "loss": 0.1561, "num_input_tokens_seen": 214161360, "step": 99260 }, { "epoch": 16.193311582381728, "grad_norm": 2.3199872970581055, "learning_rate": 5.318150764366306e-06, "loss": 0.2521, "num_input_tokens_seen": 214172496, "step": 99265 }, { "epoch": 16.194127243066884, "grad_norm": 0.0866454690694809, "learning_rate": 5.315956477940009e-06, "loss": 0.1447, "num_input_tokens_seen": 214183504, "step": 99270 }, { "epoch": 16.19494290375204, "grad_norm": 0.9273226857185364, "learning_rate": 5.313762590435817e-06, "loss": 0.1046, "num_input_tokens_seen": 214194864, "step": 99275 }, { "epoch": 16.195758564437195, "grad_norm": 0.1339752972126007, "learning_rate": 5.311569101898215e-06, "loss": 0.1922, "num_input_tokens_seen": 214206512, "step": 99280 }, { "epoch": 16.196574225122347, "grad_norm": 0.08116097003221512, "learning_rate": 5.309376012371628e-06, "loss": 0.1024, "num_input_tokens_seen": 214218960, "step": 99285 }, { "epoch": 16.197389885807503, "grad_norm": 2.2913174629211426, "learning_rate": 5.307183321900527e-06, "loss": 0.122, "num_input_tokens_seen": 214229520, "step": 99290 }, { "epoch": 16.19820554649266, "grad_norm": 1.1128835678100586, "learning_rate": 5.304991030529327e-06, "loss": 0.1006, "num_input_tokens_seen": 214240112, "step": 99295 }, { "epoch": 16.199021207177815, "grad_norm": 0.03970722481608391, "learning_rate": 5.302799138302475e-06, "loss": 0.1122, "num_input_tokens_seen": 214251248, "step": 99300 }, { "epoch": 16.19983686786297, "grad_norm": 0.2378087192773819, "learning_rate": 5.3006076452643855e-06, "loss": 0.0912, "num_input_tokens_seen": 214262032, "step": 99305 }, { "epoch": 16.200652528548122, "grad_norm": 0.618375837802887, "learning_rate": 5.29841655145947e-06, "loss": 0.117, "num_input_tokens_seen": 214273104, "step": 99310 }, { "epoch": 16.201468189233278, "grad_norm": 0.1723867654800415, "learning_rate": 5.296225856932141e-06, "loss": 0.0453, "num_input_tokens_seen": 214284720, "step": 99315 }, { "epoch": 16.202283849918434, "grad_norm": 0.031658872961997986, "learning_rate": 5.2940355617267875e-06, "loss": 0.0173, "num_input_tokens_seen": 214296336, "step": 99320 }, { "epoch": 16.20309951060359, "grad_norm": 0.05870722606778145, "learning_rate": 5.291845665887804e-06, "loss": 0.029, "num_input_tokens_seen": 214307888, "step": 99325 }, { "epoch": 16.203915171288745, "grad_norm": 1.2670345306396484, "learning_rate": 5.28965616945957e-06, "loss": 0.0245, "num_input_tokens_seen": 214318352, "step": 99330 }, { "epoch": 16.204730831973897, "grad_norm": 2.498335361480713, "learning_rate": 5.287467072486454e-06, "loss": 0.0386, "num_input_tokens_seen": 214329232, "step": 99335 }, { "epoch": 16.205546492659053, "grad_norm": 0.05860970914363861, "learning_rate": 5.285278375012828e-06, "loss": 0.0229, "num_input_tokens_seen": 214338416, "step": 99340 }, { "epoch": 16.20636215334421, "grad_norm": 0.1565847247838974, "learning_rate": 5.283090077083047e-06, "loss": 0.0621, "num_input_tokens_seen": 214348976, "step": 99345 }, { "epoch": 16.207177814029365, "grad_norm": 0.0725700631737709, "learning_rate": 5.280902178741457e-06, "loss": 0.0518, "num_input_tokens_seen": 214359344, "step": 99350 }, { "epoch": 16.20799347471452, "grad_norm": 1.380632996559143, "learning_rate": 5.278714680032398e-06, "loss": 0.1203, "num_input_tokens_seen": 214370672, "step": 99355 }, { "epoch": 16.208809135399672, "grad_norm": 2.3231658935546875, "learning_rate": 5.276527581000207e-06, "loss": 0.138, "num_input_tokens_seen": 214382128, "step": 99360 }, { "epoch": 16.209624796084828, "grad_norm": 0.10415992885828018, "learning_rate": 5.2743408816892045e-06, "loss": 0.0453, "num_input_tokens_seen": 214393008, "step": 99365 }, { "epoch": 16.210440456769984, "grad_norm": 0.06382453441619873, "learning_rate": 5.272154582143709e-06, "loss": 0.0346, "num_input_tokens_seen": 214404112, "step": 99370 }, { "epoch": 16.21125611745514, "grad_norm": 0.27101510763168335, "learning_rate": 5.269968682408024e-06, "loss": 0.0961, "num_input_tokens_seen": 214416432, "step": 99375 }, { "epoch": 16.212071778140295, "grad_norm": 1.154881477355957, "learning_rate": 5.267783182526456e-06, "loss": 0.1464, "num_input_tokens_seen": 214428240, "step": 99380 }, { "epoch": 16.212887438825447, "grad_norm": 0.04423864185810089, "learning_rate": 5.265598082543294e-06, "loss": 0.1059, "num_input_tokens_seen": 214439696, "step": 99385 }, { "epoch": 16.213703099510603, "grad_norm": 0.0593026764690876, "learning_rate": 5.2634133825028215e-06, "loss": 0.0112, "num_input_tokens_seen": 214449936, "step": 99390 }, { "epoch": 16.21451876019576, "grad_norm": 0.19025208055973053, "learning_rate": 5.261229082449312e-06, "loss": 0.2637, "num_input_tokens_seen": 214461712, "step": 99395 }, { "epoch": 16.215334420880914, "grad_norm": 0.026981553062796593, "learning_rate": 5.259045182427039e-06, "loss": 0.0101, "num_input_tokens_seen": 214473584, "step": 99400 }, { "epoch": 16.21615008156607, "grad_norm": 1.6098740100860596, "learning_rate": 5.256861682480249e-06, "loss": 0.1997, "num_input_tokens_seen": 214484528, "step": 99405 }, { "epoch": 16.216965742251222, "grad_norm": 0.677177906036377, "learning_rate": 5.254678582653211e-06, "loss": 0.0213, "num_input_tokens_seen": 214495536, "step": 99410 }, { "epoch": 16.217781402936378, "grad_norm": 2.0701675415039062, "learning_rate": 5.2524958829901605e-06, "loss": 0.137, "num_input_tokens_seen": 214506128, "step": 99415 }, { "epoch": 16.218597063621534, "grad_norm": 0.2108633816242218, "learning_rate": 5.250313583535333e-06, "loss": 0.0132, "num_input_tokens_seen": 214516528, "step": 99420 }, { "epoch": 16.21941272430669, "grad_norm": 0.566410481929779, "learning_rate": 5.248131684332952e-06, "loss": 0.1372, "num_input_tokens_seen": 214526896, "step": 99425 }, { "epoch": 16.22022838499184, "grad_norm": 0.27615654468536377, "learning_rate": 5.245950185427243e-06, "loss": 0.1696, "num_input_tokens_seen": 214538448, "step": 99430 }, { "epoch": 16.221044045676997, "grad_norm": 0.5424875617027283, "learning_rate": 5.24376908686241e-06, "loss": 0.0527, "num_input_tokens_seen": 214548112, "step": 99435 }, { "epoch": 16.221859706362153, "grad_norm": 0.867325484752655, "learning_rate": 5.241588388682661e-06, "loss": 0.1737, "num_input_tokens_seen": 214558832, "step": 99440 }, { "epoch": 16.22267536704731, "grad_norm": 0.11347059160470963, "learning_rate": 5.239408090932188e-06, "loss": 0.1018, "num_input_tokens_seen": 214570064, "step": 99445 }, { "epoch": 16.223491027732464, "grad_norm": 0.030056532472372055, "learning_rate": 5.2372281936551805e-06, "loss": 0.1277, "num_input_tokens_seen": 214580944, "step": 99450 }, { "epoch": 16.224306688417617, "grad_norm": 0.24728965759277344, "learning_rate": 5.235048696895811e-06, "loss": 0.1302, "num_input_tokens_seen": 214591696, "step": 99455 }, { "epoch": 16.225122349102772, "grad_norm": 0.14697983860969543, "learning_rate": 5.232869600698256e-06, "loss": 0.0321, "num_input_tokens_seen": 214602384, "step": 99460 }, { "epoch": 16.225938009787928, "grad_norm": 0.35333892703056335, "learning_rate": 5.230690905106672e-06, "loss": 0.2126, "num_input_tokens_seen": 214613712, "step": 99465 }, { "epoch": 16.226753670473084, "grad_norm": 0.06778699159622192, "learning_rate": 5.228512610165218e-06, "loss": 0.0303, "num_input_tokens_seen": 214624752, "step": 99470 }, { "epoch": 16.22756933115824, "grad_norm": 0.09950544685125351, "learning_rate": 5.226334715918038e-06, "loss": 0.2234, "num_input_tokens_seen": 214634608, "step": 99475 }, { "epoch": 16.22838499184339, "grad_norm": 1.3953263759613037, "learning_rate": 5.22415722240927e-06, "loss": 0.0393, "num_input_tokens_seen": 214645616, "step": 99480 }, { "epoch": 16.229200652528547, "grad_norm": 0.17311708629131317, "learning_rate": 5.221980129683041e-06, "loss": 0.0108, "num_input_tokens_seen": 214656944, "step": 99485 }, { "epoch": 16.230016313213703, "grad_norm": 0.14515239000320435, "learning_rate": 5.219803437783477e-06, "loss": 0.0343, "num_input_tokens_seen": 214666800, "step": 99490 }, { "epoch": 16.23083197389886, "grad_norm": 0.1157073825597763, "learning_rate": 5.217627146754689e-06, "loss": 0.0233, "num_input_tokens_seen": 214677872, "step": 99495 }, { "epoch": 16.231647634584014, "grad_norm": 0.20568791031837463, "learning_rate": 5.215451256640783e-06, "loss": 0.0864, "num_input_tokens_seen": 214689232, "step": 99500 }, { "epoch": 16.232463295269167, "grad_norm": 0.2438693344593048, "learning_rate": 5.2132757674858534e-06, "loss": 0.015, "num_input_tokens_seen": 214700176, "step": 99505 }, { "epoch": 16.233278955954322, "grad_norm": 0.03151680901646614, "learning_rate": 5.211100679333994e-06, "loss": 0.0297, "num_input_tokens_seen": 214711920, "step": 99510 }, { "epoch": 16.234094616639478, "grad_norm": 0.2817326784133911, "learning_rate": 5.208925992229283e-06, "loss": 0.104, "num_input_tokens_seen": 214722544, "step": 99515 }, { "epoch": 16.234910277324634, "grad_norm": 1.2359837293624878, "learning_rate": 5.206751706215785e-06, "loss": 0.1137, "num_input_tokens_seen": 214733488, "step": 99520 }, { "epoch": 16.23572593800979, "grad_norm": 0.07660293579101562, "learning_rate": 5.204577821337589e-06, "loss": 0.0393, "num_input_tokens_seen": 214744336, "step": 99525 }, { "epoch": 16.23654159869494, "grad_norm": 0.11399659514427185, "learning_rate": 5.2024043376387225e-06, "loss": 0.0263, "num_input_tokens_seen": 214755664, "step": 99530 }, { "epoch": 16.237357259380097, "grad_norm": 0.06402376294136047, "learning_rate": 5.200231255163257e-06, "loss": 0.0952, "num_input_tokens_seen": 214765072, "step": 99535 }, { "epoch": 16.238172920065253, "grad_norm": 1.333353042602539, "learning_rate": 5.1980585739552116e-06, "loss": 0.2313, "num_input_tokens_seen": 214776688, "step": 99540 }, { "epoch": 16.23898858075041, "grad_norm": 0.04566303640604019, "learning_rate": 5.195886294058644e-06, "loss": 0.1412, "num_input_tokens_seen": 214787760, "step": 99545 }, { "epoch": 16.239804241435564, "grad_norm": 0.22328609228134155, "learning_rate": 5.19371441551755e-06, "loss": 0.0118, "num_input_tokens_seen": 214798192, "step": 99550 }, { "epoch": 16.240619902120716, "grad_norm": 0.045755647122859955, "learning_rate": 5.191542938375965e-06, "loss": 0.3015, "num_input_tokens_seen": 214808656, "step": 99555 }, { "epoch": 16.241435562805872, "grad_norm": 0.5651265382766724, "learning_rate": 5.1893718626778945e-06, "loss": 0.1109, "num_input_tokens_seen": 214817552, "step": 99560 }, { "epoch": 16.242251223491028, "grad_norm": 0.667448103427887, "learning_rate": 5.187201188467331e-06, "loss": 0.0249, "num_input_tokens_seen": 214829264, "step": 99565 }, { "epoch": 16.243066884176184, "grad_norm": 1.1140575408935547, "learning_rate": 5.185030915788272e-06, "loss": 0.094, "num_input_tokens_seen": 214840368, "step": 99570 }, { "epoch": 16.24388254486134, "grad_norm": 0.07052021473646164, "learning_rate": 5.182861044684698e-06, "loss": 0.0722, "num_input_tokens_seen": 214851696, "step": 99575 }, { "epoch": 16.24469820554649, "grad_norm": 0.05687835067510605, "learning_rate": 5.180691575200583e-06, "loss": 0.0875, "num_input_tokens_seen": 214862896, "step": 99580 }, { "epoch": 16.245513866231647, "grad_norm": 0.11513650417327881, "learning_rate": 5.178522507379899e-06, "loss": 0.0233, "num_input_tokens_seen": 214872944, "step": 99585 }, { "epoch": 16.246329526916803, "grad_norm": 0.4335269629955292, "learning_rate": 5.176353841266598e-06, "loss": 0.0704, "num_input_tokens_seen": 214884624, "step": 99590 }, { "epoch": 16.24714518760196, "grad_norm": 0.16603167355060577, "learning_rate": 5.1741855769046385e-06, "loss": 0.1068, "num_input_tokens_seen": 214893840, "step": 99595 }, { "epoch": 16.247960848287114, "grad_norm": 2.142573833465576, "learning_rate": 5.172017714337956e-06, "loss": 0.2059, "num_input_tokens_seen": 214904688, "step": 99600 }, { "epoch": 16.248776508972266, "grad_norm": 1.7029997110366821, "learning_rate": 5.169850253610489e-06, "loss": 0.1873, "num_input_tokens_seen": 214916144, "step": 99605 }, { "epoch": 16.249592169657422, "grad_norm": 0.11273956298828125, "learning_rate": 5.167683194766165e-06, "loss": 0.0366, "num_input_tokens_seen": 214927184, "step": 99610 }, { "epoch": 16.250407830342578, "grad_norm": 1.9234892129898071, "learning_rate": 5.1655165378488965e-06, "loss": 0.0996, "num_input_tokens_seen": 214937744, "step": 99615 }, { "epoch": 16.251223491027734, "grad_norm": 0.21996086835861206, "learning_rate": 5.1633502829025986e-06, "loss": 0.1884, "num_input_tokens_seen": 214949008, "step": 99620 }, { "epoch": 16.252039151712886, "grad_norm": 0.019095616415143013, "learning_rate": 5.1611844299711664e-06, "loss": 0.0529, "num_input_tokens_seen": 214960592, "step": 99625 }, { "epoch": 16.25285481239804, "grad_norm": 1.7346216440200806, "learning_rate": 5.159018979098512e-06, "loss": 0.1123, "num_input_tokens_seen": 214970672, "step": 99630 }, { "epoch": 16.253670473083197, "grad_norm": 1.8363884687423706, "learning_rate": 5.1568539303284956e-06, "loss": 0.0982, "num_input_tokens_seen": 214981168, "step": 99635 }, { "epoch": 16.254486133768353, "grad_norm": 0.06719933450222015, "learning_rate": 5.1546892837050196e-06, "loss": 0.0529, "num_input_tokens_seen": 214992304, "step": 99640 }, { "epoch": 16.25530179445351, "grad_norm": 0.27461692690849304, "learning_rate": 5.152525039271924e-06, "loss": 0.1361, "num_input_tokens_seen": 215003440, "step": 99645 }, { "epoch": 16.25611745513866, "grad_norm": 2.785733461380005, "learning_rate": 5.150361197073106e-06, "loss": 0.0706, "num_input_tokens_seen": 215014960, "step": 99650 }, { "epoch": 16.256933115823816, "grad_norm": 0.3741433322429657, "learning_rate": 5.1481977571523816e-06, "loss": 0.0162, "num_input_tokens_seen": 215025808, "step": 99655 }, { "epoch": 16.257748776508972, "grad_norm": 1.835297703742981, "learning_rate": 5.146034719553627e-06, "loss": 0.1413, "num_input_tokens_seen": 215036752, "step": 99660 }, { "epoch": 16.258564437194128, "grad_norm": 0.06675302237272263, "learning_rate": 5.143872084320655e-06, "loss": 0.1698, "num_input_tokens_seen": 215047376, "step": 99665 }, { "epoch": 16.259380097879284, "grad_norm": 0.13466978073120117, "learning_rate": 5.141709851497317e-06, "loss": 0.0215, "num_input_tokens_seen": 215058512, "step": 99670 }, { "epoch": 16.260195758564436, "grad_norm": 0.05941731482744217, "learning_rate": 5.139548021127405e-06, "loss": 0.0218, "num_input_tokens_seen": 215069264, "step": 99675 }, { "epoch": 16.26101141924959, "grad_norm": 0.3785316050052643, "learning_rate": 5.137386593254765e-06, "loss": 0.0259, "num_input_tokens_seen": 215080624, "step": 99680 }, { "epoch": 16.261827079934747, "grad_norm": 0.6807472705841064, "learning_rate": 5.135225567923166e-06, "loss": 0.0122, "num_input_tokens_seen": 215089744, "step": 99685 }, { "epoch": 16.262642740619903, "grad_norm": 0.17593860626220703, "learning_rate": 5.1330649451764355e-06, "loss": 0.055, "num_input_tokens_seen": 215100400, "step": 99690 }, { "epoch": 16.26345840130506, "grad_norm": 0.1768210530281067, "learning_rate": 5.130904725058333e-06, "loss": 0.1406, "num_input_tokens_seen": 215110416, "step": 99695 }, { "epoch": 16.26427406199021, "grad_norm": 0.13967949151992798, "learning_rate": 5.128744907612659e-06, "loss": 0.0251, "num_input_tokens_seen": 215120912, "step": 99700 }, { "epoch": 16.265089722675366, "grad_norm": 0.1909008026123047, "learning_rate": 5.126585492883179e-06, "loss": 0.0424, "num_input_tokens_seen": 215132912, "step": 99705 }, { "epoch": 16.265905383360522, "grad_norm": 0.11782613396644592, "learning_rate": 5.1244264809136546e-06, "loss": 0.0186, "num_input_tokens_seen": 215144240, "step": 99710 }, { "epoch": 16.266721044045678, "grad_norm": 0.041265327483415604, "learning_rate": 5.12226787174784e-06, "loss": 0.0126, "num_input_tokens_seen": 215155568, "step": 99715 }, { "epoch": 16.267536704730833, "grad_norm": 0.13518419861793518, "learning_rate": 5.120109665429485e-06, "loss": 0.1293, "num_input_tokens_seen": 215166320, "step": 99720 }, { "epoch": 16.268352365415986, "grad_norm": 0.1410524845123291, "learning_rate": 5.117951862002327e-06, "loss": 0.022, "num_input_tokens_seen": 215178448, "step": 99725 }, { "epoch": 16.26916802610114, "grad_norm": 0.17256051301956177, "learning_rate": 5.115794461510098e-06, "loss": 0.1212, "num_input_tokens_seen": 215190480, "step": 99730 }, { "epoch": 16.269983686786297, "grad_norm": 0.03230956941843033, "learning_rate": 5.113637463996518e-06, "loss": 0.0114, "num_input_tokens_seen": 215200304, "step": 99735 }, { "epoch": 16.270799347471453, "grad_norm": 0.019546611234545708, "learning_rate": 5.1114808695053e-06, "loss": 0.0226, "num_input_tokens_seen": 215211152, "step": 99740 }, { "epoch": 16.27161500815661, "grad_norm": 0.6575511693954468, "learning_rate": 5.109324678080158e-06, "loss": 0.1149, "num_input_tokens_seen": 215222960, "step": 99745 }, { "epoch": 16.27243066884176, "grad_norm": 0.5165355801582336, "learning_rate": 5.107168889764774e-06, "loss": 0.0673, "num_input_tokens_seen": 215234032, "step": 99750 }, { "epoch": 16.273246329526916, "grad_norm": 0.75928795337677, "learning_rate": 5.10501350460286e-06, "loss": 0.0571, "num_input_tokens_seen": 215243728, "step": 99755 }, { "epoch": 16.274061990212072, "grad_norm": 0.15342682600021362, "learning_rate": 5.102858522638077e-06, "loss": 0.0515, "num_input_tokens_seen": 215254704, "step": 99760 }, { "epoch": 16.274877650897228, "grad_norm": 0.1013823077082634, "learning_rate": 5.100703943914118e-06, "loss": 0.1291, "num_input_tokens_seen": 215264464, "step": 99765 }, { "epoch": 16.275693311582383, "grad_norm": 0.14272189140319824, "learning_rate": 5.098549768474625e-06, "loss": 0.1278, "num_input_tokens_seen": 215275792, "step": 99770 }, { "epoch": 16.276508972267536, "grad_norm": 0.016948290169239044, "learning_rate": 5.096395996363279e-06, "loss": 0.0183, "num_input_tokens_seen": 215285872, "step": 99775 }, { "epoch": 16.27732463295269, "grad_norm": 0.5972411036491394, "learning_rate": 5.094242627623708e-06, "loss": 0.0626, "num_input_tokens_seen": 215296656, "step": 99780 }, { "epoch": 16.278140293637847, "grad_norm": 0.4209493398666382, "learning_rate": 5.0920896622995734e-06, "loss": 0.0697, "num_input_tokens_seen": 215307344, "step": 99785 }, { "epoch": 16.278955954323003, "grad_norm": 0.23233315348625183, "learning_rate": 5.0899371004344835e-06, "loss": 0.0861, "num_input_tokens_seen": 215317648, "step": 99790 }, { "epoch": 16.27977161500816, "grad_norm": 0.10072465240955353, "learning_rate": 5.087784942072091e-06, "loss": 0.0923, "num_input_tokens_seen": 215329552, "step": 99795 }, { "epoch": 16.28058727569331, "grad_norm": 0.08238961547613144, "learning_rate": 5.085633187255981e-06, "loss": 0.064, "num_input_tokens_seen": 215338608, "step": 99800 }, { "epoch": 16.281402936378466, "grad_norm": 0.36046770215034485, "learning_rate": 5.083481836029794e-06, "loss": 0.0145, "num_input_tokens_seen": 215349104, "step": 99805 }, { "epoch": 16.282218597063622, "grad_norm": 0.4969029426574707, "learning_rate": 5.081330888437097e-06, "loss": 0.133, "num_input_tokens_seen": 215360528, "step": 99810 }, { "epoch": 16.283034257748778, "grad_norm": 0.5110458135604858, "learning_rate": 5.079180344521514e-06, "loss": 0.026, "num_input_tokens_seen": 215370768, "step": 99815 }, { "epoch": 16.28384991843393, "grad_norm": 0.06042611226439476, "learning_rate": 5.0770302043265995e-06, "loss": 0.0457, "num_input_tokens_seen": 215381872, "step": 99820 }, { "epoch": 16.284665579119086, "grad_norm": 0.04030626639723778, "learning_rate": 5.074880467895954e-06, "loss": 0.0396, "num_input_tokens_seen": 215392208, "step": 99825 }, { "epoch": 16.28548123980424, "grad_norm": 0.13287553191184998, "learning_rate": 5.07273113527312e-06, "loss": 0.0577, "num_input_tokens_seen": 215403920, "step": 99830 }, { "epoch": 16.286296900489397, "grad_norm": 1.8741616010665894, "learning_rate": 5.070582206501676e-06, "loss": 0.1196, "num_input_tokens_seen": 215414992, "step": 99835 }, { "epoch": 16.287112561174553, "grad_norm": 0.05203970521688461, "learning_rate": 5.068433681625165e-06, "loss": 0.0945, "num_input_tokens_seen": 215426160, "step": 99840 }, { "epoch": 16.287928221859705, "grad_norm": 1.2067148685455322, "learning_rate": 5.066285560687129e-06, "loss": 0.2057, "num_input_tokens_seen": 215437232, "step": 99845 }, { "epoch": 16.28874388254486, "grad_norm": 1.44707190990448, "learning_rate": 5.064137843731107e-06, "loss": 0.231, "num_input_tokens_seen": 215451792, "step": 99850 }, { "epoch": 16.289559543230016, "grad_norm": 0.8692528605461121, "learning_rate": 5.06199053080062e-06, "loss": 0.1162, "num_input_tokens_seen": 215463504, "step": 99855 }, { "epoch": 16.290375203915172, "grad_norm": 0.16475407779216766, "learning_rate": 5.059843621939189e-06, "loss": 0.039, "num_input_tokens_seen": 215474896, "step": 99860 }, { "epoch": 16.291190864600328, "grad_norm": 0.6430028676986694, "learning_rate": 5.057697117190322e-06, "loss": 0.1543, "num_input_tokens_seen": 215486512, "step": 99865 }, { "epoch": 16.29200652528548, "grad_norm": 0.03651651367545128, "learning_rate": 5.0555510165975204e-06, "loss": 0.0316, "num_input_tokens_seen": 215497360, "step": 99870 }, { "epoch": 16.292822185970635, "grad_norm": 0.339214026927948, "learning_rate": 5.053405320204283e-06, "loss": 0.0477, "num_input_tokens_seen": 215507280, "step": 99875 }, { "epoch": 16.29363784665579, "grad_norm": 0.2981514036655426, "learning_rate": 5.051260028054086e-06, "loss": 0.1242, "num_input_tokens_seen": 215517200, "step": 99880 }, { "epoch": 16.294453507340947, "grad_norm": 0.07549314200878143, "learning_rate": 5.049115140190414e-06, "loss": 0.0405, "num_input_tokens_seen": 215527760, "step": 99885 }, { "epoch": 16.295269168026103, "grad_norm": 0.03714780509471893, "learning_rate": 5.046970656656733e-06, "loss": 0.025, "num_input_tokens_seen": 215538640, "step": 99890 }, { "epoch": 16.296084828711255, "grad_norm": 0.15152224898338318, "learning_rate": 5.044826577496497e-06, "loss": 0.0774, "num_input_tokens_seen": 215549840, "step": 99895 }, { "epoch": 16.29690048939641, "grad_norm": 0.7220253944396973, "learning_rate": 5.042682902753179e-06, "loss": 0.1947, "num_input_tokens_seen": 215560752, "step": 99900 }, { "epoch": 16.297716150081566, "grad_norm": 0.0935758650302887, "learning_rate": 5.040539632470196e-06, "loss": 0.1505, "num_input_tokens_seen": 215571952, "step": 99905 }, { "epoch": 16.298531810766722, "grad_norm": 0.08454182744026184, "learning_rate": 5.038396766691011e-06, "loss": 0.0852, "num_input_tokens_seen": 215582096, "step": 99910 }, { "epoch": 16.299347471451878, "grad_norm": 0.17325134575366974, "learning_rate": 5.036254305459023e-06, "loss": 0.1316, "num_input_tokens_seen": 215591120, "step": 99915 }, { "epoch": 16.30016313213703, "grad_norm": 0.08439628034830093, "learning_rate": 5.034112248817685e-06, "loss": 0.1558, "num_input_tokens_seen": 215601520, "step": 99920 }, { "epoch": 16.300978792822185, "grad_norm": 0.12053283303976059, "learning_rate": 5.031970596810376e-06, "loss": 0.1356, "num_input_tokens_seen": 215611664, "step": 99925 }, { "epoch": 16.30179445350734, "grad_norm": 0.6108657121658325, "learning_rate": 5.0298293494805245e-06, "loss": 0.0154, "num_input_tokens_seen": 215621936, "step": 99930 }, { "epoch": 16.302610114192497, "grad_norm": 0.0740627869963646, "learning_rate": 5.027688506871506e-06, "loss": 0.0392, "num_input_tokens_seen": 215630928, "step": 99935 }, { "epoch": 16.303425774877653, "grad_norm": 3.0267112255096436, "learning_rate": 5.02554806902672e-06, "loss": 0.2682, "num_input_tokens_seen": 215643024, "step": 99940 }, { "epoch": 16.304241435562805, "grad_norm": 0.5320202708244324, "learning_rate": 5.023408035989543e-06, "loss": 0.04, "num_input_tokens_seen": 215654096, "step": 99945 }, { "epoch": 16.30505709624796, "grad_norm": 1.924482822418213, "learning_rate": 5.0212684078033455e-06, "loss": 0.1228, "num_input_tokens_seen": 215666352, "step": 99950 }, { "epoch": 16.305872756933116, "grad_norm": 0.08325842767953873, "learning_rate": 5.019129184511487e-06, "loss": 0.0156, "num_input_tokens_seen": 215678288, "step": 99955 }, { "epoch": 16.306688417618272, "grad_norm": 0.18868905305862427, "learning_rate": 5.016990366157323e-06, "loss": 0.0789, "num_input_tokens_seen": 215689040, "step": 99960 }, { "epoch": 16.307504078303428, "grad_norm": 0.04263905808329582, "learning_rate": 5.0148519527841994e-06, "loss": 0.0937, "num_input_tokens_seen": 215699728, "step": 99965 }, { "epoch": 16.30831973898858, "grad_norm": 2.2437100410461426, "learning_rate": 5.012713944435455e-06, "loss": 0.1011, "num_input_tokens_seen": 215710992, "step": 99970 }, { "epoch": 16.309135399673735, "grad_norm": 0.08377119153738022, "learning_rate": 5.010576341154419e-06, "loss": 0.0242, "num_input_tokens_seen": 215721840, "step": 99975 }, { "epoch": 16.30995106035889, "grad_norm": 1.39833664894104, "learning_rate": 5.008439142984408e-06, "loss": 0.2225, "num_input_tokens_seen": 215731600, "step": 99980 }, { "epoch": 16.310766721044047, "grad_norm": 1.4967682361602783, "learning_rate": 5.006302349968742e-06, "loss": 0.1658, "num_input_tokens_seen": 215741264, "step": 99985 }, { "epoch": 16.3115823817292, "grad_norm": 0.030680086463689804, "learning_rate": 5.004165962150722e-06, "loss": 0.0412, "num_input_tokens_seen": 215752624, "step": 99990 }, { "epoch": 16.312398042414355, "grad_norm": 0.27293792366981506, "learning_rate": 5.002029979573647e-06, "loss": 0.0372, "num_input_tokens_seen": 215761680, "step": 99995 }, { "epoch": 16.31321370309951, "grad_norm": 0.27644988894462585, "learning_rate": 4.9998944022808024e-06, "loss": 0.1381, "num_input_tokens_seen": 215771408, "step": 100000 }, { "epoch": 16.314029363784666, "grad_norm": 0.05588941276073456, "learning_rate": 4.9977592303154685e-06, "loss": 0.013, "num_input_tokens_seen": 215781840, "step": 100005 }, { "epoch": 16.31484502446982, "grad_norm": 0.810076117515564, "learning_rate": 4.995624463720919e-06, "loss": 0.0164, "num_input_tokens_seen": 215792048, "step": 100010 }, { "epoch": 16.315660685154974, "grad_norm": 0.8594179153442383, "learning_rate": 4.9934901025404185e-06, "loss": 0.0355, "num_input_tokens_seen": 215804304, "step": 100015 }, { "epoch": 16.31647634584013, "grad_norm": 1.74884033203125, "learning_rate": 4.991356146817219e-06, "loss": 0.2102, "num_input_tokens_seen": 215815632, "step": 100020 }, { "epoch": 16.317292006525285, "grad_norm": 0.5078054070472717, "learning_rate": 4.9892225965945695e-06, "loss": 0.1262, "num_input_tokens_seen": 215825296, "step": 100025 }, { "epoch": 16.31810766721044, "grad_norm": 1.781854510307312, "learning_rate": 4.987089451915714e-06, "loss": 0.1313, "num_input_tokens_seen": 215836272, "step": 100030 }, { "epoch": 16.318923327895597, "grad_norm": 0.2443678081035614, "learning_rate": 4.984956712823874e-06, "loss": 0.1255, "num_input_tokens_seen": 215845904, "step": 100035 }, { "epoch": 16.31973898858075, "grad_norm": 1.2317754030227661, "learning_rate": 4.982824379362272e-06, "loss": 0.0811, "num_input_tokens_seen": 215856688, "step": 100040 }, { "epoch": 16.320554649265905, "grad_norm": 1.916078805923462, "learning_rate": 4.9806924515741344e-06, "loss": 0.2687, "num_input_tokens_seen": 215867120, "step": 100045 }, { "epoch": 16.32137030995106, "grad_norm": 0.053067367523908615, "learning_rate": 4.9785609295026625e-06, "loss": 0.0959, "num_input_tokens_seen": 215878032, "step": 100050 }, { "epoch": 16.322185970636216, "grad_norm": 0.6094605922698975, "learning_rate": 4.97642981319105e-06, "loss": 0.1875, "num_input_tokens_seen": 215889296, "step": 100055 }, { "epoch": 16.32300163132137, "grad_norm": 0.06713512539863586, "learning_rate": 4.9742991026824895e-06, "loss": 0.0336, "num_input_tokens_seen": 215900144, "step": 100060 }, { "epoch": 16.323817292006524, "grad_norm": 0.9383639097213745, "learning_rate": 4.972168798020163e-06, "loss": 0.0907, "num_input_tokens_seen": 215912592, "step": 100065 }, { "epoch": 16.32463295269168, "grad_norm": 1.6343103647232056, "learning_rate": 4.970038899247243e-06, "loss": 0.0971, "num_input_tokens_seen": 215923472, "step": 100070 }, { "epoch": 16.325448613376835, "grad_norm": 2.441800832748413, "learning_rate": 4.9679094064068925e-06, "loss": 0.1981, "num_input_tokens_seen": 215934544, "step": 100075 }, { "epoch": 16.32626427406199, "grad_norm": 0.41700899600982666, "learning_rate": 4.965780319542271e-06, "loss": 0.0815, "num_input_tokens_seen": 215945936, "step": 100080 }, { "epoch": 16.327079934747147, "grad_norm": 0.3945111036300659, "learning_rate": 4.963651638696526e-06, "loss": 0.0852, "num_input_tokens_seen": 215956016, "step": 100085 }, { "epoch": 16.3278955954323, "grad_norm": 1.2638646364212036, "learning_rate": 4.961523363912801e-06, "loss": 0.2249, "num_input_tokens_seen": 215966192, "step": 100090 }, { "epoch": 16.328711256117455, "grad_norm": 0.537774384021759, "learning_rate": 4.959395495234223e-06, "loss": 0.0563, "num_input_tokens_seen": 215978096, "step": 100095 }, { "epoch": 16.32952691680261, "grad_norm": 0.16353821754455566, "learning_rate": 4.957268032703921e-06, "loss": 0.2131, "num_input_tokens_seen": 215989392, "step": 100100 }, { "epoch": 16.330342577487766, "grad_norm": 2.2770349979400635, "learning_rate": 4.955140976365005e-06, "loss": 0.0731, "num_input_tokens_seen": 215999664, "step": 100105 }, { "epoch": 16.33115823817292, "grad_norm": 0.03819974139332771, "learning_rate": 4.953014326260588e-06, "loss": 0.0187, "num_input_tokens_seen": 216010736, "step": 100110 }, { "epoch": 16.331973898858074, "grad_norm": 0.16227993369102478, "learning_rate": 4.950888082433769e-06, "loss": 0.1015, "num_input_tokens_seen": 216021872, "step": 100115 }, { "epoch": 16.33278955954323, "grad_norm": 0.04324056953191757, "learning_rate": 4.948762244927635e-06, "loss": 0.0945, "num_input_tokens_seen": 216033584, "step": 100120 }, { "epoch": 16.333605220228385, "grad_norm": 0.2221997082233429, "learning_rate": 4.946636813785271e-06, "loss": 0.0261, "num_input_tokens_seen": 216044752, "step": 100125 }, { "epoch": 16.33442088091354, "grad_norm": 0.69715416431427, "learning_rate": 4.944511789049752e-06, "loss": 0.0724, "num_input_tokens_seen": 216055952, "step": 100130 }, { "epoch": 16.335236541598697, "grad_norm": 0.5913898944854736, "learning_rate": 4.9423871707641425e-06, "loss": 0.0362, "num_input_tokens_seen": 216066480, "step": 100135 }, { "epoch": 16.33605220228385, "grad_norm": 0.29963821172714233, "learning_rate": 4.940262958971503e-06, "loss": 0.0851, "num_input_tokens_seen": 216077904, "step": 100140 }, { "epoch": 16.336867862969005, "grad_norm": 2.483279228210449, "learning_rate": 4.9381391537148825e-06, "loss": 0.0641, "num_input_tokens_seen": 216089136, "step": 100145 }, { "epoch": 16.33768352365416, "grad_norm": 1.8796013593673706, "learning_rate": 4.936015755037313e-06, "loss": 0.1061, "num_input_tokens_seen": 216101328, "step": 100150 }, { "epoch": 16.338499184339316, "grad_norm": 1.2349752187728882, "learning_rate": 4.933892762981854e-06, "loss": 0.0762, "num_input_tokens_seen": 216111792, "step": 100155 }, { "epoch": 16.339314845024468, "grad_norm": 0.11246251314878464, "learning_rate": 4.9317701775915e-06, "loss": 0.0181, "num_input_tokens_seen": 216121552, "step": 100160 }, { "epoch": 16.340130505709624, "grad_norm": 0.23193123936653137, "learning_rate": 4.9296479989092924e-06, "loss": 0.0208, "num_input_tokens_seen": 216131856, "step": 100165 }, { "epoch": 16.34094616639478, "grad_norm": 0.4570574462413788, "learning_rate": 4.927526226978219e-06, "loss": 0.0246, "num_input_tokens_seen": 216141488, "step": 100170 }, { "epoch": 16.341761827079935, "grad_norm": 0.039900291711091995, "learning_rate": 4.925404861841301e-06, "loss": 0.0064, "num_input_tokens_seen": 216153552, "step": 100175 }, { "epoch": 16.34257748776509, "grad_norm": 0.09682494401931763, "learning_rate": 4.923283903541509e-06, "loss": 0.0081, "num_input_tokens_seen": 216164848, "step": 100180 }, { "epoch": 16.343393148450243, "grad_norm": 0.0779142901301384, "learning_rate": 4.921163352121841e-06, "loss": 0.0288, "num_input_tokens_seen": 216176400, "step": 100185 }, { "epoch": 16.3442088091354, "grad_norm": 1.2985788583755493, "learning_rate": 4.919043207625273e-06, "loss": 0.1699, "num_input_tokens_seen": 216186896, "step": 100190 }, { "epoch": 16.345024469820554, "grad_norm": 1.5840667486190796, "learning_rate": 4.916923470094767e-06, "loss": 0.0267, "num_input_tokens_seen": 216198160, "step": 100195 }, { "epoch": 16.34584013050571, "grad_norm": 1.3595082759857178, "learning_rate": 4.914804139573284e-06, "loss": 0.0394, "num_input_tokens_seen": 216208784, "step": 100200 }, { "epoch": 16.346655791190866, "grad_norm": 0.10477151721715927, "learning_rate": 4.912685216103777e-06, "loss": 0.0348, "num_input_tokens_seen": 216220592, "step": 100205 }, { "epoch": 16.347471451876018, "grad_norm": 0.09496551752090454, "learning_rate": 4.910566699729186e-06, "loss": 0.0757, "num_input_tokens_seen": 216231696, "step": 100210 }, { "epoch": 16.348287112561174, "grad_norm": 0.048504799604415894, "learning_rate": 4.908448590492445e-06, "loss": 0.0314, "num_input_tokens_seen": 216242928, "step": 100215 }, { "epoch": 16.34910277324633, "grad_norm": 2.419518232345581, "learning_rate": 4.9063308884364805e-06, "loss": 0.1367, "num_input_tokens_seen": 216253360, "step": 100220 }, { "epoch": 16.349918433931485, "grad_norm": 0.6502422094345093, "learning_rate": 4.904213593604212e-06, "loss": 0.0439, "num_input_tokens_seen": 216263536, "step": 100225 }, { "epoch": 16.35073409461664, "grad_norm": 1.9956448078155518, "learning_rate": 4.90209670603855e-06, "loss": 0.0921, "num_input_tokens_seen": 216274832, "step": 100230 }, { "epoch": 16.351549755301793, "grad_norm": 0.49698299169540405, "learning_rate": 4.899980225782394e-06, "loss": 0.0364, "num_input_tokens_seen": 216285936, "step": 100235 }, { "epoch": 16.35236541598695, "grad_norm": 1.4904234409332275, "learning_rate": 4.897864152878634e-06, "loss": 0.0442, "num_input_tokens_seen": 216296336, "step": 100240 }, { "epoch": 16.353181076672104, "grad_norm": 0.07672827690839767, "learning_rate": 4.895748487370161e-06, "loss": 0.2152, "num_input_tokens_seen": 216307472, "step": 100245 }, { "epoch": 16.35399673735726, "grad_norm": 1.5788270235061646, "learning_rate": 4.893633229299849e-06, "loss": 0.1322, "num_input_tokens_seen": 216317456, "step": 100250 }, { "epoch": 16.354812398042416, "grad_norm": 0.07505176216363907, "learning_rate": 4.891518378710566e-06, "loss": 0.0413, "num_input_tokens_seen": 216326992, "step": 100255 }, { "epoch": 16.355628058727568, "grad_norm": 2.0977845191955566, "learning_rate": 4.889403935645173e-06, "loss": 0.1597, "num_input_tokens_seen": 216338064, "step": 100260 }, { "epoch": 16.356443719412724, "grad_norm": 0.04874701425433159, "learning_rate": 4.887289900146513e-06, "loss": 0.1325, "num_input_tokens_seen": 216349232, "step": 100265 }, { "epoch": 16.35725938009788, "grad_norm": 1.8666828870773315, "learning_rate": 4.885176272257452e-06, "loss": 0.1115, "num_input_tokens_seen": 216360400, "step": 100270 }, { "epoch": 16.358075040783035, "grad_norm": 2.0915966033935547, "learning_rate": 4.883063052020801e-06, "loss": 0.1802, "num_input_tokens_seen": 216372144, "step": 100275 }, { "epoch": 16.35889070146819, "grad_norm": 0.06791575253009796, "learning_rate": 4.880950239479406e-06, "loss": 0.0624, "num_input_tokens_seen": 216382480, "step": 100280 }, { "epoch": 16.359706362153343, "grad_norm": 0.11578693985939026, "learning_rate": 4.878837834676067e-06, "loss": 0.071, "num_input_tokens_seen": 216392176, "step": 100285 }, { "epoch": 16.3605220228385, "grad_norm": 0.0543486662209034, "learning_rate": 4.876725837653617e-06, "loss": 0.0396, "num_input_tokens_seen": 216403184, "step": 100290 }, { "epoch": 16.361337683523654, "grad_norm": 0.6067067980766296, "learning_rate": 4.874614248454834e-06, "loss": 0.0727, "num_input_tokens_seen": 216414288, "step": 100295 }, { "epoch": 16.36215334420881, "grad_norm": 1.6839896440505981, "learning_rate": 4.872503067122536e-06, "loss": 0.3417, "num_input_tokens_seen": 216425520, "step": 100300 }, { "epoch": 16.362969004893966, "grad_norm": 0.30554062128067017, "learning_rate": 4.870392293699483e-06, "loss": 0.1318, "num_input_tokens_seen": 216436272, "step": 100305 }, { "epoch": 16.363784665579118, "grad_norm": 1.3557900190353394, "learning_rate": 4.86828192822848e-06, "loss": 0.2161, "num_input_tokens_seen": 216447632, "step": 100310 }, { "epoch": 16.364600326264274, "grad_norm": 0.3814469277858734, "learning_rate": 4.86617197075227e-06, "loss": 0.1044, "num_input_tokens_seen": 216458320, "step": 100315 }, { "epoch": 16.36541598694943, "grad_norm": 0.06980358064174652, "learning_rate": 4.864062421313639e-06, "loss": 0.0208, "num_input_tokens_seen": 216468720, "step": 100320 }, { "epoch": 16.366231647634585, "grad_norm": 0.7896131277084351, "learning_rate": 4.861953279955317e-06, "loss": 0.0813, "num_input_tokens_seen": 216478864, "step": 100325 }, { "epoch": 16.36704730831974, "grad_norm": 0.23680587112903595, "learning_rate": 4.859844546720063e-06, "loss": 0.1182, "num_input_tokens_seen": 216490192, "step": 100330 }, { "epoch": 16.367862969004893, "grad_norm": 1.02437162399292, "learning_rate": 4.857736221650608e-06, "loss": 0.1912, "num_input_tokens_seen": 216500368, "step": 100335 }, { "epoch": 16.36867862969005, "grad_norm": 0.10411255061626434, "learning_rate": 4.855628304789684e-06, "loss": 0.0495, "num_input_tokens_seen": 216511152, "step": 100340 }, { "epoch": 16.369494290375204, "grad_norm": 0.01394729409366846, "learning_rate": 4.853520796180003e-06, "loss": 0.0124, "num_input_tokens_seen": 216521520, "step": 100345 }, { "epoch": 16.37030995106036, "grad_norm": 0.7095744013786316, "learning_rate": 4.851413695864282e-06, "loss": 0.0412, "num_input_tokens_seen": 216532464, "step": 100350 }, { "epoch": 16.371125611745512, "grad_norm": 0.1770738810300827, "learning_rate": 4.849307003885225e-06, "loss": 0.108, "num_input_tokens_seen": 216544208, "step": 100355 }, { "epoch": 16.371941272430668, "grad_norm": 0.586210310459137, "learning_rate": 4.847200720285522e-06, "loss": 0.0546, "num_input_tokens_seen": 216554992, "step": 100360 }, { "epoch": 16.372756933115824, "grad_norm": 0.07024353742599487, "learning_rate": 4.845094845107864e-06, "loss": 0.0208, "num_input_tokens_seen": 216566096, "step": 100365 }, { "epoch": 16.37357259380098, "grad_norm": 0.061725955456495285, "learning_rate": 4.842989378394927e-06, "loss": 0.0232, "num_input_tokens_seen": 216578096, "step": 100370 }, { "epoch": 16.374388254486135, "grad_norm": 1.605246901512146, "learning_rate": 4.840884320189379e-06, "loss": 0.1613, "num_input_tokens_seen": 216589040, "step": 100375 }, { "epoch": 16.375203915171287, "grad_norm": 0.4378974437713623, "learning_rate": 4.83877967053388e-06, "loss": 0.0255, "num_input_tokens_seen": 216600656, "step": 100380 }, { "epoch": 16.376019575856443, "grad_norm": 0.074341781437397, "learning_rate": 4.836675429471099e-06, "loss": 0.1019, "num_input_tokens_seen": 216610384, "step": 100385 }, { "epoch": 16.3768352365416, "grad_norm": 0.15223094820976257, "learning_rate": 4.834571597043657e-06, "loss": 0.0591, "num_input_tokens_seen": 216620720, "step": 100390 }, { "epoch": 16.377650897226754, "grad_norm": 0.19553928077220917, "learning_rate": 4.8324681732942164e-06, "loss": 0.037, "num_input_tokens_seen": 216632112, "step": 100395 }, { "epoch": 16.37846655791191, "grad_norm": 1.1899210214614868, "learning_rate": 4.830365158265379e-06, "loss": 0.0961, "num_input_tokens_seen": 216642864, "step": 100400 }, { "epoch": 16.379282218597062, "grad_norm": 0.04813719168305397, "learning_rate": 4.8282625519997905e-06, "loss": 0.0115, "num_input_tokens_seen": 216654320, "step": 100405 }, { "epoch": 16.380097879282218, "grad_norm": 0.42553701996803284, "learning_rate": 4.826160354540041e-06, "loss": 0.0179, "num_input_tokens_seen": 216663728, "step": 100410 }, { "epoch": 16.380913539967374, "grad_norm": 0.6470410227775574, "learning_rate": 4.824058565928754e-06, "loss": 0.0708, "num_input_tokens_seen": 216675280, "step": 100415 }, { "epoch": 16.38172920065253, "grad_norm": 2.4806830883026123, "learning_rate": 4.821957186208506e-06, "loss": 0.0484, "num_input_tokens_seen": 216686352, "step": 100420 }, { "epoch": 16.382544861337685, "grad_norm": 0.034324128180742264, "learning_rate": 4.819856215421903e-06, "loss": 0.0204, "num_input_tokens_seen": 216697712, "step": 100425 }, { "epoch": 16.383360522022837, "grad_norm": 0.8189162611961365, "learning_rate": 4.8177556536115025e-06, "loss": 0.1099, "num_input_tokens_seen": 216708784, "step": 100430 }, { "epoch": 16.384176182707993, "grad_norm": 0.6673800349235535, "learning_rate": 4.8156555008198985e-06, "loss": 0.0451, "num_input_tokens_seen": 216718864, "step": 100435 }, { "epoch": 16.38499184339315, "grad_norm": 1.314776062965393, "learning_rate": 4.81355575708963e-06, "loss": 0.2757, "num_input_tokens_seen": 216729040, "step": 100440 }, { "epoch": 16.385807504078304, "grad_norm": 0.06964704394340515, "learning_rate": 4.8114564224632755e-06, "loss": 0.0163, "num_input_tokens_seen": 216738512, "step": 100445 }, { "epoch": 16.38662316476346, "grad_norm": 0.06309027969837189, "learning_rate": 4.809357496983352e-06, "loss": 0.0958, "num_input_tokens_seen": 216749744, "step": 100450 }, { "epoch": 16.387438825448612, "grad_norm": 0.36244288086891174, "learning_rate": 4.807258980692428e-06, "loss": 0.0233, "num_input_tokens_seen": 216761200, "step": 100455 }, { "epoch": 16.388254486133768, "grad_norm": 0.20066222548484802, "learning_rate": 4.805160873633002e-06, "loss": 0.0073, "num_input_tokens_seen": 216772016, "step": 100460 }, { "epoch": 16.389070146818923, "grad_norm": 0.027823710814118385, "learning_rate": 4.803063175847625e-06, "loss": 0.1156, "num_input_tokens_seen": 216782928, "step": 100465 }, { "epoch": 16.38988580750408, "grad_norm": 2.281426191329956, "learning_rate": 4.80096588737878e-06, "loss": 0.1851, "num_input_tokens_seen": 216794896, "step": 100470 }, { "epoch": 16.390701468189235, "grad_norm": 0.30222636461257935, "learning_rate": 4.798869008268994e-06, "loss": 0.0382, "num_input_tokens_seen": 216804592, "step": 100475 }, { "epoch": 16.391517128874387, "grad_norm": 3.4055583477020264, "learning_rate": 4.796772538560754e-06, "loss": 0.4173, "num_input_tokens_seen": 216816176, "step": 100480 }, { "epoch": 16.392332789559543, "grad_norm": 0.509415864944458, "learning_rate": 4.7946764782965475e-06, "loss": 0.0458, "num_input_tokens_seen": 216827152, "step": 100485 }, { "epoch": 16.3931484502447, "grad_norm": 0.12506170570850372, "learning_rate": 4.792580827518853e-06, "loss": 0.0357, "num_input_tokens_seen": 216836624, "step": 100490 }, { "epoch": 16.393964110929854, "grad_norm": 1.1761670112609863, "learning_rate": 4.7904855862701435e-06, "loss": 0.0767, "num_input_tokens_seen": 216846928, "step": 100495 }, { "epoch": 16.39477977161501, "grad_norm": 1.6306703090667725, "learning_rate": 4.788390754592883e-06, "loss": 0.0913, "num_input_tokens_seen": 216856944, "step": 100500 }, { "epoch": 16.395595432300162, "grad_norm": 0.06785766780376434, "learning_rate": 4.786296332529522e-06, "loss": 0.0937, "num_input_tokens_seen": 216868528, "step": 100505 }, { "epoch": 16.396411092985318, "grad_norm": 0.3916351795196533, "learning_rate": 4.78420232012251e-06, "loss": 0.1069, "num_input_tokens_seen": 216878896, "step": 100510 }, { "epoch": 16.397226753670473, "grad_norm": 2.2624475955963135, "learning_rate": 4.782108717414283e-06, "loss": 0.1452, "num_input_tokens_seen": 216890448, "step": 100515 }, { "epoch": 16.39804241435563, "grad_norm": 0.2293052077293396, "learning_rate": 4.780015524447271e-06, "loss": 0.1867, "num_input_tokens_seen": 216900432, "step": 100520 }, { "epoch": 16.39885807504078, "grad_norm": 1.2063711881637573, "learning_rate": 4.77792274126389e-06, "loss": 0.1986, "num_input_tokens_seen": 216910704, "step": 100525 }, { "epoch": 16.399673735725937, "grad_norm": 0.0698050856590271, "learning_rate": 4.77583036790657e-06, "loss": 0.0716, "num_input_tokens_seen": 216921904, "step": 100530 }, { "epoch": 16.400489396411093, "grad_norm": 0.03436290845274925, "learning_rate": 4.773738404417691e-06, "loss": 0.0155, "num_input_tokens_seen": 216932176, "step": 100535 }, { "epoch": 16.40130505709625, "grad_norm": 0.550844669342041, "learning_rate": 4.771646850839675e-06, "loss": 0.0954, "num_input_tokens_seen": 216942992, "step": 100540 }, { "epoch": 16.402120717781404, "grad_norm": 0.03981762006878853, "learning_rate": 4.769555707214884e-06, "loss": 0.116, "num_input_tokens_seen": 216951856, "step": 100545 }, { "epoch": 16.402936378466556, "grad_norm": 0.038722552359104156, "learning_rate": 4.767464973585725e-06, "loss": 0.0389, "num_input_tokens_seen": 216963248, "step": 100550 }, { "epoch": 16.403752039151712, "grad_norm": 0.23943085968494415, "learning_rate": 4.765374649994539e-06, "loss": 0.0149, "num_input_tokens_seen": 216973360, "step": 100555 }, { "epoch": 16.404567699836868, "grad_norm": 0.6176033020019531, "learning_rate": 4.763284736483722e-06, "loss": 0.1874, "num_input_tokens_seen": 216984880, "step": 100560 }, { "epoch": 16.405383360522023, "grad_norm": 0.02626902051270008, "learning_rate": 4.761195233095597e-06, "loss": 0.0992, "num_input_tokens_seen": 216996304, "step": 100565 }, { "epoch": 16.40619902120718, "grad_norm": 1.2756099700927734, "learning_rate": 4.759106139872538e-06, "loss": 0.1668, "num_input_tokens_seen": 217007888, "step": 100570 }, { "epoch": 16.40701468189233, "grad_norm": 0.13480742275714874, "learning_rate": 4.757017456856858e-06, "loss": 0.033, "num_input_tokens_seen": 217018576, "step": 100575 }, { "epoch": 16.407830342577487, "grad_norm": 1.0443730354309082, "learning_rate": 4.754929184090906e-06, "loss": 0.032, "num_input_tokens_seen": 217030128, "step": 100580 }, { "epoch": 16.408646003262643, "grad_norm": 0.19956377148628235, "learning_rate": 4.7528413216169965e-06, "loss": 0.1481, "num_input_tokens_seen": 217041072, "step": 100585 }, { "epoch": 16.4094616639478, "grad_norm": 2.91279935836792, "learning_rate": 4.7507538694774426e-06, "loss": 0.2183, "num_input_tokens_seen": 217052912, "step": 100590 }, { "epoch": 16.410277324632954, "grad_norm": 2.6217238903045654, "learning_rate": 4.7486668277145485e-06, "loss": 0.053, "num_input_tokens_seen": 217064912, "step": 100595 }, { "epoch": 16.411092985318106, "grad_norm": 0.01865522563457489, "learning_rate": 4.746580196370614e-06, "loss": 0.0155, "num_input_tokens_seen": 217075536, "step": 100600 }, { "epoch": 16.411908646003262, "grad_norm": 0.1753149777650833, "learning_rate": 4.744493975487924e-06, "loss": 0.0715, "num_input_tokens_seen": 217086960, "step": 100605 }, { "epoch": 16.412724306688418, "grad_norm": 0.30789318680763245, "learning_rate": 4.742408165108761e-06, "loss": 0.0711, "num_input_tokens_seen": 217097680, "step": 100610 }, { "epoch": 16.413539967373573, "grad_norm": 0.6354652047157288, "learning_rate": 4.740322765275393e-06, "loss": 0.0364, "num_input_tokens_seen": 217108976, "step": 100615 }, { "epoch": 16.41435562805873, "grad_norm": 1.217071533203125, "learning_rate": 4.738237776030085e-06, "loss": 0.0845, "num_input_tokens_seen": 217119216, "step": 100620 }, { "epoch": 16.41517128874388, "grad_norm": 2.195192575454712, "learning_rate": 4.7361531974150944e-06, "loss": 0.1826, "num_input_tokens_seen": 217129552, "step": 100625 }, { "epoch": 16.415986949429037, "grad_norm": 0.5299192070960999, "learning_rate": 4.734069029472665e-06, "loss": 0.1194, "num_input_tokens_seen": 217140144, "step": 100630 }, { "epoch": 16.416802610114193, "grad_norm": 0.02250138856470585, "learning_rate": 4.7319852722450345e-06, "loss": 0.1841, "num_input_tokens_seen": 217150576, "step": 100635 }, { "epoch": 16.41761827079935, "grad_norm": 0.08260875940322876, "learning_rate": 4.729901925774438e-06, "loss": 0.0492, "num_input_tokens_seen": 217161328, "step": 100640 }, { "epoch": 16.418433931484504, "grad_norm": 0.4365317225456238, "learning_rate": 4.727818990103089e-06, "loss": 0.0601, "num_input_tokens_seen": 217171952, "step": 100645 }, { "epoch": 16.419249592169656, "grad_norm": 1.0881208181381226, "learning_rate": 4.725736465273209e-06, "loss": 0.0889, "num_input_tokens_seen": 217183216, "step": 100650 }, { "epoch": 16.420065252854812, "grad_norm": 0.37052637338638306, "learning_rate": 4.723654351326997e-06, "loss": 0.1926, "num_input_tokens_seen": 217193808, "step": 100655 }, { "epoch": 16.420880913539968, "grad_norm": 0.9153696894645691, "learning_rate": 4.721572648306652e-06, "loss": 0.0588, "num_input_tokens_seen": 217203760, "step": 100660 }, { "epoch": 16.421696574225123, "grad_norm": 0.16455979645252228, "learning_rate": 4.719491356254363e-06, "loss": 0.1087, "num_input_tokens_seen": 217215120, "step": 100665 }, { "epoch": 16.42251223491028, "grad_norm": 0.25869303941726685, "learning_rate": 4.717410475212303e-06, "loss": 0.0609, "num_input_tokens_seen": 217225776, "step": 100670 }, { "epoch": 16.42332789559543, "grad_norm": 1.0360132455825806, "learning_rate": 4.715330005222662e-06, "loss": 0.2147, "num_input_tokens_seen": 217236848, "step": 100675 }, { "epoch": 16.424143556280587, "grad_norm": 0.4387400150299072, "learning_rate": 4.713249946327578e-06, "loss": 0.0356, "num_input_tokens_seen": 217247408, "step": 100680 }, { "epoch": 16.424959216965743, "grad_norm": 0.09590611606836319, "learning_rate": 4.7111702985692285e-06, "loss": 0.1183, "num_input_tokens_seen": 217258320, "step": 100685 }, { "epoch": 16.4257748776509, "grad_norm": 2.1627190113067627, "learning_rate": 4.709091061989748e-06, "loss": 0.1558, "num_input_tokens_seen": 217270288, "step": 100690 }, { "epoch": 16.42659053833605, "grad_norm": 0.047987405210733414, "learning_rate": 4.707012236631281e-06, "loss": 0.1155, "num_input_tokens_seen": 217281008, "step": 100695 }, { "epoch": 16.427406199021206, "grad_norm": 0.4174147844314575, "learning_rate": 4.7049338225359505e-06, "loss": 0.0662, "num_input_tokens_seen": 217293168, "step": 100700 }, { "epoch": 16.428221859706362, "grad_norm": 0.052798833698034286, "learning_rate": 4.702855819745886e-06, "loss": 0.016, "num_input_tokens_seen": 217303056, "step": 100705 }, { "epoch": 16.429037520391518, "grad_norm": 0.09663664549589157, "learning_rate": 4.700778228303196e-06, "loss": 0.1895, "num_input_tokens_seen": 217313136, "step": 100710 }, { "epoch": 16.429853181076673, "grad_norm": 0.21271748840808868, "learning_rate": 4.698701048249987e-06, "loss": 0.1451, "num_input_tokens_seen": 217324176, "step": 100715 }, { "epoch": 16.430668841761825, "grad_norm": 0.3230688273906708, "learning_rate": 4.696624279628353e-06, "loss": 0.0623, "num_input_tokens_seen": 217333744, "step": 100720 }, { "epoch": 16.43148450244698, "grad_norm": 0.7419354319572449, "learning_rate": 4.694547922480386e-06, "loss": 0.1497, "num_input_tokens_seen": 217344592, "step": 100725 }, { "epoch": 16.432300163132137, "grad_norm": 0.11056597530841827, "learning_rate": 4.692471976848164e-06, "loss": 0.1141, "num_input_tokens_seen": 217355536, "step": 100730 }, { "epoch": 16.433115823817293, "grad_norm": 0.6776586174964905, "learning_rate": 4.690396442773762e-06, "loss": 0.0385, "num_input_tokens_seen": 217366416, "step": 100735 }, { "epoch": 16.43393148450245, "grad_norm": 2.0098717212677, "learning_rate": 4.688321320299238e-06, "loss": 0.1692, "num_input_tokens_seen": 217376400, "step": 100740 }, { "epoch": 16.4347471451876, "grad_norm": 0.06282682716846466, "learning_rate": 4.686246609466652e-06, "loss": 0.0584, "num_input_tokens_seen": 217386960, "step": 100745 }, { "epoch": 16.435562805872756, "grad_norm": 1.1294413805007935, "learning_rate": 4.684172310318047e-06, "loss": 0.0921, "num_input_tokens_seen": 217397872, "step": 100750 }, { "epoch": 16.436378466557912, "grad_norm": 0.13913682103157043, "learning_rate": 4.682098422895462e-06, "loss": 0.1021, "num_input_tokens_seen": 217409264, "step": 100755 }, { "epoch": 16.437194127243067, "grad_norm": 0.05916912108659744, "learning_rate": 4.680024947240927e-06, "loss": 0.0414, "num_input_tokens_seen": 217420560, "step": 100760 }, { "epoch": 16.438009787928223, "grad_norm": 1.6192909479141235, "learning_rate": 4.677951883396464e-06, "loss": 0.2726, "num_input_tokens_seen": 217432208, "step": 100765 }, { "epoch": 16.438825448613375, "grad_norm": 0.1333950161933899, "learning_rate": 4.675879231404087e-06, "loss": 0.0484, "num_input_tokens_seen": 217442800, "step": 100770 }, { "epoch": 16.43964110929853, "grad_norm": 0.6552926301956177, "learning_rate": 4.6738069913058014e-06, "loss": 0.1306, "num_input_tokens_seen": 217453008, "step": 100775 }, { "epoch": 16.440456769983687, "grad_norm": 0.6288781762123108, "learning_rate": 4.671735163143604e-06, "loss": 0.2661, "num_input_tokens_seen": 217463440, "step": 100780 }, { "epoch": 16.441272430668842, "grad_norm": 1.7242294549942017, "learning_rate": 4.66966374695948e-06, "loss": 0.0758, "num_input_tokens_seen": 217473616, "step": 100785 }, { "epoch": 16.442088091353998, "grad_norm": 0.027160756289958954, "learning_rate": 4.667592742795404e-06, "loss": 0.0386, "num_input_tokens_seen": 217484752, "step": 100790 }, { "epoch": 16.44290375203915, "grad_norm": 0.2059267908334732, "learning_rate": 4.665522150693366e-06, "loss": 0.0904, "num_input_tokens_seen": 217495824, "step": 100795 }, { "epoch": 16.443719412724306, "grad_norm": 1.0156782865524292, "learning_rate": 4.663451970695307e-06, "loss": 0.1841, "num_input_tokens_seen": 217506256, "step": 100800 }, { "epoch": 16.44453507340946, "grad_norm": 0.14473262429237366, "learning_rate": 4.6613822028432065e-06, "loss": 0.0331, "num_input_tokens_seen": 217516144, "step": 100805 }, { "epoch": 16.445350734094617, "grad_norm": 0.15210984647274017, "learning_rate": 4.659312847178982e-06, "loss": 0.0153, "num_input_tokens_seen": 217527120, "step": 100810 }, { "epoch": 16.446166394779773, "grad_norm": 1.0578488111495972, "learning_rate": 4.657243903744593e-06, "loss": 0.056, "num_input_tokens_seen": 217538192, "step": 100815 }, { "epoch": 16.446982055464925, "grad_norm": 0.03085070475935936, "learning_rate": 4.655175372581966e-06, "loss": 0.0133, "num_input_tokens_seen": 217548400, "step": 100820 }, { "epoch": 16.44779771615008, "grad_norm": 1.5622320175170898, "learning_rate": 4.653107253733016e-06, "loss": 0.0917, "num_input_tokens_seen": 217559216, "step": 100825 }, { "epoch": 16.448613376835237, "grad_norm": 0.28698721528053284, "learning_rate": 4.651039547239661e-06, "loss": 0.0045, "num_input_tokens_seen": 217570768, "step": 100830 }, { "epoch": 16.449429037520392, "grad_norm": 0.9974372982978821, "learning_rate": 4.648972253143805e-06, "loss": 0.1606, "num_input_tokens_seen": 217581200, "step": 100835 }, { "epoch": 16.450244698205548, "grad_norm": 0.14727744460105896, "learning_rate": 4.646905371487339e-06, "loss": 0.1218, "num_input_tokens_seen": 217591728, "step": 100840 }, { "epoch": 16.4510603588907, "grad_norm": 0.28290021419525146, "learning_rate": 4.6448389023121594e-06, "loss": 0.0922, "num_input_tokens_seen": 217602704, "step": 100845 }, { "epoch": 16.451876019575856, "grad_norm": 0.17246249318122864, "learning_rate": 4.64277284566014e-06, "loss": 0.0167, "num_input_tokens_seen": 217614352, "step": 100850 }, { "epoch": 16.45269168026101, "grad_norm": 0.23983807861804962, "learning_rate": 4.6407072015731555e-06, "loss": 0.008, "num_input_tokens_seen": 217625520, "step": 100855 }, { "epoch": 16.453507340946167, "grad_norm": 0.7572500705718994, "learning_rate": 4.638641970093066e-06, "loss": 0.0297, "num_input_tokens_seen": 217635600, "step": 100860 }, { "epoch": 16.454323001631323, "grad_norm": 2.705300807952881, "learning_rate": 4.6365771512617285e-06, "loss": 0.2687, "num_input_tokens_seen": 217646928, "step": 100865 }, { "epoch": 16.455138662316475, "grad_norm": 0.264694482088089, "learning_rate": 4.634512745120986e-06, "loss": 0.0547, "num_input_tokens_seen": 217657680, "step": 100870 }, { "epoch": 16.45595432300163, "grad_norm": 0.029952341690659523, "learning_rate": 4.6324487517126785e-06, "loss": 0.0166, "num_input_tokens_seen": 217669360, "step": 100875 }, { "epoch": 16.456769983686787, "grad_norm": 0.13789787888526917, "learning_rate": 4.630385171078635e-06, "loss": 0.0076, "num_input_tokens_seen": 217680112, "step": 100880 }, { "epoch": 16.457585644371942, "grad_norm": 0.9751308560371399, "learning_rate": 4.628322003260679e-06, "loss": 0.1265, "num_input_tokens_seen": 217689968, "step": 100885 }, { "epoch": 16.458401305057095, "grad_norm": 0.182738795876503, "learning_rate": 4.626259248300618e-06, "loss": 0.1008, "num_input_tokens_seen": 217700208, "step": 100890 }, { "epoch": 16.45921696574225, "grad_norm": 0.14015008509159088, "learning_rate": 4.624196906240261e-06, "loss": 0.0332, "num_input_tokens_seen": 217710192, "step": 100895 }, { "epoch": 16.460032626427406, "grad_norm": 0.14054276049137115, "learning_rate": 4.622134977121403e-06, "loss": 0.0618, "num_input_tokens_seen": 217720592, "step": 100900 }, { "epoch": 16.46084828711256, "grad_norm": 1.056525468826294, "learning_rate": 4.620073460985822e-06, "loss": 0.1266, "num_input_tokens_seen": 217730864, "step": 100905 }, { "epoch": 16.461663947797717, "grad_norm": 0.048126377165317535, "learning_rate": 4.618012357875321e-06, "loss": 0.0217, "num_input_tokens_seen": 217743184, "step": 100910 }, { "epoch": 16.46247960848287, "grad_norm": 2.3476040363311768, "learning_rate": 4.615951667831642e-06, "loss": 0.2574, "num_input_tokens_seen": 217754160, "step": 100915 }, { "epoch": 16.463295269168025, "grad_norm": 0.1061382070183754, "learning_rate": 4.6138913908965745e-06, "loss": 0.0941, "num_input_tokens_seen": 217763920, "step": 100920 }, { "epoch": 16.46411092985318, "grad_norm": 0.06546953320503235, "learning_rate": 4.611831527111846e-06, "loss": 0.1076, "num_input_tokens_seen": 217773968, "step": 100925 }, { "epoch": 16.464926590538337, "grad_norm": 0.7998341917991638, "learning_rate": 4.609772076519231e-06, "loss": 0.063, "num_input_tokens_seen": 217785072, "step": 100930 }, { "epoch": 16.465742251223492, "grad_norm": 0.08212395012378693, "learning_rate": 4.607713039160436e-06, "loss": 0.011, "num_input_tokens_seen": 217795472, "step": 100935 }, { "epoch": 16.466557911908644, "grad_norm": 2.3166301250457764, "learning_rate": 4.605654415077221e-06, "loss": 0.0819, "num_input_tokens_seen": 217806960, "step": 100940 }, { "epoch": 16.4673735725938, "grad_norm": 1.4334626197814941, "learning_rate": 4.603596204311275e-06, "loss": 0.2053, "num_input_tokens_seen": 217816944, "step": 100945 }, { "epoch": 16.468189233278956, "grad_norm": 1.1031794548034668, "learning_rate": 4.601538406904343e-06, "loss": 0.0926, "num_input_tokens_seen": 217828720, "step": 100950 }, { "epoch": 16.46900489396411, "grad_norm": 0.07618214190006256, "learning_rate": 4.599481022898097e-06, "loss": 0.1691, "num_input_tokens_seen": 217838128, "step": 100955 }, { "epoch": 16.469820554649267, "grad_norm": 0.2948734760284424, "learning_rate": 4.597424052334257e-06, "loss": 0.0881, "num_input_tokens_seen": 217849776, "step": 100960 }, { "epoch": 16.47063621533442, "grad_norm": 1.2051607370376587, "learning_rate": 4.595367495254499e-06, "loss": 0.2377, "num_input_tokens_seen": 217860720, "step": 100965 }, { "epoch": 16.471451876019575, "grad_norm": 1.6886378526687622, "learning_rate": 4.593311351700505e-06, "loss": 0.0612, "num_input_tokens_seen": 217871536, "step": 100970 }, { "epoch": 16.47226753670473, "grad_norm": 0.1814006119966507, "learning_rate": 4.591255621713944e-06, "loss": 0.1053, "num_input_tokens_seen": 217883344, "step": 100975 }, { "epoch": 16.473083197389887, "grad_norm": 2.136476993560791, "learning_rate": 4.589200305336478e-06, "loss": 0.1828, "num_input_tokens_seen": 217894832, "step": 100980 }, { "epoch": 16.473898858075042, "grad_norm": 2.9194047451019287, "learning_rate": 4.58714540260976e-06, "loss": 0.2944, "num_input_tokens_seen": 217904592, "step": 100985 }, { "epoch": 16.474714518760194, "grad_norm": 0.021442802622914314, "learning_rate": 4.585090913575438e-06, "loss": 0.0303, "num_input_tokens_seen": 217915376, "step": 100990 }, { "epoch": 16.47553017944535, "grad_norm": 2.063929796218872, "learning_rate": 4.583036838275145e-06, "loss": 0.2203, "num_input_tokens_seen": 217927760, "step": 100995 }, { "epoch": 16.476345840130506, "grad_norm": 0.8516324758529663, "learning_rate": 4.580983176750512e-06, "loss": 0.0364, "num_input_tokens_seen": 217939120, "step": 101000 }, { "epoch": 16.47716150081566, "grad_norm": 1.7077844142913818, "learning_rate": 4.578929929043157e-06, "loss": 0.0556, "num_input_tokens_seen": 217949712, "step": 101005 }, { "epoch": 16.477977161500817, "grad_norm": 0.14925071597099304, "learning_rate": 4.57687709519469e-06, "loss": 0.0485, "num_input_tokens_seen": 217959408, "step": 101010 }, { "epoch": 16.47879282218597, "grad_norm": 0.26924049854278564, "learning_rate": 4.574824675246728e-06, "loss": 0.0566, "num_input_tokens_seen": 217971312, "step": 101015 }, { "epoch": 16.479608482871125, "grad_norm": 0.14529463648796082, "learning_rate": 4.572772669240841e-06, "loss": 0.0882, "num_input_tokens_seen": 217982800, "step": 101020 }, { "epoch": 16.48042414355628, "grad_norm": 0.9192562699317932, "learning_rate": 4.570721077218642e-06, "loss": 0.0274, "num_input_tokens_seen": 217994704, "step": 101025 }, { "epoch": 16.481239804241437, "grad_norm": 0.02517763152718544, "learning_rate": 4.5686698992216865e-06, "loss": 0.0597, "num_input_tokens_seen": 218006032, "step": 101030 }, { "epoch": 16.482055464926592, "grad_norm": 0.12270987778902054, "learning_rate": 4.566619135291567e-06, "loss": 0.0183, "num_input_tokens_seen": 218016880, "step": 101035 }, { "epoch": 16.482871125611744, "grad_norm": 0.40992382168769836, "learning_rate": 4.564568785469816e-06, "loss": 0.2721, "num_input_tokens_seen": 218025776, "step": 101040 }, { "epoch": 16.4836867862969, "grad_norm": 0.10203462839126587, "learning_rate": 4.562518849798019e-06, "loss": 0.1197, "num_input_tokens_seen": 218038224, "step": 101045 }, { "epoch": 16.484502446982056, "grad_norm": 2.2102959156036377, "learning_rate": 4.56046932831769e-06, "loss": 0.1456, "num_input_tokens_seen": 218048752, "step": 101050 }, { "epoch": 16.48531810766721, "grad_norm": 0.8914692401885986, "learning_rate": 4.558420221070395e-06, "loss": 0.2158, "num_input_tokens_seen": 218060272, "step": 101055 }, { "epoch": 16.486133768352367, "grad_norm": 0.8535287976264954, "learning_rate": 4.55637152809763e-06, "loss": 0.111, "num_input_tokens_seen": 218071984, "step": 101060 }, { "epoch": 16.48694942903752, "grad_norm": 0.4598221182823181, "learning_rate": 4.554323249440945e-06, "loss": 0.0188, "num_input_tokens_seen": 218083152, "step": 101065 }, { "epoch": 16.487765089722675, "grad_norm": 2.3817899227142334, "learning_rate": 4.552275385141824e-06, "loss": 0.0773, "num_input_tokens_seen": 218094640, "step": 101070 }, { "epoch": 16.48858075040783, "grad_norm": 0.16935549676418304, "learning_rate": 4.5502279352417935e-06, "loss": 0.2355, "num_input_tokens_seen": 218104752, "step": 101075 }, { "epoch": 16.489396411092986, "grad_norm": 0.03999185189604759, "learning_rate": 4.548180899782326e-06, "loss": 0.0606, "num_input_tokens_seen": 218115120, "step": 101080 }, { "epoch": 16.49021207177814, "grad_norm": 0.3082849383354187, "learning_rate": 4.546134278804928e-06, "loss": 0.0745, "num_input_tokens_seen": 218126448, "step": 101085 }, { "epoch": 16.491027732463294, "grad_norm": 1.6497020721435547, "learning_rate": 4.544088072351052e-06, "loss": 0.1139, "num_input_tokens_seen": 218136912, "step": 101090 }, { "epoch": 16.49184339314845, "grad_norm": 1.9722816944122314, "learning_rate": 4.542042280462197e-06, "loss": 0.081, "num_input_tokens_seen": 218147312, "step": 101095 }, { "epoch": 16.492659053833606, "grad_norm": 2.1587204933166504, "learning_rate": 4.539996903179794e-06, "loss": 0.0928, "num_input_tokens_seen": 218158512, "step": 101100 }, { "epoch": 16.49347471451876, "grad_norm": 1.2205663919448853, "learning_rate": 4.537951940545318e-06, "loss": 0.1141, "num_input_tokens_seen": 218169488, "step": 101105 }, { "epoch": 16.494290375203914, "grad_norm": 0.25018569827079773, "learning_rate": 4.5359073926002e-06, "loss": 0.0772, "num_input_tokens_seen": 218180016, "step": 101110 }, { "epoch": 16.49510603588907, "grad_norm": 0.31917673349380493, "learning_rate": 4.533863259385882e-06, "loss": 0.1325, "num_input_tokens_seen": 218191728, "step": 101115 }, { "epoch": 16.495921696574225, "grad_norm": 1.7798913717269897, "learning_rate": 4.531819540943788e-06, "loss": 0.1578, "num_input_tokens_seen": 218203152, "step": 101120 }, { "epoch": 16.49673735725938, "grad_norm": 0.1697167158126831, "learning_rate": 4.529776237315336e-06, "loss": 0.0489, "num_input_tokens_seen": 218214320, "step": 101125 }, { "epoch": 16.497553017944536, "grad_norm": 0.3805854022502899, "learning_rate": 4.527733348541938e-06, "loss": 0.1173, "num_input_tokens_seen": 218225776, "step": 101130 }, { "epoch": 16.49836867862969, "grad_norm": 0.06241212412714958, "learning_rate": 4.525690874664992e-06, "loss": 0.1647, "num_input_tokens_seen": 218236944, "step": 101135 }, { "epoch": 16.499184339314844, "grad_norm": 0.23950080573558807, "learning_rate": 4.523648815725895e-06, "loss": 0.2355, "num_input_tokens_seen": 218248496, "step": 101140 }, { "epoch": 16.5, "grad_norm": 0.03445478901267052, "learning_rate": 4.521607171766032e-06, "loss": 0.0805, "num_input_tokens_seen": 218259248, "step": 101145 }, { "epoch": 16.500815660685156, "grad_norm": 0.11190203577280045, "learning_rate": 4.519565942826778e-06, "loss": 0.1438, "num_input_tokens_seen": 218269392, "step": 101150 }, { "epoch": 16.50163132137031, "grad_norm": 0.09151195734739304, "learning_rate": 4.517525128949496e-06, "loss": 0.021, "num_input_tokens_seen": 218279504, "step": 101155 }, { "epoch": 16.502446982055464, "grad_norm": 0.20306937396526337, "learning_rate": 4.515484730175562e-06, "loss": 0.029, "num_input_tokens_seen": 218290000, "step": 101160 }, { "epoch": 16.50326264274062, "grad_norm": 0.0406617745757103, "learning_rate": 4.5134447465463025e-06, "loss": 0.1149, "num_input_tokens_seen": 218299888, "step": 101165 }, { "epoch": 16.504078303425775, "grad_norm": 0.17293883860111237, "learning_rate": 4.5114051781030895e-06, "loss": 0.0599, "num_input_tokens_seen": 218310320, "step": 101170 }, { "epoch": 16.50489396411093, "grad_norm": 0.15777763724327087, "learning_rate": 4.50936602488723e-06, "loss": 0.2168, "num_input_tokens_seen": 218321744, "step": 101175 }, { "epoch": 16.505709624796086, "grad_norm": 0.3675493001937866, "learning_rate": 4.507327286940072e-06, "loss": 0.0582, "num_input_tokens_seen": 218334384, "step": 101180 }, { "epoch": 16.50652528548124, "grad_norm": 0.22051936388015747, "learning_rate": 4.505288964302912e-06, "loss": 0.0905, "num_input_tokens_seen": 218345776, "step": 101185 }, { "epoch": 16.507340946166394, "grad_norm": 0.1475389450788498, "learning_rate": 4.503251057017086e-06, "loss": 0.0207, "num_input_tokens_seen": 218357200, "step": 101190 }, { "epoch": 16.50815660685155, "grad_norm": 0.06480530649423599, "learning_rate": 4.501213565123863e-06, "loss": 0.182, "num_input_tokens_seen": 218368208, "step": 101195 }, { "epoch": 16.508972267536706, "grad_norm": 0.052454978227615356, "learning_rate": 4.499176488664564e-06, "loss": 0.0552, "num_input_tokens_seen": 218378992, "step": 101200 }, { "epoch": 16.50978792822186, "grad_norm": 2.715170383453369, "learning_rate": 4.49713982768045e-06, "loss": 0.1811, "num_input_tokens_seen": 218389808, "step": 101205 }, { "epoch": 16.510603588907014, "grad_norm": 0.07683468610048294, "learning_rate": 4.4951035822128205e-06, "loss": 0.3224, "num_input_tokens_seen": 218399696, "step": 101210 }, { "epoch": 16.51141924959217, "grad_norm": 0.07143247127532959, "learning_rate": 4.493067752302915e-06, "loss": 0.0169, "num_input_tokens_seen": 218411984, "step": 101215 }, { "epoch": 16.512234910277325, "grad_norm": 0.9627754092216492, "learning_rate": 4.4910323379920125e-06, "loss": 0.0528, "num_input_tokens_seen": 218421552, "step": 101220 }, { "epoch": 16.51305057096248, "grad_norm": 0.18052664399147034, "learning_rate": 4.488997339321358e-06, "loss": 0.2166, "num_input_tokens_seen": 218433264, "step": 101225 }, { "epoch": 16.513866231647633, "grad_norm": 0.0815044566988945, "learning_rate": 4.4869627563321915e-06, "loss": 0.0613, "num_input_tokens_seen": 218443472, "step": 101230 }, { "epoch": 16.51468189233279, "grad_norm": 0.038821954280138016, "learning_rate": 4.484928589065749e-06, "loss": 0.1053, "num_input_tokens_seen": 218454000, "step": 101235 }, { "epoch": 16.515497553017944, "grad_norm": 2.0846242904663086, "learning_rate": 4.48289483756325e-06, "loss": 0.2461, "num_input_tokens_seen": 218464144, "step": 101240 }, { "epoch": 16.5163132137031, "grad_norm": 0.7480098009109497, "learning_rate": 4.480861501865918e-06, "loss": 0.063, "num_input_tokens_seen": 218474864, "step": 101245 }, { "epoch": 16.517128874388256, "grad_norm": 0.16306544840335846, "learning_rate": 4.47882858201496e-06, "loss": 0.0196, "num_input_tokens_seen": 218484912, "step": 101250 }, { "epoch": 16.517944535073408, "grad_norm": 0.4182378351688385, "learning_rate": 4.47679607805157e-06, "loss": 0.0382, "num_input_tokens_seen": 218496432, "step": 101255 }, { "epoch": 16.518760195758563, "grad_norm": 2.2653985023498535, "learning_rate": 4.474763990016945e-06, "loss": 0.2262, "num_input_tokens_seen": 218507408, "step": 101260 }, { "epoch": 16.51957585644372, "grad_norm": 0.15589028596878052, "learning_rate": 4.472732317952266e-06, "loss": 0.028, "num_input_tokens_seen": 218517712, "step": 101265 }, { "epoch": 16.520391517128875, "grad_norm": 1.4685633182525635, "learning_rate": 4.470701061898705e-06, "loss": 0.2272, "num_input_tokens_seen": 218528880, "step": 101270 }, { "epoch": 16.52120717781403, "grad_norm": 2.0560970306396484, "learning_rate": 4.4686702218974335e-06, "loss": 0.175, "num_input_tokens_seen": 218540368, "step": 101275 }, { "epoch": 16.522022838499183, "grad_norm": 0.14372766017913818, "learning_rate": 4.466639797989602e-06, "loss": 0.1713, "num_input_tokens_seen": 218550448, "step": 101280 }, { "epoch": 16.52283849918434, "grad_norm": 0.27810806035995483, "learning_rate": 4.4646097902163675e-06, "loss": 0.0281, "num_input_tokens_seen": 218561200, "step": 101285 }, { "epoch": 16.523654159869494, "grad_norm": 0.1378297656774521, "learning_rate": 4.462580198618862e-06, "loss": 0.0137, "num_input_tokens_seen": 218572656, "step": 101290 }, { "epoch": 16.52446982055465, "grad_norm": 0.0716816633939743, "learning_rate": 4.460551023238227e-06, "loss": 0.0406, "num_input_tokens_seen": 218583408, "step": 101295 }, { "epoch": 16.525285481239806, "grad_norm": 0.29537081718444824, "learning_rate": 4.458522264115572e-06, "loss": 0.0278, "num_input_tokens_seen": 218593168, "step": 101300 }, { "epoch": 16.526101141924958, "grad_norm": 0.5306413173675537, "learning_rate": 4.456493921292037e-06, "loss": 0.0682, "num_input_tokens_seen": 218604976, "step": 101305 }, { "epoch": 16.526916802610113, "grad_norm": 2.7094850540161133, "learning_rate": 4.454465994808702e-06, "loss": 0.1308, "num_input_tokens_seen": 218615856, "step": 101310 }, { "epoch": 16.52773246329527, "grad_norm": 1.6551729440689087, "learning_rate": 4.4524384847066866e-06, "loss": 0.1912, "num_input_tokens_seen": 218626352, "step": 101315 }, { "epoch": 16.528548123980425, "grad_norm": 0.5688623785972595, "learning_rate": 4.450411391027062e-06, "loss": 0.0768, "num_input_tokens_seen": 218637456, "step": 101320 }, { "epoch": 16.52936378466558, "grad_norm": 1.667824387550354, "learning_rate": 4.448384713810924e-06, "loss": 0.1212, "num_input_tokens_seen": 218648080, "step": 101325 }, { "epoch": 16.530179445350733, "grad_norm": 2.338230848312378, "learning_rate": 4.446358453099342e-06, "loss": 0.1341, "num_input_tokens_seen": 218658544, "step": 101330 }, { "epoch": 16.53099510603589, "grad_norm": 0.08617769926786423, "learning_rate": 4.44433260893338e-06, "loss": 0.0439, "num_input_tokens_seen": 218668656, "step": 101335 }, { "epoch": 16.531810766721044, "grad_norm": 1.112331509590149, "learning_rate": 4.442307181354094e-06, "loss": 0.1048, "num_input_tokens_seen": 218679408, "step": 101340 }, { "epoch": 16.5326264274062, "grad_norm": 0.05212220549583435, "learning_rate": 4.440282170402535e-06, "loss": 0.1915, "num_input_tokens_seen": 218689232, "step": 101345 }, { "epoch": 16.533442088091356, "grad_norm": 1.7251912355422974, "learning_rate": 4.438257576119736e-06, "loss": 0.2204, "num_input_tokens_seen": 218700880, "step": 101350 }, { "epoch": 16.534257748776508, "grad_norm": 0.07718193531036377, "learning_rate": 4.436233398546733e-06, "loss": 0.087, "num_input_tokens_seen": 218712400, "step": 101355 }, { "epoch": 16.535073409461663, "grad_norm": 0.0999438464641571, "learning_rate": 4.4342096377245485e-06, "loss": 0.0347, "num_input_tokens_seen": 218723344, "step": 101360 }, { "epoch": 16.53588907014682, "grad_norm": 0.9096325635910034, "learning_rate": 4.432186293694193e-06, "loss": 0.0443, "num_input_tokens_seen": 218735344, "step": 101365 }, { "epoch": 16.536704730831975, "grad_norm": 2.8153271675109863, "learning_rate": 4.430163366496675e-06, "loss": 0.2309, "num_input_tokens_seen": 218746032, "step": 101370 }, { "epoch": 16.53752039151713, "grad_norm": 0.04758572205901146, "learning_rate": 4.428140856172994e-06, "loss": 0.0451, "num_input_tokens_seen": 218757328, "step": 101375 }, { "epoch": 16.538336052202283, "grad_norm": 1.479414701461792, "learning_rate": 4.426118762764131e-06, "loss": 0.1342, "num_input_tokens_seen": 218768688, "step": 101380 }, { "epoch": 16.53915171288744, "grad_norm": 1.3540678024291992, "learning_rate": 4.424097086311074e-06, "loss": 0.1615, "num_input_tokens_seen": 218779792, "step": 101385 }, { "epoch": 16.539967373572594, "grad_norm": 0.08745783567428589, "learning_rate": 4.422075826854791e-06, "loss": 0.0619, "num_input_tokens_seen": 218790096, "step": 101390 }, { "epoch": 16.54078303425775, "grad_norm": 2.3249478340148926, "learning_rate": 4.420054984436248e-06, "loss": 0.0752, "num_input_tokens_seen": 218801680, "step": 101395 }, { "epoch": 16.541598694942905, "grad_norm": 0.5365981459617615, "learning_rate": 4.418034559096395e-06, "loss": 0.0938, "num_input_tokens_seen": 218812976, "step": 101400 }, { "epoch": 16.542414355628058, "grad_norm": 0.5049877166748047, "learning_rate": 4.4160145508761855e-06, "loss": 0.0465, "num_input_tokens_seen": 218823952, "step": 101405 }, { "epoch": 16.543230016313213, "grad_norm": 0.08286905288696289, "learning_rate": 4.413994959816553e-06, "loss": 0.0218, "num_input_tokens_seen": 218835408, "step": 101410 }, { "epoch": 16.54404567699837, "grad_norm": 0.597724199295044, "learning_rate": 4.4119757859584246e-06, "loss": 0.0179, "num_input_tokens_seen": 218847344, "step": 101415 }, { "epoch": 16.544861337683525, "grad_norm": 0.3752771317958832, "learning_rate": 4.409957029342729e-06, "loss": 0.0266, "num_input_tokens_seen": 218857840, "step": 101420 }, { "epoch": 16.545676998368677, "grad_norm": 0.41385313868522644, "learning_rate": 4.407938690010371e-06, "loss": 0.0318, "num_input_tokens_seen": 218868336, "step": 101425 }, { "epoch": 16.546492659053833, "grad_norm": 1.6945936679840088, "learning_rate": 4.405920768002253e-06, "loss": 0.1252, "num_input_tokens_seen": 218876976, "step": 101430 }, { "epoch": 16.54730831973899, "grad_norm": 0.9947482347488403, "learning_rate": 4.4039032633592825e-06, "loss": 0.0176, "num_input_tokens_seen": 218887888, "step": 101435 }, { "epoch": 16.548123980424144, "grad_norm": 0.16959159076213837, "learning_rate": 4.401886176122341e-06, "loss": 0.0357, "num_input_tokens_seen": 218898896, "step": 101440 }, { "epoch": 16.5489396411093, "grad_norm": 0.3965626657009125, "learning_rate": 4.399869506332307e-06, "loss": 0.1631, "num_input_tokens_seen": 218910480, "step": 101445 }, { "epoch": 16.549755301794452, "grad_norm": 0.8801009654998779, "learning_rate": 4.397853254030052e-06, "loss": 0.0589, "num_input_tokens_seen": 218921392, "step": 101450 }, { "epoch": 16.550570962479608, "grad_norm": 0.8470849394798279, "learning_rate": 4.395837419256432e-06, "loss": 0.1471, "num_input_tokens_seen": 218932368, "step": 101455 }, { "epoch": 16.551386623164763, "grad_norm": 0.11726665496826172, "learning_rate": 4.393822002052309e-06, "loss": 0.0469, "num_input_tokens_seen": 218942384, "step": 101460 }, { "epoch": 16.55220228384992, "grad_norm": 0.12531431019306183, "learning_rate": 4.39180700245852e-06, "loss": 0.0868, "num_input_tokens_seen": 218952752, "step": 101465 }, { "epoch": 16.553017944535075, "grad_norm": 0.2523931860923767, "learning_rate": 4.389792420515909e-06, "loss": 0.2101, "num_input_tokens_seen": 218963696, "step": 101470 }, { "epoch": 16.553833605220227, "grad_norm": 0.17993776500225067, "learning_rate": 4.387778256265299e-06, "loss": 0.024, "num_input_tokens_seen": 218974480, "step": 101475 }, { "epoch": 16.554649265905383, "grad_norm": 0.058129992336034775, "learning_rate": 4.385764509747511e-06, "loss": 0.159, "num_input_tokens_seen": 218984624, "step": 101480 }, { "epoch": 16.55546492659054, "grad_norm": 2.4727072715759277, "learning_rate": 4.383751181003357e-06, "loss": 0.225, "num_input_tokens_seen": 218993424, "step": 101485 }, { "epoch": 16.556280587275694, "grad_norm": 2.605989456176758, "learning_rate": 4.381738270073638e-06, "loss": 0.2181, "num_input_tokens_seen": 219004368, "step": 101490 }, { "epoch": 16.55709624796085, "grad_norm": 0.6736527681350708, "learning_rate": 4.379725776999149e-06, "loss": 0.0449, "num_input_tokens_seen": 219015280, "step": 101495 }, { "epoch": 16.557911908646002, "grad_norm": 0.09571456909179688, "learning_rate": 4.377713701820677e-06, "loss": 0.0934, "num_input_tokens_seen": 219024752, "step": 101500 }, { "epoch": 16.558727569331158, "grad_norm": 2.3119544982910156, "learning_rate": 4.375702044578997e-06, "loss": 0.2643, "num_input_tokens_seen": 219034544, "step": 101505 }, { "epoch": 16.559543230016313, "grad_norm": 0.044650837779045105, "learning_rate": 4.373690805314881e-06, "loss": 0.0975, "num_input_tokens_seen": 219045840, "step": 101510 }, { "epoch": 16.56035889070147, "grad_norm": 0.044083550572395325, "learning_rate": 4.371679984069085e-06, "loss": 0.1397, "num_input_tokens_seen": 219055760, "step": 101515 }, { "epoch": 16.561174551386625, "grad_norm": 2.097306489944458, "learning_rate": 4.369669580882363e-06, "loss": 0.1313, "num_input_tokens_seen": 219065840, "step": 101520 }, { "epoch": 16.561990212071777, "grad_norm": 0.055870234966278076, "learning_rate": 4.3676595957954594e-06, "loss": 0.1172, "num_input_tokens_seen": 219076176, "step": 101525 }, { "epoch": 16.562805872756933, "grad_norm": 0.8353304266929626, "learning_rate": 4.365650028849108e-06, "loss": 0.0942, "num_input_tokens_seen": 219086800, "step": 101530 }, { "epoch": 16.563621533442088, "grad_norm": 0.07080678641796112, "learning_rate": 4.3636408800840275e-06, "loss": 0.0482, "num_input_tokens_seen": 219096912, "step": 101535 }, { "epoch": 16.564437194127244, "grad_norm": 0.045225538313388824, "learning_rate": 4.361632149540956e-06, "loss": 0.1358, "num_input_tokens_seen": 219109040, "step": 101540 }, { "epoch": 16.5652528548124, "grad_norm": 0.05629301443696022, "learning_rate": 4.359623837260579e-06, "loss": 0.2752, "num_input_tokens_seen": 219120432, "step": 101545 }, { "epoch": 16.56606851549755, "grad_norm": 1.1634718179702759, "learning_rate": 4.357615943283624e-06, "loss": 0.0306, "num_input_tokens_seen": 219130928, "step": 101550 }, { "epoch": 16.566884176182707, "grad_norm": 0.03154103085398674, "learning_rate": 4.355608467650754e-06, "loss": 0.0209, "num_input_tokens_seen": 219139600, "step": 101555 }, { "epoch": 16.567699836867863, "grad_norm": 2.8270175457000732, "learning_rate": 4.35360141040268e-06, "loss": 0.1404, "num_input_tokens_seen": 219151280, "step": 101560 }, { "epoch": 16.56851549755302, "grad_norm": 0.07998274266719818, "learning_rate": 4.351594771580053e-06, "loss": 0.0952, "num_input_tokens_seen": 219162096, "step": 101565 }, { "epoch": 16.569331158238175, "grad_norm": 1.3047778606414795, "learning_rate": 4.349588551223563e-06, "loss": 0.176, "num_input_tokens_seen": 219172432, "step": 101570 }, { "epoch": 16.570146818923327, "grad_norm": 2.641900062561035, "learning_rate": 4.3475827493738495e-06, "loss": 0.3045, "num_input_tokens_seen": 219184400, "step": 101575 }, { "epoch": 16.570962479608482, "grad_norm": 0.0343494787812233, "learning_rate": 4.345577366071574e-06, "loss": 0.117, "num_input_tokens_seen": 219195408, "step": 101580 }, { "epoch": 16.571778140293638, "grad_norm": 1.768430233001709, "learning_rate": 4.343572401357376e-06, "loss": 0.1704, "num_input_tokens_seen": 219205712, "step": 101585 }, { "epoch": 16.572593800978794, "grad_norm": 0.41830694675445557, "learning_rate": 4.341567855271888e-06, "loss": 0.0216, "num_input_tokens_seen": 219216688, "step": 101590 }, { "epoch": 16.57340946166395, "grad_norm": 0.4112313687801361, "learning_rate": 4.339563727855733e-06, "loss": 0.072, "num_input_tokens_seen": 219227408, "step": 101595 }, { "epoch": 16.5742251223491, "grad_norm": 0.0409059002995491, "learning_rate": 4.337560019149531e-06, "loss": 0.042, "num_input_tokens_seen": 219237808, "step": 101600 }, { "epoch": 16.575040783034257, "grad_norm": 4.274208068847656, "learning_rate": 4.335556729193887e-06, "loss": 0.0285, "num_input_tokens_seen": 219249584, "step": 101605 }, { "epoch": 16.575856443719413, "grad_norm": 0.681331217288971, "learning_rate": 4.333553858029399e-06, "loss": 0.069, "num_input_tokens_seen": 219261520, "step": 101610 }, { "epoch": 16.57667210440457, "grad_norm": 1.3004517555236816, "learning_rate": 4.33155140569666e-06, "loss": 0.0747, "num_input_tokens_seen": 219272656, "step": 101615 }, { "epoch": 16.57748776508972, "grad_norm": 0.04470912739634514, "learning_rate": 4.3295493722362525e-06, "loss": 0.1438, "num_input_tokens_seen": 219284016, "step": 101620 }, { "epoch": 16.578303425774877, "grad_norm": 3.775405168533325, "learning_rate": 4.327547757688749e-06, "loss": 0.0861, "num_input_tokens_seen": 219294480, "step": 101625 }, { "epoch": 16.579119086460032, "grad_norm": 0.45946967601776123, "learning_rate": 4.325546562094715e-06, "loss": 0.1052, "num_input_tokens_seen": 219305136, "step": 101630 }, { "epoch": 16.579934747145188, "grad_norm": 2.5409164428710938, "learning_rate": 4.323545785494707e-06, "loss": 0.0714, "num_input_tokens_seen": 219316464, "step": 101635 }, { "epoch": 16.580750407830344, "grad_norm": 0.14718292653560638, "learning_rate": 4.3215454279292745e-06, "loss": 0.0153, "num_input_tokens_seen": 219326672, "step": 101640 }, { "epoch": 16.581566068515496, "grad_norm": 1.6950949430465698, "learning_rate": 4.319545489438956e-06, "loss": 0.1341, "num_input_tokens_seen": 219336368, "step": 101645 }, { "epoch": 16.58238172920065, "grad_norm": 0.7952552437782288, "learning_rate": 4.317545970064274e-06, "loss": 0.0402, "num_input_tokens_seen": 219347600, "step": 101650 }, { "epoch": 16.583197389885807, "grad_norm": 0.19988955557346344, "learning_rate": 4.315546869845777e-06, "loss": 0.2391, "num_input_tokens_seen": 219357488, "step": 101655 }, { "epoch": 16.584013050570963, "grad_norm": 0.04544821381568909, "learning_rate": 4.313548188823949e-06, "loss": 0.0401, "num_input_tokens_seen": 219368368, "step": 101660 }, { "epoch": 16.58482871125612, "grad_norm": 0.10121334344148636, "learning_rate": 4.3115499270393215e-06, "loss": 0.0071, "num_input_tokens_seen": 219379856, "step": 101665 }, { "epoch": 16.58564437194127, "grad_norm": 0.035344984382390976, "learning_rate": 4.3095520845323674e-06, "loss": 0.0327, "num_input_tokens_seen": 219389904, "step": 101670 }, { "epoch": 16.586460032626427, "grad_norm": 0.08741266280412674, "learning_rate": 4.307554661343602e-06, "loss": 0.0094, "num_input_tokens_seen": 219400656, "step": 101675 }, { "epoch": 16.587275693311582, "grad_norm": 0.03759506717324257, "learning_rate": 4.3055576575134755e-06, "loss": 0.1239, "num_input_tokens_seen": 219410736, "step": 101680 }, { "epoch": 16.588091353996738, "grad_norm": 1.5919599533081055, "learning_rate": 4.303561073082493e-06, "loss": 0.0862, "num_input_tokens_seen": 219423632, "step": 101685 }, { "epoch": 16.588907014681894, "grad_norm": 2.0700104236602783, "learning_rate": 4.3015649080910855e-06, "loss": 0.2112, "num_input_tokens_seen": 219434800, "step": 101690 }, { "epoch": 16.589722675367046, "grad_norm": 0.35635796189308167, "learning_rate": 4.2995691625797365e-06, "loss": 0.008, "num_input_tokens_seen": 219445456, "step": 101695 }, { "epoch": 16.5905383360522, "grad_norm": 0.12639054656028748, "learning_rate": 4.297573836588864e-06, "loss": 0.0905, "num_input_tokens_seen": 219456400, "step": 101700 }, { "epoch": 16.591353996737357, "grad_norm": 0.13128019869327545, "learning_rate": 4.295578930158936e-06, "loss": 0.0393, "num_input_tokens_seen": 219467472, "step": 101705 }, { "epoch": 16.592169657422513, "grad_norm": 0.3443795144557953, "learning_rate": 4.293584443330353e-06, "loss": 0.0196, "num_input_tokens_seen": 219478160, "step": 101710 }, { "epoch": 16.59298531810767, "grad_norm": 1.5890674591064453, "learning_rate": 4.2915903761435614e-06, "loss": 0.1168, "num_input_tokens_seen": 219487664, "step": 101715 }, { "epoch": 16.59380097879282, "grad_norm": 0.1471521407365799, "learning_rate": 4.289596728638948e-06, "loss": 0.0211, "num_input_tokens_seen": 219498192, "step": 101720 }, { "epoch": 16.594616639477977, "grad_norm": 1.6187289953231812, "learning_rate": 4.287603500856938e-06, "loss": 0.1745, "num_input_tokens_seen": 219508400, "step": 101725 }, { "epoch": 16.595432300163132, "grad_norm": 0.07846420258283615, "learning_rate": 4.285610692837916e-06, "loss": 0.0363, "num_input_tokens_seen": 219518896, "step": 101730 }, { "epoch": 16.596247960848288, "grad_norm": 1.0276563167572021, "learning_rate": 4.283618304622272e-06, "loss": 0.1376, "num_input_tokens_seen": 219530128, "step": 101735 }, { "epoch": 16.597063621533444, "grad_norm": 0.05013462156057358, "learning_rate": 4.281626336250386e-06, "loss": 0.0774, "num_input_tokens_seen": 219541168, "step": 101740 }, { "epoch": 16.597879282218596, "grad_norm": 0.03961673751473427, "learning_rate": 4.279634787762623e-06, "loss": 0.0396, "num_input_tokens_seen": 219552656, "step": 101745 }, { "epoch": 16.59869494290375, "grad_norm": 1.8561768531799316, "learning_rate": 4.277643659199349e-06, "loss": 0.0703, "num_input_tokens_seen": 219563248, "step": 101750 }, { "epoch": 16.599510603588907, "grad_norm": 0.06638456135988235, "learning_rate": 4.275652950600912e-06, "loss": 0.0719, "num_input_tokens_seen": 219573840, "step": 101755 }, { "epoch": 16.600326264274063, "grad_norm": 2.259007453918457, "learning_rate": 4.27366266200766e-06, "loss": 0.1011, "num_input_tokens_seen": 219583888, "step": 101760 }, { "epoch": 16.601141924959215, "grad_norm": 0.08668484538793564, "learning_rate": 4.271672793459927e-06, "loss": 0.2898, "num_input_tokens_seen": 219594672, "step": 101765 }, { "epoch": 16.60195758564437, "grad_norm": 1.4642330408096313, "learning_rate": 4.269683344998041e-06, "loss": 0.2033, "num_input_tokens_seen": 219605520, "step": 101770 }, { "epoch": 16.602773246329527, "grad_norm": 0.08592496812343597, "learning_rate": 4.267694316662313e-06, "loss": 0.1207, "num_input_tokens_seen": 219614832, "step": 101775 }, { "epoch": 16.603588907014682, "grad_norm": 2.45516037940979, "learning_rate": 4.265705708493076e-06, "loss": 0.1145, "num_input_tokens_seen": 219625040, "step": 101780 }, { "epoch": 16.604404567699838, "grad_norm": 0.04232516512274742, "learning_rate": 4.2637175205306e-06, "loss": 0.182, "num_input_tokens_seen": 219635312, "step": 101785 }, { "epoch": 16.605220228384994, "grad_norm": 1.9533966779708862, "learning_rate": 4.261729752815208e-06, "loss": 0.1401, "num_input_tokens_seen": 219646128, "step": 101790 }, { "epoch": 16.606035889070146, "grad_norm": 0.10047059506177902, "learning_rate": 4.25974240538716e-06, "loss": 0.2344, "num_input_tokens_seen": 219657136, "step": 101795 }, { "epoch": 16.6068515497553, "grad_norm": 0.3442908525466919, "learning_rate": 4.257755478286754e-06, "loss": 0.0528, "num_input_tokens_seen": 219667440, "step": 101800 }, { "epoch": 16.607667210440457, "grad_norm": 1.4034684896469116, "learning_rate": 4.255768971554233e-06, "loss": 0.0362, "num_input_tokens_seen": 219677968, "step": 101805 }, { "epoch": 16.608482871125613, "grad_norm": 1.993428349494934, "learning_rate": 4.253782885229884e-06, "loss": 0.327, "num_input_tokens_seen": 219689424, "step": 101810 }, { "epoch": 16.609298531810765, "grad_norm": 0.190339595079422, "learning_rate": 4.251797219353928e-06, "loss": 0.023, "num_input_tokens_seen": 219699888, "step": 101815 }, { "epoch": 16.61011419249592, "grad_norm": 0.16093827784061432, "learning_rate": 4.249811973966639e-06, "loss": 0.0568, "num_input_tokens_seen": 219711376, "step": 101820 }, { "epoch": 16.610929853181077, "grad_norm": 0.15100298821926117, "learning_rate": 4.247827149108219e-06, "loss": 0.0219, "num_input_tokens_seen": 219721360, "step": 101825 }, { "epoch": 16.611745513866232, "grad_norm": 0.08989201486110687, "learning_rate": 4.24584274481892e-06, "loss": 0.0385, "num_input_tokens_seen": 219732752, "step": 101830 }, { "epoch": 16.612561174551388, "grad_norm": 0.18825826048851013, "learning_rate": 4.243858761138935e-06, "loss": 0.0279, "num_input_tokens_seen": 219743824, "step": 101835 }, { "epoch": 16.61337683523654, "grad_norm": 1.9138247966766357, "learning_rate": 4.241875198108494e-06, "loss": 0.0766, "num_input_tokens_seen": 219755184, "step": 101840 }, { "epoch": 16.614192495921696, "grad_norm": 0.01624440588057041, "learning_rate": 4.239892055767775e-06, "loss": 0.0409, "num_input_tokens_seen": 219765968, "step": 101845 }, { "epoch": 16.61500815660685, "grad_norm": 1.9178810119628906, "learning_rate": 4.237909334156992e-06, "loss": 0.1231, "num_input_tokens_seen": 219776016, "step": 101850 }, { "epoch": 16.615823817292007, "grad_norm": 0.050515610724687576, "learning_rate": 4.235927033316303e-06, "loss": 0.2185, "num_input_tokens_seen": 219786448, "step": 101855 }, { "epoch": 16.616639477977163, "grad_norm": 0.2688736021518707, "learning_rate": 4.233945153285898e-06, "loss": 0.1357, "num_input_tokens_seen": 219797104, "step": 101860 }, { "epoch": 16.617455138662315, "grad_norm": 2.8928284645080566, "learning_rate": 4.231963694105939e-06, "loss": 0.3445, "num_input_tokens_seen": 219807056, "step": 101865 }, { "epoch": 16.61827079934747, "grad_norm": 0.08223021030426025, "learning_rate": 4.229982655816581e-06, "loss": 0.0268, "num_input_tokens_seen": 219817232, "step": 101870 }, { "epoch": 16.619086460032626, "grad_norm": 0.9015021920204163, "learning_rate": 4.228002038457976e-06, "loss": 0.0931, "num_input_tokens_seen": 219828240, "step": 101875 }, { "epoch": 16.619902120717782, "grad_norm": 1.2479957342147827, "learning_rate": 4.226021842070257e-06, "loss": 0.0627, "num_input_tokens_seen": 219838480, "step": 101880 }, { "epoch": 16.620717781402938, "grad_norm": 0.14272841811180115, "learning_rate": 4.224042066693562e-06, "loss": 0.1609, "num_input_tokens_seen": 219849808, "step": 101885 }, { "epoch": 16.62153344208809, "grad_norm": 0.3305594325065613, "learning_rate": 4.222062712368008e-06, "loss": 0.0411, "num_input_tokens_seen": 219861232, "step": 101890 }, { "epoch": 16.622349102773246, "grad_norm": 2.0795772075653076, "learning_rate": 4.220083779133715e-06, "loss": 0.1507, "num_input_tokens_seen": 219869904, "step": 101895 }, { "epoch": 16.6231647634584, "grad_norm": 1.4068788290023804, "learning_rate": 4.218105267030784e-06, "loss": 0.145, "num_input_tokens_seen": 219881680, "step": 101900 }, { "epoch": 16.623980424143557, "grad_norm": 0.11253883689641953, "learning_rate": 4.216127176099313e-06, "loss": 0.1376, "num_input_tokens_seen": 219892400, "step": 101905 }, { "epoch": 16.624796084828713, "grad_norm": 0.04902420565485954, "learning_rate": 4.214149506379389e-06, "loss": 0.0049, "num_input_tokens_seen": 219903344, "step": 101910 }, { "epoch": 16.625611745513865, "grad_norm": 0.059120144695043564, "learning_rate": 4.212172257911098e-06, "loss": 0.0307, "num_input_tokens_seen": 219914416, "step": 101915 }, { "epoch": 16.62642740619902, "grad_norm": 2.1354591846466064, "learning_rate": 4.210195430734498e-06, "loss": 0.2715, "num_input_tokens_seen": 219924688, "step": 101920 }, { "epoch": 16.627243066884176, "grad_norm": 0.5929040312767029, "learning_rate": 4.208219024889676e-06, "loss": 0.2511, "num_input_tokens_seen": 219935152, "step": 101925 }, { "epoch": 16.628058727569332, "grad_norm": 1.5916929244995117, "learning_rate": 4.206243040416657e-06, "loss": 0.1743, "num_input_tokens_seen": 219945648, "step": 101930 }, { "epoch": 16.628874388254488, "grad_norm": 0.35552287101745605, "learning_rate": 4.204267477355517e-06, "loss": 0.0268, "num_input_tokens_seen": 219955952, "step": 101935 }, { "epoch": 16.62969004893964, "grad_norm": 1.9615557193756104, "learning_rate": 4.202292335746263e-06, "loss": 0.0356, "num_input_tokens_seen": 219967024, "step": 101940 }, { "epoch": 16.630505709624796, "grad_norm": 0.48188790678977966, "learning_rate": 4.200317615628951e-06, "loss": 0.0348, "num_input_tokens_seen": 219976304, "step": 101945 }, { "epoch": 16.63132137030995, "grad_norm": 0.09816569089889526, "learning_rate": 4.1983433170435794e-06, "loss": 0.1534, "num_input_tokens_seen": 219988336, "step": 101950 }, { "epoch": 16.632137030995107, "grad_norm": 0.6820790767669678, "learning_rate": 4.196369440030179e-06, "loss": 0.0703, "num_input_tokens_seen": 219997808, "step": 101955 }, { "epoch": 16.63295269168026, "grad_norm": 1.7803781032562256, "learning_rate": 4.1943959846287295e-06, "loss": 0.1201, "num_input_tokens_seen": 220008816, "step": 101960 }, { "epoch": 16.633768352365415, "grad_norm": 0.09491182863712311, "learning_rate": 4.192422950879249e-06, "loss": 0.0897, "num_input_tokens_seen": 220020048, "step": 101965 }, { "epoch": 16.63458401305057, "grad_norm": 0.09338702261447906, "learning_rate": 4.190450338821711e-06, "loss": 0.1417, "num_input_tokens_seen": 220031152, "step": 101970 }, { "epoch": 16.635399673735726, "grad_norm": 0.06084975227713585, "learning_rate": 4.188478148496098e-06, "loss": 0.0154, "num_input_tokens_seen": 220041744, "step": 101975 }, { "epoch": 16.636215334420882, "grad_norm": 0.47747746109962463, "learning_rate": 4.186506379942373e-06, "loss": 0.1413, "num_input_tokens_seen": 220050896, "step": 101980 }, { "epoch": 16.637030995106034, "grad_norm": 0.33124780654907227, "learning_rate": 4.184535033200501e-06, "loss": 0.04, "num_input_tokens_seen": 220061040, "step": 101985 }, { "epoch": 16.63784665579119, "grad_norm": 1.9702311754226685, "learning_rate": 4.182564108310435e-06, "loss": 0.0352, "num_input_tokens_seen": 220072336, "step": 101990 }, { "epoch": 16.638662316476346, "grad_norm": 1.3698840141296387, "learning_rate": 4.180593605312116e-06, "loss": 0.043, "num_input_tokens_seen": 220084016, "step": 101995 }, { "epoch": 16.6394779771615, "grad_norm": 0.5787404775619507, "learning_rate": 4.178623524245479e-06, "loss": 0.1335, "num_input_tokens_seen": 220095376, "step": 102000 }, { "epoch": 16.640293637846657, "grad_norm": 0.14881666004657745, "learning_rate": 4.176653865150448e-06, "loss": 0.1655, "num_input_tokens_seen": 220105616, "step": 102005 }, { "epoch": 16.64110929853181, "grad_norm": 0.1861283928155899, "learning_rate": 4.174684628066944e-06, "loss": 0.0182, "num_input_tokens_seen": 220117072, "step": 102010 }, { "epoch": 16.641924959216965, "grad_norm": 1.7350022792816162, "learning_rate": 4.172715813034872e-06, "loss": 0.1878, "num_input_tokens_seen": 220127248, "step": 102015 }, { "epoch": 16.64274061990212, "grad_norm": 1.3462650775909424, "learning_rate": 4.1707474200941395e-06, "loss": 0.1975, "num_input_tokens_seen": 220138224, "step": 102020 }, { "epoch": 16.643556280587276, "grad_norm": 0.06707089394330978, "learning_rate": 4.168779449284632e-06, "loss": 0.1742, "num_input_tokens_seen": 220150512, "step": 102025 }, { "epoch": 16.644371941272432, "grad_norm": 2.3176920413970947, "learning_rate": 4.166811900646236e-06, "loss": 0.2562, "num_input_tokens_seen": 220161872, "step": 102030 }, { "epoch": 16.645187601957584, "grad_norm": 0.3888966143131256, "learning_rate": 4.164844774218826e-06, "loss": 0.0203, "num_input_tokens_seen": 220171856, "step": 102035 }, { "epoch": 16.64600326264274, "grad_norm": 0.5641575455665588, "learning_rate": 4.162878070042267e-06, "loss": 0.0175, "num_input_tokens_seen": 220181712, "step": 102040 }, { "epoch": 16.646818923327896, "grad_norm": 0.2652477025985718, "learning_rate": 4.1609117881564184e-06, "loss": 0.1964, "num_input_tokens_seen": 220192336, "step": 102045 }, { "epoch": 16.64763458401305, "grad_norm": 0.49435070157051086, "learning_rate": 4.1589459286011264e-06, "loss": 0.0413, "num_input_tokens_seen": 220202768, "step": 102050 }, { "epoch": 16.648450244698207, "grad_norm": 0.19342999160289764, "learning_rate": 4.156980491416238e-06, "loss": 0.0264, "num_input_tokens_seen": 220214128, "step": 102055 }, { "epoch": 16.64926590538336, "grad_norm": 0.30759531259536743, "learning_rate": 4.155015476641577e-06, "loss": 0.0654, "num_input_tokens_seen": 220224016, "step": 102060 }, { "epoch": 16.650081566068515, "grad_norm": 0.8837685585021973, "learning_rate": 4.153050884316969e-06, "loss": 0.1336, "num_input_tokens_seen": 220234384, "step": 102065 }, { "epoch": 16.65089722675367, "grad_norm": 0.0857643261551857, "learning_rate": 4.151086714482236e-06, "loss": 0.1251, "num_input_tokens_seen": 220245680, "step": 102070 }, { "epoch": 16.651712887438826, "grad_norm": 2.888495683670044, "learning_rate": 4.149122967177177e-06, "loss": 0.0936, "num_input_tokens_seen": 220256976, "step": 102075 }, { "epoch": 16.652528548123982, "grad_norm": 0.11886422336101532, "learning_rate": 4.1471596424415945e-06, "loss": 0.0905, "num_input_tokens_seen": 220268400, "step": 102080 }, { "epoch": 16.653344208809134, "grad_norm": 2.660569429397583, "learning_rate": 4.145196740315274e-06, "loss": 0.0992, "num_input_tokens_seen": 220278640, "step": 102085 }, { "epoch": 16.65415986949429, "grad_norm": 0.1434766799211502, "learning_rate": 4.143234260838e-06, "loss": 0.0813, "num_input_tokens_seen": 220287312, "step": 102090 }, { "epoch": 16.654975530179446, "grad_norm": 1.6914279460906982, "learning_rate": 4.14127220404954e-06, "loss": 0.2119, "num_input_tokens_seen": 220297360, "step": 102095 }, { "epoch": 16.6557911908646, "grad_norm": 0.30278098583221436, "learning_rate": 4.139310569989663e-06, "loss": 0.0666, "num_input_tokens_seen": 220308464, "step": 102100 }, { "epoch": 16.656606851549757, "grad_norm": 0.24013881385326385, "learning_rate": 4.137349358698117e-06, "loss": 0.0079, "num_input_tokens_seen": 220318960, "step": 102105 }, { "epoch": 16.65742251223491, "grad_norm": 0.028739510104060173, "learning_rate": 4.135388570214654e-06, "loss": 0.1724, "num_input_tokens_seen": 220330544, "step": 102110 }, { "epoch": 16.658238172920065, "grad_norm": 1.2895740270614624, "learning_rate": 4.133428204579013e-06, "loss": 0.1825, "num_input_tokens_seen": 220342576, "step": 102115 }, { "epoch": 16.65905383360522, "grad_norm": 0.4219132363796234, "learning_rate": 4.1314682618309174e-06, "loss": 0.0156, "num_input_tokens_seen": 220354000, "step": 102120 }, { "epoch": 16.659869494290376, "grad_norm": 3.1666486263275146, "learning_rate": 4.1295087420100905e-06, "loss": 0.0754, "num_input_tokens_seen": 220365136, "step": 102125 }, { "epoch": 16.660685154975532, "grad_norm": 0.13263560831546783, "learning_rate": 4.127549645156248e-06, "loss": 0.0253, "num_input_tokens_seen": 220376208, "step": 102130 }, { "epoch": 16.661500815660684, "grad_norm": 1.7178049087524414, "learning_rate": 4.125590971309088e-06, "loss": 0.2081, "num_input_tokens_seen": 220386640, "step": 102135 }, { "epoch": 16.66231647634584, "grad_norm": 0.044932615011930466, "learning_rate": 4.123632720508311e-06, "loss": 0.1286, "num_input_tokens_seen": 220397136, "step": 102140 }, { "epoch": 16.663132137030995, "grad_norm": 0.3844754099845886, "learning_rate": 4.121674892793598e-06, "loss": 0.0159, "num_input_tokens_seen": 220408848, "step": 102145 }, { "epoch": 16.66394779771615, "grad_norm": 1.0186960697174072, "learning_rate": 4.11971748820463e-06, "loss": 0.0423, "num_input_tokens_seen": 220418512, "step": 102150 }, { "epoch": 16.664763458401303, "grad_norm": 3.0964527130126953, "learning_rate": 4.117760506781074e-06, "loss": 0.2324, "num_input_tokens_seen": 220429328, "step": 102155 }, { "epoch": 16.66557911908646, "grad_norm": 0.30763840675354004, "learning_rate": 4.115803948562594e-06, "loss": 0.0544, "num_input_tokens_seen": 220439312, "step": 102160 }, { "epoch": 16.666394779771615, "grad_norm": 1.2681775093078613, "learning_rate": 4.1138478135888395e-06, "loss": 0.1365, "num_input_tokens_seen": 220450448, "step": 102165 }, { "epoch": 16.66721044045677, "grad_norm": 0.9135929942131042, "learning_rate": 4.111892101899456e-06, "loss": 0.0506, "num_input_tokens_seen": 220461552, "step": 102170 }, { "epoch": 16.668026101141926, "grad_norm": 0.10261755436658859, "learning_rate": 4.109936813534071e-06, "loss": 0.0192, "num_input_tokens_seen": 220472624, "step": 102175 }, { "epoch": 16.66884176182708, "grad_norm": 0.2515617609024048, "learning_rate": 4.10798194853233e-06, "loss": 0.2066, "num_input_tokens_seen": 220482672, "step": 102180 }, { "epoch": 16.669657422512234, "grad_norm": 1.2306268215179443, "learning_rate": 4.1060275069338265e-06, "loss": 0.0611, "num_input_tokens_seen": 220493488, "step": 102185 }, { "epoch": 16.67047308319739, "grad_norm": 0.9146336317062378, "learning_rate": 4.104073488778193e-06, "loss": 0.1856, "num_input_tokens_seen": 220504976, "step": 102190 }, { "epoch": 16.671288743882545, "grad_norm": 0.3996991515159607, "learning_rate": 4.1021198941050054e-06, "loss": 0.0395, "num_input_tokens_seen": 220516240, "step": 102195 }, { "epoch": 16.6721044045677, "grad_norm": 1.8418039083480835, "learning_rate": 4.100166722953882e-06, "loss": 0.0402, "num_input_tokens_seen": 220527408, "step": 102200 }, { "epoch": 16.672920065252853, "grad_norm": 0.27177464962005615, "learning_rate": 4.098213975364381e-06, "loss": 0.0491, "num_input_tokens_seen": 220537520, "step": 102205 }, { "epoch": 16.67373572593801, "grad_norm": 2.7534241676330566, "learning_rate": 4.096261651376096e-06, "loss": 0.0851, "num_input_tokens_seen": 220548304, "step": 102210 }, { "epoch": 16.674551386623165, "grad_norm": 4.008549213409424, "learning_rate": 4.0943097510285885e-06, "loss": 0.2206, "num_input_tokens_seen": 220559728, "step": 102215 }, { "epoch": 16.67536704730832, "grad_norm": 0.1142876073718071, "learning_rate": 4.092358274361413e-06, "loss": 0.0461, "num_input_tokens_seen": 220571152, "step": 102220 }, { "epoch": 16.676182707993476, "grad_norm": 0.4946313500404358, "learning_rate": 4.090407221414119e-06, "loss": 0.1461, "num_input_tokens_seen": 220582544, "step": 102225 }, { "epoch": 16.67699836867863, "grad_norm": 2.9260053634643555, "learning_rate": 4.08845659222625e-06, "loss": 0.2694, "num_input_tokens_seen": 220593616, "step": 102230 }, { "epoch": 16.677814029363784, "grad_norm": 0.08185973763465881, "learning_rate": 4.086506386837335e-06, "loss": 0.1207, "num_input_tokens_seen": 220604880, "step": 102235 }, { "epoch": 16.67862969004894, "grad_norm": 0.6513988971710205, "learning_rate": 4.084556605286902e-06, "loss": 0.0329, "num_input_tokens_seen": 220615824, "step": 102240 }, { "epoch": 16.679445350734095, "grad_norm": 0.6807125210762024, "learning_rate": 4.082607247614459e-06, "loss": 0.0382, "num_input_tokens_seen": 220626160, "step": 102245 }, { "epoch": 16.68026101141925, "grad_norm": 0.03564770147204399, "learning_rate": 4.080658313859517e-06, "loss": 0.0116, "num_input_tokens_seen": 220637552, "step": 102250 }, { "epoch": 16.681076672104403, "grad_norm": 0.031616199761629105, "learning_rate": 4.0787098040615725e-06, "loss": 0.2461, "num_input_tokens_seen": 220649232, "step": 102255 }, { "epoch": 16.68189233278956, "grad_norm": 0.42959168553352356, "learning_rate": 4.0767617182601136e-06, "loss": 0.1251, "num_input_tokens_seen": 220659664, "step": 102260 }, { "epoch": 16.682707993474715, "grad_norm": 0.019788507372140884, "learning_rate": 4.074814056494622e-06, "loss": 0.101, "num_input_tokens_seen": 220670192, "step": 102265 }, { "epoch": 16.68352365415987, "grad_norm": 2.842378616333008, "learning_rate": 4.072866818804569e-06, "loss": 0.2512, "num_input_tokens_seen": 220680112, "step": 102270 }, { "epoch": 16.684339314845026, "grad_norm": 0.6586837768554688, "learning_rate": 4.070920005229417e-06, "loss": 0.1149, "num_input_tokens_seen": 220690192, "step": 102275 }, { "epoch": 16.68515497553018, "grad_norm": 0.23974041640758514, "learning_rate": 4.068973615808619e-06, "loss": 0.1394, "num_input_tokens_seen": 220701392, "step": 102280 }, { "epoch": 16.685970636215334, "grad_norm": 0.7792413234710693, "learning_rate": 4.067027650581632e-06, "loss": 0.2146, "num_input_tokens_seen": 220712336, "step": 102285 }, { "epoch": 16.68678629690049, "grad_norm": 1.917433500289917, "learning_rate": 4.065082109587873e-06, "loss": 0.051, "num_input_tokens_seen": 220722576, "step": 102290 }, { "epoch": 16.687601957585645, "grad_norm": 1.1965700387954712, "learning_rate": 4.063136992866798e-06, "loss": 0.1962, "num_input_tokens_seen": 220733264, "step": 102295 }, { "epoch": 16.6884176182708, "grad_norm": 0.8426223397254944, "learning_rate": 4.061192300457797e-06, "loss": 0.1145, "num_input_tokens_seen": 220744848, "step": 102300 }, { "epoch": 16.689233278955953, "grad_norm": 0.06361785531044006, "learning_rate": 4.059248032400309e-06, "loss": 0.0288, "num_input_tokens_seen": 220755728, "step": 102305 }, { "epoch": 16.69004893964111, "grad_norm": 1.3716360330581665, "learning_rate": 4.057304188733716e-06, "loss": 0.0289, "num_input_tokens_seen": 220765840, "step": 102310 }, { "epoch": 16.690864600326265, "grad_norm": 0.08214685320854187, "learning_rate": 4.055360769497429e-06, "loss": 0.1153, "num_input_tokens_seen": 220776816, "step": 102315 }, { "epoch": 16.69168026101142, "grad_norm": 0.7171151638031006, "learning_rate": 4.053417774730817e-06, "loss": 0.0986, "num_input_tokens_seen": 220787888, "step": 102320 }, { "epoch": 16.692495921696576, "grad_norm": 0.2559693455696106, "learning_rate": 4.051475204473277e-06, "loss": 0.161, "num_input_tokens_seen": 220799248, "step": 102325 }, { "epoch": 16.693311582381728, "grad_norm": 1.9678701162338257, "learning_rate": 4.049533058764154e-06, "loss": 0.1529, "num_input_tokens_seen": 220810384, "step": 102330 }, { "epoch": 16.694127243066884, "grad_norm": 0.04237821698188782, "learning_rate": 4.047591337642836e-06, "loss": 0.0255, "num_input_tokens_seen": 220821296, "step": 102335 }, { "epoch": 16.69494290375204, "grad_norm": 0.3061963617801666, "learning_rate": 4.045650041148646e-06, "loss": 0.0416, "num_input_tokens_seen": 220831248, "step": 102340 }, { "epoch": 16.695758564437195, "grad_norm": 0.33577248454093933, "learning_rate": 4.043709169320953e-06, "loss": 0.039, "num_input_tokens_seen": 220842544, "step": 102345 }, { "epoch": 16.696574225122347, "grad_norm": 0.03220575675368309, "learning_rate": 4.041768722199066e-06, "loss": 0.2031, "num_input_tokens_seen": 220853840, "step": 102350 }, { "epoch": 16.697389885807503, "grad_norm": 0.05286445841193199, "learning_rate": 4.039828699822332e-06, "loss": 0.0191, "num_input_tokens_seen": 220864944, "step": 102355 }, { "epoch": 16.69820554649266, "grad_norm": 0.016963262110948563, "learning_rate": 4.037889102230055e-06, "loss": 0.045, "num_input_tokens_seen": 220876688, "step": 102360 }, { "epoch": 16.699021207177815, "grad_norm": 0.24670001864433289, "learning_rate": 4.035949929461549e-06, "loss": 0.1488, "num_input_tokens_seen": 220888016, "step": 102365 }, { "epoch": 16.69983686786297, "grad_norm": 0.4830205738544464, "learning_rate": 4.034011181556113e-06, "loss": 0.2773, "num_input_tokens_seen": 220899440, "step": 102370 }, { "epoch": 16.700652528548122, "grad_norm": 0.06995900720357895, "learning_rate": 4.0320728585530375e-06, "loss": 0.0581, "num_input_tokens_seen": 220910320, "step": 102375 }, { "epoch": 16.701468189233278, "grad_norm": 0.8651038408279419, "learning_rate": 4.030134960491605e-06, "loss": 0.1637, "num_input_tokens_seen": 220921200, "step": 102380 }, { "epoch": 16.702283849918434, "grad_norm": 1.859753131866455, "learning_rate": 4.028197487411092e-06, "loss": 0.2565, "num_input_tokens_seen": 220932048, "step": 102385 }, { "epoch": 16.70309951060359, "grad_norm": 1.2913415431976318, "learning_rate": 4.02626043935076e-06, "loss": 0.0976, "num_input_tokens_seen": 220942000, "step": 102390 }, { "epoch": 16.703915171288745, "grad_norm": 0.2505374550819397, "learning_rate": 4.024323816349865e-06, "loss": 0.0383, "num_input_tokens_seen": 220952336, "step": 102395 }, { "epoch": 16.704730831973897, "grad_norm": 0.17097321152687073, "learning_rate": 4.022387618447659e-06, "loss": 0.1214, "num_input_tokens_seen": 220964400, "step": 102400 }, { "epoch": 16.705546492659053, "grad_norm": 0.045879118144512177, "learning_rate": 4.0204518456833764e-06, "loss": 0.0913, "num_input_tokens_seen": 220974672, "step": 102405 }, { "epoch": 16.70636215334421, "grad_norm": 2.59330153465271, "learning_rate": 4.018516498096261e-06, "loss": 0.0873, "num_input_tokens_seen": 220985904, "step": 102410 }, { "epoch": 16.707177814029365, "grad_norm": 0.4811491072177887, "learning_rate": 4.016581575725515e-06, "loss": 0.088, "num_input_tokens_seen": 220997040, "step": 102415 }, { "epoch": 16.70799347471452, "grad_norm": 0.5717535614967346, "learning_rate": 4.014647078610373e-06, "loss": 0.0915, "num_input_tokens_seen": 221007312, "step": 102420 }, { "epoch": 16.708809135399672, "grad_norm": 0.057628072798252106, "learning_rate": 4.012713006790017e-06, "loss": 0.0392, "num_input_tokens_seen": 221018800, "step": 102425 }, { "epoch": 16.709624796084828, "grad_norm": 0.6755437254905701, "learning_rate": 4.010779360303671e-06, "loss": 0.0433, "num_input_tokens_seen": 221028656, "step": 102430 }, { "epoch": 16.710440456769984, "grad_norm": 0.30945590138435364, "learning_rate": 4.0088461391904946e-06, "loss": 0.0368, "num_input_tokens_seen": 221040208, "step": 102435 }, { "epoch": 16.71125611745514, "grad_norm": 1.9334150552749634, "learning_rate": 4.0069133434896904e-06, "loss": 0.1023, "num_input_tokens_seen": 221051888, "step": 102440 }, { "epoch": 16.712071778140295, "grad_norm": 1.5989100933074951, "learning_rate": 4.00498097324041e-06, "loss": 0.0939, "num_input_tokens_seen": 221063088, "step": 102445 }, { "epoch": 16.712887438825447, "grad_norm": 0.057372551411390305, "learning_rate": 4.003049028481834e-06, "loss": 0.0121, "num_input_tokens_seen": 221074448, "step": 102450 }, { "epoch": 16.713703099510603, "grad_norm": 1.3385754823684692, "learning_rate": 4.001117509253096e-06, "loss": 0.0554, "num_input_tokens_seen": 221085648, "step": 102455 }, { "epoch": 16.71451876019576, "grad_norm": 0.046685684472322464, "learning_rate": 3.999186415593362e-06, "loss": 0.1367, "num_input_tokens_seen": 221095248, "step": 102460 }, { "epoch": 16.715334420880914, "grad_norm": 0.8836663365364075, "learning_rate": 3.997255747541745e-06, "loss": 0.2173, "num_input_tokens_seen": 221104336, "step": 102465 }, { "epoch": 16.71615008156607, "grad_norm": 1.5971755981445312, "learning_rate": 3.995325505137396e-06, "loss": 0.0953, "num_input_tokens_seen": 221115600, "step": 102470 }, { "epoch": 16.716965742251222, "grad_norm": 0.28464263677597046, "learning_rate": 3.99339568841941e-06, "loss": 0.0718, "num_input_tokens_seen": 221126608, "step": 102475 }, { "epoch": 16.717781402936378, "grad_norm": 0.048681288957595825, "learning_rate": 3.991466297426919e-06, "loss": 0.0709, "num_input_tokens_seen": 221137872, "step": 102480 }, { "epoch": 16.718597063621534, "grad_norm": 0.06412293761968613, "learning_rate": 3.989537332199006e-06, "loss": 0.0084, "num_input_tokens_seen": 221148272, "step": 102485 }, { "epoch": 16.71941272430669, "grad_norm": 0.047124601900577545, "learning_rate": 3.987608792774783e-06, "loss": 0.1121, "num_input_tokens_seen": 221160048, "step": 102490 }, { "epoch": 16.72022838499184, "grad_norm": 0.4295179545879364, "learning_rate": 3.9856806791933096e-06, "loss": 0.1399, "num_input_tokens_seen": 221169552, "step": 102495 }, { "epoch": 16.721044045676997, "grad_norm": 0.5091255307197571, "learning_rate": 3.983752991493686e-06, "loss": 0.0187, "num_input_tokens_seen": 221180336, "step": 102500 }, { "epoch": 16.721859706362153, "grad_norm": 0.6227867007255554, "learning_rate": 3.981825729714966e-06, "loss": 0.0183, "num_input_tokens_seen": 221189328, "step": 102505 }, { "epoch": 16.72267536704731, "grad_norm": 2.07589054107666, "learning_rate": 3.979898893896214e-06, "loss": 0.1952, "num_input_tokens_seen": 221201136, "step": 102510 }, { "epoch": 16.723491027732464, "grad_norm": 0.05771663039922714, "learning_rate": 3.977972484076476e-06, "loss": 0.0233, "num_input_tokens_seen": 221211344, "step": 102515 }, { "epoch": 16.724306688417617, "grad_norm": 0.06229642778635025, "learning_rate": 3.9760465002947914e-06, "loss": 0.1864, "num_input_tokens_seen": 221222672, "step": 102520 }, { "epoch": 16.725122349102772, "grad_norm": 0.04210354760289192, "learning_rate": 3.974120942590198e-06, "loss": 0.1999, "num_input_tokens_seen": 221233040, "step": 102525 }, { "epoch": 16.725938009787928, "grad_norm": 0.8815999627113342, "learning_rate": 3.9721958110017136e-06, "loss": 0.0625, "num_input_tokens_seen": 221242704, "step": 102530 }, { "epoch": 16.726753670473084, "grad_norm": 0.6459254622459412, "learning_rate": 3.97027110556836e-06, "loss": 0.0782, "num_input_tokens_seen": 221253232, "step": 102535 }, { "epoch": 16.72756933115824, "grad_norm": 0.08535280078649521, "learning_rate": 3.968346826329139e-06, "loss": 0.1354, "num_input_tokens_seen": 221263248, "step": 102540 }, { "epoch": 16.72838499184339, "grad_norm": 0.7780359983444214, "learning_rate": 3.96642297332305e-06, "loss": 0.0542, "num_input_tokens_seen": 221272656, "step": 102545 }, { "epoch": 16.729200652528547, "grad_norm": 1.8228439092636108, "learning_rate": 3.964499546589079e-06, "loss": 0.1291, "num_input_tokens_seen": 221284400, "step": 102550 }, { "epoch": 16.730016313213703, "grad_norm": 0.020839262753725052, "learning_rate": 3.962576546166219e-06, "loss": 0.0106, "num_input_tokens_seen": 221295984, "step": 102555 }, { "epoch": 16.73083197389886, "grad_norm": 0.0560205914080143, "learning_rate": 3.960653972093423e-06, "loss": 0.1213, "num_input_tokens_seen": 221306544, "step": 102560 }, { "epoch": 16.731647634584014, "grad_norm": 0.035855650901794434, "learning_rate": 3.958731824409675e-06, "loss": 0.0446, "num_input_tokens_seen": 221318544, "step": 102565 }, { "epoch": 16.732463295269167, "grad_norm": 1.0573521852493286, "learning_rate": 3.9568101031539075e-06, "loss": 0.1281, "num_input_tokens_seen": 221328944, "step": 102570 }, { "epoch": 16.733278955954322, "grad_norm": 0.5207914113998413, "learning_rate": 3.9548888083650925e-06, "loss": 0.0963, "num_input_tokens_seen": 221340624, "step": 102575 }, { "epoch": 16.734094616639478, "grad_norm": 0.23748940229415894, "learning_rate": 3.952967940082139e-06, "loss": 0.2401, "num_input_tokens_seen": 221349744, "step": 102580 }, { "epoch": 16.734910277324634, "grad_norm": 2.527966260910034, "learning_rate": 3.951047498344005e-06, "loss": 0.1088, "num_input_tokens_seen": 221360336, "step": 102585 }, { "epoch": 16.73572593800979, "grad_norm": 1.0836111307144165, "learning_rate": 3.949127483189579e-06, "loss": 0.0594, "num_input_tokens_seen": 221370448, "step": 102590 }, { "epoch": 16.73654159869494, "grad_norm": 2.1755211353302, "learning_rate": 3.947207894657806e-06, "loss": 0.2517, "num_input_tokens_seen": 221381296, "step": 102595 }, { "epoch": 16.737357259380097, "grad_norm": 0.1002366915345192, "learning_rate": 3.945288732787561e-06, "loss": 0.0466, "num_input_tokens_seen": 221392784, "step": 102600 }, { "epoch": 16.738172920065253, "grad_norm": 0.5229086875915527, "learning_rate": 3.943369997617752e-06, "loss": 0.0477, "num_input_tokens_seen": 221404080, "step": 102605 }, { "epoch": 16.73898858075041, "grad_norm": 0.7285533547401428, "learning_rate": 3.9414516891872644e-06, "loss": 0.157, "num_input_tokens_seen": 221414448, "step": 102610 }, { "epoch": 16.739804241435564, "grad_norm": 0.9384031295776367, "learning_rate": 3.939533807534973e-06, "loss": 0.1294, "num_input_tokens_seen": 221425008, "step": 102615 }, { "epoch": 16.740619902120716, "grad_norm": 0.04561378061771393, "learning_rate": 3.937616352699744e-06, "loss": 0.0861, "num_input_tokens_seen": 221435184, "step": 102620 }, { "epoch": 16.741435562805872, "grad_norm": 2.6752266883850098, "learning_rate": 3.935699324720443e-06, "loss": 0.1142, "num_input_tokens_seen": 221445328, "step": 102625 }, { "epoch": 16.742251223491028, "grad_norm": 3.0486626625061035, "learning_rate": 3.933782723635915e-06, "loss": 0.1923, "num_input_tokens_seen": 221457904, "step": 102630 }, { "epoch": 16.743066884176184, "grad_norm": 0.07143516093492508, "learning_rate": 3.931866549485003e-06, "loss": 0.0643, "num_input_tokens_seen": 221468176, "step": 102635 }, { "epoch": 16.74388254486134, "grad_norm": 1.704866647720337, "learning_rate": 3.929950802306545e-06, "loss": 0.1222, "num_input_tokens_seen": 221479120, "step": 102640 }, { "epoch": 16.74469820554649, "grad_norm": 1.1067582368850708, "learning_rate": 3.928035482139361e-06, "loss": 0.1034, "num_input_tokens_seen": 221490288, "step": 102645 }, { "epoch": 16.745513866231647, "grad_norm": 0.9807212948799133, "learning_rate": 3.92612058902227e-06, "loss": 0.1545, "num_input_tokens_seen": 221502384, "step": 102650 }, { "epoch": 16.746329526916803, "grad_norm": 0.033780377358198166, "learning_rate": 3.924206122994078e-06, "loss": 0.1789, "num_input_tokens_seen": 221513488, "step": 102655 }, { "epoch": 16.74714518760196, "grad_norm": 0.08359914273023605, "learning_rate": 3.92229208409359e-06, "loss": 0.0917, "num_input_tokens_seen": 221522832, "step": 102660 }, { "epoch": 16.747960848287114, "grad_norm": 0.3191339373588562, "learning_rate": 3.920378472359587e-06, "loss": 0.2257, "num_input_tokens_seen": 221534992, "step": 102665 }, { "epoch": 16.748776508972266, "grad_norm": 0.07450053840875626, "learning_rate": 3.918465287830858e-06, "loss": 0.108, "num_input_tokens_seen": 221546032, "step": 102670 }, { "epoch": 16.749592169657422, "grad_norm": 0.05622898042201996, "learning_rate": 3.916552530546175e-06, "loss": 0.0994, "num_input_tokens_seen": 221557296, "step": 102675 }, { "epoch": 16.750407830342578, "grad_norm": 2.33185076713562, "learning_rate": 3.914640200544301e-06, "loss": 0.3005, "num_input_tokens_seen": 221567504, "step": 102680 }, { "epoch": 16.751223491027734, "grad_norm": 1.247219204902649, "learning_rate": 3.912728297863991e-06, "loss": 0.1111, "num_input_tokens_seen": 221579184, "step": 102685 }, { "epoch": 16.752039151712886, "grad_norm": 0.15897810459136963, "learning_rate": 3.910816822543992e-06, "loss": 0.0715, "num_input_tokens_seen": 221590128, "step": 102690 }, { "epoch": 16.75285481239804, "grad_norm": 1.689998984336853, "learning_rate": 3.908905774623039e-06, "loss": 0.0616, "num_input_tokens_seen": 221600752, "step": 102695 }, { "epoch": 16.753670473083197, "grad_norm": 0.10106132924556732, "learning_rate": 3.906995154139878e-06, "loss": 0.0693, "num_input_tokens_seen": 221611984, "step": 102700 }, { "epoch": 16.754486133768353, "grad_norm": 0.014410914853215218, "learning_rate": 3.905084961133207e-06, "loss": 0.0325, "num_input_tokens_seen": 221623056, "step": 102705 }, { "epoch": 16.75530179445351, "grad_norm": 2.743128776550293, "learning_rate": 3.903175195641754e-06, "loss": 0.1997, "num_input_tokens_seen": 221634128, "step": 102710 }, { "epoch": 16.75611745513866, "grad_norm": 0.4474928677082062, "learning_rate": 3.90126585770422e-06, "loss": 0.0509, "num_input_tokens_seen": 221644528, "step": 102715 }, { "epoch": 16.756933115823816, "grad_norm": 0.11751966178417206, "learning_rate": 3.8993569473593005e-06, "loss": 0.1125, "num_input_tokens_seen": 221654480, "step": 102720 }, { "epoch": 16.757748776508972, "grad_norm": 0.6150006651878357, "learning_rate": 3.897448464645681e-06, "loss": 0.0911, "num_input_tokens_seen": 221664336, "step": 102725 }, { "epoch": 16.758564437194128, "grad_norm": 0.07536493241786957, "learning_rate": 3.895540409602036e-06, "loss": 0.0507, "num_input_tokens_seen": 221674480, "step": 102730 }, { "epoch": 16.759380097879284, "grad_norm": 0.025343013927340508, "learning_rate": 3.89363278226704e-06, "loss": 0.0913, "num_input_tokens_seen": 221686224, "step": 102735 }, { "epoch": 16.760195758564436, "grad_norm": 0.3358939588069916, "learning_rate": 3.891725582679348e-06, "loss": 0.0095, "num_input_tokens_seen": 221697616, "step": 102740 }, { "epoch": 16.76101141924959, "grad_norm": 2.2249977588653564, "learning_rate": 3.889818810877618e-06, "loss": 0.3409, "num_input_tokens_seen": 221707536, "step": 102745 }, { "epoch": 16.761827079934747, "grad_norm": 0.6878055334091187, "learning_rate": 3.88791246690049e-06, "loss": 0.0611, "num_input_tokens_seen": 221718800, "step": 102750 }, { "epoch": 16.762642740619903, "grad_norm": 0.1486276537179947, "learning_rate": 3.886006550786597e-06, "loss": 0.0293, "num_input_tokens_seen": 221728816, "step": 102755 }, { "epoch": 16.76345840130506, "grad_norm": 0.026402875781059265, "learning_rate": 3.884101062574566e-06, "loss": 0.0081, "num_input_tokens_seen": 221739120, "step": 102760 }, { "epoch": 16.76427406199021, "grad_norm": 0.01150096207857132, "learning_rate": 3.882196002303015e-06, "loss": 0.1361, "num_input_tokens_seen": 221749328, "step": 102765 }, { "epoch": 16.765089722675366, "grad_norm": 0.7179291844367981, "learning_rate": 3.8802913700105496e-06, "loss": 0.0582, "num_input_tokens_seen": 221760496, "step": 102770 }, { "epoch": 16.765905383360522, "grad_norm": 2.313542127609253, "learning_rate": 3.878387165735775e-06, "loss": 0.2645, "num_input_tokens_seen": 221772592, "step": 102775 }, { "epoch": 16.766721044045678, "grad_norm": 1.9952658414840698, "learning_rate": 3.876483389517277e-06, "loss": 0.1871, "num_input_tokens_seen": 221782256, "step": 102780 }, { "epoch": 16.767536704730833, "grad_norm": 1.2269985675811768, "learning_rate": 3.874580041393641e-06, "loss": 0.1026, "num_input_tokens_seen": 221793104, "step": 102785 }, { "epoch": 16.768352365415986, "grad_norm": 0.3022565245628357, "learning_rate": 3.872677121403442e-06, "loss": 0.0292, "num_input_tokens_seen": 221804400, "step": 102790 }, { "epoch": 16.76916802610114, "grad_norm": 1.5427124500274658, "learning_rate": 3.870774629585242e-06, "loss": 0.2401, "num_input_tokens_seen": 221814416, "step": 102795 }, { "epoch": 16.769983686786297, "grad_norm": 0.13479086756706238, "learning_rate": 3.868872565977596e-06, "loss": 0.0278, "num_input_tokens_seen": 221824528, "step": 102800 }, { "epoch": 16.770799347471453, "grad_norm": 0.7104355692863464, "learning_rate": 3.866970930619057e-06, "loss": 0.0265, "num_input_tokens_seen": 221836336, "step": 102805 }, { "epoch": 16.77161500815661, "grad_norm": 1.0296127796173096, "learning_rate": 3.865069723548159e-06, "loss": 0.0548, "num_input_tokens_seen": 221847632, "step": 102810 }, { "epoch": 16.77243066884176, "grad_norm": 0.07942961901426315, "learning_rate": 3.86316894480343e-06, "loss": 0.0391, "num_input_tokens_seen": 221859248, "step": 102815 }, { "epoch": 16.773246329526916, "grad_norm": 0.0926915854215622, "learning_rate": 3.861268594423409e-06, "loss": 0.0154, "num_input_tokens_seen": 221870768, "step": 102820 }, { "epoch": 16.774061990212072, "grad_norm": 0.028321554884314537, "learning_rate": 3.8593686724465834e-06, "loss": 0.1029, "num_input_tokens_seen": 221882064, "step": 102825 }, { "epoch": 16.774877650897228, "grad_norm": 0.24729453027248383, "learning_rate": 3.8574691789114825e-06, "loss": 0.0495, "num_input_tokens_seen": 221893072, "step": 102830 }, { "epoch": 16.775693311582383, "grad_norm": 0.06602305918931961, "learning_rate": 3.855570113856577e-06, "loss": 0.0113, "num_input_tokens_seen": 221903984, "step": 102835 }, { "epoch": 16.776508972267536, "grad_norm": 0.8590641617774963, "learning_rate": 3.853671477320376e-06, "loss": 0.1092, "num_input_tokens_seen": 221913936, "step": 102840 }, { "epoch": 16.77732463295269, "grad_norm": 1.5644572973251343, "learning_rate": 3.851773269341344e-06, "loss": 0.091, "num_input_tokens_seen": 221923280, "step": 102845 }, { "epoch": 16.778140293637847, "grad_norm": 0.1062435656785965, "learning_rate": 3.8498754899579575e-06, "loss": 0.1296, "num_input_tokens_seen": 221932944, "step": 102850 }, { "epoch": 16.778955954323003, "grad_norm": 0.05916263163089752, "learning_rate": 3.847978139208674e-06, "loss": 0.0565, "num_input_tokens_seen": 221944560, "step": 102855 }, { "epoch": 16.77977161500816, "grad_norm": 1.9450702667236328, "learning_rate": 3.846081217131947e-06, "loss": 0.1048, "num_input_tokens_seen": 221955984, "step": 102860 }, { "epoch": 16.78058727569331, "grad_norm": 0.09528131037950516, "learning_rate": 3.844184723766223e-06, "loss": 0.1647, "num_input_tokens_seen": 221967248, "step": 102865 }, { "epoch": 16.781402936378466, "grad_norm": 1.3721013069152832, "learning_rate": 3.842288659149928e-06, "loss": 0.0782, "num_input_tokens_seen": 221978000, "step": 102870 }, { "epoch": 16.782218597063622, "grad_norm": 0.12021300196647644, "learning_rate": 3.840393023321498e-06, "loss": 0.0787, "num_input_tokens_seen": 221987920, "step": 102875 }, { "epoch": 16.783034257748778, "grad_norm": 0.06380707025527954, "learning_rate": 3.8384978163193444e-06, "loss": 0.202, "num_input_tokens_seen": 221998576, "step": 102880 }, { "epoch": 16.78384991843393, "grad_norm": 0.14528581500053406, "learning_rate": 3.836603038181879e-06, "loss": 0.0649, "num_input_tokens_seen": 222009168, "step": 102885 }, { "epoch": 16.784665579119086, "grad_norm": 0.05525138974189758, "learning_rate": 3.8347086889475e-06, "loss": 0.0545, "num_input_tokens_seen": 222020048, "step": 102890 }, { "epoch": 16.78548123980424, "grad_norm": 1.809074878692627, "learning_rate": 3.8328147686546e-06, "loss": 0.2062, "num_input_tokens_seen": 222032368, "step": 102895 }, { "epoch": 16.786296900489397, "grad_norm": 0.298268586397171, "learning_rate": 3.830921277341562e-06, "loss": 0.1165, "num_input_tokens_seen": 222043472, "step": 102900 }, { "epoch": 16.787112561174553, "grad_norm": 0.11666163802146912, "learning_rate": 3.829028215046757e-06, "loss": 0.08, "num_input_tokens_seen": 222054320, "step": 102905 }, { "epoch": 16.787928221859705, "grad_norm": 0.4573201835155487, "learning_rate": 3.827135581808553e-06, "loss": 0.1493, "num_input_tokens_seen": 222064496, "step": 102910 }, { "epoch": 16.78874388254486, "grad_norm": 0.47147509455680847, "learning_rate": 3.825243377665308e-06, "loss": 0.0493, "num_input_tokens_seen": 222075152, "step": 102915 }, { "epoch": 16.789559543230016, "grad_norm": 0.09592147171497345, "learning_rate": 3.823351602655359e-06, "loss": 0.0287, "num_input_tokens_seen": 222086416, "step": 102920 }, { "epoch": 16.790375203915172, "grad_norm": 0.3487565219402313, "learning_rate": 3.82146025681707e-06, "loss": 0.1364, "num_input_tokens_seen": 222097808, "step": 102925 }, { "epoch": 16.791190864600328, "grad_norm": 0.05363168194890022, "learning_rate": 3.819569340188739e-06, "loss": 0.107, "num_input_tokens_seen": 222108144, "step": 102930 }, { "epoch": 16.79200652528548, "grad_norm": 1.4821553230285645, "learning_rate": 3.817678852808721e-06, "loss": 0.0796, "num_input_tokens_seen": 222119184, "step": 102935 }, { "epoch": 16.792822185970635, "grad_norm": 0.1822410374879837, "learning_rate": 3.815788794715297e-06, "loss": 0.0732, "num_input_tokens_seen": 222130384, "step": 102940 }, { "epoch": 16.79363784665579, "grad_norm": 2.0339341163635254, "learning_rate": 3.8138991659468023e-06, "loss": 0.1378, "num_input_tokens_seen": 222140720, "step": 102945 }, { "epoch": 16.794453507340947, "grad_norm": 1.7940973043441772, "learning_rate": 3.812009966541505e-06, "loss": 0.1554, "num_input_tokens_seen": 222152400, "step": 102950 }, { "epoch": 16.795269168026103, "grad_norm": 0.03233041614294052, "learning_rate": 3.8101211965377164e-06, "loss": 0.1591, "num_input_tokens_seen": 222164144, "step": 102955 }, { "epoch": 16.796084828711255, "grad_norm": 2.0978963375091553, "learning_rate": 3.8082328559736894e-06, "loss": 0.1352, "num_input_tokens_seen": 222175088, "step": 102960 }, { "epoch": 16.79690048939641, "grad_norm": 0.44042861461639404, "learning_rate": 3.8063449448877213e-06, "loss": 0.022, "num_input_tokens_seen": 222186256, "step": 102965 }, { "epoch": 16.797716150081566, "grad_norm": 0.04038684442639351, "learning_rate": 3.8044574633180453e-06, "loss": 0.0696, "num_input_tokens_seen": 222197744, "step": 102970 }, { "epoch": 16.798531810766722, "grad_norm": 0.07187923043966293, "learning_rate": 3.8025704113029415e-06, "loss": 0.0175, "num_input_tokens_seen": 222209840, "step": 102975 }, { "epoch": 16.799347471451878, "grad_norm": 0.03593195229768753, "learning_rate": 3.800683788880624e-06, "loss": 0.0588, "num_input_tokens_seen": 222220592, "step": 102980 }, { "epoch": 16.80016313213703, "grad_norm": 0.6960670948028564, "learning_rate": 3.798797596089351e-06, "loss": 0.0622, "num_input_tokens_seen": 222232048, "step": 102985 }, { "epoch": 16.800978792822185, "grad_norm": 0.2950982451438904, "learning_rate": 3.7969118329673397e-06, "loss": 0.095, "num_input_tokens_seen": 222243120, "step": 102990 }, { "epoch": 16.80179445350734, "grad_norm": 2.1050379276275635, "learning_rate": 3.795026499552809e-06, "loss": 0.0706, "num_input_tokens_seen": 222254288, "step": 102995 }, { "epoch": 16.802610114192497, "grad_norm": 0.4674755930900574, "learning_rate": 3.7931415958839644e-06, "loss": 0.0582, "num_input_tokens_seen": 222265712, "step": 103000 }, { "epoch": 16.803425774877653, "grad_norm": 0.8618441820144653, "learning_rate": 3.7912571219990116e-06, "loss": 0.0669, "num_input_tokens_seen": 222277424, "step": 103005 }, { "epoch": 16.804241435562805, "grad_norm": 0.16856937110424042, "learning_rate": 3.789373077936134e-06, "loss": 0.0388, "num_input_tokens_seen": 222286864, "step": 103010 }, { "epoch": 16.80505709624796, "grad_norm": 3.673968553543091, "learning_rate": 3.7874894637335235e-06, "loss": 0.1289, "num_input_tokens_seen": 222297584, "step": 103015 }, { "epoch": 16.805872756933116, "grad_norm": 0.3776124119758606, "learning_rate": 3.785606279429346e-06, "loss": 0.0511, "num_input_tokens_seen": 222308400, "step": 103020 }, { "epoch": 16.806688417618272, "grad_norm": 1.1643810272216797, "learning_rate": 3.783723525061769e-06, "loss": 0.171, "num_input_tokens_seen": 222319184, "step": 103025 }, { "epoch": 16.807504078303424, "grad_norm": 0.18298980593681335, "learning_rate": 3.781841200668951e-06, "loss": 0.0255, "num_input_tokens_seen": 222329360, "step": 103030 }, { "epoch": 16.80831973898858, "grad_norm": 0.045527566224336624, "learning_rate": 3.7799593062890324e-06, "loss": 0.0374, "num_input_tokens_seen": 222338832, "step": 103035 }, { "epoch": 16.809135399673735, "grad_norm": 2.5181469917297363, "learning_rate": 3.77807784196017e-06, "loss": 0.1286, "num_input_tokens_seen": 222348944, "step": 103040 }, { "epoch": 16.80995106035889, "grad_norm": 0.33125489950180054, "learning_rate": 3.7761968077204696e-06, "loss": 0.0661, "num_input_tokens_seen": 222360592, "step": 103045 }, { "epoch": 16.810766721044047, "grad_norm": 2.4811596870422363, "learning_rate": 3.7743162036080778e-06, "loss": 0.043, "num_input_tokens_seen": 222370832, "step": 103050 }, { "epoch": 16.8115823817292, "grad_norm": 0.1535891592502594, "learning_rate": 3.7724360296610806e-06, "loss": 0.0314, "num_input_tokens_seen": 222381552, "step": 103055 }, { "epoch": 16.812398042414355, "grad_norm": 0.7660227417945862, "learning_rate": 3.770556285917609e-06, "loss": 0.0549, "num_input_tokens_seen": 222393296, "step": 103060 }, { "epoch": 16.81321370309951, "grad_norm": 0.6449263095855713, "learning_rate": 3.7686769724157355e-06, "loss": 0.0919, "num_input_tokens_seen": 222404144, "step": 103065 }, { "epoch": 16.814029363784666, "grad_norm": 1.6952056884765625, "learning_rate": 3.7667980891935685e-06, "loss": 0.0948, "num_input_tokens_seen": 222414704, "step": 103070 }, { "epoch": 16.81484502446982, "grad_norm": 1.9494057893753052, "learning_rate": 3.7649196362891633e-06, "loss": 0.0941, "num_input_tokens_seen": 222422992, "step": 103075 }, { "epoch": 16.815660685154974, "grad_norm": 2.386796474456787, "learning_rate": 3.7630416137406116e-06, "loss": 0.1465, "num_input_tokens_seen": 222433552, "step": 103080 }, { "epoch": 16.81647634584013, "grad_norm": 0.10917682945728302, "learning_rate": 3.76116402158595e-06, "loss": 0.0163, "num_input_tokens_seen": 222443984, "step": 103085 }, { "epoch": 16.817292006525285, "grad_norm": 0.08006954938173294, "learning_rate": 3.7592868598632537e-06, "loss": 0.0503, "num_input_tokens_seen": 222453968, "step": 103090 }, { "epoch": 16.81810766721044, "grad_norm": 2.083575963973999, "learning_rate": 3.7574101286105444e-06, "loss": 0.1244, "num_input_tokens_seen": 222465424, "step": 103095 }, { "epoch": 16.818923327895597, "grad_norm": 0.04711775854229927, "learning_rate": 3.7555338278658784e-06, "loss": 0.0776, "num_input_tokens_seen": 222476112, "step": 103100 }, { "epoch": 16.81973898858075, "grad_norm": 0.03216678649187088, "learning_rate": 3.7536579576672587e-06, "loss": 0.2188, "num_input_tokens_seen": 222485648, "step": 103105 }, { "epoch": 16.820554649265905, "grad_norm": 0.08192183822393417, "learning_rate": 3.7517825180527237e-06, "loss": 0.1101, "num_input_tokens_seen": 222496816, "step": 103110 }, { "epoch": 16.82137030995106, "grad_norm": 0.10869745165109634, "learning_rate": 3.7499075090602604e-06, "loss": 0.1515, "num_input_tokens_seen": 222508176, "step": 103115 }, { "epoch": 16.822185970636216, "grad_norm": 0.09160853922367096, "learning_rate": 3.7480329307278933e-06, "loss": 0.1603, "num_input_tokens_seen": 222517616, "step": 103120 }, { "epoch": 16.82300163132137, "grad_norm": 0.03276952728629112, "learning_rate": 3.746158783093584e-06, "loss": 0.0105, "num_input_tokens_seen": 222528784, "step": 103125 }, { "epoch": 16.823817292006524, "grad_norm": 0.7702341675758362, "learning_rate": 3.7442850661953357e-06, "loss": 0.119, "num_input_tokens_seen": 222539920, "step": 103130 }, { "epoch": 16.82463295269168, "grad_norm": 0.11860158294439316, "learning_rate": 3.7424117800711172e-06, "loss": 0.0796, "num_input_tokens_seen": 222551088, "step": 103135 }, { "epoch": 16.825448613376835, "grad_norm": 1.912947416305542, "learning_rate": 3.740538924758888e-06, "loss": 0.2206, "num_input_tokens_seen": 222562640, "step": 103140 }, { "epoch": 16.82626427406199, "grad_norm": 0.312919944524765, "learning_rate": 3.738666500296609e-06, "loss": 0.2818, "num_input_tokens_seen": 222572880, "step": 103145 }, { "epoch": 16.827079934747147, "grad_norm": 0.9785570502281189, "learning_rate": 3.7367945067222245e-06, "loss": 0.04, "num_input_tokens_seen": 222584208, "step": 103150 }, { "epoch": 16.8278955954323, "grad_norm": 0.06859772652387619, "learning_rate": 3.734922944073674e-06, "loss": 0.0083, "num_input_tokens_seen": 222593840, "step": 103155 }, { "epoch": 16.828711256117455, "grad_norm": 2.3572299480438232, "learning_rate": 3.733051812388888e-06, "loss": 0.1759, "num_input_tokens_seen": 222605808, "step": 103160 }, { "epoch": 16.82952691680261, "grad_norm": 0.8807229995727539, "learning_rate": 3.7311811117057836e-06, "loss": 0.0227, "num_input_tokens_seen": 222617200, "step": 103165 }, { "epoch": 16.830342577487766, "grad_norm": 2.513101577758789, "learning_rate": 3.729310842062278e-06, "loss": 0.2842, "num_input_tokens_seen": 222626448, "step": 103170 }, { "epoch": 16.83115823817292, "grad_norm": 0.07683175802230835, "learning_rate": 3.727441003496271e-06, "loss": 0.148, "num_input_tokens_seen": 222638032, "step": 103175 }, { "epoch": 16.831973898858074, "grad_norm": 1.2596708536148071, "learning_rate": 3.7255715960456515e-06, "loss": 0.3973, "num_input_tokens_seen": 222650224, "step": 103180 }, { "epoch": 16.83278955954323, "grad_norm": 0.04751106724143028, "learning_rate": 3.7237026197483234e-06, "loss": 0.1614, "num_input_tokens_seen": 222660880, "step": 103185 }, { "epoch": 16.833605220228385, "grad_norm": 0.05840512737631798, "learning_rate": 3.7218340746421444e-06, "loss": 0.0222, "num_input_tokens_seen": 222673072, "step": 103190 }, { "epoch": 16.83442088091354, "grad_norm": 0.08031874150037766, "learning_rate": 3.7199659607649984e-06, "loss": 0.1203, "num_input_tokens_seen": 222684304, "step": 103195 }, { "epoch": 16.835236541598697, "grad_norm": 1.702963948249817, "learning_rate": 3.7180982781547274e-06, "loss": 0.2223, "num_input_tokens_seen": 222695216, "step": 103200 }, { "epoch": 16.83605220228385, "grad_norm": 1.7306607961654663, "learning_rate": 3.7162310268492066e-06, "loss": 0.3788, "num_input_tokens_seen": 222705776, "step": 103205 }, { "epoch": 16.836867862969005, "grad_norm": 0.16253691911697388, "learning_rate": 3.7143642068862505e-06, "loss": 0.0538, "num_input_tokens_seen": 222716720, "step": 103210 }, { "epoch": 16.83768352365416, "grad_norm": 0.3389459550380707, "learning_rate": 3.71249781830372e-06, "loss": 0.0378, "num_input_tokens_seen": 222726896, "step": 103215 }, { "epoch": 16.838499184339316, "grad_norm": 0.46009910106658936, "learning_rate": 3.710631861139413e-06, "loss": 0.0085, "num_input_tokens_seen": 222736816, "step": 103220 }, { "epoch": 16.839314845024468, "grad_norm": 0.059398673474788666, "learning_rate": 3.7087663354311715e-06, "loss": 0.1572, "num_input_tokens_seen": 222746448, "step": 103225 }, { "epoch": 16.840130505709624, "grad_norm": 0.3330238461494446, "learning_rate": 3.706901241216779e-06, "loss": 0.1291, "num_input_tokens_seen": 222756816, "step": 103230 }, { "epoch": 16.84094616639478, "grad_norm": 1.0746484994888306, "learning_rate": 3.7050365785340547e-06, "loss": 0.0482, "num_input_tokens_seen": 222768400, "step": 103235 }, { "epoch": 16.841761827079935, "grad_norm": 0.9708142876625061, "learning_rate": 3.7031723474207693e-06, "loss": 0.1162, "num_input_tokens_seen": 222778736, "step": 103240 }, { "epoch": 16.84257748776509, "grad_norm": 1.3374515771865845, "learning_rate": 3.701308547914717e-06, "loss": 0.1883, "num_input_tokens_seen": 222788528, "step": 103245 }, { "epoch": 16.843393148450243, "grad_norm": 1.831568717956543, "learning_rate": 3.6994451800536677e-06, "loss": 0.1617, "num_input_tokens_seen": 222799216, "step": 103250 }, { "epoch": 16.8442088091354, "grad_norm": 0.38536036014556885, "learning_rate": 3.697582243875383e-06, "loss": 0.1942, "num_input_tokens_seen": 222810032, "step": 103255 }, { "epoch": 16.845024469820554, "grad_norm": 0.25290775299072266, "learning_rate": 3.6957197394176162e-06, "loss": 0.0223, "num_input_tokens_seen": 222819664, "step": 103260 }, { "epoch": 16.84584013050571, "grad_norm": 0.3433535099029541, "learning_rate": 3.6938576667181174e-06, "loss": 0.0555, "num_input_tokens_seen": 222832016, "step": 103265 }, { "epoch": 16.846655791190866, "grad_norm": 1.8925960063934326, "learning_rate": 3.691996025814623e-06, "loss": 0.0961, "num_input_tokens_seen": 222841936, "step": 103270 }, { "epoch": 16.847471451876018, "grad_norm": 1.5084867477416992, "learning_rate": 3.6901348167448585e-06, "loss": 0.0731, "num_input_tokens_seen": 222851248, "step": 103275 }, { "epoch": 16.848287112561174, "grad_norm": 0.34108543395996094, "learning_rate": 3.6882740395465436e-06, "loss": 0.0534, "num_input_tokens_seen": 222861584, "step": 103280 }, { "epoch": 16.84910277324633, "grad_norm": 3.287980556488037, "learning_rate": 3.686413694257396e-06, "loss": 0.0748, "num_input_tokens_seen": 222872720, "step": 103285 }, { "epoch": 16.849918433931485, "grad_norm": 0.021122684702277184, "learning_rate": 3.6845537809151093e-06, "loss": 0.1182, "num_input_tokens_seen": 222883952, "step": 103290 }, { "epoch": 16.85073409461664, "grad_norm": 1.3557014465332031, "learning_rate": 3.682694299557382e-06, "loss": 0.1146, "num_input_tokens_seen": 222893392, "step": 103295 }, { "epoch": 16.851549755301793, "grad_norm": 1.0291122198104858, "learning_rate": 3.680835250221898e-06, "loss": 0.1214, "num_input_tokens_seen": 222903856, "step": 103300 }, { "epoch": 16.85236541598695, "grad_norm": 0.13910727202892303, "learning_rate": 3.678976632946332e-06, "loss": 0.0315, "num_input_tokens_seen": 222914288, "step": 103305 }, { "epoch": 16.853181076672104, "grad_norm": 1.9790490865707397, "learning_rate": 3.6771184477683517e-06, "loss": 0.1375, "num_input_tokens_seen": 222925264, "step": 103310 }, { "epoch": 16.85399673735726, "grad_norm": 0.049185071140527725, "learning_rate": 3.6752606947256156e-06, "loss": 0.1227, "num_input_tokens_seen": 222935696, "step": 103315 }, { "epoch": 16.854812398042416, "grad_norm": 0.11016837507486343, "learning_rate": 3.673403373855777e-06, "loss": 0.1031, "num_input_tokens_seen": 222946864, "step": 103320 }, { "epoch": 16.855628058727568, "grad_norm": 0.5809866189956665, "learning_rate": 3.671546485196464e-06, "loss": 0.0913, "num_input_tokens_seen": 222957840, "step": 103325 }, { "epoch": 16.856443719412724, "grad_norm": 0.1551760882139206, "learning_rate": 3.669690028785333e-06, "loss": 0.0703, "num_input_tokens_seen": 222967632, "step": 103330 }, { "epoch": 16.85725938009788, "grad_norm": 3.001948595046997, "learning_rate": 3.6678340046599813e-06, "loss": 0.1794, "num_input_tokens_seen": 222978416, "step": 103335 }, { "epoch": 16.858075040783035, "grad_norm": 0.15768109261989594, "learning_rate": 3.6659784128580454e-06, "loss": 0.0417, "num_input_tokens_seen": 222989616, "step": 103340 }, { "epoch": 16.85889070146819, "grad_norm": 0.04866597428917885, "learning_rate": 3.6641232534171122e-06, "loss": 0.0154, "num_input_tokens_seen": 223000048, "step": 103345 }, { "epoch": 16.859706362153343, "grad_norm": 0.863556444644928, "learning_rate": 3.66226852637479e-06, "loss": 0.0222, "num_input_tokens_seen": 223009872, "step": 103350 }, { "epoch": 16.8605220228385, "grad_norm": 0.027760738506913185, "learning_rate": 3.660414231768666e-06, "loss": 0.0555, "num_input_tokens_seen": 223019600, "step": 103355 }, { "epoch": 16.861337683523654, "grad_norm": 0.21020275354385376, "learning_rate": 3.6585603696363214e-06, "loss": 0.122, "num_input_tokens_seen": 223030128, "step": 103360 }, { "epoch": 16.86215334420881, "grad_norm": 1.7829347848892212, "learning_rate": 3.6567069400153227e-06, "loss": 0.1425, "num_input_tokens_seen": 223040528, "step": 103365 }, { "epoch": 16.862969004893966, "grad_norm": 0.034142449498176575, "learning_rate": 3.6548539429432317e-06, "loss": 0.0603, "num_input_tokens_seen": 223051664, "step": 103370 }, { "epoch": 16.863784665579118, "grad_norm": 1.133740782737732, "learning_rate": 3.653001378457607e-06, "loss": 0.1613, "num_input_tokens_seen": 223062896, "step": 103375 }, { "epoch": 16.864600326264274, "grad_norm": 1.3672244548797607, "learning_rate": 3.651149246595986e-06, "loss": 0.0976, "num_input_tokens_seen": 223073616, "step": 103380 }, { "epoch": 16.86541598694943, "grad_norm": 2.1791839599609375, "learning_rate": 3.6492975473959134e-06, "loss": 0.0551, "num_input_tokens_seen": 223085648, "step": 103385 }, { "epoch": 16.866231647634585, "grad_norm": 1.6595280170440674, "learning_rate": 3.647446280894909e-06, "loss": 0.3881, "num_input_tokens_seen": 223095152, "step": 103390 }, { "epoch": 16.86704730831974, "grad_norm": 0.09831979125738144, "learning_rate": 3.6455954471304926e-06, "loss": 0.0399, "num_input_tokens_seen": 223106544, "step": 103395 }, { "epoch": 16.867862969004893, "grad_norm": 1.2310596704483032, "learning_rate": 3.6437450461401734e-06, "loss": 0.0776, "num_input_tokens_seen": 223116912, "step": 103400 }, { "epoch": 16.86867862969005, "grad_norm": 0.550597071647644, "learning_rate": 3.641895077961455e-06, "loss": 0.0817, "num_input_tokens_seen": 223128752, "step": 103405 }, { "epoch": 16.869494290375204, "grad_norm": 0.2467770129442215, "learning_rate": 3.6400455426318287e-06, "loss": 0.0259, "num_input_tokens_seen": 223139152, "step": 103410 }, { "epoch": 16.87030995106036, "grad_norm": 0.30157700181007385, "learning_rate": 3.6381964401887735e-06, "loss": 0.0109, "num_input_tokens_seen": 223150000, "step": 103415 }, { "epoch": 16.871125611745512, "grad_norm": 0.1967574954032898, "learning_rate": 3.6363477706697706e-06, "loss": 0.027, "num_input_tokens_seen": 223161424, "step": 103420 }, { "epoch": 16.871941272430668, "grad_norm": 0.030649349093437195, "learning_rate": 3.6344995341122788e-06, "loss": 0.0961, "num_input_tokens_seen": 223173136, "step": 103425 }, { "epoch": 16.872756933115824, "grad_norm": 0.34843599796295166, "learning_rate": 3.63265173055376e-06, "loss": 0.036, "num_input_tokens_seen": 223183536, "step": 103430 }, { "epoch": 16.87357259380098, "grad_norm": 0.21863789856433868, "learning_rate": 3.630804360031659e-06, "loss": 0.0983, "num_input_tokens_seen": 223195280, "step": 103435 }, { "epoch": 16.874388254486135, "grad_norm": 0.0986478328704834, "learning_rate": 3.628957422583415e-06, "loss": 0.1091, "num_input_tokens_seen": 223204592, "step": 103440 }, { "epoch": 16.875203915171287, "grad_norm": 1.976736307144165, "learning_rate": 3.6271109182464624e-06, "loss": 0.076, "num_input_tokens_seen": 223214928, "step": 103445 }, { "epoch": 16.876019575856443, "grad_norm": 1.3626662492752075, "learning_rate": 3.6252648470582186e-06, "loss": 0.0614, "num_input_tokens_seen": 223225840, "step": 103450 }, { "epoch": 16.8768352365416, "grad_norm": 2.156641960144043, "learning_rate": 3.6234192090560948e-06, "loss": 0.1602, "num_input_tokens_seen": 223236240, "step": 103455 }, { "epoch": 16.877650897226754, "grad_norm": 0.1081962063908577, "learning_rate": 3.6215740042775087e-06, "loss": 0.0167, "num_input_tokens_seen": 223247408, "step": 103460 }, { "epoch": 16.87846655791191, "grad_norm": 0.09960318356752396, "learning_rate": 3.619729232759836e-06, "loss": 0.0727, "num_input_tokens_seen": 223257968, "step": 103465 }, { "epoch": 16.879282218597062, "grad_norm": 0.18419867753982544, "learning_rate": 3.617884894540477e-06, "loss": 0.0871, "num_input_tokens_seen": 223267376, "step": 103470 }, { "epoch": 16.880097879282218, "grad_norm": 0.29115602374076843, "learning_rate": 3.6160409896568077e-06, "loss": 0.1094, "num_input_tokens_seen": 223277680, "step": 103475 }, { "epoch": 16.880913539967374, "grad_norm": 3.598520278930664, "learning_rate": 3.6141975181461925e-06, "loss": 0.0501, "num_input_tokens_seen": 223289040, "step": 103480 }, { "epoch": 16.88172920065253, "grad_norm": 0.18721075356006622, "learning_rate": 3.6123544800459956e-06, "loss": 0.0157, "num_input_tokens_seen": 223300848, "step": 103485 }, { "epoch": 16.882544861337685, "grad_norm": 0.01592845842242241, "learning_rate": 3.6105118753935683e-06, "loss": 0.0244, "num_input_tokens_seen": 223311696, "step": 103490 }, { "epoch": 16.883360522022837, "grad_norm": 1.1594703197479248, "learning_rate": 3.6086697042262527e-06, "loss": 0.0789, "num_input_tokens_seen": 223322640, "step": 103495 }, { "epoch": 16.884176182707993, "grad_norm": 0.06940730661153793, "learning_rate": 3.6068279665813805e-06, "loss": 0.1251, "num_input_tokens_seen": 223332656, "step": 103500 }, { "epoch": 16.88499184339315, "grad_norm": 0.5311437249183655, "learning_rate": 3.604986662496279e-06, "loss": 0.1447, "num_input_tokens_seen": 223344016, "step": 103505 }, { "epoch": 16.885807504078304, "grad_norm": 0.33722439408302307, "learning_rate": 3.6031457920082666e-06, "loss": 0.2085, "num_input_tokens_seen": 223354608, "step": 103510 }, { "epoch": 16.88662316476346, "grad_norm": 3.0389528274536133, "learning_rate": 3.601305355154647e-06, "loss": 0.1159, "num_input_tokens_seen": 223366160, "step": 103515 }, { "epoch": 16.887438825448612, "grad_norm": 0.05652274936437607, "learning_rate": 3.5994653519727193e-06, "loss": 0.0048, "num_input_tokens_seen": 223377648, "step": 103520 }, { "epoch": 16.888254486133768, "grad_norm": 0.1832273006439209, "learning_rate": 3.597625782499775e-06, "loss": 0.0711, "num_input_tokens_seen": 223389424, "step": 103525 }, { "epoch": 16.889070146818923, "grad_norm": 1.7934943437576294, "learning_rate": 3.595786646773097e-06, "loss": 0.2862, "num_input_tokens_seen": 223400912, "step": 103530 }, { "epoch": 16.88988580750408, "grad_norm": 1.7684441804885864, "learning_rate": 3.5939479448299532e-06, "loss": 0.107, "num_input_tokens_seen": 223411728, "step": 103535 }, { "epoch": 16.890701468189235, "grad_norm": 0.033298395574092865, "learning_rate": 3.592109676707614e-06, "loss": 0.1324, "num_input_tokens_seen": 223422672, "step": 103540 }, { "epoch": 16.891517128874387, "grad_norm": 2.283024787902832, "learning_rate": 3.5902718424433268e-06, "loss": 0.138, "num_input_tokens_seen": 223434544, "step": 103545 }, { "epoch": 16.892332789559543, "grad_norm": 0.31957975029945374, "learning_rate": 3.58843444207434e-06, "loss": 0.0424, "num_input_tokens_seen": 223445424, "step": 103550 }, { "epoch": 16.8931484502447, "grad_norm": 0.16588987410068512, "learning_rate": 3.586597475637893e-06, "loss": 0.0706, "num_input_tokens_seen": 223455536, "step": 103555 }, { "epoch": 16.893964110929854, "grad_norm": 3.229868173599243, "learning_rate": 3.5847609431712085e-06, "loss": 0.2432, "num_input_tokens_seen": 223465296, "step": 103560 }, { "epoch": 16.894779771615006, "grad_norm": 1.0104044675827026, "learning_rate": 3.5829248447115216e-06, "loss": 0.2534, "num_input_tokens_seen": 223477008, "step": 103565 }, { "epoch": 16.895595432300162, "grad_norm": 0.9448798298835754, "learning_rate": 3.5810891802960186e-06, "loss": 0.064, "num_input_tokens_seen": 223488272, "step": 103570 }, { "epoch": 16.896411092985318, "grad_norm": 0.17443634569644928, "learning_rate": 3.5792539499619303e-06, "loss": 0.1958, "num_input_tokens_seen": 223498640, "step": 103575 }, { "epoch": 16.897226753670473, "grad_norm": 0.03204435110092163, "learning_rate": 3.5774191537464196e-06, "loss": 0.1345, "num_input_tokens_seen": 223510384, "step": 103580 }, { "epoch": 16.89804241435563, "grad_norm": 3.603520631790161, "learning_rate": 3.5755847916867007e-06, "loss": 0.044, "num_input_tokens_seen": 223520784, "step": 103585 }, { "epoch": 16.898858075040785, "grad_norm": 0.454332560300827, "learning_rate": 3.5737508638199217e-06, "loss": 0.0277, "num_input_tokens_seen": 223530800, "step": 103590 }, { "epoch": 16.899673735725937, "grad_norm": 1.281439185142517, "learning_rate": 3.5719173701832752e-06, "loss": 0.0465, "num_input_tokens_seen": 223541904, "step": 103595 }, { "epoch": 16.900489396411093, "grad_norm": 0.1206582561135292, "learning_rate": 3.5700843108138928e-06, "loss": 0.0939, "num_input_tokens_seen": 223552688, "step": 103600 }, { "epoch": 16.90130505709625, "grad_norm": 0.22598083317279816, "learning_rate": 3.568251685748944e-06, "loss": 0.0121, "num_input_tokens_seen": 223563216, "step": 103605 }, { "epoch": 16.902120717781404, "grad_norm": 0.1252163201570511, "learning_rate": 3.5664194950255664e-06, "loss": 0.1218, "num_input_tokens_seen": 223574192, "step": 103610 }, { "epoch": 16.902936378466556, "grad_norm": 0.05596493184566498, "learning_rate": 3.564587738680883e-06, "loss": 0.2015, "num_input_tokens_seen": 223583248, "step": 103615 }, { "epoch": 16.903752039151712, "grad_norm": 0.05652420595288277, "learning_rate": 3.562756416752025e-06, "loss": 0.0756, "num_input_tokens_seen": 223593136, "step": 103620 }, { "epoch": 16.904567699836868, "grad_norm": 0.02610119804739952, "learning_rate": 3.5609255292761046e-06, "loss": 0.03, "num_input_tokens_seen": 223604240, "step": 103625 }, { "epoch": 16.905383360522023, "grad_norm": 2.629894495010376, "learning_rate": 3.5590950762902226e-06, "loss": 0.1392, "num_input_tokens_seen": 223616016, "step": 103630 }, { "epoch": 16.90619902120718, "grad_norm": 0.05405108258128166, "learning_rate": 3.5572650578314797e-06, "loss": 0.029, "num_input_tokens_seen": 223627984, "step": 103635 }, { "epoch": 16.90701468189233, "grad_norm": 0.0778045579791069, "learning_rate": 3.5554354739369634e-06, "loss": 0.1556, "num_input_tokens_seen": 223638544, "step": 103640 }, { "epoch": 16.907830342577487, "grad_norm": 1.1030224561691284, "learning_rate": 3.553606324643749e-06, "loss": 0.2019, "num_input_tokens_seen": 223649168, "step": 103645 }, { "epoch": 16.908646003262643, "grad_norm": 0.12581905722618103, "learning_rate": 3.5517776099889127e-06, "loss": 0.0204, "num_input_tokens_seen": 223660176, "step": 103650 }, { "epoch": 16.9094616639478, "grad_norm": 0.02579948864877224, "learning_rate": 3.549949330009508e-06, "loss": 0.0544, "num_input_tokens_seen": 223670672, "step": 103655 }, { "epoch": 16.910277324632954, "grad_norm": 1.8080943822860718, "learning_rate": 3.5481214847425946e-06, "loss": 0.3236, "num_input_tokens_seen": 223681264, "step": 103660 }, { "epoch": 16.911092985318106, "grad_norm": 2.115870714187622, "learning_rate": 3.5462940742252067e-06, "loss": 0.0566, "num_input_tokens_seen": 223691248, "step": 103665 }, { "epoch": 16.911908646003262, "grad_norm": 0.29446056485176086, "learning_rate": 3.5444670984943974e-06, "loss": 0.038, "num_input_tokens_seen": 223702576, "step": 103670 }, { "epoch": 16.912724306688418, "grad_norm": 0.30823248624801636, "learning_rate": 3.542640557587168e-06, "loss": 0.1027, "num_input_tokens_seen": 223714512, "step": 103675 }, { "epoch": 16.913539967373573, "grad_norm": 0.4373515546321869, "learning_rate": 3.5408144515405583e-06, "loss": 0.114, "num_input_tokens_seen": 223725040, "step": 103680 }, { "epoch": 16.91435562805873, "grad_norm": 0.06558600068092346, "learning_rate": 3.5389887803915582e-06, "loss": 0.0413, "num_input_tokens_seen": 223736560, "step": 103685 }, { "epoch": 16.91517128874388, "grad_norm": 0.5054214000701904, "learning_rate": 3.5371635441771854e-06, "loss": 0.0819, "num_input_tokens_seen": 223748176, "step": 103690 }, { "epoch": 16.915986949429037, "grad_norm": 0.09817881882190704, "learning_rate": 3.5353387429344103e-06, "loss": 0.0353, "num_input_tokens_seen": 223759088, "step": 103695 }, { "epoch": 16.916802610114193, "grad_norm": 1.4806104898452759, "learning_rate": 3.5335143767002364e-06, "loss": 0.0807, "num_input_tokens_seen": 223770288, "step": 103700 }, { "epoch": 16.91761827079935, "grad_norm": 0.554768979549408, "learning_rate": 3.531690445511615e-06, "loss": 0.1911, "num_input_tokens_seen": 223780976, "step": 103705 }, { "epoch": 16.918433931484504, "grad_norm": 0.13149575889110565, "learning_rate": 3.5298669494055303e-06, "loss": 0.1249, "num_input_tokens_seen": 223792272, "step": 103710 }, { "epoch": 16.919249592169656, "grad_norm": 0.8860262036323547, "learning_rate": 3.528043888418919e-06, "loss": 0.0896, "num_input_tokens_seen": 223804080, "step": 103715 }, { "epoch": 16.920065252854812, "grad_norm": 0.2535129189491272, "learning_rate": 3.52622126258875e-06, "loss": 0.0101, "num_input_tokens_seen": 223815696, "step": 103720 }, { "epoch": 16.920880913539968, "grad_norm": 0.08744250237941742, "learning_rate": 3.5243990719519365e-06, "loss": 0.0987, "num_input_tokens_seen": 223826608, "step": 103725 }, { "epoch": 16.921696574225123, "grad_norm": 0.29904207587242126, "learning_rate": 3.5225773165454306e-06, "loss": 0.1759, "num_input_tokens_seen": 223838032, "step": 103730 }, { "epoch": 16.92251223491028, "grad_norm": 0.6226428151130676, "learning_rate": 3.5207559964061276e-06, "loss": 0.0888, "num_input_tokens_seen": 223848144, "step": 103735 }, { "epoch": 16.92332789559543, "grad_norm": 0.45434144139289856, "learning_rate": 3.5189351115709674e-06, "loss": 0.0427, "num_input_tokens_seen": 223857360, "step": 103740 }, { "epoch": 16.924143556280587, "grad_norm": 1.4584276676177979, "learning_rate": 3.5171146620768257e-06, "loss": 0.104, "num_input_tokens_seen": 223868336, "step": 103745 }, { "epoch": 16.924959216965743, "grad_norm": 2.891960859298706, "learning_rate": 3.5152946479606127e-06, "loss": 0.1459, "num_input_tokens_seen": 223879504, "step": 103750 }, { "epoch": 16.9257748776509, "grad_norm": 0.7628777027130127, "learning_rate": 3.5134750692592116e-06, "loss": 0.031, "num_input_tokens_seen": 223890128, "step": 103755 }, { "epoch": 16.92659053833605, "grad_norm": 0.736965537071228, "learning_rate": 3.5116559260094937e-06, "loss": 0.0634, "num_input_tokens_seen": 223902352, "step": 103760 }, { "epoch": 16.927406199021206, "grad_norm": 0.06073520705103874, "learning_rate": 3.5098372182483267e-06, "loss": 0.104, "num_input_tokens_seen": 223912464, "step": 103765 }, { "epoch": 16.928221859706362, "grad_norm": 0.048117656260728836, "learning_rate": 3.5080189460125755e-06, "loss": 0.0698, "num_input_tokens_seen": 223924688, "step": 103770 }, { "epoch": 16.929037520391518, "grad_norm": 2.1119043827056885, "learning_rate": 3.50620110933908e-06, "loss": 0.2472, "num_input_tokens_seen": 223936720, "step": 103775 }, { "epoch": 16.929853181076673, "grad_norm": 1.1821521520614624, "learning_rate": 3.5043837082646885e-06, "loss": 0.2411, "num_input_tokens_seen": 223947344, "step": 103780 }, { "epoch": 16.930668841761825, "grad_norm": 0.1402249038219452, "learning_rate": 3.50256674282623e-06, "loss": 0.0997, "num_input_tokens_seen": 223958832, "step": 103785 }, { "epoch": 16.93148450244698, "grad_norm": 0.29158368706703186, "learning_rate": 3.5007502130605274e-06, "loss": 0.0431, "num_input_tokens_seen": 223967248, "step": 103790 }, { "epoch": 16.932300163132137, "grad_norm": 2.962883472442627, "learning_rate": 3.498934119004396e-06, "loss": 0.047, "num_input_tokens_seen": 223979184, "step": 103795 }, { "epoch": 16.933115823817293, "grad_norm": 0.19832941889762878, "learning_rate": 3.4971184606946343e-06, "loss": 0.1767, "num_input_tokens_seen": 223989104, "step": 103800 }, { "epoch": 16.93393148450245, "grad_norm": 1.7312066555023193, "learning_rate": 3.495303238168057e-06, "loss": 0.0162, "num_input_tokens_seen": 224000336, "step": 103805 }, { "epoch": 16.9347471451876, "grad_norm": 2.904479742050171, "learning_rate": 3.4934884514614295e-06, "loss": 0.4108, "num_input_tokens_seen": 224011600, "step": 103810 }, { "epoch": 16.935562805872756, "grad_norm": 0.10068248957395554, "learning_rate": 3.4916741006115527e-06, "loss": 0.2773, "num_input_tokens_seen": 224022320, "step": 103815 }, { "epoch": 16.936378466557912, "grad_norm": 0.05187816545367241, "learning_rate": 3.489860185655175e-06, "loss": 0.0605, "num_input_tokens_seen": 224033264, "step": 103820 }, { "epoch": 16.937194127243067, "grad_norm": 2.199345827102661, "learning_rate": 3.488046706629078e-06, "loss": 0.1737, "num_input_tokens_seen": 224043408, "step": 103825 }, { "epoch": 16.938009787928223, "grad_norm": 0.4260006844997406, "learning_rate": 3.4862336635699935e-06, "loss": 0.0322, "num_input_tokens_seen": 224054288, "step": 103830 }, { "epoch": 16.938825448613375, "grad_norm": 0.3579459488391876, "learning_rate": 3.484421056514686e-06, "loss": 0.0371, "num_input_tokens_seen": 224064528, "step": 103835 }, { "epoch": 16.93964110929853, "grad_norm": 0.03853999450802803, "learning_rate": 3.482608885499869e-06, "loss": 0.088, "num_input_tokens_seen": 224075248, "step": 103840 }, { "epoch": 16.940456769983687, "grad_norm": 0.0354495532810688, "learning_rate": 3.480797150562293e-06, "loss": 0.0872, "num_input_tokens_seen": 224085616, "step": 103845 }, { "epoch": 16.941272430668842, "grad_norm": 0.08376356214284897, "learning_rate": 3.4789858517386505e-06, "loss": 0.1248, "num_input_tokens_seen": 224095536, "step": 103850 }, { "epoch": 16.942088091353998, "grad_norm": 0.5130432844161987, "learning_rate": 3.4771749890656706e-06, "loss": 0.0719, "num_input_tokens_seen": 224105264, "step": 103855 }, { "epoch": 16.94290375203915, "grad_norm": 0.0457131564617157, "learning_rate": 3.475364562580033e-06, "loss": 0.1544, "num_input_tokens_seen": 224116720, "step": 103860 }, { "epoch": 16.943719412724306, "grad_norm": 1.647095799446106, "learning_rate": 3.4735545723184493e-06, "loss": 0.1637, "num_input_tokens_seen": 224127120, "step": 103865 }, { "epoch": 16.94453507340946, "grad_norm": 0.1520356833934784, "learning_rate": 3.471745018317579e-06, "loss": 0.0162, "num_input_tokens_seen": 224138864, "step": 103870 }, { "epoch": 16.945350734094617, "grad_norm": 0.2768745720386505, "learning_rate": 3.4699359006141186e-06, "loss": 0.0193, "num_input_tokens_seen": 224150864, "step": 103875 }, { "epoch": 16.946166394779773, "grad_norm": 0.08478929847478867, "learning_rate": 3.4681272192447074e-06, "loss": 0.0433, "num_input_tokens_seen": 224162416, "step": 103880 }, { "epoch": 16.946982055464925, "grad_norm": 0.4378913640975952, "learning_rate": 3.4663189742460188e-06, "loss": 0.0599, "num_input_tokens_seen": 224173616, "step": 103885 }, { "epoch": 16.94779771615008, "grad_norm": 0.24336916208267212, "learning_rate": 3.4645111656546965e-06, "loss": 0.0403, "num_input_tokens_seen": 224184944, "step": 103890 }, { "epoch": 16.948613376835237, "grad_norm": 0.38109612464904785, "learning_rate": 3.4627037935073715e-06, "loss": 0.2088, "num_input_tokens_seen": 224195184, "step": 103895 }, { "epoch": 16.949429037520392, "grad_norm": 1.2038053274154663, "learning_rate": 3.4608968578406786e-06, "loss": 0.0557, "num_input_tokens_seen": 224204592, "step": 103900 }, { "epoch": 16.950244698205548, "grad_norm": 0.07246710360050201, "learning_rate": 3.459090358691233e-06, "loss": 0.2315, "num_input_tokens_seen": 224216112, "step": 103905 }, { "epoch": 16.9510603588907, "grad_norm": 1.4809222221374512, "learning_rate": 3.4572842960956505e-06, "loss": 0.1129, "num_input_tokens_seen": 224228080, "step": 103910 }, { "epoch": 16.951876019575856, "grad_norm": 1.848994493484497, "learning_rate": 3.4554786700905285e-06, "loss": 0.1193, "num_input_tokens_seen": 224239216, "step": 103915 }, { "epoch": 16.95269168026101, "grad_norm": 0.08607779443264008, "learning_rate": 3.453673480712463e-06, "loss": 0.0445, "num_input_tokens_seen": 224250960, "step": 103920 }, { "epoch": 16.953507340946167, "grad_norm": 0.35615628957748413, "learning_rate": 3.4518687279980366e-06, "loss": 0.0998, "num_input_tokens_seen": 224261584, "step": 103925 }, { "epoch": 16.954323001631323, "grad_norm": 0.06709105521440506, "learning_rate": 3.4500644119838275e-06, "loss": 0.0613, "num_input_tokens_seen": 224271760, "step": 103930 }, { "epoch": 16.955138662316475, "grad_norm": 0.3725886940956116, "learning_rate": 3.4482605327064014e-06, "loss": 0.0747, "num_input_tokens_seen": 224282160, "step": 103935 }, { "epoch": 16.95595432300163, "grad_norm": 1.7745801210403442, "learning_rate": 3.446457090202315e-06, "loss": 0.054, "num_input_tokens_seen": 224292464, "step": 103940 }, { "epoch": 16.956769983686787, "grad_norm": 0.20400221645832062, "learning_rate": 3.4446540845081144e-06, "loss": 0.0948, "num_input_tokens_seen": 224304656, "step": 103945 }, { "epoch": 16.957585644371942, "grad_norm": 0.06741758435964584, "learning_rate": 3.442851515660353e-06, "loss": 0.0736, "num_input_tokens_seen": 224316016, "step": 103950 }, { "epoch": 16.958401305057095, "grad_norm": 0.16378895938396454, "learning_rate": 3.441049383695544e-06, "loss": 0.0161, "num_input_tokens_seen": 224326736, "step": 103955 }, { "epoch": 16.95921696574225, "grad_norm": 2.7083091735839844, "learning_rate": 3.4392476886502268e-06, "loss": 0.1528, "num_input_tokens_seen": 224337904, "step": 103960 }, { "epoch": 16.960032626427406, "grad_norm": 1.908835530281067, "learning_rate": 3.4374464305608976e-06, "loss": 0.2325, "num_input_tokens_seen": 224348464, "step": 103965 }, { "epoch": 16.96084828711256, "grad_norm": 2.1024670600891113, "learning_rate": 3.4356456094640834e-06, "loss": 0.181, "num_input_tokens_seen": 224359472, "step": 103970 }, { "epoch": 16.961663947797717, "grad_norm": 2.3377878665924072, "learning_rate": 3.433845225396254e-06, "loss": 0.1635, "num_input_tokens_seen": 224371120, "step": 103975 }, { "epoch": 16.96247960848287, "grad_norm": 0.3432057201862335, "learning_rate": 3.4320452783939195e-06, "loss": 0.0712, "num_input_tokens_seen": 224379472, "step": 103980 }, { "epoch": 16.963295269168025, "grad_norm": 0.09363734722137451, "learning_rate": 3.4302457684935396e-06, "loss": 0.0164, "num_input_tokens_seen": 224390832, "step": 103985 }, { "epoch": 16.96411092985318, "grad_norm": 0.05834619700908661, "learning_rate": 3.4284466957315965e-06, "loss": 0.0615, "num_input_tokens_seen": 224402576, "step": 103990 }, { "epoch": 16.964926590538337, "grad_norm": 0.082989901304245, "learning_rate": 3.426648060144547e-06, "loss": 0.0194, "num_input_tokens_seen": 224413264, "step": 103995 }, { "epoch": 16.965742251223492, "grad_norm": 0.6250301599502563, "learning_rate": 3.424849861768842e-06, "loss": 0.1217, "num_input_tokens_seen": 224422608, "step": 104000 }, { "epoch": 16.966557911908644, "grad_norm": 0.2819729447364807, "learning_rate": 3.423052100640925e-06, "loss": 0.0448, "num_input_tokens_seen": 224434384, "step": 104005 }, { "epoch": 16.9673735725938, "grad_norm": 0.05919349566102028, "learning_rate": 3.421254776797231e-06, "loss": 0.0853, "num_input_tokens_seen": 224444368, "step": 104010 }, { "epoch": 16.968189233278956, "grad_norm": 0.4011155664920807, "learning_rate": 3.4194578902741803e-06, "loss": 0.0152, "num_input_tokens_seen": 224455600, "step": 104015 }, { "epoch": 16.96900489396411, "grad_norm": 3.2137491703033447, "learning_rate": 3.417661441108194e-06, "loss": 0.1552, "num_input_tokens_seen": 224465936, "step": 104020 }, { "epoch": 16.969820554649267, "grad_norm": 0.05402451008558273, "learning_rate": 3.4158654293356767e-06, "loss": 0.0151, "num_input_tokens_seen": 224477392, "step": 104025 }, { "epoch": 16.97063621533442, "grad_norm": 0.8427491784095764, "learning_rate": 3.4140698549930294e-06, "loss": 0.0547, "num_input_tokens_seen": 224488880, "step": 104030 }, { "epoch": 16.971451876019575, "grad_norm": 1.0002751350402832, "learning_rate": 3.4122747181166397e-06, "loss": 0.0331, "num_input_tokens_seen": 224499952, "step": 104035 }, { "epoch": 16.97226753670473, "grad_norm": 0.06865192949771881, "learning_rate": 3.4104800187428898e-06, "loss": 0.0743, "num_input_tokens_seen": 224510000, "step": 104040 }, { "epoch": 16.973083197389887, "grad_norm": 0.038070570677518845, "learning_rate": 3.4086857569081503e-06, "loss": 0.0424, "num_input_tokens_seen": 224520592, "step": 104045 }, { "epoch": 16.973898858075042, "grad_norm": 0.9801254272460938, "learning_rate": 3.4068919326487814e-06, "loss": 0.1177, "num_input_tokens_seen": 224530096, "step": 104050 }, { "epoch": 16.974714518760194, "grad_norm": 0.331777423620224, "learning_rate": 3.4050985460011425e-06, "loss": 0.2121, "num_input_tokens_seen": 224540848, "step": 104055 }, { "epoch": 16.97553017944535, "grad_norm": 0.17383457720279694, "learning_rate": 3.403305597001577e-06, "loss": 0.0548, "num_input_tokens_seen": 224551280, "step": 104060 }, { "epoch": 16.976345840130506, "grad_norm": 0.512765645980835, "learning_rate": 3.4015130856864223e-06, "loss": 0.0764, "num_input_tokens_seen": 224561584, "step": 104065 }, { "epoch": 16.97716150081566, "grad_norm": 0.04725757986307144, "learning_rate": 3.3997210120920024e-06, "loss": 0.0856, "num_input_tokens_seen": 224571952, "step": 104070 }, { "epoch": 16.977977161500817, "grad_norm": 0.03702158480882645, "learning_rate": 3.3979293762546384e-06, "loss": 0.1156, "num_input_tokens_seen": 224584304, "step": 104075 }, { "epoch": 16.97879282218597, "grad_norm": 0.5139603614807129, "learning_rate": 3.3961381782106395e-06, "loss": 0.2489, "num_input_tokens_seen": 224594768, "step": 104080 }, { "epoch": 16.979608482871125, "grad_norm": 0.22094818949699402, "learning_rate": 3.3943474179963075e-06, "loss": 0.0255, "num_input_tokens_seen": 224605456, "step": 104085 }, { "epoch": 16.98042414355628, "grad_norm": 0.8091690540313721, "learning_rate": 3.3925570956479275e-06, "loss": 0.0434, "num_input_tokens_seen": 224617424, "step": 104090 }, { "epoch": 16.981239804241437, "grad_norm": 0.2986324429512024, "learning_rate": 3.3907672112017954e-06, "loss": 0.076, "num_input_tokens_seen": 224628688, "step": 104095 }, { "epoch": 16.982055464926592, "grad_norm": 0.06926999986171722, "learning_rate": 3.388977764694179e-06, "loss": 0.0478, "num_input_tokens_seen": 224638864, "step": 104100 }, { "epoch": 16.982871125611744, "grad_norm": 2.1902103424072266, "learning_rate": 3.387188756161344e-06, "loss": 0.2465, "num_input_tokens_seen": 224650672, "step": 104105 }, { "epoch": 16.9836867862969, "grad_norm": 1.6161895990371704, "learning_rate": 3.385400185639545e-06, "loss": 0.0331, "num_input_tokens_seen": 224660368, "step": 104110 }, { "epoch": 16.984502446982056, "grad_norm": 0.20841258764266968, "learning_rate": 3.3836120531650334e-06, "loss": 0.0805, "num_input_tokens_seen": 224670992, "step": 104115 }, { "epoch": 16.98531810766721, "grad_norm": 0.21509385108947754, "learning_rate": 3.3818243587740437e-06, "loss": 0.1724, "num_input_tokens_seen": 224681520, "step": 104120 }, { "epoch": 16.986133768352367, "grad_norm": 0.055652640759944916, "learning_rate": 3.380037102502809e-06, "loss": 0.0684, "num_input_tokens_seen": 224693040, "step": 104125 }, { "epoch": 16.98694942903752, "grad_norm": 0.12977854907512665, "learning_rate": 3.378250284387549e-06, "loss": 0.1215, "num_input_tokens_seen": 224704464, "step": 104130 }, { "epoch": 16.987765089722675, "grad_norm": 1.7124722003936768, "learning_rate": 3.3764639044644774e-06, "loss": 0.3826, "num_input_tokens_seen": 224715056, "step": 104135 }, { "epoch": 16.98858075040783, "grad_norm": 0.24629822373390198, "learning_rate": 3.3746779627697954e-06, "loss": 0.1921, "num_input_tokens_seen": 224725392, "step": 104140 }, { "epoch": 16.989396411092986, "grad_norm": 2.3055684566497803, "learning_rate": 3.3728924593396994e-06, "loss": 0.139, "num_input_tokens_seen": 224735856, "step": 104145 }, { "epoch": 16.99021207177814, "grad_norm": 0.336383581161499, "learning_rate": 3.371107394210371e-06, "loss": 0.016, "num_input_tokens_seen": 224746768, "step": 104150 }, { "epoch": 16.991027732463294, "grad_norm": 1.8719481229782104, "learning_rate": 3.36932276741799e-06, "loss": 0.1413, "num_input_tokens_seen": 224755056, "step": 104155 }, { "epoch": 16.99184339314845, "grad_norm": 2.49890398979187, "learning_rate": 3.3675385789987246e-06, "loss": 0.1541, "num_input_tokens_seen": 224766128, "step": 104160 }, { "epoch": 16.992659053833606, "grad_norm": 0.03531579673290253, "learning_rate": 3.365754828988732e-06, "loss": 0.0318, "num_input_tokens_seen": 224777680, "step": 104165 }, { "epoch": 16.99347471451876, "grad_norm": 0.3371815085411072, "learning_rate": 3.363971517424164e-06, "loss": 0.0399, "num_input_tokens_seen": 224788496, "step": 104170 }, { "epoch": 16.994290375203914, "grad_norm": 0.36106160283088684, "learning_rate": 3.3621886443411604e-06, "loss": 0.0851, "num_input_tokens_seen": 224799216, "step": 104175 }, { "epoch": 16.99510603588907, "grad_norm": 0.025459332391619682, "learning_rate": 3.3604062097758517e-06, "loss": 0.0888, "num_input_tokens_seen": 224808112, "step": 104180 }, { "epoch": 16.995921696574225, "grad_norm": 0.9785051345825195, "learning_rate": 3.3586242137643638e-06, "loss": 0.1692, "num_input_tokens_seen": 224818512, "step": 104185 }, { "epoch": 16.99673735725938, "grad_norm": 1.5025701522827148, "learning_rate": 3.3568426563428094e-06, "loss": 0.3186, "num_input_tokens_seen": 224828208, "step": 104190 }, { "epoch": 16.997553017944536, "grad_norm": 1.9152830839157104, "learning_rate": 3.3550615375472968e-06, "loss": 0.0748, "num_input_tokens_seen": 224839376, "step": 104195 }, { "epoch": 16.99836867862969, "grad_norm": 0.23265470564365387, "learning_rate": 3.353280857413915e-06, "loss": 0.0784, "num_input_tokens_seen": 224850000, "step": 104200 }, { "epoch": 16.999184339314844, "grad_norm": 2.153940200805664, "learning_rate": 3.351500615978767e-06, "loss": 0.1158, "num_input_tokens_seen": 224861936, "step": 104205 }, { "epoch": 17.0, "grad_norm": 0.449285626411438, "learning_rate": 3.3497208132779122e-06, "loss": 0.0501, "num_input_tokens_seen": 224870720, "step": 104210 }, { "epoch": 17.0, "eval_loss": 0.14444111287593842, "eval_runtime": 90.9206, "eval_samples_per_second": 29.971, "eval_steps_per_second": 7.501, "num_input_tokens_seen": 224870720, "step": 104210 }, { "epoch": 17.000815660685156, "grad_norm": 1.6377167701721191, "learning_rate": 3.3479414493474415e-06, "loss": 0.2825, "num_input_tokens_seen": 224881536, "step": 104215 }, { "epoch": 17.00163132137031, "grad_norm": 0.07169580459594727, "learning_rate": 3.3461625242233925e-06, "loss": 0.1081, "num_input_tokens_seen": 224892352, "step": 104220 }, { "epoch": 17.002446982055464, "grad_norm": 1.9830418825149536, "learning_rate": 3.344384037941842e-06, "loss": 0.1684, "num_input_tokens_seen": 224904064, "step": 104225 }, { "epoch": 17.00326264274062, "grad_norm": 0.050670117139816284, "learning_rate": 3.342605990538808e-06, "loss": 0.0999, "num_input_tokens_seen": 224913856, "step": 104230 }, { "epoch": 17.004078303425775, "grad_norm": 0.3286278545856476, "learning_rate": 3.3408283820503456e-06, "loss": 0.0817, "num_input_tokens_seen": 224925408, "step": 104235 }, { "epoch": 17.00489396411093, "grad_norm": 0.04652542248368263, "learning_rate": 3.3390512125124727e-06, "loss": 0.201, "num_input_tokens_seen": 224935104, "step": 104240 }, { "epoch": 17.005709624796086, "grad_norm": 0.8725987076759338, "learning_rate": 3.3372744819612057e-06, "loss": 0.2185, "num_input_tokens_seen": 224945088, "step": 104245 }, { "epoch": 17.00652528548124, "grad_norm": 0.12911061942577362, "learning_rate": 3.335498190432551e-06, "loss": 0.0232, "num_input_tokens_seen": 224956448, "step": 104250 }, { "epoch": 17.007340946166394, "grad_norm": 0.9664469361305237, "learning_rate": 3.333722337962511e-06, "loss": 0.0869, "num_input_tokens_seen": 224967264, "step": 104255 }, { "epoch": 17.00815660685155, "grad_norm": 0.27556127309799194, "learning_rate": 3.3319469245870732e-06, "loss": 0.0878, "num_input_tokens_seen": 224978080, "step": 104260 }, { "epoch": 17.008972267536706, "grad_norm": 0.9033831357955933, "learning_rate": 3.3301719503422174e-06, "loss": 0.1344, "num_input_tokens_seen": 224989632, "step": 104265 }, { "epoch": 17.00978792822186, "grad_norm": 0.31645575165748596, "learning_rate": 3.3283974152639175e-06, "loss": 0.0235, "num_input_tokens_seen": 224999936, "step": 104270 }, { "epoch": 17.010603588907014, "grad_norm": 1.5434757471084595, "learning_rate": 3.3266233193881364e-06, "loss": 0.142, "num_input_tokens_seen": 225010688, "step": 104275 }, { "epoch": 17.01141924959217, "grad_norm": 2.1286282539367676, "learning_rate": 3.3248496627508285e-06, "loss": 0.1921, "num_input_tokens_seen": 225022496, "step": 104280 }, { "epoch": 17.012234910277325, "grad_norm": 1.0243054628372192, "learning_rate": 3.3230764453879402e-06, "loss": 0.163, "num_input_tokens_seen": 225033152, "step": 104285 }, { "epoch": 17.01305057096248, "grad_norm": 2.967797040939331, "learning_rate": 3.321303667335404e-06, "loss": 0.1116, "num_input_tokens_seen": 225043936, "step": 104290 }, { "epoch": 17.013866231647636, "grad_norm": 0.3252573609352112, "learning_rate": 3.3195313286291522e-06, "loss": 0.1241, "num_input_tokens_seen": 225055136, "step": 104295 }, { "epoch": 17.01468189233279, "grad_norm": 0.031241079792380333, "learning_rate": 3.3177594293051033e-06, "loss": 0.0229, "num_input_tokens_seen": 225064480, "step": 104300 }, { "epoch": 17.015497553017944, "grad_norm": 0.36909157037734985, "learning_rate": 3.315987969399156e-06, "loss": 0.05, "num_input_tokens_seen": 225075904, "step": 104305 }, { "epoch": 17.0163132137031, "grad_norm": 0.0765896812081337, "learning_rate": 3.3142169489472353e-06, "loss": 0.0078, "num_input_tokens_seen": 225086784, "step": 104310 }, { "epoch": 17.017128874388256, "grad_norm": 0.12135293334722519, "learning_rate": 3.3124463679852056e-06, "loss": 0.0876, "num_input_tokens_seen": 225096256, "step": 104315 }, { "epoch": 17.017944535073408, "grad_norm": 0.12373088300228119, "learning_rate": 3.3106762265489755e-06, "loss": 0.0732, "num_input_tokens_seen": 225105600, "step": 104320 }, { "epoch": 17.018760195758563, "grad_norm": 0.6651124954223633, "learning_rate": 3.3089065246743934e-06, "loss": 0.0212, "num_input_tokens_seen": 225117216, "step": 104325 }, { "epoch": 17.01957585644372, "grad_norm": 0.058404721319675446, "learning_rate": 3.3071372623973503e-06, "loss": 0.2239, "num_input_tokens_seen": 225128896, "step": 104330 }, { "epoch": 17.020391517128875, "grad_norm": 0.04549967870116234, "learning_rate": 3.305368439753678e-06, "loss": 0.0339, "num_input_tokens_seen": 225140032, "step": 104335 }, { "epoch": 17.02120717781403, "grad_norm": 0.04134353622794151, "learning_rate": 3.3036000567792464e-06, "loss": 0.2606, "num_input_tokens_seen": 225151040, "step": 104340 }, { "epoch": 17.022022838499183, "grad_norm": 0.09476987272500992, "learning_rate": 3.301832113509873e-06, "loss": 0.1239, "num_input_tokens_seen": 225162016, "step": 104345 }, { "epoch": 17.02283849918434, "grad_norm": 0.5791128277778625, "learning_rate": 3.30006460998141e-06, "loss": 0.1949, "num_input_tokens_seen": 225173504, "step": 104350 }, { "epoch": 17.023654159869494, "grad_norm": 0.04704803600907326, "learning_rate": 3.2982975462296535e-06, "loss": 0.1346, "num_input_tokens_seen": 225183488, "step": 104355 }, { "epoch": 17.02446982055465, "grad_norm": 0.044983040541410446, "learning_rate": 3.296530922290439e-06, "loss": 0.0363, "num_input_tokens_seen": 225193856, "step": 104360 }, { "epoch": 17.025285481239806, "grad_norm": 0.13335740566253662, "learning_rate": 3.294764738199549e-06, "loss": 0.0243, "num_input_tokens_seen": 225205312, "step": 104365 }, { "epoch": 17.026101141924958, "grad_norm": 0.09420004487037659, "learning_rate": 3.2929989939927964e-06, "loss": 0.0486, "num_input_tokens_seen": 225215776, "step": 104370 }, { "epoch": 17.026916802610113, "grad_norm": 0.03110402077436447, "learning_rate": 3.2912336897059444e-06, "loss": 0.0545, "num_input_tokens_seen": 225226368, "step": 104375 }, { "epoch": 17.02773246329527, "grad_norm": 1.155083417892456, "learning_rate": 3.289468825374786e-06, "loss": 0.1251, "num_input_tokens_seen": 225237920, "step": 104380 }, { "epoch": 17.028548123980425, "grad_norm": 0.06584428250789642, "learning_rate": 3.2877044010350854e-06, "loss": 0.136, "num_input_tokens_seen": 225248224, "step": 104385 }, { "epoch": 17.02936378466558, "grad_norm": 0.7334451079368591, "learning_rate": 3.2859404167225994e-06, "loss": 0.0263, "num_input_tokens_seen": 225258176, "step": 104390 }, { "epoch": 17.030179445350733, "grad_norm": 0.0824069082736969, "learning_rate": 3.284176872473077e-06, "loss": 0.1085, "num_input_tokens_seen": 225269344, "step": 104395 }, { "epoch": 17.03099510603589, "grad_norm": 0.25151538848876953, "learning_rate": 3.2824137683222593e-06, "loss": 0.0891, "num_input_tokens_seen": 225278784, "step": 104400 }, { "epoch": 17.031810766721044, "grad_norm": 0.03793656453490257, "learning_rate": 3.280651104305876e-06, "loss": 0.0078, "num_input_tokens_seen": 225289856, "step": 104405 }, { "epoch": 17.0326264274062, "grad_norm": 2.59431791305542, "learning_rate": 3.2788888804596517e-06, "loss": 0.2233, "num_input_tokens_seen": 225299392, "step": 104410 }, { "epoch": 17.033442088091356, "grad_norm": 0.09830627590417862, "learning_rate": 3.277127096819299e-06, "loss": 0.0657, "num_input_tokens_seen": 225308096, "step": 104415 }, { "epoch": 17.034257748776508, "grad_norm": 0.10338053852319717, "learning_rate": 3.275365753420523e-06, "loss": 0.015, "num_input_tokens_seen": 225319168, "step": 104420 }, { "epoch": 17.035073409461663, "grad_norm": 0.0813923329114914, "learning_rate": 3.2736048502990195e-06, "loss": 0.1195, "num_input_tokens_seen": 225330496, "step": 104425 }, { "epoch": 17.03588907014682, "grad_norm": 2.0196468830108643, "learning_rate": 3.271844387490472e-06, "loss": 0.1231, "num_input_tokens_seen": 225340576, "step": 104430 }, { "epoch": 17.036704730831975, "grad_norm": 0.5355864763259888, "learning_rate": 3.270084365030571e-06, "loss": 0.0685, "num_input_tokens_seen": 225351840, "step": 104435 }, { "epoch": 17.03752039151713, "grad_norm": 0.06532534956932068, "learning_rate": 3.2683247829549653e-06, "loss": 0.0464, "num_input_tokens_seen": 225362336, "step": 104440 }, { "epoch": 17.038336052202283, "grad_norm": 0.15139968693256378, "learning_rate": 3.266565641299338e-06, "loss": 0.0969, "num_input_tokens_seen": 225373792, "step": 104445 }, { "epoch": 17.03915171288744, "grad_norm": 0.025800608098506927, "learning_rate": 3.2648069400993187e-06, "loss": 0.1457, "num_input_tokens_seen": 225385728, "step": 104450 }, { "epoch": 17.039967373572594, "grad_norm": 0.8904395699501038, "learning_rate": 3.2630486793905707e-06, "loss": 0.0639, "num_input_tokens_seen": 225395840, "step": 104455 }, { "epoch": 17.04078303425775, "grad_norm": 1.535503625869751, "learning_rate": 3.2612908592087043e-06, "loss": 0.2575, "num_input_tokens_seen": 225407712, "step": 104460 }, { "epoch": 17.041598694942905, "grad_norm": 0.10482171177864075, "learning_rate": 3.259533479589369e-06, "loss": 0.014, "num_input_tokens_seen": 225419072, "step": 104465 }, { "epoch": 17.042414355628058, "grad_norm": 0.048177603632211685, "learning_rate": 3.2577765405681554e-06, "loss": 0.0101, "num_input_tokens_seen": 225429248, "step": 104470 }, { "epoch": 17.043230016313213, "grad_norm": 2.5850205421447754, "learning_rate": 3.2560200421806936e-06, "loss": 0.1756, "num_input_tokens_seen": 225440288, "step": 104475 }, { "epoch": 17.04404567699837, "grad_norm": 1.6675093173980713, "learning_rate": 3.2542639844625576e-06, "loss": 0.0505, "num_input_tokens_seen": 225450592, "step": 104480 }, { "epoch": 17.044861337683525, "grad_norm": 1.5333201885223389, "learning_rate": 3.2525083674493613e-06, "loss": 0.1206, "num_input_tokens_seen": 225462208, "step": 104485 }, { "epoch": 17.045676998368677, "grad_norm": 0.0508020743727684, "learning_rate": 3.2507531911766585e-06, "loss": 0.1478, "num_input_tokens_seen": 225472832, "step": 104490 }, { "epoch": 17.046492659053833, "grad_norm": 0.241378054022789, "learning_rate": 3.248998455680047e-06, "loss": 0.0296, "num_input_tokens_seen": 225483136, "step": 104495 }, { "epoch": 17.04730831973899, "grad_norm": 1.326169729232788, "learning_rate": 3.247244160995061e-06, "loss": 0.0787, "num_input_tokens_seen": 225493792, "step": 104500 }, { "epoch": 17.048123980424144, "grad_norm": 0.6246733069419861, "learning_rate": 3.2454903071572785e-06, "loss": 0.0393, "num_input_tokens_seen": 225504288, "step": 104505 }, { "epoch": 17.0489396411093, "grad_norm": 0.7371880412101746, "learning_rate": 3.2437368942022234e-06, "loss": 0.04, "num_input_tokens_seen": 225515776, "step": 104510 }, { "epoch": 17.049755301794452, "grad_norm": 0.12692828476428986, "learning_rate": 3.241983922165451e-06, "loss": 0.0399, "num_input_tokens_seen": 225526880, "step": 104515 }, { "epoch": 17.050570962479608, "grad_norm": 0.09387492388486862, "learning_rate": 3.240231391082463e-06, "loss": 0.0279, "num_input_tokens_seen": 225537472, "step": 104520 }, { "epoch": 17.051386623164763, "grad_norm": 0.08670803904533386, "learning_rate": 3.2384793009887978e-06, "loss": 0.0554, "num_input_tokens_seen": 225548640, "step": 104525 }, { "epoch": 17.05220228384992, "grad_norm": 1.1349304914474487, "learning_rate": 3.2367276519199548e-06, "loss": 0.0581, "num_input_tokens_seen": 225560896, "step": 104530 }, { "epoch": 17.053017944535075, "grad_norm": 0.48378005623817444, "learning_rate": 3.2349764439114334e-06, "loss": 0.0196, "num_input_tokens_seen": 225571648, "step": 104535 }, { "epoch": 17.053833605220227, "grad_norm": 0.8633857369422913, "learning_rate": 3.2332256769987274e-06, "loss": 0.0626, "num_input_tokens_seen": 225580896, "step": 104540 }, { "epoch": 17.054649265905383, "grad_norm": 0.07739108055830002, "learning_rate": 3.231475351217317e-06, "loss": 0.0519, "num_input_tokens_seen": 225593600, "step": 104545 }, { "epoch": 17.05546492659054, "grad_norm": 0.723942756652832, "learning_rate": 3.2297254666026733e-06, "loss": 0.1317, "num_input_tokens_seen": 225605344, "step": 104550 }, { "epoch": 17.056280587275694, "grad_norm": 0.0880478173494339, "learning_rate": 3.2279760231902574e-06, "loss": 0.0648, "num_input_tokens_seen": 225615808, "step": 104555 }, { "epoch": 17.05709624796085, "grad_norm": 0.11661991477012634, "learning_rate": 3.2262270210155293e-06, "loss": 0.2224, "num_input_tokens_seen": 225626496, "step": 104560 }, { "epoch": 17.057911908646002, "grad_norm": 1.5462366342544556, "learning_rate": 3.224478460113933e-06, "loss": 0.1744, "num_input_tokens_seen": 225636704, "step": 104565 }, { "epoch": 17.058727569331158, "grad_norm": 0.22246934473514557, "learning_rate": 3.2227303405209075e-06, "loss": 0.0126, "num_input_tokens_seen": 225647584, "step": 104570 }, { "epoch": 17.059543230016313, "grad_norm": 0.650396466255188, "learning_rate": 3.220982662271868e-06, "loss": 0.0366, "num_input_tokens_seen": 225658592, "step": 104575 }, { "epoch": 17.06035889070147, "grad_norm": 1.9374101161956787, "learning_rate": 3.219235425402256e-06, "loss": 0.0882, "num_input_tokens_seen": 225670336, "step": 104580 }, { "epoch": 17.061174551386625, "grad_norm": 0.11889788508415222, "learning_rate": 3.21748862994746e-06, "loss": 0.0218, "num_input_tokens_seen": 225679872, "step": 104585 }, { "epoch": 17.061990212071777, "grad_norm": 0.17313960194587708, "learning_rate": 3.2157422759428986e-06, "loss": 0.1217, "num_input_tokens_seen": 225692288, "step": 104590 }, { "epoch": 17.062805872756933, "grad_norm": 0.05035679414868355, "learning_rate": 3.213996363423946e-06, "loss": 0.0608, "num_input_tokens_seen": 225704032, "step": 104595 }, { "epoch": 17.063621533442088, "grad_norm": 0.20159806311130524, "learning_rate": 3.2122508924260024e-06, "loss": 0.0482, "num_input_tokens_seen": 225713536, "step": 104600 }, { "epoch": 17.064437194127244, "grad_norm": 3.0283453464508057, "learning_rate": 3.210505862984428e-06, "loss": 0.3309, "num_input_tokens_seen": 225723648, "step": 104605 }, { "epoch": 17.0652528548124, "grad_norm": 2.26020884513855, "learning_rate": 3.2087612751346002e-06, "loss": 0.0624, "num_input_tokens_seen": 225734048, "step": 104610 }, { "epoch": 17.06606851549755, "grad_norm": 0.13769719004631042, "learning_rate": 3.207017128911863e-06, "loss": 0.0471, "num_input_tokens_seen": 225744960, "step": 104615 }, { "epoch": 17.066884176182707, "grad_norm": 0.4726927876472473, "learning_rate": 3.2052734243515765e-06, "loss": 0.152, "num_input_tokens_seen": 225756000, "step": 104620 }, { "epoch": 17.067699836867863, "grad_norm": 0.1432536393404007, "learning_rate": 3.2035301614890658e-06, "loss": 0.3047, "num_input_tokens_seen": 225765984, "step": 104625 }, { "epoch": 17.06851549755302, "grad_norm": 0.1271674782037735, "learning_rate": 3.2017873403596716e-06, "loss": 0.0638, "num_input_tokens_seen": 225776928, "step": 104630 }, { "epoch": 17.069331158238175, "grad_norm": 0.17564240097999573, "learning_rate": 3.2000449609987076e-06, "loss": 0.0132, "num_input_tokens_seen": 225788768, "step": 104635 }, { "epoch": 17.070146818923327, "grad_norm": 0.06395164132118225, "learning_rate": 3.19830302344149e-06, "loss": 0.0274, "num_input_tokens_seen": 225800672, "step": 104640 }, { "epoch": 17.070962479608482, "grad_norm": 0.03586623817682266, "learning_rate": 3.1965615277233186e-06, "loss": 0.0744, "num_input_tokens_seen": 225810048, "step": 104645 }, { "epoch": 17.071778140293638, "grad_norm": 1.8842923641204834, "learning_rate": 3.194820473879487e-06, "loss": 0.1433, "num_input_tokens_seen": 225821792, "step": 104650 }, { "epoch": 17.072593800978794, "grad_norm": 0.48349958658218384, "learning_rate": 3.193079861945281e-06, "loss": 0.0808, "num_input_tokens_seen": 225832000, "step": 104655 }, { "epoch": 17.07340946166395, "grad_norm": 1.425234317779541, "learning_rate": 3.1913396919559744e-06, "loss": 0.1866, "num_input_tokens_seen": 225842752, "step": 104660 }, { "epoch": 17.0742251223491, "grad_norm": 1.795129418373108, "learning_rate": 3.189599963946835e-06, "loss": 0.1525, "num_input_tokens_seen": 225854176, "step": 104665 }, { "epoch": 17.075040783034257, "grad_norm": 0.20003394782543182, "learning_rate": 3.187860677953122e-06, "loss": 0.052, "num_input_tokens_seen": 225865120, "step": 104670 }, { "epoch": 17.075856443719413, "grad_norm": 0.02278289943933487, "learning_rate": 3.1861218340100836e-06, "loss": 0.02, "num_input_tokens_seen": 225877440, "step": 104675 }, { "epoch": 17.07667210440457, "grad_norm": 0.18998394906520844, "learning_rate": 3.1843834321529568e-06, "loss": 0.0122, "num_input_tokens_seen": 225888512, "step": 104680 }, { "epoch": 17.07748776508972, "grad_norm": 0.06677855551242828, "learning_rate": 3.1826454724169757e-06, "loss": 0.1809, "num_input_tokens_seen": 225898048, "step": 104685 }, { "epoch": 17.078303425774877, "grad_norm": 2.4067418575286865, "learning_rate": 3.1809079548373617e-06, "loss": 0.2208, "num_input_tokens_seen": 225909056, "step": 104690 }, { "epoch": 17.079119086460032, "grad_norm": 0.3475218713283539, "learning_rate": 3.179170879449328e-06, "loss": 0.0361, "num_input_tokens_seen": 225920512, "step": 104695 }, { "epoch": 17.079934747145188, "grad_norm": 0.03456604480743408, "learning_rate": 3.1774342462880774e-06, "loss": 0.0931, "num_input_tokens_seen": 225930976, "step": 104700 }, { "epoch": 17.080750407830344, "grad_norm": 0.0849318727850914, "learning_rate": 3.1756980553888065e-06, "loss": 0.0173, "num_input_tokens_seen": 225941952, "step": 104705 }, { "epoch": 17.081566068515496, "grad_norm": 0.09589874744415283, "learning_rate": 3.173962306786701e-06, "loss": 0.1154, "num_input_tokens_seen": 225953984, "step": 104710 }, { "epoch": 17.08238172920065, "grad_norm": 0.03183881193399429, "learning_rate": 3.172227000516936e-06, "loss": 0.0425, "num_input_tokens_seen": 225963840, "step": 104715 }, { "epoch": 17.083197389885807, "grad_norm": 0.3372040390968323, "learning_rate": 3.1704921366146777e-06, "loss": 0.087, "num_input_tokens_seen": 225973120, "step": 104720 }, { "epoch": 17.084013050570963, "grad_norm": 0.20034641027450562, "learning_rate": 3.1687577151150975e-06, "loss": 0.0587, "num_input_tokens_seen": 225983168, "step": 104725 }, { "epoch": 17.08482871125612, "grad_norm": 0.35781243443489075, "learning_rate": 3.167023736053329e-06, "loss": 0.1308, "num_input_tokens_seen": 225994240, "step": 104730 }, { "epoch": 17.08564437194127, "grad_norm": 1.3729008436203003, "learning_rate": 3.165290199464524e-06, "loss": 0.0991, "num_input_tokens_seen": 226005344, "step": 104735 }, { "epoch": 17.086460032626427, "grad_norm": 0.038033634424209595, "learning_rate": 3.163557105383816e-06, "loss": 0.0399, "num_input_tokens_seen": 226016416, "step": 104740 }, { "epoch": 17.087275693311582, "grad_norm": 1.4161126613616943, "learning_rate": 3.161824453846321e-06, "loss": 0.1223, "num_input_tokens_seen": 226026496, "step": 104745 }, { "epoch": 17.088091353996738, "grad_norm": 0.07618477940559387, "learning_rate": 3.1600922448871584e-06, "loss": 0.0103, "num_input_tokens_seen": 226036480, "step": 104750 }, { "epoch": 17.088907014681894, "grad_norm": 0.2725682556629181, "learning_rate": 3.1583604785414333e-06, "loss": 0.0386, "num_input_tokens_seen": 226048608, "step": 104755 }, { "epoch": 17.089722675367046, "grad_norm": 0.05400218814611435, "learning_rate": 3.15662915484424e-06, "loss": 0.0057, "num_input_tokens_seen": 226059808, "step": 104760 }, { "epoch": 17.0905383360522, "grad_norm": 0.2108258605003357, "learning_rate": 3.15489827383067e-06, "loss": 0.0259, "num_input_tokens_seen": 226071168, "step": 104765 }, { "epoch": 17.091353996737357, "grad_norm": 0.07347305119037628, "learning_rate": 3.1531678355357945e-06, "loss": 0.1828, "num_input_tokens_seen": 226081472, "step": 104770 }, { "epoch": 17.092169657422513, "grad_norm": 0.6096987724304199, "learning_rate": 3.151437839994692e-06, "loss": 0.0154, "num_input_tokens_seen": 226092768, "step": 104775 }, { "epoch": 17.09298531810767, "grad_norm": 0.8978290557861328, "learning_rate": 3.1497082872424143e-06, "loss": 0.0671, "num_input_tokens_seen": 226103840, "step": 104780 }, { "epoch": 17.09380097879282, "grad_norm": 0.07326068729162216, "learning_rate": 3.14797917731402e-06, "loss": 0.0319, "num_input_tokens_seen": 226115168, "step": 104785 }, { "epoch": 17.094616639477977, "grad_norm": 0.4021015763282776, "learning_rate": 3.14625051024455e-06, "loss": 0.0169, "num_input_tokens_seen": 226127328, "step": 104790 }, { "epoch": 17.095432300163132, "grad_norm": 0.6665942072868347, "learning_rate": 3.1445222860690348e-06, "loss": 0.0445, "num_input_tokens_seen": 226137696, "step": 104795 }, { "epoch": 17.096247960848288, "grad_norm": 0.08723597973585129, "learning_rate": 3.1427945048224992e-06, "loss": 0.1133, "num_input_tokens_seen": 226147552, "step": 104800 }, { "epoch": 17.097063621533444, "grad_norm": 0.05000467598438263, "learning_rate": 3.1410671665399627e-06, "loss": 0.0531, "num_input_tokens_seen": 226158304, "step": 104805 }, { "epoch": 17.097879282218596, "grad_norm": 1.4874045848846436, "learning_rate": 3.139340271256433e-06, "loss": 0.1414, "num_input_tokens_seen": 226168896, "step": 104810 }, { "epoch": 17.09869494290375, "grad_norm": 0.06893258541822433, "learning_rate": 3.1376138190069017e-06, "loss": 0.1815, "num_input_tokens_seen": 226178720, "step": 104815 }, { "epoch": 17.099510603588907, "grad_norm": 0.037636011838912964, "learning_rate": 3.13588780982636e-06, "loss": 0.0256, "num_input_tokens_seen": 226189888, "step": 104820 }, { "epoch": 17.100326264274063, "grad_norm": 0.16394446790218353, "learning_rate": 3.1341622437497914e-06, "loss": 0.1692, "num_input_tokens_seen": 226201280, "step": 104825 }, { "epoch": 17.10114192495922, "grad_norm": 0.6228009462356567, "learning_rate": 3.132437120812162e-06, "loss": 0.0328, "num_input_tokens_seen": 226211488, "step": 104830 }, { "epoch": 17.10195758564437, "grad_norm": 1.0261412858963013, "learning_rate": 3.1307124410484334e-06, "loss": 0.3185, "num_input_tokens_seen": 226222752, "step": 104835 }, { "epoch": 17.102773246329527, "grad_norm": 1.500133991241455, "learning_rate": 3.1289882044935576e-06, "loss": 0.1991, "num_input_tokens_seen": 226231872, "step": 104840 }, { "epoch": 17.103588907014682, "grad_norm": 0.08979768306016922, "learning_rate": 3.1272644111824927e-06, "loss": 0.2415, "num_input_tokens_seen": 226242272, "step": 104845 }, { "epoch": 17.104404567699838, "grad_norm": 0.10231830924749374, "learning_rate": 3.1255410611501503e-06, "loss": 0.0863, "num_input_tokens_seen": 226253952, "step": 104850 }, { "epoch": 17.10522022838499, "grad_norm": 0.7490396499633789, "learning_rate": 3.1238181544314765e-06, "loss": 0.0292, "num_input_tokens_seen": 226264640, "step": 104855 }, { "epoch": 17.106035889070146, "grad_norm": 0.18511487543582916, "learning_rate": 3.122095691061372e-06, "loss": 0.2322, "num_input_tokens_seen": 226274688, "step": 104860 }, { "epoch": 17.1068515497553, "grad_norm": 0.343268483877182, "learning_rate": 3.1203736710747554e-06, "loss": 0.0233, "num_input_tokens_seen": 226286432, "step": 104865 }, { "epoch": 17.107667210440457, "grad_norm": 0.06071719154715538, "learning_rate": 3.1186520945065244e-06, "loss": 0.0503, "num_input_tokens_seen": 226295456, "step": 104870 }, { "epoch": 17.108482871125613, "grad_norm": 1.799775242805481, "learning_rate": 3.1169309613915674e-06, "loss": 0.059, "num_input_tokens_seen": 226304064, "step": 104875 }, { "epoch": 17.109298531810765, "grad_norm": 0.2800445854663849, "learning_rate": 3.1152102717647623e-06, "loss": 0.1169, "num_input_tokens_seen": 226314944, "step": 104880 }, { "epoch": 17.11011419249592, "grad_norm": 0.04063693806529045, "learning_rate": 3.113490025660987e-06, "loss": 0.0105, "num_input_tokens_seen": 226325856, "step": 104885 }, { "epoch": 17.110929853181077, "grad_norm": 0.49974918365478516, "learning_rate": 3.1117702231151013e-06, "loss": 0.1026, "num_input_tokens_seen": 226337248, "step": 104890 }, { "epoch": 17.111745513866232, "grad_norm": 0.6797182559967041, "learning_rate": 3.1100508641619562e-06, "loss": 0.0434, "num_input_tokens_seen": 226346400, "step": 104895 }, { "epoch": 17.112561174551388, "grad_norm": 0.12141779065132141, "learning_rate": 3.1083319488364016e-06, "loss": 0.1338, "num_input_tokens_seen": 226357088, "step": 104900 }, { "epoch": 17.11337683523654, "grad_norm": 0.49803048372268677, "learning_rate": 3.106613477173273e-06, "loss": 0.0248, "num_input_tokens_seen": 226367488, "step": 104905 }, { "epoch": 17.114192495921696, "grad_norm": 0.05777202546596527, "learning_rate": 3.1048954492073927e-06, "loss": 0.1163, "num_input_tokens_seen": 226378080, "step": 104910 }, { "epoch": 17.11500815660685, "grad_norm": 0.21029634773731232, "learning_rate": 3.103177864973586e-06, "loss": 0.0964, "num_input_tokens_seen": 226388736, "step": 104915 }, { "epoch": 17.115823817292007, "grad_norm": 0.043306320905685425, "learning_rate": 3.101460724506655e-06, "loss": 0.1015, "num_input_tokens_seen": 226400640, "step": 104920 }, { "epoch": 17.116639477977163, "grad_norm": 0.05829748511314392, "learning_rate": 3.099744027841403e-06, "loss": 0.2164, "num_input_tokens_seen": 226411040, "step": 104925 }, { "epoch": 17.117455138662315, "grad_norm": 0.43124642968177795, "learning_rate": 3.0980277750126215e-06, "loss": 0.084, "num_input_tokens_seen": 226421408, "step": 104930 }, { "epoch": 17.11827079934747, "grad_norm": 1.2772403955459595, "learning_rate": 3.096311966055093e-06, "loss": 0.094, "num_input_tokens_seen": 226432736, "step": 104935 }, { "epoch": 17.119086460032626, "grad_norm": 0.08013466745615005, "learning_rate": 3.0945966010035883e-06, "loss": 0.0308, "num_input_tokens_seen": 226443648, "step": 104940 }, { "epoch": 17.119902120717782, "grad_norm": 3.0902884006500244, "learning_rate": 3.0928816798928644e-06, "loss": 0.3017, "num_input_tokens_seen": 226454880, "step": 104945 }, { "epoch": 17.120717781402938, "grad_norm": 0.32211264967918396, "learning_rate": 3.0911672027576997e-06, "loss": 0.0098, "num_input_tokens_seen": 226465920, "step": 104950 }, { "epoch": 17.12153344208809, "grad_norm": 0.038153380155563354, "learning_rate": 3.089453169632811e-06, "loss": 0.0184, "num_input_tokens_seen": 226476832, "step": 104955 }, { "epoch": 17.122349102773246, "grad_norm": 0.801609456539154, "learning_rate": 3.087739580552962e-06, "loss": 0.1432, "num_input_tokens_seen": 226486464, "step": 104960 }, { "epoch": 17.1231647634584, "grad_norm": 0.6894977688789368, "learning_rate": 3.086026435552855e-06, "loss": 0.1982, "num_input_tokens_seen": 226497024, "step": 104965 }, { "epoch": 17.123980424143557, "grad_norm": 1.001624584197998, "learning_rate": 3.0843137346672353e-06, "loss": 0.2664, "num_input_tokens_seen": 226508320, "step": 104970 }, { "epoch": 17.124796084828713, "grad_norm": 1.4695857763290405, "learning_rate": 3.082601477930788e-06, "loss": 0.2099, "num_input_tokens_seen": 226518208, "step": 104975 }, { "epoch": 17.125611745513865, "grad_norm": 0.8952361941337585, "learning_rate": 3.0808896653782367e-06, "loss": 0.1461, "num_input_tokens_seen": 226529472, "step": 104980 }, { "epoch": 17.12642740619902, "grad_norm": 1.4364680051803589, "learning_rate": 3.0791782970442524e-06, "loss": 0.0442, "num_input_tokens_seen": 226539648, "step": 104985 }, { "epoch": 17.127243066884176, "grad_norm": 0.09398636221885681, "learning_rate": 3.0774673729635385e-06, "loss": 0.0268, "num_input_tokens_seen": 226549760, "step": 104990 }, { "epoch": 17.128058727569332, "grad_norm": 2.616753101348877, "learning_rate": 3.0757568931707475e-06, "loss": 0.2095, "num_input_tokens_seen": 226559264, "step": 104995 }, { "epoch": 17.128874388254488, "grad_norm": 0.0583377406001091, "learning_rate": 3.074046857700569e-06, "loss": 0.1465, "num_input_tokens_seen": 226568992, "step": 105000 }, { "epoch": 17.12969004893964, "grad_norm": 0.8550251722335815, "learning_rate": 3.0723372665876326e-06, "loss": 0.1542, "num_input_tokens_seen": 226580064, "step": 105005 }, { "epoch": 17.130505709624796, "grad_norm": 0.15075556933879852, "learning_rate": 3.070628119866606e-06, "loss": 0.0598, "num_input_tokens_seen": 226591904, "step": 105010 }, { "epoch": 17.13132137030995, "grad_norm": 0.7476267218589783, "learning_rate": 3.068919417572119e-06, "loss": 0.1121, "num_input_tokens_seen": 226601600, "step": 105015 }, { "epoch": 17.132137030995107, "grad_norm": 0.4297618567943573, "learning_rate": 3.0672111597388003e-06, "loss": 0.0238, "num_input_tokens_seen": 226612448, "step": 105020 }, { "epoch": 17.13295269168026, "grad_norm": 0.6377849578857422, "learning_rate": 3.0655033464012744e-06, "loss": 0.0758, "num_input_tokens_seen": 226623072, "step": 105025 }, { "epoch": 17.133768352365415, "grad_norm": 0.0697440579533577, "learning_rate": 3.0637959775941474e-06, "loss": 0.3723, "num_input_tokens_seen": 226633856, "step": 105030 }, { "epoch": 17.13458401305057, "grad_norm": 0.04279007390141487, "learning_rate": 3.0620890533520213e-06, "loss": 0.0296, "num_input_tokens_seen": 226643552, "step": 105035 }, { "epoch": 17.135399673735726, "grad_norm": 0.13063320517539978, "learning_rate": 3.0603825737094944e-06, "loss": 0.2548, "num_input_tokens_seen": 226654112, "step": 105040 }, { "epoch": 17.136215334420882, "grad_norm": 0.10880828648805618, "learning_rate": 3.058676538701144e-06, "loss": 0.0124, "num_input_tokens_seen": 226664320, "step": 105045 }, { "epoch": 17.137030995106034, "grad_norm": 0.06903203576803207, "learning_rate": 3.056970948361548e-06, "loss": 0.0196, "num_input_tokens_seen": 226674848, "step": 105050 }, { "epoch": 17.13784665579119, "grad_norm": 0.14079399406909943, "learning_rate": 3.055265802725274e-06, "loss": 0.0559, "num_input_tokens_seen": 226685056, "step": 105055 }, { "epoch": 17.138662316476346, "grad_norm": 0.07845456898212433, "learning_rate": 3.053561101826871e-06, "loss": 0.0096, "num_input_tokens_seen": 226696192, "step": 105060 }, { "epoch": 17.1394779771615, "grad_norm": 2.236046314239502, "learning_rate": 3.0518568457009066e-06, "loss": 0.0958, "num_input_tokens_seen": 226707040, "step": 105065 }, { "epoch": 17.140293637846657, "grad_norm": 0.16033703088760376, "learning_rate": 3.0501530343818917e-06, "loss": 0.1047, "num_input_tokens_seen": 226717824, "step": 105070 }, { "epoch": 17.14110929853181, "grad_norm": 0.11476179212331772, "learning_rate": 3.048449667904385e-06, "loss": 0.0209, "num_input_tokens_seen": 226728224, "step": 105075 }, { "epoch": 17.141924959216965, "grad_norm": 2.0861659049987793, "learning_rate": 3.046746746302881e-06, "loss": 0.0928, "num_input_tokens_seen": 226738304, "step": 105080 }, { "epoch": 17.14274061990212, "grad_norm": 0.768875241279602, "learning_rate": 3.0450442696119157e-06, "loss": 0.1562, "num_input_tokens_seen": 226747296, "step": 105085 }, { "epoch": 17.143556280587276, "grad_norm": 1.7416667938232422, "learning_rate": 3.0433422378659675e-06, "loss": 0.1252, "num_input_tokens_seen": 226758944, "step": 105090 }, { "epoch": 17.144371941272432, "grad_norm": 0.8303448557853699, "learning_rate": 3.041640651099556e-06, "loss": 0.046, "num_input_tokens_seen": 226769472, "step": 105095 }, { "epoch": 17.145187601957584, "grad_norm": 0.9441450834274292, "learning_rate": 3.0399395093471395e-06, "loss": 0.0758, "num_input_tokens_seen": 226779488, "step": 105100 }, { "epoch": 17.14600326264274, "grad_norm": 1.1740365028381348, "learning_rate": 3.0382388126432183e-06, "loss": 0.0578, "num_input_tokens_seen": 226790272, "step": 105105 }, { "epoch": 17.146818923327896, "grad_norm": 1.5521475076675415, "learning_rate": 3.0365385610222398e-06, "loss": 0.1221, "num_input_tokens_seen": 226800960, "step": 105110 }, { "epoch": 17.14763458401305, "grad_norm": 0.03675559163093567, "learning_rate": 3.034838754518679e-06, "loss": 0.0911, "num_input_tokens_seen": 226812064, "step": 105115 }, { "epoch": 17.148450244698207, "grad_norm": 0.5540518760681152, "learning_rate": 3.0331393931669645e-06, "loss": 0.0153, "num_input_tokens_seen": 226822080, "step": 105120 }, { "epoch": 17.14926590538336, "grad_norm": 0.4179382622241974, "learning_rate": 3.031440477001557e-06, "loss": 0.1043, "num_input_tokens_seen": 226832736, "step": 105125 }, { "epoch": 17.150081566068515, "grad_norm": 0.34758275747299194, "learning_rate": 3.029742006056868e-06, "loss": 0.2065, "num_input_tokens_seen": 226844160, "step": 105130 }, { "epoch": 17.15089722675367, "grad_norm": 0.05988838151097298, "learning_rate": 3.02804398036734e-06, "loss": 0.0117, "num_input_tokens_seen": 226853440, "step": 105135 }, { "epoch": 17.151712887438826, "grad_norm": 0.20430831611156464, "learning_rate": 3.026346399967361e-06, "loss": 0.0409, "num_input_tokens_seen": 226864416, "step": 105140 }, { "epoch": 17.152528548123982, "grad_norm": 0.06997991353273392, "learning_rate": 3.0246492648913605e-06, "loss": 0.0281, "num_input_tokens_seen": 226875520, "step": 105145 }, { "epoch": 17.153344208809134, "grad_norm": 1.2640039920806885, "learning_rate": 3.0229525751737096e-06, "loss": 0.1447, "num_input_tokens_seen": 226886944, "step": 105150 }, { "epoch": 17.15415986949429, "grad_norm": 1.386778712272644, "learning_rate": 3.0212563308488096e-06, "loss": 0.0324, "num_input_tokens_seen": 226897088, "step": 105155 }, { "epoch": 17.154975530179446, "grad_norm": 1.981305480003357, "learning_rate": 3.0195605319510324e-06, "loss": 0.3002, "num_input_tokens_seen": 226908288, "step": 105160 }, { "epoch": 17.1557911908646, "grad_norm": 1.4868444204330444, "learning_rate": 3.017865178514745e-06, "loss": 0.1793, "num_input_tokens_seen": 226918752, "step": 105165 }, { "epoch": 17.156606851549757, "grad_norm": 0.021266940981149673, "learning_rate": 3.016170270574306e-06, "loss": 0.0286, "num_input_tokens_seen": 226929952, "step": 105170 }, { "epoch": 17.15742251223491, "grad_norm": 0.18350163102149963, "learning_rate": 3.0144758081640635e-06, "loss": 0.0883, "num_input_tokens_seen": 226941216, "step": 105175 }, { "epoch": 17.158238172920065, "grad_norm": 0.1914895921945572, "learning_rate": 3.0127817913183615e-06, "loss": 0.1402, "num_input_tokens_seen": 226952032, "step": 105180 }, { "epoch": 17.15905383360522, "grad_norm": 1.238202452659607, "learning_rate": 3.0110882200715284e-06, "loss": 0.0646, "num_input_tokens_seen": 226963808, "step": 105185 }, { "epoch": 17.159869494290376, "grad_norm": 1.2268669605255127, "learning_rate": 3.009395094457887e-06, "loss": 0.0589, "num_input_tokens_seen": 226974208, "step": 105190 }, { "epoch": 17.160685154975532, "grad_norm": 0.678671658039093, "learning_rate": 3.007702414511751e-06, "loss": 0.0566, "num_input_tokens_seen": 226985888, "step": 105195 }, { "epoch": 17.161500815660684, "grad_norm": 0.4716865122318268, "learning_rate": 3.0060101802674267e-06, "loss": 0.1733, "num_input_tokens_seen": 226998048, "step": 105200 }, { "epoch": 17.16231647634584, "grad_norm": 0.5562509894371033, "learning_rate": 3.0043183917592004e-06, "loss": 0.0566, "num_input_tokens_seen": 227008672, "step": 105205 }, { "epoch": 17.163132137030995, "grad_norm": 0.03347703069448471, "learning_rate": 3.0026270490213753e-06, "loss": 0.0308, "num_input_tokens_seen": 227020864, "step": 105210 }, { "epoch": 17.16394779771615, "grad_norm": 0.24493414163589478, "learning_rate": 3.0009361520882102e-06, "loss": 0.183, "num_input_tokens_seen": 227032256, "step": 105215 }, { "epoch": 17.164763458401303, "grad_norm": 0.9709342122077942, "learning_rate": 2.9992457009939912e-06, "loss": 0.092, "num_input_tokens_seen": 227042048, "step": 105220 }, { "epoch": 17.16557911908646, "grad_norm": 0.04054206982254982, "learning_rate": 2.9975556957729576e-06, "loss": 0.2269, "num_input_tokens_seen": 227052448, "step": 105225 }, { "epoch": 17.166394779771615, "grad_norm": 0.06016046553850174, "learning_rate": 2.99586613645938e-06, "loss": 0.0097, "num_input_tokens_seen": 227062624, "step": 105230 }, { "epoch": 17.16721044045677, "grad_norm": 0.5233718156814575, "learning_rate": 2.99417702308748e-06, "loss": 0.0774, "num_input_tokens_seen": 227074016, "step": 105235 }, { "epoch": 17.168026101141926, "grad_norm": 0.028354279696941376, "learning_rate": 2.9924883556915083e-06, "loss": 0.1262, "num_input_tokens_seen": 227085120, "step": 105240 }, { "epoch": 17.16884176182708, "grad_norm": 0.28410640358924866, "learning_rate": 2.9908001343056684e-06, "loss": 0.0104, "num_input_tokens_seen": 227095296, "step": 105245 }, { "epoch": 17.169657422512234, "grad_norm": 0.2461778223514557, "learning_rate": 2.9891123589641937e-06, "loss": 0.0837, "num_input_tokens_seen": 227105792, "step": 105250 }, { "epoch": 17.17047308319739, "grad_norm": 0.25370314717292786, "learning_rate": 2.9874250297012713e-06, "loss": 0.0446, "num_input_tokens_seen": 227117600, "step": 105255 }, { "epoch": 17.171288743882545, "grad_norm": 1.1997336149215698, "learning_rate": 2.9857381465511145e-06, "loss": 0.0258, "num_input_tokens_seen": 227129376, "step": 105260 }, { "epoch": 17.1721044045677, "grad_norm": 1.054317593574524, "learning_rate": 2.9840517095478944e-06, "loss": 0.1185, "num_input_tokens_seen": 227139360, "step": 105265 }, { "epoch": 17.172920065252853, "grad_norm": 0.04188604652881622, "learning_rate": 2.9823657187257962e-06, "loss": 0.0584, "num_input_tokens_seen": 227150080, "step": 105270 }, { "epoch": 17.17373572593801, "grad_norm": 0.12288374453783035, "learning_rate": 2.980680174118991e-06, "loss": 0.012, "num_input_tokens_seen": 227159808, "step": 105275 }, { "epoch": 17.174551386623165, "grad_norm": 0.08069982379674911, "learning_rate": 2.978995075761634e-06, "loss": 0.0606, "num_input_tokens_seen": 227170336, "step": 105280 }, { "epoch": 17.17536704730832, "grad_norm": 0.11248137801885605, "learning_rate": 2.977310423687879e-06, "loss": 0.0205, "num_input_tokens_seen": 227180928, "step": 105285 }, { "epoch": 17.176182707993476, "grad_norm": 0.23546132445335388, "learning_rate": 2.975626217931865e-06, "loss": 0.061, "num_input_tokens_seen": 227192128, "step": 105290 }, { "epoch": 17.17699836867863, "grad_norm": 0.42461666464805603, "learning_rate": 2.9739424585277258e-06, "loss": 0.091, "num_input_tokens_seen": 227203552, "step": 105295 }, { "epoch": 17.177814029363784, "grad_norm": 0.057479217648506165, "learning_rate": 2.9722591455095846e-06, "loss": 0.0641, "num_input_tokens_seen": 227213344, "step": 105300 }, { "epoch": 17.17862969004894, "grad_norm": 0.2017117291688919, "learning_rate": 2.970576278911555e-06, "loss": 0.2478, "num_input_tokens_seen": 227224128, "step": 105305 }, { "epoch": 17.179445350734095, "grad_norm": 0.10231773555278778, "learning_rate": 2.9688938587677435e-06, "loss": 0.1121, "num_input_tokens_seen": 227233856, "step": 105310 }, { "epoch": 17.18026101141925, "grad_norm": 1.543936014175415, "learning_rate": 2.9672118851122475e-06, "loss": 0.1994, "num_input_tokens_seen": 227243616, "step": 105315 }, { "epoch": 17.181076672104403, "grad_norm": 0.3795207738876343, "learning_rate": 2.9655303579791546e-06, "loss": 0.0553, "num_input_tokens_seen": 227255840, "step": 105320 }, { "epoch": 17.18189233278956, "grad_norm": 0.2867615520954132, "learning_rate": 2.963849277402539e-06, "loss": 0.0248, "num_input_tokens_seen": 227266688, "step": 105325 }, { "epoch": 17.182707993474715, "grad_norm": 0.02021077461540699, "learning_rate": 2.9621686434164743e-06, "loss": 0.0711, "num_input_tokens_seen": 227278016, "step": 105330 }, { "epoch": 17.18352365415987, "grad_norm": 0.034209493547677994, "learning_rate": 2.960488456055019e-06, "loss": 0.1152, "num_input_tokens_seen": 227287776, "step": 105335 }, { "epoch": 17.184339314845026, "grad_norm": 0.050803784281015396, "learning_rate": 2.9588087153522237e-06, "loss": 0.0125, "num_input_tokens_seen": 227298208, "step": 105340 }, { "epoch": 17.18515497553018, "grad_norm": 2.1951868534088135, "learning_rate": 2.957129421342131e-06, "loss": 0.0694, "num_input_tokens_seen": 227308896, "step": 105345 }, { "epoch": 17.185970636215334, "grad_norm": 0.37475457787513733, "learning_rate": 2.9554505740587684e-06, "loss": 0.1476, "num_input_tokens_seen": 227318976, "step": 105350 }, { "epoch": 17.18678629690049, "grad_norm": 0.46998804807662964, "learning_rate": 2.9537721735361763e-06, "loss": 0.0793, "num_input_tokens_seen": 227330624, "step": 105355 }, { "epoch": 17.187601957585645, "grad_norm": 0.03423944115638733, "learning_rate": 2.9520942198083464e-06, "loss": 0.0139, "num_input_tokens_seen": 227341600, "step": 105360 }, { "epoch": 17.1884176182708, "grad_norm": 0.0511467382311821, "learning_rate": 2.9504167129093096e-06, "loss": 0.0037, "num_input_tokens_seen": 227351296, "step": 105365 }, { "epoch": 17.189233278955953, "grad_norm": 0.26922622323036194, "learning_rate": 2.9487396528730395e-06, "loss": 0.0106, "num_input_tokens_seen": 227362208, "step": 105370 }, { "epoch": 17.19004893964111, "grad_norm": 1.2907650470733643, "learning_rate": 2.9470630397335386e-06, "loss": 0.3002, "num_input_tokens_seen": 227372800, "step": 105375 }, { "epoch": 17.190864600326265, "grad_norm": 0.5607628226280212, "learning_rate": 2.945386873524783e-06, "loss": 0.1576, "num_input_tokens_seen": 227382944, "step": 105380 }, { "epoch": 17.19168026101142, "grad_norm": 0.3888920545578003, "learning_rate": 2.94371115428074e-06, "loss": 0.1514, "num_input_tokens_seen": 227392768, "step": 105385 }, { "epoch": 17.192495921696572, "grad_norm": 0.10394212603569031, "learning_rate": 2.942035882035368e-06, "loss": 0.1186, "num_input_tokens_seen": 227403200, "step": 105390 }, { "epoch": 17.193311582381728, "grad_norm": 0.6466279029846191, "learning_rate": 2.940361056822624e-06, "loss": 0.0578, "num_input_tokens_seen": 227415072, "step": 105395 }, { "epoch": 17.194127243066884, "grad_norm": 0.12145952880382538, "learning_rate": 2.9386866786764474e-06, "loss": 0.1094, "num_input_tokens_seen": 227425280, "step": 105400 }, { "epoch": 17.19494290375204, "grad_norm": 0.07946932315826416, "learning_rate": 2.937012747630774e-06, "loss": 0.0401, "num_input_tokens_seen": 227436320, "step": 105405 }, { "epoch": 17.195758564437195, "grad_norm": 0.40387317538261414, "learning_rate": 2.9353392637195247e-06, "loss": 0.0268, "num_input_tokens_seen": 227447008, "step": 105410 }, { "epoch": 17.196574225122347, "grad_norm": 0.27967965602874756, "learning_rate": 2.9336662269766142e-06, "loss": 0.1046, "num_input_tokens_seen": 227457664, "step": 105415 }, { "epoch": 17.197389885807503, "grad_norm": 0.625034749507904, "learning_rate": 2.9319936374359536e-06, "loss": 0.0526, "num_input_tokens_seen": 227469216, "step": 105420 }, { "epoch": 17.19820554649266, "grad_norm": 1.5390397310256958, "learning_rate": 2.9303214951314357e-06, "loss": 0.1347, "num_input_tokens_seen": 227482208, "step": 105425 }, { "epoch": 17.199021207177815, "grad_norm": 0.08960211277008057, "learning_rate": 2.9286498000969497e-06, "loss": 0.0565, "num_input_tokens_seen": 227493056, "step": 105430 }, { "epoch": 17.19983686786297, "grad_norm": 0.5088279843330383, "learning_rate": 2.9269785523663746e-06, "loss": 0.0172, "num_input_tokens_seen": 227504480, "step": 105435 }, { "epoch": 17.200652528548122, "grad_norm": 0.04198312386870384, "learning_rate": 2.9253077519735827e-06, "loss": 0.0424, "num_input_tokens_seen": 227513024, "step": 105440 }, { "epoch": 17.201468189233278, "grad_norm": 0.2428794801235199, "learning_rate": 2.9236373989524337e-06, "loss": 0.1054, "num_input_tokens_seen": 227524960, "step": 105445 }, { "epoch": 17.202283849918434, "grad_norm": 0.34074923396110535, "learning_rate": 2.9219674933367747e-06, "loss": 0.1358, "num_input_tokens_seen": 227536416, "step": 105450 }, { "epoch": 17.20309951060359, "grad_norm": 0.9475268125534058, "learning_rate": 2.920298035160457e-06, "loss": 0.0449, "num_input_tokens_seen": 227547136, "step": 105455 }, { "epoch": 17.203915171288745, "grad_norm": 0.4686199128627777, "learning_rate": 2.918629024457306e-06, "loss": 0.0783, "num_input_tokens_seen": 227558400, "step": 105460 }, { "epoch": 17.204730831973897, "grad_norm": 0.13954347372055054, "learning_rate": 2.9169604612611507e-06, "loss": 0.0138, "num_input_tokens_seen": 227568960, "step": 105465 }, { "epoch": 17.205546492659053, "grad_norm": 0.15340985357761383, "learning_rate": 2.915292345605808e-06, "loss": 0.2065, "num_input_tokens_seen": 227579264, "step": 105470 }, { "epoch": 17.20636215334421, "grad_norm": 3.1362836360931396, "learning_rate": 2.9136246775250812e-06, "loss": 0.2224, "num_input_tokens_seen": 227590656, "step": 105475 }, { "epoch": 17.207177814029365, "grad_norm": 0.38169431686401367, "learning_rate": 2.9119574570527638e-06, "loss": 0.0764, "num_input_tokens_seen": 227600896, "step": 105480 }, { "epoch": 17.20799347471452, "grad_norm": 1.5460891723632812, "learning_rate": 2.910290684222658e-06, "loss": 0.0423, "num_input_tokens_seen": 227611840, "step": 105485 }, { "epoch": 17.208809135399672, "grad_norm": 1.3284465074539185, "learning_rate": 2.908624359068526e-06, "loss": 0.2748, "num_input_tokens_seen": 227621632, "step": 105490 }, { "epoch": 17.209624796084828, "grad_norm": 0.06620665639638901, "learning_rate": 2.90695848162415e-06, "loss": 0.1986, "num_input_tokens_seen": 227632320, "step": 105495 }, { "epoch": 17.210440456769984, "grad_norm": 0.7239102721214294, "learning_rate": 2.9052930519232886e-06, "loss": 0.0413, "num_input_tokens_seen": 227642784, "step": 105500 }, { "epoch": 17.21125611745514, "grad_norm": 0.08776140958070755, "learning_rate": 2.903628069999692e-06, "loss": 0.1111, "num_input_tokens_seen": 227653024, "step": 105505 }, { "epoch": 17.212071778140295, "grad_norm": 0.3650825619697571, "learning_rate": 2.9019635358871034e-06, "loss": 0.0453, "num_input_tokens_seen": 227663232, "step": 105510 }, { "epoch": 17.212887438825447, "grad_norm": 0.14737272262573242, "learning_rate": 2.9002994496192594e-06, "loss": 0.0248, "num_input_tokens_seen": 227674944, "step": 105515 }, { "epoch": 17.213703099510603, "grad_norm": 0.033565111458301544, "learning_rate": 2.89863581122988e-06, "loss": 0.314, "num_input_tokens_seen": 227685568, "step": 105520 }, { "epoch": 17.21451876019576, "grad_norm": 0.2788412868976593, "learning_rate": 2.896972620752683e-06, "loss": 0.0401, "num_input_tokens_seen": 227696032, "step": 105525 }, { "epoch": 17.215334420880914, "grad_norm": 0.7838461995124817, "learning_rate": 2.8953098782213777e-06, "loss": 0.0734, "num_input_tokens_seen": 227706336, "step": 105530 }, { "epoch": 17.21615008156607, "grad_norm": 2.798325777053833, "learning_rate": 2.8936475836696592e-06, "loss": 0.1084, "num_input_tokens_seen": 227717632, "step": 105535 }, { "epoch": 17.216965742251222, "grad_norm": 0.44316813349723816, "learning_rate": 2.891985737131214e-06, "loss": 0.2113, "num_input_tokens_seen": 227728544, "step": 105540 }, { "epoch": 17.217781402936378, "grad_norm": 0.06554335355758667, "learning_rate": 2.8903243386397272e-06, "loss": 0.1111, "num_input_tokens_seen": 227740192, "step": 105545 }, { "epoch": 17.218597063621534, "grad_norm": 0.7102687954902649, "learning_rate": 2.888663388228863e-06, "loss": 0.1713, "num_input_tokens_seen": 227750816, "step": 105550 }, { "epoch": 17.21941272430669, "grad_norm": 0.08799293637275696, "learning_rate": 2.887002885932286e-06, "loss": 0.2275, "num_input_tokens_seen": 227762784, "step": 105555 }, { "epoch": 17.22022838499184, "grad_norm": 1.9729812145233154, "learning_rate": 2.8853428317836497e-06, "loss": 0.1238, "num_input_tokens_seen": 227772672, "step": 105560 }, { "epoch": 17.221044045676997, "grad_norm": 0.35215047001838684, "learning_rate": 2.883683225816594e-06, "loss": 0.0566, "num_input_tokens_seen": 227782304, "step": 105565 }, { "epoch": 17.221859706362153, "grad_norm": 0.35616716742515564, "learning_rate": 2.8820240680647537e-06, "loss": 0.0133, "num_input_tokens_seen": 227793088, "step": 105570 }, { "epoch": 17.22267536704731, "grad_norm": 2.037567377090454, "learning_rate": 2.880365358561754e-06, "loss": 0.0274, "num_input_tokens_seen": 227804704, "step": 105575 }, { "epoch": 17.223491027732464, "grad_norm": 0.5784807205200195, "learning_rate": 2.8787070973412127e-06, "loss": 0.1285, "num_input_tokens_seen": 227815296, "step": 105580 }, { "epoch": 17.224306688417617, "grad_norm": 1.5937559604644775, "learning_rate": 2.877049284436728e-06, "loss": 0.2708, "num_input_tokens_seen": 227826656, "step": 105585 }, { "epoch": 17.225122349102772, "grad_norm": 0.05627519264817238, "learning_rate": 2.875391919881917e-06, "loss": 0.01, "num_input_tokens_seen": 227836832, "step": 105590 }, { "epoch": 17.225938009787928, "grad_norm": 0.0905749723315239, "learning_rate": 2.8737350037103427e-06, "loss": 0.0202, "num_input_tokens_seen": 227848640, "step": 105595 }, { "epoch": 17.226753670473084, "grad_norm": 0.4244447350502014, "learning_rate": 2.872078535955611e-06, "loss": 0.0557, "num_input_tokens_seen": 227858624, "step": 105600 }, { "epoch": 17.22756933115824, "grad_norm": 0.1361040621995926, "learning_rate": 2.8704225166512667e-06, "loss": 0.0135, "num_input_tokens_seen": 227869792, "step": 105605 }, { "epoch": 17.22838499184339, "grad_norm": 0.12154968082904816, "learning_rate": 2.8687669458308947e-06, "loss": 0.0688, "num_input_tokens_seen": 227879712, "step": 105610 }, { "epoch": 17.229200652528547, "grad_norm": 0.04473944753408432, "learning_rate": 2.8671118235280266e-06, "loss": 0.0403, "num_input_tokens_seen": 227891744, "step": 105615 }, { "epoch": 17.230016313213703, "grad_norm": 1.9707067012786865, "learning_rate": 2.8654571497762238e-06, "loss": 0.2467, "num_input_tokens_seen": 227901664, "step": 105620 }, { "epoch": 17.23083197389886, "grad_norm": 2.206411123275757, "learning_rate": 2.8638029246089994e-06, "loss": 0.0491, "num_input_tokens_seen": 227912160, "step": 105625 }, { "epoch": 17.231647634584014, "grad_norm": 1.6607398986816406, "learning_rate": 2.8621491480598976e-06, "loss": 0.1772, "num_input_tokens_seen": 227920928, "step": 105630 }, { "epoch": 17.232463295269167, "grad_norm": 2.6005828380584717, "learning_rate": 2.8604958201624288e-06, "loss": 0.1375, "num_input_tokens_seen": 227930880, "step": 105635 }, { "epoch": 17.233278955954322, "grad_norm": 1.7177932262420654, "learning_rate": 2.858842940950096e-06, "loss": 0.0757, "num_input_tokens_seen": 227941408, "step": 105640 }, { "epoch": 17.234094616639478, "grad_norm": 0.13752849400043488, "learning_rate": 2.8571905104564004e-06, "loss": 0.173, "num_input_tokens_seen": 227952032, "step": 105645 }, { "epoch": 17.234910277324634, "grad_norm": 0.8234204053878784, "learning_rate": 2.8555385287148293e-06, "loss": 0.0677, "num_input_tokens_seen": 227962912, "step": 105650 }, { "epoch": 17.23572593800979, "grad_norm": 1.9208530187606812, "learning_rate": 2.8538869957588617e-06, "loss": 0.2016, "num_input_tokens_seen": 227973952, "step": 105655 }, { "epoch": 17.23654159869494, "grad_norm": 1.4126720428466797, "learning_rate": 2.8522359116219673e-06, "loss": 0.0882, "num_input_tokens_seen": 227984320, "step": 105660 }, { "epoch": 17.237357259380097, "grad_norm": 0.1855604201555252, "learning_rate": 2.8505852763376085e-06, "loss": 0.0123, "num_input_tokens_seen": 227996128, "step": 105665 }, { "epoch": 17.238172920065253, "grad_norm": 2.1494972705841064, "learning_rate": 2.8489350899392393e-06, "loss": 0.1572, "num_input_tokens_seen": 228006976, "step": 105670 }, { "epoch": 17.23898858075041, "grad_norm": 0.03139667958021164, "learning_rate": 2.8472853524602994e-06, "loss": 0.1688, "num_input_tokens_seen": 228019040, "step": 105675 }, { "epoch": 17.239804241435564, "grad_norm": 0.030039697885513306, "learning_rate": 2.845636063934226e-06, "loss": 0.0285, "num_input_tokens_seen": 228028832, "step": 105680 }, { "epoch": 17.240619902120716, "grad_norm": 0.03487931191921234, "learning_rate": 2.8439872243944425e-06, "loss": 0.0583, "num_input_tokens_seen": 228040288, "step": 105685 }, { "epoch": 17.241435562805872, "grad_norm": 0.40164878964424133, "learning_rate": 2.8423388338743583e-06, "loss": 0.06, "num_input_tokens_seen": 228050752, "step": 105690 }, { "epoch": 17.242251223491028, "grad_norm": 0.9808809757232666, "learning_rate": 2.8406908924073993e-06, "loss": 0.0629, "num_input_tokens_seen": 228062144, "step": 105695 }, { "epoch": 17.243066884176184, "grad_norm": 0.0995960533618927, "learning_rate": 2.8390434000269388e-06, "loss": 0.4442, "num_input_tokens_seen": 228073152, "step": 105700 }, { "epoch": 17.24388254486134, "grad_norm": 0.3912073075771332, "learning_rate": 2.8373963567663866e-06, "loss": 0.1601, "num_input_tokens_seen": 228084544, "step": 105705 }, { "epoch": 17.24469820554649, "grad_norm": 0.5365817546844482, "learning_rate": 2.835749762659104e-06, "loss": 0.0194, "num_input_tokens_seen": 228094656, "step": 105710 }, { "epoch": 17.245513866231647, "grad_norm": 0.10865380614995956, "learning_rate": 2.834103617738479e-06, "loss": 0.0624, "num_input_tokens_seen": 228104544, "step": 105715 }, { "epoch": 17.246329526916803, "grad_norm": 2.0250649452209473, "learning_rate": 2.832457922037851e-06, "loss": 0.0869, "num_input_tokens_seen": 228115072, "step": 105720 }, { "epoch": 17.24714518760196, "grad_norm": 0.5591230392456055, "learning_rate": 2.8308126755905967e-06, "loss": 0.0294, "num_input_tokens_seen": 228124736, "step": 105725 }, { "epoch": 17.247960848287114, "grad_norm": 0.036099210381507874, "learning_rate": 2.829167878430039e-06, "loss": 0.0133, "num_input_tokens_seen": 228135008, "step": 105730 }, { "epoch": 17.248776508972266, "grad_norm": 0.38010215759277344, "learning_rate": 2.827523530589529e-06, "loss": 0.1553, "num_input_tokens_seen": 228146208, "step": 105735 }, { "epoch": 17.249592169657422, "grad_norm": 2.561836004257202, "learning_rate": 2.8258796321023713e-06, "loss": 0.1887, "num_input_tokens_seen": 228156192, "step": 105740 }, { "epoch": 17.250407830342578, "grad_norm": 0.2371804565191269, "learning_rate": 2.824236183001902e-06, "loss": 0.2129, "num_input_tokens_seen": 228166464, "step": 105745 }, { "epoch": 17.251223491027734, "grad_norm": 0.08775237947702408, "learning_rate": 2.822593183321412e-06, "loss": 0.1442, "num_input_tokens_seen": 228178112, "step": 105750 }, { "epoch": 17.252039151712886, "grad_norm": 0.12012697011232376, "learning_rate": 2.8209506330942105e-06, "loss": 0.062, "num_input_tokens_seen": 228189792, "step": 105755 }, { "epoch": 17.25285481239804, "grad_norm": 0.052841827273368835, "learning_rate": 2.8193085323535734e-06, "loss": 0.0547, "num_input_tokens_seen": 228200928, "step": 105760 }, { "epoch": 17.253670473083197, "grad_norm": 1.5074946880340576, "learning_rate": 2.8176668811327967e-06, "loss": 0.0965, "num_input_tokens_seen": 228212480, "step": 105765 }, { "epoch": 17.254486133768353, "grad_norm": 0.1448659896850586, "learning_rate": 2.816025679465131e-06, "loss": 0.1338, "num_input_tokens_seen": 228223616, "step": 105770 }, { "epoch": 17.25530179445351, "grad_norm": 1.2096840143203735, "learning_rate": 2.8143849273838535e-06, "loss": 0.1376, "num_input_tokens_seen": 228234720, "step": 105775 }, { "epoch": 17.25611745513866, "grad_norm": 0.04656662419438362, "learning_rate": 2.812744624922209e-06, "loss": 0.1308, "num_input_tokens_seen": 228244832, "step": 105780 }, { "epoch": 17.256933115823816, "grad_norm": 0.5770191550254822, "learning_rate": 2.81110477211344e-06, "loss": 0.0173, "num_input_tokens_seen": 228255264, "step": 105785 }, { "epoch": 17.257748776508972, "grad_norm": 0.9703280925750732, "learning_rate": 2.8094653689907826e-06, "loss": 0.0289, "num_input_tokens_seen": 228266944, "step": 105790 }, { "epoch": 17.258564437194128, "grad_norm": 0.10063931345939636, "learning_rate": 2.8078264155874583e-06, "loss": 0.0095, "num_input_tokens_seen": 228278400, "step": 105795 }, { "epoch": 17.259380097879284, "grad_norm": 2.871019124984741, "learning_rate": 2.806187911936686e-06, "loss": 0.1998, "num_input_tokens_seen": 228289280, "step": 105800 }, { "epoch": 17.260195758564436, "grad_norm": 0.22468723356723785, "learning_rate": 2.8045498580716702e-06, "loss": 0.1281, "num_input_tokens_seen": 228300672, "step": 105805 }, { "epoch": 17.26101141924959, "grad_norm": 0.6946840882301331, "learning_rate": 2.802912254025608e-06, "loss": 0.0356, "num_input_tokens_seen": 228311360, "step": 105810 }, { "epoch": 17.261827079934747, "grad_norm": 1.7181146144866943, "learning_rate": 2.8012750998316904e-06, "loss": 0.0332, "num_input_tokens_seen": 228322048, "step": 105815 }, { "epoch": 17.262642740619903, "grad_norm": 0.19292885065078735, "learning_rate": 2.799638395523091e-06, "loss": 0.0318, "num_input_tokens_seen": 228332128, "step": 105820 }, { "epoch": 17.26345840130506, "grad_norm": 0.7828234434127808, "learning_rate": 2.7980021411329797e-06, "loss": 0.1148, "num_input_tokens_seen": 228342880, "step": 105825 }, { "epoch": 17.26427406199021, "grad_norm": 2.248157262802124, "learning_rate": 2.796366336694528e-06, "loss": 0.125, "num_input_tokens_seen": 228353536, "step": 105830 }, { "epoch": 17.265089722675366, "grad_norm": 0.1925608366727829, "learning_rate": 2.794730982240873e-06, "loss": 0.2523, "num_input_tokens_seen": 228365312, "step": 105835 }, { "epoch": 17.265905383360522, "grad_norm": 0.22302226722240448, "learning_rate": 2.7930960778051714e-06, "loss": 0.1094, "num_input_tokens_seen": 228376064, "step": 105840 }, { "epoch": 17.266721044045678, "grad_norm": 2.5007598400115967, "learning_rate": 2.791461623420541e-06, "loss": 0.1175, "num_input_tokens_seen": 228387424, "step": 105845 }, { "epoch": 17.267536704730833, "grad_norm": 0.4055543541908264, "learning_rate": 2.7898276191201223e-06, "loss": 0.1616, "num_input_tokens_seen": 228396864, "step": 105850 }, { "epoch": 17.268352365415986, "grad_norm": 0.17757542431354523, "learning_rate": 2.7881940649370136e-06, "loss": 0.0534, "num_input_tokens_seen": 228406912, "step": 105855 }, { "epoch": 17.26916802610114, "grad_norm": 0.5446879863739014, "learning_rate": 2.7865609609043415e-06, "loss": 0.1933, "num_input_tokens_seen": 228417760, "step": 105860 }, { "epoch": 17.269983686786297, "grad_norm": 0.11081210523843765, "learning_rate": 2.784928307055179e-06, "loss": 0.211, "num_input_tokens_seen": 228427424, "step": 105865 }, { "epoch": 17.270799347471453, "grad_norm": 0.07508382201194763, "learning_rate": 2.7832961034226365e-06, "loss": 0.051, "num_input_tokens_seen": 228437248, "step": 105870 }, { "epoch": 17.27161500815661, "grad_norm": 0.19223733246326447, "learning_rate": 2.7816643500397725e-06, "loss": 0.1894, "num_input_tokens_seen": 228447360, "step": 105875 }, { "epoch": 17.27243066884176, "grad_norm": 0.09998404234647751, "learning_rate": 2.7800330469396758e-06, "loss": 0.0353, "num_input_tokens_seen": 228457984, "step": 105880 }, { "epoch": 17.273246329526916, "grad_norm": 0.29530689120292664, "learning_rate": 2.7784021941553883e-06, "loss": 0.1047, "num_input_tokens_seen": 228468000, "step": 105885 }, { "epoch": 17.274061990212072, "grad_norm": 2.115886688232422, "learning_rate": 2.7767717917199815e-06, "loss": 0.1719, "num_input_tokens_seen": 228479200, "step": 105890 }, { "epoch": 17.274877650897228, "grad_norm": 2.2459418773651123, "learning_rate": 2.7751418396664758e-06, "loss": 0.247, "num_input_tokens_seen": 228490112, "step": 105895 }, { "epoch": 17.275693311582383, "grad_norm": 0.4058682322502136, "learning_rate": 2.7735123380279227e-06, "loss": 0.0493, "num_input_tokens_seen": 228500096, "step": 105900 }, { "epoch": 17.276508972267536, "grad_norm": 1.561405897140503, "learning_rate": 2.7718832868373317e-06, "loss": 0.2935, "num_input_tokens_seen": 228511264, "step": 105905 }, { "epoch": 17.27732463295269, "grad_norm": 1.631590723991394, "learning_rate": 2.7702546861277268e-06, "loss": 0.1366, "num_input_tokens_seen": 228523168, "step": 105910 }, { "epoch": 17.278140293637847, "grad_norm": 0.41102519631385803, "learning_rate": 2.7686265359321118e-06, "loss": 0.0734, "num_input_tokens_seen": 228534304, "step": 105915 }, { "epoch": 17.278955954323003, "grad_norm": 0.18655024468898773, "learning_rate": 2.7669988362834824e-06, "loss": 0.0702, "num_input_tokens_seen": 228542944, "step": 105920 }, { "epoch": 17.27977161500816, "grad_norm": 0.1658184975385666, "learning_rate": 2.765371587214827e-06, "loss": 0.0993, "num_input_tokens_seen": 228553120, "step": 105925 }, { "epoch": 17.28058727569331, "grad_norm": 0.3590485751628876, "learning_rate": 2.7637447887591234e-06, "loss": 0.0216, "num_input_tokens_seen": 228564352, "step": 105930 }, { "epoch": 17.281402936378466, "grad_norm": 2.275304079055786, "learning_rate": 2.7621184409493407e-06, "loss": 0.0549, "num_input_tokens_seen": 228574848, "step": 105935 }, { "epoch": 17.282218597063622, "grad_norm": 0.8060023188591003, "learning_rate": 2.7604925438184352e-06, "loss": 0.0407, "num_input_tokens_seen": 228586560, "step": 105940 }, { "epoch": 17.283034257748778, "grad_norm": 2.8350281715393066, "learning_rate": 2.7588670973993645e-06, "loss": 0.1036, "num_input_tokens_seen": 228596928, "step": 105945 }, { "epoch": 17.28384991843393, "grad_norm": 0.08407241106033325, "learning_rate": 2.757242101725066e-06, "loss": 0.008, "num_input_tokens_seen": 228608576, "step": 105950 }, { "epoch": 17.284665579119086, "grad_norm": 0.8127825856208801, "learning_rate": 2.7556175568284716e-06, "loss": 0.0424, "num_input_tokens_seen": 228619616, "step": 105955 }, { "epoch": 17.28548123980424, "grad_norm": 0.8282436728477478, "learning_rate": 2.75399346274251e-06, "loss": 0.0395, "num_input_tokens_seen": 228629664, "step": 105960 }, { "epoch": 17.286296900489397, "grad_norm": 3.1742284297943115, "learning_rate": 2.752369819500089e-06, "loss": 0.1638, "num_input_tokens_seen": 228640384, "step": 105965 }, { "epoch": 17.287112561174553, "grad_norm": 0.06256214529275894, "learning_rate": 2.7507466271341125e-06, "loss": 0.0238, "num_input_tokens_seen": 228651424, "step": 105970 }, { "epoch": 17.287928221859705, "grad_norm": 1.7233837842941284, "learning_rate": 2.74912388567749e-06, "loss": 0.122, "num_input_tokens_seen": 228662720, "step": 105975 }, { "epoch": 17.28874388254486, "grad_norm": 1.9206864833831787, "learning_rate": 2.7475015951630906e-06, "loss": 0.1408, "num_input_tokens_seen": 228673920, "step": 105980 }, { "epoch": 17.289559543230016, "grad_norm": 0.7633347511291504, "learning_rate": 2.745879755623809e-06, "loss": 0.1137, "num_input_tokens_seen": 228684128, "step": 105985 }, { "epoch": 17.290375203915172, "grad_norm": 0.9543588757514954, "learning_rate": 2.7442583670924977e-06, "loss": 0.2307, "num_input_tokens_seen": 228695296, "step": 105990 }, { "epoch": 17.291190864600328, "grad_norm": 1.8631789684295654, "learning_rate": 2.7426374296020324e-06, "loss": 0.0776, "num_input_tokens_seen": 228706496, "step": 105995 }, { "epoch": 17.29200652528548, "grad_norm": 0.18852448463439941, "learning_rate": 2.7410169431852455e-06, "loss": 0.0164, "num_input_tokens_seen": 228716896, "step": 106000 }, { "epoch": 17.292822185970635, "grad_norm": 1.5590757131576538, "learning_rate": 2.739396907874997e-06, "loss": 0.0978, "num_input_tokens_seen": 228727520, "step": 106005 }, { "epoch": 17.29363784665579, "grad_norm": 0.4339893162250519, "learning_rate": 2.737777323704102e-06, "loss": 0.1832, "num_input_tokens_seen": 228738816, "step": 106010 }, { "epoch": 17.294453507340947, "grad_norm": 0.19712640345096588, "learning_rate": 2.736158190705393e-06, "loss": 0.0942, "num_input_tokens_seen": 228749344, "step": 106015 }, { "epoch": 17.295269168026103, "grad_norm": 0.3391205966472626, "learning_rate": 2.734539508911685e-06, "loss": 0.1168, "num_input_tokens_seen": 228761152, "step": 106020 }, { "epoch": 17.296084828711255, "grad_norm": 0.4814871549606323, "learning_rate": 2.7329212783557767e-06, "loss": 0.1067, "num_input_tokens_seen": 228772352, "step": 106025 }, { "epoch": 17.29690048939641, "grad_norm": 0.42918097972869873, "learning_rate": 2.73130349907047e-06, "loss": 0.0775, "num_input_tokens_seen": 228783328, "step": 106030 }, { "epoch": 17.297716150081566, "grad_norm": 1.1665153503417969, "learning_rate": 2.729686171088544e-06, "loss": 0.0625, "num_input_tokens_seen": 228795392, "step": 106035 }, { "epoch": 17.298531810766722, "grad_norm": 0.064439557492733, "learning_rate": 2.728069294442781e-06, "loss": 0.2426, "num_input_tokens_seen": 228805888, "step": 106040 }, { "epoch": 17.299347471451878, "grad_norm": 0.8039089441299438, "learning_rate": 2.726452869165949e-06, "loss": 0.0504, "num_input_tokens_seen": 228817824, "step": 106045 }, { "epoch": 17.30016313213703, "grad_norm": 1.0413624048233032, "learning_rate": 2.7248368952908053e-06, "loss": 0.1665, "num_input_tokens_seen": 228829856, "step": 106050 }, { "epoch": 17.300978792822185, "grad_norm": 1.5531387329101562, "learning_rate": 2.723221372850099e-06, "loss": 0.1134, "num_input_tokens_seen": 228840864, "step": 106055 }, { "epoch": 17.30179445350734, "grad_norm": 1.7285598516464233, "learning_rate": 2.7216063018765726e-06, "loss": 0.2641, "num_input_tokens_seen": 228851904, "step": 106060 }, { "epoch": 17.302610114192497, "grad_norm": 0.056901611387729645, "learning_rate": 2.719991682402956e-06, "loss": 0.0662, "num_input_tokens_seen": 228863360, "step": 106065 }, { "epoch": 17.303425774877653, "grad_norm": 0.1345437914133072, "learning_rate": 2.7183775144619727e-06, "loss": 0.1028, "num_input_tokens_seen": 228875008, "step": 106070 }, { "epoch": 17.304241435562805, "grad_norm": 0.22417087852954865, "learning_rate": 2.716763798086333e-06, "loss": 0.0544, "num_input_tokens_seen": 228886272, "step": 106075 }, { "epoch": 17.30505709624796, "grad_norm": 2.1058614253997803, "learning_rate": 2.7151505333087468e-06, "loss": 0.221, "num_input_tokens_seen": 228897952, "step": 106080 }, { "epoch": 17.305872756933116, "grad_norm": 0.44703713059425354, "learning_rate": 2.713537720161902e-06, "loss": 0.0549, "num_input_tokens_seen": 228909792, "step": 106085 }, { "epoch": 17.306688417618272, "grad_norm": 2.0735628604888916, "learning_rate": 2.711925358678491e-06, "loss": 0.1471, "num_input_tokens_seen": 228921056, "step": 106090 }, { "epoch": 17.307504078303428, "grad_norm": 1.839292287826538, "learning_rate": 2.710313448891183e-06, "loss": 0.2648, "num_input_tokens_seen": 228931232, "step": 106095 }, { "epoch": 17.30831973898858, "grad_norm": 1.3394349813461304, "learning_rate": 2.7087019908326544e-06, "loss": 0.2304, "num_input_tokens_seen": 228940512, "step": 106100 }, { "epoch": 17.309135399673735, "grad_norm": 0.5413918495178223, "learning_rate": 2.707090984535554e-06, "loss": 0.0435, "num_input_tokens_seen": 228951840, "step": 106105 }, { "epoch": 17.30995106035889, "grad_norm": 0.6508093476295471, "learning_rate": 2.7054804300325364e-06, "loss": 0.1307, "num_input_tokens_seen": 228962400, "step": 106110 }, { "epoch": 17.310766721044047, "grad_norm": 0.06680626422166824, "learning_rate": 2.7038703273562368e-06, "loss": 0.0293, "num_input_tokens_seen": 228972832, "step": 106115 }, { "epoch": 17.3115823817292, "grad_norm": 2.1815409660339355, "learning_rate": 2.702260676539295e-06, "loss": 0.1019, "num_input_tokens_seen": 228982752, "step": 106120 }, { "epoch": 17.312398042414355, "grad_norm": 0.08361799269914627, "learning_rate": 2.700651477614327e-06, "loss": 0.0988, "num_input_tokens_seen": 228992064, "step": 106125 }, { "epoch": 17.31321370309951, "grad_norm": 0.23817148804664612, "learning_rate": 2.699042730613943e-06, "loss": 0.1628, "num_input_tokens_seen": 229002752, "step": 106130 }, { "epoch": 17.314029363784666, "grad_norm": 0.4148363769054413, "learning_rate": 2.697434435570753e-06, "loss": 0.0171, "num_input_tokens_seen": 229014272, "step": 106135 }, { "epoch": 17.31484502446982, "grad_norm": 0.06064208969473839, "learning_rate": 2.695826592517345e-06, "loss": 0.2015, "num_input_tokens_seen": 229023936, "step": 106140 }, { "epoch": 17.315660685154974, "grad_norm": 0.046134114265441895, "learning_rate": 2.694219201486306e-06, "loss": 0.0255, "num_input_tokens_seen": 229034272, "step": 106145 }, { "epoch": 17.31647634584013, "grad_norm": 2.6144533157348633, "learning_rate": 2.692612262510211e-06, "loss": 0.1757, "num_input_tokens_seen": 229045696, "step": 106150 }, { "epoch": 17.317292006525285, "grad_norm": 0.7789590954780579, "learning_rate": 2.6910057756216273e-06, "loss": 0.0198, "num_input_tokens_seen": 229055424, "step": 106155 }, { "epoch": 17.31810766721044, "grad_norm": 1.0912071466445923, "learning_rate": 2.689399740853113e-06, "loss": 0.1708, "num_input_tokens_seen": 229066528, "step": 106160 }, { "epoch": 17.318923327895597, "grad_norm": 0.996113657951355, "learning_rate": 2.6877941582372168e-06, "loss": 0.1338, "num_input_tokens_seen": 229075264, "step": 106165 }, { "epoch": 17.31973898858075, "grad_norm": 0.13318578898906708, "learning_rate": 2.6861890278064765e-06, "loss": 0.0734, "num_input_tokens_seen": 229085600, "step": 106170 }, { "epoch": 17.320554649265905, "grad_norm": 0.6669847965240479, "learning_rate": 2.684584349593422e-06, "loss": 0.1003, "num_input_tokens_seen": 229094912, "step": 106175 }, { "epoch": 17.32137030995106, "grad_norm": 0.2763374447822571, "learning_rate": 2.682980123630574e-06, "loss": 0.0132, "num_input_tokens_seen": 229106496, "step": 106180 }, { "epoch": 17.322185970636216, "grad_norm": 1.3635190725326538, "learning_rate": 2.6813763499504484e-06, "loss": 0.0421, "num_input_tokens_seen": 229116960, "step": 106185 }, { "epoch": 17.32300163132137, "grad_norm": 1.6777184009552002, "learning_rate": 2.6797730285855417e-06, "loss": 0.0626, "num_input_tokens_seen": 229127808, "step": 106190 }, { "epoch": 17.323817292006524, "grad_norm": 0.45113009214401245, "learning_rate": 2.6781701595683497e-06, "loss": 0.0353, "num_input_tokens_seen": 229138752, "step": 106195 }, { "epoch": 17.32463295269168, "grad_norm": 0.3854498565196991, "learning_rate": 2.676567742931357e-06, "loss": 0.1746, "num_input_tokens_seen": 229150560, "step": 106200 }, { "epoch": 17.325448613376835, "grad_norm": 0.5165087580680847, "learning_rate": 2.674965778707039e-06, "loss": 0.0607, "num_input_tokens_seen": 229162464, "step": 106205 }, { "epoch": 17.32626427406199, "grad_norm": 0.3255504071712494, "learning_rate": 2.673364266927861e-06, "loss": 0.1563, "num_input_tokens_seen": 229172384, "step": 106210 }, { "epoch": 17.327079934747147, "grad_norm": 1.3379218578338623, "learning_rate": 2.6717632076262767e-06, "loss": 0.0595, "num_input_tokens_seen": 229182656, "step": 106215 }, { "epoch": 17.3278955954323, "grad_norm": 0.961540162563324, "learning_rate": 2.6701626008347384e-06, "loss": 0.1548, "num_input_tokens_seen": 229193952, "step": 106220 }, { "epoch": 17.328711256117455, "grad_norm": 0.417328417301178, "learning_rate": 2.668562446585679e-06, "loss": 0.0145, "num_input_tokens_seen": 229205120, "step": 106225 }, { "epoch": 17.32952691680261, "grad_norm": 1.21279776096344, "learning_rate": 2.6669627449115387e-06, "loss": 0.0352, "num_input_tokens_seen": 229216608, "step": 106230 }, { "epoch": 17.330342577487766, "grad_norm": 0.047081418335437775, "learning_rate": 2.6653634958447194e-06, "loss": 0.1211, "num_input_tokens_seen": 229226592, "step": 106235 }, { "epoch": 17.33115823817292, "grad_norm": 2.1009695529937744, "learning_rate": 2.6637646994176535e-06, "loss": 0.1687, "num_input_tokens_seen": 229238240, "step": 106240 }, { "epoch": 17.331973898858074, "grad_norm": 0.23036161065101624, "learning_rate": 2.6621663556627234e-06, "loss": 0.0984, "num_input_tokens_seen": 229250240, "step": 106245 }, { "epoch": 17.33278955954323, "grad_norm": 2.7136142253875732, "learning_rate": 2.660568464612337e-06, "loss": 0.3804, "num_input_tokens_seen": 229262784, "step": 106250 }, { "epoch": 17.333605220228385, "grad_norm": 2.0202293395996094, "learning_rate": 2.658971026298862e-06, "loss": 0.222, "num_input_tokens_seen": 229273696, "step": 106255 }, { "epoch": 17.33442088091354, "grad_norm": 0.055513426661491394, "learning_rate": 2.6573740407546844e-06, "loss": 0.1495, "num_input_tokens_seen": 229284256, "step": 106260 }, { "epoch": 17.335236541598697, "grad_norm": 0.4147770404815674, "learning_rate": 2.655777508012164e-06, "loss": 0.0253, "num_input_tokens_seen": 229295744, "step": 106265 }, { "epoch": 17.33605220228385, "grad_norm": 2.428929090499878, "learning_rate": 2.654181428103661e-06, "loss": 0.1499, "num_input_tokens_seen": 229307328, "step": 106270 }, { "epoch": 17.336867862969005, "grad_norm": 0.07306081801652908, "learning_rate": 2.652585801061519e-06, "loss": 0.0921, "num_input_tokens_seen": 229317472, "step": 106275 }, { "epoch": 17.33768352365416, "grad_norm": 0.10996773838996887, "learning_rate": 2.6509906269180734e-06, "loss": 0.1656, "num_input_tokens_seen": 229327360, "step": 106280 }, { "epoch": 17.338499184339316, "grad_norm": 0.1461426466703415, "learning_rate": 2.6493959057056567e-06, "loss": 0.2127, "num_input_tokens_seen": 229337888, "step": 106285 }, { "epoch": 17.339314845024468, "grad_norm": 1.0492092370986938, "learning_rate": 2.6478016374565844e-06, "loss": 0.1346, "num_input_tokens_seen": 229349440, "step": 106290 }, { "epoch": 17.340130505709624, "grad_norm": 0.6927531361579895, "learning_rate": 2.646207822203167e-06, "loss": 0.0542, "num_input_tokens_seen": 229360032, "step": 106295 }, { "epoch": 17.34094616639478, "grad_norm": 0.4802623689174652, "learning_rate": 2.644614459977707e-06, "loss": 0.2551, "num_input_tokens_seen": 229370848, "step": 106300 }, { "epoch": 17.341761827079935, "grad_norm": 0.6792129874229431, "learning_rate": 2.643021550812494e-06, "loss": 0.1311, "num_input_tokens_seen": 229382464, "step": 106305 }, { "epoch": 17.34257748776509, "grad_norm": 0.8284571170806885, "learning_rate": 2.6414290947398114e-06, "loss": 0.0524, "num_input_tokens_seen": 229393280, "step": 106310 }, { "epoch": 17.343393148450243, "grad_norm": 1.7906018495559692, "learning_rate": 2.639837091791933e-06, "loss": 0.0296, "num_input_tokens_seen": 229404608, "step": 106315 }, { "epoch": 17.3442088091354, "grad_norm": 1.7284972667694092, "learning_rate": 2.638245542001119e-06, "loss": 0.2021, "num_input_tokens_seen": 229415456, "step": 106320 }, { "epoch": 17.345024469820554, "grad_norm": 1.1094576120376587, "learning_rate": 2.6366544453996305e-06, "loss": 0.0578, "num_input_tokens_seen": 229426464, "step": 106325 }, { "epoch": 17.34584013050571, "grad_norm": 0.42530909180641174, "learning_rate": 2.635063802019702e-06, "loss": 0.0422, "num_input_tokens_seen": 229438016, "step": 106330 }, { "epoch": 17.346655791190866, "grad_norm": 0.6108071804046631, "learning_rate": 2.633473611893589e-06, "loss": 0.2778, "num_input_tokens_seen": 229448448, "step": 106335 }, { "epoch": 17.347471451876018, "grad_norm": 0.1208205446600914, "learning_rate": 2.631883875053498e-06, "loss": 0.1416, "num_input_tokens_seen": 229457984, "step": 106340 }, { "epoch": 17.348287112561174, "grad_norm": 0.33171260356903076, "learning_rate": 2.6302945915316652e-06, "loss": 0.0137, "num_input_tokens_seen": 229468512, "step": 106345 }, { "epoch": 17.34910277324633, "grad_norm": 2.658200740814209, "learning_rate": 2.628705761360281e-06, "loss": 0.152, "num_input_tokens_seen": 229479392, "step": 106350 }, { "epoch": 17.349918433931485, "grad_norm": 0.20819872617721558, "learning_rate": 2.627117384571562e-06, "loss": 0.1313, "num_input_tokens_seen": 229488640, "step": 106355 }, { "epoch": 17.35073409461664, "grad_norm": 0.06523416936397552, "learning_rate": 2.6255294611976845e-06, "loss": 0.0371, "num_input_tokens_seen": 229498464, "step": 106360 }, { "epoch": 17.351549755301793, "grad_norm": 0.14108510315418243, "learning_rate": 2.623941991270845e-06, "loss": 0.0132, "num_input_tokens_seen": 229508704, "step": 106365 }, { "epoch": 17.35236541598695, "grad_norm": 0.0745946541428566, "learning_rate": 2.622354974823196e-06, "loss": 0.0233, "num_input_tokens_seen": 229519200, "step": 106370 }, { "epoch": 17.353181076672104, "grad_norm": 0.07923934608697891, "learning_rate": 2.620768411886923e-06, "loss": 0.0298, "num_input_tokens_seen": 229530336, "step": 106375 }, { "epoch": 17.35399673735726, "grad_norm": 0.1473112851381302, "learning_rate": 2.619182302494158e-06, "loss": 0.0106, "num_input_tokens_seen": 229540768, "step": 106380 }, { "epoch": 17.354812398042416, "grad_norm": 0.40688997507095337, "learning_rate": 2.6175966466770644e-06, "loss": 0.0338, "num_input_tokens_seen": 229550976, "step": 106385 }, { "epoch": 17.355628058727568, "grad_norm": 0.2214145064353943, "learning_rate": 2.6160114444677613e-06, "loss": 0.0444, "num_input_tokens_seen": 229561920, "step": 106390 }, { "epoch": 17.356443719412724, "grad_norm": 0.4123266637325287, "learning_rate": 2.6144266958983922e-06, "loss": 0.0241, "num_input_tokens_seen": 229572000, "step": 106395 }, { "epoch": 17.35725938009788, "grad_norm": 0.07986707985401154, "learning_rate": 2.6128424010010535e-06, "loss": 0.0484, "num_input_tokens_seen": 229582816, "step": 106400 }, { "epoch": 17.358075040783035, "grad_norm": 0.49344781041145325, "learning_rate": 2.6112585598078666e-06, "loss": 0.0796, "num_input_tokens_seen": 229594176, "step": 106405 }, { "epoch": 17.35889070146819, "grad_norm": 1.1991643905639648, "learning_rate": 2.609675172350928e-06, "loss": 0.0318, "num_input_tokens_seen": 229605312, "step": 106410 }, { "epoch": 17.359706362153343, "grad_norm": 0.1958245038986206, "learning_rate": 2.60809223866233e-06, "loss": 0.1092, "num_input_tokens_seen": 229616192, "step": 106415 }, { "epoch": 17.3605220228385, "grad_norm": 0.9489793181419373, "learning_rate": 2.606509758774145e-06, "loss": 0.1516, "num_input_tokens_seen": 229627456, "step": 106420 }, { "epoch": 17.361337683523654, "grad_norm": 0.03912278264760971, "learning_rate": 2.6049277327184483e-06, "loss": 0.0089, "num_input_tokens_seen": 229638688, "step": 106425 }, { "epoch": 17.36215334420881, "grad_norm": 0.06444448977708817, "learning_rate": 2.6033461605273007e-06, "loss": 0.2365, "num_input_tokens_seen": 229649056, "step": 106430 }, { "epoch": 17.362969004893966, "grad_norm": 0.2744346857070923, "learning_rate": 2.6017650422327564e-06, "loss": 0.1032, "num_input_tokens_seen": 229661024, "step": 106435 }, { "epoch": 17.363784665579118, "grad_norm": 1.8083785772323608, "learning_rate": 2.60018437786686e-06, "loss": 0.2006, "num_input_tokens_seen": 229670304, "step": 106440 }, { "epoch": 17.364600326264274, "grad_norm": 0.06781645119190216, "learning_rate": 2.5986041674616405e-06, "loss": 0.1623, "num_input_tokens_seen": 229682368, "step": 106445 }, { "epoch": 17.36541598694943, "grad_norm": 0.03593713045120239, "learning_rate": 2.597024411049126e-06, "loss": 0.0581, "num_input_tokens_seen": 229692768, "step": 106450 }, { "epoch": 17.366231647634585, "grad_norm": 0.03388197720050812, "learning_rate": 2.5954451086613264e-06, "loss": 0.0758, "num_input_tokens_seen": 229704320, "step": 106455 }, { "epoch": 17.36704730831974, "grad_norm": 0.39939138293266296, "learning_rate": 2.593866260330266e-06, "loss": 0.1908, "num_input_tokens_seen": 229715360, "step": 106460 }, { "epoch": 17.367862969004893, "grad_norm": 0.46386200189590454, "learning_rate": 2.5922878660879194e-06, "loss": 0.1913, "num_input_tokens_seen": 229727744, "step": 106465 }, { "epoch": 17.36867862969005, "grad_norm": 0.05229771509766579, "learning_rate": 2.5907099259662947e-06, "loss": 0.1591, "num_input_tokens_seen": 229738528, "step": 106470 }, { "epoch": 17.369494290375204, "grad_norm": 0.033504340797662735, "learning_rate": 2.589132439997352e-06, "loss": 0.0063, "num_input_tokens_seen": 229749408, "step": 106475 }, { "epoch": 17.37030995106036, "grad_norm": 1.548447847366333, "learning_rate": 2.5875554082130796e-06, "loss": 0.1829, "num_input_tokens_seen": 229759072, "step": 106480 }, { "epoch": 17.371125611745512, "grad_norm": 1.5463744401931763, "learning_rate": 2.585978830645419e-06, "loss": 0.0644, "num_input_tokens_seen": 229769888, "step": 106485 }, { "epoch": 17.371941272430668, "grad_norm": 3.896914482116699, "learning_rate": 2.5844027073263444e-06, "loss": 0.1322, "num_input_tokens_seen": 229781920, "step": 106490 }, { "epoch": 17.372756933115824, "grad_norm": 1.0959745645523071, "learning_rate": 2.582827038287772e-06, "loss": 0.1193, "num_input_tokens_seen": 229793248, "step": 106495 }, { "epoch": 17.37357259380098, "grad_norm": 0.037667736411094666, "learning_rate": 2.581251823561659e-06, "loss": 0.0964, "num_input_tokens_seen": 229803328, "step": 106500 }, { "epoch": 17.374388254486135, "grad_norm": 0.03711045905947685, "learning_rate": 2.5796770631799085e-06, "loss": 0.087, "num_input_tokens_seen": 229814304, "step": 106505 }, { "epoch": 17.375203915171287, "grad_norm": 0.515275776386261, "learning_rate": 2.5781027571744525e-06, "loss": 0.1201, "num_input_tokens_seen": 229824928, "step": 106510 }, { "epoch": 17.376019575856443, "grad_norm": 0.2232823371887207, "learning_rate": 2.57652890557718e-06, "loss": 0.0223, "num_input_tokens_seen": 229834976, "step": 106515 }, { "epoch": 17.3768352365416, "grad_norm": 0.038619913160800934, "learning_rate": 2.5749555084200046e-06, "loss": 0.0125, "num_input_tokens_seen": 229845216, "step": 106520 }, { "epoch": 17.377650897226754, "grad_norm": 1.2623876333236694, "learning_rate": 2.5733825657347972e-06, "loss": 0.1185, "num_input_tokens_seen": 229856896, "step": 106525 }, { "epoch": 17.37846655791191, "grad_norm": 0.2877315282821655, "learning_rate": 2.5718100775534493e-06, "loss": 0.1456, "num_input_tokens_seen": 229867168, "step": 106530 }, { "epoch": 17.379282218597062, "grad_norm": 0.4250560402870178, "learning_rate": 2.5702380439078105e-06, "loss": 0.0285, "num_input_tokens_seen": 229878176, "step": 106535 }, { "epoch": 17.380097879282218, "grad_norm": 1.6512987613677979, "learning_rate": 2.568666464829761e-06, "loss": 0.0708, "num_input_tokens_seen": 229888384, "step": 106540 }, { "epoch": 17.380913539967374, "grad_norm": 0.04357912018895149, "learning_rate": 2.5670953403511388e-06, "loss": 0.0366, "num_input_tokens_seen": 229899200, "step": 106545 }, { "epoch": 17.38172920065253, "grad_norm": 0.5064003467559814, "learning_rate": 2.5655246705037883e-06, "loss": 0.0381, "num_input_tokens_seen": 229909088, "step": 106550 }, { "epoch": 17.382544861337685, "grad_norm": 0.08969651162624359, "learning_rate": 2.5639544553195423e-06, "loss": 0.0543, "num_input_tokens_seen": 229919840, "step": 106555 }, { "epoch": 17.383360522022837, "grad_norm": 2.4571051597595215, "learning_rate": 2.5623846948302193e-06, "loss": 0.102, "num_input_tokens_seen": 229930272, "step": 106560 }, { "epoch": 17.384176182707993, "grad_norm": 0.06582369655370712, "learning_rate": 2.5608153890676364e-06, "loss": 0.1234, "num_input_tokens_seen": 229940864, "step": 106565 }, { "epoch": 17.38499184339315, "grad_norm": 0.21162720024585724, "learning_rate": 2.559246538063595e-06, "loss": 0.01, "num_input_tokens_seen": 229952640, "step": 106570 }, { "epoch": 17.385807504078304, "grad_norm": 0.6195393204689026, "learning_rate": 2.5576781418498895e-06, "loss": 0.025, "num_input_tokens_seen": 229963424, "step": 106575 }, { "epoch": 17.38662316476346, "grad_norm": 2.0903007984161377, "learning_rate": 2.556110200458309e-06, "loss": 0.1662, "num_input_tokens_seen": 229974912, "step": 106580 }, { "epoch": 17.387438825448612, "grad_norm": 0.17001868784427643, "learning_rate": 2.554542713920624e-06, "loss": 0.0547, "num_input_tokens_seen": 229985568, "step": 106585 }, { "epoch": 17.388254486133768, "grad_norm": 2.183659791946411, "learning_rate": 2.5529756822686073e-06, "loss": 0.0783, "num_input_tokens_seen": 229997024, "step": 106590 }, { "epoch": 17.389070146818923, "grad_norm": 0.030908646062016487, "learning_rate": 2.5514091055340138e-06, "loss": 0.087, "num_input_tokens_seen": 230007936, "step": 106595 }, { "epoch": 17.38988580750408, "grad_norm": 1.9538987874984741, "learning_rate": 2.5498429837485902e-06, "loss": 0.1426, "num_input_tokens_seen": 230018528, "step": 106600 }, { "epoch": 17.390701468189235, "grad_norm": 0.052961818873882294, "learning_rate": 2.5482773169440865e-06, "loss": 0.0432, "num_input_tokens_seen": 230028224, "step": 106605 }, { "epoch": 17.391517128874387, "grad_norm": 1.1909253597259521, "learning_rate": 2.546712105152213e-06, "loss": 0.0262, "num_input_tokens_seen": 230039072, "step": 106610 }, { "epoch": 17.392332789559543, "grad_norm": 0.09930123388767242, "learning_rate": 2.545147348404714e-06, "loss": 0.0215, "num_input_tokens_seen": 230050976, "step": 106615 }, { "epoch": 17.3931484502447, "grad_norm": 0.08770950883626938, "learning_rate": 2.5435830467332834e-06, "loss": 0.063, "num_input_tokens_seen": 230061472, "step": 106620 }, { "epoch": 17.393964110929854, "grad_norm": 1.3212645053863525, "learning_rate": 2.5420192001696376e-06, "loss": 0.088, "num_input_tokens_seen": 230072032, "step": 106625 }, { "epoch": 17.39477977161501, "grad_norm": 0.051261965185403824, "learning_rate": 2.540455808745451e-06, "loss": 0.1265, "num_input_tokens_seen": 230082688, "step": 106630 }, { "epoch": 17.395595432300162, "grad_norm": 0.1380591094493866, "learning_rate": 2.5388928724924323e-06, "loss": 0.124, "num_input_tokens_seen": 230094272, "step": 106635 }, { "epoch": 17.396411092985318, "grad_norm": 0.6470154523849487, "learning_rate": 2.5373303914422305e-06, "loss": 0.213, "num_input_tokens_seen": 230104928, "step": 106640 }, { "epoch": 17.397226753670473, "grad_norm": 2.400735378265381, "learning_rate": 2.535768365626537e-06, "loss": 0.128, "num_input_tokens_seen": 230115520, "step": 106645 }, { "epoch": 17.39804241435563, "grad_norm": 0.43260127305984497, "learning_rate": 2.5342067950769847e-06, "loss": 0.04, "num_input_tokens_seen": 230126336, "step": 106650 }, { "epoch": 17.39885807504078, "grad_norm": 1.2703542709350586, "learning_rate": 2.5326456798252353e-06, "loss": 0.0414, "num_input_tokens_seen": 230137120, "step": 106655 }, { "epoch": 17.399673735725937, "grad_norm": 0.16185636818408966, "learning_rate": 2.5310850199029233e-06, "loss": 0.1962, "num_input_tokens_seen": 230148480, "step": 106660 }, { "epoch": 17.400489396411093, "grad_norm": 0.09115934371948242, "learning_rate": 2.52952481534168e-06, "loss": 0.0292, "num_input_tokens_seen": 230159744, "step": 106665 }, { "epoch": 17.40130505709625, "grad_norm": 0.18563498556613922, "learning_rate": 2.5279650661731186e-06, "loss": 0.224, "num_input_tokens_seen": 230170432, "step": 106670 }, { "epoch": 17.402120717781404, "grad_norm": 0.10906551778316498, "learning_rate": 2.526405772428855e-06, "loss": 0.0156, "num_input_tokens_seen": 230182112, "step": 106675 }, { "epoch": 17.402936378466556, "grad_norm": 0.15951190888881683, "learning_rate": 2.5248469341404874e-06, "loss": 0.1785, "num_input_tokens_seen": 230193344, "step": 106680 }, { "epoch": 17.403752039151712, "grad_norm": 1.7641011476516724, "learning_rate": 2.5232885513396087e-06, "loss": 0.059, "num_input_tokens_seen": 230204480, "step": 106685 }, { "epoch": 17.404567699836868, "grad_norm": 0.05803461745381355, "learning_rate": 2.5217306240578024e-06, "loss": 0.1215, "num_input_tokens_seen": 230215584, "step": 106690 }, { "epoch": 17.405383360522023, "grad_norm": 0.096811443567276, "learning_rate": 2.520173152326638e-06, "loss": 0.081, "num_input_tokens_seen": 230226816, "step": 106695 }, { "epoch": 17.40619902120718, "grad_norm": 0.07752072066068649, "learning_rate": 2.5186161361776846e-06, "loss": 0.1293, "num_input_tokens_seen": 230239104, "step": 106700 }, { "epoch": 17.40701468189233, "grad_norm": 1.9461090564727783, "learning_rate": 2.517059575642494e-06, "loss": 0.1175, "num_input_tokens_seen": 230249248, "step": 106705 }, { "epoch": 17.407830342577487, "grad_norm": 1.907061219215393, "learning_rate": 2.515503470752614e-06, "loss": 0.1502, "num_input_tokens_seen": 230259840, "step": 106710 }, { "epoch": 17.408646003262643, "grad_norm": 0.2349308729171753, "learning_rate": 2.5139478215395777e-06, "loss": 0.3046, "num_input_tokens_seen": 230270400, "step": 106715 }, { "epoch": 17.4094616639478, "grad_norm": 0.05238720029592514, "learning_rate": 2.5123926280349154e-06, "loss": 0.022, "num_input_tokens_seen": 230281760, "step": 106720 }, { "epoch": 17.410277324632954, "grad_norm": 1.883245587348938, "learning_rate": 2.510837890270143e-06, "loss": 0.1629, "num_input_tokens_seen": 230292288, "step": 106725 }, { "epoch": 17.411092985318106, "grad_norm": 0.16077829897403717, "learning_rate": 2.509283608276772e-06, "loss": 0.0709, "num_input_tokens_seen": 230303712, "step": 106730 }, { "epoch": 17.411908646003262, "grad_norm": 0.027733201161026955, "learning_rate": 2.5077297820862987e-06, "loss": 0.1559, "num_input_tokens_seen": 230314528, "step": 106735 }, { "epoch": 17.412724306688418, "grad_norm": 0.062410369515419006, "learning_rate": 2.5061764117302156e-06, "loss": 0.3121, "num_input_tokens_seen": 230325248, "step": 106740 }, { "epoch": 17.413539967373573, "grad_norm": 0.03551444411277771, "learning_rate": 2.5046234972399997e-06, "loss": 0.1312, "num_input_tokens_seen": 230335104, "step": 106745 }, { "epoch": 17.41435562805873, "grad_norm": 0.9645147323608398, "learning_rate": 2.503071038647134e-06, "loss": 0.1962, "num_input_tokens_seen": 230346496, "step": 106750 }, { "epoch": 17.41517128874388, "grad_norm": 2.209310293197632, "learning_rate": 2.5015190359830633e-06, "loss": 0.228, "num_input_tokens_seen": 230357536, "step": 106755 }, { "epoch": 17.415986949429037, "grad_norm": 0.06121150776743889, "learning_rate": 2.499967489279256e-06, "loss": 0.2116, "num_input_tokens_seen": 230368000, "step": 106760 }, { "epoch": 17.416802610114193, "grad_norm": 0.0638727992773056, "learning_rate": 2.498416398567149e-06, "loss": 0.2656, "num_input_tokens_seen": 230379520, "step": 106765 }, { "epoch": 17.41761827079935, "grad_norm": 0.19149307906627655, "learning_rate": 2.496865763878181e-06, "loss": 0.0976, "num_input_tokens_seen": 230390304, "step": 106770 }, { "epoch": 17.418433931484504, "grad_norm": 0.40542349219322205, "learning_rate": 2.4953155852437767e-06, "loss": 0.0487, "num_input_tokens_seen": 230400608, "step": 106775 }, { "epoch": 17.419249592169656, "grad_norm": 0.4576667249202728, "learning_rate": 2.493765862695349e-06, "loss": 0.1629, "num_input_tokens_seen": 230411456, "step": 106780 }, { "epoch": 17.420065252854812, "grad_norm": 1.1325600147247314, "learning_rate": 2.4922165962643074e-06, "loss": 0.0666, "num_input_tokens_seen": 230422816, "step": 106785 }, { "epoch": 17.420880913539968, "grad_norm": 0.07530747354030609, "learning_rate": 2.490667785982051e-06, "loss": 0.0395, "num_input_tokens_seen": 230433856, "step": 106790 }, { "epoch": 17.421696574225123, "grad_norm": 0.41503462195396423, "learning_rate": 2.489119431879966e-06, "loss": 0.0228, "num_input_tokens_seen": 230444704, "step": 106795 }, { "epoch": 17.42251223491028, "grad_norm": 0.5733473896980286, "learning_rate": 2.487571533989433e-06, "loss": 0.0228, "num_input_tokens_seen": 230455232, "step": 106800 }, { "epoch": 17.42332789559543, "grad_norm": 0.9836967587471008, "learning_rate": 2.486024092341821e-06, "loss": 0.286, "num_input_tokens_seen": 230465920, "step": 106805 }, { "epoch": 17.424143556280587, "grad_norm": 0.0727970078587532, "learning_rate": 2.484477106968494e-06, "loss": 0.0853, "num_input_tokens_seen": 230474944, "step": 106810 }, { "epoch": 17.424959216965743, "grad_norm": 0.23157815635204315, "learning_rate": 2.4829305779007995e-06, "loss": 0.0908, "num_input_tokens_seen": 230486912, "step": 106815 }, { "epoch": 17.4257748776509, "grad_norm": 0.06183497607707977, "learning_rate": 2.4813845051700813e-06, "loss": 0.0628, "num_input_tokens_seen": 230498048, "step": 106820 }, { "epoch": 17.42659053833605, "grad_norm": 2.4022915363311768, "learning_rate": 2.4798388888076756e-06, "loss": 0.182, "num_input_tokens_seen": 230509120, "step": 106825 }, { "epoch": 17.427406199021206, "grad_norm": 0.6501979827880859, "learning_rate": 2.478293728844902e-06, "loss": 0.1008, "num_input_tokens_seen": 230520224, "step": 106830 }, { "epoch": 17.428221859706362, "grad_norm": 2.0210583209991455, "learning_rate": 2.476749025313077e-06, "loss": 0.1754, "num_input_tokens_seen": 230530144, "step": 106835 }, { "epoch": 17.429037520391518, "grad_norm": 1.500388741493225, "learning_rate": 2.475204778243506e-06, "loss": 0.0628, "num_input_tokens_seen": 230540288, "step": 106840 }, { "epoch": 17.429853181076673, "grad_norm": 0.5677543878555298, "learning_rate": 2.4736609876674837e-06, "loss": 0.1038, "num_input_tokens_seen": 230552096, "step": 106845 }, { "epoch": 17.430668841761825, "grad_norm": 3.3995041847229004, "learning_rate": 2.4721176536162986e-06, "loss": 0.1171, "num_input_tokens_seen": 230562976, "step": 106850 }, { "epoch": 17.43148450244698, "grad_norm": 0.07812321186065674, "learning_rate": 2.4705747761212287e-06, "loss": 0.0643, "num_input_tokens_seen": 230574688, "step": 106855 }, { "epoch": 17.432300163132137, "grad_norm": 0.11864113807678223, "learning_rate": 2.469032355213541e-06, "loss": 0.0486, "num_input_tokens_seen": 230586528, "step": 106860 }, { "epoch": 17.433115823817293, "grad_norm": 0.47546958923339844, "learning_rate": 2.46749039092449e-06, "loss": 0.1002, "num_input_tokens_seen": 230598016, "step": 106865 }, { "epoch": 17.43393148450245, "grad_norm": 0.09085926413536072, "learning_rate": 2.4659488832853433e-06, "loss": 0.0193, "num_input_tokens_seen": 230609312, "step": 106870 }, { "epoch": 17.4347471451876, "grad_norm": 0.41455137729644775, "learning_rate": 2.4644078323273174e-06, "loss": 0.1312, "num_input_tokens_seen": 230620608, "step": 106875 }, { "epoch": 17.435562805872756, "grad_norm": 1.3681261539459229, "learning_rate": 2.462867238081662e-06, "loss": 0.0765, "num_input_tokens_seen": 230630016, "step": 106880 }, { "epoch": 17.436378466557912, "grad_norm": 0.17311285436153412, "learning_rate": 2.4613271005795858e-06, "loss": 0.0872, "num_input_tokens_seen": 230639392, "step": 106885 }, { "epoch": 17.437194127243067, "grad_norm": 0.03694254159927368, "learning_rate": 2.4597874198523137e-06, "loss": 0.1274, "num_input_tokens_seen": 230650688, "step": 106890 }, { "epoch": 17.438009787928223, "grad_norm": 0.04132169857621193, "learning_rate": 2.4582481959310426e-06, "loss": 0.2366, "num_input_tokens_seen": 230661760, "step": 106895 }, { "epoch": 17.438825448613375, "grad_norm": 0.30272066593170166, "learning_rate": 2.4567094288469677e-06, "loss": 0.115, "num_input_tokens_seen": 230673056, "step": 106900 }, { "epoch": 17.43964110929853, "grad_norm": 0.7117446064949036, "learning_rate": 2.455171118631275e-06, "loss": 0.0508, "num_input_tokens_seen": 230684736, "step": 106905 }, { "epoch": 17.440456769983687, "grad_norm": 0.058827027678489685, "learning_rate": 2.453633265315139e-06, "loss": 0.0384, "num_input_tokens_seen": 230695232, "step": 106910 }, { "epoch": 17.441272430668842, "grad_norm": 1.0278345346450806, "learning_rate": 2.4520958689297267e-06, "loss": 0.0641, "num_input_tokens_seen": 230705824, "step": 106915 }, { "epoch": 17.442088091353998, "grad_norm": 3.8291478157043457, "learning_rate": 2.450558929506197e-06, "loss": 0.1851, "num_input_tokens_seen": 230716864, "step": 106920 }, { "epoch": 17.44290375203915, "grad_norm": 1.8516157865524292, "learning_rate": 2.4490224470756968e-06, "loss": 0.1274, "num_input_tokens_seen": 230728000, "step": 106925 }, { "epoch": 17.443719412724306, "grad_norm": 3.9012136459350586, "learning_rate": 2.4474864216693623e-06, "loss": 0.0848, "num_input_tokens_seen": 230738560, "step": 106930 }, { "epoch": 17.44453507340946, "grad_norm": 0.19679580628871918, "learning_rate": 2.4459508533183267e-06, "loss": 0.1031, "num_input_tokens_seen": 230749408, "step": 106935 }, { "epoch": 17.445350734094617, "grad_norm": 2.6172034740448, "learning_rate": 2.44441574205371e-06, "loss": 0.1326, "num_input_tokens_seen": 230757856, "step": 106940 }, { "epoch": 17.446166394779773, "grad_norm": 3.167198657989502, "learning_rate": 2.4428810879066205e-06, "loss": 0.1255, "num_input_tokens_seen": 230769344, "step": 106945 }, { "epoch": 17.446982055464925, "grad_norm": 1.6196779012680054, "learning_rate": 2.441346890908164e-06, "loss": 0.1667, "num_input_tokens_seen": 230779520, "step": 106950 }, { "epoch": 17.44779771615008, "grad_norm": 0.044871218502521515, "learning_rate": 2.4398131510894262e-06, "loss": 0.0618, "num_input_tokens_seen": 230789472, "step": 106955 }, { "epoch": 17.448613376835237, "grad_norm": 1.6853172779083252, "learning_rate": 2.4382798684814995e-06, "loss": 0.1385, "num_input_tokens_seen": 230800704, "step": 106960 }, { "epoch": 17.449429037520392, "grad_norm": 2.191596746444702, "learning_rate": 2.4367470431154506e-06, "loss": 0.2066, "num_input_tokens_seen": 230810624, "step": 106965 }, { "epoch": 17.450244698205548, "grad_norm": 0.048973288387060165, "learning_rate": 2.4352146750223405e-06, "loss": 0.2206, "num_input_tokens_seen": 230821248, "step": 106970 }, { "epoch": 17.4510603588907, "grad_norm": 0.1847914457321167, "learning_rate": 2.433682764233239e-06, "loss": 0.0499, "num_input_tokens_seen": 230832288, "step": 106975 }, { "epoch": 17.451876019575856, "grad_norm": 0.8781411647796631, "learning_rate": 2.4321513107791767e-06, "loss": 0.1334, "num_input_tokens_seen": 230842816, "step": 106980 }, { "epoch": 17.45269168026101, "grad_norm": 1.4866235256195068, "learning_rate": 2.4306203146912067e-06, "loss": 0.1769, "num_input_tokens_seen": 230851936, "step": 106985 }, { "epoch": 17.453507340946167, "grad_norm": 3.5543744564056396, "learning_rate": 2.4290897760003375e-06, "loss": 0.2102, "num_input_tokens_seen": 230862432, "step": 106990 }, { "epoch": 17.454323001631323, "grad_norm": 1.7817140817642212, "learning_rate": 2.427559694737605e-06, "loss": 0.1495, "num_input_tokens_seen": 230873536, "step": 106995 }, { "epoch": 17.455138662316475, "grad_norm": 0.43349242210388184, "learning_rate": 2.4260300709340043e-06, "loss": 0.0346, "num_input_tokens_seen": 230883904, "step": 107000 }, { "epoch": 17.45595432300163, "grad_norm": 1.7152289152145386, "learning_rate": 2.4245009046205496e-06, "loss": 0.0439, "num_input_tokens_seen": 230893312, "step": 107005 }, { "epoch": 17.456769983686787, "grad_norm": 2.385439872741699, "learning_rate": 2.422972195828213e-06, "loss": 0.0455, "num_input_tokens_seen": 230904192, "step": 107010 }, { "epoch": 17.457585644371942, "grad_norm": 1.0423754453659058, "learning_rate": 2.421443944587995e-06, "loss": 0.2286, "num_input_tokens_seen": 230914272, "step": 107015 }, { "epoch": 17.458401305057095, "grad_norm": 0.35692402720451355, "learning_rate": 2.4199161509308484e-06, "loss": 0.1028, "num_input_tokens_seen": 230925088, "step": 107020 }, { "epoch": 17.45921696574225, "grad_norm": 0.8265036940574646, "learning_rate": 2.4183888148877565e-06, "loss": 0.0223, "num_input_tokens_seen": 230935808, "step": 107025 }, { "epoch": 17.460032626427406, "grad_norm": 0.03771339729428291, "learning_rate": 2.4168619364896533e-06, "loss": 0.022, "num_input_tokens_seen": 230946368, "step": 107030 }, { "epoch": 17.46084828711256, "grad_norm": 0.12994486093521118, "learning_rate": 2.4153355157674946e-06, "loss": 0.0308, "num_input_tokens_seen": 230957024, "step": 107035 }, { "epoch": 17.461663947797717, "grad_norm": 0.7755290865898132, "learning_rate": 2.4138095527522135e-06, "loss": 0.05, "num_input_tokens_seen": 230967360, "step": 107040 }, { "epoch": 17.46247960848287, "grad_norm": 1.2182258367538452, "learning_rate": 2.412284047474733e-06, "loss": 0.0946, "num_input_tokens_seen": 230977024, "step": 107045 }, { "epoch": 17.463295269168025, "grad_norm": 0.03968856483697891, "learning_rate": 2.4107589999659728e-06, "loss": 0.0385, "num_input_tokens_seen": 230988928, "step": 107050 }, { "epoch": 17.46411092985318, "grad_norm": 0.18947942554950714, "learning_rate": 2.409234410256836e-06, "loss": 0.0229, "num_input_tokens_seen": 230999648, "step": 107055 }, { "epoch": 17.464926590538337, "grad_norm": 1.0523293018341064, "learning_rate": 2.4077102783782253e-06, "loss": 0.0958, "num_input_tokens_seen": 231010976, "step": 107060 }, { "epoch": 17.465742251223492, "grad_norm": 0.034210577607154846, "learning_rate": 2.406186604361024e-06, "loss": 0.0581, "num_input_tokens_seen": 231021312, "step": 107065 }, { "epoch": 17.466557911908644, "grad_norm": 0.08011601120233536, "learning_rate": 2.404663388236114e-06, "loss": 0.1334, "num_input_tokens_seen": 231033376, "step": 107070 }, { "epoch": 17.4673735725938, "grad_norm": 0.033547379076480865, "learning_rate": 2.4031406300343647e-06, "loss": 0.131, "num_input_tokens_seen": 231043936, "step": 107075 }, { "epoch": 17.468189233278956, "grad_norm": 0.10883569717407227, "learning_rate": 2.401618329786637e-06, "loss": 0.1578, "num_input_tokens_seen": 231053984, "step": 107080 }, { "epoch": 17.46900489396411, "grad_norm": 0.513375461101532, "learning_rate": 2.4000964875237757e-06, "loss": 0.0133, "num_input_tokens_seen": 231064640, "step": 107085 }, { "epoch": 17.469820554649267, "grad_norm": 0.0518328957259655, "learning_rate": 2.3985751032766403e-06, "loss": 0.0617, "num_input_tokens_seen": 231074528, "step": 107090 }, { "epoch": 17.47063621533442, "grad_norm": 0.24452632665634155, "learning_rate": 2.397054177076041e-06, "loss": 0.0346, "num_input_tokens_seen": 231086208, "step": 107095 }, { "epoch": 17.471451876019575, "grad_norm": 0.019278952851891518, "learning_rate": 2.395533708952824e-06, "loss": 0.1197, "num_input_tokens_seen": 231097024, "step": 107100 }, { "epoch": 17.47226753670473, "grad_norm": 0.12438453733921051, "learning_rate": 2.3940136989377805e-06, "loss": 0.1442, "num_input_tokens_seen": 231107136, "step": 107105 }, { "epoch": 17.473083197389887, "grad_norm": 4.060972213745117, "learning_rate": 2.3924941470617386e-06, "loss": 0.222, "num_input_tokens_seen": 231118112, "step": 107110 }, { "epoch": 17.473898858075042, "grad_norm": 1.5493720769882202, "learning_rate": 2.390975053355471e-06, "loss": 0.0566, "num_input_tokens_seen": 231128768, "step": 107115 }, { "epoch": 17.474714518760194, "grad_norm": 0.09941279888153076, "learning_rate": 2.389456417849786e-06, "loss": 0.0326, "num_input_tokens_seen": 231139456, "step": 107120 }, { "epoch": 17.47553017944535, "grad_norm": 0.19589942693710327, "learning_rate": 2.3879382405754435e-06, "loss": 0.0634, "num_input_tokens_seen": 231151712, "step": 107125 }, { "epoch": 17.476345840130506, "grad_norm": 0.08678445965051651, "learning_rate": 2.3864205215632235e-06, "loss": 0.1046, "num_input_tokens_seen": 231162272, "step": 107130 }, { "epoch": 17.47716150081566, "grad_norm": 0.01740301586687565, "learning_rate": 2.3849032608438708e-06, "loss": 0.0503, "num_input_tokens_seen": 231171616, "step": 107135 }, { "epoch": 17.477977161500817, "grad_norm": 1.947279691696167, "learning_rate": 2.383386458448153e-06, "loss": 0.0904, "num_input_tokens_seen": 231182688, "step": 107140 }, { "epoch": 17.47879282218597, "grad_norm": 0.05116935446858406, "learning_rate": 2.3818701144067895e-06, "loss": 0.0264, "num_input_tokens_seen": 231194400, "step": 107145 }, { "epoch": 17.479608482871125, "grad_norm": 0.01884717307984829, "learning_rate": 2.380354228750531e-06, "loss": 0.0358, "num_input_tokens_seen": 231205696, "step": 107150 }, { "epoch": 17.48042414355628, "grad_norm": 1.433176040649414, "learning_rate": 2.3788388015100803e-06, "loss": 0.093, "num_input_tokens_seen": 231215552, "step": 107155 }, { "epoch": 17.481239804241437, "grad_norm": 0.190455824136734, "learning_rate": 2.377323832716169e-06, "loss": 0.1168, "num_input_tokens_seen": 231224768, "step": 107160 }, { "epoch": 17.482055464926592, "grad_norm": 0.6934497356414795, "learning_rate": 2.3758093223994776e-06, "loss": 0.0243, "num_input_tokens_seen": 231234688, "step": 107165 }, { "epoch": 17.482871125611744, "grad_norm": 0.06884051114320755, "learning_rate": 2.3742952705907206e-06, "loss": 0.0865, "num_input_tokens_seen": 231245888, "step": 107170 }, { "epoch": 17.4836867862969, "grad_norm": 0.13139720261096954, "learning_rate": 2.3727816773205625e-06, "loss": 0.0401, "num_input_tokens_seen": 231256224, "step": 107175 }, { "epoch": 17.484502446982056, "grad_norm": 3.5127687454223633, "learning_rate": 2.3712685426196953e-06, "loss": 0.1734, "num_input_tokens_seen": 231267040, "step": 107180 }, { "epoch": 17.48531810766721, "grad_norm": 0.08864755183458328, "learning_rate": 2.3697558665187746e-06, "loss": 0.0091, "num_input_tokens_seen": 231276992, "step": 107185 }, { "epoch": 17.486133768352367, "grad_norm": 0.05800462141633034, "learning_rate": 2.368243649048463e-06, "loss": 0.1164, "num_input_tokens_seen": 231287904, "step": 107190 }, { "epoch": 17.48694942903752, "grad_norm": 1.780643105506897, "learning_rate": 2.366731890239404e-06, "loss": 0.1135, "num_input_tokens_seen": 231298304, "step": 107195 }, { "epoch": 17.487765089722675, "grad_norm": 1.8512436151504517, "learning_rate": 2.365220590122233e-06, "loss": 0.0691, "num_input_tokens_seen": 231309600, "step": 107200 }, { "epoch": 17.48858075040783, "grad_norm": 0.05170108377933502, "learning_rate": 2.363709748727583e-06, "loss": 0.0467, "num_input_tokens_seen": 231320288, "step": 107205 }, { "epoch": 17.489396411092986, "grad_norm": 0.04647871106863022, "learning_rate": 2.362199366086071e-06, "loss": 0.2051, "num_input_tokens_seen": 231331168, "step": 107210 }, { "epoch": 17.49021207177814, "grad_norm": 0.04334748536348343, "learning_rate": 2.360689442228306e-06, "loss": 0.0816, "num_input_tokens_seen": 231341760, "step": 107215 }, { "epoch": 17.491027732463294, "grad_norm": 0.49627965688705444, "learning_rate": 2.3591799771848915e-06, "loss": 0.016, "num_input_tokens_seen": 231352640, "step": 107220 }, { "epoch": 17.49184339314845, "grad_norm": 0.837519645690918, "learning_rate": 2.357670970986414e-06, "loss": 0.1113, "num_input_tokens_seen": 231362560, "step": 107225 }, { "epoch": 17.492659053833606, "grad_norm": 0.15636126697063446, "learning_rate": 2.356162423663455e-06, "loss": 0.0958, "num_input_tokens_seen": 231373408, "step": 107230 }, { "epoch": 17.49347471451876, "grad_norm": 1.5582513809204102, "learning_rate": 2.3546543352465977e-06, "loss": 0.1041, "num_input_tokens_seen": 231384416, "step": 107235 }, { "epoch": 17.494290375203914, "grad_norm": 3.5995917320251465, "learning_rate": 2.35314670576639e-06, "loss": 0.3164, "num_input_tokens_seen": 231394688, "step": 107240 }, { "epoch": 17.49510603588907, "grad_norm": 0.5131469368934631, "learning_rate": 2.3516395352534056e-06, "loss": 0.1388, "num_input_tokens_seen": 231405184, "step": 107245 }, { "epoch": 17.495921696574225, "grad_norm": 0.15677480399608612, "learning_rate": 2.350132823738166e-06, "loss": 0.0227, "num_input_tokens_seen": 231415968, "step": 107250 }, { "epoch": 17.49673735725938, "grad_norm": 1.1249568462371826, "learning_rate": 2.348626571251225e-06, "loss": 0.1247, "num_input_tokens_seen": 231425856, "step": 107255 }, { "epoch": 17.497553017944536, "grad_norm": 1.3017656803131104, "learning_rate": 2.3471207778230948e-06, "loss": 0.1008, "num_input_tokens_seen": 231436416, "step": 107260 }, { "epoch": 17.49836867862969, "grad_norm": 0.03343063220381737, "learning_rate": 2.3456154434843087e-06, "loss": 0.0444, "num_input_tokens_seen": 231448224, "step": 107265 }, { "epoch": 17.499184339314844, "grad_norm": 0.10770637542009354, "learning_rate": 2.3441105682653563e-06, "loss": 0.0126, "num_input_tokens_seen": 231459776, "step": 107270 }, { "epoch": 17.5, "grad_norm": 4.337899208068848, "learning_rate": 2.3426061521967523e-06, "loss": 0.0463, "num_input_tokens_seen": 231469600, "step": 107275 }, { "epoch": 17.500815660685156, "grad_norm": 0.16802772879600525, "learning_rate": 2.3411021953089696e-06, "loss": 0.1978, "num_input_tokens_seen": 231480832, "step": 107280 }, { "epoch": 17.50163132137031, "grad_norm": 0.48638179898262024, "learning_rate": 2.339598697632503e-06, "loss": 0.2478, "num_input_tokens_seen": 231490880, "step": 107285 }, { "epoch": 17.502446982055464, "grad_norm": 0.08873364329338074, "learning_rate": 2.3380956591978086e-06, "loss": 0.0344, "num_input_tokens_seen": 231501472, "step": 107290 }, { "epoch": 17.50326264274062, "grad_norm": 0.06786833703517914, "learning_rate": 2.336593080035357e-06, "loss": 0.0391, "num_input_tokens_seen": 231512512, "step": 107295 }, { "epoch": 17.504078303425775, "grad_norm": 0.04289574176073074, "learning_rate": 2.335090960175598e-06, "loss": 0.126, "num_input_tokens_seen": 231524224, "step": 107300 }, { "epoch": 17.50489396411093, "grad_norm": 0.05207134410738945, "learning_rate": 2.3335892996489723e-06, "loss": 0.0101, "num_input_tokens_seen": 231535200, "step": 107305 }, { "epoch": 17.505709624796086, "grad_norm": 2.6750099658966064, "learning_rate": 2.3320880984859156e-06, "loss": 0.0757, "num_input_tokens_seen": 231545856, "step": 107310 }, { "epoch": 17.50652528548124, "grad_norm": 2.4296159744262695, "learning_rate": 2.3305873567168483e-06, "loss": 0.1846, "num_input_tokens_seen": 231556864, "step": 107315 }, { "epoch": 17.507340946166394, "grad_norm": 0.20335078239440918, "learning_rate": 2.329087074372188e-06, "loss": 0.1129, "num_input_tokens_seen": 231567776, "step": 107320 }, { "epoch": 17.50815660685155, "grad_norm": 0.04723900929093361, "learning_rate": 2.3275872514823382e-06, "loss": 0.0273, "num_input_tokens_seen": 231579040, "step": 107325 }, { "epoch": 17.508972267536706, "grad_norm": 0.3979816734790802, "learning_rate": 2.326087888077694e-06, "loss": 0.0377, "num_input_tokens_seen": 231588864, "step": 107330 }, { "epoch": 17.50978792822186, "grad_norm": 0.07850275188684464, "learning_rate": 2.3245889841886414e-06, "loss": 0.0799, "num_input_tokens_seen": 231599712, "step": 107335 }, { "epoch": 17.510603588907014, "grad_norm": 0.04157834127545357, "learning_rate": 2.32309053984556e-06, "loss": 0.1377, "num_input_tokens_seen": 231611072, "step": 107340 }, { "epoch": 17.51141924959217, "grad_norm": 0.049637213349342346, "learning_rate": 2.3215925550788166e-06, "loss": 0.0229, "num_input_tokens_seen": 231621952, "step": 107345 }, { "epoch": 17.512234910277325, "grad_norm": 0.34834280610084534, "learning_rate": 2.3200950299187674e-06, "loss": 0.1985, "num_input_tokens_seen": 231633344, "step": 107350 }, { "epoch": 17.51305057096248, "grad_norm": 0.0768972858786583, "learning_rate": 2.3185979643957663e-06, "loss": 0.0372, "num_input_tokens_seen": 231644768, "step": 107355 }, { "epoch": 17.513866231647633, "grad_norm": 1.1089643239974976, "learning_rate": 2.3171013585401523e-06, "loss": 0.0895, "num_input_tokens_seen": 231656960, "step": 107360 }, { "epoch": 17.51468189233279, "grad_norm": 0.03187237307429314, "learning_rate": 2.3156052123822517e-06, "loss": 0.135, "num_input_tokens_seen": 231667712, "step": 107365 }, { "epoch": 17.515497553017944, "grad_norm": 2.102586269378662, "learning_rate": 2.3141095259523904e-06, "loss": 0.1018, "num_input_tokens_seen": 231678752, "step": 107370 }, { "epoch": 17.5163132137031, "grad_norm": 0.5979116559028625, "learning_rate": 2.312614299280874e-06, "loss": 0.2325, "num_input_tokens_seen": 231688928, "step": 107375 }, { "epoch": 17.517128874388256, "grad_norm": 1.343406319618225, "learning_rate": 2.3111195323980177e-06, "loss": 0.1636, "num_input_tokens_seen": 231698720, "step": 107380 }, { "epoch": 17.517944535073408, "grad_norm": 0.45066049695014954, "learning_rate": 2.309625225334097e-06, "loss": 0.1857, "num_input_tokens_seen": 231709568, "step": 107385 }, { "epoch": 17.518760195758563, "grad_norm": 0.02472388558089733, "learning_rate": 2.3081313781194186e-06, "loss": 0.1304, "num_input_tokens_seen": 231718080, "step": 107390 }, { "epoch": 17.51957585644372, "grad_norm": 1.814160943031311, "learning_rate": 2.3066379907842327e-06, "loss": 0.117, "num_input_tokens_seen": 231728896, "step": 107395 }, { "epoch": 17.520391517128875, "grad_norm": 0.14085328578948975, "learning_rate": 2.305145063358821e-06, "loss": 0.1336, "num_input_tokens_seen": 231740704, "step": 107400 }, { "epoch": 17.52120717781403, "grad_norm": 0.01372193917632103, "learning_rate": 2.3036525958734372e-06, "loss": 0.0821, "num_input_tokens_seen": 231751968, "step": 107405 }, { "epoch": 17.522022838499183, "grad_norm": 0.30912861227989197, "learning_rate": 2.3021605883583237e-06, "loss": 0.0505, "num_input_tokens_seen": 231762592, "step": 107410 }, { "epoch": 17.52283849918434, "grad_norm": 0.503570020198822, "learning_rate": 2.30066904084372e-06, "loss": 0.0791, "num_input_tokens_seen": 231773344, "step": 107415 }, { "epoch": 17.523654159869494, "grad_norm": 0.2965502142906189, "learning_rate": 2.2991779533598547e-06, "loss": 0.078, "num_input_tokens_seen": 231783232, "step": 107420 }, { "epoch": 17.52446982055465, "grad_norm": 0.2535645663738251, "learning_rate": 2.297687325936948e-06, "loss": 0.0243, "num_input_tokens_seen": 231794336, "step": 107425 }, { "epoch": 17.525285481239806, "grad_norm": 0.6416513323783875, "learning_rate": 2.2961971586052065e-06, "loss": 0.2122, "num_input_tokens_seen": 231805376, "step": 107430 }, { "epoch": 17.526101141924958, "grad_norm": 0.014787412248551846, "learning_rate": 2.2947074513948315e-06, "loss": 0.1605, "num_input_tokens_seen": 231815648, "step": 107435 }, { "epoch": 17.526916802610113, "grad_norm": 0.07272829115390778, "learning_rate": 2.293218204336012e-06, "loss": 0.1372, "num_input_tokens_seen": 231827136, "step": 107440 }, { "epoch": 17.52773246329527, "grad_norm": 0.01914917305111885, "learning_rate": 2.291729417458932e-06, "loss": 0.0085, "num_input_tokens_seen": 231838048, "step": 107445 }, { "epoch": 17.528548123980425, "grad_norm": 0.24647897481918335, "learning_rate": 2.2902410907937655e-06, "loss": 0.1345, "num_input_tokens_seen": 231847808, "step": 107450 }, { "epoch": 17.52936378466558, "grad_norm": 0.4986288547515869, "learning_rate": 2.288753224370671e-06, "loss": 0.1196, "num_input_tokens_seen": 231859136, "step": 107455 }, { "epoch": 17.530179445350733, "grad_norm": 0.09011585265398026, "learning_rate": 2.2872658182198025e-06, "loss": 0.0551, "num_input_tokens_seen": 231869568, "step": 107460 }, { "epoch": 17.53099510603589, "grad_norm": 0.7044808268547058, "learning_rate": 2.285778872371305e-06, "loss": 0.1848, "num_input_tokens_seen": 231880000, "step": 107465 }, { "epoch": 17.531810766721044, "grad_norm": 0.6589785814285278, "learning_rate": 2.284292386855316e-06, "loss": 0.0834, "num_input_tokens_seen": 231890304, "step": 107470 }, { "epoch": 17.5326264274062, "grad_norm": 2.1744275093078613, "learning_rate": 2.2828063617019558e-06, "loss": 0.1075, "num_input_tokens_seen": 231900576, "step": 107475 }, { "epoch": 17.533442088091356, "grad_norm": 0.0939861387014389, "learning_rate": 2.281320796941347e-06, "loss": 0.048, "num_input_tokens_seen": 231912192, "step": 107480 }, { "epoch": 17.534257748776508, "grad_norm": 0.1139606237411499, "learning_rate": 2.2798356926035907e-06, "loss": 0.1074, "num_input_tokens_seen": 231923008, "step": 107485 }, { "epoch": 17.535073409461663, "grad_norm": 0.11975051462650299, "learning_rate": 2.2783510487187883e-06, "loss": 0.0438, "num_input_tokens_seen": 231933632, "step": 107490 }, { "epoch": 17.53588907014682, "grad_norm": 0.17616227269172668, "learning_rate": 2.2768668653170234e-06, "loss": 0.0218, "num_input_tokens_seen": 231942592, "step": 107495 }, { "epoch": 17.536704730831975, "grad_norm": 0.06517991423606873, "learning_rate": 2.2753831424283805e-06, "loss": 0.0493, "num_input_tokens_seen": 231953504, "step": 107500 }, { "epoch": 17.53752039151713, "grad_norm": 3.3937835693359375, "learning_rate": 2.2738998800829194e-06, "loss": 0.391, "num_input_tokens_seen": 231964992, "step": 107505 }, { "epoch": 17.538336052202283, "grad_norm": 0.6710226535797119, "learning_rate": 2.272417078310718e-06, "loss": 0.1656, "num_input_tokens_seen": 231975008, "step": 107510 }, { "epoch": 17.53915171288744, "grad_norm": 0.06716962903738022, "learning_rate": 2.2709347371418084e-06, "loss": 0.0138, "num_input_tokens_seen": 231987168, "step": 107515 }, { "epoch": 17.539967373572594, "grad_norm": 0.09128004312515259, "learning_rate": 2.269452856606244e-06, "loss": 0.0513, "num_input_tokens_seen": 231998912, "step": 107520 }, { "epoch": 17.54078303425775, "grad_norm": 0.07411836087703705, "learning_rate": 2.267971436734051e-06, "loss": 0.107, "num_input_tokens_seen": 232010176, "step": 107525 }, { "epoch": 17.541598694942905, "grad_norm": 0.03655794635415077, "learning_rate": 2.266490477555258e-06, "loss": 0.0758, "num_input_tokens_seen": 232021280, "step": 107530 }, { "epoch": 17.542414355628058, "grad_norm": 0.15750233829021454, "learning_rate": 2.2650099790998718e-06, "loss": 0.1227, "num_input_tokens_seen": 232031392, "step": 107535 }, { "epoch": 17.543230016313213, "grad_norm": 1.5068434476852417, "learning_rate": 2.2635299413979014e-06, "loss": 0.1061, "num_input_tokens_seen": 232041600, "step": 107540 }, { "epoch": 17.54404567699837, "grad_norm": 1.583860993385315, "learning_rate": 2.26205036447934e-06, "loss": 0.1792, "num_input_tokens_seen": 232051488, "step": 107545 }, { "epoch": 17.544861337683525, "grad_norm": 0.1134982630610466, "learning_rate": 2.260571248374174e-06, "loss": 0.0932, "num_input_tokens_seen": 232062336, "step": 107550 }, { "epoch": 17.545676998368677, "grad_norm": 0.03202875331044197, "learning_rate": 2.2590925931123774e-06, "loss": 0.0335, "num_input_tokens_seen": 232074272, "step": 107555 }, { "epoch": 17.546492659053833, "grad_norm": 0.36155447363853455, "learning_rate": 2.2576143987239202e-06, "loss": 0.053, "num_input_tokens_seen": 232085952, "step": 107560 }, { "epoch": 17.54730831973899, "grad_norm": 0.10452256351709366, "learning_rate": 2.256136665238756e-06, "loss": 0.1009, "num_input_tokens_seen": 232097056, "step": 107565 }, { "epoch": 17.548123980424144, "grad_norm": 0.05087210610508919, "learning_rate": 2.254659392686834e-06, "loss": 0.1868, "num_input_tokens_seen": 232108864, "step": 107570 }, { "epoch": 17.5489396411093, "grad_norm": 0.21709027886390686, "learning_rate": 2.2531825810980954e-06, "loss": 0.1286, "num_input_tokens_seen": 232120192, "step": 107575 }, { "epoch": 17.549755301794452, "grad_norm": 0.29749923944473267, "learning_rate": 2.251706230502468e-06, "loss": 0.0894, "num_input_tokens_seen": 232130944, "step": 107580 }, { "epoch": 17.550570962479608, "grad_norm": 2.7066640853881836, "learning_rate": 2.2502303409298707e-06, "loss": 0.1685, "num_input_tokens_seen": 232140192, "step": 107585 }, { "epoch": 17.551386623164763, "grad_norm": 0.14394152164459229, "learning_rate": 2.2487549124102196e-06, "loss": 0.0332, "num_input_tokens_seen": 232151136, "step": 107590 }, { "epoch": 17.55220228384992, "grad_norm": 0.2196996957063675, "learning_rate": 2.247279944973407e-06, "loss": 0.0243, "num_input_tokens_seen": 232162560, "step": 107595 }, { "epoch": 17.553017944535075, "grad_norm": 0.5638487339019775, "learning_rate": 2.2458054386493344e-06, "loss": 0.0215, "num_input_tokens_seen": 232174304, "step": 107600 }, { "epoch": 17.553833605220227, "grad_norm": 2.974750518798828, "learning_rate": 2.24433139346788e-06, "loss": 0.0447, "num_input_tokens_seen": 232184576, "step": 107605 }, { "epoch": 17.554649265905383, "grad_norm": 1.5841444730758667, "learning_rate": 2.2428578094589092e-06, "loss": 0.2221, "num_input_tokens_seen": 232195392, "step": 107610 }, { "epoch": 17.55546492659054, "grad_norm": 0.7692970037460327, "learning_rate": 2.2413846866523064e-06, "loss": 0.0467, "num_input_tokens_seen": 232206112, "step": 107615 }, { "epoch": 17.556280587275694, "grad_norm": 1.023229956626892, "learning_rate": 2.239912025077906e-06, "loss": 0.0792, "num_input_tokens_seen": 232217344, "step": 107620 }, { "epoch": 17.55709624796085, "grad_norm": 0.5377240777015686, "learning_rate": 2.23843982476557e-06, "loss": 0.0889, "num_input_tokens_seen": 232228192, "step": 107625 }, { "epoch": 17.557911908646002, "grad_norm": 0.16462811827659607, "learning_rate": 2.2369680857451142e-06, "loss": 0.0215, "num_input_tokens_seen": 232238752, "step": 107630 }, { "epoch": 17.558727569331158, "grad_norm": 0.40066230297088623, "learning_rate": 2.235496808046389e-06, "loss": 0.1242, "num_input_tokens_seen": 232249728, "step": 107635 }, { "epoch": 17.559543230016313, "grad_norm": 1.7114512920379639, "learning_rate": 2.23402599169919e-06, "loss": 0.0583, "num_input_tokens_seen": 232260352, "step": 107640 }, { "epoch": 17.56035889070147, "grad_norm": 0.021134255453944206, "learning_rate": 2.2325556367333438e-06, "loss": 0.0596, "num_input_tokens_seen": 232270176, "step": 107645 }, { "epoch": 17.561174551386625, "grad_norm": 1.4574347734451294, "learning_rate": 2.231085743178632e-06, "loss": 0.0614, "num_input_tokens_seen": 232281312, "step": 107650 }, { "epoch": 17.561990212071777, "grad_norm": 0.07815641909837723, "learning_rate": 2.229616311064861e-06, "loss": 0.1008, "num_input_tokens_seen": 232292704, "step": 107655 }, { "epoch": 17.562805872756933, "grad_norm": 0.5360491275787354, "learning_rate": 2.2281473404217933e-06, "loss": 0.1447, "num_input_tokens_seen": 232304064, "step": 107660 }, { "epoch": 17.563621533442088, "grad_norm": 0.06021832674741745, "learning_rate": 2.226678831279211e-06, "loss": 0.0424, "num_input_tokens_seen": 232315200, "step": 107665 }, { "epoch": 17.564437194127244, "grad_norm": 1.2704498767852783, "learning_rate": 2.225210783666873e-06, "loss": 0.1324, "num_input_tokens_seen": 232326464, "step": 107670 }, { "epoch": 17.5652528548124, "grad_norm": 0.9072219729423523, "learning_rate": 2.2237431976145308e-06, "loss": 0.0465, "num_input_tokens_seen": 232337696, "step": 107675 }, { "epoch": 17.56606851549755, "grad_norm": 0.5869481563568115, "learning_rate": 2.2222760731519245e-06, "loss": 0.0351, "num_input_tokens_seen": 232349632, "step": 107680 }, { "epoch": 17.566884176182707, "grad_norm": 0.13026508688926697, "learning_rate": 2.2208094103087914e-06, "loss": 0.0606, "num_input_tokens_seen": 232359680, "step": 107685 }, { "epoch": 17.567699836867863, "grad_norm": 0.8432464599609375, "learning_rate": 2.219343209114849e-06, "loss": 0.0958, "num_input_tokens_seen": 232371424, "step": 107690 }, { "epoch": 17.56851549755302, "grad_norm": 0.09455078840255737, "learning_rate": 2.217877469599819e-06, "loss": 0.0655, "num_input_tokens_seen": 232382112, "step": 107695 }, { "epoch": 17.569331158238175, "grad_norm": 0.17682458460330963, "learning_rate": 2.2164121917934043e-06, "loss": 0.0406, "num_input_tokens_seen": 232392832, "step": 107700 }, { "epoch": 17.570146818923327, "grad_norm": 0.06704318523406982, "learning_rate": 2.214947375725299e-06, "loss": 0.041, "num_input_tokens_seen": 232403552, "step": 107705 }, { "epoch": 17.570962479608482, "grad_norm": 1.591423511505127, "learning_rate": 2.2134830214251887e-06, "loss": 0.1695, "num_input_tokens_seen": 232414112, "step": 107710 }, { "epoch": 17.571778140293638, "grad_norm": 1.4009742736816406, "learning_rate": 2.212019128922746e-06, "loss": 0.0524, "num_input_tokens_seen": 232425536, "step": 107715 }, { "epoch": 17.572593800978794, "grad_norm": 0.12796592712402344, "learning_rate": 2.2105556982476573e-06, "loss": 0.01, "num_input_tokens_seen": 232437376, "step": 107720 }, { "epoch": 17.57340946166395, "grad_norm": 0.07724872976541519, "learning_rate": 2.2090927294295545e-06, "loss": 0.3013, "num_input_tokens_seen": 232447552, "step": 107725 }, { "epoch": 17.5742251223491, "grad_norm": 1.0039910078048706, "learning_rate": 2.207630222498111e-06, "loss": 0.2448, "num_input_tokens_seen": 232457952, "step": 107730 }, { "epoch": 17.575040783034257, "grad_norm": 0.05268305912613869, "learning_rate": 2.206168177482948e-06, "loss": 0.1011, "num_input_tokens_seen": 232469120, "step": 107735 }, { "epoch": 17.575856443719413, "grad_norm": 0.16091087460517883, "learning_rate": 2.204706594413708e-06, "loss": 0.1502, "num_input_tokens_seen": 232480928, "step": 107740 }, { "epoch": 17.57667210440457, "grad_norm": 0.6931354999542236, "learning_rate": 2.203245473320001e-06, "loss": 0.1837, "num_input_tokens_seen": 232491936, "step": 107745 }, { "epoch": 17.57748776508972, "grad_norm": 3.579089879989624, "learning_rate": 2.20178481423145e-06, "loss": 0.0743, "num_input_tokens_seen": 232502656, "step": 107750 }, { "epoch": 17.578303425774877, "grad_norm": 0.07251159101724625, "learning_rate": 2.200324617177646e-06, "loss": 0.0661, "num_input_tokens_seen": 232511488, "step": 107755 }, { "epoch": 17.579119086460032, "grad_norm": 0.06486412137746811, "learning_rate": 2.198864882188195e-06, "loss": 0.0411, "num_input_tokens_seen": 232522048, "step": 107760 }, { "epoch": 17.579934747145188, "grad_norm": 0.22467222809791565, "learning_rate": 2.1974056092926633e-06, "loss": 0.1305, "num_input_tokens_seen": 232533216, "step": 107765 }, { "epoch": 17.580750407830344, "grad_norm": 0.04744318127632141, "learning_rate": 2.19594679852064e-06, "loss": 0.119, "num_input_tokens_seen": 232543808, "step": 107770 }, { "epoch": 17.581566068515496, "grad_norm": 0.4230000972747803, "learning_rate": 2.19448844990168e-06, "loss": 0.1906, "num_input_tokens_seen": 232555328, "step": 107775 }, { "epoch": 17.58238172920065, "grad_norm": 0.2791972756385803, "learning_rate": 2.193030563465348e-06, "loss": 0.0092, "num_input_tokens_seen": 232566464, "step": 107780 }, { "epoch": 17.583197389885807, "grad_norm": 0.07330597192049026, "learning_rate": 2.1915731392411765e-06, "loss": 0.0138, "num_input_tokens_seen": 232577024, "step": 107785 }, { "epoch": 17.584013050570963, "grad_norm": 0.04780980199575424, "learning_rate": 2.1901161772587166e-06, "loss": 0.0608, "num_input_tokens_seen": 232588384, "step": 107790 }, { "epoch": 17.58482871125612, "grad_norm": 0.08578182756900787, "learning_rate": 2.188659677547483e-06, "loss": 0.1736, "num_input_tokens_seen": 232599584, "step": 107795 }, { "epoch": 17.58564437194127, "grad_norm": 0.03544297069311142, "learning_rate": 2.1872036401370034e-06, "loss": 0.0631, "num_input_tokens_seen": 232610560, "step": 107800 }, { "epoch": 17.586460032626427, "grad_norm": 0.43361803889274597, "learning_rate": 2.1857480650567807e-06, "loss": 0.1614, "num_input_tokens_seen": 232621440, "step": 107805 }, { "epoch": 17.587275693311582, "grad_norm": 0.06608761101961136, "learning_rate": 2.1842929523363143e-06, "loss": 0.0051, "num_input_tokens_seen": 232631456, "step": 107810 }, { "epoch": 17.588091353996738, "grad_norm": 0.041677992790937424, "learning_rate": 2.182838302005097e-06, "loss": 0.0594, "num_input_tokens_seen": 232642240, "step": 107815 }, { "epoch": 17.588907014681894, "grad_norm": 0.03612853214144707, "learning_rate": 2.181384114092608e-06, "loss": 0.0394, "num_input_tokens_seen": 232652928, "step": 107820 }, { "epoch": 17.589722675367046, "grad_norm": 0.023528549820184708, "learning_rate": 2.1799303886283153e-06, "loss": 0.0272, "num_input_tokens_seen": 232663584, "step": 107825 }, { "epoch": 17.5905383360522, "grad_norm": 0.8974675536155701, "learning_rate": 2.178477125641684e-06, "loss": 0.1067, "num_input_tokens_seen": 232674432, "step": 107830 }, { "epoch": 17.591353996737357, "grad_norm": 1.5547692775726318, "learning_rate": 2.1770243251621637e-06, "loss": 0.093, "num_input_tokens_seen": 232684192, "step": 107835 }, { "epoch": 17.592169657422513, "grad_norm": 0.15927201509475708, "learning_rate": 2.175571987219199e-06, "loss": 0.0492, "num_input_tokens_seen": 232694688, "step": 107840 }, { "epoch": 17.59298531810767, "grad_norm": 0.08046304434537888, "learning_rate": 2.1741201118422234e-06, "loss": 0.2991, "num_input_tokens_seen": 232706400, "step": 107845 }, { "epoch": 17.59380097879282, "grad_norm": 0.03781435266137123, "learning_rate": 2.1726686990606594e-06, "loss": 0.151, "num_input_tokens_seen": 232717344, "step": 107850 }, { "epoch": 17.594616639477977, "grad_norm": 0.5532549619674683, "learning_rate": 2.1712177489039227e-06, "loss": 0.0609, "num_input_tokens_seen": 232727616, "step": 107855 }, { "epoch": 17.595432300163132, "grad_norm": 0.03850451111793518, "learning_rate": 2.1697672614014145e-06, "loss": 0.0305, "num_input_tokens_seen": 232738592, "step": 107860 }, { "epoch": 17.596247960848288, "grad_norm": 2.7501020431518555, "learning_rate": 2.1683172365825455e-06, "loss": 0.271, "num_input_tokens_seen": 232748736, "step": 107865 }, { "epoch": 17.597063621533444, "grad_norm": 0.028304804116487503, "learning_rate": 2.1668676744766803e-06, "loss": 0.0064, "num_input_tokens_seen": 232759040, "step": 107870 }, { "epoch": 17.597879282218596, "grad_norm": 0.29872798919677734, "learning_rate": 2.1654185751132176e-06, "loss": 0.0664, "num_input_tokens_seen": 232770080, "step": 107875 }, { "epoch": 17.59869494290375, "grad_norm": 2.319251775741577, "learning_rate": 2.1639699385215067e-06, "loss": 0.2264, "num_input_tokens_seen": 232781888, "step": 107880 }, { "epoch": 17.599510603588907, "grad_norm": 0.04095696657896042, "learning_rate": 2.1625217647309236e-06, "loss": 0.0181, "num_input_tokens_seen": 232793792, "step": 107885 }, { "epoch": 17.600326264274063, "grad_norm": 1.2857460975646973, "learning_rate": 2.161074053770798e-06, "loss": 0.2643, "num_input_tokens_seen": 232804608, "step": 107890 }, { "epoch": 17.601141924959215, "grad_norm": 0.5486921668052673, "learning_rate": 2.1596268056704894e-06, "loss": 0.0606, "num_input_tokens_seen": 232815680, "step": 107895 }, { "epoch": 17.60195758564437, "grad_norm": 0.03353044018149376, "learning_rate": 2.1581800204593107e-06, "loss": 0.0159, "num_input_tokens_seen": 232825504, "step": 107900 }, { "epoch": 17.602773246329527, "grad_norm": 0.07210277765989304, "learning_rate": 2.156733698166599e-06, "loss": 0.0159, "num_input_tokens_seen": 232834592, "step": 107905 }, { "epoch": 17.603588907014682, "grad_norm": 1.2896403074264526, "learning_rate": 2.1552878388216487e-06, "loss": 0.0544, "num_input_tokens_seen": 232846080, "step": 107910 }, { "epoch": 17.604404567699838, "grad_norm": 1.640791416168213, "learning_rate": 2.1538424424537797e-06, "loss": 0.1622, "num_input_tokens_seen": 232855424, "step": 107915 }, { "epoch": 17.605220228384994, "grad_norm": 0.062074195593595505, "learning_rate": 2.1523975090922687e-06, "loss": 0.0516, "num_input_tokens_seen": 232865760, "step": 107920 }, { "epoch": 17.606035889070146, "grad_norm": 0.13883201777935028, "learning_rate": 2.1509530387664093e-06, "loss": 0.0489, "num_input_tokens_seen": 232876256, "step": 107925 }, { "epoch": 17.6068515497553, "grad_norm": 0.07161686569452286, "learning_rate": 2.1495090315054726e-06, "loss": 0.0228, "num_input_tokens_seen": 232888416, "step": 107930 }, { "epoch": 17.607667210440457, "grad_norm": 0.24788793921470642, "learning_rate": 2.148065487338724e-06, "loss": 0.0498, "num_input_tokens_seen": 232899840, "step": 107935 }, { "epoch": 17.608482871125613, "grad_norm": 0.1959936022758484, "learning_rate": 2.1466224062954176e-06, "loss": 0.1164, "num_input_tokens_seen": 232912320, "step": 107940 }, { "epoch": 17.609298531810765, "grad_norm": 0.19279198348522186, "learning_rate": 2.1451797884047998e-06, "loss": 0.0364, "num_input_tokens_seen": 232922336, "step": 107945 }, { "epoch": 17.61011419249592, "grad_norm": 0.10033025592565536, "learning_rate": 2.143737633696105e-06, "loss": 0.0458, "num_input_tokens_seen": 232932704, "step": 107950 }, { "epoch": 17.610929853181077, "grad_norm": 0.1936500519514084, "learning_rate": 2.1422959421985662e-06, "loss": 0.1096, "num_input_tokens_seen": 232943136, "step": 107955 }, { "epoch": 17.611745513866232, "grad_norm": 0.8577785491943359, "learning_rate": 2.1408547139413954e-06, "loss": 0.1529, "num_input_tokens_seen": 232954304, "step": 107960 }, { "epoch": 17.612561174551388, "grad_norm": 0.06692688912153244, "learning_rate": 2.1394139489538e-06, "loss": 0.1539, "num_input_tokens_seen": 232965760, "step": 107965 }, { "epoch": 17.61337683523654, "grad_norm": 0.05551717430353165, "learning_rate": 2.137973647264985e-06, "loss": 0.1523, "num_input_tokens_seen": 232977472, "step": 107970 }, { "epoch": 17.614192495921696, "grad_norm": 0.16449803113937378, "learning_rate": 2.136533808904137e-06, "loss": 0.0281, "num_input_tokens_seen": 232987328, "step": 107975 }, { "epoch": 17.61500815660685, "grad_norm": 0.45083391666412354, "learning_rate": 2.1350944339004334e-06, "loss": 0.0386, "num_input_tokens_seen": 232997152, "step": 107980 }, { "epoch": 17.615823817292007, "grad_norm": 0.03697352111339569, "learning_rate": 2.1336555222830486e-06, "loss": 0.098, "num_input_tokens_seen": 233008608, "step": 107985 }, { "epoch": 17.616639477977163, "grad_norm": 1.3231072425842285, "learning_rate": 2.1322170740811415e-06, "loss": 0.0672, "num_input_tokens_seen": 233019744, "step": 107990 }, { "epoch": 17.617455138662315, "grad_norm": 0.2662774622440338, "learning_rate": 2.1307790893238616e-06, "loss": 0.0628, "num_input_tokens_seen": 233030272, "step": 107995 }, { "epoch": 17.61827079934747, "grad_norm": 2.253427505493164, "learning_rate": 2.1293415680403633e-06, "loss": 0.2495, "num_input_tokens_seen": 233040384, "step": 108000 }, { "epoch": 17.619086460032626, "grad_norm": 0.4377504885196686, "learning_rate": 2.127904510259765e-06, "loss": 0.0804, "num_input_tokens_seen": 233052096, "step": 108005 }, { "epoch": 17.619902120717782, "grad_norm": 0.056026533246040344, "learning_rate": 2.1264679160112045e-06, "loss": 0.0873, "num_input_tokens_seen": 233063200, "step": 108010 }, { "epoch": 17.620717781402938, "grad_norm": 0.11665968596935272, "learning_rate": 2.1250317853237804e-06, "loss": 0.0207, "num_input_tokens_seen": 233074528, "step": 108015 }, { "epoch": 17.62153344208809, "grad_norm": 0.04553401842713356, "learning_rate": 2.1235961182266143e-06, "loss": 0.1131, "num_input_tokens_seen": 233084608, "step": 108020 }, { "epoch": 17.622349102773246, "grad_norm": 0.07284661382436752, "learning_rate": 2.122160914748783e-06, "loss": 0.0261, "num_input_tokens_seen": 233095808, "step": 108025 }, { "epoch": 17.6231647634584, "grad_norm": 0.0905177891254425, "learning_rate": 2.1207261749193964e-06, "loss": 0.091, "num_input_tokens_seen": 233105984, "step": 108030 }, { "epoch": 17.623980424143557, "grad_norm": 0.6093387007713318, "learning_rate": 2.1192918987675057e-06, "loss": 0.0394, "num_input_tokens_seen": 233116992, "step": 108035 }, { "epoch": 17.624796084828713, "grad_norm": 0.582946240901947, "learning_rate": 2.117858086322197e-06, "loss": 0.1404, "num_input_tokens_seen": 233128256, "step": 108040 }, { "epoch": 17.625611745513865, "grad_norm": 1.5996102094650269, "learning_rate": 2.116424737612524e-06, "loss": 0.0812, "num_input_tokens_seen": 233139840, "step": 108045 }, { "epoch": 17.62642740619902, "grad_norm": 0.0698971152305603, "learning_rate": 2.1149918526675303e-06, "loss": 0.2058, "num_input_tokens_seen": 233151296, "step": 108050 }, { "epoch": 17.627243066884176, "grad_norm": 0.6090481281280518, "learning_rate": 2.11355943151626e-06, "loss": 0.2048, "num_input_tokens_seen": 233162720, "step": 108055 }, { "epoch": 17.628058727569332, "grad_norm": 0.13872374594211578, "learning_rate": 2.112127474187742e-06, "loss": 0.0217, "num_input_tokens_seen": 233173216, "step": 108060 }, { "epoch": 17.628874388254488, "grad_norm": 1.0761200189590454, "learning_rate": 2.1106959807109946e-06, "loss": 0.0199, "num_input_tokens_seen": 233183008, "step": 108065 }, { "epoch": 17.62969004893964, "grad_norm": 2.1111457347869873, "learning_rate": 2.1092649511150308e-06, "loss": 0.1232, "num_input_tokens_seen": 233193472, "step": 108070 }, { "epoch": 17.630505709624796, "grad_norm": 2.62716007232666, "learning_rate": 2.1078343854288503e-06, "loss": 0.0887, "num_input_tokens_seen": 233202656, "step": 108075 }, { "epoch": 17.63132137030995, "grad_norm": 0.058096207678318024, "learning_rate": 2.1064042836814484e-06, "loss": 0.0123, "num_input_tokens_seen": 233213152, "step": 108080 }, { "epoch": 17.632137030995107, "grad_norm": 1.7980365753173828, "learning_rate": 2.1049746459018055e-06, "loss": 0.0871, "num_input_tokens_seen": 233224608, "step": 108085 }, { "epoch": 17.63295269168026, "grad_norm": 0.08569557219743729, "learning_rate": 2.103545472118898e-06, "loss": 0.0139, "num_input_tokens_seen": 233236384, "step": 108090 }, { "epoch": 17.633768352365415, "grad_norm": 1.81427001953125, "learning_rate": 2.1021167623616862e-06, "loss": 0.2138, "num_input_tokens_seen": 233246624, "step": 108095 }, { "epoch": 17.63458401305057, "grad_norm": 0.2451142519712448, "learning_rate": 2.1006885166591245e-06, "loss": 0.0165, "num_input_tokens_seen": 233258240, "step": 108100 }, { "epoch": 17.635399673735726, "grad_norm": 0.0794488787651062, "learning_rate": 2.099260735040162e-06, "loss": 0.0069, "num_input_tokens_seen": 233268992, "step": 108105 }, { "epoch": 17.636215334420882, "grad_norm": 0.1943291872739792, "learning_rate": 2.0978334175337316e-06, "loss": 0.0351, "num_input_tokens_seen": 233279072, "step": 108110 }, { "epoch": 17.637030995106034, "grad_norm": 0.022812869399785995, "learning_rate": 2.0964065641687625e-06, "loss": 0.1602, "num_input_tokens_seen": 233290752, "step": 108115 }, { "epoch": 17.63784665579119, "grad_norm": 0.05906094238162041, "learning_rate": 2.0949801749741676e-06, "loss": 0.014, "num_input_tokens_seen": 233301216, "step": 108120 }, { "epoch": 17.638662316476346, "grad_norm": 1.4504395723342896, "learning_rate": 2.0935542499788572e-06, "loss": 0.2289, "num_input_tokens_seen": 233312096, "step": 108125 }, { "epoch": 17.6394779771615, "grad_norm": 0.06450282782316208, "learning_rate": 2.092128789211728e-06, "loss": 0.1431, "num_input_tokens_seen": 233323520, "step": 108130 }, { "epoch": 17.640293637846657, "grad_norm": 2.0369389057159424, "learning_rate": 2.09070379270167e-06, "loss": 0.1661, "num_input_tokens_seen": 233334240, "step": 108135 }, { "epoch": 17.64110929853181, "grad_norm": 0.18331602215766907, "learning_rate": 2.0892792604775587e-06, "loss": 0.1408, "num_input_tokens_seen": 233345024, "step": 108140 }, { "epoch": 17.641924959216965, "grad_norm": 0.03100716881453991, "learning_rate": 2.0878551925682694e-06, "loss": 0.0493, "num_input_tokens_seen": 233356640, "step": 108145 }, { "epoch": 17.64274061990212, "grad_norm": 0.1389753818511963, "learning_rate": 2.0864315890026633e-06, "loss": 0.019, "num_input_tokens_seen": 233367360, "step": 108150 }, { "epoch": 17.643556280587276, "grad_norm": 0.027701156213879585, "learning_rate": 2.085008449809586e-06, "loss": 0.2306, "num_input_tokens_seen": 233377856, "step": 108155 }, { "epoch": 17.644371941272432, "grad_norm": 0.03652791306376457, "learning_rate": 2.083585775017885e-06, "loss": 0.0172, "num_input_tokens_seen": 233388256, "step": 108160 }, { "epoch": 17.645187601957584, "grad_norm": 1.562423586845398, "learning_rate": 2.0821635646563892e-06, "loss": 0.1556, "num_input_tokens_seen": 233399264, "step": 108165 }, { "epoch": 17.64600326264274, "grad_norm": 2.3265645503997803, "learning_rate": 2.080741818753923e-06, "loss": 0.2414, "num_input_tokens_seen": 233410816, "step": 108170 }, { "epoch": 17.646818923327896, "grad_norm": 0.07713916897773743, "learning_rate": 2.0793205373392964e-06, "loss": 0.135, "num_input_tokens_seen": 233420352, "step": 108175 }, { "epoch": 17.64763458401305, "grad_norm": 0.40290287137031555, "learning_rate": 2.0778997204413176e-06, "loss": 0.043, "num_input_tokens_seen": 233431584, "step": 108180 }, { "epoch": 17.648450244698207, "grad_norm": 0.8570610880851746, "learning_rate": 2.0764793680887797e-06, "loss": 0.1493, "num_input_tokens_seen": 233443616, "step": 108185 }, { "epoch": 17.64926590538336, "grad_norm": 1.0976938009262085, "learning_rate": 2.075059480310468e-06, "loss": 0.0241, "num_input_tokens_seen": 233455264, "step": 108190 }, { "epoch": 17.650081566068515, "grad_norm": 0.056821081787347794, "learning_rate": 2.0736400571351592e-06, "loss": 0.1529, "num_input_tokens_seen": 233465344, "step": 108195 }, { "epoch": 17.65089722675367, "grad_norm": 0.061530936509370804, "learning_rate": 2.0722210985916173e-06, "loss": 0.0213, "num_input_tokens_seen": 233476768, "step": 108200 }, { "epoch": 17.651712887438826, "grad_norm": 0.025501539930701256, "learning_rate": 2.0708026047085992e-06, "loss": 0.0334, "num_input_tokens_seen": 233486496, "step": 108205 }, { "epoch": 17.652528548123982, "grad_norm": 1.2224338054656982, "learning_rate": 2.0693845755148572e-06, "loss": 0.197, "num_input_tokens_seen": 233497248, "step": 108210 }, { "epoch": 17.653344208809134, "grad_norm": 0.7287511825561523, "learning_rate": 2.0679670110391264e-06, "loss": 0.0864, "num_input_tokens_seen": 233507424, "step": 108215 }, { "epoch": 17.65415986949429, "grad_norm": 0.15604931116104126, "learning_rate": 2.066549911310134e-06, "loss": 0.0809, "num_input_tokens_seen": 233518752, "step": 108220 }, { "epoch": 17.654975530179446, "grad_norm": 0.15671709179878235, "learning_rate": 2.0651332763566012e-06, "loss": 0.0507, "num_input_tokens_seen": 233527904, "step": 108225 }, { "epoch": 17.6557911908646, "grad_norm": 0.1024131253361702, "learning_rate": 2.0637171062072358e-06, "loss": 0.0725, "num_input_tokens_seen": 233538336, "step": 108230 }, { "epoch": 17.656606851549757, "grad_norm": 1.5355515480041504, "learning_rate": 2.0623014008907425e-06, "loss": 0.0762, "num_input_tokens_seen": 233549344, "step": 108235 }, { "epoch": 17.65742251223491, "grad_norm": 0.8552919626235962, "learning_rate": 2.0608861604358097e-06, "loss": 0.0811, "num_input_tokens_seen": 233560320, "step": 108240 }, { "epoch": 17.658238172920065, "grad_norm": 2.156585216522217, "learning_rate": 2.0594713848711174e-06, "loss": 0.3422, "num_input_tokens_seen": 233572224, "step": 108245 }, { "epoch": 17.65905383360522, "grad_norm": 2.4385502338409424, "learning_rate": 2.0580570742253364e-06, "loss": 0.2598, "num_input_tokens_seen": 233580480, "step": 108250 }, { "epoch": 17.659869494290376, "grad_norm": 1.6661714315414429, "learning_rate": 2.0566432285271412e-06, "loss": 0.0694, "num_input_tokens_seen": 233592128, "step": 108255 }, { "epoch": 17.660685154975532, "grad_norm": 1.6455833911895752, "learning_rate": 2.055229847805168e-06, "loss": 0.0639, "num_input_tokens_seen": 233602496, "step": 108260 }, { "epoch": 17.661500815660684, "grad_norm": 1.9146851301193237, "learning_rate": 2.053816932088079e-06, "loss": 0.1147, "num_input_tokens_seen": 233612384, "step": 108265 }, { "epoch": 17.66231647634584, "grad_norm": 0.07188332825899124, "learning_rate": 2.052404481404488e-06, "loss": 0.0836, "num_input_tokens_seen": 233623616, "step": 108270 }, { "epoch": 17.663132137030995, "grad_norm": 0.6737003326416016, "learning_rate": 2.0509924957830412e-06, "loss": 0.0859, "num_input_tokens_seen": 233633568, "step": 108275 }, { "epoch": 17.66394779771615, "grad_norm": 0.4404866695404053, "learning_rate": 2.0495809752523383e-06, "loss": 0.0942, "num_input_tokens_seen": 233645152, "step": 108280 }, { "epoch": 17.664763458401303, "grad_norm": 0.046625446528196335, "learning_rate": 2.048169919840992e-06, "loss": 0.0939, "num_input_tokens_seen": 233656224, "step": 108285 }, { "epoch": 17.66557911908646, "grad_norm": 0.7911885976791382, "learning_rate": 2.046759329577602e-06, "loss": 0.074, "num_input_tokens_seen": 233665152, "step": 108290 }, { "epoch": 17.666394779771615, "grad_norm": 0.3151647746562958, "learning_rate": 2.0453492044907513e-06, "loss": 0.0223, "num_input_tokens_seen": 233676096, "step": 108295 }, { "epoch": 17.66721044045677, "grad_norm": 0.8335559964179993, "learning_rate": 2.0439395446090166e-06, "loss": 0.0371, "num_input_tokens_seen": 233686944, "step": 108300 }, { "epoch": 17.668026101141926, "grad_norm": 1.3185149431228638, "learning_rate": 2.0425303499609722e-06, "loss": 0.113, "num_input_tokens_seen": 233697312, "step": 108305 }, { "epoch": 17.66884176182708, "grad_norm": 0.15320292115211487, "learning_rate": 2.0411216205751734e-06, "loss": 0.0664, "num_input_tokens_seen": 233708224, "step": 108310 }, { "epoch": 17.669657422512234, "grad_norm": 0.02644149214029312, "learning_rate": 2.039713356480169e-06, "loss": 0.1256, "num_input_tokens_seen": 233719456, "step": 108315 }, { "epoch": 17.67047308319739, "grad_norm": 0.0884668305516243, "learning_rate": 2.0383055577045008e-06, "loss": 0.0944, "num_input_tokens_seen": 233730080, "step": 108320 }, { "epoch": 17.671288743882545, "grad_norm": 0.18754491209983826, "learning_rate": 2.036898224276701e-06, "loss": 0.0266, "num_input_tokens_seen": 233740224, "step": 108325 }, { "epoch": 17.6721044045677, "grad_norm": 0.2157984972000122, "learning_rate": 2.035491356225286e-06, "loss": 0.0427, "num_input_tokens_seen": 233750784, "step": 108330 }, { "epoch": 17.672920065252853, "grad_norm": 0.030594350770115852, "learning_rate": 2.0340849535787743e-06, "loss": 0.2127, "num_input_tokens_seen": 233762336, "step": 108335 }, { "epoch": 17.67373572593801, "grad_norm": 0.045725271105766296, "learning_rate": 2.0326790163656655e-06, "loss": 0.141, "num_input_tokens_seen": 233772768, "step": 108340 }, { "epoch": 17.674551386623165, "grad_norm": 2.021209955215454, "learning_rate": 2.031273544614451e-06, "loss": 0.1779, "num_input_tokens_seen": 233783296, "step": 108345 }, { "epoch": 17.67536704730832, "grad_norm": 1.8752676248550415, "learning_rate": 2.0298685383536158e-06, "loss": 0.2255, "num_input_tokens_seen": 233794368, "step": 108350 }, { "epoch": 17.676182707993476, "grad_norm": 0.11954125016927719, "learning_rate": 2.028463997611632e-06, "loss": 0.1695, "num_input_tokens_seen": 233804960, "step": 108355 }, { "epoch": 17.67699836867863, "grad_norm": 0.06814074516296387, "learning_rate": 2.027059922416974e-06, "loss": 0.1491, "num_input_tokens_seen": 233816640, "step": 108360 }, { "epoch": 17.677814029363784, "grad_norm": 0.6744316816329956, "learning_rate": 2.02565631279808e-06, "loss": 0.0476, "num_input_tokens_seen": 233828256, "step": 108365 }, { "epoch": 17.67862969004894, "grad_norm": 0.20685580372810364, "learning_rate": 2.024253168783416e-06, "loss": 0.0973, "num_input_tokens_seen": 233838688, "step": 108370 }, { "epoch": 17.679445350734095, "grad_norm": 2.3535079956054688, "learning_rate": 2.022850490401401e-06, "loss": 0.2893, "num_input_tokens_seen": 233849472, "step": 108375 }, { "epoch": 17.68026101141925, "grad_norm": 0.3225003778934479, "learning_rate": 2.0214482776804766e-06, "loss": 0.0766, "num_input_tokens_seen": 233860032, "step": 108380 }, { "epoch": 17.681076672104403, "grad_norm": 0.05676814913749695, "learning_rate": 2.0200465306490447e-06, "loss": 0.0899, "num_input_tokens_seen": 233870656, "step": 108385 }, { "epoch": 17.68189233278956, "grad_norm": 1.0928947925567627, "learning_rate": 2.0186452493355294e-06, "loss": 0.0499, "num_input_tokens_seen": 233882848, "step": 108390 }, { "epoch": 17.682707993474715, "grad_norm": 0.49405136704444885, "learning_rate": 2.0172444337683144e-06, "loss": 0.0149, "num_input_tokens_seen": 233893696, "step": 108395 }, { "epoch": 17.68352365415987, "grad_norm": 0.2937970757484436, "learning_rate": 2.015844083975807e-06, "loss": 0.0192, "num_input_tokens_seen": 233904992, "step": 108400 }, { "epoch": 17.684339314845026, "grad_norm": 1.7464656829833984, "learning_rate": 2.014444199986365e-06, "loss": 0.2145, "num_input_tokens_seen": 233915296, "step": 108405 }, { "epoch": 17.68515497553018, "grad_norm": 2.9181132316589355, "learning_rate": 2.0130447818283827e-06, "loss": 0.1251, "num_input_tokens_seen": 233925344, "step": 108410 }, { "epoch": 17.685970636215334, "grad_norm": 0.10175570845603943, "learning_rate": 2.011645829530198e-06, "loss": 0.072, "num_input_tokens_seen": 233937440, "step": 108415 }, { "epoch": 17.68678629690049, "grad_norm": 1.6623297929763794, "learning_rate": 2.0102473431201806e-06, "loss": 0.0634, "num_input_tokens_seen": 233947264, "step": 108420 }, { "epoch": 17.687601957585645, "grad_norm": 0.01685185916721821, "learning_rate": 2.00884932262666e-06, "loss": 0.1658, "num_input_tokens_seen": 233958432, "step": 108425 }, { "epoch": 17.6884176182708, "grad_norm": 0.5834993720054626, "learning_rate": 2.007451768077975e-06, "loss": 0.1289, "num_input_tokens_seen": 233969728, "step": 108430 }, { "epoch": 17.689233278955953, "grad_norm": 0.03301825374364853, "learning_rate": 2.0060546795024504e-06, "loss": 0.1044, "num_input_tokens_seen": 233980832, "step": 108435 }, { "epoch": 17.69004893964111, "grad_norm": 0.0522066168487072, "learning_rate": 2.004658056928399e-06, "loss": 0.0402, "num_input_tokens_seen": 233990144, "step": 108440 }, { "epoch": 17.690864600326265, "grad_norm": 1.0690157413482666, "learning_rate": 2.0032619003841208e-06, "loss": 0.0367, "num_input_tokens_seen": 234000544, "step": 108445 }, { "epoch": 17.69168026101142, "grad_norm": 1.5792675018310547, "learning_rate": 2.001866209897915e-06, "loss": 0.1042, "num_input_tokens_seen": 234011488, "step": 108450 }, { "epoch": 17.692495921696576, "grad_norm": 0.21893560886383057, "learning_rate": 2.0004709854980654e-06, "loss": 0.1358, "num_input_tokens_seen": 234022976, "step": 108455 }, { "epoch": 17.693311582381728, "grad_norm": 1.649261713027954, "learning_rate": 1.9990762272128483e-06, "loss": 0.1122, "num_input_tokens_seen": 234034528, "step": 108460 }, { "epoch": 17.694127243066884, "grad_norm": 0.05479404330253601, "learning_rate": 1.9976819350705305e-06, "loss": 0.0521, "num_input_tokens_seen": 234045984, "step": 108465 }, { "epoch": 17.69494290375204, "grad_norm": 1.1259756088256836, "learning_rate": 1.9962881090993674e-06, "loss": 0.0456, "num_input_tokens_seen": 234056064, "step": 108470 }, { "epoch": 17.695758564437195, "grad_norm": 0.30676308274269104, "learning_rate": 1.994894749327611e-06, "loss": 0.0406, "num_input_tokens_seen": 234066848, "step": 108475 }, { "epoch": 17.696574225122347, "grad_norm": 0.1693122833967209, "learning_rate": 1.993501855783489e-06, "loss": 0.0955, "num_input_tokens_seen": 234078176, "step": 108480 }, { "epoch": 17.697389885807503, "grad_norm": 0.9066792130470276, "learning_rate": 1.992109428495248e-06, "loss": 0.1336, "num_input_tokens_seen": 234089408, "step": 108485 }, { "epoch": 17.69820554649266, "grad_norm": 1.5073901414871216, "learning_rate": 1.990717467491088e-06, "loss": 0.0405, "num_input_tokens_seen": 234099840, "step": 108490 }, { "epoch": 17.699021207177815, "grad_norm": 3.1930980682373047, "learning_rate": 1.9893259727992357e-06, "loss": 0.203, "num_input_tokens_seen": 234110208, "step": 108495 }, { "epoch": 17.69983686786297, "grad_norm": 0.5703881978988647, "learning_rate": 1.9879349444478778e-06, "loss": 0.2224, "num_input_tokens_seen": 234119552, "step": 108500 }, { "epoch": 17.700652528548122, "grad_norm": 0.2970077097415924, "learning_rate": 1.9865443824652166e-06, "loss": 0.0818, "num_input_tokens_seen": 234130720, "step": 108505 }, { "epoch": 17.701468189233278, "grad_norm": 1.4602330923080444, "learning_rate": 1.9851542868794206e-06, "loss": 0.0633, "num_input_tokens_seen": 234141280, "step": 108510 }, { "epoch": 17.702283849918434, "grad_norm": 0.5703585147857666, "learning_rate": 1.9837646577186786e-06, "loss": 0.1864, "num_input_tokens_seen": 234153280, "step": 108515 }, { "epoch": 17.70309951060359, "grad_norm": 0.2834724485874176, "learning_rate": 1.982375495011135e-06, "loss": 0.1518, "num_input_tokens_seen": 234163488, "step": 108520 }, { "epoch": 17.703915171288745, "grad_norm": 0.19979079067707062, "learning_rate": 1.980986798784962e-06, "loss": 0.0917, "num_input_tokens_seen": 234175360, "step": 108525 }, { "epoch": 17.704730831973897, "grad_norm": 2.3218257427215576, "learning_rate": 1.9795985690682834e-06, "loss": 0.0842, "num_input_tokens_seen": 234185920, "step": 108530 }, { "epoch": 17.705546492659053, "grad_norm": 0.10337413102388382, "learning_rate": 1.9782108058892496e-06, "loss": 0.0396, "num_input_tokens_seen": 234197376, "step": 108535 }, { "epoch": 17.70636215334421, "grad_norm": 0.17493084073066711, "learning_rate": 1.976823509275974e-06, "loss": 0.0635, "num_input_tokens_seen": 234207488, "step": 108540 }, { "epoch": 17.707177814029365, "grad_norm": 0.721642255783081, "learning_rate": 1.9754366792565844e-06, "loss": 0.0442, "num_input_tokens_seen": 234219680, "step": 108545 }, { "epoch": 17.70799347471452, "grad_norm": 6.459913730621338, "learning_rate": 1.974050315859169e-06, "loss": 0.2946, "num_input_tokens_seen": 234230048, "step": 108550 }, { "epoch": 17.708809135399672, "grad_norm": 1.2646273374557495, "learning_rate": 1.9726644191118443e-06, "loss": 0.1674, "num_input_tokens_seen": 234240992, "step": 108555 }, { "epoch": 17.709624796084828, "grad_norm": 1.1411586999893188, "learning_rate": 1.971278989042677e-06, "loss": 0.1022, "num_input_tokens_seen": 234251936, "step": 108560 }, { "epoch": 17.710440456769984, "grad_norm": 0.7064744830131531, "learning_rate": 1.969894025679761e-06, "loss": 0.2281, "num_input_tokens_seen": 234261664, "step": 108565 }, { "epoch": 17.71125611745514, "grad_norm": 0.08611917495727539, "learning_rate": 1.96850952905116e-06, "loss": 0.0654, "num_input_tokens_seen": 234271808, "step": 108570 }, { "epoch": 17.712071778140295, "grad_norm": 1.8662424087524414, "learning_rate": 1.9671254991849298e-06, "loss": 0.0699, "num_input_tokens_seen": 234281984, "step": 108575 }, { "epoch": 17.712887438825447, "grad_norm": 0.05791318789124489, "learning_rate": 1.9657419361091196e-06, "loss": 0.1462, "num_input_tokens_seen": 234293568, "step": 108580 }, { "epoch": 17.713703099510603, "grad_norm": 0.09444610029459, "learning_rate": 1.964358839851771e-06, "loss": 0.1311, "num_input_tokens_seen": 234304480, "step": 108585 }, { "epoch": 17.71451876019576, "grad_norm": 0.576461911201477, "learning_rate": 1.9629762104409115e-06, "loss": 0.0983, "num_input_tokens_seen": 234314656, "step": 108590 }, { "epoch": 17.715334420880914, "grad_norm": 1.3275270462036133, "learning_rate": 1.9615940479045665e-06, "loss": 0.0417, "num_input_tokens_seen": 234325760, "step": 108595 }, { "epoch": 17.71615008156607, "grad_norm": 1.7870744466781616, "learning_rate": 1.9602123522707403e-06, "loss": 0.155, "num_input_tokens_seen": 234336064, "step": 108600 }, { "epoch": 17.716965742251222, "grad_norm": 0.09276753664016724, "learning_rate": 1.9588311235674423e-06, "loss": 0.0132, "num_input_tokens_seen": 234348000, "step": 108605 }, { "epoch": 17.717781402936378, "grad_norm": 0.030637621879577637, "learning_rate": 1.95745036182266e-06, "loss": 0.0633, "num_input_tokens_seen": 234359232, "step": 108610 }, { "epoch": 17.718597063621534, "grad_norm": 0.029835805296897888, "learning_rate": 1.9560700670643806e-06, "loss": 0.0289, "num_input_tokens_seen": 234369056, "step": 108615 }, { "epoch": 17.71941272430669, "grad_norm": 0.11935653537511826, "learning_rate": 1.9546902393205697e-06, "loss": 0.1705, "num_input_tokens_seen": 234379904, "step": 108620 }, { "epoch": 17.72022838499184, "grad_norm": 0.05614151805639267, "learning_rate": 1.9533108786191966e-06, "loss": 0.0986, "num_input_tokens_seen": 234390272, "step": 108625 }, { "epoch": 17.721044045676997, "grad_norm": 0.37741899490356445, "learning_rate": 1.95193198498822e-06, "loss": 0.126, "num_input_tokens_seen": 234401792, "step": 108630 }, { "epoch": 17.721859706362153, "grad_norm": 0.11470960080623627, "learning_rate": 1.9505535584555727e-06, "loss": 0.0747, "num_input_tokens_seen": 234412736, "step": 108635 }, { "epoch": 17.72267536704731, "grad_norm": 0.11143414676189423, "learning_rate": 1.949175599049208e-06, "loss": 0.2536, "num_input_tokens_seen": 234423840, "step": 108640 }, { "epoch": 17.723491027732464, "grad_norm": 1.1738312244415283, "learning_rate": 1.947798106797033e-06, "loss": 0.1812, "num_input_tokens_seen": 234434848, "step": 108645 }, { "epoch": 17.724306688417617, "grad_norm": 0.3962123394012451, "learning_rate": 1.946421081726982e-06, "loss": 0.1328, "num_input_tokens_seen": 234444672, "step": 108650 }, { "epoch": 17.725122349102772, "grad_norm": 0.06585246324539185, "learning_rate": 1.9450445238669427e-06, "loss": 0.1362, "num_input_tokens_seen": 234455968, "step": 108655 }, { "epoch": 17.725938009787928, "grad_norm": 0.14904403686523438, "learning_rate": 1.943668433244833e-06, "loss": 0.1925, "num_input_tokens_seen": 234467392, "step": 108660 }, { "epoch": 17.726753670473084, "grad_norm": 0.39062851667404175, "learning_rate": 1.942292809888524e-06, "loss": 0.1094, "num_input_tokens_seen": 234478272, "step": 108665 }, { "epoch": 17.72756933115824, "grad_norm": 0.43477150797843933, "learning_rate": 1.940917653825908e-06, "loss": 0.099, "num_input_tokens_seen": 234488928, "step": 108670 }, { "epoch": 17.72838499184339, "grad_norm": 1.9316716194152832, "learning_rate": 1.9395429650848423e-06, "loss": 0.1184, "num_input_tokens_seen": 234500128, "step": 108675 }, { "epoch": 17.729200652528547, "grad_norm": 1.0363715887069702, "learning_rate": 1.9381687436931945e-06, "loss": 0.1587, "num_input_tokens_seen": 234509568, "step": 108680 }, { "epoch": 17.730016313213703, "grad_norm": 1.5518510341644287, "learning_rate": 1.9367949896788166e-06, "loss": 0.1163, "num_input_tokens_seen": 234520480, "step": 108685 }, { "epoch": 17.73083197389886, "grad_norm": 0.06771368533372879, "learning_rate": 1.9354217030695424e-06, "loss": 0.224, "num_input_tokens_seen": 234531520, "step": 108690 }, { "epoch": 17.731647634584014, "grad_norm": 0.02582055889070034, "learning_rate": 1.934048883893208e-06, "loss": 0.0309, "num_input_tokens_seen": 234543296, "step": 108695 }, { "epoch": 17.732463295269167, "grad_norm": 0.23254717886447906, "learning_rate": 1.9326765321776352e-06, "loss": 0.0385, "num_input_tokens_seen": 234555232, "step": 108700 }, { "epoch": 17.733278955954322, "grad_norm": 0.5823872089385986, "learning_rate": 1.9313046479506353e-06, "loss": 0.0547, "num_input_tokens_seen": 234566528, "step": 108705 }, { "epoch": 17.734094616639478, "grad_norm": 2.6301891803741455, "learning_rate": 1.929933231240011e-06, "loss": 0.0665, "num_input_tokens_seen": 234578208, "step": 108710 }, { "epoch": 17.734910277324634, "grad_norm": 0.06653955578804016, "learning_rate": 1.9285622820735566e-06, "loss": 0.0197, "num_input_tokens_seen": 234589120, "step": 108715 }, { "epoch": 17.73572593800979, "grad_norm": 1.1944875717163086, "learning_rate": 1.927191800479056e-06, "loss": 0.119, "num_input_tokens_seen": 234599392, "step": 108720 }, { "epoch": 17.73654159869494, "grad_norm": 1.2028480768203735, "learning_rate": 1.925821786484283e-06, "loss": 0.0609, "num_input_tokens_seen": 234611360, "step": 108725 }, { "epoch": 17.737357259380097, "grad_norm": 0.35082170367240906, "learning_rate": 1.9244522401170027e-06, "loss": 0.1973, "num_input_tokens_seen": 234622528, "step": 108730 }, { "epoch": 17.738172920065253, "grad_norm": 0.32091087102890015, "learning_rate": 1.9230831614049703e-06, "loss": 0.1508, "num_input_tokens_seen": 234634784, "step": 108735 }, { "epoch": 17.73898858075041, "grad_norm": 0.03297505900263786, "learning_rate": 1.9217145503759333e-06, "loss": 0.0647, "num_input_tokens_seen": 234645568, "step": 108740 }, { "epoch": 17.739804241435564, "grad_norm": 0.06771441549062729, "learning_rate": 1.92034640705763e-06, "loss": 0.0151, "num_input_tokens_seen": 234656800, "step": 108745 }, { "epoch": 17.740619902120716, "grad_norm": 0.19773687422275543, "learning_rate": 1.918978731477783e-06, "loss": 0.0551, "num_input_tokens_seen": 234667840, "step": 108750 }, { "epoch": 17.741435562805872, "grad_norm": 0.06351426243782043, "learning_rate": 1.9176115236641145e-06, "loss": 0.079, "num_input_tokens_seen": 234679616, "step": 108755 }, { "epoch": 17.742251223491028, "grad_norm": 0.09417407959699631, "learning_rate": 1.9162447836443277e-06, "loss": 0.1113, "num_input_tokens_seen": 234690496, "step": 108760 }, { "epoch": 17.743066884176184, "grad_norm": 0.20899029076099396, "learning_rate": 1.9148785114461278e-06, "loss": 0.0095, "num_input_tokens_seen": 234701600, "step": 108765 }, { "epoch": 17.74388254486134, "grad_norm": 0.9201071858406067, "learning_rate": 1.9135127070971924e-06, "loss": 0.0885, "num_input_tokens_seen": 234711648, "step": 108770 }, { "epoch": 17.74469820554649, "grad_norm": 0.391348659992218, "learning_rate": 1.9121473706252196e-06, "loss": 0.099, "num_input_tokens_seen": 234723040, "step": 108775 }, { "epoch": 17.745513866231647, "grad_norm": 0.1136215403676033, "learning_rate": 1.910782502057862e-06, "loss": 0.0243, "num_input_tokens_seen": 234733216, "step": 108780 }, { "epoch": 17.746329526916803, "grad_norm": 0.1118127852678299, "learning_rate": 1.909418101422791e-06, "loss": 0.0096, "num_input_tokens_seen": 234742976, "step": 108785 }, { "epoch": 17.74714518760196, "grad_norm": 0.11002842336893082, "learning_rate": 1.908054168747653e-06, "loss": 0.0074, "num_input_tokens_seen": 234754112, "step": 108790 }, { "epoch": 17.747960848287114, "grad_norm": 0.5510108470916748, "learning_rate": 1.9066907040600934e-06, "loss": 0.0608, "num_input_tokens_seen": 234764960, "step": 108795 }, { "epoch": 17.748776508972266, "grad_norm": 0.03819489851593971, "learning_rate": 1.9053277073877412e-06, "loss": 0.1133, "num_input_tokens_seen": 234775936, "step": 108800 }, { "epoch": 17.749592169657422, "grad_norm": 2.039001941680908, "learning_rate": 1.903965178758224e-06, "loss": 0.1295, "num_input_tokens_seen": 234787392, "step": 108805 }, { "epoch": 17.750407830342578, "grad_norm": 1.8007038831710815, "learning_rate": 1.9026031181991477e-06, "loss": 0.18, "num_input_tokens_seen": 234799200, "step": 108810 }, { "epoch": 17.751223491027734, "grad_norm": 0.2139073610305786, "learning_rate": 1.9012415257381232e-06, "loss": 0.0785, "num_input_tokens_seen": 234811200, "step": 108815 }, { "epoch": 17.752039151712886, "grad_norm": 0.5369565486907959, "learning_rate": 1.89988040140274e-06, "loss": 0.0618, "num_input_tokens_seen": 234822848, "step": 108820 }, { "epoch": 17.75285481239804, "grad_norm": 1.5432440042495728, "learning_rate": 1.8985197452205867e-06, "loss": 0.0578, "num_input_tokens_seen": 234833984, "step": 108825 }, { "epoch": 17.753670473083197, "grad_norm": 0.13260212540626526, "learning_rate": 1.8971595572192358e-06, "loss": 0.0184, "num_input_tokens_seen": 234844448, "step": 108830 }, { "epoch": 17.754486133768353, "grad_norm": 2.2035250663757324, "learning_rate": 1.8957998374262542e-06, "loss": 0.1093, "num_input_tokens_seen": 234855712, "step": 108835 }, { "epoch": 17.75530179445351, "grad_norm": 2.2141666412353516, "learning_rate": 1.8944405858692004e-06, "loss": 0.0938, "num_input_tokens_seen": 234867648, "step": 108840 }, { "epoch": 17.75611745513866, "grad_norm": 0.049774833023548126, "learning_rate": 1.8930818025756192e-06, "loss": 0.2299, "num_input_tokens_seen": 234879456, "step": 108845 }, { "epoch": 17.756933115823816, "grad_norm": 0.10576783865690231, "learning_rate": 1.8917234875730493e-06, "loss": 0.0732, "num_input_tokens_seen": 234890304, "step": 108850 }, { "epoch": 17.757748776508972, "grad_norm": 1.203880786895752, "learning_rate": 1.8903656408890164e-06, "loss": 0.2107, "num_input_tokens_seen": 234900480, "step": 108855 }, { "epoch": 17.758564437194128, "grad_norm": 0.2211589366197586, "learning_rate": 1.8890082625510398e-06, "loss": 0.1461, "num_input_tokens_seen": 234910400, "step": 108860 }, { "epoch": 17.759380097879284, "grad_norm": 0.11467738449573517, "learning_rate": 1.8876513525866307e-06, "loss": 0.0385, "num_input_tokens_seen": 234920000, "step": 108865 }, { "epoch": 17.760195758564436, "grad_norm": 0.18911512196063995, "learning_rate": 1.8862949110232842e-06, "loss": 0.0653, "num_input_tokens_seen": 234930432, "step": 108870 }, { "epoch": 17.76101141924959, "grad_norm": 1.5722410678863525, "learning_rate": 1.8849389378884974e-06, "loss": 0.0717, "num_input_tokens_seen": 234941024, "step": 108875 }, { "epoch": 17.761827079934747, "grad_norm": 0.1320784091949463, "learning_rate": 1.883583433209743e-06, "loss": 0.1417, "num_input_tokens_seen": 234951552, "step": 108880 }, { "epoch": 17.762642740619903, "grad_norm": 0.40239498019218445, "learning_rate": 1.882228397014496e-06, "loss": 0.0124, "num_input_tokens_seen": 234961856, "step": 108885 }, { "epoch": 17.76345840130506, "grad_norm": 0.17032507061958313, "learning_rate": 1.8808738293302153e-06, "loss": 0.1123, "num_input_tokens_seen": 234972576, "step": 108890 }, { "epoch": 17.76427406199021, "grad_norm": 0.1298917680978775, "learning_rate": 1.879519730184362e-06, "loss": 0.0994, "num_input_tokens_seen": 234984672, "step": 108895 }, { "epoch": 17.765089722675366, "grad_norm": 0.044308193027973175, "learning_rate": 1.8781660996043644e-06, "loss": 0.0517, "num_input_tokens_seen": 234995584, "step": 108900 }, { "epoch": 17.765905383360522, "grad_norm": 0.5416932702064514, "learning_rate": 1.8768129376176723e-06, "loss": 0.1127, "num_input_tokens_seen": 235005120, "step": 108905 }, { "epoch": 17.766721044045678, "grad_norm": 0.05301477015018463, "learning_rate": 1.8754602442516894e-06, "loss": 0.245, "num_input_tokens_seen": 235017792, "step": 108910 }, { "epoch": 17.767536704730833, "grad_norm": 0.49725326895713806, "learning_rate": 1.8741080195338433e-06, "loss": 0.0921, "num_input_tokens_seen": 235028704, "step": 108915 }, { "epoch": 17.768352365415986, "grad_norm": 1.218611478805542, "learning_rate": 1.8727562634915374e-06, "loss": 0.2669, "num_input_tokens_seen": 235039008, "step": 108920 }, { "epoch": 17.76916802610114, "grad_norm": 0.3029075562953949, "learning_rate": 1.8714049761521663e-06, "loss": 0.0786, "num_input_tokens_seen": 235049344, "step": 108925 }, { "epoch": 17.769983686786297, "grad_norm": 0.20998896658420563, "learning_rate": 1.8700541575431107e-06, "loss": 0.0361, "num_input_tokens_seen": 235060320, "step": 108930 }, { "epoch": 17.770799347471453, "grad_norm": 0.7080431580543518, "learning_rate": 1.8687038076917518e-06, "loss": 0.1503, "num_input_tokens_seen": 235071904, "step": 108935 }, { "epoch": 17.77161500815661, "grad_norm": 1.3543378114700317, "learning_rate": 1.8673539266254536e-06, "loss": 0.062, "num_input_tokens_seen": 235081952, "step": 108940 }, { "epoch": 17.77243066884176, "grad_norm": 0.04171782359480858, "learning_rate": 1.8660045143715748e-06, "loss": 0.0223, "num_input_tokens_seen": 235092832, "step": 108945 }, { "epoch": 17.773246329526916, "grad_norm": 0.5493501424789429, "learning_rate": 1.864655570957463e-06, "loss": 0.1342, "num_input_tokens_seen": 235104768, "step": 108950 }, { "epoch": 17.774061990212072, "grad_norm": 1.1405034065246582, "learning_rate": 1.863307096410455e-06, "loss": 0.138, "num_input_tokens_seen": 235116032, "step": 108955 }, { "epoch": 17.774877650897228, "grad_norm": 0.7272151708602905, "learning_rate": 1.8619590907578782e-06, "loss": 0.0386, "num_input_tokens_seen": 235127456, "step": 108960 }, { "epoch": 17.775693311582383, "grad_norm": 1.2443233728408813, "learning_rate": 1.860611554027053e-06, "loss": 0.0768, "num_input_tokens_seen": 235137824, "step": 108965 }, { "epoch": 17.776508972267536, "grad_norm": 0.8436793088912964, "learning_rate": 1.8592644862452906e-06, "loss": 0.0288, "num_input_tokens_seen": 235148928, "step": 108970 }, { "epoch": 17.77732463295269, "grad_norm": 1.003833532333374, "learning_rate": 1.857917887439889e-06, "loss": 0.1337, "num_input_tokens_seen": 235159552, "step": 108975 }, { "epoch": 17.778140293637847, "grad_norm": 0.24271944165229797, "learning_rate": 1.856571757638137e-06, "loss": 0.0533, "num_input_tokens_seen": 235170880, "step": 108980 }, { "epoch": 17.778955954323003, "grad_norm": 0.7970309257507324, "learning_rate": 1.8552260968673213e-06, "loss": 0.1088, "num_input_tokens_seen": 235181568, "step": 108985 }, { "epoch": 17.77977161500816, "grad_norm": 0.1313421130180359, "learning_rate": 1.8538809051547091e-06, "loss": 0.0724, "num_input_tokens_seen": 235192704, "step": 108990 }, { "epoch": 17.78058727569331, "grad_norm": 0.4130101203918457, "learning_rate": 1.852536182527556e-06, "loss": 0.0786, "num_input_tokens_seen": 235202528, "step": 108995 }, { "epoch": 17.781402936378466, "grad_norm": 0.13680684566497803, "learning_rate": 1.8511919290131352e-06, "loss": 0.1162, "num_input_tokens_seen": 235211328, "step": 109000 }, { "epoch": 17.782218597063622, "grad_norm": 2.3126158714294434, "learning_rate": 1.8498481446386635e-06, "loss": 0.0953, "num_input_tokens_seen": 235220928, "step": 109005 }, { "epoch": 17.783034257748778, "grad_norm": 0.028591612353920937, "learning_rate": 1.8485048294313966e-06, "loss": 0.0202, "num_input_tokens_seen": 235231808, "step": 109010 }, { "epoch": 17.78384991843393, "grad_norm": 1.7503000497817993, "learning_rate": 1.8471619834185439e-06, "loss": 0.3631, "num_input_tokens_seen": 235242240, "step": 109015 }, { "epoch": 17.784665579119086, "grad_norm": 0.2158859670162201, "learning_rate": 1.8458196066273303e-06, "loss": 0.0298, "num_input_tokens_seen": 235253216, "step": 109020 }, { "epoch": 17.78548123980424, "grad_norm": 0.07550006359815598, "learning_rate": 1.8444776990849483e-06, "loss": 0.0454, "num_input_tokens_seen": 235264352, "step": 109025 }, { "epoch": 17.786296900489397, "grad_norm": 0.08434519916772842, "learning_rate": 1.8431362608186093e-06, "loss": 0.0338, "num_input_tokens_seen": 235274144, "step": 109030 }, { "epoch": 17.787112561174553, "grad_norm": 0.8875634670257568, "learning_rate": 1.8417952918554805e-06, "loss": 0.0327, "num_input_tokens_seen": 235285856, "step": 109035 }, { "epoch": 17.787928221859705, "grad_norm": 0.1644502431154251, "learning_rate": 1.8404547922227567e-06, "loss": 0.0655, "num_input_tokens_seen": 235296928, "step": 109040 }, { "epoch": 17.78874388254486, "grad_norm": 0.1352797895669937, "learning_rate": 1.8391147619475884e-06, "loss": 0.0087, "num_input_tokens_seen": 235307328, "step": 109045 }, { "epoch": 17.789559543230016, "grad_norm": 0.07385022193193436, "learning_rate": 1.8377752010571514e-06, "loss": 0.006, "num_input_tokens_seen": 235317792, "step": 109050 }, { "epoch": 17.790375203915172, "grad_norm": 0.08841104805469513, "learning_rate": 1.8364361095785736e-06, "loss": 0.1293, "num_input_tokens_seen": 235328736, "step": 109055 }, { "epoch": 17.791190864600328, "grad_norm": 0.12737268209457397, "learning_rate": 1.8350974875390087e-06, "loss": 0.0543, "num_input_tokens_seen": 235339520, "step": 109060 }, { "epoch": 17.79200652528548, "grad_norm": 0.47225892543792725, "learning_rate": 1.833759334965579e-06, "loss": 0.0253, "num_input_tokens_seen": 235350912, "step": 109065 }, { "epoch": 17.792822185970635, "grad_norm": 1.7573025226593018, "learning_rate": 1.832421651885405e-06, "loss": 0.2139, "num_input_tokens_seen": 235361632, "step": 109070 }, { "epoch": 17.79363784665579, "grad_norm": 0.4777980148792267, "learning_rate": 1.8310844383255948e-06, "loss": 0.1045, "num_input_tokens_seen": 235373504, "step": 109075 }, { "epoch": 17.794453507340947, "grad_norm": 2.126403570175171, "learning_rate": 1.8297476943132525e-06, "loss": 0.2297, "num_input_tokens_seen": 235384736, "step": 109080 }, { "epoch": 17.795269168026103, "grad_norm": 0.15473590791225433, "learning_rate": 1.8284114198754642e-06, "loss": 0.0616, "num_input_tokens_seen": 235396544, "step": 109085 }, { "epoch": 17.796084828711255, "grad_norm": 0.17576104402542114, "learning_rate": 1.8270756150393142e-06, "loss": 0.045, "num_input_tokens_seen": 235405920, "step": 109090 }, { "epoch": 17.79690048939641, "grad_norm": 0.07079339772462845, "learning_rate": 1.8257402798318751e-06, "loss": 0.0881, "num_input_tokens_seen": 235416032, "step": 109095 }, { "epoch": 17.797716150081566, "grad_norm": 0.03328017517924309, "learning_rate": 1.8244054142802087e-06, "loss": 0.1206, "num_input_tokens_seen": 235427680, "step": 109100 }, { "epoch": 17.798531810766722, "grad_norm": 1.264744758605957, "learning_rate": 1.8230710184113652e-06, "loss": 0.0386, "num_input_tokens_seen": 235438176, "step": 109105 }, { "epoch": 17.799347471451878, "grad_norm": 0.656599223613739, "learning_rate": 1.8217370922523874e-06, "loss": 0.1157, "num_input_tokens_seen": 235449728, "step": 109110 }, { "epoch": 17.80016313213703, "grad_norm": 0.021657871082425117, "learning_rate": 1.8204036358303173e-06, "loss": 0.0616, "num_input_tokens_seen": 235459424, "step": 109115 }, { "epoch": 17.800978792822185, "grad_norm": 2.381364345550537, "learning_rate": 1.8190706491721637e-06, "loss": 0.1116, "num_input_tokens_seen": 235469536, "step": 109120 }, { "epoch": 17.80179445350734, "grad_norm": 0.04726812615990639, "learning_rate": 1.817738132304961e-06, "loss": 0.0118, "num_input_tokens_seen": 235480288, "step": 109125 }, { "epoch": 17.802610114192497, "grad_norm": 0.09201859682798386, "learning_rate": 1.8164060852556951e-06, "loss": 0.1166, "num_input_tokens_seen": 235489984, "step": 109130 }, { "epoch": 17.803425774877653, "grad_norm": 2.2042832374572754, "learning_rate": 1.8150745080513787e-06, "loss": 0.0824, "num_input_tokens_seen": 235501376, "step": 109135 }, { "epoch": 17.804241435562805, "grad_norm": 0.1872035562992096, "learning_rate": 1.8137434007189812e-06, "loss": 0.1709, "num_input_tokens_seen": 235512416, "step": 109140 }, { "epoch": 17.80505709624796, "grad_norm": 0.7075098156929016, "learning_rate": 1.8124127632854955e-06, "loss": 0.0147, "num_input_tokens_seen": 235523712, "step": 109145 }, { "epoch": 17.805872756933116, "grad_norm": 0.0816347748041153, "learning_rate": 1.8110825957778744e-06, "loss": 0.0751, "num_input_tokens_seen": 235535616, "step": 109150 }, { "epoch": 17.806688417618272, "grad_norm": 0.021742958575487137, "learning_rate": 1.8097528982230882e-06, "loss": 0.0094, "num_input_tokens_seen": 235546976, "step": 109155 }, { "epoch": 17.807504078303424, "grad_norm": 0.09205207228660583, "learning_rate": 1.8084236706480683e-06, "loss": 0.2435, "num_input_tokens_seen": 235557792, "step": 109160 }, { "epoch": 17.80831973898858, "grad_norm": 0.04324979707598686, "learning_rate": 1.8070949130797737e-06, "loss": 0.0241, "num_input_tokens_seen": 235568960, "step": 109165 }, { "epoch": 17.809135399673735, "grad_norm": 0.03635334223508835, "learning_rate": 1.8057666255451162e-06, "loss": 0.0194, "num_input_tokens_seen": 235578624, "step": 109170 }, { "epoch": 17.80995106035889, "grad_norm": 0.05219308286905289, "learning_rate": 1.8044388080710267e-06, "loss": 0.1346, "num_input_tokens_seen": 235588064, "step": 109175 }, { "epoch": 17.810766721044047, "grad_norm": 2.9248316287994385, "learning_rate": 1.8031114606844035e-06, "loss": 0.1649, "num_input_tokens_seen": 235599808, "step": 109180 }, { "epoch": 17.8115823817292, "grad_norm": 0.44816264510154724, "learning_rate": 1.8017845834121638e-06, "loss": 0.0285, "num_input_tokens_seen": 235612160, "step": 109185 }, { "epoch": 17.812398042414355, "grad_norm": 0.06815726310014725, "learning_rate": 1.800458176281178e-06, "loss": 0.2202, "num_input_tokens_seen": 235621664, "step": 109190 }, { "epoch": 17.81321370309951, "grad_norm": 0.3483055830001831, "learning_rate": 1.799132239318349e-06, "loss": 0.0408, "num_input_tokens_seen": 235632768, "step": 109195 }, { "epoch": 17.814029363784666, "grad_norm": 0.4210594594478607, "learning_rate": 1.7978067725505282e-06, "loss": 0.1032, "num_input_tokens_seen": 235643808, "step": 109200 }, { "epoch": 17.81484502446982, "grad_norm": 0.09446238726377487, "learning_rate": 1.796481776004591e-06, "loss": 0.0326, "num_input_tokens_seen": 235655808, "step": 109205 }, { "epoch": 17.815660685154974, "grad_norm": 2.08251953125, "learning_rate": 1.7951572497073854e-06, "loss": 0.0462, "num_input_tokens_seen": 235666720, "step": 109210 }, { "epoch": 17.81647634584013, "grad_norm": 1.467826247215271, "learning_rate": 1.7938331936857567e-06, "loss": 0.1643, "num_input_tokens_seen": 235676960, "step": 109215 }, { "epoch": 17.817292006525285, "grad_norm": 0.466099351644516, "learning_rate": 1.792509607966536e-06, "loss": 0.1385, "num_input_tokens_seen": 235687136, "step": 109220 }, { "epoch": 17.81810766721044, "grad_norm": 0.06935319304466248, "learning_rate": 1.7911864925765493e-06, "loss": 0.077, "num_input_tokens_seen": 235698272, "step": 109225 }, { "epoch": 17.818923327895597, "grad_norm": 0.04968485236167908, "learning_rate": 1.7898638475426138e-06, "loss": 0.0987, "num_input_tokens_seen": 235709056, "step": 109230 }, { "epoch": 17.81973898858075, "grad_norm": 0.05278749018907547, "learning_rate": 1.7885416728915277e-06, "loss": 0.219, "num_input_tokens_seen": 235719424, "step": 109235 }, { "epoch": 17.820554649265905, "grad_norm": 0.8741328716278076, "learning_rate": 1.7872199686500919e-06, "loss": 0.033, "num_input_tokens_seen": 235730560, "step": 109240 }, { "epoch": 17.82137030995106, "grad_norm": 0.178876131772995, "learning_rate": 1.78589873484509e-06, "loss": 0.0989, "num_input_tokens_seen": 235741184, "step": 109245 }, { "epoch": 17.822185970636216, "grad_norm": 0.2729074954986572, "learning_rate": 1.7845779715033011e-06, "loss": 0.1781, "num_input_tokens_seen": 235751840, "step": 109250 }, { "epoch": 17.82300163132137, "grad_norm": 0.13482141494750977, "learning_rate": 1.783257678651487e-06, "loss": 0.0137, "num_input_tokens_seen": 235761952, "step": 109255 }, { "epoch": 17.823817292006524, "grad_norm": 1.5847116708755493, "learning_rate": 1.7819378563164152e-06, "loss": 0.1193, "num_input_tokens_seen": 235771072, "step": 109260 }, { "epoch": 17.82463295269168, "grad_norm": 0.6878790855407715, "learning_rate": 1.7806185045248196e-06, "loss": 0.0264, "num_input_tokens_seen": 235782240, "step": 109265 }, { "epoch": 17.825448613376835, "grad_norm": 0.6732045412063599, "learning_rate": 1.7792996233034515e-06, "loss": 0.085, "num_input_tokens_seen": 235791968, "step": 109270 }, { "epoch": 17.82626427406199, "grad_norm": 0.08856552094221115, "learning_rate": 1.777981212679028e-06, "loss": 0.0442, "num_input_tokens_seen": 235802880, "step": 109275 }, { "epoch": 17.827079934747147, "grad_norm": 2.3357081413269043, "learning_rate": 1.7766632726782833e-06, "loss": 0.0718, "num_input_tokens_seen": 235813024, "step": 109280 }, { "epoch": 17.8278955954323, "grad_norm": 0.22592341899871826, "learning_rate": 1.7753458033279102e-06, "loss": 0.0826, "num_input_tokens_seen": 235825088, "step": 109285 }, { "epoch": 17.828711256117455, "grad_norm": 0.28728461265563965, "learning_rate": 1.7740288046546232e-06, "loss": 0.0308, "num_input_tokens_seen": 235835808, "step": 109290 }, { "epoch": 17.82952691680261, "grad_norm": 1.5094857215881348, "learning_rate": 1.7727122766851013e-06, "loss": 0.1858, "num_input_tokens_seen": 235847008, "step": 109295 }, { "epoch": 17.830342577487766, "grad_norm": 0.029030896723270416, "learning_rate": 1.7713962194460366e-06, "loss": 0.1858, "num_input_tokens_seen": 235858336, "step": 109300 }, { "epoch": 17.83115823817292, "grad_norm": 0.07724049687385559, "learning_rate": 1.7700806329640885e-06, "loss": 0.1266, "num_input_tokens_seen": 235868992, "step": 109305 }, { "epoch": 17.831973898858074, "grad_norm": 1.8498353958129883, "learning_rate": 1.7687655172659273e-06, "loss": 0.0936, "num_input_tokens_seen": 235880064, "step": 109310 }, { "epoch": 17.83278955954323, "grad_norm": 0.5362094640731812, "learning_rate": 1.7674508723782069e-06, "loss": 0.1522, "num_input_tokens_seen": 235891392, "step": 109315 }, { "epoch": 17.833605220228385, "grad_norm": 1.9903556108474731, "learning_rate": 1.766136698327564e-06, "loss": 0.1478, "num_input_tokens_seen": 235903200, "step": 109320 }, { "epoch": 17.83442088091354, "grad_norm": 0.16950324177742004, "learning_rate": 1.7648229951406387e-06, "loss": 0.0459, "num_input_tokens_seen": 235913472, "step": 109325 }, { "epoch": 17.835236541598697, "grad_norm": 0.17422355711460114, "learning_rate": 1.7635097628440484e-06, "loss": 0.1133, "num_input_tokens_seen": 235923744, "step": 109330 }, { "epoch": 17.83605220228385, "grad_norm": 0.053579360246658325, "learning_rate": 1.7621970014644107e-06, "loss": 0.0509, "num_input_tokens_seen": 235934944, "step": 109335 }, { "epoch": 17.836867862969005, "grad_norm": 0.2013283669948578, "learning_rate": 1.7608847110283323e-06, "loss": 0.0137, "num_input_tokens_seen": 235946272, "step": 109340 }, { "epoch": 17.83768352365416, "grad_norm": 0.7610201835632324, "learning_rate": 1.7595728915624026e-06, "loss": 0.1073, "num_input_tokens_seen": 235958176, "step": 109345 }, { "epoch": 17.838499184339316, "grad_norm": 0.8312886357307434, "learning_rate": 1.7582615430932148e-06, "loss": 0.1793, "num_input_tokens_seen": 235967776, "step": 109350 }, { "epoch": 17.839314845024468, "grad_norm": 1.7046703100204468, "learning_rate": 1.756950665647339e-06, "loss": 0.0544, "num_input_tokens_seen": 235978176, "step": 109355 }, { "epoch": 17.840130505709624, "grad_norm": 0.04846533387899399, "learning_rate": 1.7556402592513426e-06, "loss": 0.0929, "num_input_tokens_seen": 235989120, "step": 109360 }, { "epoch": 17.84094616639478, "grad_norm": 0.04673517867922783, "learning_rate": 1.7543303239317854e-06, "loss": 0.0632, "num_input_tokens_seen": 236000384, "step": 109365 }, { "epoch": 17.841761827079935, "grad_norm": 0.5638147592544556, "learning_rate": 1.7530208597152125e-06, "loss": 0.0549, "num_input_tokens_seen": 236011200, "step": 109370 }, { "epoch": 17.84257748776509, "grad_norm": 0.06900713592767715, "learning_rate": 1.751711866628164e-06, "loss": 0.1372, "num_input_tokens_seen": 236022912, "step": 109375 }, { "epoch": 17.843393148450243, "grad_norm": 0.07982973009347916, "learning_rate": 1.7504033446971685e-06, "loss": 0.0381, "num_input_tokens_seen": 236033152, "step": 109380 }, { "epoch": 17.8442088091354, "grad_norm": 0.05258166044950485, "learning_rate": 1.749095293948741e-06, "loss": 0.0784, "num_input_tokens_seen": 236043936, "step": 109385 }, { "epoch": 17.845024469820554, "grad_norm": 0.02735855057835579, "learning_rate": 1.7477877144093934e-06, "loss": 0.0518, "num_input_tokens_seen": 236054720, "step": 109390 }, { "epoch": 17.84584013050571, "grad_norm": 2.1778645515441895, "learning_rate": 1.7464806061056245e-06, "loss": 0.2681, "num_input_tokens_seen": 236066688, "step": 109395 }, { "epoch": 17.846655791190866, "grad_norm": 0.34360161423683167, "learning_rate": 1.7451739690639234e-06, "loss": 0.1155, "num_input_tokens_seen": 236077056, "step": 109400 }, { "epoch": 17.847471451876018, "grad_norm": 0.24664919078350067, "learning_rate": 1.7438678033107808e-06, "loss": 0.0755, "num_input_tokens_seen": 236088864, "step": 109405 }, { "epoch": 17.848287112561174, "grad_norm": 0.3891233205795288, "learning_rate": 1.7425621088726502e-06, "loss": 0.0112, "num_input_tokens_seen": 236100032, "step": 109410 }, { "epoch": 17.84910277324633, "grad_norm": 2.633225440979004, "learning_rate": 1.7412568857760102e-06, "loss": 0.182, "num_input_tokens_seen": 236110944, "step": 109415 }, { "epoch": 17.849918433931485, "grad_norm": 0.661566972732544, "learning_rate": 1.7399521340472986e-06, "loss": 0.0905, "num_input_tokens_seen": 236119904, "step": 109420 }, { "epoch": 17.85073409461664, "grad_norm": 0.23001815378665924, "learning_rate": 1.7386478537129686e-06, "loss": 0.1017, "num_input_tokens_seen": 236130944, "step": 109425 }, { "epoch": 17.851549755301793, "grad_norm": 0.3736265301704407, "learning_rate": 1.7373440447994466e-06, "loss": 0.0597, "num_input_tokens_seen": 236141600, "step": 109430 }, { "epoch": 17.85236541598695, "grad_norm": 0.14326924085617065, "learning_rate": 1.7360407073331613e-06, "loss": 0.162, "num_input_tokens_seen": 236152960, "step": 109435 }, { "epoch": 17.853181076672104, "grad_norm": 0.14442822337150574, "learning_rate": 1.7347378413405223e-06, "loss": 0.0703, "num_input_tokens_seen": 236163648, "step": 109440 }, { "epoch": 17.85399673735726, "grad_norm": 0.9700417518615723, "learning_rate": 1.7334354468479336e-06, "loss": 0.0671, "num_input_tokens_seen": 236175200, "step": 109445 }, { "epoch": 17.854812398042416, "grad_norm": 1.1278882026672363, "learning_rate": 1.7321335238817931e-06, "loss": 0.1018, "num_input_tokens_seen": 236184640, "step": 109450 }, { "epoch": 17.855628058727568, "grad_norm": 0.23111306130886078, "learning_rate": 1.7308320724684828e-06, "loss": 0.1351, "num_input_tokens_seen": 236196160, "step": 109455 }, { "epoch": 17.856443719412724, "grad_norm": 1.8985607624053955, "learning_rate": 1.7295310926343816e-06, "loss": 0.1405, "num_input_tokens_seen": 236208128, "step": 109460 }, { "epoch": 17.85725938009788, "grad_norm": 1.6953976154327393, "learning_rate": 1.7282305844058516e-06, "loss": 0.0768, "num_input_tokens_seen": 236218496, "step": 109465 }, { "epoch": 17.858075040783035, "grad_norm": 0.5548827052116394, "learning_rate": 1.7269305478092524e-06, "loss": 0.0955, "num_input_tokens_seen": 236229280, "step": 109470 }, { "epoch": 17.85889070146819, "grad_norm": 1.1730923652648926, "learning_rate": 1.7256309828709294e-06, "loss": 0.0425, "num_input_tokens_seen": 236240384, "step": 109475 }, { "epoch": 17.859706362153343, "grad_norm": 2.6796047687530518, "learning_rate": 1.72433188961722e-06, "loss": 0.1072, "num_input_tokens_seen": 236251744, "step": 109480 }, { "epoch": 17.8605220228385, "grad_norm": 0.04814108833670616, "learning_rate": 1.7230332680744504e-06, "loss": 0.0107, "num_input_tokens_seen": 236264096, "step": 109485 }, { "epoch": 17.861337683523654, "grad_norm": 0.22098033130168915, "learning_rate": 1.721735118268941e-06, "loss": 0.0142, "num_input_tokens_seen": 236275072, "step": 109490 }, { "epoch": 17.86215334420881, "grad_norm": 1.6599540710449219, "learning_rate": 1.7204374402270018e-06, "loss": 0.0637, "num_input_tokens_seen": 236284640, "step": 109495 }, { "epoch": 17.862969004893966, "grad_norm": 0.1296607106924057, "learning_rate": 1.7191402339749308e-06, "loss": 0.0699, "num_input_tokens_seen": 236295712, "step": 109500 }, { "epoch": 17.863784665579118, "grad_norm": 0.1490330845117569, "learning_rate": 1.7178434995390152e-06, "loss": 0.1401, "num_input_tokens_seen": 236306656, "step": 109505 }, { "epoch": 17.864600326264274, "grad_norm": 1.7941268682479858, "learning_rate": 1.7165472369455372e-06, "loss": 0.1361, "num_input_tokens_seen": 236318432, "step": 109510 }, { "epoch": 17.86541598694943, "grad_norm": 0.10315079987049103, "learning_rate": 1.715251446220764e-06, "loss": 0.0309, "num_input_tokens_seen": 236328896, "step": 109515 }, { "epoch": 17.866231647634585, "grad_norm": 0.06489657610654831, "learning_rate": 1.7139561273909616e-06, "loss": 0.3995, "num_input_tokens_seen": 236340512, "step": 109520 }, { "epoch": 17.86704730831974, "grad_norm": 2.185258388519287, "learning_rate": 1.7126612804823805e-06, "loss": 0.2104, "num_input_tokens_seen": 236351328, "step": 109525 }, { "epoch": 17.867862969004893, "grad_norm": 0.07689697295427322, "learning_rate": 1.7113669055212528e-06, "loss": 0.2167, "num_input_tokens_seen": 236361280, "step": 109530 }, { "epoch": 17.86867862969005, "grad_norm": 2.4442644119262695, "learning_rate": 1.7100730025338296e-06, "loss": 0.2935, "num_input_tokens_seen": 236371584, "step": 109535 }, { "epoch": 17.869494290375204, "grad_norm": 2.3852999210357666, "learning_rate": 1.7087795715463123e-06, "loss": 0.213, "num_input_tokens_seen": 236383072, "step": 109540 }, { "epoch": 17.87030995106036, "grad_norm": 0.10381689667701721, "learning_rate": 1.707486612584927e-06, "loss": 0.1505, "num_input_tokens_seen": 236394688, "step": 109545 }, { "epoch": 17.871125611745512, "grad_norm": 0.3435599207878113, "learning_rate": 1.706194125675878e-06, "loss": 0.2076, "num_input_tokens_seen": 236404992, "step": 109550 }, { "epoch": 17.871941272430668, "grad_norm": 1.834383487701416, "learning_rate": 1.7049021108453522e-06, "loss": 0.0592, "num_input_tokens_seen": 236415584, "step": 109555 }, { "epoch": 17.872756933115824, "grad_norm": 0.07875164598226547, "learning_rate": 1.7036105681195374e-06, "loss": 0.0961, "num_input_tokens_seen": 236426144, "step": 109560 }, { "epoch": 17.87357259380098, "grad_norm": 2.2635676860809326, "learning_rate": 1.7023194975246097e-06, "loss": 0.0556, "num_input_tokens_seen": 236437856, "step": 109565 }, { "epoch": 17.874388254486135, "grad_norm": 0.3052263557910919, "learning_rate": 1.7010288990867317e-06, "loss": 0.1438, "num_input_tokens_seen": 236450112, "step": 109570 }, { "epoch": 17.875203915171287, "grad_norm": 2.382222890853882, "learning_rate": 1.6997387728320602e-06, "loss": 0.1224, "num_input_tokens_seen": 236461056, "step": 109575 }, { "epoch": 17.876019575856443, "grad_norm": 0.029505521059036255, "learning_rate": 1.6984491187867407e-06, "loss": 0.0243, "num_input_tokens_seen": 236471104, "step": 109580 }, { "epoch": 17.8768352365416, "grad_norm": 0.593951940536499, "learning_rate": 1.697159936976908e-06, "loss": 0.0608, "num_input_tokens_seen": 236482816, "step": 109585 }, { "epoch": 17.877650897226754, "grad_norm": 1.1455531120300293, "learning_rate": 1.6958712274286942e-06, "loss": 0.2106, "num_input_tokens_seen": 236494336, "step": 109590 }, { "epoch": 17.87846655791191, "grad_norm": 0.7860552668571472, "learning_rate": 1.6945829901682114e-06, "loss": 0.0241, "num_input_tokens_seen": 236506208, "step": 109595 }, { "epoch": 17.879282218597062, "grad_norm": 0.41338831186294556, "learning_rate": 1.6932952252215695e-06, "loss": 0.1053, "num_input_tokens_seen": 236517248, "step": 109600 }, { "epoch": 17.880097879282218, "grad_norm": 3.0117993354797363, "learning_rate": 1.692007932614867e-06, "loss": 0.1441, "num_input_tokens_seen": 236527264, "step": 109605 }, { "epoch": 17.880913539967374, "grad_norm": 0.4191232919692993, "learning_rate": 1.690721112374191e-06, "loss": 0.0977, "num_input_tokens_seen": 236537184, "step": 109610 }, { "epoch": 17.88172920065253, "grad_norm": 0.2739225924015045, "learning_rate": 1.6894347645256214e-06, "loss": 0.1197, "num_input_tokens_seen": 236547616, "step": 109615 }, { "epoch": 17.882544861337685, "grad_norm": 0.0939335897564888, "learning_rate": 1.6881488890952285e-06, "loss": 0.0612, "num_input_tokens_seen": 236559552, "step": 109620 }, { "epoch": 17.883360522022837, "grad_norm": 0.3435448110103607, "learning_rate": 1.686863486109072e-06, "loss": 0.0893, "num_input_tokens_seen": 236569280, "step": 109625 }, { "epoch": 17.884176182707993, "grad_norm": 0.31537529826164246, "learning_rate": 1.6855785555932007e-06, "loss": 0.0281, "num_input_tokens_seen": 236579680, "step": 109630 }, { "epoch": 17.88499184339315, "grad_norm": 0.30121493339538574, "learning_rate": 1.684294097573655e-06, "loss": 0.1406, "num_input_tokens_seen": 236590592, "step": 109635 }, { "epoch": 17.885807504078304, "grad_norm": 0.10634974390268326, "learning_rate": 1.6830101120764747e-06, "loss": 0.1169, "num_input_tokens_seen": 236601728, "step": 109640 }, { "epoch": 17.88662316476346, "grad_norm": 0.03417489305138588, "learning_rate": 1.6817265991276643e-06, "loss": 0.1002, "num_input_tokens_seen": 236612192, "step": 109645 }, { "epoch": 17.887438825448612, "grad_norm": 0.10227351635694504, "learning_rate": 1.680443558753253e-06, "loss": 0.0094, "num_input_tokens_seen": 236623552, "step": 109650 }, { "epoch": 17.888254486133768, "grad_norm": 0.05091627314686775, "learning_rate": 1.6791609909792312e-06, "loss": 0.019, "num_input_tokens_seen": 236634432, "step": 109655 }, { "epoch": 17.889070146818923, "grad_norm": 0.11630929261445999, "learning_rate": 1.6778788958316028e-06, "loss": 0.0608, "num_input_tokens_seen": 236645216, "step": 109660 }, { "epoch": 17.88988580750408, "grad_norm": 0.03174775838851929, "learning_rate": 1.6765972733363362e-06, "loss": 0.0196, "num_input_tokens_seen": 236654848, "step": 109665 }, { "epoch": 17.890701468189235, "grad_norm": 0.40325185656547546, "learning_rate": 1.6753161235194214e-06, "loss": 0.0758, "num_input_tokens_seen": 236666048, "step": 109670 }, { "epoch": 17.891517128874387, "grad_norm": 0.03387507051229477, "learning_rate": 1.6740354464068103e-06, "loss": 0.2058, "num_input_tokens_seen": 236676672, "step": 109675 }, { "epoch": 17.892332789559543, "grad_norm": 3.269239902496338, "learning_rate": 1.6727552420244652e-06, "loss": 0.2004, "num_input_tokens_seen": 236687520, "step": 109680 }, { "epoch": 17.8931484502447, "grad_norm": 0.689290463924408, "learning_rate": 1.6714755103983237e-06, "loss": 0.1736, "num_input_tokens_seen": 236698240, "step": 109685 }, { "epoch": 17.893964110929854, "grad_norm": 0.03466634824872017, "learning_rate": 1.6701962515543263e-06, "loss": 0.0918, "num_input_tokens_seen": 236708928, "step": 109690 }, { "epoch": 17.894779771615006, "grad_norm": 0.061734821647405624, "learning_rate": 1.6689174655183992e-06, "loss": 0.2172, "num_input_tokens_seen": 236718272, "step": 109695 }, { "epoch": 17.895595432300162, "grad_norm": 0.02450910583138466, "learning_rate": 1.6676391523164581e-06, "loss": 0.1361, "num_input_tokens_seen": 236728800, "step": 109700 }, { "epoch": 17.896411092985318, "grad_norm": 0.11965689808130264, "learning_rate": 1.666361311974407e-06, "loss": 0.0667, "num_input_tokens_seen": 236741152, "step": 109705 }, { "epoch": 17.897226753670473, "grad_norm": 0.02586923912167549, "learning_rate": 1.6650839445181476e-06, "loss": 0.1838, "num_input_tokens_seen": 236752960, "step": 109710 }, { "epoch": 17.89804241435563, "grad_norm": 0.2926822602748871, "learning_rate": 1.663807049973562e-06, "loss": 0.125, "num_input_tokens_seen": 236764000, "step": 109715 }, { "epoch": 17.898858075040785, "grad_norm": 0.21751552820205688, "learning_rate": 1.6625306283665293e-06, "loss": 0.2279, "num_input_tokens_seen": 236776192, "step": 109720 }, { "epoch": 17.899673735725937, "grad_norm": 0.030929654836654663, "learning_rate": 1.6612546797229235e-06, "loss": 0.135, "num_input_tokens_seen": 236786336, "step": 109725 }, { "epoch": 17.900489396411093, "grad_norm": 1.4272782802581787, "learning_rate": 1.6599792040685958e-06, "loss": 0.0746, "num_input_tokens_seen": 236797312, "step": 109730 }, { "epoch": 17.90130505709625, "grad_norm": 0.04741082340478897, "learning_rate": 1.658704201429398e-06, "loss": 0.1801, "num_input_tokens_seen": 236808160, "step": 109735 }, { "epoch": 17.902120717781404, "grad_norm": 0.07869613170623779, "learning_rate": 1.6574296718311678e-06, "loss": 0.0813, "num_input_tokens_seen": 236818304, "step": 109740 }, { "epoch": 17.902936378466556, "grad_norm": 1.8500168323516846, "learning_rate": 1.6561556152997454e-06, "loss": 0.0605, "num_input_tokens_seen": 236828352, "step": 109745 }, { "epoch": 17.903752039151712, "grad_norm": 0.07071725279092789, "learning_rate": 1.6548820318609325e-06, "loss": 0.0785, "num_input_tokens_seen": 236839872, "step": 109750 }, { "epoch": 17.904567699836868, "grad_norm": 0.6165253520011902, "learning_rate": 1.6536089215405614e-06, "loss": 0.0195, "num_input_tokens_seen": 236851520, "step": 109755 }, { "epoch": 17.905383360522023, "grad_norm": 0.36347830295562744, "learning_rate": 1.652336284364414e-06, "loss": 0.0278, "num_input_tokens_seen": 236861600, "step": 109760 }, { "epoch": 17.90619902120718, "grad_norm": 2.3168745040893555, "learning_rate": 1.6510641203582977e-06, "loss": 0.0618, "num_input_tokens_seen": 236872320, "step": 109765 }, { "epoch": 17.90701468189233, "grad_norm": 2.4599976539611816, "learning_rate": 1.6497924295479806e-06, "loss": 0.1248, "num_input_tokens_seen": 236883040, "step": 109770 }, { "epoch": 17.907830342577487, "grad_norm": 1.8726435899734497, "learning_rate": 1.6485212119592474e-06, "loss": 0.2141, "num_input_tokens_seen": 236893824, "step": 109775 }, { "epoch": 17.908646003262643, "grad_norm": 0.5664052367210388, "learning_rate": 1.6472504676178503e-06, "loss": 0.1489, "num_input_tokens_seen": 236903424, "step": 109780 }, { "epoch": 17.9094616639478, "grad_norm": 0.043957553803920746, "learning_rate": 1.645980196549554e-06, "loss": 0.0703, "num_input_tokens_seen": 236914048, "step": 109785 }, { "epoch": 17.910277324632954, "grad_norm": 2.185006856918335, "learning_rate": 1.6447103987800861e-06, "loss": 0.1753, "num_input_tokens_seen": 236924512, "step": 109790 }, { "epoch": 17.911092985318106, "grad_norm": 0.1948578953742981, "learning_rate": 1.6434410743352003e-06, "loss": 0.0207, "num_input_tokens_seen": 236936224, "step": 109795 }, { "epoch": 17.911908646003262, "grad_norm": 2.1247756481170654, "learning_rate": 1.642172223240604e-06, "loss": 0.2332, "num_input_tokens_seen": 236947616, "step": 109800 }, { "epoch": 17.912724306688418, "grad_norm": 0.10477355867624283, "learning_rate": 1.6409038455220238e-06, "loss": 0.0083, "num_input_tokens_seen": 236958848, "step": 109805 }, { "epoch": 17.913539967373573, "grad_norm": 1.7016323804855347, "learning_rate": 1.639635941205156e-06, "loss": 0.0407, "num_input_tokens_seen": 236969280, "step": 109810 }, { "epoch": 17.91435562805873, "grad_norm": 1.2410001754760742, "learning_rate": 1.6383685103157048e-06, "loss": 0.1498, "num_input_tokens_seen": 236979968, "step": 109815 }, { "epoch": 17.91517128874388, "grad_norm": 0.09321068227291107, "learning_rate": 1.6371015528793471e-06, "loss": 0.0918, "num_input_tokens_seen": 236991648, "step": 109820 }, { "epoch": 17.915986949429037, "grad_norm": 0.18292862176895142, "learning_rate": 1.6358350689217677e-06, "loss": 0.0322, "num_input_tokens_seen": 237001824, "step": 109825 }, { "epoch": 17.916802610114193, "grad_norm": 0.9680280685424805, "learning_rate": 1.6345690584686296e-06, "loss": 0.191, "num_input_tokens_seen": 237013216, "step": 109830 }, { "epoch": 17.91761827079935, "grad_norm": 0.45320162177085876, "learning_rate": 1.6333035215455899e-06, "loss": 0.0667, "num_input_tokens_seen": 237023104, "step": 109835 }, { "epoch": 17.918433931484504, "grad_norm": 2.353297710418701, "learning_rate": 1.6320384581783004e-06, "loss": 0.1132, "num_input_tokens_seen": 237034208, "step": 109840 }, { "epoch": 17.919249592169656, "grad_norm": 0.13185471296310425, "learning_rate": 1.6307738683923935e-06, "loss": 0.0093, "num_input_tokens_seen": 237044224, "step": 109845 }, { "epoch": 17.920065252854812, "grad_norm": 0.03097481094300747, "learning_rate": 1.6295097522134983e-06, "loss": 0.0818, "num_input_tokens_seen": 237054272, "step": 109850 }, { "epoch": 17.920880913539968, "grad_norm": 0.027799993753433228, "learning_rate": 1.6282461096672392e-06, "loss": 0.0074, "num_input_tokens_seen": 237064992, "step": 109855 }, { "epoch": 17.921696574225123, "grad_norm": 0.13374081254005432, "learning_rate": 1.6269829407792203e-06, "loss": 0.052, "num_input_tokens_seen": 237076864, "step": 109860 }, { "epoch": 17.92251223491028, "grad_norm": 0.3668889105319977, "learning_rate": 1.625720245575041e-06, "loss": 0.0336, "num_input_tokens_seen": 237087360, "step": 109865 }, { "epoch": 17.92332789559543, "grad_norm": 1.2899143695831299, "learning_rate": 1.6244580240802943e-06, "loss": 0.1477, "num_input_tokens_seen": 237098016, "step": 109870 }, { "epoch": 17.924143556280587, "grad_norm": 1.0778812170028687, "learning_rate": 1.6231962763205626e-06, "loss": 0.3042, "num_input_tokens_seen": 237107776, "step": 109875 }, { "epoch": 17.924959216965743, "grad_norm": 0.5702680945396423, "learning_rate": 1.6219350023214092e-06, "loss": 0.1209, "num_input_tokens_seen": 237116736, "step": 109880 }, { "epoch": 17.9257748776509, "grad_norm": 1.8822791576385498, "learning_rate": 1.6206742021083993e-06, "loss": 0.1765, "num_input_tokens_seen": 237127296, "step": 109885 }, { "epoch": 17.92659053833605, "grad_norm": 0.1686325967311859, "learning_rate": 1.619413875707093e-06, "loss": 0.1449, "num_input_tokens_seen": 237137472, "step": 109890 }, { "epoch": 17.927406199021206, "grad_norm": 0.234162375330925, "learning_rate": 1.6181540231430175e-06, "loss": 0.0109, "num_input_tokens_seen": 237148288, "step": 109895 }, { "epoch": 17.928221859706362, "grad_norm": 0.39264097809791565, "learning_rate": 1.6168946444417188e-06, "loss": 0.0411, "num_input_tokens_seen": 237158272, "step": 109900 }, { "epoch": 17.929037520391518, "grad_norm": 0.3058064877986908, "learning_rate": 1.615635739628707e-06, "loss": 0.0261, "num_input_tokens_seen": 237168192, "step": 109905 }, { "epoch": 17.929853181076673, "grad_norm": 0.12223497033119202, "learning_rate": 1.6143773087295116e-06, "loss": 0.12, "num_input_tokens_seen": 237178944, "step": 109910 }, { "epoch": 17.930668841761825, "grad_norm": 0.09667466580867767, "learning_rate": 1.6131193517696153e-06, "loss": 0.0083, "num_input_tokens_seen": 237190432, "step": 109915 }, { "epoch": 17.93148450244698, "grad_norm": 0.8920242786407471, "learning_rate": 1.6118618687745334e-06, "loss": 0.1366, "num_input_tokens_seen": 237202144, "step": 109920 }, { "epoch": 17.932300163132137, "grad_norm": 2.392289638519287, "learning_rate": 1.6106048597697349e-06, "loss": 0.1722, "num_input_tokens_seen": 237213120, "step": 109925 }, { "epoch": 17.933115823817293, "grad_norm": 0.17563150823116302, "learning_rate": 1.6093483247807045e-06, "loss": 0.0581, "num_input_tokens_seen": 237223872, "step": 109930 }, { "epoch": 17.93393148450245, "grad_norm": 0.21969035267829895, "learning_rate": 1.6080922638328999e-06, "loss": 0.2833, "num_input_tokens_seen": 237235360, "step": 109935 }, { "epoch": 17.9347471451876, "grad_norm": 0.8058040142059326, "learning_rate": 1.6068366769517839e-06, "loss": 0.0859, "num_input_tokens_seen": 237246688, "step": 109940 }, { "epoch": 17.935562805872756, "grad_norm": 0.012648827396333218, "learning_rate": 1.6055815641627946e-06, "loss": 0.0347, "num_input_tokens_seen": 237255936, "step": 109945 }, { "epoch": 17.936378466557912, "grad_norm": 0.18696029484272003, "learning_rate": 1.604326925491373e-06, "loss": 0.0372, "num_input_tokens_seen": 237265920, "step": 109950 }, { "epoch": 17.937194127243067, "grad_norm": 0.13608284294605255, "learning_rate": 1.6030727609629482e-06, "loss": 0.017, "num_input_tokens_seen": 237276096, "step": 109955 }, { "epoch": 17.938009787928223, "grad_norm": 1.0525715351104736, "learning_rate": 1.6018190706029363e-06, "loss": 0.0532, "num_input_tokens_seen": 237285824, "step": 109960 }, { "epoch": 17.938825448613375, "grad_norm": 1.4897443056106567, "learning_rate": 1.6005658544367418e-06, "loss": 0.0346, "num_input_tokens_seen": 237298176, "step": 109965 }, { "epoch": 17.93964110929853, "grad_norm": 0.32988473773002625, "learning_rate": 1.5993131124897642e-06, "loss": 0.0544, "num_input_tokens_seen": 237308224, "step": 109970 }, { "epoch": 17.940456769983687, "grad_norm": 0.49398431181907654, "learning_rate": 1.5980608447873913e-06, "loss": 0.0955, "num_input_tokens_seen": 237320160, "step": 109975 }, { "epoch": 17.941272430668842, "grad_norm": 0.46173596382141113, "learning_rate": 1.5968090513550055e-06, "loss": 0.0292, "num_input_tokens_seen": 237330816, "step": 109980 }, { "epoch": 17.942088091353998, "grad_norm": 0.28582680225372314, "learning_rate": 1.5955577322179727e-06, "loss": 0.067, "num_input_tokens_seen": 237343328, "step": 109985 }, { "epoch": 17.94290375203915, "grad_norm": 0.015138600952923298, "learning_rate": 1.594306887401653e-06, "loss": 0.0084, "num_input_tokens_seen": 237355392, "step": 109990 }, { "epoch": 17.943719412724306, "grad_norm": 0.7836618423461914, "learning_rate": 1.593056516931396e-06, "loss": 0.1748, "num_input_tokens_seen": 237366528, "step": 109995 }, { "epoch": 17.94453507340946, "grad_norm": 1.4477994441986084, "learning_rate": 1.5918066208325422e-06, "loss": 0.0425, "num_input_tokens_seen": 237377184, "step": 110000 }, { "epoch": 17.945350734094617, "grad_norm": 0.052689243108034134, "learning_rate": 1.5905571991304214e-06, "loss": 0.1044, "num_input_tokens_seen": 237387424, "step": 110005 }, { "epoch": 17.946166394779773, "grad_norm": 1.5769648551940918, "learning_rate": 1.5893082518503577e-06, "loss": 0.1572, "num_input_tokens_seen": 237397856, "step": 110010 }, { "epoch": 17.946982055464925, "grad_norm": 0.960979163646698, "learning_rate": 1.5880597790176615e-06, "loss": 0.0275, "num_input_tokens_seen": 237409184, "step": 110015 }, { "epoch": 17.94779771615008, "grad_norm": 0.1107606366276741, "learning_rate": 1.5868117806576293e-06, "loss": 0.0092, "num_input_tokens_seen": 237419488, "step": 110020 }, { "epoch": 17.948613376835237, "grad_norm": 0.13711689412593842, "learning_rate": 1.5855642567955658e-06, "loss": 0.0194, "num_input_tokens_seen": 237429536, "step": 110025 }, { "epoch": 17.949429037520392, "grad_norm": 0.17512190341949463, "learning_rate": 1.584317207456737e-06, "loss": 0.0671, "num_input_tokens_seen": 237440032, "step": 110030 }, { "epoch": 17.950244698205548, "grad_norm": 0.03824919834733009, "learning_rate": 1.5830706326664363e-06, "loss": 0.0238, "num_input_tokens_seen": 237452256, "step": 110035 }, { "epoch": 17.9510603588907, "grad_norm": 0.28776058554649353, "learning_rate": 1.5818245324499048e-06, "loss": 0.0204, "num_input_tokens_seen": 237462816, "step": 110040 }, { "epoch": 17.951876019575856, "grad_norm": 0.22570586204528809, "learning_rate": 1.5805789068324166e-06, "loss": 0.0451, "num_input_tokens_seen": 237474304, "step": 110045 }, { "epoch": 17.95269168026101, "grad_norm": 0.04938310757279396, "learning_rate": 1.5793337558391962e-06, "loss": 0.0426, "num_input_tokens_seen": 237483840, "step": 110050 }, { "epoch": 17.953507340946167, "grad_norm": 2.1316707134246826, "learning_rate": 1.5780890794954955e-06, "loss": 0.1335, "num_input_tokens_seen": 237495296, "step": 110055 }, { "epoch": 17.954323001631323, "grad_norm": 0.035386618226766586, "learning_rate": 1.5768448778265304e-06, "loss": 0.1709, "num_input_tokens_seen": 237506592, "step": 110060 }, { "epoch": 17.955138662316475, "grad_norm": 0.04255133122205734, "learning_rate": 1.5756011508575168e-06, "loss": 0.0869, "num_input_tokens_seen": 237518272, "step": 110065 }, { "epoch": 17.95595432300163, "grad_norm": 0.5450884699821472, "learning_rate": 1.5743578986136653e-06, "loss": 0.0663, "num_input_tokens_seen": 237529632, "step": 110070 }, { "epoch": 17.956769983686787, "grad_norm": 0.23014488816261292, "learning_rate": 1.5731151211201666e-06, "loss": 0.0133, "num_input_tokens_seen": 237540416, "step": 110075 }, { "epoch": 17.957585644371942, "grad_norm": 0.02031717449426651, "learning_rate": 1.5718728184022092e-06, "loss": 0.0585, "num_input_tokens_seen": 237551680, "step": 110080 }, { "epoch": 17.958401305057095, "grad_norm": 0.09831298887729645, "learning_rate": 1.5706309904849698e-06, "loss": 0.0773, "num_input_tokens_seen": 237562048, "step": 110085 }, { "epoch": 17.95921696574225, "grad_norm": 0.4194542169570923, "learning_rate": 1.5693896373936174e-06, "loss": 0.0542, "num_input_tokens_seen": 237571904, "step": 110090 }, { "epoch": 17.960032626427406, "grad_norm": 3.7686407566070557, "learning_rate": 1.5681487591533066e-06, "loss": 0.3337, "num_input_tokens_seen": 237581952, "step": 110095 }, { "epoch": 17.96084828711256, "grad_norm": 0.6199070811271667, "learning_rate": 1.5669083557891845e-06, "loss": 0.016, "num_input_tokens_seen": 237593696, "step": 110100 }, { "epoch": 17.961663947797717, "grad_norm": 2.102903366088867, "learning_rate": 1.5656684273263944e-06, "loss": 0.1947, "num_input_tokens_seen": 237605504, "step": 110105 }, { "epoch": 17.96247960848287, "grad_norm": 0.16225162148475647, "learning_rate": 1.5644289737900608e-06, "loss": 0.0386, "num_input_tokens_seen": 237616544, "step": 110110 }, { "epoch": 17.963295269168025, "grad_norm": 0.7507567405700684, "learning_rate": 1.5631899952053053e-06, "loss": 0.0749, "num_input_tokens_seen": 237627328, "step": 110115 }, { "epoch": 17.96411092985318, "grad_norm": 0.024966662749648094, "learning_rate": 1.5619514915972355e-06, "loss": 0.0855, "num_input_tokens_seen": 237637696, "step": 110120 }, { "epoch": 17.964926590538337, "grad_norm": 3.507458209991455, "learning_rate": 1.560713462990951e-06, "loss": 0.1221, "num_input_tokens_seen": 237648736, "step": 110125 }, { "epoch": 17.965742251223492, "grad_norm": 1.5352263450622559, "learning_rate": 1.5594759094115452e-06, "loss": 0.1868, "num_input_tokens_seen": 237659776, "step": 110130 }, { "epoch": 17.966557911908644, "grad_norm": 0.12000539153814316, "learning_rate": 1.5582388308840955e-06, "loss": 0.0185, "num_input_tokens_seen": 237671328, "step": 110135 }, { "epoch": 17.9673735725938, "grad_norm": 1.4387913942337036, "learning_rate": 1.5570022274336737e-06, "loss": 0.1048, "num_input_tokens_seen": 237681568, "step": 110140 }, { "epoch": 17.968189233278956, "grad_norm": 0.13492682576179504, "learning_rate": 1.55576609908534e-06, "loss": 0.0851, "num_input_tokens_seen": 237691360, "step": 110145 }, { "epoch": 17.96900489396411, "grad_norm": 3.3964130878448486, "learning_rate": 1.5545304458641497e-06, "loss": 0.1273, "num_input_tokens_seen": 237701952, "step": 110150 }, { "epoch": 17.969820554649267, "grad_norm": 0.9304819703102112, "learning_rate": 1.5532952677951434e-06, "loss": 0.1786, "num_input_tokens_seen": 237712480, "step": 110155 }, { "epoch": 17.97063621533442, "grad_norm": 0.02694062329828739, "learning_rate": 1.5520605649033488e-06, "loss": 0.0125, "num_input_tokens_seen": 237723680, "step": 110160 }, { "epoch": 17.971451876019575, "grad_norm": 2.135138750076294, "learning_rate": 1.5508263372137926e-06, "loss": 0.1205, "num_input_tokens_seen": 237732832, "step": 110165 }, { "epoch": 17.97226753670473, "grad_norm": 1.9245635271072388, "learning_rate": 1.5495925847514914e-06, "loss": 0.1572, "num_input_tokens_seen": 237744000, "step": 110170 }, { "epoch": 17.973083197389887, "grad_norm": 0.48140403628349304, "learning_rate": 1.5483593075414444e-06, "loss": 0.0612, "num_input_tokens_seen": 237754784, "step": 110175 }, { "epoch": 17.973898858075042, "grad_norm": 0.7331414222717285, "learning_rate": 1.5471265056086454e-06, "loss": 0.0834, "num_input_tokens_seen": 237764896, "step": 110180 }, { "epoch": 17.974714518760194, "grad_norm": 0.39644792675971985, "learning_rate": 1.5458941789780801e-06, "loss": 0.0687, "num_input_tokens_seen": 237774560, "step": 110185 }, { "epoch": 17.97553017944535, "grad_norm": 0.3620031476020813, "learning_rate": 1.5446623276747257e-06, "loss": 0.0391, "num_input_tokens_seen": 237784256, "step": 110190 }, { "epoch": 17.976345840130506, "grad_norm": 0.4328448176383972, "learning_rate": 1.54343095172354e-06, "loss": 0.0395, "num_input_tokens_seen": 237796096, "step": 110195 }, { "epoch": 17.97716150081566, "grad_norm": 0.07913006842136383, "learning_rate": 1.5422000511494866e-06, "loss": 0.0978, "num_input_tokens_seen": 237807776, "step": 110200 }, { "epoch": 17.977977161500817, "grad_norm": 2.050680637359619, "learning_rate": 1.5409696259775063e-06, "loss": 0.0949, "num_input_tokens_seen": 237817568, "step": 110205 }, { "epoch": 17.97879282218597, "grad_norm": 3.107590675354004, "learning_rate": 1.5397396762325378e-06, "loss": 0.0968, "num_input_tokens_seen": 237827744, "step": 110210 }, { "epoch": 17.979608482871125, "grad_norm": 0.051992107182741165, "learning_rate": 1.5385102019395025e-06, "loss": 0.0557, "num_input_tokens_seen": 237839136, "step": 110215 }, { "epoch": 17.98042414355628, "grad_norm": 0.6951407790184021, "learning_rate": 1.5372812031233252e-06, "loss": 0.1332, "num_input_tokens_seen": 237850048, "step": 110220 }, { "epoch": 17.981239804241437, "grad_norm": 0.1278333067893982, "learning_rate": 1.5360526798089054e-06, "loss": 0.2384, "num_input_tokens_seen": 237860640, "step": 110225 }, { "epoch": 17.982055464926592, "grad_norm": 0.2611331343650818, "learning_rate": 1.534824632021148e-06, "loss": 0.0297, "num_input_tokens_seen": 237872096, "step": 110230 }, { "epoch": 17.982871125611744, "grad_norm": 1.4179797172546387, "learning_rate": 1.533597059784933e-06, "loss": 0.0663, "num_input_tokens_seen": 237883456, "step": 110235 }, { "epoch": 17.9836867862969, "grad_norm": 0.39058932662010193, "learning_rate": 1.5323699631251465e-06, "loss": 0.0728, "num_input_tokens_seen": 237893600, "step": 110240 }, { "epoch": 17.984502446982056, "grad_norm": 0.14623825252056122, "learning_rate": 1.5311433420666515e-06, "loss": 0.123, "num_input_tokens_seen": 237903936, "step": 110245 }, { "epoch": 17.98531810766721, "grad_norm": 0.08642543852329254, "learning_rate": 1.529917196634309e-06, "loss": 0.1164, "num_input_tokens_seen": 237914304, "step": 110250 }, { "epoch": 17.986133768352367, "grad_norm": 0.05589890480041504, "learning_rate": 1.5286915268529684e-06, "loss": 0.0205, "num_input_tokens_seen": 237923712, "step": 110255 }, { "epoch": 17.98694942903752, "grad_norm": 2.5016191005706787, "learning_rate": 1.527466332747468e-06, "loss": 0.1958, "num_input_tokens_seen": 237933856, "step": 110260 }, { "epoch": 17.987765089722675, "grad_norm": 0.8331568241119385, "learning_rate": 1.5262416143426412e-06, "loss": 0.1896, "num_input_tokens_seen": 237944160, "step": 110265 }, { "epoch": 17.98858075040783, "grad_norm": 0.08936797827482224, "learning_rate": 1.5250173716633064e-06, "loss": 0.0682, "num_input_tokens_seen": 237955936, "step": 110270 }, { "epoch": 17.989396411092986, "grad_norm": 0.24127967655658722, "learning_rate": 1.5237936047342721e-06, "loss": 0.2084, "num_input_tokens_seen": 237967008, "step": 110275 }, { "epoch": 17.99021207177814, "grad_norm": 0.5002022385597229, "learning_rate": 1.5225703135803487e-06, "loss": 0.0106, "num_input_tokens_seen": 237975840, "step": 110280 }, { "epoch": 17.991027732463294, "grad_norm": 0.4023144245147705, "learning_rate": 1.5213474982263137e-06, "loss": 0.0353, "num_input_tokens_seen": 237986784, "step": 110285 }, { "epoch": 17.99184339314845, "grad_norm": 0.6775419116020203, "learning_rate": 1.520125158696964e-06, "loss": 0.0756, "num_input_tokens_seen": 237998528, "step": 110290 }, { "epoch": 17.992659053833606, "grad_norm": 0.3619616627693176, "learning_rate": 1.5189032950170573e-06, "loss": 0.038, "num_input_tokens_seen": 238008576, "step": 110295 }, { "epoch": 17.99347471451876, "grad_norm": 0.5920923352241516, "learning_rate": 1.5176819072113684e-06, "loss": 0.0726, "num_input_tokens_seen": 238019200, "step": 110300 }, { "epoch": 17.994290375203914, "grad_norm": 0.32753247022628784, "learning_rate": 1.5164609953046389e-06, "loss": 0.0452, "num_input_tokens_seen": 238030336, "step": 110305 }, { "epoch": 17.99510603588907, "grad_norm": 0.24744877219200134, "learning_rate": 1.5152405593216235e-06, "loss": 0.164, "num_input_tokens_seen": 238040000, "step": 110310 }, { "epoch": 17.995921696574225, "grad_norm": 0.03658754751086235, "learning_rate": 1.5140205992870498e-06, "loss": 0.0307, "num_input_tokens_seen": 238050880, "step": 110315 }, { "epoch": 17.99673735725938, "grad_norm": 0.14467334747314453, "learning_rate": 1.5128011152256427e-06, "loss": 0.0142, "num_input_tokens_seen": 238060768, "step": 110320 }, { "epoch": 17.997553017944536, "grad_norm": 0.03376198559999466, "learning_rate": 1.5115821071621156e-06, "loss": 0.0318, "num_input_tokens_seen": 238071296, "step": 110325 }, { "epoch": 17.99836867862969, "grad_norm": 0.3043508231639862, "learning_rate": 1.5103635751211764e-06, "loss": 0.0868, "num_input_tokens_seen": 238082464, "step": 110330 }, { "epoch": 17.999184339314844, "grad_norm": 0.07751834392547607, "learning_rate": 1.5091455191275167e-06, "loss": 0.1057, "num_input_tokens_seen": 238092288, "step": 110335 }, { "epoch": 18.0, "grad_norm": 0.4649839401245117, "learning_rate": 1.507927939205822e-06, "loss": 0.0268, "num_input_tokens_seen": 238102672, "step": 110340 }, { "epoch": 18.0, "eval_loss": 0.14419381320476532, "eval_runtime": 90.992, "eval_samples_per_second": 29.948, "eval_steps_per_second": 7.495, "num_input_tokens_seen": 238102672, "step": 110340 }, { "epoch": 18.000815660685156, "grad_norm": 0.8537637591362, "learning_rate": 1.5067108353807702e-06, "loss": 0.1283, "num_input_tokens_seen": 238112176, "step": 110345 }, { "epoch": 18.00163132137031, "grad_norm": 1.3924354314804077, "learning_rate": 1.5054942076770274e-06, "loss": 0.1606, "num_input_tokens_seen": 238123920, "step": 110350 }, { "epoch": 18.002446982055464, "grad_norm": 0.028755588456988335, "learning_rate": 1.5042780561192489e-06, "loss": 0.0464, "num_input_tokens_seen": 238134992, "step": 110355 }, { "epoch": 18.00326264274062, "grad_norm": 0.033078089356422424, "learning_rate": 1.5030623807320842e-06, "loss": 0.0888, "num_input_tokens_seen": 238145968, "step": 110360 }, { "epoch": 18.004078303425775, "grad_norm": 0.8748357892036438, "learning_rate": 1.501847181540164e-06, "loss": 0.0718, "num_input_tokens_seen": 238156848, "step": 110365 }, { "epoch": 18.00489396411093, "grad_norm": 0.5347970128059387, "learning_rate": 1.5006324585681241e-06, "loss": 0.0628, "num_input_tokens_seen": 238168272, "step": 110370 }, { "epoch": 18.005709624796086, "grad_norm": 1.9856178760528564, "learning_rate": 1.499418211840578e-06, "loss": 0.3104, "num_input_tokens_seen": 238178736, "step": 110375 }, { "epoch": 18.00652528548124, "grad_norm": 0.9292968511581421, "learning_rate": 1.4982044413821284e-06, "loss": 0.0556, "num_input_tokens_seen": 238190256, "step": 110380 }, { "epoch": 18.007340946166394, "grad_norm": 0.3603101372718811, "learning_rate": 1.4969911472173887e-06, "loss": 0.0572, "num_input_tokens_seen": 238200432, "step": 110385 }, { "epoch": 18.00815660685155, "grad_norm": 0.05305984988808632, "learning_rate": 1.495778329370931e-06, "loss": 0.01, "num_input_tokens_seen": 238210576, "step": 110390 }, { "epoch": 18.008972267536706, "grad_norm": 0.8539916276931763, "learning_rate": 1.4945659878673524e-06, "loss": 0.146, "num_input_tokens_seen": 238221200, "step": 110395 }, { "epoch": 18.00978792822186, "grad_norm": 0.05800618603825569, "learning_rate": 1.4933541227312027e-06, "loss": 0.0116, "num_input_tokens_seen": 238232208, "step": 110400 }, { "epoch": 18.010603588907014, "grad_norm": 0.6884459853172302, "learning_rate": 1.4921427339870619e-06, "loss": 0.0498, "num_input_tokens_seen": 238243632, "step": 110405 }, { "epoch": 18.01141924959217, "grad_norm": 0.5156652927398682, "learning_rate": 1.4909318216594608e-06, "loss": 0.0155, "num_input_tokens_seen": 238254192, "step": 110410 }, { "epoch": 18.012234910277325, "grad_norm": 2.3306803703308105, "learning_rate": 1.4897213857729575e-06, "loss": 0.1928, "num_input_tokens_seen": 238265392, "step": 110415 }, { "epoch": 18.01305057096248, "grad_norm": 0.3530412018299103, "learning_rate": 1.4885114263520684e-06, "loss": 0.0786, "num_input_tokens_seen": 238275376, "step": 110420 }, { "epoch": 18.013866231647636, "grad_norm": 0.05106399580836296, "learning_rate": 1.4873019434213293e-06, "loss": 0.0089, "num_input_tokens_seen": 238286640, "step": 110425 }, { "epoch": 18.01468189233279, "grad_norm": 0.0880780890583992, "learning_rate": 1.4860929370052374e-06, "loss": 0.0834, "num_input_tokens_seen": 238297360, "step": 110430 }, { "epoch": 18.015497553017944, "grad_norm": 0.03160789608955383, "learning_rate": 1.4848844071283063e-06, "loss": 0.2089, "num_input_tokens_seen": 238308240, "step": 110435 }, { "epoch": 18.0163132137031, "grad_norm": 0.20504599809646606, "learning_rate": 1.4836763538150195e-06, "loss": 0.0428, "num_input_tokens_seen": 238318128, "step": 110440 }, { "epoch": 18.017128874388256, "grad_norm": 0.12931662797927856, "learning_rate": 1.482468777089871e-06, "loss": 0.0085, "num_input_tokens_seen": 238330512, "step": 110445 }, { "epoch": 18.017944535073408, "grad_norm": 1.5984572172164917, "learning_rate": 1.4812616769773164e-06, "loss": 0.0567, "num_input_tokens_seen": 238339792, "step": 110450 }, { "epoch": 18.018760195758563, "grad_norm": 2.2653586864471436, "learning_rate": 1.480055053501836e-06, "loss": 0.0675, "num_input_tokens_seen": 238349904, "step": 110455 }, { "epoch": 18.01957585644372, "grad_norm": 0.7012912631034851, "learning_rate": 1.4788489066878768e-06, "loss": 0.0273, "num_input_tokens_seen": 238359600, "step": 110460 }, { "epoch": 18.020391517128875, "grad_norm": 1.5509637594223022, "learning_rate": 1.4776432365598837e-06, "loss": 0.1319, "num_input_tokens_seen": 238370768, "step": 110465 }, { "epoch": 18.02120717781403, "grad_norm": 0.057095061987638474, "learning_rate": 1.4764380431422893e-06, "loss": 0.0487, "num_input_tokens_seen": 238380208, "step": 110470 }, { "epoch": 18.022022838499183, "grad_norm": 1.012843370437622, "learning_rate": 1.4752333264595214e-06, "loss": 0.2959, "num_input_tokens_seen": 238391280, "step": 110475 }, { "epoch": 18.02283849918434, "grad_norm": 0.24071981012821198, "learning_rate": 1.4740290865359913e-06, "loss": 0.1011, "num_input_tokens_seen": 238402192, "step": 110480 }, { "epoch": 18.023654159869494, "grad_norm": 0.8782744407653809, "learning_rate": 1.472825323396107e-06, "loss": 0.0177, "num_input_tokens_seen": 238413200, "step": 110485 }, { "epoch": 18.02446982055465, "grad_norm": 1.1226969957351685, "learning_rate": 1.471622037064266e-06, "loss": 0.0603, "num_input_tokens_seen": 238424816, "step": 110490 }, { "epoch": 18.025285481239806, "grad_norm": 1.716549038887024, "learning_rate": 1.4704192275648483e-06, "loss": 0.1394, "num_input_tokens_seen": 238433808, "step": 110495 }, { "epoch": 18.026101141924958, "grad_norm": 1.3784891366958618, "learning_rate": 1.4692168949222373e-06, "loss": 0.0859, "num_input_tokens_seen": 238445392, "step": 110500 }, { "epoch": 18.026916802610113, "grad_norm": 0.02275237627327442, "learning_rate": 1.4680150391607916e-06, "loss": 0.0292, "num_input_tokens_seen": 238456688, "step": 110505 }, { "epoch": 18.02773246329527, "grad_norm": 0.28320765495300293, "learning_rate": 1.4668136603048832e-06, "loss": 0.0467, "num_input_tokens_seen": 238468272, "step": 110510 }, { "epoch": 18.028548123980425, "grad_norm": 0.0933409035205841, "learning_rate": 1.4656127583788426e-06, "loss": 0.0892, "num_input_tokens_seen": 238479280, "step": 110515 }, { "epoch": 18.02936378466558, "grad_norm": 0.8117561340332031, "learning_rate": 1.4644123334070197e-06, "loss": 0.0193, "num_input_tokens_seen": 238490640, "step": 110520 }, { "epoch": 18.030179445350733, "grad_norm": 3.08648943901062, "learning_rate": 1.4632123854137315e-06, "loss": 0.1832, "num_input_tokens_seen": 238501680, "step": 110525 }, { "epoch": 18.03099510603589, "grad_norm": 0.10187837481498718, "learning_rate": 1.4620129144233108e-06, "loss": 0.0116, "num_input_tokens_seen": 238513264, "step": 110530 }, { "epoch": 18.031810766721044, "grad_norm": 0.25277814269065857, "learning_rate": 1.4608139204600496e-06, "loss": 0.0168, "num_input_tokens_seen": 238523312, "step": 110535 }, { "epoch": 18.0326264274062, "grad_norm": 1.3005188703536987, "learning_rate": 1.4596154035482646e-06, "loss": 0.1265, "num_input_tokens_seen": 238534992, "step": 110540 }, { "epoch": 18.033442088091356, "grad_norm": 1.0550968647003174, "learning_rate": 1.4584173637122307e-06, "loss": 0.076, "num_input_tokens_seen": 238546256, "step": 110545 }, { "epoch": 18.034257748776508, "grad_norm": 1.323224663734436, "learning_rate": 1.4572198009762394e-06, "loss": 0.1214, "num_input_tokens_seen": 238556656, "step": 110550 }, { "epoch": 18.035073409461663, "grad_norm": 1.3024487495422363, "learning_rate": 1.4560227153645467e-06, "loss": 0.2187, "num_input_tokens_seen": 238567216, "step": 110555 }, { "epoch": 18.03588907014682, "grad_norm": 0.18592368066310883, "learning_rate": 1.4548261069014301e-06, "loss": 0.1342, "num_input_tokens_seen": 238576816, "step": 110560 }, { "epoch": 18.036704730831975, "grad_norm": 2.2905938625335693, "learning_rate": 1.4536299756111232e-06, "loss": 0.1531, "num_input_tokens_seen": 238587440, "step": 110565 }, { "epoch": 18.03752039151713, "grad_norm": 2.263345241546631, "learning_rate": 1.4524343215178844e-06, "loss": 0.1537, "num_input_tokens_seen": 238598480, "step": 110570 }, { "epoch": 18.038336052202283, "grad_norm": 2.1218135356903076, "learning_rate": 1.4512391446459273e-06, "loss": 0.1841, "num_input_tokens_seen": 238609488, "step": 110575 }, { "epoch": 18.03915171288744, "grad_norm": 1.2692228555679321, "learning_rate": 1.4500444450194884e-06, "loss": 0.0314, "num_input_tokens_seen": 238620496, "step": 110580 }, { "epoch": 18.039967373572594, "grad_norm": 1.5246140956878662, "learning_rate": 1.4488502226627703e-06, "loss": 0.1274, "num_input_tokens_seen": 238630608, "step": 110585 }, { "epoch": 18.04078303425775, "grad_norm": 2.092233896255493, "learning_rate": 1.4476564775999817e-06, "loss": 0.0809, "num_input_tokens_seen": 238640784, "step": 110590 }, { "epoch": 18.041598694942905, "grad_norm": 2.1238176822662354, "learning_rate": 1.4464632098553115e-06, "loss": 0.1096, "num_input_tokens_seen": 238651440, "step": 110595 }, { "epoch": 18.042414355628058, "grad_norm": 1.3948640823364258, "learning_rate": 1.445270419452946e-06, "loss": 0.0917, "num_input_tokens_seen": 238662864, "step": 110600 }, { "epoch": 18.043230016313213, "grad_norm": 0.37835007905960083, "learning_rate": 1.4440781064170545e-06, "loss": 0.0577, "num_input_tokens_seen": 238672240, "step": 110605 }, { "epoch": 18.04404567699837, "grad_norm": 0.22919733822345734, "learning_rate": 1.4428862707718066e-06, "loss": 0.0394, "num_input_tokens_seen": 238682608, "step": 110610 }, { "epoch": 18.044861337683525, "grad_norm": 0.6127130389213562, "learning_rate": 1.4416949125413498e-06, "loss": 0.0889, "num_input_tokens_seen": 238693072, "step": 110615 }, { "epoch": 18.045676998368677, "grad_norm": 0.06572777032852173, "learning_rate": 1.4405040317498314e-06, "loss": 0.0633, "num_input_tokens_seen": 238704240, "step": 110620 }, { "epoch": 18.046492659053833, "grad_norm": 0.6698492765426636, "learning_rate": 1.4393136284213877e-06, "loss": 0.1537, "num_input_tokens_seen": 238716112, "step": 110625 }, { "epoch": 18.04730831973899, "grad_norm": 1.981872320175171, "learning_rate": 1.4381237025801408e-06, "loss": 0.0369, "num_input_tokens_seen": 238726096, "step": 110630 }, { "epoch": 18.048123980424144, "grad_norm": 3.341243267059326, "learning_rate": 1.436934254250208e-06, "loss": 0.1931, "num_input_tokens_seen": 238737744, "step": 110635 }, { "epoch": 18.0489396411093, "grad_norm": 0.04407093673944473, "learning_rate": 1.4357452834556945e-06, "loss": 0.0595, "num_input_tokens_seen": 238748560, "step": 110640 }, { "epoch": 18.049755301794452, "grad_norm": 0.5714318752288818, "learning_rate": 1.4345567902206952e-06, "loss": 0.0755, "num_input_tokens_seen": 238759760, "step": 110645 }, { "epoch": 18.050570962479608, "grad_norm": 1.7124931812286377, "learning_rate": 1.4333687745692965e-06, "loss": 0.0655, "num_input_tokens_seen": 238770928, "step": 110650 }, { "epoch": 18.051386623164763, "grad_norm": 0.23848426342010498, "learning_rate": 1.4321812365255815e-06, "loss": 0.1225, "num_input_tokens_seen": 238781392, "step": 110655 }, { "epoch": 18.05220228384992, "grad_norm": 1.797158122062683, "learning_rate": 1.4309941761136036e-06, "loss": 0.1538, "num_input_tokens_seen": 238790448, "step": 110660 }, { "epoch": 18.053017944535075, "grad_norm": 1.9665470123291016, "learning_rate": 1.4298075933574352e-06, "loss": 0.0745, "num_input_tokens_seen": 238800624, "step": 110665 }, { "epoch": 18.053833605220227, "grad_norm": 1.9530683755874634, "learning_rate": 1.4286214882811122e-06, "loss": 0.0967, "num_input_tokens_seen": 238811760, "step": 110670 }, { "epoch": 18.054649265905383, "grad_norm": 0.30551567673683167, "learning_rate": 1.4274358609086825e-06, "loss": 0.0681, "num_input_tokens_seen": 238822448, "step": 110675 }, { "epoch": 18.05546492659054, "grad_norm": 1.419770359992981, "learning_rate": 1.42625071126416e-06, "loss": 0.0558, "num_input_tokens_seen": 238833392, "step": 110680 }, { "epoch": 18.056280587275694, "grad_norm": 2.784973621368408, "learning_rate": 1.4250660393715837e-06, "loss": 0.3373, "num_input_tokens_seen": 238843568, "step": 110685 }, { "epoch": 18.05709624796085, "grad_norm": 0.03312661498785019, "learning_rate": 1.4238818452549401e-06, "loss": 0.0845, "num_input_tokens_seen": 238854928, "step": 110690 }, { "epoch": 18.057911908646002, "grad_norm": 0.10987743735313416, "learning_rate": 1.4226981289382435e-06, "loss": 0.0151, "num_input_tokens_seen": 238865136, "step": 110695 }, { "epoch": 18.058727569331158, "grad_norm": 0.9318199753761292, "learning_rate": 1.4215148904454801e-06, "loss": 0.0614, "num_input_tokens_seen": 238875824, "step": 110700 }, { "epoch": 18.059543230016313, "grad_norm": 2.2056827545166016, "learning_rate": 1.4203321298006305e-06, "loss": 0.1166, "num_input_tokens_seen": 238885904, "step": 110705 }, { "epoch": 18.06035889070147, "grad_norm": 1.0565030574798584, "learning_rate": 1.4191498470276622e-06, "loss": 0.1087, "num_input_tokens_seen": 238896208, "step": 110710 }, { "epoch": 18.061174551386625, "grad_norm": 3.1843361854553223, "learning_rate": 1.4179680421505359e-06, "loss": 0.0654, "num_input_tokens_seen": 238906608, "step": 110715 }, { "epoch": 18.061990212071777, "grad_norm": 1.4155347347259521, "learning_rate": 1.4167867151932024e-06, "loss": 0.0469, "num_input_tokens_seen": 238917840, "step": 110720 }, { "epoch": 18.062805872756933, "grad_norm": 0.6001515984535217, "learning_rate": 1.4156058661796063e-06, "loss": 0.0353, "num_input_tokens_seen": 238927376, "step": 110725 }, { "epoch": 18.063621533442088, "grad_norm": 2.039379835128784, "learning_rate": 1.4144254951336728e-06, "loss": 0.1789, "num_input_tokens_seen": 238937680, "step": 110730 }, { "epoch": 18.064437194127244, "grad_norm": 0.8009387850761414, "learning_rate": 1.41324560207933e-06, "loss": 0.0969, "num_input_tokens_seen": 238949168, "step": 110735 }, { "epoch": 18.0652528548124, "grad_norm": 0.2831651270389557, "learning_rate": 1.4120661870404838e-06, "loss": 0.1092, "num_input_tokens_seen": 238959728, "step": 110740 }, { "epoch": 18.06606851549755, "grad_norm": 1.2271249294281006, "learning_rate": 1.410887250041043e-06, "loss": 0.1618, "num_input_tokens_seen": 238969904, "step": 110745 }, { "epoch": 18.066884176182707, "grad_norm": 0.7847725749015808, "learning_rate": 1.4097087911048939e-06, "loss": 0.0305, "num_input_tokens_seen": 238982704, "step": 110750 }, { "epoch": 18.067699836867863, "grad_norm": 1.671954870223999, "learning_rate": 1.4085308102559258e-06, "loss": 0.0614, "num_input_tokens_seen": 238992048, "step": 110755 }, { "epoch": 18.06851549755302, "grad_norm": 2.193176031112671, "learning_rate": 1.4073533075180085e-06, "loss": 0.082, "num_input_tokens_seen": 239003696, "step": 110760 }, { "epoch": 18.069331158238175, "grad_norm": 0.11891468614339828, "learning_rate": 1.4061762829150032e-06, "loss": 0.0222, "num_input_tokens_seen": 239014448, "step": 110765 }, { "epoch": 18.070146818923327, "grad_norm": 0.09637793898582458, "learning_rate": 1.404999736470769e-06, "loss": 0.0194, "num_input_tokens_seen": 239025232, "step": 110770 }, { "epoch": 18.070962479608482, "grad_norm": 0.0647185817360878, "learning_rate": 1.4038236682091448e-06, "loss": 0.0149, "num_input_tokens_seen": 239035600, "step": 110775 }, { "epoch": 18.071778140293638, "grad_norm": 0.2270105481147766, "learning_rate": 1.4026480781539703e-06, "loss": 0.1013, "num_input_tokens_seen": 239047376, "step": 110780 }, { "epoch": 18.072593800978794, "grad_norm": 0.2392417937517166, "learning_rate": 1.4014729663290677e-06, "loss": 0.0229, "num_input_tokens_seen": 239057168, "step": 110785 }, { "epoch": 18.07340946166395, "grad_norm": 0.054445695132017136, "learning_rate": 1.4002983327582515e-06, "loss": 0.1132, "num_input_tokens_seen": 239068496, "step": 110790 }, { "epoch": 18.0742251223491, "grad_norm": 0.05793461948633194, "learning_rate": 1.3991241774653218e-06, "loss": 0.0587, "num_input_tokens_seen": 239079952, "step": 110795 }, { "epoch": 18.075040783034257, "grad_norm": 1.5954750776290894, "learning_rate": 1.3979505004740907e-06, "loss": 0.1551, "num_input_tokens_seen": 239090960, "step": 110800 }, { "epoch": 18.075856443719413, "grad_norm": 1.3868721723556519, "learning_rate": 1.3967773018083274e-06, "loss": 0.1595, "num_input_tokens_seen": 239100336, "step": 110805 }, { "epoch": 18.07667210440457, "grad_norm": 0.08191260695457458, "learning_rate": 1.3956045814918162e-06, "loss": 0.067, "num_input_tokens_seen": 239111216, "step": 110810 }, { "epoch": 18.07748776508972, "grad_norm": 0.3031047284603119, "learning_rate": 1.3944323395483239e-06, "loss": 0.0136, "num_input_tokens_seen": 239122576, "step": 110815 }, { "epoch": 18.078303425774877, "grad_norm": 0.05567648261785507, "learning_rate": 1.3932605760016066e-06, "loss": 0.0509, "num_input_tokens_seen": 239133712, "step": 110820 }, { "epoch": 18.079119086460032, "grad_norm": 0.7147133350372314, "learning_rate": 1.392089290875409e-06, "loss": 0.1933, "num_input_tokens_seen": 239144016, "step": 110825 }, { "epoch": 18.079934747145188, "grad_norm": 0.367963969707489, "learning_rate": 1.3909184841934736e-06, "loss": 0.021, "num_input_tokens_seen": 239154352, "step": 110830 }, { "epoch": 18.080750407830344, "grad_norm": 0.03572545573115349, "learning_rate": 1.3897481559795228e-06, "loss": 0.0944, "num_input_tokens_seen": 239165616, "step": 110835 }, { "epoch": 18.081566068515496, "grad_norm": 0.7195013761520386, "learning_rate": 1.3885783062572793e-06, "loss": 0.0234, "num_input_tokens_seen": 239176048, "step": 110840 }, { "epoch": 18.08238172920065, "grad_norm": 0.9329737424850464, "learning_rate": 1.3874089350504465e-06, "loss": 0.1211, "num_input_tokens_seen": 239186320, "step": 110845 }, { "epoch": 18.083197389885807, "grad_norm": 0.06735306978225708, "learning_rate": 1.3862400423827305e-06, "loss": 0.0101, "num_input_tokens_seen": 239196528, "step": 110850 }, { "epoch": 18.084013050570963, "grad_norm": 0.1179189682006836, "learning_rate": 1.3850716282778148e-06, "loss": 0.0117, "num_input_tokens_seen": 239207760, "step": 110855 }, { "epoch": 18.08482871125612, "grad_norm": 0.029642654582858086, "learning_rate": 1.383903692759378e-06, "loss": 0.0475, "num_input_tokens_seen": 239218992, "step": 110860 }, { "epoch": 18.08564437194127, "grad_norm": 2.39458966255188, "learning_rate": 1.3827362358510925e-06, "loss": 0.1669, "num_input_tokens_seen": 239229808, "step": 110865 }, { "epoch": 18.086460032626427, "grad_norm": 0.17102882266044617, "learning_rate": 1.3815692575766204e-06, "loss": 0.0302, "num_input_tokens_seen": 239240528, "step": 110870 }, { "epoch": 18.087275693311582, "grad_norm": 0.8783814907073975, "learning_rate": 1.3804027579596063e-06, "loss": 0.2072, "num_input_tokens_seen": 239251216, "step": 110875 }, { "epoch": 18.088091353996738, "grad_norm": 1.3893829584121704, "learning_rate": 1.3792367370236954e-06, "loss": 0.1083, "num_input_tokens_seen": 239261488, "step": 110880 }, { "epoch": 18.088907014681894, "grad_norm": 1.6406248807907104, "learning_rate": 1.3780711947925184e-06, "loss": 0.1704, "num_input_tokens_seen": 239272496, "step": 110885 }, { "epoch": 18.089722675367046, "grad_norm": 0.1491154581308365, "learning_rate": 1.3769061312896931e-06, "loss": 0.0097, "num_input_tokens_seen": 239283824, "step": 110890 }, { "epoch": 18.0905383360522, "grad_norm": 0.7881538271903992, "learning_rate": 1.3757415465388334e-06, "loss": 0.0453, "num_input_tokens_seen": 239293936, "step": 110895 }, { "epoch": 18.091353996737357, "grad_norm": 0.02942759543657303, "learning_rate": 1.3745774405635431e-06, "loss": 0.0499, "num_input_tokens_seen": 239305712, "step": 110900 }, { "epoch": 18.092169657422513, "grad_norm": 0.22108983993530273, "learning_rate": 1.3734138133874086e-06, "loss": 0.1385, "num_input_tokens_seen": 239315696, "step": 110905 }, { "epoch": 18.09298531810767, "grad_norm": 0.03784605860710144, "learning_rate": 1.3722506650340167e-06, "loss": 0.0405, "num_input_tokens_seen": 239327568, "step": 110910 }, { "epoch": 18.09380097879282, "grad_norm": 0.6874637007713318, "learning_rate": 1.3710879955269374e-06, "loss": 0.1347, "num_input_tokens_seen": 239339312, "step": 110915 }, { "epoch": 18.094616639477977, "grad_norm": 0.4116474688053131, "learning_rate": 1.3699258048897406e-06, "loss": 0.08, "num_input_tokens_seen": 239349936, "step": 110920 }, { "epoch": 18.095432300163132, "grad_norm": 0.06389790028333664, "learning_rate": 1.368764093145966e-06, "loss": 0.1052, "num_input_tokens_seen": 239359568, "step": 110925 }, { "epoch": 18.096247960848288, "grad_norm": 0.3660227358341217, "learning_rate": 1.3676028603191754e-06, "loss": 0.0655, "num_input_tokens_seen": 239369712, "step": 110930 }, { "epoch": 18.097063621533444, "grad_norm": 0.3070192337036133, "learning_rate": 1.366442106432883e-06, "loss": 0.1642, "num_input_tokens_seen": 239380880, "step": 110935 }, { "epoch": 18.097879282218596, "grad_norm": 0.43672555685043335, "learning_rate": 1.3652818315106287e-06, "loss": 0.1277, "num_input_tokens_seen": 239392464, "step": 110940 }, { "epoch": 18.09869494290375, "grad_norm": 0.9063065648078918, "learning_rate": 1.3641220355759182e-06, "loss": 0.0369, "num_input_tokens_seen": 239402224, "step": 110945 }, { "epoch": 18.099510603588907, "grad_norm": 0.027610085904598236, "learning_rate": 1.3629627186522609e-06, "loss": 0.0249, "num_input_tokens_seen": 239413072, "step": 110950 }, { "epoch": 18.100326264274063, "grad_norm": 0.07813523709774017, "learning_rate": 1.3618038807631489e-06, "loss": 0.0408, "num_input_tokens_seen": 239423472, "step": 110955 }, { "epoch": 18.10114192495922, "grad_norm": 2.6905391216278076, "learning_rate": 1.3606455219320691e-06, "loss": 0.3021, "num_input_tokens_seen": 239434064, "step": 110960 }, { "epoch": 18.10195758564437, "grad_norm": 4.118009090423584, "learning_rate": 1.3594876421824942e-06, "loss": 0.181, "num_input_tokens_seen": 239444560, "step": 110965 }, { "epoch": 18.102773246329527, "grad_norm": 0.6543264985084534, "learning_rate": 1.3583302415378946e-06, "loss": 0.0874, "num_input_tokens_seen": 239456560, "step": 110970 }, { "epoch": 18.103588907014682, "grad_norm": 0.44377756118774414, "learning_rate": 1.3571733200217235e-06, "loss": 0.0153, "num_input_tokens_seen": 239468464, "step": 110975 }, { "epoch": 18.104404567699838, "grad_norm": 0.19676783680915833, "learning_rate": 1.356016877657429e-06, "loss": 0.1291, "num_input_tokens_seen": 239478448, "step": 110980 }, { "epoch": 18.10522022838499, "grad_norm": 0.09806635230779648, "learning_rate": 1.3548609144684448e-06, "loss": 0.0382, "num_input_tokens_seen": 239489968, "step": 110985 }, { "epoch": 18.106035889070146, "grad_norm": 0.08500905334949493, "learning_rate": 1.3537054304782027e-06, "loss": 0.026, "num_input_tokens_seen": 239500368, "step": 110990 }, { "epoch": 18.1068515497553, "grad_norm": 0.1064600795507431, "learning_rate": 1.3525504257101169e-06, "loss": 0.174, "num_input_tokens_seen": 239510928, "step": 110995 }, { "epoch": 18.107667210440457, "grad_norm": 2.917553663253784, "learning_rate": 1.3513959001875937e-06, "loss": 0.0901, "num_input_tokens_seen": 239522064, "step": 111000 }, { "epoch": 18.108482871125613, "grad_norm": 0.07537656277418137, "learning_rate": 1.350241853934034e-06, "loss": 0.0299, "num_input_tokens_seen": 239532912, "step": 111005 }, { "epoch": 18.109298531810765, "grad_norm": 1.8506447076797485, "learning_rate": 1.3490882869728273e-06, "loss": 0.0984, "num_input_tokens_seen": 239544112, "step": 111010 }, { "epoch": 18.11011419249592, "grad_norm": 0.05103439837694168, "learning_rate": 1.3479351993273464e-06, "loss": 0.2253, "num_input_tokens_seen": 239554320, "step": 111015 }, { "epoch": 18.110929853181077, "grad_norm": 0.6672357320785522, "learning_rate": 1.346782591020962e-06, "loss": 0.1514, "num_input_tokens_seen": 239564784, "step": 111020 }, { "epoch": 18.111745513866232, "grad_norm": 0.06385606527328491, "learning_rate": 1.345630462077041e-06, "loss": 0.0205, "num_input_tokens_seen": 239575952, "step": 111025 }, { "epoch": 18.112561174551388, "grad_norm": 1.4422345161437988, "learning_rate": 1.3444788125189206e-06, "loss": 0.0395, "num_input_tokens_seen": 239586032, "step": 111030 }, { "epoch": 18.11337683523654, "grad_norm": 0.19728340208530426, "learning_rate": 1.3433276423699514e-06, "loss": 0.2117, "num_input_tokens_seen": 239597008, "step": 111035 }, { "epoch": 18.114192495921696, "grad_norm": 1.7630479335784912, "learning_rate": 1.342176951653451e-06, "loss": 0.096, "num_input_tokens_seen": 239607344, "step": 111040 }, { "epoch": 18.11500815660685, "grad_norm": 0.167745441198349, "learning_rate": 1.3410267403927562e-06, "loss": 0.0855, "num_input_tokens_seen": 239617552, "step": 111045 }, { "epoch": 18.115823817292007, "grad_norm": 0.07380004227161407, "learning_rate": 1.3398770086111596e-06, "loss": 0.0528, "num_input_tokens_seen": 239628656, "step": 111050 }, { "epoch": 18.116639477977163, "grad_norm": 1.811400294303894, "learning_rate": 1.3387277563319756e-06, "loss": 0.1005, "num_input_tokens_seen": 239639120, "step": 111055 }, { "epoch": 18.117455138662315, "grad_norm": 0.05627536028623581, "learning_rate": 1.3375789835784858e-06, "loss": 0.0551, "num_input_tokens_seen": 239648464, "step": 111060 }, { "epoch": 18.11827079934747, "grad_norm": 0.7681427001953125, "learning_rate": 1.3364306903739798e-06, "loss": 0.0373, "num_input_tokens_seen": 239660080, "step": 111065 }, { "epoch": 18.119086460032626, "grad_norm": 0.9793277978897095, "learning_rate": 1.3352828767417197e-06, "loss": 0.09, "num_input_tokens_seen": 239671280, "step": 111070 }, { "epoch": 18.119902120717782, "grad_norm": 0.13066604733467102, "learning_rate": 1.3341355427049811e-06, "loss": 0.1574, "num_input_tokens_seen": 239681328, "step": 111075 }, { "epoch": 18.120717781402938, "grad_norm": 0.037893831729888916, "learning_rate": 1.3329886882870013e-06, "loss": 0.092, "num_input_tokens_seen": 239691472, "step": 111080 }, { "epoch": 18.12153344208809, "grad_norm": 0.03332643210887909, "learning_rate": 1.3318423135110308e-06, "loss": 0.1205, "num_input_tokens_seen": 239701392, "step": 111085 }, { "epoch": 18.122349102773246, "grad_norm": 0.6336258053779602, "learning_rate": 1.330696418400304e-06, "loss": 0.0331, "num_input_tokens_seen": 239712016, "step": 111090 }, { "epoch": 18.1231647634584, "grad_norm": 0.5712948441505432, "learning_rate": 1.3295510029780383e-06, "loss": 0.0395, "num_input_tokens_seen": 239722512, "step": 111095 }, { "epoch": 18.123980424143557, "grad_norm": 0.038352370262145996, "learning_rate": 1.3284060672674487e-06, "loss": 0.048, "num_input_tokens_seen": 239734352, "step": 111100 }, { "epoch": 18.124796084828713, "grad_norm": 0.03405754268169403, "learning_rate": 1.3272616112917414e-06, "loss": 0.034, "num_input_tokens_seen": 239745904, "step": 111105 }, { "epoch": 18.125611745513865, "grad_norm": 1.9430264234542847, "learning_rate": 1.3261176350741088e-06, "loss": 0.0956, "num_input_tokens_seen": 239756400, "step": 111110 }, { "epoch": 18.12642740619902, "grad_norm": 0.21671099960803986, "learning_rate": 1.3249741386377358e-06, "loss": 0.0915, "num_input_tokens_seen": 239767440, "step": 111115 }, { "epoch": 18.127243066884176, "grad_norm": 1.696933388710022, "learning_rate": 1.3238311220057947e-06, "loss": 0.0815, "num_input_tokens_seen": 239777936, "step": 111120 }, { "epoch": 18.128058727569332, "grad_norm": 0.020340461283922195, "learning_rate": 1.322688585201451e-06, "loss": 0.027, "num_input_tokens_seen": 239788368, "step": 111125 }, { "epoch": 18.128874388254488, "grad_norm": 0.5969160199165344, "learning_rate": 1.3215465282478607e-06, "loss": 0.1024, "num_input_tokens_seen": 239800016, "step": 111130 }, { "epoch": 18.12969004893964, "grad_norm": 0.09050198644399643, "learning_rate": 1.3204049511681637e-06, "loss": 0.009, "num_input_tokens_seen": 239810288, "step": 111135 }, { "epoch": 18.130505709624796, "grad_norm": 0.03261495754122734, "learning_rate": 1.3192638539855058e-06, "loss": 0.074, "num_input_tokens_seen": 239820400, "step": 111140 }, { "epoch": 18.13132137030995, "grad_norm": 0.1715162843465805, "learning_rate": 1.3181232367230012e-06, "loss": 0.05, "num_input_tokens_seen": 239830960, "step": 111145 }, { "epoch": 18.132137030995107, "grad_norm": 0.9929916858673096, "learning_rate": 1.316983099403779e-06, "loss": 0.024, "num_input_tokens_seen": 239840976, "step": 111150 }, { "epoch": 18.13295269168026, "grad_norm": 0.09973084926605225, "learning_rate": 1.3158434420509319e-06, "loss": 0.0104, "num_input_tokens_seen": 239853136, "step": 111155 }, { "epoch": 18.133768352365415, "grad_norm": 0.07735100388526917, "learning_rate": 1.314704264687572e-06, "loss": 0.1076, "num_input_tokens_seen": 239864688, "step": 111160 }, { "epoch": 18.13458401305057, "grad_norm": 0.15902939438819885, "learning_rate": 1.3135655673367665e-06, "loss": 0.0607, "num_input_tokens_seen": 239875312, "step": 111165 }, { "epoch": 18.135399673735726, "grad_norm": 1.3684451580047607, "learning_rate": 1.312427350021611e-06, "loss": 0.112, "num_input_tokens_seen": 239885904, "step": 111170 }, { "epoch": 18.136215334420882, "grad_norm": 0.6643853783607483, "learning_rate": 1.3112896127651596e-06, "loss": 0.1079, "num_input_tokens_seen": 239895856, "step": 111175 }, { "epoch": 18.137030995106034, "grad_norm": 0.08256120979785919, "learning_rate": 1.3101523555904798e-06, "loss": 0.2056, "num_input_tokens_seen": 239906992, "step": 111180 }, { "epoch": 18.13784665579119, "grad_norm": 2.151634693145752, "learning_rate": 1.3090155785206087e-06, "loss": 0.1945, "num_input_tokens_seen": 239916720, "step": 111185 }, { "epoch": 18.138662316476346, "grad_norm": 0.2615625560283661, "learning_rate": 1.3078792815786e-06, "loss": 0.2228, "num_input_tokens_seen": 239928144, "step": 111190 }, { "epoch": 18.1394779771615, "grad_norm": 0.7912247776985168, "learning_rate": 1.306743464787466e-06, "loss": 0.0633, "num_input_tokens_seen": 239939152, "step": 111195 }, { "epoch": 18.140293637846657, "grad_norm": 2.2091758251190186, "learning_rate": 1.305608128170238e-06, "loss": 0.0658, "num_input_tokens_seen": 239950128, "step": 111200 }, { "epoch": 18.14110929853181, "grad_norm": 0.08802473545074463, "learning_rate": 1.3044732717499148e-06, "loss": 0.0771, "num_input_tokens_seen": 239961040, "step": 111205 }, { "epoch": 18.141924959216965, "grad_norm": 0.5112544298171997, "learning_rate": 1.3033388955495053e-06, "loss": 0.0327, "num_input_tokens_seen": 239971184, "step": 111210 }, { "epoch": 18.14274061990212, "grad_norm": 0.17266401648521423, "learning_rate": 1.3022049995919882e-06, "loss": 0.1058, "num_input_tokens_seen": 239982096, "step": 111215 }, { "epoch": 18.143556280587276, "grad_norm": 1.7692408561706543, "learning_rate": 1.3010715839003595e-06, "loss": 0.0998, "num_input_tokens_seen": 239991728, "step": 111220 }, { "epoch": 18.144371941272432, "grad_norm": 0.054425135254859924, "learning_rate": 1.29993864849757e-06, "loss": 0.0627, "num_input_tokens_seen": 240002192, "step": 111225 }, { "epoch": 18.145187601957584, "grad_norm": 0.12703129649162292, "learning_rate": 1.298806193406593e-06, "loss": 0.0593, "num_input_tokens_seen": 240014320, "step": 111230 }, { "epoch": 18.14600326264274, "grad_norm": 0.07806679606437683, "learning_rate": 1.2976742186503743e-06, "loss": 0.1174, "num_input_tokens_seen": 240024688, "step": 111235 }, { "epoch": 18.146818923327896, "grad_norm": 1.7410194873809814, "learning_rate": 1.2965427242518563e-06, "loss": 0.2869, "num_input_tokens_seen": 240034288, "step": 111240 }, { "epoch": 18.14763458401305, "grad_norm": 1.9237992763519287, "learning_rate": 1.2954117102339736e-06, "loss": 0.2785, "num_input_tokens_seen": 240045616, "step": 111245 }, { "epoch": 18.148450244698207, "grad_norm": 0.03590099513530731, "learning_rate": 1.2942811766196412e-06, "loss": 0.0093, "num_input_tokens_seen": 240056496, "step": 111250 }, { "epoch": 18.14926590538336, "grad_norm": 1.7414799928665161, "learning_rate": 1.293151123431774e-06, "loss": 0.0656, "num_input_tokens_seen": 240068080, "step": 111255 }, { "epoch": 18.150081566068515, "grad_norm": 1.2427759170532227, "learning_rate": 1.292021550693273e-06, "loss": 0.0649, "num_input_tokens_seen": 240079184, "step": 111260 }, { "epoch": 18.15089722675367, "grad_norm": 1.9475467205047607, "learning_rate": 1.2908924584270289e-06, "loss": 0.2484, "num_input_tokens_seen": 240089264, "step": 111265 }, { "epoch": 18.151712887438826, "grad_norm": 0.023148367181420326, "learning_rate": 1.289763846655931e-06, "loss": 0.0607, "num_input_tokens_seen": 240101424, "step": 111270 }, { "epoch": 18.152528548123982, "grad_norm": 0.09991708397865295, "learning_rate": 1.2886357154028445e-06, "loss": 0.2118, "num_input_tokens_seen": 240112304, "step": 111275 }, { "epoch": 18.153344208809134, "grad_norm": 0.053574591875076294, "learning_rate": 1.2875080646906317e-06, "loss": 0.0118, "num_input_tokens_seen": 240120944, "step": 111280 }, { "epoch": 18.15415986949429, "grad_norm": 0.060516007244586945, "learning_rate": 1.2863808945421579e-06, "loss": 0.0147, "num_input_tokens_seen": 240131472, "step": 111285 }, { "epoch": 18.154975530179446, "grad_norm": 0.2186863124370575, "learning_rate": 1.285254204980249e-06, "loss": 0.0706, "num_input_tokens_seen": 240142640, "step": 111290 }, { "epoch": 18.1557911908646, "grad_norm": 0.2934170365333557, "learning_rate": 1.2841279960277565e-06, "loss": 0.0106, "num_input_tokens_seen": 240153040, "step": 111295 }, { "epoch": 18.156606851549757, "grad_norm": 0.03517860174179077, "learning_rate": 1.2830022677074893e-06, "loss": 0.0896, "num_input_tokens_seen": 240163728, "step": 111300 }, { "epoch": 18.15742251223491, "grad_norm": 3.933969259262085, "learning_rate": 1.2818770200422742e-06, "loss": 0.0561, "num_input_tokens_seen": 240174448, "step": 111305 }, { "epoch": 18.158238172920065, "grad_norm": 0.08053108304738998, "learning_rate": 1.280752253054901e-06, "loss": 0.0666, "num_input_tokens_seen": 240185616, "step": 111310 }, { "epoch": 18.15905383360522, "grad_norm": 4.103809356689453, "learning_rate": 1.2796279667681848e-06, "loss": 0.0949, "num_input_tokens_seen": 240195888, "step": 111315 }, { "epoch": 18.159869494290376, "grad_norm": 1.9490461349487305, "learning_rate": 1.2785041612048882e-06, "loss": 0.2137, "num_input_tokens_seen": 240205360, "step": 111320 }, { "epoch": 18.160685154975532, "grad_norm": 0.2088102400302887, "learning_rate": 1.2773808363878064e-06, "loss": 0.0302, "num_input_tokens_seen": 240216080, "step": 111325 }, { "epoch": 18.161500815660684, "grad_norm": 0.10346201062202454, "learning_rate": 1.2762579923396912e-06, "loss": 0.0403, "num_input_tokens_seen": 240226672, "step": 111330 }, { "epoch": 18.16231647634584, "grad_norm": 0.07099271565675735, "learning_rate": 1.2751356290833045e-06, "loss": 0.048, "num_input_tokens_seen": 240237776, "step": 111335 }, { "epoch": 18.163132137030995, "grad_norm": 0.33800485730171204, "learning_rate": 1.274013746641392e-06, "loss": 0.0447, "num_input_tokens_seen": 240249040, "step": 111340 }, { "epoch": 18.16394779771615, "grad_norm": 0.19150520861148834, "learning_rate": 1.2728923450366886e-06, "loss": 0.083, "num_input_tokens_seen": 240260464, "step": 111345 }, { "epoch": 18.164763458401303, "grad_norm": 0.4103829562664032, "learning_rate": 1.2717714242919233e-06, "loss": 0.1302, "num_input_tokens_seen": 240270544, "step": 111350 }, { "epoch": 18.16557911908646, "grad_norm": 0.15727293491363525, "learning_rate": 1.2706509844298108e-06, "loss": 0.0894, "num_input_tokens_seen": 240281840, "step": 111355 }, { "epoch": 18.166394779771615, "grad_norm": 0.5587838888168335, "learning_rate": 1.2695310254730587e-06, "loss": 0.0443, "num_input_tokens_seen": 240292752, "step": 111360 }, { "epoch": 18.16721044045677, "grad_norm": 0.0741734579205513, "learning_rate": 1.2684115474443648e-06, "loss": 0.1574, "num_input_tokens_seen": 240303632, "step": 111365 }, { "epoch": 18.168026101141926, "grad_norm": 0.035368308424949646, "learning_rate": 1.2672925503664167e-06, "loss": 0.0366, "num_input_tokens_seen": 240314160, "step": 111370 }, { "epoch": 18.16884176182708, "grad_norm": 0.5363693237304688, "learning_rate": 1.266174034261891e-06, "loss": 0.2967, "num_input_tokens_seen": 240324720, "step": 111375 }, { "epoch": 18.169657422512234, "grad_norm": 1.0587118864059448, "learning_rate": 1.2650559991534584e-06, "loss": 0.0515, "num_input_tokens_seen": 240335632, "step": 111380 }, { "epoch": 18.17047308319739, "grad_norm": 0.13440106809139252, "learning_rate": 1.2639384450637753e-06, "loss": 0.0211, "num_input_tokens_seen": 240346000, "step": 111385 }, { "epoch": 18.171288743882545, "grad_norm": 0.1835474669933319, "learning_rate": 1.2628213720154908e-06, "loss": 0.0076, "num_input_tokens_seen": 240356592, "step": 111390 }, { "epoch": 18.1721044045677, "grad_norm": 1.1555941104888916, "learning_rate": 1.2617047800312448e-06, "loss": 0.047, "num_input_tokens_seen": 240366832, "step": 111395 }, { "epoch": 18.172920065252853, "grad_norm": 0.1953275352716446, "learning_rate": 1.2605886691336638e-06, "loss": 0.0518, "num_input_tokens_seen": 240377328, "step": 111400 }, { "epoch": 18.17373572593801, "grad_norm": 0.04620141535997391, "learning_rate": 1.2594730393453712e-06, "loss": 0.057, "num_input_tokens_seen": 240388144, "step": 111405 }, { "epoch": 18.174551386623165, "grad_norm": 0.19779247045516968, "learning_rate": 1.2583578906889743e-06, "loss": 0.0169, "num_input_tokens_seen": 240399184, "step": 111410 }, { "epoch": 18.17536704730832, "grad_norm": 0.12263461202383041, "learning_rate": 1.2572432231870712e-06, "loss": 0.0516, "num_input_tokens_seen": 240409584, "step": 111415 }, { "epoch": 18.176182707993476, "grad_norm": 0.09332477301359177, "learning_rate": 1.2561290368622552e-06, "loss": 0.0981, "num_input_tokens_seen": 240419600, "step": 111420 }, { "epoch": 18.17699836867863, "grad_norm": 0.11711666733026505, "learning_rate": 1.2550153317371028e-06, "loss": 0.043, "num_input_tokens_seen": 240430448, "step": 111425 }, { "epoch": 18.177814029363784, "grad_norm": 1.582374095916748, "learning_rate": 1.2539021078341929e-06, "loss": 0.1814, "num_input_tokens_seen": 240440784, "step": 111430 }, { "epoch": 18.17862969004894, "grad_norm": 0.08597278594970703, "learning_rate": 1.2527893651760742e-06, "loss": 0.275, "num_input_tokens_seen": 240450800, "step": 111435 }, { "epoch": 18.179445350734095, "grad_norm": 0.1372394561767578, "learning_rate": 1.2516771037853093e-06, "loss": 0.2593, "num_input_tokens_seen": 240461936, "step": 111440 }, { "epoch": 18.18026101141925, "grad_norm": 1.1218265295028687, "learning_rate": 1.2505653236844327e-06, "loss": 0.1248, "num_input_tokens_seen": 240472112, "step": 111445 }, { "epoch": 18.181076672104403, "grad_norm": 0.9553453326225281, "learning_rate": 1.2494540248959795e-06, "loss": 0.064, "num_input_tokens_seen": 240480560, "step": 111450 }, { "epoch": 18.18189233278956, "grad_norm": 2.6689705848693848, "learning_rate": 1.2483432074424706e-06, "loss": 0.174, "num_input_tokens_seen": 240491376, "step": 111455 }, { "epoch": 18.182707993474715, "grad_norm": 1.5457682609558105, "learning_rate": 1.2472328713464153e-06, "loss": 0.1172, "num_input_tokens_seen": 240502384, "step": 111460 }, { "epoch": 18.18352365415987, "grad_norm": 0.27161869406700134, "learning_rate": 1.2461230166303211e-06, "loss": 0.0796, "num_input_tokens_seen": 240511888, "step": 111465 }, { "epoch": 18.184339314845026, "grad_norm": 0.04197124019265175, "learning_rate": 1.245013643316678e-06, "loss": 0.1037, "num_input_tokens_seen": 240523088, "step": 111470 }, { "epoch": 18.18515497553018, "grad_norm": 0.8218522667884827, "learning_rate": 1.2439047514279656e-06, "loss": 0.1449, "num_input_tokens_seen": 240534192, "step": 111475 }, { "epoch": 18.185970636215334, "grad_norm": 1.7828253507614136, "learning_rate": 1.2427963409866628e-06, "loss": 0.0828, "num_input_tokens_seen": 240545072, "step": 111480 }, { "epoch": 18.18678629690049, "grad_norm": 1.9234600067138672, "learning_rate": 1.2416884120152294e-06, "loss": 0.1462, "num_input_tokens_seen": 240556112, "step": 111485 }, { "epoch": 18.187601957585645, "grad_norm": 1.3802915811538696, "learning_rate": 1.24058096453612e-06, "loss": 0.1686, "num_input_tokens_seen": 240566896, "step": 111490 }, { "epoch": 18.1884176182708, "grad_norm": 1.5829440355300903, "learning_rate": 1.2394739985717801e-06, "loss": 0.1235, "num_input_tokens_seen": 240578096, "step": 111495 }, { "epoch": 18.189233278955953, "grad_norm": 2.376199960708618, "learning_rate": 1.2383675141446393e-06, "loss": 0.1159, "num_input_tokens_seen": 240589712, "step": 111500 }, { "epoch": 18.19004893964111, "grad_norm": 0.1407003104686737, "learning_rate": 1.2372615112771268e-06, "loss": 0.1249, "num_input_tokens_seen": 240599600, "step": 111505 }, { "epoch": 18.190864600326265, "grad_norm": 0.022153930738568306, "learning_rate": 1.2361559899916524e-06, "loss": 0.0147, "num_input_tokens_seen": 240610768, "step": 111510 }, { "epoch": 18.19168026101142, "grad_norm": 0.2599550485610962, "learning_rate": 1.235050950310626e-06, "loss": 0.1608, "num_input_tokens_seen": 240621392, "step": 111515 }, { "epoch": 18.192495921696572, "grad_norm": 1.8421592712402344, "learning_rate": 1.233946392256438e-06, "loss": 0.1775, "num_input_tokens_seen": 240633264, "step": 111520 }, { "epoch": 18.193311582381728, "grad_norm": 0.39796656370162964, "learning_rate": 1.2328423158514762e-06, "loss": 0.018, "num_input_tokens_seen": 240643664, "step": 111525 }, { "epoch": 18.194127243066884, "grad_norm": 0.6295039057731628, "learning_rate": 1.231738721118117e-06, "loss": 0.1946, "num_input_tokens_seen": 240654192, "step": 111530 }, { "epoch": 18.19494290375204, "grad_norm": 0.11842066049575806, "learning_rate": 1.230635608078723e-06, "loss": 0.0623, "num_input_tokens_seen": 240665008, "step": 111535 }, { "epoch": 18.195758564437195, "grad_norm": 0.2611891031265259, "learning_rate": 1.229532976755654e-06, "loss": 0.1472, "num_input_tokens_seen": 240676336, "step": 111540 }, { "epoch": 18.196574225122347, "grad_norm": 2.5581867694854736, "learning_rate": 1.2284308271712536e-06, "loss": 0.1552, "num_input_tokens_seen": 240687248, "step": 111545 }, { "epoch": 18.197389885807503, "grad_norm": 0.025593571364879608, "learning_rate": 1.2273291593478564e-06, "loss": 0.1207, "num_input_tokens_seen": 240699568, "step": 111550 }, { "epoch": 18.19820554649266, "grad_norm": 0.06287917494773865, "learning_rate": 1.226227973307792e-06, "loss": 0.2613, "num_input_tokens_seen": 240710640, "step": 111555 }, { "epoch": 18.199021207177815, "grad_norm": 0.16687847673892975, "learning_rate": 1.225127269073381e-06, "loss": 0.0589, "num_input_tokens_seen": 240721456, "step": 111560 }, { "epoch": 18.19983686786297, "grad_norm": 0.207020103931427, "learning_rate": 1.2240270466669202e-06, "loss": 0.0803, "num_input_tokens_seen": 240732336, "step": 111565 }, { "epoch": 18.200652528548122, "grad_norm": 0.3405044972896576, "learning_rate": 1.222927306110716e-06, "loss": 0.0601, "num_input_tokens_seen": 240742736, "step": 111570 }, { "epoch": 18.201468189233278, "grad_norm": 0.27003929018974304, "learning_rate": 1.2218280474270565e-06, "loss": 0.0317, "num_input_tokens_seen": 240753200, "step": 111575 }, { "epoch": 18.202283849918434, "grad_norm": 0.1223931685090065, "learning_rate": 1.2207292706382127e-06, "loss": 0.0574, "num_input_tokens_seen": 240765392, "step": 111580 }, { "epoch": 18.20309951060359, "grad_norm": 0.052146460860967636, "learning_rate": 1.2196309757664587e-06, "loss": 0.0742, "num_input_tokens_seen": 240777456, "step": 111585 }, { "epoch": 18.203915171288745, "grad_norm": 0.12166589498519897, "learning_rate": 1.2185331628340484e-06, "loss": 0.2022, "num_input_tokens_seen": 240788240, "step": 111590 }, { "epoch": 18.204730831973897, "grad_norm": 2.2772183418273926, "learning_rate": 1.217435831863234e-06, "loss": 0.2696, "num_input_tokens_seen": 240799376, "step": 111595 }, { "epoch": 18.205546492659053, "grad_norm": 0.6864228844642639, "learning_rate": 1.216338982876253e-06, "loss": 0.1528, "num_input_tokens_seen": 240810800, "step": 111600 }, { "epoch": 18.20636215334421, "grad_norm": 0.3049562871456146, "learning_rate": 1.2152426158953345e-06, "loss": 0.1235, "num_input_tokens_seen": 240822064, "step": 111605 }, { "epoch": 18.207177814029365, "grad_norm": 0.14477461576461792, "learning_rate": 1.2141467309426946e-06, "loss": 0.0088, "num_input_tokens_seen": 240832272, "step": 111610 }, { "epoch": 18.20799347471452, "grad_norm": 1.9029035568237305, "learning_rate": 1.2130513280405487e-06, "loss": 0.1139, "num_input_tokens_seen": 240842320, "step": 111615 }, { "epoch": 18.208809135399672, "grad_norm": 0.23530684411525726, "learning_rate": 1.2119564072110928e-06, "loss": 0.0094, "num_input_tokens_seen": 240852240, "step": 111620 }, { "epoch": 18.209624796084828, "grad_norm": 0.31829193234443665, "learning_rate": 1.2108619684765177e-06, "loss": 0.0162, "num_input_tokens_seen": 240862544, "step": 111625 }, { "epoch": 18.210440456769984, "grad_norm": 2.542381763458252, "learning_rate": 1.2097680118590027e-06, "loss": 0.1129, "num_input_tokens_seen": 240872112, "step": 111630 }, { "epoch": 18.21125611745514, "grad_norm": 1.3692748546600342, "learning_rate": 1.208674537380719e-06, "loss": 0.1752, "num_input_tokens_seen": 240882352, "step": 111635 }, { "epoch": 18.212071778140295, "grad_norm": 1.8647853136062622, "learning_rate": 1.2075815450638266e-06, "loss": 0.2158, "num_input_tokens_seen": 240893264, "step": 111640 }, { "epoch": 18.212887438825447, "grad_norm": 0.025744417682290077, "learning_rate": 1.2064890349304803e-06, "loss": 0.0615, "num_input_tokens_seen": 240903952, "step": 111645 }, { "epoch": 18.213703099510603, "grad_norm": 0.05084449425339699, "learning_rate": 1.2053970070028147e-06, "loss": 0.0358, "num_input_tokens_seen": 240914768, "step": 111650 }, { "epoch": 18.21451876019576, "grad_norm": 0.07114972919225693, "learning_rate": 1.2043054613029653e-06, "loss": 0.013, "num_input_tokens_seen": 240926064, "step": 111655 }, { "epoch": 18.215334420880914, "grad_norm": 0.510347843170166, "learning_rate": 1.2032143978530503e-06, "loss": 0.1697, "num_input_tokens_seen": 240936720, "step": 111660 }, { "epoch": 18.21615008156607, "grad_norm": 0.10431865602731705, "learning_rate": 1.202123816675188e-06, "loss": 0.1456, "num_input_tokens_seen": 240947728, "step": 111665 }, { "epoch": 18.216965742251222, "grad_norm": 0.24679268896579742, "learning_rate": 1.201033717791472e-06, "loss": 0.0163, "num_input_tokens_seen": 240958704, "step": 111670 }, { "epoch": 18.217781402936378, "grad_norm": 0.11671815812587738, "learning_rate": 1.1999441012240041e-06, "loss": 0.0828, "num_input_tokens_seen": 240970832, "step": 111675 }, { "epoch": 18.218597063621534, "grad_norm": 0.21732260286808014, "learning_rate": 1.1988549669948556e-06, "loss": 0.1893, "num_input_tokens_seen": 240982448, "step": 111680 }, { "epoch": 18.21941272430669, "grad_norm": 0.7514810562133789, "learning_rate": 1.1977663151261114e-06, "loss": 0.0606, "num_input_tokens_seen": 240993328, "step": 111685 }, { "epoch": 18.22022838499184, "grad_norm": 0.046193744987249374, "learning_rate": 1.1966781456398234e-06, "loss": 0.2122, "num_input_tokens_seen": 241003856, "step": 111690 }, { "epoch": 18.221044045676997, "grad_norm": 0.06582420319318771, "learning_rate": 1.1955904585580547e-06, "loss": 0.0946, "num_input_tokens_seen": 241013616, "step": 111695 }, { "epoch": 18.221859706362153, "grad_norm": 0.09061625599861145, "learning_rate": 1.1945032539028344e-06, "loss": 0.0425, "num_input_tokens_seen": 241023312, "step": 111700 }, { "epoch": 18.22267536704731, "grad_norm": 0.11217179894447327, "learning_rate": 1.1934165316962147e-06, "loss": 0.2657, "num_input_tokens_seen": 241033776, "step": 111705 }, { "epoch": 18.223491027732464, "grad_norm": 2.8767035007476807, "learning_rate": 1.192330291960203e-06, "loss": 0.0654, "num_input_tokens_seen": 241044528, "step": 111710 }, { "epoch": 18.224306688417617, "grad_norm": 0.07096675038337708, "learning_rate": 1.1912445347168233e-06, "loss": 0.0224, "num_input_tokens_seen": 241056144, "step": 111715 }, { "epoch": 18.225122349102772, "grad_norm": 0.3446805775165558, "learning_rate": 1.1901592599880745e-06, "loss": 0.0914, "num_input_tokens_seen": 241066960, "step": 111720 }, { "epoch": 18.225938009787928, "grad_norm": 0.2504013478755951, "learning_rate": 1.1890744677959558e-06, "loss": 0.1625, "num_input_tokens_seen": 241077744, "step": 111725 }, { "epoch": 18.226753670473084, "grad_norm": 0.18296842277050018, "learning_rate": 1.187990158162447e-06, "loss": 0.0388, "num_input_tokens_seen": 241089328, "step": 111730 }, { "epoch": 18.22756933115824, "grad_norm": 0.33718279004096985, "learning_rate": 1.1869063311095275e-06, "loss": 0.0399, "num_input_tokens_seen": 241100784, "step": 111735 }, { "epoch": 18.22838499184339, "grad_norm": 0.07462268322706223, "learning_rate": 1.1858229866591603e-06, "loss": 0.1353, "num_input_tokens_seen": 241111504, "step": 111740 }, { "epoch": 18.229200652528547, "grad_norm": 0.03884302079677582, "learning_rate": 1.1847401248332974e-06, "loss": 0.0564, "num_input_tokens_seen": 241122864, "step": 111745 }, { "epoch": 18.230016313213703, "grad_norm": 0.6475749015808105, "learning_rate": 1.1836577456538905e-06, "loss": 0.1454, "num_input_tokens_seen": 241134128, "step": 111750 }, { "epoch": 18.23083197389886, "grad_norm": 2.7378389835357666, "learning_rate": 1.1825758491428723e-06, "loss": 0.0551, "num_input_tokens_seen": 241144912, "step": 111755 }, { "epoch": 18.231647634584014, "grad_norm": 0.1062290146946907, "learning_rate": 1.1814944353221697e-06, "loss": 0.0139, "num_input_tokens_seen": 241155696, "step": 111760 }, { "epoch": 18.232463295269167, "grad_norm": 2.576378107070923, "learning_rate": 1.1804135042136927e-06, "loss": 0.1636, "num_input_tokens_seen": 241166896, "step": 111765 }, { "epoch": 18.233278955954322, "grad_norm": 0.3938957452774048, "learning_rate": 1.1793330558393629e-06, "loss": 0.1318, "num_input_tokens_seen": 241176880, "step": 111770 }, { "epoch": 18.234094616639478, "grad_norm": 0.5248487591743469, "learning_rate": 1.1782530902210598e-06, "loss": 0.2141, "num_input_tokens_seen": 241187568, "step": 111775 }, { "epoch": 18.234910277324634, "grad_norm": 0.6992287039756775, "learning_rate": 1.1771736073806827e-06, "loss": 0.2768, "num_input_tokens_seen": 241198640, "step": 111780 }, { "epoch": 18.23572593800979, "grad_norm": 1.5564098358154297, "learning_rate": 1.1760946073400974e-06, "loss": 0.0912, "num_input_tokens_seen": 241209840, "step": 111785 }, { "epoch": 18.23654159869494, "grad_norm": 0.16567499935626984, "learning_rate": 1.1750160901211866e-06, "loss": 0.0128, "num_input_tokens_seen": 241220720, "step": 111790 }, { "epoch": 18.237357259380097, "grad_norm": 0.7771691083908081, "learning_rate": 1.1739380557457907e-06, "loss": 0.126, "num_input_tokens_seen": 241229744, "step": 111795 }, { "epoch": 18.238172920065253, "grad_norm": 0.35047879815101624, "learning_rate": 1.1728605042357732e-06, "loss": 0.0614, "num_input_tokens_seen": 241240976, "step": 111800 }, { "epoch": 18.23898858075041, "grad_norm": 0.3262999951839447, "learning_rate": 1.1717834356129553e-06, "loss": 0.0348, "num_input_tokens_seen": 241250736, "step": 111805 }, { "epoch": 18.239804241435564, "grad_norm": 0.06839349120855331, "learning_rate": 1.1707068498991835e-06, "loss": 0.1555, "num_input_tokens_seen": 241262160, "step": 111810 }, { "epoch": 18.240619902120716, "grad_norm": 0.035261161625385284, "learning_rate": 1.16963074711626e-06, "loss": 0.0937, "num_input_tokens_seen": 241272112, "step": 111815 }, { "epoch": 18.241435562805872, "grad_norm": 0.5412734746932983, "learning_rate": 1.1685551272860056e-06, "loss": 0.1444, "num_input_tokens_seen": 241283696, "step": 111820 }, { "epoch": 18.242251223491028, "grad_norm": 0.5995345115661621, "learning_rate": 1.1674799904302092e-06, "loss": 0.0918, "num_input_tokens_seen": 241294032, "step": 111825 }, { "epoch": 18.243066884176184, "grad_norm": 0.12336945533752441, "learning_rate": 1.1664053365706694e-06, "loss": 0.0995, "num_input_tokens_seen": 241305296, "step": 111830 }, { "epoch": 18.24388254486134, "grad_norm": 2.288518190383911, "learning_rate": 1.1653311657291556e-06, "loss": 0.1544, "num_input_tokens_seen": 241316176, "step": 111835 }, { "epoch": 18.24469820554649, "grad_norm": 0.1613188534975052, "learning_rate": 1.1642574779274467e-06, "loss": 0.1498, "num_input_tokens_seen": 241326192, "step": 111840 }, { "epoch": 18.245513866231647, "grad_norm": 0.14763952791690826, "learning_rate": 1.1631842731872927e-06, "loss": 0.0058, "num_input_tokens_seen": 241336016, "step": 111845 }, { "epoch": 18.246329526916803, "grad_norm": 1.054453730583191, "learning_rate": 1.1621115515304564e-06, "loss": 0.1139, "num_input_tokens_seen": 241345328, "step": 111850 }, { "epoch": 18.24714518760196, "grad_norm": 3.426743507385254, "learning_rate": 1.1610393129786596e-06, "loss": 0.1341, "num_input_tokens_seen": 241356304, "step": 111855 }, { "epoch": 18.247960848287114, "grad_norm": 0.04925122484564781, "learning_rate": 1.159967557553651e-06, "loss": 0.1395, "num_input_tokens_seen": 241366704, "step": 111860 }, { "epoch": 18.248776508972266, "grad_norm": 0.10328096151351929, "learning_rate": 1.158896285277139e-06, "loss": 0.2342, "num_input_tokens_seen": 241377584, "step": 111865 }, { "epoch": 18.249592169657422, "grad_norm": 0.09841467440128326, "learning_rate": 1.157825496170839e-06, "loss": 0.1219, "num_input_tokens_seen": 241389072, "step": 111870 }, { "epoch": 18.250407830342578, "grad_norm": 0.0505024753510952, "learning_rate": 1.1567551902564534e-06, "loss": 0.0313, "num_input_tokens_seen": 241401488, "step": 111875 }, { "epoch": 18.251223491027734, "grad_norm": 0.823108971118927, "learning_rate": 1.15568536755567e-06, "loss": 0.0925, "num_input_tokens_seen": 241411088, "step": 111880 }, { "epoch": 18.252039151712886, "grad_norm": 0.02956179901957512, "learning_rate": 1.1546160280901718e-06, "loss": 0.0874, "num_input_tokens_seen": 241422736, "step": 111885 }, { "epoch": 18.25285481239804, "grad_norm": 2.0639729499816895, "learning_rate": 1.1535471718816276e-06, "loss": 0.1644, "num_input_tokens_seen": 241433360, "step": 111890 }, { "epoch": 18.253670473083197, "grad_norm": 0.8198709487915039, "learning_rate": 1.1524787989517033e-06, "loss": 0.1298, "num_input_tokens_seen": 241444016, "step": 111895 }, { "epoch": 18.254486133768353, "grad_norm": 0.6074323058128357, "learning_rate": 1.1514109093220483e-06, "loss": 0.0171, "num_input_tokens_seen": 241453232, "step": 111900 }, { "epoch": 18.25530179445351, "grad_norm": 0.27780669927597046, "learning_rate": 1.1503435030143061e-06, "loss": 0.0145, "num_input_tokens_seen": 241464592, "step": 111905 }, { "epoch": 18.25611745513866, "grad_norm": 1.7453566789627075, "learning_rate": 1.1492765800501042e-06, "loss": 0.0904, "num_input_tokens_seen": 241473744, "step": 111910 }, { "epoch": 18.256933115823816, "grad_norm": 2.674747943878174, "learning_rate": 1.148210140451078e-06, "loss": 0.1748, "num_input_tokens_seen": 241484720, "step": 111915 }, { "epoch": 18.257748776508972, "grad_norm": 0.19422248005867004, "learning_rate": 1.1471441842388213e-06, "loss": 0.0485, "num_input_tokens_seen": 241495248, "step": 111920 }, { "epoch": 18.258564437194128, "grad_norm": 0.03508033975958824, "learning_rate": 1.1460787114349557e-06, "loss": 0.1087, "num_input_tokens_seen": 241505104, "step": 111925 }, { "epoch": 18.259380097879284, "grad_norm": 0.8644686937332153, "learning_rate": 1.1450137220610584e-06, "loss": 0.038, "num_input_tokens_seen": 241515664, "step": 111930 }, { "epoch": 18.260195758564436, "grad_norm": 0.47439780831336975, "learning_rate": 1.143949216138729e-06, "loss": 0.0285, "num_input_tokens_seen": 241526672, "step": 111935 }, { "epoch": 18.26101141924959, "grad_norm": 0.08871438354253769, "learning_rate": 1.142885193689522e-06, "loss": 0.3475, "num_input_tokens_seen": 241538032, "step": 111940 }, { "epoch": 18.261827079934747, "grad_norm": 1.1858329772949219, "learning_rate": 1.1418216547350203e-06, "loss": 0.0567, "num_input_tokens_seen": 241549264, "step": 111945 }, { "epoch": 18.262642740619903, "grad_norm": 0.12367760390043259, "learning_rate": 1.1407585992967622e-06, "loss": 0.0258, "num_input_tokens_seen": 241559504, "step": 111950 }, { "epoch": 18.26345840130506, "grad_norm": 0.5340371131896973, "learning_rate": 1.139696027396306e-06, "loss": 0.1445, "num_input_tokens_seen": 241570896, "step": 111955 }, { "epoch": 18.26427406199021, "grad_norm": 0.23787912726402283, "learning_rate": 1.1386339390551725e-06, "loss": 0.0144, "num_input_tokens_seen": 241581776, "step": 111960 }, { "epoch": 18.265089722675366, "grad_norm": 0.09004765748977661, "learning_rate": 1.137572334294898e-06, "loss": 0.1537, "num_input_tokens_seen": 241592240, "step": 111965 }, { "epoch": 18.265905383360522, "grad_norm": 0.15266428887844086, "learning_rate": 1.1365112131369842e-06, "loss": 0.0114, "num_input_tokens_seen": 241602096, "step": 111970 }, { "epoch": 18.266721044045678, "grad_norm": 0.15179461240768433, "learning_rate": 1.1354505756029476e-06, "loss": 0.2153, "num_input_tokens_seen": 241612016, "step": 111975 }, { "epoch": 18.267536704730833, "grad_norm": 1.3452837467193604, "learning_rate": 1.1343904217142791e-06, "loss": 0.1064, "num_input_tokens_seen": 241623280, "step": 111980 }, { "epoch": 18.268352365415986, "grad_norm": 0.24994713068008423, "learning_rate": 1.1333307514924646e-06, "loss": 0.0341, "num_input_tokens_seen": 241633776, "step": 111985 }, { "epoch": 18.26916802610114, "grad_norm": 0.7460962533950806, "learning_rate": 1.132271564958981e-06, "loss": 0.1087, "num_input_tokens_seen": 241644176, "step": 111990 }, { "epoch": 18.269983686786297, "grad_norm": 0.09693332016468048, "learning_rate": 1.1312128621352918e-06, "loss": 0.0087, "num_input_tokens_seen": 241655792, "step": 111995 }, { "epoch": 18.270799347471453, "grad_norm": 1.084405541419983, "learning_rate": 1.1301546430428521e-06, "loss": 0.0801, "num_input_tokens_seen": 241667568, "step": 112000 }, { "epoch": 18.27161500815661, "grad_norm": 0.13183146715164185, "learning_rate": 1.1290969077031117e-06, "loss": 0.0321, "num_input_tokens_seen": 241677584, "step": 112005 }, { "epoch": 18.27243066884176, "grad_norm": 4.317193984985352, "learning_rate": 1.1280396561375028e-06, "loss": 0.1499, "num_input_tokens_seen": 241688208, "step": 112010 }, { "epoch": 18.273246329526916, "grad_norm": 1.3110771179199219, "learning_rate": 1.126982888367456e-06, "loss": 0.1197, "num_input_tokens_seen": 241697936, "step": 112015 }, { "epoch": 18.274061990212072, "grad_norm": 0.034174736589193344, "learning_rate": 1.1259266044143847e-06, "loss": 0.1165, "num_input_tokens_seen": 241708624, "step": 112020 }, { "epoch": 18.274877650897228, "grad_norm": 0.6546769142150879, "learning_rate": 1.124870804299699e-06, "loss": 0.0412, "num_input_tokens_seen": 241718672, "step": 112025 }, { "epoch": 18.275693311582383, "grad_norm": 0.25222325325012207, "learning_rate": 1.123815488044791e-06, "loss": 0.0999, "num_input_tokens_seen": 241728944, "step": 112030 }, { "epoch": 18.276508972267536, "grad_norm": 0.10664492100477219, "learning_rate": 1.122760655671054e-06, "loss": 0.1118, "num_input_tokens_seen": 241740592, "step": 112035 }, { "epoch": 18.27732463295269, "grad_norm": 0.3434044122695923, "learning_rate": 1.1217063071998628e-06, "loss": 0.1002, "num_input_tokens_seen": 241751568, "step": 112040 }, { "epoch": 18.278140293637847, "grad_norm": 0.04020646587014198, "learning_rate": 1.120652442652581e-06, "loss": 0.1248, "num_input_tokens_seen": 241763024, "step": 112045 }, { "epoch": 18.278955954323003, "grad_norm": 0.47089236974716187, "learning_rate": 1.1195990620505747e-06, "loss": 0.0352, "num_input_tokens_seen": 241774128, "step": 112050 }, { "epoch": 18.27977161500816, "grad_norm": 0.11010874062776566, "learning_rate": 1.1185461654151853e-06, "loss": 0.0687, "num_input_tokens_seen": 241784304, "step": 112055 }, { "epoch": 18.28058727569331, "grad_norm": 1.906785249710083, "learning_rate": 1.1174937527677565e-06, "loss": 0.1257, "num_input_tokens_seen": 241794608, "step": 112060 }, { "epoch": 18.281402936378466, "grad_norm": 0.06569743156433105, "learning_rate": 1.1164418241296103e-06, "loss": 0.0363, "num_input_tokens_seen": 241804784, "step": 112065 }, { "epoch": 18.282218597063622, "grad_norm": 0.4054315388202667, "learning_rate": 1.1153903795220744e-06, "loss": 0.1487, "num_input_tokens_seen": 241815824, "step": 112070 }, { "epoch": 18.283034257748778, "grad_norm": 0.3085690140724182, "learning_rate": 1.1143394189664452e-06, "loss": 0.2215, "num_input_tokens_seen": 241827120, "step": 112075 }, { "epoch": 18.28384991843393, "grad_norm": 0.12387159466743469, "learning_rate": 1.1132889424840337e-06, "loss": 0.1738, "num_input_tokens_seen": 241838448, "step": 112080 }, { "epoch": 18.284665579119086, "grad_norm": 0.7407680153846741, "learning_rate": 1.1122389500961227e-06, "loss": 0.038, "num_input_tokens_seen": 241849808, "step": 112085 }, { "epoch": 18.28548123980424, "grad_norm": 0.049286503344774246, "learning_rate": 1.1111894418239926e-06, "loss": 0.0311, "num_input_tokens_seen": 241859792, "step": 112090 }, { "epoch": 18.286296900489397, "grad_norm": 0.10425817966461182, "learning_rate": 1.1101404176889151e-06, "loss": 0.2802, "num_input_tokens_seen": 241870736, "step": 112095 }, { "epoch": 18.287112561174553, "grad_norm": 0.26182684302330017, "learning_rate": 1.1090918777121484e-06, "loss": 0.0547, "num_input_tokens_seen": 241880816, "step": 112100 }, { "epoch": 18.287928221859705, "grad_norm": 0.36258894205093384, "learning_rate": 1.108043821914942e-06, "loss": 0.1296, "num_input_tokens_seen": 241890960, "step": 112105 }, { "epoch": 18.28874388254486, "grad_norm": 1.026785969734192, "learning_rate": 1.1069962503185372e-06, "loss": 0.0612, "num_input_tokens_seen": 241902288, "step": 112110 }, { "epoch": 18.289559543230016, "grad_norm": 0.2966945171356201, "learning_rate": 1.1059491629441615e-06, "loss": 0.0778, "num_input_tokens_seen": 241912368, "step": 112115 }, { "epoch": 18.290375203915172, "grad_norm": 0.6738225817680359, "learning_rate": 1.1049025598130397e-06, "loss": 0.0476, "num_input_tokens_seen": 241922096, "step": 112120 }, { "epoch": 18.291190864600328, "grad_norm": 0.39693501591682434, "learning_rate": 1.1038564409463826e-06, "loss": 0.0221, "num_input_tokens_seen": 241932944, "step": 112125 }, { "epoch": 18.29200652528548, "grad_norm": 0.030152037739753723, "learning_rate": 1.102810806365387e-06, "loss": 0.0451, "num_input_tokens_seen": 241943760, "step": 112130 }, { "epoch": 18.292822185970635, "grad_norm": 0.6539421677589417, "learning_rate": 1.1017656560912442e-06, "loss": 0.0371, "num_input_tokens_seen": 241956080, "step": 112135 }, { "epoch": 18.29363784665579, "grad_norm": 0.08474380522966385, "learning_rate": 1.1007209901451404e-06, "loss": 0.0536, "num_input_tokens_seen": 241966064, "step": 112140 }, { "epoch": 18.294453507340947, "grad_norm": 2.2427873611450195, "learning_rate": 1.0996768085482418e-06, "loss": 0.192, "num_input_tokens_seen": 241976240, "step": 112145 }, { "epoch": 18.295269168026103, "grad_norm": 1.476315975189209, "learning_rate": 1.0986331113217146e-06, "loss": 0.0375, "num_input_tokens_seen": 241985840, "step": 112150 }, { "epoch": 18.296084828711255, "grad_norm": 0.1446811556816101, "learning_rate": 1.0975898984867088e-06, "loss": 0.1497, "num_input_tokens_seen": 241997616, "step": 112155 }, { "epoch": 18.29690048939641, "grad_norm": 0.7562087774276733, "learning_rate": 1.096547170064363e-06, "loss": 0.0205, "num_input_tokens_seen": 242007504, "step": 112160 }, { "epoch": 18.297716150081566, "grad_norm": 1.6498440504074097, "learning_rate": 1.095504926075816e-06, "loss": 0.1587, "num_input_tokens_seen": 242018096, "step": 112165 }, { "epoch": 18.298531810766722, "grad_norm": 1.2007821798324585, "learning_rate": 1.094463166542184e-06, "loss": 0.2078, "num_input_tokens_seen": 242029264, "step": 112170 }, { "epoch": 18.299347471451878, "grad_norm": 0.4342266917228699, "learning_rate": 1.0934218914845834e-06, "loss": 0.154, "num_input_tokens_seen": 242039408, "step": 112175 }, { "epoch": 18.30016313213703, "grad_norm": 0.20667323470115662, "learning_rate": 1.0923811009241142e-06, "loss": 0.1651, "num_input_tokens_seen": 242049904, "step": 112180 }, { "epoch": 18.300978792822185, "grad_norm": 0.15266621112823486, "learning_rate": 1.0913407948818706e-06, "loss": 0.0766, "num_input_tokens_seen": 242060848, "step": 112185 }, { "epoch": 18.30179445350734, "grad_norm": 0.056949298828840256, "learning_rate": 1.0903009733789382e-06, "loss": 0.1379, "num_input_tokens_seen": 242071216, "step": 112190 }, { "epoch": 18.302610114192497, "grad_norm": 0.7130948305130005, "learning_rate": 1.0892616364363868e-06, "loss": 0.1469, "num_input_tokens_seen": 242083408, "step": 112195 }, { "epoch": 18.303425774877653, "grad_norm": 0.3402724266052246, "learning_rate": 1.0882227840752823e-06, "loss": 0.0711, "num_input_tokens_seen": 242093680, "step": 112200 }, { "epoch": 18.304241435562805, "grad_norm": 0.17602738738059998, "learning_rate": 1.0871844163166778e-06, "loss": 0.2317, "num_input_tokens_seen": 242104752, "step": 112205 }, { "epoch": 18.30505709624796, "grad_norm": 0.059365395456552505, "learning_rate": 1.0861465331816172e-06, "loss": 0.065, "num_input_tokens_seen": 242114064, "step": 112210 }, { "epoch": 18.305872756933116, "grad_norm": 0.2079605758190155, "learning_rate": 1.0851091346911313e-06, "loss": 0.0394, "num_input_tokens_seen": 242125840, "step": 112215 }, { "epoch": 18.306688417618272, "grad_norm": 0.02941949851810932, "learning_rate": 1.0840722208662502e-06, "loss": 0.0712, "num_input_tokens_seen": 242137680, "step": 112220 }, { "epoch": 18.307504078303428, "grad_norm": 0.06547810137271881, "learning_rate": 1.083035791727982e-06, "loss": 0.1566, "num_input_tokens_seen": 242148496, "step": 112225 }, { "epoch": 18.30831973898858, "grad_norm": 1.3635783195495605, "learning_rate": 1.0819998472973352e-06, "loss": 0.1232, "num_input_tokens_seen": 242158992, "step": 112230 }, { "epoch": 18.309135399673735, "grad_norm": 0.033630941063165665, "learning_rate": 1.0809643875953068e-06, "loss": 0.0065, "num_input_tokens_seen": 242168720, "step": 112235 }, { "epoch": 18.30995106035889, "grad_norm": 2.5423128604888916, "learning_rate": 1.0799294126428745e-06, "loss": 0.073, "num_input_tokens_seen": 242181168, "step": 112240 }, { "epoch": 18.310766721044047, "grad_norm": 0.042779453098773956, "learning_rate": 1.0788949224610184e-06, "loss": 0.1062, "num_input_tokens_seen": 242192656, "step": 112245 }, { "epoch": 18.3115823817292, "grad_norm": 1.644538402557373, "learning_rate": 1.0778609170707028e-06, "loss": 0.1622, "num_input_tokens_seen": 242203088, "step": 112250 }, { "epoch": 18.312398042414355, "grad_norm": 0.12595881521701813, "learning_rate": 1.0768273964928854e-06, "loss": 0.127, "num_input_tokens_seen": 242213968, "step": 112255 }, { "epoch": 18.31321370309951, "grad_norm": 2.24504017829895, "learning_rate": 1.0757943607485082e-06, "loss": 0.4788, "num_input_tokens_seen": 242223952, "step": 112260 }, { "epoch": 18.314029363784666, "grad_norm": 1.2424793243408203, "learning_rate": 1.0747618098585072e-06, "loss": 0.0388, "num_input_tokens_seen": 242234800, "step": 112265 }, { "epoch": 18.31484502446982, "grad_norm": 0.020096460357308388, "learning_rate": 1.0737297438438098e-06, "loss": 0.0437, "num_input_tokens_seen": 242246064, "step": 112270 }, { "epoch": 18.315660685154974, "grad_norm": 0.7480965852737427, "learning_rate": 1.0726981627253302e-06, "loss": 0.0733, "num_input_tokens_seen": 242257232, "step": 112275 }, { "epoch": 18.31647634584013, "grad_norm": 0.2095641940832138, "learning_rate": 1.0716670665239793e-06, "loss": 0.1324, "num_input_tokens_seen": 242268304, "step": 112280 }, { "epoch": 18.317292006525285, "grad_norm": 0.31891047954559326, "learning_rate": 1.0706364552606485e-06, "loss": 0.204, "num_input_tokens_seen": 242279408, "step": 112285 }, { "epoch": 18.31810766721044, "grad_norm": 0.20627166330814362, "learning_rate": 1.069606328956227e-06, "loss": 0.0407, "num_input_tokens_seen": 242289520, "step": 112290 }, { "epoch": 18.318923327895597, "grad_norm": 2.097710371017456, "learning_rate": 1.0685766876315923e-06, "loss": 0.0458, "num_input_tokens_seen": 242300048, "step": 112295 }, { "epoch": 18.31973898858075, "grad_norm": 1.380753755569458, "learning_rate": 1.0675475313076056e-06, "loss": 0.2097, "num_input_tokens_seen": 242311152, "step": 112300 }, { "epoch": 18.320554649265905, "grad_norm": 0.04963398724794388, "learning_rate": 1.0665188600051363e-06, "loss": 0.0697, "num_input_tokens_seen": 242321808, "step": 112305 }, { "epoch": 18.32137030995106, "grad_norm": 0.10045920312404633, "learning_rate": 1.0654906737450177e-06, "loss": 0.0503, "num_input_tokens_seen": 242333840, "step": 112310 }, { "epoch": 18.322185970636216, "grad_norm": 0.03870956972241402, "learning_rate": 1.0644629725480999e-06, "loss": 0.1157, "num_input_tokens_seen": 242345680, "step": 112315 }, { "epoch": 18.32300163132137, "grad_norm": 0.0530848428606987, "learning_rate": 1.0634357564351966e-06, "loss": 0.0239, "num_input_tokens_seen": 242356624, "step": 112320 }, { "epoch": 18.323817292006524, "grad_norm": 0.5420478582382202, "learning_rate": 1.0624090254271384e-06, "loss": 0.1336, "num_input_tokens_seen": 242368144, "step": 112325 }, { "epoch": 18.32463295269168, "grad_norm": 0.9140323996543884, "learning_rate": 1.0613827795447256e-06, "loss": 0.211, "num_input_tokens_seen": 242379056, "step": 112330 }, { "epoch": 18.325448613376835, "grad_norm": 0.13943882286548615, "learning_rate": 1.0603570188087608e-06, "loss": 0.4049, "num_input_tokens_seen": 242390096, "step": 112335 }, { "epoch": 18.32626427406199, "grad_norm": 0.057346880435943604, "learning_rate": 1.05933174324003e-06, "loss": 0.0249, "num_input_tokens_seen": 242400464, "step": 112340 }, { "epoch": 18.327079934747147, "grad_norm": 1.1303837299346924, "learning_rate": 1.0583069528593114e-06, "loss": 0.0884, "num_input_tokens_seen": 242410192, "step": 112345 }, { "epoch": 18.3278955954323, "grad_norm": 0.05955947935581207, "learning_rate": 1.057282647687377e-06, "loss": 0.111, "num_input_tokens_seen": 242419088, "step": 112350 }, { "epoch": 18.328711256117455, "grad_norm": 0.04564538225531578, "learning_rate": 1.0562588277449826e-06, "loss": 0.014, "num_input_tokens_seen": 242429808, "step": 112355 }, { "epoch": 18.32952691680261, "grad_norm": 0.5332499742507935, "learning_rate": 1.055235493052878e-06, "loss": 0.1797, "num_input_tokens_seen": 242441232, "step": 112360 }, { "epoch": 18.330342577487766, "grad_norm": 0.2470005303621292, "learning_rate": 1.0542126436318023e-06, "loss": 0.1093, "num_input_tokens_seen": 242452144, "step": 112365 }, { "epoch": 18.33115823817292, "grad_norm": 0.02908427082002163, "learning_rate": 1.0531902795024834e-06, "loss": 0.1195, "num_input_tokens_seen": 242463472, "step": 112370 }, { "epoch": 18.331973898858074, "grad_norm": 0.47747814655303955, "learning_rate": 1.0521684006856435e-06, "loss": 0.0131, "num_input_tokens_seen": 242474640, "step": 112375 }, { "epoch": 18.33278955954323, "grad_norm": 0.0868307501077652, "learning_rate": 1.0511470072019885e-06, "loss": 0.0097, "num_input_tokens_seen": 242486576, "step": 112380 }, { "epoch": 18.333605220228385, "grad_norm": 0.23547320067882538, "learning_rate": 1.050126099072224e-06, "loss": 0.0407, "num_input_tokens_seen": 242498992, "step": 112385 }, { "epoch": 18.33442088091354, "grad_norm": 0.10861809551715851, "learning_rate": 1.049105676317033e-06, "loss": 0.0254, "num_input_tokens_seen": 242510992, "step": 112390 }, { "epoch": 18.335236541598697, "grad_norm": 1.5008291006088257, "learning_rate": 1.0480857389571025e-06, "loss": 0.2153, "num_input_tokens_seen": 242521904, "step": 112395 }, { "epoch": 18.33605220228385, "grad_norm": 1.8383821249008179, "learning_rate": 1.0470662870130986e-06, "loss": 0.1458, "num_input_tokens_seen": 242532208, "step": 112400 }, { "epoch": 18.336867862969005, "grad_norm": 0.5621327757835388, "learning_rate": 1.0460473205056803e-06, "loss": 0.0681, "num_input_tokens_seen": 242542192, "step": 112405 }, { "epoch": 18.33768352365416, "grad_norm": 0.17765405774116516, "learning_rate": 1.0450288394555058e-06, "loss": 0.1031, "num_input_tokens_seen": 242553712, "step": 112410 }, { "epoch": 18.338499184339316, "grad_norm": 0.26312175393104553, "learning_rate": 1.0440108438832058e-06, "loss": 0.0124, "num_input_tokens_seen": 242564240, "step": 112415 }, { "epoch": 18.339314845024468, "grad_norm": 0.1058235689997673, "learning_rate": 1.0429933338094222e-06, "loss": 0.344, "num_input_tokens_seen": 242576304, "step": 112420 }, { "epoch": 18.340130505709624, "grad_norm": 0.07675274461507797, "learning_rate": 1.0419763092547634e-06, "loss": 0.0864, "num_input_tokens_seen": 242587376, "step": 112425 }, { "epoch": 18.34094616639478, "grad_norm": 0.19915620982646942, "learning_rate": 1.040959770239855e-06, "loss": 0.0438, "num_input_tokens_seen": 242596528, "step": 112430 }, { "epoch": 18.341761827079935, "grad_norm": 0.025410957634449005, "learning_rate": 1.0399437167852855e-06, "loss": 0.0485, "num_input_tokens_seen": 242606704, "step": 112435 }, { "epoch": 18.34257748776509, "grad_norm": 0.3694601058959961, "learning_rate": 1.0389281489116581e-06, "loss": 0.1795, "num_input_tokens_seen": 242616528, "step": 112440 }, { "epoch": 18.343393148450243, "grad_norm": 0.032413285225629807, "learning_rate": 1.0379130666395397e-06, "loss": 0.0635, "num_input_tokens_seen": 242628176, "step": 112445 }, { "epoch": 18.3442088091354, "grad_norm": 1.886814832687378, "learning_rate": 1.0368984699895196e-06, "loss": 0.1213, "num_input_tokens_seen": 242638576, "step": 112450 }, { "epoch": 18.345024469820554, "grad_norm": 0.04296426847577095, "learning_rate": 1.0358843589821448e-06, "loss": 0.0897, "num_input_tokens_seen": 242650256, "step": 112455 }, { "epoch": 18.34584013050571, "grad_norm": 0.7450714707374573, "learning_rate": 1.0348707336379798e-06, "loss": 0.2046, "num_input_tokens_seen": 242662384, "step": 112460 }, { "epoch": 18.346655791190866, "grad_norm": 0.03933030739426613, "learning_rate": 1.0338575939775552e-06, "loss": 0.062, "num_input_tokens_seen": 242671952, "step": 112465 }, { "epoch": 18.347471451876018, "grad_norm": 1.938458800315857, "learning_rate": 1.0328449400214158e-06, "loss": 0.1945, "num_input_tokens_seen": 242683184, "step": 112470 }, { "epoch": 18.348287112561174, "grad_norm": 2.4963786602020264, "learning_rate": 1.0318327717900727e-06, "loss": 0.3025, "num_input_tokens_seen": 242693616, "step": 112475 }, { "epoch": 18.34910277324633, "grad_norm": 0.8328155279159546, "learning_rate": 1.030821089304046e-06, "loss": 0.0513, "num_input_tokens_seen": 242705360, "step": 112480 }, { "epoch": 18.349918433931485, "grad_norm": 2.5627191066741943, "learning_rate": 1.0298098925838384e-06, "loss": 0.2776, "num_input_tokens_seen": 242716528, "step": 112485 }, { "epoch": 18.35073409461664, "grad_norm": 1.9380167722702026, "learning_rate": 1.028799181649942e-06, "loss": 0.2604, "num_input_tokens_seen": 242727888, "step": 112490 }, { "epoch": 18.351549755301793, "grad_norm": 2.056913375854492, "learning_rate": 1.0277889565228377e-06, "loss": 0.0675, "num_input_tokens_seen": 242738832, "step": 112495 }, { "epoch": 18.35236541598695, "grad_norm": 0.8156781196594238, "learning_rate": 1.0267792172230034e-06, "loss": 0.0826, "num_input_tokens_seen": 242749776, "step": 112500 }, { "epoch": 18.353181076672104, "grad_norm": 0.8009330034255981, "learning_rate": 1.0257699637708978e-06, "loss": 0.1249, "num_input_tokens_seen": 242759536, "step": 112505 }, { "epoch": 18.35399673735726, "grad_norm": 0.0525355339050293, "learning_rate": 1.0247611961869795e-06, "loss": 0.0205, "num_input_tokens_seen": 242770192, "step": 112510 }, { "epoch": 18.354812398042416, "grad_norm": 0.1306840479373932, "learning_rate": 1.0237529144916907e-06, "loss": 0.0576, "num_input_tokens_seen": 242781936, "step": 112515 }, { "epoch": 18.355628058727568, "grad_norm": 0.08706201612949371, "learning_rate": 1.0227451187054621e-06, "loss": 0.0784, "num_input_tokens_seen": 242794352, "step": 112520 }, { "epoch": 18.356443719412724, "grad_norm": 0.733359158039093, "learning_rate": 1.0217378088487245e-06, "loss": 0.0791, "num_input_tokens_seen": 242805488, "step": 112525 }, { "epoch": 18.35725938009788, "grad_norm": 0.5664524435997009, "learning_rate": 1.0207309849418839e-06, "loss": 0.0799, "num_input_tokens_seen": 242815280, "step": 112530 }, { "epoch": 18.358075040783035, "grad_norm": 2.0594632625579834, "learning_rate": 1.0197246470053574e-06, "loss": 0.1465, "num_input_tokens_seen": 242826064, "step": 112535 }, { "epoch": 18.35889070146819, "grad_norm": 0.11362939327955246, "learning_rate": 1.0187187950595229e-06, "loss": 0.0102, "num_input_tokens_seen": 242836592, "step": 112540 }, { "epoch": 18.359706362153343, "grad_norm": 0.6054386496543884, "learning_rate": 1.0177134291247835e-06, "loss": 0.101, "num_input_tokens_seen": 242846736, "step": 112545 }, { "epoch": 18.3605220228385, "grad_norm": 0.48493775725364685, "learning_rate": 1.0167085492214984e-06, "loss": 0.1382, "num_input_tokens_seen": 242855952, "step": 112550 }, { "epoch": 18.361337683523654, "grad_norm": 3.0205485820770264, "learning_rate": 1.0157041553700452e-06, "loss": 0.096, "num_input_tokens_seen": 242867824, "step": 112555 }, { "epoch": 18.36215334420881, "grad_norm": 1.436124563217163, "learning_rate": 1.0147002475907662e-06, "loss": 0.1931, "num_input_tokens_seen": 242879920, "step": 112560 }, { "epoch": 18.362969004893966, "grad_norm": 1.3964557647705078, "learning_rate": 1.013696825904023e-06, "loss": 0.1686, "num_input_tokens_seen": 242890064, "step": 112565 }, { "epoch": 18.363784665579118, "grad_norm": 2.3380093574523926, "learning_rate": 1.0126938903301325e-06, "loss": 0.0632, "num_input_tokens_seen": 242899472, "step": 112570 }, { "epoch": 18.364600326264274, "grad_norm": 0.8761448264122009, "learning_rate": 1.0116914408894395e-06, "loss": 0.061, "num_input_tokens_seen": 242911184, "step": 112575 }, { "epoch": 18.36541598694943, "grad_norm": 0.2269328385591507, "learning_rate": 1.0106894776022446e-06, "loss": 0.0603, "num_input_tokens_seen": 242921008, "step": 112580 }, { "epoch": 18.366231647634585, "grad_norm": 2.470064878463745, "learning_rate": 1.0096880004888648e-06, "loss": 0.2807, "num_input_tokens_seen": 242930800, "step": 112585 }, { "epoch": 18.36704730831974, "grad_norm": 0.2688320577144623, "learning_rate": 1.0086870095695866e-06, "loss": 0.1833, "num_input_tokens_seen": 242942704, "step": 112590 }, { "epoch": 18.367862969004893, "grad_norm": 0.04048720747232437, "learning_rate": 1.0076865048647076e-06, "loss": 0.115, "num_input_tokens_seen": 242955504, "step": 112595 }, { "epoch": 18.36867862969005, "grad_norm": 2.330383539199829, "learning_rate": 1.0066864863944896e-06, "loss": 0.1924, "num_input_tokens_seen": 242966256, "step": 112600 }, { "epoch": 18.369494290375204, "grad_norm": 1.1422191858291626, "learning_rate": 1.0056869541792164e-06, "loss": 0.0698, "num_input_tokens_seen": 242977776, "step": 112605 }, { "epoch": 18.37030995106036, "grad_norm": 0.30476927757263184, "learning_rate": 1.0046879082391297e-06, "loss": 0.0308, "num_input_tokens_seen": 242987536, "step": 112610 }, { "epoch": 18.371125611745512, "grad_norm": 1.6636906862258911, "learning_rate": 1.0036893485944832e-06, "loss": 0.121, "num_input_tokens_seen": 242998416, "step": 112615 }, { "epoch": 18.371941272430668, "grad_norm": 0.05663802847266197, "learning_rate": 1.002691275265516e-06, "loss": 0.0179, "num_input_tokens_seen": 243009232, "step": 112620 }, { "epoch": 18.372756933115824, "grad_norm": 0.06586074829101562, "learning_rate": 1.001693688272451e-06, "loss": 0.1158, "num_input_tokens_seen": 243021392, "step": 112625 }, { "epoch": 18.37357259380098, "grad_norm": 1.471336841583252, "learning_rate": 1.0006965876355106e-06, "loss": 0.0423, "num_input_tokens_seen": 243031632, "step": 112630 }, { "epoch": 18.374388254486135, "grad_norm": 2.0314955711364746, "learning_rate": 9.996999733748957e-07, "loss": 0.3101, "num_input_tokens_seen": 243043184, "step": 112635 }, { "epoch": 18.375203915171287, "grad_norm": 0.028797505423426628, "learning_rate": 9.987038455108095e-07, "loss": 0.0859, "num_input_tokens_seen": 243054416, "step": 112640 }, { "epoch": 18.376019575856443, "grad_norm": 0.2518249452114105, "learning_rate": 9.977082040634357e-07, "loss": 0.1353, "num_input_tokens_seen": 243065264, "step": 112645 }, { "epoch": 18.3768352365416, "grad_norm": 0.8383133411407471, "learning_rate": 9.967130490529554e-07, "loss": 0.079, "num_input_tokens_seen": 243075632, "step": 112650 }, { "epoch": 18.377650897226754, "grad_norm": 1.4083107709884644, "learning_rate": 9.957183804995358e-07, "loss": 0.0845, "num_input_tokens_seen": 243086832, "step": 112655 }, { "epoch": 18.37846655791191, "grad_norm": 0.05885236710309982, "learning_rate": 9.947241984233357e-07, "loss": 0.173, "num_input_tokens_seen": 243097584, "step": 112660 }, { "epoch": 18.379282218597062, "grad_norm": 0.11627762019634247, "learning_rate": 9.937305028445003e-07, "loss": 0.018, "num_input_tokens_seen": 243108816, "step": 112665 }, { "epoch": 18.380097879282218, "grad_norm": 0.7981256246566772, "learning_rate": 9.927372937831719e-07, "loss": 0.0604, "num_input_tokens_seen": 243119056, "step": 112670 }, { "epoch": 18.380913539967374, "grad_norm": 0.2273404896259308, "learning_rate": 9.91744571259473e-07, "loss": 0.1305, "num_input_tokens_seen": 243129840, "step": 112675 }, { "epoch": 18.38172920065253, "grad_norm": 0.5782637596130371, "learning_rate": 9.90752335293535e-07, "loss": 0.1396, "num_input_tokens_seen": 243140336, "step": 112680 }, { "epoch": 18.382544861337685, "grad_norm": 0.09888411313295364, "learning_rate": 9.897605859054527e-07, "loss": 0.0766, "num_input_tokens_seen": 243151344, "step": 112685 }, { "epoch": 18.383360522022837, "grad_norm": 2.147448778152466, "learning_rate": 9.887693231153377e-07, "loss": 0.2342, "num_input_tokens_seen": 243162832, "step": 112690 }, { "epoch": 18.384176182707993, "grad_norm": 0.17985592782497406, "learning_rate": 9.877785469432633e-07, "loss": 0.0736, "num_input_tokens_seen": 243173072, "step": 112695 }, { "epoch": 18.38499184339315, "grad_norm": 1.4773763418197632, "learning_rate": 9.867882574093269e-07, "loss": 0.3393, "num_input_tokens_seen": 243184144, "step": 112700 }, { "epoch": 18.385807504078304, "grad_norm": 0.08339478075504303, "learning_rate": 9.857984545335846e-07, "loss": 0.035, "num_input_tokens_seen": 243196496, "step": 112705 }, { "epoch": 18.38662316476346, "grad_norm": 0.527732253074646, "learning_rate": 9.848091383361041e-07, "loss": 0.2708, "num_input_tokens_seen": 243206416, "step": 112710 }, { "epoch": 18.387438825448612, "grad_norm": 0.36046868562698364, "learning_rate": 9.838203088369274e-07, "loss": 0.1361, "num_input_tokens_seen": 243218128, "step": 112715 }, { "epoch": 18.388254486133768, "grad_norm": 2.2236015796661377, "learning_rate": 9.828319660561024e-07, "loss": 0.0348, "num_input_tokens_seen": 243230480, "step": 112720 }, { "epoch": 18.389070146818923, "grad_norm": 0.09780963510274887, "learning_rate": 9.81844110013655e-07, "loss": 0.1706, "num_input_tokens_seen": 243242128, "step": 112725 }, { "epoch": 18.38988580750408, "grad_norm": 3.2582924365997314, "learning_rate": 9.80856740729605e-07, "loss": 0.1323, "num_input_tokens_seen": 243253008, "step": 112730 }, { "epoch": 18.390701468189235, "grad_norm": 0.04120452329516411, "learning_rate": 9.79869858223964e-07, "loss": 0.0169, "num_input_tokens_seen": 243263664, "step": 112735 }, { "epoch": 18.391517128874387, "grad_norm": 1.9104762077331543, "learning_rate": 9.78883462516733e-07, "loss": 0.1901, "num_input_tokens_seen": 243274416, "step": 112740 }, { "epoch": 18.392332789559543, "grad_norm": 0.7613754868507385, "learning_rate": 9.778975536278989e-07, "loss": 0.0245, "num_input_tokens_seen": 243283824, "step": 112745 }, { "epoch": 18.3931484502447, "grad_norm": 0.29319265484809875, "learning_rate": 9.769121315774482e-07, "loss": 0.1161, "num_input_tokens_seen": 243294384, "step": 112750 }, { "epoch": 18.393964110929854, "grad_norm": 1.2714606523513794, "learning_rate": 9.759271963853483e-07, "loss": 0.1419, "num_input_tokens_seen": 243305392, "step": 112755 }, { "epoch": 18.39477977161501, "grad_norm": 0.40041735768318176, "learning_rate": 9.749427480715584e-07, "loss": 0.1528, "num_input_tokens_seen": 243316176, "step": 112760 }, { "epoch": 18.395595432300162, "grad_norm": 1.5497132539749146, "learning_rate": 9.739587866560345e-07, "loss": 0.1401, "num_input_tokens_seen": 243326800, "step": 112765 }, { "epoch": 18.396411092985318, "grad_norm": 0.03520517796278, "learning_rate": 9.72975312158711e-07, "loss": 0.0842, "num_input_tokens_seen": 243338128, "step": 112770 }, { "epoch": 18.397226753670473, "grad_norm": 0.02777208760380745, "learning_rate": 9.719923245995272e-07, "loss": 0.0682, "num_input_tokens_seen": 243349936, "step": 112775 }, { "epoch": 18.39804241435563, "grad_norm": 0.21819327771663666, "learning_rate": 9.710098239983978e-07, "loss": 0.1301, "num_input_tokens_seen": 243360528, "step": 112780 }, { "epoch": 18.39885807504078, "grad_norm": 1.443578839302063, "learning_rate": 9.700278103752404e-07, "loss": 0.1936, "num_input_tokens_seen": 243370704, "step": 112785 }, { "epoch": 18.399673735725937, "grad_norm": 1.2022901773452759, "learning_rate": 9.690462837499499e-07, "loss": 0.0865, "num_input_tokens_seen": 243382416, "step": 112790 }, { "epoch": 18.400489396411093, "grad_norm": 0.5459975004196167, "learning_rate": 9.680652441424244e-07, "loss": 0.0157, "num_input_tokens_seen": 243394096, "step": 112795 }, { "epoch": 18.40130505709625, "grad_norm": 0.13099241256713867, "learning_rate": 9.670846915725423e-07, "loss": 0.1372, "num_input_tokens_seen": 243405744, "step": 112800 }, { "epoch": 18.402120717781404, "grad_norm": 0.47526392340660095, "learning_rate": 9.66104626060177e-07, "loss": 0.1265, "num_input_tokens_seen": 243415312, "step": 112805 }, { "epoch": 18.402936378466556, "grad_norm": 1.2170060873031616, "learning_rate": 9.6512504762519e-07, "loss": 0.0838, "num_input_tokens_seen": 243426320, "step": 112810 }, { "epoch": 18.403752039151712, "grad_norm": 0.22002409398555756, "learning_rate": 9.64145956287435e-07, "loss": 0.0638, "num_input_tokens_seen": 243437200, "step": 112815 }, { "epoch": 18.404567699836868, "grad_norm": 0.5027481913566589, "learning_rate": 9.631673520667516e-07, "loss": 0.0547, "num_input_tokens_seen": 243447568, "step": 112820 }, { "epoch": 18.405383360522023, "grad_norm": 0.1706058382987976, "learning_rate": 9.62189234982977e-07, "loss": 0.2022, "num_input_tokens_seen": 243459088, "step": 112825 }, { "epoch": 18.40619902120718, "grad_norm": 0.03707601875066757, "learning_rate": 9.612116050559311e-07, "loss": 0.0507, "num_input_tokens_seen": 243470352, "step": 112830 }, { "epoch": 18.40701468189233, "grad_norm": 2.507127046585083, "learning_rate": 9.602344623054289e-07, "loss": 0.1867, "num_input_tokens_seen": 243480240, "step": 112835 }, { "epoch": 18.407830342577487, "grad_norm": 0.03721499815583229, "learning_rate": 9.59257806751271e-07, "loss": 0.0376, "num_input_tokens_seen": 243491760, "step": 112840 }, { "epoch": 18.408646003262643, "grad_norm": 0.12417411059141159, "learning_rate": 9.582816384132499e-07, "loss": 0.0129, "num_input_tokens_seen": 243502512, "step": 112845 }, { "epoch": 18.4094616639478, "grad_norm": 0.05641478672623634, "learning_rate": 9.573059573111525e-07, "loss": 0.0735, "num_input_tokens_seen": 243512496, "step": 112850 }, { "epoch": 18.410277324632954, "grad_norm": 0.6150732040405273, "learning_rate": 9.563307634647496e-07, "loss": 0.0389, "num_input_tokens_seen": 243523664, "step": 112855 }, { "epoch": 18.411092985318106, "grad_norm": 0.8608668446540833, "learning_rate": 9.553560568938053e-07, "loss": 0.1373, "num_input_tokens_seen": 243534800, "step": 112860 }, { "epoch": 18.411908646003262, "grad_norm": 0.33615124225616455, "learning_rate": 9.543818376180736e-07, "loss": 0.0799, "num_input_tokens_seen": 243546448, "step": 112865 }, { "epoch": 18.412724306688418, "grad_norm": 0.009923113510012627, "learning_rate": 9.534081056572997e-07, "loss": 0.1011, "num_input_tokens_seen": 243556560, "step": 112870 }, { "epoch": 18.413539967373573, "grad_norm": 1.6115132570266724, "learning_rate": 9.524348610312122e-07, "loss": 0.2675, "num_input_tokens_seen": 243566352, "step": 112875 }, { "epoch": 18.41435562805873, "grad_norm": 1.8642992973327637, "learning_rate": 9.514621037595428e-07, "loss": 0.1152, "num_input_tokens_seen": 243578032, "step": 112880 }, { "epoch": 18.41517128874388, "grad_norm": 0.12924394011497498, "learning_rate": 9.504898338619977e-07, "loss": 0.118, "num_input_tokens_seen": 243588656, "step": 112885 }, { "epoch": 18.415986949429037, "grad_norm": 2.296478509902954, "learning_rate": 9.495180513582863e-07, "loss": 0.1368, "num_input_tokens_seen": 243599824, "step": 112890 }, { "epoch": 18.416802610114193, "grad_norm": 0.6172699928283691, "learning_rate": 9.485467562681038e-07, "loss": 0.0684, "num_input_tokens_seen": 243610928, "step": 112895 }, { "epoch": 18.41761827079935, "grad_norm": 0.27090057730674744, "learning_rate": 9.475759486111319e-07, "loss": 0.0639, "num_input_tokens_seen": 243621936, "step": 112900 }, { "epoch": 18.418433931484504, "grad_norm": 3.7946619987487793, "learning_rate": 9.466056284070434e-07, "loss": 0.1595, "num_input_tokens_seen": 243632432, "step": 112905 }, { "epoch": 18.419249592169656, "grad_norm": 0.15033084154129028, "learning_rate": 9.456357956755091e-07, "loss": 0.1071, "num_input_tokens_seen": 243641264, "step": 112910 }, { "epoch": 18.420065252854812, "grad_norm": 0.0349593348801136, "learning_rate": 9.446664504361796e-07, "loss": 0.0523, "num_input_tokens_seen": 243652144, "step": 112915 }, { "epoch": 18.420880913539968, "grad_norm": 0.31354326009750366, "learning_rate": 9.436975927086978e-07, "loss": 0.0255, "num_input_tokens_seen": 243663152, "step": 112920 }, { "epoch": 18.421696574225123, "grad_norm": 0.1647365838289261, "learning_rate": 9.42729222512706e-07, "loss": 0.0515, "num_input_tokens_seen": 243673552, "step": 112925 }, { "epoch": 18.42251223491028, "grad_norm": 0.8798108696937561, "learning_rate": 9.417613398678221e-07, "loss": 0.1025, "num_input_tokens_seen": 243683632, "step": 112930 }, { "epoch": 18.42332789559543, "grad_norm": 0.33285975456237793, "learning_rate": 9.407939447936665e-07, "loss": 0.0171, "num_input_tokens_seen": 243693264, "step": 112935 }, { "epoch": 18.424143556280587, "grad_norm": 1.4815309047698975, "learning_rate": 9.3982703730984e-07, "loss": 0.0935, "num_input_tokens_seen": 243702832, "step": 112940 }, { "epoch": 18.424959216965743, "grad_norm": 0.3849729299545288, "learning_rate": 9.388606174359466e-07, "loss": 0.023, "num_input_tokens_seen": 243713904, "step": 112945 }, { "epoch": 18.4257748776509, "grad_norm": 0.09490373730659485, "learning_rate": 9.37894685191562e-07, "loss": 0.284, "num_input_tokens_seen": 243724592, "step": 112950 }, { "epoch": 18.42659053833605, "grad_norm": 0.6916152238845825, "learning_rate": 9.369292405962709e-07, "loss": 0.1568, "num_input_tokens_seen": 243734448, "step": 112955 }, { "epoch": 18.427406199021206, "grad_norm": 1.419819712638855, "learning_rate": 9.359642836696298e-07, "loss": 0.1303, "num_input_tokens_seen": 243746192, "step": 112960 }, { "epoch": 18.428221859706362, "grad_norm": 0.11725887656211853, "learning_rate": 9.349998144312006e-07, "loss": 0.0543, "num_input_tokens_seen": 243756944, "step": 112965 }, { "epoch": 18.429037520391518, "grad_norm": 0.09993528574705124, "learning_rate": 9.340358329005317e-07, "loss": 0.02, "num_input_tokens_seen": 243768560, "step": 112970 }, { "epoch": 18.429853181076673, "grad_norm": 0.35872384905815125, "learning_rate": 9.330723390971546e-07, "loss": 0.0592, "num_input_tokens_seen": 243779440, "step": 112975 }, { "epoch": 18.430668841761825, "grad_norm": 0.0542202889919281, "learning_rate": 9.321093330405983e-07, "loss": 0.0978, "num_input_tokens_seen": 243789488, "step": 112980 }, { "epoch": 18.43148450244698, "grad_norm": 0.13407118618488312, "learning_rate": 9.311468147503804e-07, "loss": 0.0083, "num_input_tokens_seen": 243799536, "step": 112985 }, { "epoch": 18.432300163132137, "grad_norm": 0.20532557368278503, "learning_rate": 9.301847842460021e-07, "loss": 0.008, "num_input_tokens_seen": 243809840, "step": 112990 }, { "epoch": 18.433115823817293, "grad_norm": 0.1718408763408661, "learning_rate": 9.29223241546967e-07, "loss": 0.017, "num_input_tokens_seen": 243820688, "step": 112995 }, { "epoch": 18.43393148450245, "grad_norm": 0.862343966960907, "learning_rate": 9.28262186672757e-07, "loss": 0.0864, "num_input_tokens_seen": 243831696, "step": 113000 }, { "epoch": 18.4347471451876, "grad_norm": 2.4926412105560303, "learning_rate": 9.273016196428508e-07, "loss": 0.1715, "num_input_tokens_seen": 243842384, "step": 113005 }, { "epoch": 18.435562805872756, "grad_norm": 0.11689820140600204, "learning_rate": 9.26341540476719e-07, "loss": 0.1371, "num_input_tokens_seen": 243853680, "step": 113010 }, { "epoch": 18.436378466557912, "grad_norm": 0.3672705292701721, "learning_rate": 9.253819491938126e-07, "loss": 0.1523, "num_input_tokens_seen": 243864368, "step": 113015 }, { "epoch": 18.437194127243067, "grad_norm": 0.019278910011053085, "learning_rate": 9.244228458135801e-07, "loss": 0.0496, "num_input_tokens_seen": 243875920, "step": 113020 }, { "epoch": 18.438009787928223, "grad_norm": 1.182093620300293, "learning_rate": 9.234642303554641e-07, "loss": 0.2386, "num_input_tokens_seen": 243887568, "step": 113025 }, { "epoch": 18.438825448613375, "grad_norm": 1.7732545137405396, "learning_rate": 9.225061028388882e-07, "loss": 0.2561, "num_input_tokens_seen": 243897712, "step": 113030 }, { "epoch": 18.43964110929853, "grad_norm": 0.05099521577358246, "learning_rate": 9.2154846328327e-07, "loss": 0.0106, "num_input_tokens_seen": 243907920, "step": 113035 }, { "epoch": 18.440456769983687, "grad_norm": 0.3731491267681122, "learning_rate": 9.205913117080162e-07, "loss": 0.154, "num_input_tokens_seen": 243917136, "step": 113040 }, { "epoch": 18.441272430668842, "grad_norm": 0.05515388399362564, "learning_rate": 9.196346481325252e-07, "loss": 0.0997, "num_input_tokens_seen": 243927312, "step": 113045 }, { "epoch": 18.442088091353998, "grad_norm": 0.025877710431814194, "learning_rate": 9.186784725761927e-07, "loss": 0.158, "num_input_tokens_seen": 243937776, "step": 113050 }, { "epoch": 18.44290375203915, "grad_norm": 0.3176073431968689, "learning_rate": 9.177227850583836e-07, "loss": 0.0303, "num_input_tokens_seen": 243949424, "step": 113055 }, { "epoch": 18.443719412724306, "grad_norm": 2.0155277252197266, "learning_rate": 9.16767585598477e-07, "loss": 0.1182, "num_input_tokens_seen": 243958960, "step": 113060 }, { "epoch": 18.44453507340946, "grad_norm": 1.2733036279678345, "learning_rate": 9.15812874215824e-07, "loss": 0.1592, "num_input_tokens_seen": 243970416, "step": 113065 }, { "epoch": 18.445350734094617, "grad_norm": 0.081389419734478, "learning_rate": 9.148586509297785e-07, "loss": 0.1046, "num_input_tokens_seen": 243981072, "step": 113070 }, { "epoch": 18.446166394779773, "grad_norm": 0.12825363874435425, "learning_rate": 9.139049157596751e-07, "loss": 0.0151, "num_input_tokens_seen": 243992112, "step": 113075 }, { "epoch": 18.446982055464925, "grad_norm": 1.0242117643356323, "learning_rate": 9.129516687248457e-07, "loss": 0.0984, "num_input_tokens_seen": 244002672, "step": 113080 }, { "epoch": 18.44779771615008, "grad_norm": 0.025425106287002563, "learning_rate": 9.119989098446052e-07, "loss": 0.0246, "num_input_tokens_seen": 244013456, "step": 113085 }, { "epoch": 18.448613376835237, "grad_norm": 1.8301607370376587, "learning_rate": 9.110466391382688e-07, "loss": 0.081, "num_input_tokens_seen": 244024080, "step": 113090 }, { "epoch": 18.449429037520392, "grad_norm": 0.433852881193161, "learning_rate": 9.100948566251238e-07, "loss": 0.0275, "num_input_tokens_seen": 244035568, "step": 113095 }, { "epoch": 18.450244698205548, "grad_norm": 1.6663970947265625, "learning_rate": 9.091435623244742e-07, "loss": 0.2821, "num_input_tokens_seen": 244045872, "step": 113100 }, { "epoch": 18.4510603588907, "grad_norm": 1.8564817905426025, "learning_rate": 9.081927562555853e-07, "loss": 0.07, "num_input_tokens_seen": 244056368, "step": 113105 }, { "epoch": 18.451876019575856, "grad_norm": 0.6162790656089783, "learning_rate": 9.072424384377359e-07, "loss": 0.0866, "num_input_tokens_seen": 244066544, "step": 113110 }, { "epoch": 18.45269168026101, "grad_norm": 0.022137124091386795, "learning_rate": 9.06292608890183e-07, "loss": 0.0333, "num_input_tokens_seen": 244078416, "step": 113115 }, { "epoch": 18.453507340946167, "grad_norm": 2.124619722366333, "learning_rate": 9.05343267632175e-07, "loss": 0.0555, "num_input_tokens_seen": 244089008, "step": 113120 }, { "epoch": 18.454323001631323, "grad_norm": 1.893562912940979, "learning_rate": 9.043944146829519e-07, "loss": 0.2018, "num_input_tokens_seen": 244099536, "step": 113125 }, { "epoch": 18.455138662316475, "grad_norm": 0.20841945707798004, "learning_rate": 9.034460500617431e-07, "loss": 0.0897, "num_input_tokens_seen": 244110992, "step": 113130 }, { "epoch": 18.45595432300163, "grad_norm": 0.112004354596138, "learning_rate": 9.024981737877691e-07, "loss": 0.0162, "num_input_tokens_seen": 244120784, "step": 113135 }, { "epoch": 18.456769983686787, "grad_norm": 0.09027431160211563, "learning_rate": 9.015507858802397e-07, "loss": 0.1935, "num_input_tokens_seen": 244131120, "step": 113140 }, { "epoch": 18.457585644371942, "grad_norm": 1.9613505601882935, "learning_rate": 9.006038863583533e-07, "loss": 0.1586, "num_input_tokens_seen": 244140848, "step": 113145 }, { "epoch": 18.458401305057095, "grad_norm": 0.03156869485974312, "learning_rate": 8.996574752413028e-07, "loss": 0.1869, "num_input_tokens_seen": 244151856, "step": 113150 }, { "epoch": 18.45921696574225, "grad_norm": 1.3550063371658325, "learning_rate": 8.987115525482675e-07, "loss": 0.2294, "num_input_tokens_seen": 244162064, "step": 113155 }, { "epoch": 18.460032626427406, "grad_norm": 2.955678939819336, "learning_rate": 8.977661182984126e-07, "loss": 0.0429, "num_input_tokens_seen": 244173104, "step": 113160 }, { "epoch": 18.46084828711256, "grad_norm": 0.11019784212112427, "learning_rate": 8.968211725109116e-07, "loss": 0.0543, "num_input_tokens_seen": 244184944, "step": 113165 }, { "epoch": 18.461663947797717, "grad_norm": 0.37588027119636536, "learning_rate": 8.958767152048992e-07, "loss": 0.1176, "num_input_tokens_seen": 244195696, "step": 113170 }, { "epoch": 18.46247960848287, "grad_norm": 3.1124074459075928, "learning_rate": 8.949327463995294e-07, "loss": 0.1977, "num_input_tokens_seen": 244205008, "step": 113175 }, { "epoch": 18.463295269168025, "grad_norm": 0.05578882619738579, "learning_rate": 8.939892661139204e-07, "loss": 0.18, "num_input_tokens_seen": 244215728, "step": 113180 }, { "epoch": 18.46411092985318, "grad_norm": 0.03903643786907196, "learning_rate": 8.930462743672041e-07, "loss": 0.0825, "num_input_tokens_seen": 244226096, "step": 113185 }, { "epoch": 18.464926590538337, "grad_norm": 0.16139915585517883, "learning_rate": 8.921037711784847e-07, "loss": 0.072, "num_input_tokens_seen": 244237552, "step": 113190 }, { "epoch": 18.465742251223492, "grad_norm": 0.10376419126987457, "learning_rate": 8.91161756566869e-07, "loss": 0.1385, "num_input_tokens_seen": 244247696, "step": 113195 }, { "epoch": 18.466557911908644, "grad_norm": 0.4325108528137207, "learning_rate": 8.90220230551439e-07, "loss": 0.0353, "num_input_tokens_seen": 244259312, "step": 113200 }, { "epoch": 18.4673735725938, "grad_norm": 0.05291520431637764, "learning_rate": 8.892791931512878e-07, "loss": 0.1358, "num_input_tokens_seen": 244271280, "step": 113205 }, { "epoch": 18.468189233278956, "grad_norm": 0.0854385495185852, "learning_rate": 8.883386443854724e-07, "loss": 0.0468, "num_input_tokens_seen": 244281648, "step": 113210 }, { "epoch": 18.46900489396411, "grad_norm": 0.9150951504707336, "learning_rate": 8.873985842730693e-07, "loss": 0.0395, "num_input_tokens_seen": 244292848, "step": 113215 }, { "epoch": 18.469820554649267, "grad_norm": 0.0885915458202362, "learning_rate": 8.864590128331185e-07, "loss": 0.1743, "num_input_tokens_seen": 244304048, "step": 113220 }, { "epoch": 18.47063621533442, "grad_norm": 0.20347614586353302, "learning_rate": 8.855199300846718e-07, "loss": 0.1062, "num_input_tokens_seen": 244314192, "step": 113225 }, { "epoch": 18.471451876019575, "grad_norm": 2.3140084743499756, "learning_rate": 8.845813360467469e-07, "loss": 0.0945, "num_input_tokens_seen": 244325584, "step": 113230 }, { "epoch": 18.47226753670473, "grad_norm": 2.1292357444763184, "learning_rate": 8.836432307383818e-07, "loss": 0.0897, "num_input_tokens_seen": 244335632, "step": 113235 }, { "epoch": 18.473083197389887, "grad_norm": 1.636668086051941, "learning_rate": 8.827056141785722e-07, "loss": 0.0377, "num_input_tokens_seen": 244346768, "step": 113240 }, { "epoch": 18.473898858075042, "grad_norm": 0.18625788390636444, "learning_rate": 8.817684863863363e-07, "loss": 0.1204, "num_input_tokens_seen": 244357904, "step": 113245 }, { "epoch": 18.474714518760194, "grad_norm": 0.07612419128417969, "learning_rate": 8.808318473806531e-07, "loss": 0.015, "num_input_tokens_seen": 244368944, "step": 113250 }, { "epoch": 18.47553017944535, "grad_norm": 2.01532244682312, "learning_rate": 8.798956971805105e-07, "loss": 0.1705, "num_input_tokens_seen": 244380080, "step": 113255 }, { "epoch": 18.476345840130506, "grad_norm": 0.18740862607955933, "learning_rate": 8.789600358048822e-07, "loss": 0.0213, "num_input_tokens_seen": 244390288, "step": 113260 }, { "epoch": 18.47716150081566, "grad_norm": 0.1013510525226593, "learning_rate": 8.78024863272725e-07, "loss": 0.1341, "num_input_tokens_seen": 244400848, "step": 113265 }, { "epoch": 18.477977161500817, "grad_norm": 1.6270005702972412, "learning_rate": 8.77090179602999e-07, "loss": 0.0751, "num_input_tokens_seen": 244411984, "step": 113270 }, { "epoch": 18.47879282218597, "grad_norm": 2.480363607406616, "learning_rate": 8.761559848146389e-07, "loss": 0.1833, "num_input_tokens_seen": 244423664, "step": 113275 }, { "epoch": 18.479608482871125, "grad_norm": 2.4558603763580322, "learning_rate": 8.752222789265851e-07, "loss": 0.1185, "num_input_tokens_seen": 244434576, "step": 113280 }, { "epoch": 18.48042414355628, "grad_norm": 0.046496402472257614, "learning_rate": 8.742890619577531e-07, "loss": 0.1574, "num_input_tokens_seen": 244444848, "step": 113285 }, { "epoch": 18.481239804241437, "grad_norm": 0.07696302235126495, "learning_rate": 8.733563339270612e-07, "loss": 0.1197, "num_input_tokens_seen": 244455984, "step": 113290 }, { "epoch": 18.482055464926592, "grad_norm": 0.6171598434448242, "learning_rate": 8.724240948534079e-07, "loss": 0.1767, "num_input_tokens_seen": 244467024, "step": 113295 }, { "epoch": 18.482871125611744, "grad_norm": 1.4776668548583984, "learning_rate": 8.714923447556895e-07, "loss": 0.1446, "num_input_tokens_seen": 244477968, "step": 113300 }, { "epoch": 18.4836867862969, "grad_norm": 0.5073621869087219, "learning_rate": 8.705610836527822e-07, "loss": 0.0182, "num_input_tokens_seen": 244489232, "step": 113305 }, { "epoch": 18.484502446982056, "grad_norm": 0.184418722987175, "learning_rate": 8.69630311563574e-07, "loss": 0.0549, "num_input_tokens_seen": 244500240, "step": 113310 }, { "epoch": 18.48531810766721, "grad_norm": 0.7713567018508911, "learning_rate": 8.687000285069136e-07, "loss": 0.0189, "num_input_tokens_seen": 244511664, "step": 113315 }, { "epoch": 18.486133768352367, "grad_norm": 0.03583939000964165, "learning_rate": 8.677702345016636e-07, "loss": 0.01, "num_input_tokens_seen": 244522928, "step": 113320 }, { "epoch": 18.48694942903752, "grad_norm": 1.4918020963668823, "learning_rate": 8.66840929566659e-07, "loss": 0.067, "num_input_tokens_seen": 244533296, "step": 113325 }, { "epoch": 18.487765089722675, "grad_norm": 0.343951553106308, "learning_rate": 8.659121137207432e-07, "loss": 0.1817, "num_input_tokens_seen": 244545136, "step": 113330 }, { "epoch": 18.48858075040783, "grad_norm": 1.4912455081939697, "learning_rate": 8.649837869827287e-07, "loss": 0.0814, "num_input_tokens_seen": 244555984, "step": 113335 }, { "epoch": 18.489396411092986, "grad_norm": 0.6140611171722412, "learning_rate": 8.640559493714423e-07, "loss": 0.0164, "num_input_tokens_seen": 244567760, "step": 113340 }, { "epoch": 18.49021207177814, "grad_norm": 0.07242042571306229, "learning_rate": 8.631286009056743e-07, "loss": 0.0162, "num_input_tokens_seen": 244577328, "step": 113345 }, { "epoch": 18.491027732463294, "grad_norm": 1.0881668329238892, "learning_rate": 8.622017416042294e-07, "loss": 0.0495, "num_input_tokens_seen": 244589104, "step": 113350 }, { "epoch": 18.49184339314845, "grad_norm": 1.0810129642486572, "learning_rate": 8.61275371485884e-07, "loss": 0.1637, "num_input_tokens_seen": 244600880, "step": 113355 }, { "epoch": 18.492659053833606, "grad_norm": 1.109167218208313, "learning_rate": 8.603494905694176e-07, "loss": 0.1029, "num_input_tokens_seen": 244611344, "step": 113360 }, { "epoch": 18.49347471451876, "grad_norm": 0.031198693439364433, "learning_rate": 8.594240988735902e-07, "loss": 0.0561, "num_input_tokens_seen": 244622416, "step": 113365 }, { "epoch": 18.494290375203914, "grad_norm": 2.644179582595825, "learning_rate": 8.58499196417159e-07, "loss": 0.1365, "num_input_tokens_seen": 244632112, "step": 113370 }, { "epoch": 18.49510603588907, "grad_norm": 0.2767803370952606, "learning_rate": 8.575747832188674e-07, "loss": 0.0878, "num_input_tokens_seen": 244643088, "step": 113375 }, { "epoch": 18.495921696574225, "grad_norm": 0.16891378164291382, "learning_rate": 8.566508592974504e-07, "loss": 0.1453, "num_input_tokens_seen": 244654800, "step": 113380 }, { "epoch": 18.49673735725938, "grad_norm": 0.7424428462982178, "learning_rate": 8.55727424671629e-07, "loss": 0.0847, "num_input_tokens_seen": 244666032, "step": 113385 }, { "epoch": 18.497553017944536, "grad_norm": 0.4332757890224457, "learning_rate": 8.548044793601245e-07, "loss": 0.2518, "num_input_tokens_seen": 244676016, "step": 113390 }, { "epoch": 18.49836867862969, "grad_norm": 0.10274694114923477, "learning_rate": 8.538820233816358e-07, "loss": 0.0157, "num_input_tokens_seen": 244686896, "step": 113395 }, { "epoch": 18.499184339314844, "grad_norm": 0.047285642474889755, "learning_rate": 8.52960056754859e-07, "loss": 0.089, "num_input_tokens_seen": 244696784, "step": 113400 }, { "epoch": 18.5, "grad_norm": 0.3200155198574066, "learning_rate": 8.520385794984792e-07, "loss": 0.1385, "num_input_tokens_seen": 244707024, "step": 113405 }, { "epoch": 18.500815660685156, "grad_norm": 0.03311979025602341, "learning_rate": 8.511175916311704e-07, "loss": 0.1554, "num_input_tokens_seen": 244717808, "step": 113410 }, { "epoch": 18.50163132137031, "grad_norm": 1.3647174835205078, "learning_rate": 8.501970931716008e-07, "loss": 0.1116, "num_input_tokens_seen": 244729008, "step": 113415 }, { "epoch": 18.502446982055464, "grad_norm": 1.0857988595962524, "learning_rate": 8.492770841384223e-07, "loss": 0.0614, "num_input_tokens_seen": 244739792, "step": 113420 }, { "epoch": 18.50326264274062, "grad_norm": 0.06168708950281143, "learning_rate": 8.483575645502811e-07, "loss": 0.0721, "num_input_tokens_seen": 244750224, "step": 113425 }, { "epoch": 18.504078303425775, "grad_norm": 0.05845339596271515, "learning_rate": 8.474385344258123e-07, "loss": 0.0182, "num_input_tokens_seen": 244760592, "step": 113430 }, { "epoch": 18.50489396411093, "grad_norm": 0.08662335574626923, "learning_rate": 8.465199937836427e-07, "loss": 0.1489, "num_input_tokens_seen": 244771120, "step": 113435 }, { "epoch": 18.505709624796086, "grad_norm": 1.8070740699768066, "learning_rate": 8.45601942642385e-07, "loss": 0.1504, "num_input_tokens_seen": 244782160, "step": 113440 }, { "epoch": 18.50652528548124, "grad_norm": 0.03562052547931671, "learning_rate": 8.446843810206468e-07, "loss": 0.0146, "num_input_tokens_seen": 244792688, "step": 113445 }, { "epoch": 18.507340946166394, "grad_norm": 2.7828361988067627, "learning_rate": 8.437673089370185e-07, "loss": 0.2, "num_input_tokens_seen": 244802800, "step": 113450 }, { "epoch": 18.50815660685155, "grad_norm": 0.5789440870285034, "learning_rate": 8.428507264100965e-07, "loss": 0.0135, "num_input_tokens_seen": 244813552, "step": 113455 }, { "epoch": 18.508972267536706, "grad_norm": 0.07621687650680542, "learning_rate": 8.419346334584438e-07, "loss": 0.0263, "num_input_tokens_seen": 244824752, "step": 113460 }, { "epoch": 18.50978792822186, "grad_norm": 1.5317885875701904, "learning_rate": 8.410190301006371e-07, "loss": 0.1963, "num_input_tokens_seen": 244834832, "step": 113465 }, { "epoch": 18.510603588907014, "grad_norm": 0.6248045563697815, "learning_rate": 8.401039163552254e-07, "loss": 0.0498, "num_input_tokens_seen": 244844624, "step": 113470 }, { "epoch": 18.51141924959217, "grad_norm": 0.6149938106536865, "learning_rate": 8.39189292240758e-07, "loss": 0.0247, "num_input_tokens_seen": 244855280, "step": 113475 }, { "epoch": 18.512234910277325, "grad_norm": 0.05034225806593895, "learning_rate": 8.382751577757697e-07, "loss": 0.179, "num_input_tokens_seen": 244864976, "step": 113480 }, { "epoch": 18.51305057096248, "grad_norm": 0.026739228516817093, "learning_rate": 8.373615129787849e-07, "loss": 0.1268, "num_input_tokens_seen": 244875952, "step": 113485 }, { "epoch": 18.513866231647633, "grad_norm": 0.3860969543457031, "learning_rate": 8.364483578683246e-07, "loss": 0.0683, "num_input_tokens_seen": 244886448, "step": 113490 }, { "epoch": 18.51468189233279, "grad_norm": 0.677682638168335, "learning_rate": 8.35535692462891e-07, "loss": 0.0534, "num_input_tokens_seen": 244897232, "step": 113495 }, { "epoch": 18.515497553017944, "grad_norm": 0.04509349912405014, "learning_rate": 8.346235167809801e-07, "loss": 0.0534, "num_input_tokens_seen": 244908656, "step": 113500 }, { "epoch": 18.5163132137031, "grad_norm": 0.3074038326740265, "learning_rate": 8.337118308410802e-07, "loss": 0.0555, "num_input_tokens_seen": 244919632, "step": 113505 }, { "epoch": 18.517128874388256, "grad_norm": 1.977696180343628, "learning_rate": 8.328006346616679e-07, "loss": 0.1325, "num_input_tokens_seen": 244929616, "step": 113510 }, { "epoch": 18.517944535073408, "grad_norm": 2.652456760406494, "learning_rate": 8.318899282612063e-07, "loss": 0.0738, "num_input_tokens_seen": 244940848, "step": 113515 }, { "epoch": 18.518760195758563, "grad_norm": 2.254340887069702, "learning_rate": 8.309797116581585e-07, "loss": 0.2415, "num_input_tokens_seen": 244950896, "step": 113520 }, { "epoch": 18.51957585644372, "grad_norm": 0.518277108669281, "learning_rate": 8.300699848709653e-07, "loss": 0.2557, "num_input_tokens_seen": 244961296, "step": 113525 }, { "epoch": 18.520391517128875, "grad_norm": 0.10757024586200714, "learning_rate": 8.291607479180674e-07, "loss": 0.0131, "num_input_tokens_seen": 244972720, "step": 113530 }, { "epoch": 18.52120717781403, "grad_norm": 2.110461950302124, "learning_rate": 8.282520008178862e-07, "loss": 0.1627, "num_input_tokens_seen": 244983184, "step": 113535 }, { "epoch": 18.522022838499183, "grad_norm": 0.062012068927288055, "learning_rate": 8.273437435888459e-07, "loss": 0.009, "num_input_tokens_seen": 244994512, "step": 113540 }, { "epoch": 18.52283849918434, "grad_norm": 0.1856488734483719, "learning_rate": 8.264359762493484e-07, "loss": 0.1121, "num_input_tokens_seen": 245005616, "step": 113545 }, { "epoch": 18.523654159869494, "grad_norm": 0.17668575048446655, "learning_rate": 8.255286988177929e-07, "loss": 0.1937, "num_input_tokens_seen": 245015568, "step": 113550 }, { "epoch": 18.52446982055465, "grad_norm": 0.05205658823251724, "learning_rate": 8.246219113125648e-07, "loss": 0.1103, "num_input_tokens_seen": 245026576, "step": 113555 }, { "epoch": 18.525285481239806, "grad_norm": 1.9609121084213257, "learning_rate": 8.237156137520435e-07, "loss": 0.094, "num_input_tokens_seen": 245037840, "step": 113560 }, { "epoch": 18.526101141924958, "grad_norm": 0.4457264840602875, "learning_rate": 8.228098061545925e-07, "loss": 0.0391, "num_input_tokens_seen": 245047312, "step": 113565 }, { "epoch": 18.526916802610113, "grad_norm": 0.08606993407011032, "learning_rate": 8.219044885385718e-07, "loss": 0.0458, "num_input_tokens_seen": 245058896, "step": 113570 }, { "epoch": 18.52773246329527, "grad_norm": 0.1473669856786728, "learning_rate": 8.209996609223336e-07, "loss": 0.1381, "num_input_tokens_seen": 245069232, "step": 113575 }, { "epoch": 18.528548123980425, "grad_norm": 0.04233183711767197, "learning_rate": 8.20095323324202e-07, "loss": 0.0088, "num_input_tokens_seen": 245080368, "step": 113580 }, { "epoch": 18.52936378466558, "grad_norm": 0.04535253345966339, "learning_rate": 8.191914757625235e-07, "loss": 0.1791, "num_input_tokens_seen": 245091664, "step": 113585 }, { "epoch": 18.530179445350733, "grad_norm": 0.012234046123921871, "learning_rate": 8.182881182555973e-07, "loss": 0.0098, "num_input_tokens_seen": 245102736, "step": 113590 }, { "epoch": 18.53099510603589, "grad_norm": 0.03655095770955086, "learning_rate": 8.173852508217422e-07, "loss": 0.0937, "num_input_tokens_seen": 245113840, "step": 113595 }, { "epoch": 18.531810766721044, "grad_norm": 0.09862828999757767, "learning_rate": 8.164828734792518e-07, "loss": 0.157, "num_input_tokens_seen": 245125872, "step": 113600 }, { "epoch": 18.5326264274062, "grad_norm": 0.05204029008746147, "learning_rate": 8.155809862464142e-07, "loss": 0.028, "num_input_tokens_seen": 245137328, "step": 113605 }, { "epoch": 18.533442088091356, "grad_norm": 0.5129894018173218, "learning_rate": 8.146795891415093e-07, "loss": 0.0505, "num_input_tokens_seen": 245147728, "step": 113610 }, { "epoch": 18.534257748776508, "grad_norm": 0.09252146631479263, "learning_rate": 8.137786821828058e-07, "loss": 0.1104, "num_input_tokens_seen": 245158096, "step": 113615 }, { "epoch": 18.535073409461663, "grad_norm": 0.2661254405975342, "learning_rate": 8.128782653885558e-07, "loss": 0.1906, "num_input_tokens_seen": 245167504, "step": 113620 }, { "epoch": 18.53588907014682, "grad_norm": 0.2576051354408264, "learning_rate": 8.119783387770141e-07, "loss": 0.1146, "num_input_tokens_seen": 245176624, "step": 113625 }, { "epoch": 18.536704730831975, "grad_norm": 0.2183603048324585, "learning_rate": 8.110789023664162e-07, "loss": 0.1625, "num_input_tokens_seen": 245187280, "step": 113630 }, { "epoch": 18.53752039151713, "grad_norm": 0.11755310744047165, "learning_rate": 8.10179956174989e-07, "loss": 0.098, "num_input_tokens_seen": 245198544, "step": 113635 }, { "epoch": 18.538336052202283, "grad_norm": 0.348682701587677, "learning_rate": 8.092815002209513e-07, "loss": 0.0203, "num_input_tokens_seen": 245208720, "step": 113640 }, { "epoch": 18.53915171288744, "grad_norm": 0.16498401761054993, "learning_rate": 8.083835345225138e-07, "loss": 0.106, "num_input_tokens_seen": 245219792, "step": 113645 }, { "epoch": 18.539967373572594, "grad_norm": 0.11020814627408981, "learning_rate": 8.074860590978728e-07, "loss": 0.028, "num_input_tokens_seen": 245231024, "step": 113650 }, { "epoch": 18.54078303425775, "grad_norm": 1.8561440706253052, "learning_rate": 8.065890739652166e-07, "loss": 0.1119, "num_input_tokens_seen": 245240432, "step": 113655 }, { "epoch": 18.541598694942905, "grad_norm": 0.2879124879837036, "learning_rate": 8.05692579142725e-07, "loss": 0.0877, "num_input_tokens_seen": 245251632, "step": 113660 }, { "epoch": 18.542414355628058, "grad_norm": 1.0405349731445312, "learning_rate": 8.047965746485642e-07, "loss": 0.1329, "num_input_tokens_seen": 245262192, "step": 113665 }, { "epoch": 18.543230016313213, "grad_norm": 1.7577885389328003, "learning_rate": 8.039010605008974e-07, "loss": 0.3191, "num_input_tokens_seen": 245273680, "step": 113670 }, { "epoch": 18.54404567699837, "grad_norm": 0.049805693328380585, "learning_rate": 8.030060367178682e-07, "loss": 0.0455, "num_input_tokens_seen": 245284112, "step": 113675 }, { "epoch": 18.544861337683525, "grad_norm": 1.5721447467803955, "learning_rate": 8.02111503317618e-07, "loss": 0.0722, "num_input_tokens_seen": 245295760, "step": 113680 }, { "epoch": 18.545676998368677, "grad_norm": 0.01764323003590107, "learning_rate": 8.012174603182738e-07, "loss": 0.0067, "num_input_tokens_seen": 245305552, "step": 113685 }, { "epoch": 18.546492659053833, "grad_norm": 0.07610037177801132, "learning_rate": 8.003239077379599e-07, "loss": 0.0418, "num_input_tokens_seen": 245316272, "step": 113690 }, { "epoch": 18.54730831973899, "grad_norm": 0.5421299934387207, "learning_rate": 7.994308455947786e-07, "loss": 0.2839, "num_input_tokens_seen": 245327088, "step": 113695 }, { "epoch": 18.548123980424144, "grad_norm": 1.8045971393585205, "learning_rate": 7.985382739068347e-07, "loss": 0.2763, "num_input_tokens_seen": 245337968, "step": 113700 }, { "epoch": 18.5489396411093, "grad_norm": 0.0363825261592865, "learning_rate": 7.976461926922113e-07, "loss": 0.1279, "num_input_tokens_seen": 245349616, "step": 113705 }, { "epoch": 18.549755301794452, "grad_norm": 1.882826328277588, "learning_rate": 7.967546019689936e-07, "loss": 0.0658, "num_input_tokens_seen": 245360560, "step": 113710 }, { "epoch": 18.550570962479608, "grad_norm": 0.03263191133737564, "learning_rate": 7.958635017552452e-07, "loss": 0.1785, "num_input_tokens_seen": 245371376, "step": 113715 }, { "epoch": 18.551386623164763, "grad_norm": 0.5539503693580627, "learning_rate": 7.94972892069032e-07, "loss": 0.1445, "num_input_tokens_seen": 245381552, "step": 113720 }, { "epoch": 18.55220228384992, "grad_norm": 0.06536229699850082, "learning_rate": 7.940827729283923e-07, "loss": 0.0202, "num_input_tokens_seen": 245392336, "step": 113725 }, { "epoch": 18.553017944535075, "grad_norm": 1.9383502006530762, "learning_rate": 7.931931443513812e-07, "loss": 0.2057, "num_input_tokens_seen": 245401456, "step": 113730 }, { "epoch": 18.553833605220227, "grad_norm": 0.1418929100036621, "learning_rate": 7.923040063560122e-07, "loss": 0.0882, "num_input_tokens_seen": 245412368, "step": 113735 }, { "epoch": 18.554649265905383, "grad_norm": 0.2618991434574127, "learning_rate": 7.914153589603151e-07, "loss": 0.1764, "num_input_tokens_seen": 245422448, "step": 113740 }, { "epoch": 18.55546492659054, "grad_norm": 0.03345131501555443, "learning_rate": 7.905272021822979e-07, "loss": 0.0067, "num_input_tokens_seen": 245433296, "step": 113745 }, { "epoch": 18.556280587275694, "grad_norm": 0.0688086599111557, "learning_rate": 7.896395360399572e-07, "loss": 0.2091, "num_input_tokens_seen": 245443792, "step": 113750 }, { "epoch": 18.55709624796085, "grad_norm": 0.17361168563365936, "learning_rate": 7.887523605512843e-07, "loss": 0.0469, "num_input_tokens_seen": 245455024, "step": 113755 }, { "epoch": 18.557911908646002, "grad_norm": 2.017112970352173, "learning_rate": 7.87865675734259e-07, "loss": 0.2953, "num_input_tokens_seen": 245464944, "step": 113760 }, { "epoch": 18.558727569331158, "grad_norm": 3.2304837703704834, "learning_rate": 7.869794816068504e-07, "loss": 0.0418, "num_input_tokens_seen": 245475856, "step": 113765 }, { "epoch": 18.559543230016313, "grad_norm": 0.3716276288032532, "learning_rate": 7.86093778187022e-07, "loss": 0.0828, "num_input_tokens_seen": 245487184, "step": 113770 }, { "epoch": 18.56035889070147, "grad_norm": 0.5762207508087158, "learning_rate": 7.852085654927178e-07, "loss": 0.1127, "num_input_tokens_seen": 245498736, "step": 113775 }, { "epoch": 18.561174551386625, "grad_norm": 2.329348564147949, "learning_rate": 7.843238435418815e-07, "loss": 0.0835, "num_input_tokens_seen": 245509552, "step": 113780 }, { "epoch": 18.561990212071777, "grad_norm": 0.16597646474838257, "learning_rate": 7.834396123524434e-07, "loss": 0.136, "num_input_tokens_seen": 245520112, "step": 113785 }, { "epoch": 18.562805872756933, "grad_norm": 0.02539789117872715, "learning_rate": 7.825558719423171e-07, "loss": 0.1022, "num_input_tokens_seen": 245530608, "step": 113790 }, { "epoch": 18.563621533442088, "grad_norm": 0.2391592115163803, "learning_rate": 7.816726223294269e-07, "loss": 0.0529, "num_input_tokens_seen": 245541104, "step": 113795 }, { "epoch": 18.564437194127244, "grad_norm": 2.5224456787109375, "learning_rate": 7.807898635316558e-07, "loss": 0.1429, "num_input_tokens_seen": 245550064, "step": 113800 }, { "epoch": 18.5652528548124, "grad_norm": 0.1444721668958664, "learning_rate": 7.799075955669088e-07, "loss": 0.0736, "num_input_tokens_seen": 245560848, "step": 113805 }, { "epoch": 18.56606851549755, "grad_norm": 0.11590167880058289, "learning_rate": 7.790258184530552e-07, "loss": 0.1601, "num_input_tokens_seen": 245572112, "step": 113810 }, { "epoch": 18.566884176182707, "grad_norm": 0.11170890927314758, "learning_rate": 7.78144532207975e-07, "loss": 0.0782, "num_input_tokens_seen": 245582800, "step": 113815 }, { "epoch": 18.567699836867863, "grad_norm": 0.30528387427330017, "learning_rate": 7.772637368495178e-07, "loss": 0.0333, "num_input_tokens_seen": 245593488, "step": 113820 }, { "epoch": 18.56851549755302, "grad_norm": 0.6524743437767029, "learning_rate": 7.763834323955443e-07, "loss": 0.1785, "num_input_tokens_seen": 245605296, "step": 113825 }, { "epoch": 18.569331158238175, "grad_norm": 1.087241291999817, "learning_rate": 7.755036188638848e-07, "loss": 0.0236, "num_input_tokens_seen": 245616464, "step": 113830 }, { "epoch": 18.570146818923327, "grad_norm": 1.766340970993042, "learning_rate": 7.746242962723832e-07, "loss": 0.065, "num_input_tokens_seen": 245627408, "step": 113835 }, { "epoch": 18.570962479608482, "grad_norm": 0.034936144948005676, "learning_rate": 7.737454646388448e-07, "loss": 0.1152, "num_input_tokens_seen": 245637136, "step": 113840 }, { "epoch": 18.571778140293638, "grad_norm": 0.025248907506465912, "learning_rate": 7.728671239810941e-07, "loss": 0.0659, "num_input_tokens_seen": 245646736, "step": 113845 }, { "epoch": 18.572593800978794, "grad_norm": 0.9880133271217346, "learning_rate": 7.7198927431692e-07, "loss": 0.192, "num_input_tokens_seen": 245657936, "step": 113850 }, { "epoch": 18.57340946166395, "grad_norm": 0.02761317603290081, "learning_rate": 7.711119156641216e-07, "loss": 0.0121, "num_input_tokens_seen": 245668944, "step": 113855 }, { "epoch": 18.5742251223491, "grad_norm": 0.10931814461946487, "learning_rate": 7.702350480404741e-07, "loss": 0.0735, "num_input_tokens_seen": 245679856, "step": 113860 }, { "epoch": 18.575040783034257, "grad_norm": 0.1415833979845047, "learning_rate": 7.693586714637547e-07, "loss": 0.0287, "num_input_tokens_seen": 245689360, "step": 113865 }, { "epoch": 18.575856443719413, "grad_norm": 2.569243907928467, "learning_rate": 7.684827859517185e-07, "loss": 0.1163, "num_input_tokens_seen": 245699568, "step": 113870 }, { "epoch": 18.57667210440457, "grad_norm": 0.11883614212274551, "learning_rate": 7.67607391522121e-07, "loss": 0.2417, "num_input_tokens_seen": 245710288, "step": 113875 }, { "epoch": 18.57748776508972, "grad_norm": 0.19739820063114166, "learning_rate": 7.667324881926951e-07, "loss": 0.0197, "num_input_tokens_seen": 245721968, "step": 113880 }, { "epoch": 18.578303425774877, "grad_norm": 0.02736516110599041, "learning_rate": 7.658580759811823e-07, "loss": 0.0945, "num_input_tokens_seen": 245733360, "step": 113885 }, { "epoch": 18.579119086460032, "grad_norm": 0.05068874731659889, "learning_rate": 7.649841549052961e-07, "loss": 0.0804, "num_input_tokens_seen": 245743632, "step": 113890 }, { "epoch": 18.579934747145188, "grad_norm": 1.397116780281067, "learning_rate": 7.641107249827528e-07, "loss": 0.0289, "num_input_tokens_seen": 245755568, "step": 113895 }, { "epoch": 18.580750407830344, "grad_norm": 0.9821116328239441, "learning_rate": 7.632377862312495e-07, "loss": 0.0351, "num_input_tokens_seen": 245766448, "step": 113900 }, { "epoch": 18.581566068515496, "grad_norm": 2.310176372528076, "learning_rate": 7.623653386684776e-07, "loss": 0.165, "num_input_tokens_seen": 245777712, "step": 113905 }, { "epoch": 18.58238172920065, "grad_norm": 1.6471329927444458, "learning_rate": 7.614933823121228e-07, "loss": 0.0486, "num_input_tokens_seen": 245788016, "step": 113910 }, { "epoch": 18.583197389885807, "grad_norm": 0.0749223604798317, "learning_rate": 7.606219171798517e-07, "loss": 0.0952, "num_input_tokens_seen": 245799536, "step": 113915 }, { "epoch": 18.584013050570963, "grad_norm": 0.2728598713874817, "learning_rate": 7.59750943289328e-07, "loss": 0.0586, "num_input_tokens_seen": 245809968, "step": 113920 }, { "epoch": 18.58482871125612, "grad_norm": 1.3012945652008057, "learning_rate": 7.588804606582011e-07, "loss": 0.1112, "num_input_tokens_seen": 245820880, "step": 113925 }, { "epoch": 18.58564437194127, "grad_norm": 0.11694615334272385, "learning_rate": 7.580104693041157e-07, "loss": 0.0105, "num_input_tokens_seen": 245831312, "step": 113930 }, { "epoch": 18.586460032626427, "grad_norm": 0.06754902750253677, "learning_rate": 7.571409692446963e-07, "loss": 0.0531, "num_input_tokens_seen": 245842608, "step": 113935 }, { "epoch": 18.587275693311582, "grad_norm": 2.115083932876587, "learning_rate": 7.56271960497576e-07, "loss": 0.1819, "num_input_tokens_seen": 245852432, "step": 113940 }, { "epoch": 18.588091353996738, "grad_norm": 0.10008543729782104, "learning_rate": 7.554034430803547e-07, "loss": 0.0216, "num_input_tokens_seen": 245862896, "step": 113945 }, { "epoch": 18.588907014681894, "grad_norm": 0.5054004788398743, "learning_rate": 7.545354170106434e-07, "loss": 0.2488, "num_input_tokens_seen": 245873744, "step": 113950 }, { "epoch": 18.589722675367046, "grad_norm": 0.8442584872245789, "learning_rate": 7.53667882306025e-07, "loss": 0.0231, "num_input_tokens_seen": 245883440, "step": 113955 }, { "epoch": 18.5905383360522, "grad_norm": 2.6512959003448486, "learning_rate": 7.528008389840912e-07, "loss": 0.2401, "num_input_tokens_seen": 245894480, "step": 113960 }, { "epoch": 18.591353996737357, "grad_norm": 2.111320972442627, "learning_rate": 7.51934287062403e-07, "loss": 0.0745, "num_input_tokens_seen": 245904368, "step": 113965 }, { "epoch": 18.592169657422513, "grad_norm": 0.08936728537082672, "learning_rate": 7.510682265585294e-07, "loss": 0.0106, "num_input_tokens_seen": 245914864, "step": 113970 }, { "epoch": 18.59298531810767, "grad_norm": 4.003523826599121, "learning_rate": 7.502026574900178e-07, "loss": 0.1238, "num_input_tokens_seen": 245926512, "step": 113975 }, { "epoch": 18.59380097879282, "grad_norm": 1.3115570545196533, "learning_rate": 7.493375798744179e-07, "loss": 0.2022, "num_input_tokens_seen": 245937744, "step": 113980 }, { "epoch": 18.594616639477977, "grad_norm": 0.050452347844839096, "learning_rate": 7.484729937292517e-07, "loss": 0.0781, "num_input_tokens_seen": 245948080, "step": 113985 }, { "epoch": 18.595432300163132, "grad_norm": 0.38543158769607544, "learning_rate": 7.47608899072047e-07, "loss": 0.1282, "num_input_tokens_seen": 245957296, "step": 113990 }, { "epoch": 18.596247960848288, "grad_norm": 0.21020406484603882, "learning_rate": 7.46745295920312e-07, "loss": 0.1131, "num_input_tokens_seen": 245967088, "step": 113995 }, { "epoch": 18.597063621533444, "grad_norm": 0.17216399312019348, "learning_rate": 7.45882184291552e-07, "loss": 0.0328, "num_input_tokens_seen": 245977776, "step": 114000 }, { "epoch": 18.597879282218596, "grad_norm": 0.6142116189002991, "learning_rate": 7.450195642032614e-07, "loss": 0.1326, "num_input_tokens_seen": 245987600, "step": 114005 }, { "epoch": 18.59869494290375, "grad_norm": 0.11591057479381561, "learning_rate": 7.441574356729152e-07, "loss": 0.0453, "num_input_tokens_seen": 245997680, "step": 114010 }, { "epoch": 18.599510603588907, "grad_norm": 0.1016453355550766, "learning_rate": 7.432957987179911e-07, "loss": 0.0267, "num_input_tokens_seen": 246007664, "step": 114015 }, { "epoch": 18.600326264274063, "grad_norm": 0.7066153883934021, "learning_rate": 7.4243465335595e-07, "loss": 0.0254, "num_input_tokens_seen": 246018576, "step": 114020 }, { "epoch": 18.601141924959215, "grad_norm": 0.060091450810432434, "learning_rate": 7.41573999604242e-07, "loss": 0.0465, "num_input_tokens_seen": 246029712, "step": 114025 }, { "epoch": 18.60195758564437, "grad_norm": 1.2253884077072144, "learning_rate": 7.407138374803113e-07, "loss": 0.0907, "num_input_tokens_seen": 246040336, "step": 114030 }, { "epoch": 18.602773246329527, "grad_norm": 0.09817573428153992, "learning_rate": 7.398541670015885e-07, "loss": 0.1759, "num_input_tokens_seen": 246050896, "step": 114035 }, { "epoch": 18.603588907014682, "grad_norm": 1.6728601455688477, "learning_rate": 7.389949881854985e-07, "loss": 0.0638, "num_input_tokens_seen": 246061264, "step": 114040 }, { "epoch": 18.604404567699838, "grad_norm": 0.4481481611728668, "learning_rate": 7.381363010494524e-07, "loss": 0.0095, "num_input_tokens_seen": 246072048, "step": 114045 }, { "epoch": 18.605220228384994, "grad_norm": 0.08078223466873169, "learning_rate": 7.37278105610853e-07, "loss": 0.0344, "num_input_tokens_seen": 246082640, "step": 114050 }, { "epoch": 18.606035889070146, "grad_norm": 0.16284337639808655, "learning_rate": 7.364204018870918e-07, "loss": 0.2606, "num_input_tokens_seen": 246093680, "step": 114055 }, { "epoch": 18.6068515497553, "grad_norm": 0.0420825257897377, "learning_rate": 7.355631898955522e-07, "loss": 0.0988, "num_input_tokens_seen": 246105104, "step": 114060 }, { "epoch": 18.607667210440457, "grad_norm": 1.5764144659042358, "learning_rate": 7.347064696536066e-07, "loss": 0.1757, "num_input_tokens_seen": 246115856, "step": 114065 }, { "epoch": 18.608482871125613, "grad_norm": 2.0137083530426025, "learning_rate": 7.338502411786157e-07, "loss": 0.3206, "num_input_tokens_seen": 246125552, "step": 114070 }, { "epoch": 18.609298531810765, "grad_norm": 0.3236488699913025, "learning_rate": 7.329945044879327e-07, "loss": 0.0332, "num_input_tokens_seen": 246135664, "step": 114075 }, { "epoch": 18.61011419249592, "grad_norm": 0.08606671541929245, "learning_rate": 7.321392595989018e-07, "loss": 0.0428, "num_input_tokens_seen": 246146064, "step": 114080 }, { "epoch": 18.610929853181077, "grad_norm": 0.20377217233181, "learning_rate": 7.312845065288565e-07, "loss": 0.0317, "num_input_tokens_seen": 246156688, "step": 114085 }, { "epoch": 18.611745513866232, "grad_norm": 0.10659906268119812, "learning_rate": 7.304302452951134e-07, "loss": 0.0153, "num_input_tokens_seen": 246167792, "step": 114090 }, { "epoch": 18.612561174551388, "grad_norm": 0.4086419343948364, "learning_rate": 7.295764759149948e-07, "loss": 0.0546, "num_input_tokens_seen": 246178224, "step": 114095 }, { "epoch": 18.61337683523654, "grad_norm": 2.8219432830810547, "learning_rate": 7.287231984057952e-07, "loss": 0.1607, "num_input_tokens_seen": 246189776, "step": 114100 }, { "epoch": 18.614192495921696, "grad_norm": 0.04350900277495384, "learning_rate": 7.27870412784809e-07, "loss": 0.0892, "num_input_tokens_seen": 246200688, "step": 114105 }, { "epoch": 18.61500815660685, "grad_norm": 3.250288248062134, "learning_rate": 7.270181190693226e-07, "loss": 0.2176, "num_input_tokens_seen": 246211536, "step": 114110 }, { "epoch": 18.615823817292007, "grad_norm": 1.9201431274414062, "learning_rate": 7.261663172766081e-07, "loss": 0.238, "num_input_tokens_seen": 246221424, "step": 114115 }, { "epoch": 18.616639477977163, "grad_norm": 1.6760064363479614, "learning_rate": 7.25315007423924e-07, "loss": 0.2318, "num_input_tokens_seen": 246233232, "step": 114120 }, { "epoch": 18.617455138662315, "grad_norm": 0.17183199524879456, "learning_rate": 7.244641895285259e-07, "loss": 0.0497, "num_input_tokens_seen": 246244784, "step": 114125 }, { "epoch": 18.61827079934747, "grad_norm": 0.24169160425662994, "learning_rate": 7.236138636076584e-07, "loss": 0.0163, "num_input_tokens_seen": 246256432, "step": 114130 }, { "epoch": 18.619086460032626, "grad_norm": 0.7726346254348755, "learning_rate": 7.22764029678552e-07, "loss": 0.0455, "num_input_tokens_seen": 246267568, "step": 114135 }, { "epoch": 18.619902120717782, "grad_norm": 1.1488300561904907, "learning_rate": 7.219146877584293e-07, "loss": 0.0757, "num_input_tokens_seen": 246276656, "step": 114140 }, { "epoch": 18.620717781402938, "grad_norm": 0.15649664402008057, "learning_rate": 7.210658378645041e-07, "loss": 0.0344, "num_input_tokens_seen": 246287760, "step": 114145 }, { "epoch": 18.62153344208809, "grad_norm": 1.534028172492981, "learning_rate": 7.202174800139822e-07, "loss": 0.329, "num_input_tokens_seen": 246299504, "step": 114150 }, { "epoch": 18.622349102773246, "grad_norm": 0.0978139117360115, "learning_rate": 7.193696142240524e-07, "loss": 0.0811, "num_input_tokens_seen": 246310512, "step": 114155 }, { "epoch": 18.6231647634584, "grad_norm": 0.04226471856236458, "learning_rate": 7.185222405118985e-07, "loss": 0.1363, "num_input_tokens_seen": 246321744, "step": 114160 }, { "epoch": 18.623980424143557, "grad_norm": 0.5713316798210144, "learning_rate": 7.176753588946982e-07, "loss": 0.1364, "num_input_tokens_seen": 246332400, "step": 114165 }, { "epoch": 18.624796084828713, "grad_norm": 0.15812210738658905, "learning_rate": 7.168289693896074e-07, "loss": 0.0381, "num_input_tokens_seen": 246343376, "step": 114170 }, { "epoch": 18.625611745513865, "grad_norm": 0.040475521236658096, "learning_rate": 7.159830720137844e-07, "loss": 0.024, "num_input_tokens_seen": 246354256, "step": 114175 }, { "epoch": 18.62642740619902, "grad_norm": 0.0947103500366211, "learning_rate": 7.151376667843712e-07, "loss": 0.0344, "num_input_tokens_seen": 246364848, "step": 114180 }, { "epoch": 18.627243066884176, "grad_norm": 0.18247482180595398, "learning_rate": 7.142927537185013e-07, "loss": 0.0853, "num_input_tokens_seen": 246377072, "step": 114185 }, { "epoch": 18.628058727569332, "grad_norm": 0.7311158180236816, "learning_rate": 7.134483328332969e-07, "loss": 0.0271, "num_input_tokens_seen": 246388656, "step": 114190 }, { "epoch": 18.628874388254488, "grad_norm": 0.1564362645149231, "learning_rate": 7.126044041458696e-07, "loss": 0.0499, "num_input_tokens_seen": 246400208, "step": 114195 }, { "epoch": 18.62969004893964, "grad_norm": 0.2865218222141266, "learning_rate": 7.117609676733277e-07, "loss": 0.1027, "num_input_tokens_seen": 246410672, "step": 114200 }, { "epoch": 18.630505709624796, "grad_norm": 0.14150482416152954, "learning_rate": 7.109180234327606e-07, "loss": 0.0862, "num_input_tokens_seen": 246421136, "step": 114205 }, { "epoch": 18.63132137030995, "grad_norm": 2.022427558898926, "learning_rate": 7.100755714412488e-07, "loss": 0.1092, "num_input_tokens_seen": 246430480, "step": 114210 }, { "epoch": 18.632137030995107, "grad_norm": 0.3377339243888855, "learning_rate": 7.092336117158733e-07, "loss": 0.0309, "num_input_tokens_seen": 246441552, "step": 114215 }, { "epoch": 18.63295269168026, "grad_norm": 1.9650628566741943, "learning_rate": 7.083921442736951e-07, "loss": 0.0856, "num_input_tokens_seen": 246453040, "step": 114220 }, { "epoch": 18.633768352365415, "grad_norm": 2.2578179836273193, "learning_rate": 7.075511691317649e-07, "loss": 0.2754, "num_input_tokens_seen": 246464304, "step": 114225 }, { "epoch": 18.63458401305057, "grad_norm": 0.15077805519104004, "learning_rate": 7.067106863071271e-07, "loss": 0.2183, "num_input_tokens_seen": 246477072, "step": 114230 }, { "epoch": 18.635399673735726, "grad_norm": 0.05293215811252594, "learning_rate": 7.058706958168155e-07, "loss": 0.0652, "num_input_tokens_seen": 246487984, "step": 114235 }, { "epoch": 18.636215334420882, "grad_norm": 0.03365549445152283, "learning_rate": 7.050311976778523e-07, "loss": 0.1346, "num_input_tokens_seen": 246498864, "step": 114240 }, { "epoch": 18.637030995106034, "grad_norm": 0.3411160707473755, "learning_rate": 7.041921919072547e-07, "loss": 0.1211, "num_input_tokens_seen": 246509424, "step": 114245 }, { "epoch": 18.63784665579119, "grad_norm": 0.8185516595840454, "learning_rate": 7.033536785220229e-07, "loss": 0.1417, "num_input_tokens_seen": 246520944, "step": 114250 }, { "epoch": 18.638662316476346, "grad_norm": 0.039622578769922256, "learning_rate": 7.02515657539149e-07, "loss": 0.0209, "num_input_tokens_seen": 246531632, "step": 114255 }, { "epoch": 18.6394779771615, "grad_norm": 0.1569099873304367, "learning_rate": 7.016781289756219e-07, "loss": 0.0217, "num_input_tokens_seen": 246542576, "step": 114260 }, { "epoch": 18.640293637846657, "grad_norm": 0.2624889314174652, "learning_rate": 7.008410928484116e-07, "loss": 0.061, "num_input_tokens_seen": 246552912, "step": 114265 }, { "epoch": 18.64110929853181, "grad_norm": 0.06580711901187897, "learning_rate": 7.000045491744794e-07, "loss": 0.0924, "num_input_tokens_seen": 246562960, "step": 114270 }, { "epoch": 18.641924959216965, "grad_norm": 0.8187777996063232, "learning_rate": 6.991684979707841e-07, "loss": 0.0956, "num_input_tokens_seen": 246574224, "step": 114275 }, { "epoch": 18.64274061990212, "grad_norm": 2.454387664794922, "learning_rate": 6.983329392542675e-07, "loss": 0.1382, "num_input_tokens_seen": 246585616, "step": 114280 }, { "epoch": 18.643556280587276, "grad_norm": 0.019009433686733246, "learning_rate": 6.974978730418636e-07, "loss": 0.1518, "num_input_tokens_seen": 246595472, "step": 114285 }, { "epoch": 18.644371941272432, "grad_norm": 0.23558630049228668, "learning_rate": 6.966632993504917e-07, "loss": 0.1165, "num_input_tokens_seen": 246606032, "step": 114290 }, { "epoch": 18.645187601957584, "grad_norm": 0.9168035387992859, "learning_rate": 6.958292181970721e-07, "loss": 0.0677, "num_input_tokens_seen": 246617072, "step": 114295 }, { "epoch": 18.64600326264274, "grad_norm": 3.8577005863189697, "learning_rate": 6.949956295985049e-07, "loss": 0.096, "num_input_tokens_seen": 246627728, "step": 114300 }, { "epoch": 18.646818923327896, "grad_norm": 0.3817637264728546, "learning_rate": 6.941625335716822e-07, "loss": 0.1234, "num_input_tokens_seen": 246637168, "step": 114305 }, { "epoch": 18.64763458401305, "grad_norm": 0.067592553794384, "learning_rate": 6.933299301334934e-07, "loss": 0.1518, "num_input_tokens_seen": 246647312, "step": 114310 }, { "epoch": 18.648450244698207, "grad_norm": 0.05181519314646721, "learning_rate": 6.924978193008025e-07, "loss": 0.2711, "num_input_tokens_seen": 246658224, "step": 114315 }, { "epoch": 18.64926590538336, "grad_norm": 0.09373335540294647, "learning_rate": 6.916662010904879e-07, "loss": 0.0518, "num_input_tokens_seen": 246670320, "step": 114320 }, { "epoch": 18.650081566068515, "grad_norm": 1.996394395828247, "learning_rate": 6.908350755193887e-07, "loss": 0.1964, "num_input_tokens_seen": 246680624, "step": 114325 }, { "epoch": 18.65089722675367, "grad_norm": 0.39195573329925537, "learning_rate": 6.900044426043584e-07, "loss": 0.1095, "num_input_tokens_seen": 246691952, "step": 114330 }, { "epoch": 18.651712887438826, "grad_norm": 0.056363627314567566, "learning_rate": 6.891743023622249e-07, "loss": 0.0291, "num_input_tokens_seen": 246704112, "step": 114335 }, { "epoch": 18.652528548123982, "grad_norm": 1.1166763305664062, "learning_rate": 6.883446548098193e-07, "loss": 0.0358, "num_input_tokens_seen": 246714960, "step": 114340 }, { "epoch": 18.653344208809134, "grad_norm": 2.3951802253723145, "learning_rate": 6.87515499963945e-07, "loss": 0.1088, "num_input_tokens_seen": 246725712, "step": 114345 }, { "epoch": 18.65415986949429, "grad_norm": 1.0278414487838745, "learning_rate": 6.86686837841416e-07, "loss": 0.1803, "num_input_tokens_seen": 246736528, "step": 114350 }, { "epoch": 18.654975530179446, "grad_norm": 0.03600456193089485, "learning_rate": 6.858586684590191e-07, "loss": 0.1664, "num_input_tokens_seen": 246745968, "step": 114355 }, { "epoch": 18.6557911908646, "grad_norm": 0.11445800960063934, "learning_rate": 6.850309918335406e-07, "loss": 0.1296, "num_input_tokens_seen": 246757424, "step": 114360 }, { "epoch": 18.656606851549757, "grad_norm": 0.08276664465665817, "learning_rate": 6.842038079817564e-07, "loss": 0.1654, "num_input_tokens_seen": 246767792, "step": 114365 }, { "epoch": 18.65742251223491, "grad_norm": 0.036390095949172974, "learning_rate": 6.833771169204306e-07, "loss": 0.0432, "num_input_tokens_seen": 246779088, "step": 114370 }, { "epoch": 18.658238172920065, "grad_norm": 0.26558294892311096, "learning_rate": 6.825509186663109e-07, "loss": 0.0202, "num_input_tokens_seen": 246789136, "step": 114375 }, { "epoch": 18.65905383360522, "grad_norm": 0.23758849501609802, "learning_rate": 6.817252132361507e-07, "loss": 0.2069, "num_input_tokens_seen": 246800816, "step": 114380 }, { "epoch": 18.659869494290376, "grad_norm": 0.04550931602716446, "learning_rate": 6.809000006466754e-07, "loss": 0.0055, "num_input_tokens_seen": 246812016, "step": 114385 }, { "epoch": 18.660685154975532, "grad_norm": 1.00081467628479, "learning_rate": 6.800752809146132e-07, "loss": 0.0724, "num_input_tokens_seen": 246822608, "step": 114390 }, { "epoch": 18.661500815660684, "grad_norm": 0.3336728513240814, "learning_rate": 6.792510540566788e-07, "loss": 0.0271, "num_input_tokens_seen": 246833872, "step": 114395 }, { "epoch": 18.66231647634584, "grad_norm": 0.0237360168248415, "learning_rate": 6.784273200895724e-07, "loss": 0.0405, "num_input_tokens_seen": 246845808, "step": 114400 }, { "epoch": 18.663132137030995, "grad_norm": 0.08237612992525101, "learning_rate": 6.77604079029992e-07, "loss": 0.0655, "num_input_tokens_seen": 246855088, "step": 114405 }, { "epoch": 18.66394779771615, "grad_norm": 0.09756216406822205, "learning_rate": 6.767813308946213e-07, "loss": 0.1145, "num_input_tokens_seen": 246866896, "step": 114410 }, { "epoch": 18.664763458401303, "grad_norm": 2.2490932941436768, "learning_rate": 6.759590757001332e-07, "loss": 0.0793, "num_input_tokens_seen": 246878512, "step": 114415 }, { "epoch": 18.66557911908646, "grad_norm": 1.2597988843917847, "learning_rate": 6.751373134631894e-07, "loss": 0.1201, "num_input_tokens_seen": 246889104, "step": 114420 }, { "epoch": 18.666394779771615, "grad_norm": 2.0085954666137695, "learning_rate": 6.743160442004459e-07, "loss": 0.0738, "num_input_tokens_seen": 246900208, "step": 114425 }, { "epoch": 18.66721044045677, "grad_norm": 1.3183830976486206, "learning_rate": 6.734952679285478e-07, "loss": 0.021, "num_input_tokens_seen": 246910288, "step": 114430 }, { "epoch": 18.668026101141926, "grad_norm": 1.2080204486846924, "learning_rate": 6.726749846641317e-07, "loss": 0.1534, "num_input_tokens_seen": 246920336, "step": 114435 }, { "epoch": 18.66884176182708, "grad_norm": 1.9927512407302856, "learning_rate": 6.718551944238122e-07, "loss": 0.2748, "num_input_tokens_seen": 246931344, "step": 114440 }, { "epoch": 18.669657422512234, "grad_norm": 1.994675874710083, "learning_rate": 6.710358972242176e-07, "loss": 0.1231, "num_input_tokens_seen": 246942736, "step": 114445 }, { "epoch": 18.67047308319739, "grad_norm": 0.832517147064209, "learning_rate": 6.702170930819346e-07, "loss": 0.1277, "num_input_tokens_seen": 246952944, "step": 114450 }, { "epoch": 18.671288743882545, "grad_norm": 0.7203454971313477, "learning_rate": 6.69398782013575e-07, "loss": 0.0992, "num_input_tokens_seen": 246964464, "step": 114455 }, { "epoch": 18.6721044045677, "grad_norm": 1.0092967748641968, "learning_rate": 6.685809640357088e-07, "loss": 0.066, "num_input_tokens_seen": 246975280, "step": 114460 }, { "epoch": 18.672920065252853, "grad_norm": 0.09504575282335281, "learning_rate": 6.677636391649228e-07, "loss": 0.0311, "num_input_tokens_seen": 246987056, "step": 114465 }, { "epoch": 18.67373572593801, "grad_norm": 0.053156718611717224, "learning_rate": 6.669468074177676e-07, "loss": 0.0124, "num_input_tokens_seen": 246998032, "step": 114470 }, { "epoch": 18.674551386623165, "grad_norm": 1.6416798830032349, "learning_rate": 6.661304688108077e-07, "loss": 0.0878, "num_input_tokens_seen": 247009904, "step": 114475 }, { "epoch": 18.67536704730832, "grad_norm": 1.685816764831543, "learning_rate": 6.6531462336058e-07, "loss": 0.0405, "num_input_tokens_seen": 247019472, "step": 114480 }, { "epoch": 18.676182707993476, "grad_norm": 0.20773248374462128, "learning_rate": 6.644992710836268e-07, "loss": 0.0242, "num_input_tokens_seen": 247030192, "step": 114485 }, { "epoch": 18.67699836867863, "grad_norm": 1.1300928592681885, "learning_rate": 6.636844119964625e-07, "loss": 0.0502, "num_input_tokens_seen": 247040656, "step": 114490 }, { "epoch": 18.677814029363784, "grad_norm": 0.218120738863945, "learning_rate": 6.628700461156129e-07, "loss": 0.2021, "num_input_tokens_seen": 247052336, "step": 114495 }, { "epoch": 18.67862969004894, "grad_norm": 0.04965199530124664, "learning_rate": 6.620561734575703e-07, "loss": 0.0561, "num_input_tokens_seen": 247062576, "step": 114500 }, { "epoch": 18.679445350734095, "grad_norm": 1.4313150644302368, "learning_rate": 6.612427940388355e-07, "loss": 0.2685, "num_input_tokens_seen": 247072976, "step": 114505 }, { "epoch": 18.68026101141925, "grad_norm": 0.08379170298576355, "learning_rate": 6.604299078758924e-07, "loss": 0.0869, "num_input_tokens_seen": 247083696, "step": 114510 }, { "epoch": 18.681076672104403, "grad_norm": 0.048325344920158386, "learning_rate": 6.596175149852113e-07, "loss": 0.1061, "num_input_tokens_seen": 247093808, "step": 114515 }, { "epoch": 18.68189233278956, "grad_norm": 0.5094038248062134, "learning_rate": 6.588056153832623e-07, "loss": 0.0223, "num_input_tokens_seen": 247103888, "step": 114520 }, { "epoch": 18.682707993474715, "grad_norm": 0.1495194435119629, "learning_rate": 6.57994209086496e-07, "loss": 0.028, "num_input_tokens_seen": 247115664, "step": 114525 }, { "epoch": 18.68352365415987, "grad_norm": 0.34589505195617676, "learning_rate": 6.571832961113577e-07, "loss": 0.0178, "num_input_tokens_seen": 247126192, "step": 114530 }, { "epoch": 18.684339314845026, "grad_norm": 0.8046870827674866, "learning_rate": 6.563728764742788e-07, "loss": 0.0515, "num_input_tokens_seen": 247136880, "step": 114535 }, { "epoch": 18.68515497553018, "grad_norm": 0.05498296394944191, "learning_rate": 6.555629501916877e-07, "loss": 0.032, "num_input_tokens_seen": 247149424, "step": 114540 }, { "epoch": 18.685970636215334, "grad_norm": 0.03345243260264397, "learning_rate": 6.547535172799963e-07, "loss": 0.1936, "num_input_tokens_seen": 247159600, "step": 114545 }, { "epoch": 18.68678629690049, "grad_norm": 2.3719637393951416, "learning_rate": 6.53944577755608e-07, "loss": 0.0879, "num_input_tokens_seen": 247170992, "step": 114550 }, { "epoch": 18.687601957585645, "grad_norm": 0.02101186290383339, "learning_rate": 6.531361316349155e-07, "loss": 0.0262, "num_input_tokens_seen": 247181808, "step": 114555 }, { "epoch": 18.6884176182708, "grad_norm": 0.6157136559486389, "learning_rate": 6.52328178934311e-07, "loss": 0.1905, "num_input_tokens_seen": 247193232, "step": 114560 }, { "epoch": 18.689233278955953, "grad_norm": 0.5369846820831299, "learning_rate": 6.515207196701595e-07, "loss": 0.03, "num_input_tokens_seen": 247203568, "step": 114565 }, { "epoch": 18.69004893964111, "grad_norm": 0.06054702401161194, "learning_rate": 6.507137538588309e-07, "loss": 0.0766, "num_input_tokens_seen": 247215216, "step": 114570 }, { "epoch": 18.690864600326265, "grad_norm": 0.12353559583425522, "learning_rate": 6.499072815166734e-07, "loss": 0.0445, "num_input_tokens_seen": 247225072, "step": 114575 }, { "epoch": 18.69168026101142, "grad_norm": 0.26428964734077454, "learning_rate": 6.491013026600407e-07, "loss": 0.0202, "num_input_tokens_seen": 247234864, "step": 114580 }, { "epoch": 18.692495921696576, "grad_norm": 0.14192289113998413, "learning_rate": 6.482958173052584e-07, "loss": 0.0159, "num_input_tokens_seen": 247246192, "step": 114585 }, { "epoch": 18.693311582381728, "grad_norm": 0.2002129852771759, "learning_rate": 6.474908254686552e-07, "loss": 0.1554, "num_input_tokens_seen": 247257488, "step": 114590 }, { "epoch": 18.694127243066884, "grad_norm": 1.1119695901870728, "learning_rate": 6.466863271665402e-07, "loss": 0.0678, "num_input_tokens_seen": 247268240, "step": 114595 }, { "epoch": 18.69494290375204, "grad_norm": 0.07809260487556458, "learning_rate": 6.458823224152255e-07, "loss": 0.0327, "num_input_tokens_seen": 247280336, "step": 114600 }, { "epoch": 18.695758564437195, "grad_norm": 0.04264763370156288, "learning_rate": 6.45078811230998e-07, "loss": 0.0629, "num_input_tokens_seen": 247290928, "step": 114605 }, { "epoch": 18.696574225122347, "grad_norm": 0.244639590382576, "learning_rate": 6.442757936301475e-07, "loss": 0.0299, "num_input_tokens_seen": 247301904, "step": 114610 }, { "epoch": 18.697389885807503, "grad_norm": 1.422540545463562, "learning_rate": 6.434732696289414e-07, "loss": 0.056, "num_input_tokens_seen": 247312976, "step": 114615 }, { "epoch": 18.69820554649266, "grad_norm": 2.490347385406494, "learning_rate": 6.426712392436529e-07, "loss": 0.3434, "num_input_tokens_seen": 247323632, "step": 114620 }, { "epoch": 18.699021207177815, "grad_norm": 0.12668287754058838, "learning_rate": 6.418697024905273e-07, "loss": 0.0209, "num_input_tokens_seen": 247334704, "step": 114625 }, { "epoch": 18.69983686786297, "grad_norm": 0.9415653944015503, "learning_rate": 6.410686593858156e-07, "loss": 0.1083, "num_input_tokens_seen": 247345200, "step": 114630 }, { "epoch": 18.700652528548122, "grad_norm": 3.002351999282837, "learning_rate": 6.402681099457436e-07, "loss": 0.1612, "num_input_tokens_seen": 247355056, "step": 114635 }, { "epoch": 18.701468189233278, "grad_norm": 0.06972191482782364, "learning_rate": 6.394680541865456e-07, "loss": 0.0185, "num_input_tokens_seen": 247366480, "step": 114640 }, { "epoch": 18.702283849918434, "grad_norm": 3.2697184085845947, "learning_rate": 6.386684921244307e-07, "loss": 0.3558, "num_input_tokens_seen": 247376880, "step": 114645 }, { "epoch": 18.70309951060359, "grad_norm": 0.14036458730697632, "learning_rate": 6.378694237756028e-07, "loss": 0.0378, "num_input_tokens_seen": 247388112, "step": 114650 }, { "epoch": 18.703915171288745, "grad_norm": 3.4942219257354736, "learning_rate": 6.370708491562572e-07, "loss": 0.2023, "num_input_tokens_seen": 247399728, "step": 114655 }, { "epoch": 18.704730831973897, "grad_norm": 0.6527872681617737, "learning_rate": 6.362727682825753e-07, "loss": 0.2178, "num_input_tokens_seen": 247411760, "step": 114660 }, { "epoch": 18.705546492659053, "grad_norm": 0.0737457349896431, "learning_rate": 6.35475181170736e-07, "loss": 0.086, "num_input_tokens_seen": 247424176, "step": 114665 }, { "epoch": 18.70636215334421, "grad_norm": 3.0213663578033447, "learning_rate": 6.346780878368985e-07, "loss": 0.1015, "num_input_tokens_seen": 247434704, "step": 114670 }, { "epoch": 18.707177814029365, "grad_norm": 0.040466438978910446, "learning_rate": 6.338814882972194e-07, "loss": 0.1981, "num_input_tokens_seen": 247447216, "step": 114675 }, { "epoch": 18.70799347471452, "grad_norm": 0.5263296365737915, "learning_rate": 6.33085382567844e-07, "loss": 0.0212, "num_input_tokens_seen": 247458192, "step": 114680 }, { "epoch": 18.708809135399672, "grad_norm": 0.09689219295978546, "learning_rate": 6.322897706649039e-07, "loss": 0.1822, "num_input_tokens_seen": 247469040, "step": 114685 }, { "epoch": 18.709624796084828, "grad_norm": 1.6256053447723389, "learning_rate": 6.314946526045251e-07, "loss": 0.0359, "num_input_tokens_seen": 247481104, "step": 114690 }, { "epoch": 18.710440456769984, "grad_norm": 1.0551401376724243, "learning_rate": 6.307000284028197e-07, "loss": 0.0516, "num_input_tokens_seen": 247491600, "step": 114695 }, { "epoch": 18.71125611745514, "grad_norm": 0.7632931470870972, "learning_rate": 6.299058980758915e-07, "loss": 0.1778, "num_input_tokens_seen": 247503568, "step": 114700 }, { "epoch": 18.712071778140295, "grad_norm": 0.10097946226596832, "learning_rate": 6.291122616398415e-07, "loss": 0.1483, "num_input_tokens_seen": 247514800, "step": 114705 }, { "epoch": 18.712887438825447, "grad_norm": 0.231990247964859, "learning_rate": 6.28319119110743e-07, "loss": 0.085, "num_input_tokens_seen": 247525424, "step": 114710 }, { "epoch": 18.713703099510603, "grad_norm": 0.22300468385219574, "learning_rate": 6.275264705046774e-07, "loss": 0.0362, "num_input_tokens_seen": 247537232, "step": 114715 }, { "epoch": 18.71451876019576, "grad_norm": 0.04535643011331558, "learning_rate": 6.267343158377043e-07, "loss": 0.0272, "num_input_tokens_seen": 247548368, "step": 114720 }, { "epoch": 18.715334420880914, "grad_norm": 0.059740979224443436, "learning_rate": 6.259426551258829e-07, "loss": 0.0993, "num_input_tokens_seen": 247558416, "step": 114725 }, { "epoch": 18.71615008156607, "grad_norm": 0.1178140863776207, "learning_rate": 6.251514883852505e-07, "loss": 0.0102, "num_input_tokens_seen": 247568304, "step": 114730 }, { "epoch": 18.716965742251222, "grad_norm": 1.3185690641403198, "learning_rate": 6.243608156318525e-07, "loss": 0.1574, "num_input_tokens_seen": 247579888, "step": 114735 }, { "epoch": 18.717781402936378, "grad_norm": 0.8299902677536011, "learning_rate": 6.235706368816957e-07, "loss": 0.0381, "num_input_tokens_seen": 247592336, "step": 114740 }, { "epoch": 18.718597063621534, "grad_norm": 0.14969724416732788, "learning_rate": 6.227809521508087e-07, "loss": 0.031, "num_input_tokens_seen": 247604016, "step": 114745 }, { "epoch": 18.71941272430669, "grad_norm": 2.0438649654388428, "learning_rate": 6.219917614551901e-07, "loss": 0.1024, "num_input_tokens_seen": 247613872, "step": 114750 }, { "epoch": 18.72022838499184, "grad_norm": 1.6421152353286743, "learning_rate": 6.212030648108324e-07, "loss": 0.1634, "num_input_tokens_seen": 247624784, "step": 114755 }, { "epoch": 18.721044045676997, "grad_norm": 1.1061617136001587, "learning_rate": 6.20414862233723e-07, "loss": 0.0488, "num_input_tokens_seen": 247636048, "step": 114760 }, { "epoch": 18.721859706362153, "grad_norm": 0.33836108446121216, "learning_rate": 6.196271537398352e-07, "loss": 0.1509, "num_input_tokens_seen": 247647248, "step": 114765 }, { "epoch": 18.72267536704731, "grad_norm": 0.27253270149230957, "learning_rate": 6.18839939345131e-07, "loss": 0.1196, "num_input_tokens_seen": 247658320, "step": 114770 }, { "epoch": 18.723491027732464, "grad_norm": 1.6570234298706055, "learning_rate": 6.180532190655647e-07, "loss": 0.107, "num_input_tokens_seen": 247668176, "step": 114775 }, { "epoch": 18.724306688417617, "grad_norm": 0.1298588514328003, "learning_rate": 6.172669929170788e-07, "loss": 0.0092, "num_input_tokens_seen": 247678384, "step": 114780 }, { "epoch": 18.725122349102772, "grad_norm": 0.014952503144741058, "learning_rate": 6.164812609156107e-07, "loss": 0.0306, "num_input_tokens_seen": 247690288, "step": 114785 }, { "epoch": 18.725938009787928, "grad_norm": 0.08842556923627853, "learning_rate": 6.156960230770837e-07, "loss": 0.0256, "num_input_tokens_seen": 247700368, "step": 114790 }, { "epoch": 18.726753670473084, "grad_norm": 1.864488124847412, "learning_rate": 6.1491127941741e-07, "loss": 0.028, "num_input_tokens_seen": 247711824, "step": 114795 }, { "epoch": 18.72756933115824, "grad_norm": 1.0500720739364624, "learning_rate": 6.141270299524937e-07, "loss": 0.0749, "num_input_tokens_seen": 247722672, "step": 114800 }, { "epoch": 18.72838499184339, "grad_norm": 3.383218765258789, "learning_rate": 6.133432746982277e-07, "loss": 0.1612, "num_input_tokens_seen": 247734192, "step": 114805 }, { "epoch": 18.729200652528547, "grad_norm": 0.13543589413166046, "learning_rate": 6.125600136704962e-07, "loss": 0.086, "num_input_tokens_seen": 247744368, "step": 114810 }, { "epoch": 18.730016313213703, "grad_norm": 0.04987654089927673, "learning_rate": 6.117772468851756e-07, "loss": 0.0743, "num_input_tokens_seen": 247754064, "step": 114815 }, { "epoch": 18.73083197389886, "grad_norm": 1.4684139490127563, "learning_rate": 6.109949743581283e-07, "loss": 0.043, "num_input_tokens_seen": 247764720, "step": 114820 }, { "epoch": 18.731647634584014, "grad_norm": 0.5503188371658325, "learning_rate": 6.102131961052054e-07, "loss": 0.0635, "num_input_tokens_seen": 247774576, "step": 114825 }, { "epoch": 18.732463295269167, "grad_norm": 0.03563242405653, "learning_rate": 6.094319121422553e-07, "loss": 0.2142, "num_input_tokens_seen": 247785360, "step": 114830 }, { "epoch": 18.733278955954322, "grad_norm": 0.5021492838859558, "learning_rate": 6.086511224851071e-07, "loss": 0.1287, "num_input_tokens_seen": 247796784, "step": 114835 }, { "epoch": 18.734094616639478, "grad_norm": 1.2779109477996826, "learning_rate": 6.07870827149587e-07, "loss": 0.1146, "num_input_tokens_seen": 247807024, "step": 114840 }, { "epoch": 18.734910277324634, "grad_norm": 0.21933849155902863, "learning_rate": 6.070910261515045e-07, "loss": 0.2396, "num_input_tokens_seen": 247817712, "step": 114845 }, { "epoch": 18.73572593800979, "grad_norm": 0.04463709518313408, "learning_rate": 6.06311719506672e-07, "loss": 0.1978, "num_input_tokens_seen": 247828720, "step": 114850 }, { "epoch": 18.73654159869494, "grad_norm": 0.5256091952323914, "learning_rate": 6.055329072308768e-07, "loss": 0.2356, "num_input_tokens_seen": 247839504, "step": 114855 }, { "epoch": 18.737357259380097, "grad_norm": 2.4881672859191895, "learning_rate": 6.047545893399037e-07, "loss": 0.081, "num_input_tokens_seen": 247850320, "step": 114860 }, { "epoch": 18.738172920065253, "grad_norm": 0.0492103174328804, "learning_rate": 6.03976765849526e-07, "loss": 0.0228, "num_input_tokens_seen": 247860912, "step": 114865 }, { "epoch": 18.73898858075041, "grad_norm": 0.08116170763969421, "learning_rate": 6.031994367755089e-07, "loss": 0.2012, "num_input_tokens_seen": 247872112, "step": 114870 }, { "epoch": 18.739804241435564, "grad_norm": 1.0419541597366333, "learning_rate": 6.024226021336038e-07, "loss": 0.1355, "num_input_tokens_seen": 247883056, "step": 114875 }, { "epoch": 18.740619902120716, "grad_norm": 0.09857356548309326, "learning_rate": 6.016462619395535e-07, "loss": 0.1586, "num_input_tokens_seen": 247893776, "step": 114880 }, { "epoch": 18.741435562805872, "grad_norm": 0.5289402008056641, "learning_rate": 6.008704162090956e-07, "loss": 0.0737, "num_input_tokens_seen": 247904176, "step": 114885 }, { "epoch": 18.742251223491028, "grad_norm": 0.1396363526582718, "learning_rate": 6.000950649579506e-07, "loss": 0.0271, "num_input_tokens_seen": 247913904, "step": 114890 }, { "epoch": 18.743066884176184, "grad_norm": 1.809685468673706, "learning_rate": 5.99320208201834e-07, "loss": 0.173, "num_input_tokens_seen": 247924624, "step": 114895 }, { "epoch": 18.74388254486134, "grad_norm": 0.45939087867736816, "learning_rate": 5.985458459564469e-07, "loss": 0.0433, "num_input_tokens_seen": 247935568, "step": 114900 }, { "epoch": 18.74469820554649, "grad_norm": 0.22757519781589508, "learning_rate": 5.977719782374824e-07, "loss": 0.0383, "num_input_tokens_seen": 247947056, "step": 114905 }, { "epoch": 18.745513866231647, "grad_norm": 0.033365827053785324, "learning_rate": 5.969986050606252e-07, "loss": 0.1139, "num_input_tokens_seen": 247958448, "step": 114910 }, { "epoch": 18.746329526916803, "grad_norm": 0.4549063742160797, "learning_rate": 5.962257264415517e-07, "loss": 0.0428, "num_input_tokens_seen": 247968816, "step": 114915 }, { "epoch": 18.74714518760196, "grad_norm": 1.262701392173767, "learning_rate": 5.954533423959186e-07, "loss": 0.2261, "num_input_tokens_seen": 247978768, "step": 114920 }, { "epoch": 18.747960848287114, "grad_norm": 0.7997840642929077, "learning_rate": 5.946814529393857e-07, "loss": 0.0254, "num_input_tokens_seen": 247990448, "step": 114925 }, { "epoch": 18.748776508972266, "grad_norm": 2.406078577041626, "learning_rate": 5.939100580875906e-07, "loss": 0.187, "num_input_tokens_seen": 248002448, "step": 114930 }, { "epoch": 18.749592169657422, "grad_norm": 1.7142541408538818, "learning_rate": 5.931391578561707e-07, "loss": 0.0623, "num_input_tokens_seen": 248010192, "step": 114935 }, { "epoch": 18.750407830342578, "grad_norm": 0.34266820549964905, "learning_rate": 5.923687522607496e-07, "loss": 0.0587, "num_input_tokens_seen": 248019600, "step": 114940 }, { "epoch": 18.751223491027734, "grad_norm": 0.9729750752449036, "learning_rate": 5.915988413169371e-07, "loss": 0.0604, "num_input_tokens_seen": 248030800, "step": 114945 }, { "epoch": 18.752039151712886, "grad_norm": 0.9282469153404236, "learning_rate": 5.908294250403401e-07, "loss": 0.0441, "num_input_tokens_seen": 248040464, "step": 114950 }, { "epoch": 18.75285481239804, "grad_norm": 0.12667061388492584, "learning_rate": 5.900605034465462e-07, "loss": 0.1894, "num_input_tokens_seen": 248049872, "step": 114955 }, { "epoch": 18.753670473083197, "grad_norm": 0.14667874574661255, "learning_rate": 5.892920765511484e-07, "loss": 0.1177, "num_input_tokens_seen": 248060912, "step": 114960 }, { "epoch": 18.754486133768353, "grad_norm": 2.088502883911133, "learning_rate": 5.885241443697065e-07, "loss": 0.1712, "num_input_tokens_seen": 248070384, "step": 114965 }, { "epoch": 18.75530179445351, "grad_norm": 0.691935658454895, "learning_rate": 5.877567069177997e-07, "loss": 0.0146, "num_input_tokens_seen": 248081680, "step": 114970 }, { "epoch": 18.75611745513866, "grad_norm": 0.41596147418022156, "learning_rate": 5.869897642109629e-07, "loss": 0.1239, "num_input_tokens_seen": 248092592, "step": 114975 }, { "epoch": 18.756933115823816, "grad_norm": 1.2455685138702393, "learning_rate": 5.862233162647584e-07, "loss": 0.0727, "num_input_tokens_seen": 248103728, "step": 114980 }, { "epoch": 18.757748776508972, "grad_norm": 0.7151696085929871, "learning_rate": 5.854573630947018e-07, "loss": 0.0456, "num_input_tokens_seen": 248115280, "step": 114985 }, { "epoch": 18.758564437194128, "grad_norm": 0.24051380157470703, "learning_rate": 5.846919047163252e-07, "loss": 0.1091, "num_input_tokens_seen": 248126960, "step": 114990 }, { "epoch": 18.759380097879284, "grad_norm": 0.1961752027273178, "learning_rate": 5.839269411451409e-07, "loss": 0.0168, "num_input_tokens_seen": 248138416, "step": 114995 }, { "epoch": 18.760195758564436, "grad_norm": 0.04373498260974884, "learning_rate": 5.831624723966534e-07, "loss": 0.0311, "num_input_tokens_seen": 248149328, "step": 115000 }, { "epoch": 18.76101141924959, "grad_norm": 0.03407707065343857, "learning_rate": 5.823984984863501e-07, "loss": 0.0141, "num_input_tokens_seen": 248159920, "step": 115005 }, { "epoch": 18.761827079934747, "grad_norm": 0.6036208271980286, "learning_rate": 5.816350194297188e-07, "loss": 0.0128, "num_input_tokens_seen": 248172176, "step": 115010 }, { "epoch": 18.762642740619903, "grad_norm": 0.4113132655620575, "learning_rate": 5.808720352422331e-07, "loss": 0.1478, "num_input_tokens_seen": 248183216, "step": 115015 }, { "epoch": 18.76345840130506, "grad_norm": 0.17999331653118134, "learning_rate": 5.801095459393502e-07, "loss": 0.0318, "num_input_tokens_seen": 248194064, "step": 115020 }, { "epoch": 18.76427406199021, "grad_norm": 0.2787046730518341, "learning_rate": 5.793475515365271e-07, "loss": 0.1872, "num_input_tokens_seen": 248205072, "step": 115025 }, { "epoch": 18.765089722675366, "grad_norm": 2.392000436782837, "learning_rate": 5.785860520492042e-07, "loss": 0.2234, "num_input_tokens_seen": 248217264, "step": 115030 }, { "epoch": 18.765905383360522, "grad_norm": 0.0213834997266531, "learning_rate": 5.778250474928193e-07, "loss": 0.0285, "num_input_tokens_seen": 248228528, "step": 115035 }, { "epoch": 18.766721044045678, "grad_norm": 1.7685600519180298, "learning_rate": 5.770645378827877e-07, "loss": 0.0851, "num_input_tokens_seen": 248240240, "step": 115040 }, { "epoch": 18.767536704730833, "grad_norm": 0.30043041706085205, "learning_rate": 5.763045232345276e-07, "loss": 0.0364, "num_input_tokens_seen": 248250448, "step": 115045 }, { "epoch": 18.768352365415986, "grad_norm": 0.0686866044998169, "learning_rate": 5.755450035634407e-07, "loss": 0.0165, "num_input_tokens_seen": 248261040, "step": 115050 }, { "epoch": 18.76916802610114, "grad_norm": 1.219976782798767, "learning_rate": 5.747859788849175e-07, "loss": 0.076, "num_input_tokens_seen": 248270576, "step": 115055 }, { "epoch": 18.769983686786297, "grad_norm": 0.11873333156108856, "learning_rate": 5.740274492143427e-07, "loss": 0.1091, "num_input_tokens_seen": 248281008, "step": 115060 }, { "epoch": 18.770799347471453, "grad_norm": 0.35695338249206543, "learning_rate": 5.732694145670902e-07, "loss": 0.0417, "num_input_tokens_seen": 248291280, "step": 115065 }, { "epoch": 18.77161500815661, "grad_norm": 0.03992535173892975, "learning_rate": 5.725118749585146e-07, "loss": 0.2178, "num_input_tokens_seen": 248301968, "step": 115070 }, { "epoch": 18.77243066884176, "grad_norm": 0.6853916049003601, "learning_rate": 5.717548304039811e-07, "loss": 0.1343, "num_input_tokens_seen": 248312720, "step": 115075 }, { "epoch": 18.773246329526916, "grad_norm": 0.1736103892326355, "learning_rate": 5.709982809188191e-07, "loss": 0.1083, "num_input_tokens_seen": 248322992, "step": 115080 }, { "epoch": 18.774061990212072, "grad_norm": 0.06766581535339355, "learning_rate": 5.702422265183749e-07, "loss": 0.1676, "num_input_tokens_seen": 248334224, "step": 115085 }, { "epoch": 18.774877650897228, "grad_norm": 1.2636535167694092, "learning_rate": 5.694866672179555e-07, "loss": 0.0857, "num_input_tokens_seen": 248345104, "step": 115090 }, { "epoch": 18.775693311582383, "grad_norm": 0.10637809336185455, "learning_rate": 5.687316030328877e-07, "loss": 0.0796, "num_input_tokens_seen": 248356048, "step": 115095 }, { "epoch": 18.776508972267536, "grad_norm": 0.028712332248687744, "learning_rate": 5.679770339784618e-07, "loss": 0.1057, "num_input_tokens_seen": 248365712, "step": 115100 }, { "epoch": 18.77732463295269, "grad_norm": 3.3171842098236084, "learning_rate": 5.672229600699796e-07, "loss": 0.1356, "num_input_tokens_seen": 248377136, "step": 115105 }, { "epoch": 18.778140293637847, "grad_norm": 0.26645219326019287, "learning_rate": 5.664693813227179e-07, "loss": 0.0266, "num_input_tokens_seen": 248386896, "step": 115110 }, { "epoch": 18.778955954323003, "grad_norm": 0.18462900817394257, "learning_rate": 5.657162977519503e-07, "loss": 0.0076, "num_input_tokens_seen": 248398640, "step": 115115 }, { "epoch": 18.77977161500816, "grad_norm": 1.9363141059875488, "learning_rate": 5.649637093729371e-07, "loss": 0.0625, "num_input_tokens_seen": 248409744, "step": 115120 }, { "epoch": 18.78058727569331, "grad_norm": 0.038733989000320435, "learning_rate": 5.642116162009381e-07, "loss": 0.0403, "num_input_tokens_seen": 248420688, "step": 115125 }, { "epoch": 18.781402936378466, "grad_norm": 1.8395088911056519, "learning_rate": 5.634600182511829e-07, "loss": 0.1477, "num_input_tokens_seen": 248430960, "step": 115130 }, { "epoch": 18.782218597063622, "grad_norm": 0.9571599960327148, "learning_rate": 5.62708915538912e-07, "loss": 0.038, "num_input_tokens_seen": 248441936, "step": 115135 }, { "epoch": 18.783034257748778, "grad_norm": 0.06994081288576126, "learning_rate": 5.619583080793467e-07, "loss": 0.0517, "num_input_tokens_seen": 248452080, "step": 115140 }, { "epoch": 18.78384991843393, "grad_norm": 0.014191019348800182, "learning_rate": 5.612081958876996e-07, "loss": 0.0423, "num_input_tokens_seen": 248461328, "step": 115145 }, { "epoch": 18.784665579119086, "grad_norm": 0.10003793984651566, "learning_rate": 5.604585789791699e-07, "loss": 0.1989, "num_input_tokens_seen": 248472912, "step": 115150 }, { "epoch": 18.78548123980424, "grad_norm": 0.4351337254047394, "learning_rate": 5.597094573689509e-07, "loss": 0.1122, "num_input_tokens_seen": 248485424, "step": 115155 }, { "epoch": 18.786296900489397, "grad_norm": 0.47821035981178284, "learning_rate": 5.58960831072225e-07, "loss": 0.0896, "num_input_tokens_seen": 248496688, "step": 115160 }, { "epoch": 18.787112561174553, "grad_norm": 0.3640025854110718, "learning_rate": 5.582127001041632e-07, "loss": 0.0941, "num_input_tokens_seen": 248508208, "step": 115165 }, { "epoch": 18.787928221859705, "grad_norm": 0.17582286894321442, "learning_rate": 5.574650644799284e-07, "loss": 0.0279, "num_input_tokens_seen": 248518672, "step": 115170 }, { "epoch": 18.78874388254486, "grad_norm": 0.22481992840766907, "learning_rate": 5.567179242146697e-07, "loss": 0.0516, "num_input_tokens_seen": 248529680, "step": 115175 }, { "epoch": 18.789559543230016, "grad_norm": 2.493790626525879, "learning_rate": 5.559712793235333e-07, "loss": 0.1963, "num_input_tokens_seen": 248540720, "step": 115180 }, { "epoch": 18.790375203915172, "grad_norm": 0.4674835503101349, "learning_rate": 5.552251298216432e-07, "loss": 0.0162, "num_input_tokens_seen": 248551568, "step": 115185 }, { "epoch": 18.791190864600328, "grad_norm": 0.8560178875923157, "learning_rate": 5.544794757241318e-07, "loss": 0.0233, "num_input_tokens_seen": 248563248, "step": 115190 }, { "epoch": 18.79200652528548, "grad_norm": 2.0609331130981445, "learning_rate": 5.537343170461034e-07, "loss": 0.3797, "num_input_tokens_seen": 248574736, "step": 115195 }, { "epoch": 18.792822185970635, "grad_norm": 0.08488886803388596, "learning_rate": 5.52989653802663e-07, "loss": 0.0268, "num_input_tokens_seen": 248585168, "step": 115200 }, { "epoch": 18.79363784665579, "grad_norm": 0.5194891691207886, "learning_rate": 5.522454860088955e-07, "loss": 0.0085, "num_input_tokens_seen": 248595472, "step": 115205 }, { "epoch": 18.794453507340947, "grad_norm": 1.5959038734436035, "learning_rate": 5.515018136798944e-07, "loss": 0.1242, "num_input_tokens_seen": 248607024, "step": 115210 }, { "epoch": 18.795269168026103, "grad_norm": 1.9198503494262695, "learning_rate": 5.507586368307172e-07, "loss": 0.2524, "num_input_tokens_seen": 248618160, "step": 115215 }, { "epoch": 18.796084828711255, "grad_norm": 0.08823125064373016, "learning_rate": 5.500159554764378e-07, "loss": 0.1199, "num_input_tokens_seen": 248629424, "step": 115220 }, { "epoch": 18.79690048939641, "grad_norm": 3.089090585708618, "learning_rate": 5.492737696320971e-07, "loss": 0.1712, "num_input_tokens_seen": 248640208, "step": 115225 }, { "epoch": 18.797716150081566, "grad_norm": 0.06490481644868851, "learning_rate": 5.485320793127468e-07, "loss": 0.0502, "num_input_tokens_seen": 248652272, "step": 115230 }, { "epoch": 18.798531810766722, "grad_norm": 0.10836553573608398, "learning_rate": 5.477908845334085e-07, "loss": 0.0169, "num_input_tokens_seen": 248663888, "step": 115235 }, { "epoch": 18.799347471451878, "grad_norm": 0.040644388645887375, "learning_rate": 5.470501853091115e-07, "loss": 0.1968, "num_input_tokens_seen": 248673712, "step": 115240 }, { "epoch": 18.80016313213703, "grad_norm": 2.1161956787109375, "learning_rate": 5.463099816548579e-07, "loss": 0.1027, "num_input_tokens_seen": 248686416, "step": 115245 }, { "epoch": 18.800978792822185, "grad_norm": 0.06723345071077347, "learning_rate": 5.455702735856605e-07, "loss": 0.2407, "num_input_tokens_seen": 248697872, "step": 115250 }, { "epoch": 18.80179445350734, "grad_norm": 0.19845429062843323, "learning_rate": 5.448310611164992e-07, "loss": 0.0637, "num_input_tokens_seen": 248709360, "step": 115255 }, { "epoch": 18.802610114192497, "grad_norm": 0.03167327865958214, "learning_rate": 5.440923442623646e-07, "loss": 0.191, "num_input_tokens_seen": 248721168, "step": 115260 }, { "epoch": 18.803425774877653, "grad_norm": 0.1080411747097969, "learning_rate": 5.433541230382172e-07, "loss": 0.0286, "num_input_tokens_seen": 248731920, "step": 115265 }, { "epoch": 18.804241435562805, "grad_norm": 0.15851151943206787, "learning_rate": 5.42616397459031e-07, "loss": 0.1721, "num_input_tokens_seen": 248742992, "step": 115270 }, { "epoch": 18.80505709624796, "grad_norm": 0.045597732067108154, "learning_rate": 5.418791675397439e-07, "loss": 0.014, "num_input_tokens_seen": 248751984, "step": 115275 }, { "epoch": 18.805872756933116, "grad_norm": 0.5160804986953735, "learning_rate": 5.411424332953052e-07, "loss": 0.1567, "num_input_tokens_seen": 248762128, "step": 115280 }, { "epoch": 18.806688417618272, "grad_norm": 0.36454010009765625, "learning_rate": 5.404061947406419e-07, "loss": 0.0906, "num_input_tokens_seen": 248772112, "step": 115285 }, { "epoch": 18.807504078303424, "grad_norm": 0.060125287622213364, "learning_rate": 5.396704518906781e-07, "loss": 0.1677, "num_input_tokens_seen": 248783856, "step": 115290 }, { "epoch": 18.80831973898858, "grad_norm": 1.485925555229187, "learning_rate": 5.389352047603213e-07, "loss": 0.0581, "num_input_tokens_seen": 248794800, "step": 115295 }, { "epoch": 18.809135399673735, "grad_norm": 0.018234508112072945, "learning_rate": 5.382004533644763e-07, "loss": 0.0339, "num_input_tokens_seen": 248805168, "step": 115300 }, { "epoch": 18.80995106035889, "grad_norm": 0.7375343441963196, "learning_rate": 5.374661977180284e-07, "loss": 0.0205, "num_input_tokens_seen": 248815696, "step": 115305 }, { "epoch": 18.810766721044047, "grad_norm": 0.07943865656852722, "learning_rate": 5.367324378358629e-07, "loss": 0.023, "num_input_tokens_seen": 248828048, "step": 115310 }, { "epoch": 18.8115823817292, "grad_norm": 0.44040828943252563, "learning_rate": 5.359991737328457e-07, "loss": 0.0943, "num_input_tokens_seen": 248838512, "step": 115315 }, { "epoch": 18.812398042414355, "grad_norm": 0.0489351823925972, "learning_rate": 5.352664054238427e-07, "loss": 0.1144, "num_input_tokens_seen": 248849712, "step": 115320 }, { "epoch": 18.81321370309951, "grad_norm": 0.5894021987915039, "learning_rate": 5.345341329237003e-07, "loss": 0.0151, "num_input_tokens_seen": 248861872, "step": 115325 }, { "epoch": 18.814029363784666, "grad_norm": 3.2773373126983643, "learning_rate": 5.338023562472593e-07, "loss": 0.303, "num_input_tokens_seen": 248873488, "step": 115330 }, { "epoch": 18.81484502446982, "grad_norm": 0.07992511987686157, "learning_rate": 5.330710754093554e-07, "loss": 0.0184, "num_input_tokens_seen": 248885104, "step": 115335 }, { "epoch": 18.815660685154974, "grad_norm": 0.9001695513725281, "learning_rate": 5.323402904247987e-07, "loss": 0.05, "num_input_tokens_seen": 248896528, "step": 115340 }, { "epoch": 18.81647634584013, "grad_norm": 0.6918050646781921, "learning_rate": 5.316100013084107e-07, "loss": 0.1103, "num_input_tokens_seen": 248907600, "step": 115345 }, { "epoch": 18.817292006525285, "grad_norm": 0.06726797670125961, "learning_rate": 5.308802080749825e-07, "loss": 0.0123, "num_input_tokens_seen": 248918320, "step": 115350 }, { "epoch": 18.81810766721044, "grad_norm": 0.15146327018737793, "learning_rate": 5.301509107393133e-07, "loss": 0.0263, "num_input_tokens_seen": 248928560, "step": 115355 }, { "epoch": 18.818923327895597, "grad_norm": 0.2376195192337036, "learning_rate": 5.294221093161717e-07, "loss": 0.0081, "num_input_tokens_seen": 248937936, "step": 115360 }, { "epoch": 18.81973898858075, "grad_norm": 1.6598315238952637, "learning_rate": 5.286938038203404e-07, "loss": 0.1695, "num_input_tokens_seen": 248948848, "step": 115365 }, { "epoch": 18.820554649265905, "grad_norm": 0.03313792869448662, "learning_rate": 5.27965994266566e-07, "loss": 0.0105, "num_input_tokens_seen": 248958224, "step": 115370 }, { "epoch": 18.82137030995106, "grad_norm": 0.07032492756843567, "learning_rate": 5.272386806696144e-07, "loss": 0.1188, "num_input_tokens_seen": 248970704, "step": 115375 }, { "epoch": 18.822185970636216, "grad_norm": 1.0972267389297485, "learning_rate": 5.265118630442073e-07, "loss": 0.0979, "num_input_tokens_seen": 248980560, "step": 115380 }, { "epoch": 18.82300163132137, "grad_norm": 0.05141341686248779, "learning_rate": 5.25785541405091e-07, "loss": 0.1006, "num_input_tokens_seen": 248991056, "step": 115385 }, { "epoch": 18.823817292006524, "grad_norm": 1.9424891471862793, "learning_rate": 5.250597157669762e-07, "loss": 0.1122, "num_input_tokens_seen": 249002288, "step": 115390 }, { "epoch": 18.82463295269168, "grad_norm": 0.06324783712625504, "learning_rate": 5.243343861445759e-07, "loss": 0.018, "num_input_tokens_seen": 249012848, "step": 115395 }, { "epoch": 18.825448613376835, "grad_norm": 0.4521702826023102, "learning_rate": 5.236095525525869e-07, "loss": 0.0379, "num_input_tokens_seen": 249023408, "step": 115400 }, { "epoch": 18.82626427406199, "grad_norm": 0.07375620305538177, "learning_rate": 5.228852150057029e-07, "loss": 0.1463, "num_input_tokens_seen": 249034288, "step": 115405 }, { "epoch": 18.827079934747147, "grad_norm": 0.1300206482410431, "learning_rate": 5.221613735186009e-07, "loss": 0.1251, "num_input_tokens_seen": 249046064, "step": 115410 }, { "epoch": 18.8278955954323, "grad_norm": 0.2828027009963989, "learning_rate": 5.2143802810595e-07, "loss": 0.0859, "num_input_tokens_seen": 249056816, "step": 115415 }, { "epoch": 18.828711256117455, "grad_norm": 2.2450289726257324, "learning_rate": 5.207151787824105e-07, "loss": 0.2107, "num_input_tokens_seen": 249067248, "step": 115420 }, { "epoch": 18.82952691680261, "grad_norm": 0.15902991592884064, "learning_rate": 5.199928255626347e-07, "loss": 0.0241, "num_input_tokens_seen": 249078256, "step": 115425 }, { "epoch": 18.830342577487766, "grad_norm": 0.0969504714012146, "learning_rate": 5.192709684612579e-07, "loss": 0.0899, "num_input_tokens_seen": 249089360, "step": 115430 }, { "epoch": 18.83115823817292, "grad_norm": 0.3650354743003845, "learning_rate": 5.185496074929102e-07, "loss": 0.0295, "num_input_tokens_seen": 249100944, "step": 115435 }, { "epoch": 18.831973898858074, "grad_norm": 1.330566644668579, "learning_rate": 5.178287426722105e-07, "loss": 0.1086, "num_input_tokens_seen": 249111664, "step": 115440 }, { "epoch": 18.83278955954323, "grad_norm": 1.1589796543121338, "learning_rate": 5.171083740137722e-07, "loss": 0.0433, "num_input_tokens_seen": 249120304, "step": 115445 }, { "epoch": 18.833605220228385, "grad_norm": 0.10694395750761032, "learning_rate": 5.163885015321889e-07, "loss": 0.0237, "num_input_tokens_seen": 249131088, "step": 115450 }, { "epoch": 18.83442088091354, "grad_norm": 0.4411776661872864, "learning_rate": 5.156691252420548e-07, "loss": 0.0201, "num_input_tokens_seen": 249141040, "step": 115455 }, { "epoch": 18.835236541598697, "grad_norm": 0.2445434331893921, "learning_rate": 5.149502451579441e-07, "loss": 0.0582, "num_input_tokens_seen": 249149552, "step": 115460 }, { "epoch": 18.83605220228385, "grad_norm": 1.9421942234039307, "learning_rate": 5.142318612944313e-07, "loss": 0.074, "num_input_tokens_seen": 249159536, "step": 115465 }, { "epoch": 18.836867862969005, "grad_norm": 2.5484538078308105, "learning_rate": 5.135139736660688e-07, "loss": 0.115, "num_input_tokens_seen": 249169904, "step": 115470 }, { "epoch": 18.83768352365416, "grad_norm": 0.06850970536470413, "learning_rate": 5.127965822874086e-07, "loss": 0.1331, "num_input_tokens_seen": 249180240, "step": 115475 }, { "epoch": 18.838499184339316, "grad_norm": 0.1542031466960907, "learning_rate": 5.120796871729949e-07, "loss": 0.1214, "num_input_tokens_seen": 249190448, "step": 115480 }, { "epoch": 18.839314845024468, "grad_norm": 1.3365823030471802, "learning_rate": 5.113632883373437e-07, "loss": 0.0563, "num_input_tokens_seen": 249202640, "step": 115485 }, { "epoch": 18.840130505709624, "grad_norm": 0.059292905032634735, "learning_rate": 5.106473857949878e-07, "loss": 0.0726, "num_input_tokens_seen": 249214224, "step": 115490 }, { "epoch": 18.84094616639478, "grad_norm": 0.04584912210702896, "learning_rate": 5.099319795604268e-07, "loss": 0.1065, "num_input_tokens_seen": 249225360, "step": 115495 }, { "epoch": 18.841761827079935, "grad_norm": 1.661756992340088, "learning_rate": 5.09217069648163e-07, "loss": 0.1593, "num_input_tokens_seen": 249236624, "step": 115500 }, { "epoch": 18.84257748776509, "grad_norm": 1.6921664476394653, "learning_rate": 5.085026560726846e-07, "loss": 0.15, "num_input_tokens_seen": 249247408, "step": 115505 }, { "epoch": 18.843393148450243, "grad_norm": 3.004441261291504, "learning_rate": 5.077887388484692e-07, "loss": 0.1974, "num_input_tokens_seen": 249258000, "step": 115510 }, { "epoch": 18.8442088091354, "grad_norm": 0.041973475366830826, "learning_rate": 5.070753179899857e-07, "loss": 0.0809, "num_input_tokens_seen": 249268848, "step": 115515 }, { "epoch": 18.845024469820554, "grad_norm": 0.13657055795192719, "learning_rate": 5.063623935116917e-07, "loss": 0.0816, "num_input_tokens_seen": 249279312, "step": 115520 }, { "epoch": 18.84584013050571, "grad_norm": 0.16620321571826935, "learning_rate": 5.056499654280344e-07, "loss": 0.118, "num_input_tokens_seen": 249289392, "step": 115525 }, { "epoch": 18.846655791190866, "grad_norm": 2.5936574935913086, "learning_rate": 5.049380337534576e-07, "loss": 0.1054, "num_input_tokens_seen": 249300976, "step": 115530 }, { "epoch": 18.847471451876018, "grad_norm": 0.08115827292203903, "learning_rate": 5.042265985023831e-07, "loss": 0.1002, "num_input_tokens_seen": 249312528, "step": 115535 }, { "epoch": 18.848287112561174, "grad_norm": 0.6434553861618042, "learning_rate": 5.035156596892327e-07, "loss": 0.0161, "num_input_tokens_seen": 249322544, "step": 115540 }, { "epoch": 18.84910277324633, "grad_norm": 0.08707216382026672, "learning_rate": 5.028052173284142e-07, "loss": 0.1136, "num_input_tokens_seen": 249333136, "step": 115545 }, { "epoch": 18.849918433931485, "grad_norm": 0.05240638926625252, "learning_rate": 5.020952714343246e-07, "loss": 0.0203, "num_input_tokens_seen": 249343888, "step": 115550 }, { "epoch": 18.85073409461664, "grad_norm": 0.464194655418396, "learning_rate": 5.013858220213524e-07, "loss": 0.1042, "num_input_tokens_seen": 249354448, "step": 115555 }, { "epoch": 18.851549755301793, "grad_norm": 0.03703694045543671, "learning_rate": 5.006768691038722e-07, "loss": 0.2167, "num_input_tokens_seen": 249365904, "step": 115560 }, { "epoch": 18.85236541598695, "grad_norm": 0.8855855464935303, "learning_rate": 4.999684126962584e-07, "loss": 0.1134, "num_input_tokens_seen": 249375920, "step": 115565 }, { "epoch": 18.853181076672104, "grad_norm": 2.2167162895202637, "learning_rate": 4.992604528128636e-07, "loss": 0.0983, "num_input_tokens_seen": 249386480, "step": 115570 }, { "epoch": 18.85399673735726, "grad_norm": 0.1726694256067276, "learning_rate": 4.985529894680374e-07, "loss": 0.1022, "num_input_tokens_seen": 249397232, "step": 115575 }, { "epoch": 18.854812398042416, "grad_norm": 2.1117517948150635, "learning_rate": 4.978460226761183e-07, "loss": 0.3141, "num_input_tokens_seen": 249407088, "step": 115580 }, { "epoch": 18.855628058727568, "grad_norm": 2.129657030105591, "learning_rate": 4.971395524514311e-07, "loss": 0.1777, "num_input_tokens_seen": 249417936, "step": 115585 }, { "epoch": 18.856443719412724, "grad_norm": 0.2702946364879608, "learning_rate": 4.964335788082947e-07, "loss": 0.1039, "num_input_tokens_seen": 249428272, "step": 115590 }, { "epoch": 18.85725938009788, "grad_norm": 0.1422773152589798, "learning_rate": 4.957281017610144e-07, "loss": 0.1314, "num_input_tokens_seen": 249439056, "step": 115595 }, { "epoch": 18.858075040783035, "grad_norm": 0.1818191260099411, "learning_rate": 4.950231213238954e-07, "loss": 0.045, "num_input_tokens_seen": 249449808, "step": 115600 }, { "epoch": 18.85889070146819, "grad_norm": 1.4592334032058716, "learning_rate": 4.943186375112152e-07, "loss": 0.0808, "num_input_tokens_seen": 249460496, "step": 115605 }, { "epoch": 18.859706362153343, "grad_norm": 1.022962212562561, "learning_rate": 4.936146503372596e-07, "loss": 0.1038, "num_input_tokens_seen": 249470704, "step": 115610 }, { "epoch": 18.8605220228385, "grad_norm": 0.7205404043197632, "learning_rate": 4.929111598162895e-07, "loss": 0.0628, "num_input_tokens_seen": 249480400, "step": 115615 }, { "epoch": 18.861337683523654, "grad_norm": 0.05602673813700676, "learning_rate": 4.922081659625627e-07, "loss": 0.0196, "num_input_tokens_seen": 249491024, "step": 115620 }, { "epoch": 18.86215334420881, "grad_norm": 0.014451880939304829, "learning_rate": 4.915056687903291e-07, "loss": 0.2657, "num_input_tokens_seen": 249502000, "step": 115625 }, { "epoch": 18.862969004893966, "grad_norm": 0.20301829278469086, "learning_rate": 4.908036683138273e-07, "loss": 0.1109, "num_input_tokens_seen": 249512624, "step": 115630 }, { "epoch": 18.863784665579118, "grad_norm": 0.08435695618391037, "learning_rate": 4.901021645472764e-07, "loss": 0.0082, "num_input_tokens_seen": 249523056, "step": 115635 }, { "epoch": 18.864600326264274, "grad_norm": 0.09150931239128113, "learning_rate": 4.894011575049012e-07, "loss": 0.0099, "num_input_tokens_seen": 249534032, "step": 115640 }, { "epoch": 18.86541598694943, "grad_norm": 0.09808040410280228, "learning_rate": 4.887006472009042e-07, "loss": 0.2105, "num_input_tokens_seen": 249543760, "step": 115645 }, { "epoch": 18.866231647634585, "grad_norm": 0.021395046263933182, "learning_rate": 4.880006336494853e-07, "loss": 0.0105, "num_input_tokens_seen": 249555504, "step": 115650 }, { "epoch": 18.86704730831974, "grad_norm": 1.7288894653320312, "learning_rate": 4.873011168648273e-07, "loss": 0.0904, "num_input_tokens_seen": 249566960, "step": 115655 }, { "epoch": 18.867862969004893, "grad_norm": 0.21187488734722137, "learning_rate": 4.866020968611079e-07, "loss": 0.0306, "num_input_tokens_seen": 249577712, "step": 115660 }, { "epoch": 18.86867862969005, "grad_norm": 0.049171410501003265, "learning_rate": 4.859035736524964e-07, "loss": 0.0596, "num_input_tokens_seen": 249588208, "step": 115665 }, { "epoch": 18.869494290375204, "grad_norm": 1.980304479598999, "learning_rate": 4.85205547253148e-07, "loss": 0.2125, "num_input_tokens_seen": 249599440, "step": 115670 }, { "epoch": 18.87030995106036, "grad_norm": 1.7351738214492798, "learning_rate": 4.845080176772071e-07, "loss": 0.1807, "num_input_tokens_seen": 249609360, "step": 115675 }, { "epoch": 18.871125611745512, "grad_norm": 1.3957581520080566, "learning_rate": 4.838109849388095e-07, "loss": 0.33, "num_input_tokens_seen": 249619312, "step": 115680 }, { "epoch": 18.871941272430668, "grad_norm": 0.019485127180814743, "learning_rate": 4.831144490520856e-07, "loss": 0.0073, "num_input_tokens_seen": 249630800, "step": 115685 }, { "epoch": 18.872756933115824, "grad_norm": 0.7258216738700867, "learning_rate": 4.824184100311491e-07, "loss": 0.0233, "num_input_tokens_seen": 249642160, "step": 115690 }, { "epoch": 18.87357259380098, "grad_norm": 0.042892757803201675, "learning_rate": 4.817228678901053e-07, "loss": 0.0252, "num_input_tokens_seen": 249652432, "step": 115695 }, { "epoch": 18.874388254486135, "grad_norm": 1.908912181854248, "learning_rate": 4.810278226430515e-07, "loss": 0.1782, "num_input_tokens_seen": 249663312, "step": 115700 }, { "epoch": 18.875203915171287, "grad_norm": 0.12976683676242828, "learning_rate": 4.803332743040761e-07, "loss": 0.1034, "num_input_tokens_seen": 249673456, "step": 115705 }, { "epoch": 18.876019575856443, "grad_norm": 0.12546919286251068, "learning_rate": 4.79639222887246e-07, "loss": 0.0569, "num_input_tokens_seen": 249683920, "step": 115710 }, { "epoch": 18.8768352365416, "grad_norm": 1.833895206451416, "learning_rate": 4.789456684066412e-07, "loss": 0.1017, "num_input_tokens_seen": 249694160, "step": 115715 }, { "epoch": 18.877650897226754, "grad_norm": 0.5236322283744812, "learning_rate": 4.782526108763035e-07, "loss": 0.0591, "num_input_tokens_seen": 249705776, "step": 115720 }, { "epoch": 18.87846655791191, "grad_norm": 2.3330142498016357, "learning_rate": 4.775600503102884e-07, "loss": 0.1954, "num_input_tokens_seen": 249715376, "step": 115725 }, { "epoch": 18.879282218597062, "grad_norm": 1.5244847536087036, "learning_rate": 4.768679867226233e-07, "loss": 0.197, "num_input_tokens_seen": 249726064, "step": 115730 }, { "epoch": 18.880097879282218, "grad_norm": 0.12033236026763916, "learning_rate": 4.761764201273444e-07, "loss": 0.0458, "num_input_tokens_seen": 249736496, "step": 115735 }, { "epoch": 18.880913539967374, "grad_norm": 0.11865932494401932, "learning_rate": 4.7548535053845434e-07, "loss": 0.0794, "num_input_tokens_seen": 249745264, "step": 115740 }, { "epoch": 18.88172920065253, "grad_norm": 1.0965646505355835, "learning_rate": 4.747947779699724e-07, "loss": 0.0365, "num_input_tokens_seen": 249755088, "step": 115745 }, { "epoch": 18.882544861337685, "grad_norm": 0.23683692514896393, "learning_rate": 4.741047024358819e-07, "loss": 0.1031, "num_input_tokens_seen": 249765744, "step": 115750 }, { "epoch": 18.883360522022837, "grad_norm": 0.06150338053703308, "learning_rate": 4.734151239501744e-07, "loss": 0.1781, "num_input_tokens_seen": 249776688, "step": 115755 }, { "epoch": 18.884176182707993, "grad_norm": 1.0695945024490356, "learning_rate": 4.7272604252682206e-07, "loss": 0.0697, "num_input_tokens_seen": 249787696, "step": 115760 }, { "epoch": 18.88499184339315, "grad_norm": 0.17061205208301544, "learning_rate": 4.720374581797915e-07, "loss": 0.0432, "num_input_tokens_seen": 249799760, "step": 115765 }, { "epoch": 18.885807504078304, "grad_norm": 0.06488896906375885, "learning_rate": 4.71349370923041e-07, "loss": 0.0187, "num_input_tokens_seen": 249810768, "step": 115770 }, { "epoch": 18.88662316476346, "grad_norm": 1.313515543937683, "learning_rate": 4.7066178077050935e-07, "loss": 0.1398, "num_input_tokens_seen": 249819504, "step": 115775 }, { "epoch": 18.887438825448612, "grad_norm": 0.42287611961364746, "learning_rate": 4.699746877361355e-07, "loss": 0.0603, "num_input_tokens_seen": 249830960, "step": 115780 }, { "epoch": 18.888254486133768, "grad_norm": 0.7480752468109131, "learning_rate": 4.6928809183384157e-07, "loss": 0.0171, "num_input_tokens_seen": 249841232, "step": 115785 }, { "epoch": 18.889070146818923, "grad_norm": 0.2613367736339569, "learning_rate": 4.6860199307754706e-07, "loss": 0.0159, "num_input_tokens_seen": 249851600, "step": 115790 }, { "epoch": 18.88988580750408, "grad_norm": 0.9333821535110474, "learning_rate": 4.6791639148115183e-07, "loss": 0.0441, "num_input_tokens_seen": 249862032, "step": 115795 }, { "epoch": 18.890701468189235, "grad_norm": 0.07930614054203033, "learning_rate": 4.6723128705855046e-07, "loss": 0.1452, "num_input_tokens_seen": 249871888, "step": 115800 }, { "epoch": 18.891517128874387, "grad_norm": 0.2985171377658844, "learning_rate": 4.665466798236318e-07, "loss": 0.0366, "num_input_tokens_seen": 249883024, "step": 115805 }, { "epoch": 18.892332789559543, "grad_norm": 1.060011863708496, "learning_rate": 4.658625697902652e-07, "loss": 0.0745, "num_input_tokens_seen": 249894512, "step": 115810 }, { "epoch": 18.8931484502447, "grad_norm": 0.6561942100524902, "learning_rate": 4.651789569723175e-07, "loss": 0.0452, "num_input_tokens_seen": 249905296, "step": 115815 }, { "epoch": 18.893964110929854, "grad_norm": 0.08441788703203201, "learning_rate": 4.64495841383647e-07, "loss": 0.058, "num_input_tokens_seen": 249916240, "step": 115820 }, { "epoch": 18.894779771615006, "grad_norm": 0.030635444447398186, "learning_rate": 4.6381322303808706e-07, "loss": 0.1016, "num_input_tokens_seen": 249926384, "step": 115825 }, { "epoch": 18.895595432300162, "grad_norm": 0.04082086309790611, "learning_rate": 4.6313110194948493e-07, "loss": 0.0645, "num_input_tokens_seen": 249936240, "step": 115830 }, { "epoch": 18.896411092985318, "grad_norm": 0.32714399695396423, "learning_rate": 4.6244947813165187e-07, "loss": 0.2395, "num_input_tokens_seen": 249946896, "step": 115835 }, { "epoch": 18.897226753670473, "grad_norm": 0.057974960654973984, "learning_rate": 4.617683515984156e-07, "loss": 0.0202, "num_input_tokens_seen": 249957552, "step": 115840 }, { "epoch": 18.89804241435563, "grad_norm": 0.15278631448745728, "learning_rate": 4.610877223635651e-07, "loss": 0.0697, "num_input_tokens_seen": 249967664, "step": 115845 }, { "epoch": 18.898858075040785, "grad_norm": 0.6369754076004028, "learning_rate": 4.604075904409089e-07, "loss": 0.0547, "num_input_tokens_seen": 249977520, "step": 115850 }, { "epoch": 18.899673735725937, "grad_norm": 0.12628360092639923, "learning_rate": 4.5972795584421635e-07, "loss": 0.13, "num_input_tokens_seen": 249989136, "step": 115855 }, { "epoch": 18.900489396411093, "grad_norm": 0.09640073776245117, "learning_rate": 4.5904881858727655e-07, "loss": 0.0117, "num_input_tokens_seen": 249999824, "step": 115860 }, { "epoch": 18.90130505709625, "grad_norm": 1.7694835662841797, "learning_rate": 4.583701786838368e-07, "loss": 0.1932, "num_input_tokens_seen": 250012016, "step": 115865 }, { "epoch": 18.902120717781404, "grad_norm": 1.8535231351852417, "learning_rate": 4.576920361476639e-07, "loss": 0.2036, "num_input_tokens_seen": 250022544, "step": 115870 }, { "epoch": 18.902936378466556, "grad_norm": 0.1806659996509552, "learning_rate": 4.57014390992494e-07, "loss": 0.0934, "num_input_tokens_seen": 250032784, "step": 115875 }, { "epoch": 18.903752039151712, "grad_norm": 1.637657880783081, "learning_rate": 4.563372432320634e-07, "loss": 0.0548, "num_input_tokens_seen": 250043728, "step": 115880 }, { "epoch": 18.904567699836868, "grad_norm": 2.48612642288208, "learning_rate": 4.5566059288009166e-07, "loss": 0.3981, "num_input_tokens_seen": 250053488, "step": 115885 }, { "epoch": 18.905383360522023, "grad_norm": 1.0177794694900513, "learning_rate": 4.5498443995029836e-07, "loss": 0.1269, "num_input_tokens_seen": 250064464, "step": 115890 }, { "epoch": 18.90619902120718, "grad_norm": 0.4883611500263214, "learning_rate": 4.543087844563809e-07, "loss": 0.1426, "num_input_tokens_seen": 250075184, "step": 115895 }, { "epoch": 18.90701468189233, "grad_norm": 3.0025923252105713, "learning_rate": 4.536336264120367e-07, "loss": 0.1058, "num_input_tokens_seen": 250085264, "step": 115900 }, { "epoch": 18.907830342577487, "grad_norm": 0.12240870296955109, "learning_rate": 4.5295896583094356e-07, "loss": 0.0108, "num_input_tokens_seen": 250096752, "step": 115905 }, { "epoch": 18.908646003262643, "grad_norm": 2.6806440353393555, "learning_rate": 4.5228480272677674e-07, "loss": 0.0741, "num_input_tokens_seen": 250108560, "step": 115910 }, { "epoch": 18.9094616639478, "grad_norm": 1.9956167936325073, "learning_rate": 4.5161113711320034e-07, "loss": 0.1468, "num_input_tokens_seen": 250119024, "step": 115915 }, { "epoch": 18.910277324632954, "grad_norm": 0.17968949675559998, "learning_rate": 4.509379690038673e-07, "loss": 0.0722, "num_input_tokens_seen": 250129648, "step": 115920 }, { "epoch": 18.911092985318106, "grad_norm": 0.0678587481379509, "learning_rate": 4.502652984124167e-07, "loss": 0.0449, "num_input_tokens_seen": 250140304, "step": 115925 }, { "epoch": 18.911908646003262, "grad_norm": 1.51612389087677, "learning_rate": 4.4959312535248487e-07, "loss": 0.1119, "num_input_tokens_seen": 250152528, "step": 115930 }, { "epoch": 18.912724306688418, "grad_norm": 0.15018796920776367, "learning_rate": 4.4892144983769424e-07, "loss": 0.0077, "num_input_tokens_seen": 250163440, "step": 115935 }, { "epoch": 18.913539967373573, "grad_norm": 2.059805393218994, "learning_rate": 4.4825027188165334e-07, "loss": 0.0665, "num_input_tokens_seen": 250174288, "step": 115940 }, { "epoch": 18.91435562805873, "grad_norm": 0.5049750804901123, "learning_rate": 4.47579591497968e-07, "loss": 0.0196, "num_input_tokens_seen": 250185168, "step": 115945 }, { "epoch": 18.91517128874388, "grad_norm": 0.7851022481918335, "learning_rate": 4.469094087002301e-07, "loss": 0.1737, "num_input_tokens_seen": 250196080, "step": 115950 }, { "epoch": 18.915986949429037, "grad_norm": 0.16481205821037292, "learning_rate": 4.462397235020177e-07, "loss": 0.0699, "num_input_tokens_seen": 250207088, "step": 115955 }, { "epoch": 18.916802610114193, "grad_norm": 0.4424644410610199, "learning_rate": 4.45570535916906e-07, "loss": 0.0168, "num_input_tokens_seen": 250218000, "step": 115960 }, { "epoch": 18.91761827079935, "grad_norm": 0.10042542964220047, "learning_rate": 4.44901845958462e-07, "loss": 0.0502, "num_input_tokens_seen": 250229200, "step": 115965 }, { "epoch": 18.918433931484504, "grad_norm": 1.5359253883361816, "learning_rate": 4.442336536402275e-07, "loss": 0.2306, "num_input_tokens_seen": 250240240, "step": 115970 }, { "epoch": 18.919249592169656, "grad_norm": 1.8365490436553955, "learning_rate": 4.4356595897575294e-07, "loss": 0.1228, "num_input_tokens_seen": 250250800, "step": 115975 }, { "epoch": 18.920065252854812, "grad_norm": 1.6395888328552246, "learning_rate": 4.4289876197856073e-07, "loss": 0.0486, "num_input_tokens_seen": 250260336, "step": 115980 }, { "epoch": 18.920880913539968, "grad_norm": 0.038403384387493134, "learning_rate": 4.4223206266218455e-07, "loss": 0.0883, "num_input_tokens_seen": 250270800, "step": 115985 }, { "epoch": 18.921696574225123, "grad_norm": 1.8812012672424316, "learning_rate": 4.4156586104012466e-07, "loss": 0.0703, "num_input_tokens_seen": 250280784, "step": 115990 }, { "epoch": 18.92251223491028, "grad_norm": 0.9018236398696899, "learning_rate": 4.409001571258897e-07, "loss": 0.1108, "num_input_tokens_seen": 250292560, "step": 115995 }, { "epoch": 18.92332789559543, "grad_norm": 0.08284491300582886, "learning_rate": 4.4023495093296616e-07, "loss": 0.1276, "num_input_tokens_seen": 250303632, "step": 116000 }, { "epoch": 18.924143556280587, "grad_norm": 0.7249894142150879, "learning_rate": 4.3957024247484047e-07, "loss": 0.0566, "num_input_tokens_seen": 250313360, "step": 116005 }, { "epoch": 18.924959216965743, "grad_norm": 2.0366671085357666, "learning_rate": 4.389060317649768e-07, "loss": 0.1411, "num_input_tokens_seen": 250323824, "step": 116010 }, { "epoch": 18.9257748776509, "grad_norm": 2.3334712982177734, "learning_rate": 4.38242318816845e-07, "loss": 0.1886, "num_input_tokens_seen": 250334320, "step": 116015 }, { "epoch": 18.92659053833605, "grad_norm": 1.1949021816253662, "learning_rate": 4.37579103643887e-07, "loss": 0.2227, "num_input_tokens_seen": 250345456, "step": 116020 }, { "epoch": 18.927406199021206, "grad_norm": 1.3325732946395874, "learning_rate": 4.369163862595532e-07, "loss": 0.0214, "num_input_tokens_seen": 250356688, "step": 116025 }, { "epoch": 18.928221859706362, "grad_norm": 0.29957911372184753, "learning_rate": 4.3625416667726624e-07, "loss": 0.015, "num_input_tokens_seen": 250366480, "step": 116030 }, { "epoch": 18.929037520391518, "grad_norm": 0.24869906902313232, "learning_rate": 4.355924449104487e-07, "loss": 0.0553, "num_input_tokens_seen": 250377840, "step": 116035 }, { "epoch": 18.929853181076673, "grad_norm": 0.04456155747175217, "learning_rate": 4.349312209725148e-07, "loss": 0.1596, "num_input_tokens_seen": 250388528, "step": 116040 }, { "epoch": 18.930668841761825, "grad_norm": 0.06918278336524963, "learning_rate": 4.3427049487686225e-07, "loss": 0.0783, "num_input_tokens_seen": 250400656, "step": 116045 }, { "epoch": 18.93148450244698, "grad_norm": 0.14121297001838684, "learning_rate": 4.336102666368802e-07, "loss": 0.1338, "num_input_tokens_seen": 250411856, "step": 116050 }, { "epoch": 18.932300163132137, "grad_norm": 0.7293893694877625, "learning_rate": 4.329505362659525e-07, "loss": 0.1847, "num_input_tokens_seen": 250423632, "step": 116055 }, { "epoch": 18.933115823817293, "grad_norm": 1.8078532218933105, "learning_rate": 4.3229130377744896e-07, "loss": 0.1447, "num_input_tokens_seen": 250435760, "step": 116060 }, { "epoch": 18.93393148450245, "grad_norm": 0.06555253267288208, "learning_rate": 4.316325691847256e-07, "loss": 0.1711, "num_input_tokens_seen": 250447504, "step": 116065 }, { "epoch": 18.9347471451876, "grad_norm": 0.29039090871810913, "learning_rate": 4.309743325011384e-07, "loss": 0.1802, "num_input_tokens_seen": 250458448, "step": 116070 }, { "epoch": 18.935562805872756, "grad_norm": 0.67439866065979, "learning_rate": 4.303165937400211e-07, "loss": 0.0275, "num_input_tokens_seen": 250470096, "step": 116075 }, { "epoch": 18.936378466557912, "grad_norm": 0.236915722489357, "learning_rate": 4.2965935291471027e-07, "loss": 0.1596, "num_input_tokens_seen": 250479312, "step": 116080 }, { "epoch": 18.937194127243067, "grad_norm": 0.16876491904258728, "learning_rate": 4.290026100385203e-07, "loss": 0.1137, "num_input_tokens_seen": 250491536, "step": 116085 }, { "epoch": 18.938009787928223, "grad_norm": 0.09637635946273804, "learning_rate": 4.2834636512476265e-07, "loss": 0.0402, "num_input_tokens_seen": 250502128, "step": 116090 }, { "epoch": 18.938825448613375, "grad_norm": 0.015513195656239986, "learning_rate": 4.2769061818673795e-07, "loss": 0.0576, "num_input_tokens_seen": 250514128, "step": 116095 }, { "epoch": 18.93964110929853, "grad_norm": 0.03875856474041939, "learning_rate": 4.2703536923773544e-07, "loss": 0.0054, "num_input_tokens_seen": 250524784, "step": 116100 }, { "epoch": 18.940456769983687, "grad_norm": 0.031327955424785614, "learning_rate": 4.2638061829103073e-07, "loss": 0.092, "num_input_tokens_seen": 250535536, "step": 116105 }, { "epoch": 18.941272430668842, "grad_norm": 0.04433701932430267, "learning_rate": 4.2572636535990195e-07, "loss": 0.2485, "num_input_tokens_seen": 250547184, "step": 116110 }, { "epoch": 18.942088091353998, "grad_norm": 0.08932740986347198, "learning_rate": 4.2507261045759973e-07, "loss": 0.2401, "num_input_tokens_seen": 250558608, "step": 116115 }, { "epoch": 18.94290375203915, "grad_norm": 0.026964332908391953, "learning_rate": 4.2441935359738006e-07, "loss": 0.0189, "num_input_tokens_seen": 250569616, "step": 116120 }, { "epoch": 18.943719412724306, "grad_norm": 2.095966100692749, "learning_rate": 4.2376659479247404e-07, "loss": 0.2416, "num_input_tokens_seen": 250580432, "step": 116125 }, { "epoch": 18.94453507340946, "grad_norm": 0.05869228392839432, "learning_rate": 4.2311433405611554e-07, "loss": 0.0213, "num_input_tokens_seen": 250590896, "step": 116130 }, { "epoch": 18.945350734094617, "grad_norm": 0.8433217406272888, "learning_rate": 4.224625714015246e-07, "loss": 0.0437, "num_input_tokens_seen": 250602384, "step": 116135 }, { "epoch": 18.946166394779773, "grad_norm": 0.09214206039905548, "learning_rate": 4.2181130684190995e-07, "loss": 0.2731, "num_input_tokens_seen": 250613680, "step": 116140 }, { "epoch": 18.946982055464925, "grad_norm": 0.5156226754188538, "learning_rate": 4.211605403904667e-07, "loss": 0.1094, "num_input_tokens_seen": 250624240, "step": 116145 }, { "epoch": 18.94779771615008, "grad_norm": 0.10901381820440292, "learning_rate": 4.205102720603871e-07, "loss": 0.0702, "num_input_tokens_seen": 250634672, "step": 116150 }, { "epoch": 18.948613376835237, "grad_norm": 0.2612948715686798, "learning_rate": 4.198605018648466e-07, "loss": 0.072, "num_input_tokens_seen": 250645552, "step": 116155 }, { "epoch": 18.949429037520392, "grad_norm": 0.12682181596755981, "learning_rate": 4.1921122981701533e-07, "loss": 0.1025, "num_input_tokens_seen": 250657232, "step": 116160 }, { "epoch": 18.950244698205548, "grad_norm": 0.05361585691571236, "learning_rate": 4.1856245593005215e-07, "loss": 0.0454, "num_input_tokens_seen": 250667120, "step": 116165 }, { "epoch": 18.9510603588907, "grad_norm": 1.0937577486038208, "learning_rate": 4.1791418021710496e-07, "loss": 0.1311, "num_input_tokens_seen": 250677040, "step": 116170 }, { "epoch": 18.951876019575856, "grad_norm": 0.5268681645393372, "learning_rate": 4.1726640269131034e-07, "loss": 0.0139, "num_input_tokens_seen": 250688016, "step": 116175 }, { "epoch": 18.95269168026101, "grad_norm": 1.0219674110412598, "learning_rate": 4.1661912336579957e-07, "loss": 0.0248, "num_input_tokens_seen": 250698960, "step": 116180 }, { "epoch": 18.953507340946167, "grad_norm": 0.08444897830486298, "learning_rate": 4.1597234225368707e-07, "loss": 0.0235, "num_input_tokens_seen": 250709744, "step": 116185 }, { "epoch": 18.954323001631323, "grad_norm": 0.9973253607749939, "learning_rate": 4.1532605936808187e-07, "loss": 0.1344, "num_input_tokens_seen": 250721008, "step": 116190 }, { "epoch": 18.955138662316475, "grad_norm": 0.10032158344984055, "learning_rate": 4.146802747220818e-07, "loss": 0.0137, "num_input_tokens_seen": 250731568, "step": 116195 }, { "epoch": 18.95595432300163, "grad_norm": 2.75028920173645, "learning_rate": 4.140349883287764e-07, "loss": 0.2698, "num_input_tokens_seen": 250742288, "step": 116200 }, { "epoch": 18.956769983686787, "grad_norm": 0.7700361609458923, "learning_rate": 4.133902002012385e-07, "loss": 0.1272, "num_input_tokens_seen": 250754128, "step": 116205 }, { "epoch": 18.957585644371942, "grad_norm": 0.6893945336341858, "learning_rate": 4.1274591035254107e-07, "loss": 0.0406, "num_input_tokens_seen": 250766416, "step": 116210 }, { "epoch": 18.958401305057095, "grad_norm": 0.05686119571328163, "learning_rate": 4.121021187957375e-07, "loss": 0.1148, "num_input_tokens_seen": 250776496, "step": 116215 }, { "epoch": 18.95921696574225, "grad_norm": 0.18763020634651184, "learning_rate": 4.114588255438756e-07, "loss": 0.0308, "num_input_tokens_seen": 250786736, "step": 116220 }, { "epoch": 18.960032626427406, "grad_norm": 2.727081060409546, "learning_rate": 4.108160306099951e-07, "loss": 0.0987, "num_input_tokens_seen": 250796816, "step": 116225 }, { "epoch": 18.96084828711256, "grad_norm": 0.017170829698443413, "learning_rate": 4.1017373400712155e-07, "loss": 0.0479, "num_input_tokens_seen": 250806704, "step": 116230 }, { "epoch": 18.961663947797717, "grad_norm": 1.4360196590423584, "learning_rate": 4.0953193574826967e-07, "loss": 0.0884, "num_input_tokens_seen": 250817552, "step": 116235 }, { "epoch": 18.96247960848287, "grad_norm": 0.05450773984193802, "learning_rate": 4.0889063584644837e-07, "loss": 0.0251, "num_input_tokens_seen": 250828496, "step": 116240 }, { "epoch": 18.963295269168025, "grad_norm": 2.3119149208068848, "learning_rate": 4.082498343146557e-07, "loss": 0.1287, "num_input_tokens_seen": 250838352, "step": 116245 }, { "epoch": 18.96411092985318, "grad_norm": 0.03670786693692207, "learning_rate": 4.076095311658784e-07, "loss": 0.0707, "num_input_tokens_seen": 250850032, "step": 116250 }, { "epoch": 18.964926590538337, "grad_norm": 0.742719292640686, "learning_rate": 4.069697264130895e-07, "loss": 0.0875, "num_input_tokens_seen": 250860976, "step": 116255 }, { "epoch": 18.965742251223492, "grad_norm": 0.1631420999765396, "learning_rate": 4.0633042006925913e-07, "loss": 0.1179, "num_input_tokens_seen": 250871664, "step": 116260 }, { "epoch": 18.966557911908644, "grad_norm": 0.10429248958826065, "learning_rate": 4.0569161214734085e-07, "loss": 0.0078, "num_input_tokens_seen": 250882256, "step": 116265 }, { "epoch": 18.9673735725938, "grad_norm": 0.17344005405902863, "learning_rate": 4.050533026602826e-07, "loss": 0.0184, "num_input_tokens_seen": 250892656, "step": 116270 }, { "epoch": 18.968189233278956, "grad_norm": 0.9229426980018616, "learning_rate": 4.0441549162101853e-07, "loss": 0.0201, "num_input_tokens_seen": 250903600, "step": 116275 }, { "epoch": 18.96900489396411, "grad_norm": 1.8934049606323242, "learning_rate": 4.0377817904247996e-07, "loss": 0.1812, "num_input_tokens_seen": 250914480, "step": 116280 }, { "epoch": 18.969820554649267, "grad_norm": 0.036157526075839996, "learning_rate": 4.031413649375759e-07, "loss": 0.1183, "num_input_tokens_seen": 250925264, "step": 116285 }, { "epoch": 18.97063621533442, "grad_norm": 0.043205879628658295, "learning_rate": 4.025050493192156e-07, "loss": 0.0383, "num_input_tokens_seen": 250937456, "step": 116290 }, { "epoch": 18.971451876019575, "grad_norm": 1.8912558555603027, "learning_rate": 4.0186923220029704e-07, "loss": 0.0451, "num_input_tokens_seen": 250948016, "step": 116295 }, { "epoch": 18.97226753670473, "grad_norm": 0.03506806865334511, "learning_rate": 4.0123391359370155e-07, "loss": 0.0104, "num_input_tokens_seen": 250959536, "step": 116300 }, { "epoch": 18.973083197389887, "grad_norm": 0.08275596052408218, "learning_rate": 4.00599093512305e-07, "loss": 0.1849, "num_input_tokens_seen": 250969488, "step": 116305 }, { "epoch": 18.973898858075042, "grad_norm": 0.04489680007100105, "learning_rate": 3.999647719689775e-07, "loss": 0.1695, "num_input_tokens_seen": 250980176, "step": 116310 }, { "epoch": 18.974714518760194, "grad_norm": 0.12628352642059326, "learning_rate": 3.993309489765701e-07, "loss": 0.0361, "num_input_tokens_seen": 250990352, "step": 116315 }, { "epoch": 18.97553017944535, "grad_norm": 1.24312424659729, "learning_rate": 3.9869762454792793e-07, "loss": 0.1228, "num_input_tokens_seen": 251001424, "step": 116320 }, { "epoch": 18.976345840130506, "grad_norm": 0.05633251368999481, "learning_rate": 3.9806479869588796e-07, "loss": 0.0191, "num_input_tokens_seen": 251011472, "step": 116325 }, { "epoch": 18.97716150081566, "grad_norm": 1.9610785245895386, "learning_rate": 3.974324714332761e-07, "loss": 0.3198, "num_input_tokens_seen": 251022288, "step": 116330 }, { "epoch": 18.977977161500817, "grad_norm": 1.1195429563522339, "learning_rate": 3.968006427729043e-07, "loss": 0.0759, "num_input_tokens_seen": 251033072, "step": 116335 }, { "epoch": 18.97879282218597, "grad_norm": 0.03445887565612793, "learning_rate": 3.961693127275762e-07, "loss": 0.1382, "num_input_tokens_seen": 251042992, "step": 116340 }, { "epoch": 18.979608482871125, "grad_norm": 0.05015967786312103, "learning_rate": 3.9553848131009543e-07, "loss": 0.0651, "num_input_tokens_seen": 251054448, "step": 116345 }, { "epoch": 18.98042414355628, "grad_norm": 0.15659846365451813, "learning_rate": 3.949081485332351e-07, "loss": 0.1013, "num_input_tokens_seen": 251066832, "step": 116350 }, { "epoch": 18.981239804241437, "grad_norm": 1.4031120538711548, "learning_rate": 3.942783144097795e-07, "loss": 0.0354, "num_input_tokens_seen": 251078448, "step": 116355 }, { "epoch": 18.982055464926592, "grad_norm": 0.2519621253013611, "learning_rate": 3.9364897895248223e-07, "loss": 0.0727, "num_input_tokens_seen": 251089392, "step": 116360 }, { "epoch": 18.982871125611744, "grad_norm": 2.2629449367523193, "learning_rate": 3.930201421741109e-07, "loss": 0.0929, "num_input_tokens_seen": 251098416, "step": 116365 }, { "epoch": 18.9836867862969, "grad_norm": 0.3654005825519562, "learning_rate": 3.9239180408739693e-07, "loss": 0.1663, "num_input_tokens_seen": 251108688, "step": 116370 }, { "epoch": 18.984502446982056, "grad_norm": 0.13817442953586578, "learning_rate": 3.91763964705083e-07, "loss": 0.058, "num_input_tokens_seen": 251120432, "step": 116375 }, { "epoch": 18.98531810766721, "grad_norm": 1.1360524892807007, "learning_rate": 3.911366240398867e-07, "loss": 0.0272, "num_input_tokens_seen": 251130096, "step": 116380 }, { "epoch": 18.986133768352367, "grad_norm": 2.179107666015625, "learning_rate": 3.905097821045284e-07, "loss": 0.2027, "num_input_tokens_seen": 251139984, "step": 116385 }, { "epoch": 18.98694942903752, "grad_norm": 0.05336332693696022, "learning_rate": 3.8988343891170897e-07, "loss": 0.02, "num_input_tokens_seen": 251150352, "step": 116390 }, { "epoch": 18.987765089722675, "grad_norm": 0.4492463767528534, "learning_rate": 3.892575944741211e-07, "loss": 0.0456, "num_input_tokens_seen": 251160912, "step": 116395 }, { "epoch": 18.98858075040783, "grad_norm": 0.22820954024791718, "learning_rate": 3.8863224880444917e-07, "loss": 0.0208, "num_input_tokens_seen": 251172400, "step": 116400 }, { "epoch": 18.989396411092986, "grad_norm": 1.5307643413543701, "learning_rate": 3.880074019153662e-07, "loss": 0.1244, "num_input_tokens_seen": 251182288, "step": 116405 }, { "epoch": 18.99021207177814, "grad_norm": 0.09280452877283096, "learning_rate": 3.873830538195344e-07, "loss": 0.0777, "num_input_tokens_seen": 251192336, "step": 116410 }, { "epoch": 18.991027732463294, "grad_norm": 0.09740028530359268, "learning_rate": 3.8675920452961034e-07, "loss": 0.0311, "num_input_tokens_seen": 251202480, "step": 116415 }, { "epoch": 18.99184339314845, "grad_norm": 0.0652400329709053, "learning_rate": 3.861358540582338e-07, "loss": 0.0263, "num_input_tokens_seen": 251213328, "step": 116420 }, { "epoch": 18.992659053833606, "grad_norm": 0.12119600176811218, "learning_rate": 3.8551300241803924e-07, "loss": 0.0144, "num_input_tokens_seen": 251224240, "step": 116425 }, { "epoch": 18.99347471451876, "grad_norm": 0.05126061290502548, "learning_rate": 3.848906496216498e-07, "loss": 0.1016, "num_input_tokens_seen": 251236112, "step": 116430 }, { "epoch": 18.994290375203914, "grad_norm": 0.09732840210199356, "learning_rate": 3.8426879568168037e-07, "loss": 0.0924, "num_input_tokens_seen": 251246096, "step": 116435 }, { "epoch": 18.99510603588907, "grad_norm": 0.025127500295639038, "learning_rate": 3.8364744061072653e-07, "loss": 0.0236, "num_input_tokens_seen": 251257680, "step": 116440 }, { "epoch": 18.995921696574225, "grad_norm": 1.3634387254714966, "learning_rate": 3.830265844213865e-07, "loss": 0.2211, "num_input_tokens_seen": 251267696, "step": 116445 }, { "epoch": 18.99673735725938, "grad_norm": 0.05870484933257103, "learning_rate": 3.824062271262446e-07, "loss": 0.2207, "num_input_tokens_seen": 251278512, "step": 116450 }, { "epoch": 18.997553017944536, "grad_norm": 0.03658789023756981, "learning_rate": 3.817863687378659e-07, "loss": 0.0352, "num_input_tokens_seen": 251289520, "step": 116455 }, { "epoch": 18.99836867862969, "grad_norm": 0.11427640169858932, "learning_rate": 3.8116700926882075e-07, "loss": 0.0553, "num_input_tokens_seen": 251300528, "step": 116460 }, { "epoch": 18.999184339314844, "grad_norm": 0.025892114266753197, "learning_rate": 3.805481487316548e-07, "loss": 0.1281, "num_input_tokens_seen": 251310992, "step": 116465 }, { "epoch": 19.0, "grad_norm": 0.2693730294704437, "learning_rate": 3.7992978713891347e-07, "loss": 0.0331, "num_input_tokens_seen": 251320768, "step": 116470 }, { "epoch": 19.0, "eval_loss": 0.14396439492702484, "eval_runtime": 90.9103, "eval_samples_per_second": 29.975, "eval_steps_per_second": 7.502, "num_input_tokens_seen": 251320768, "step": 116470 }, { "epoch": 19.000815660685156, "grad_norm": 0.1442999243736267, "learning_rate": 3.793119245031257e-07, "loss": 0.1494, "num_input_tokens_seen": 251331200, "step": 116475 }, { "epoch": 19.00163132137031, "grad_norm": 0.031815387308597565, "learning_rate": 3.786945608368203e-07, "loss": 0.0864, "num_input_tokens_seen": 251342368, "step": 116480 }, { "epoch": 19.002446982055464, "grad_norm": 5.905822277069092, "learning_rate": 3.7807769615249845e-07, "loss": 0.1231, "num_input_tokens_seen": 251351840, "step": 116485 }, { "epoch": 19.00326264274062, "grad_norm": 0.10731855779886246, "learning_rate": 3.7746133046267227e-07, "loss": 0.2285, "num_input_tokens_seen": 251363296, "step": 116490 }, { "epoch": 19.004078303425775, "grad_norm": 0.04266731068491936, "learning_rate": 3.768454637798235e-07, "loss": 0.1091, "num_input_tokens_seen": 251373536, "step": 116495 }, { "epoch": 19.00489396411093, "grad_norm": 0.11004915833473206, "learning_rate": 3.762300961164422e-07, "loss": 0.1833, "num_input_tokens_seen": 251383776, "step": 116500 }, { "epoch": 19.005709624796086, "grad_norm": 0.033269111067056656, "learning_rate": 3.7561522748499055e-07, "loss": 0.1605, "num_input_tokens_seen": 251394208, "step": 116505 }, { "epoch": 19.00652528548124, "grad_norm": 2.1269545555114746, "learning_rate": 3.7500085789794195e-07, "loss": 0.1323, "num_input_tokens_seen": 251405504, "step": 116510 }, { "epoch": 19.007340946166394, "grad_norm": 0.09740648418664932, "learning_rate": 3.7438698736773357e-07, "loss": 0.1013, "num_input_tokens_seen": 251415648, "step": 116515 }, { "epoch": 19.00815660685155, "grad_norm": 1.5702238082885742, "learning_rate": 3.737736159068167e-07, "loss": 0.1466, "num_input_tokens_seen": 251426528, "step": 116520 }, { "epoch": 19.008972267536706, "grad_norm": 1.7494643926620483, "learning_rate": 3.731607435276119e-07, "loss": 0.0794, "num_input_tokens_seen": 251437632, "step": 116525 }, { "epoch": 19.00978792822186, "grad_norm": 0.4490993916988373, "learning_rate": 3.7254837024255085e-07, "loss": 0.0135, "num_input_tokens_seen": 251448576, "step": 116530 }, { "epoch": 19.010603588907014, "grad_norm": 0.4223712980747223, "learning_rate": 3.7193649606403757e-07, "loss": 0.0312, "num_input_tokens_seen": 251458656, "step": 116535 }, { "epoch": 19.01141924959217, "grad_norm": 1.105884313583374, "learning_rate": 3.7132512100447605e-07, "loss": 0.0826, "num_input_tokens_seen": 251469216, "step": 116540 }, { "epoch": 19.012234910277325, "grad_norm": 0.04170854389667511, "learning_rate": 3.707142450762535e-07, "loss": 0.105, "num_input_tokens_seen": 251479456, "step": 116545 }, { "epoch": 19.01305057096248, "grad_norm": 0.13956885039806366, "learning_rate": 3.7010386829175183e-07, "loss": 0.1296, "num_input_tokens_seen": 251490112, "step": 116550 }, { "epoch": 19.013866231647636, "grad_norm": 0.8003728985786438, "learning_rate": 3.6949399066334155e-07, "loss": 0.0568, "num_input_tokens_seen": 251500960, "step": 116555 }, { "epoch": 19.01468189233279, "grad_norm": 2.359325408935547, "learning_rate": 3.6888461220337955e-07, "loss": 0.2179, "num_input_tokens_seen": 251510752, "step": 116560 }, { "epoch": 19.015497553017944, "grad_norm": 1.9559993743896484, "learning_rate": 3.68275732924217e-07, "loss": 0.0629, "num_input_tokens_seen": 251521216, "step": 116565 }, { "epoch": 19.0163132137031, "grad_norm": 0.7522616982460022, "learning_rate": 3.6766735283819684e-07, "loss": 0.0544, "num_input_tokens_seen": 251531200, "step": 116570 }, { "epoch": 19.017128874388256, "grad_norm": 0.01816455088555813, "learning_rate": 3.6705947195764533e-07, "loss": 0.0044, "num_input_tokens_seen": 251542528, "step": 116575 }, { "epoch": 19.017944535073408, "grad_norm": 0.6390687227249146, "learning_rate": 3.664520902948776e-07, "loss": 0.026, "num_input_tokens_seen": 251553408, "step": 116580 }, { "epoch": 19.018760195758563, "grad_norm": 1.0698574781417847, "learning_rate": 3.658452078622171e-07, "loss": 0.0328, "num_input_tokens_seen": 251565568, "step": 116585 }, { "epoch": 19.01957585644372, "grad_norm": 0.07881877571344376, "learning_rate": 3.652388246719457e-07, "loss": 0.0874, "num_input_tokens_seen": 251576128, "step": 116590 }, { "epoch": 19.020391517128875, "grad_norm": 1.219201683998108, "learning_rate": 3.6463294073636743e-07, "loss": 0.0279, "num_input_tokens_seen": 251586592, "step": 116595 }, { "epoch": 19.02120717781403, "grad_norm": 1.1433345079421997, "learning_rate": 3.640275560677503e-07, "loss": 0.1222, "num_input_tokens_seen": 251598048, "step": 116600 }, { "epoch": 19.022022838499183, "grad_norm": 0.3301929831504822, "learning_rate": 3.6342267067837334e-07, "loss": 0.2071, "num_input_tokens_seen": 251609472, "step": 116605 }, { "epoch": 19.02283849918434, "grad_norm": 2.580487012863159, "learning_rate": 3.628182845804823e-07, "loss": 0.2412, "num_input_tokens_seen": 251620320, "step": 116610 }, { "epoch": 19.023654159869494, "grad_norm": 1.9176256656646729, "learning_rate": 3.6221439778633957e-07, "loss": 0.1621, "num_input_tokens_seen": 251631648, "step": 116615 }, { "epoch": 19.02446982055465, "grad_norm": 1.4857875108718872, "learning_rate": 3.616110103081716e-07, "loss": 0.1778, "num_input_tokens_seen": 251642624, "step": 116620 }, { "epoch": 19.025285481239806, "grad_norm": 0.09246860444545746, "learning_rate": 3.6100812215821844e-07, "loss": 0.0098, "num_input_tokens_seen": 251653920, "step": 116625 }, { "epoch": 19.026101141924958, "grad_norm": 1.7909023761749268, "learning_rate": 3.604057333486871e-07, "loss": 0.0554, "num_input_tokens_seen": 251664800, "step": 116630 }, { "epoch": 19.026916802610113, "grad_norm": 0.032732151448726654, "learning_rate": 3.5980384389179556e-07, "loss": 0.0751, "num_input_tokens_seen": 251676000, "step": 116635 }, { "epoch": 19.02773246329527, "grad_norm": 1.9019149541854858, "learning_rate": 3.5920245379973404e-07, "loss": 0.1048, "num_input_tokens_seen": 251686432, "step": 116640 }, { "epoch": 19.028548123980425, "grad_norm": 0.16870787739753723, "learning_rate": 3.5860156308469837e-07, "loss": 0.1335, "num_input_tokens_seen": 251698048, "step": 116645 }, { "epoch": 19.02936378466558, "grad_norm": 0.04478928819298744, "learning_rate": 3.580011717588566e-07, "loss": 0.1393, "num_input_tokens_seen": 251707776, "step": 116650 }, { "epoch": 19.030179445350733, "grad_norm": 0.6564226746559143, "learning_rate": 3.574012798343851e-07, "loss": 0.0199, "num_input_tokens_seen": 251719328, "step": 116655 }, { "epoch": 19.03099510603589, "grad_norm": 0.11249345541000366, "learning_rate": 3.568018873234352e-07, "loss": 0.0808, "num_input_tokens_seen": 251731136, "step": 116660 }, { "epoch": 19.031810766721044, "grad_norm": 2.452589273452759, "learning_rate": 3.562029942381584e-07, "loss": 0.1685, "num_input_tokens_seen": 251742528, "step": 116665 }, { "epoch": 19.0326264274062, "grad_norm": 0.08236692100763321, "learning_rate": 3.5560460059069213e-07, "loss": 0.0554, "num_input_tokens_seen": 251751392, "step": 116670 }, { "epoch": 19.033442088091356, "grad_norm": 1.8392579555511475, "learning_rate": 3.5500670639316005e-07, "loss": 0.05, "num_input_tokens_seen": 251762112, "step": 116675 }, { "epoch": 19.034257748776508, "grad_norm": 0.039361372590065, "learning_rate": 3.5440931165768296e-07, "loss": 0.0711, "num_input_tokens_seen": 251772576, "step": 116680 }, { "epoch": 19.035073409461663, "grad_norm": 0.1594410091638565, "learning_rate": 3.5381241639636795e-07, "loss": 0.1079, "num_input_tokens_seen": 251784448, "step": 116685 }, { "epoch": 19.03588907014682, "grad_norm": 0.13299790024757385, "learning_rate": 3.532160206213081e-07, "loss": 0.1281, "num_input_tokens_seen": 251794368, "step": 116690 }, { "epoch": 19.036704730831975, "grad_norm": 0.11286459118127823, "learning_rate": 3.526201243445909e-07, "loss": 0.0954, "num_input_tokens_seen": 251805824, "step": 116695 }, { "epoch": 19.03752039151713, "grad_norm": 0.18070808053016663, "learning_rate": 3.5202472757829564e-07, "loss": 0.1266, "num_input_tokens_seen": 251815424, "step": 116700 }, { "epoch": 19.038336052202283, "grad_norm": 1.1944681406021118, "learning_rate": 3.514298303344904e-07, "loss": 0.0757, "num_input_tokens_seen": 251826144, "step": 116705 }, { "epoch": 19.03915171288744, "grad_norm": 2.1634860038757324, "learning_rate": 3.508354326252239e-07, "loss": 0.1226, "num_input_tokens_seen": 251837792, "step": 116710 }, { "epoch": 19.039967373572594, "grad_norm": 0.06013872101902962, "learning_rate": 3.5024153446255034e-07, "loss": 0.0727, "num_input_tokens_seen": 251848640, "step": 116715 }, { "epoch": 19.04078303425775, "grad_norm": 0.1791391223669052, "learning_rate": 3.4964813585850185e-07, "loss": 0.0881, "num_input_tokens_seen": 251860960, "step": 116720 }, { "epoch": 19.041598694942905, "grad_norm": 0.5516982078552246, "learning_rate": 3.490552368251021e-07, "loss": 0.1844, "num_input_tokens_seen": 251871872, "step": 116725 }, { "epoch": 19.042414355628058, "grad_norm": 0.4018259644508362, "learning_rate": 3.484628373743748e-07, "loss": 0.1919, "num_input_tokens_seen": 251883072, "step": 116730 }, { "epoch": 19.043230016313213, "grad_norm": 0.370054692029953, "learning_rate": 3.478709375183159e-07, "loss": 0.0353, "num_input_tokens_seen": 251892864, "step": 116735 }, { "epoch": 19.04404567699837, "grad_norm": 0.102371945977211, "learning_rate": 3.472795372689297e-07, "loss": 0.1107, "num_input_tokens_seen": 251903424, "step": 116740 }, { "epoch": 19.044861337683525, "grad_norm": 0.4750288128852844, "learning_rate": 3.466886366381955e-07, "loss": 0.1735, "num_input_tokens_seen": 251914048, "step": 116745 }, { "epoch": 19.045676998368677, "grad_norm": 0.11912184953689575, "learning_rate": 3.4609823563809264e-07, "loss": 0.0255, "num_input_tokens_seen": 251924448, "step": 116750 }, { "epoch": 19.046492659053833, "grad_norm": 0.13551323115825653, "learning_rate": 3.455083342805837e-07, "loss": 0.1079, "num_input_tokens_seen": 251935296, "step": 116755 }, { "epoch": 19.04730831973899, "grad_norm": 0.06203719973564148, "learning_rate": 3.4491893257762584e-07, "loss": 0.0329, "num_input_tokens_seen": 251946752, "step": 116760 }, { "epoch": 19.048123980424144, "grad_norm": 0.11689861863851547, "learning_rate": 3.443300305411595e-07, "loss": 0.0352, "num_input_tokens_seen": 251958208, "step": 116765 }, { "epoch": 19.0489396411093, "grad_norm": 0.018095387145876884, "learning_rate": 3.4374162818312795e-07, "loss": 0.0962, "num_input_tokens_seen": 251968800, "step": 116770 }, { "epoch": 19.049755301794452, "grad_norm": 0.25350433588027954, "learning_rate": 3.4315372551544656e-07, "loss": 0.1353, "num_input_tokens_seen": 251980032, "step": 116775 }, { "epoch": 19.050570962479608, "grad_norm": 0.03769192844629288, "learning_rate": 3.425663225500392e-07, "loss": 0.1158, "num_input_tokens_seen": 251991296, "step": 116780 }, { "epoch": 19.051386623164763, "grad_norm": 1.0505133867263794, "learning_rate": 3.4197941929880184e-07, "loss": 0.1718, "num_input_tokens_seen": 252002784, "step": 116785 }, { "epoch": 19.05220228384992, "grad_norm": 0.02231842838227749, "learning_rate": 3.4139301577363335e-07, "loss": 0.0174, "num_input_tokens_seen": 252014112, "step": 116790 }, { "epoch": 19.053017944535075, "grad_norm": 0.09509560465812683, "learning_rate": 3.4080711198641867e-07, "loss": 0.0299, "num_input_tokens_seen": 252025568, "step": 116795 }, { "epoch": 19.053833605220227, "grad_norm": 0.1026022881269455, "learning_rate": 3.4022170794902885e-07, "loss": 0.1244, "num_input_tokens_seen": 252037408, "step": 116800 }, { "epoch": 19.054649265905383, "grad_norm": 1.9723273515701294, "learning_rate": 3.396368036733294e-07, "loss": 0.1349, "num_input_tokens_seen": 252048448, "step": 116805 }, { "epoch": 19.05546492659054, "grad_norm": 0.11001644283533096, "learning_rate": 3.3905239917117194e-07, "loss": 0.1533, "num_input_tokens_seen": 252058560, "step": 116810 }, { "epoch": 19.056280587275694, "grad_norm": 0.25405436754226685, "learning_rate": 3.384684944544053e-07, "loss": 0.0944, "num_input_tokens_seen": 252069216, "step": 116815 }, { "epoch": 19.05709624796085, "grad_norm": 0.3638489842414856, "learning_rate": 3.37885089534859e-07, "loss": 0.0955, "num_input_tokens_seen": 252077920, "step": 116820 }, { "epoch": 19.057911908646002, "grad_norm": 2.314784049987793, "learning_rate": 3.373021844243568e-07, "loss": 0.1221, "num_input_tokens_seen": 252088320, "step": 116825 }, { "epoch": 19.058727569331158, "grad_norm": 0.04312445968389511, "learning_rate": 3.367197791347143e-07, "loss": 0.0416, "num_input_tokens_seen": 252098368, "step": 116830 }, { "epoch": 19.059543230016313, "grad_norm": 0.34288516640663147, "learning_rate": 3.3613787367773044e-07, "loss": 0.1651, "num_input_tokens_seen": 252109440, "step": 116835 }, { "epoch": 19.06035889070147, "grad_norm": 0.07775752991437912, "learning_rate": 3.3555646806520403e-07, "loss": 0.0156, "num_input_tokens_seen": 252118816, "step": 116840 }, { "epoch": 19.061174551386625, "grad_norm": 1.9190961122512817, "learning_rate": 3.349755623089118e-07, "loss": 0.0903, "num_input_tokens_seen": 252130560, "step": 116845 }, { "epoch": 19.061990212071777, "grad_norm": 0.22430942952632904, "learning_rate": 3.3439515642063326e-07, "loss": 0.2096, "num_input_tokens_seen": 252140992, "step": 116850 }, { "epoch": 19.062805872756933, "grad_norm": 2.244880199432373, "learning_rate": 3.3381525041212556e-07, "loss": 0.1493, "num_input_tokens_seen": 252151264, "step": 116855 }, { "epoch": 19.063621533442088, "grad_norm": 2.8618249893188477, "learning_rate": 3.3323584429514053e-07, "loss": 0.1275, "num_input_tokens_seen": 252161216, "step": 116860 }, { "epoch": 19.064437194127244, "grad_norm": 0.1268618255853653, "learning_rate": 3.3265693808142705e-07, "loss": 0.1157, "num_input_tokens_seen": 252172192, "step": 116865 }, { "epoch": 19.0652528548124, "grad_norm": 0.910424530506134, "learning_rate": 3.3207853178270906e-07, "loss": 0.1065, "num_input_tokens_seen": 252182400, "step": 116870 }, { "epoch": 19.06606851549755, "grad_norm": 0.49927327036857605, "learning_rate": 3.315006254107161e-07, "loss": 0.0539, "num_input_tokens_seen": 252192352, "step": 116875 }, { "epoch": 19.066884176182707, "grad_norm": 0.10761703550815582, "learning_rate": 3.3092321897715825e-07, "loss": 0.2226, "num_input_tokens_seen": 252202080, "step": 116880 }, { "epoch": 19.067699836867863, "grad_norm": 0.08670642226934433, "learning_rate": 3.303463124937345e-07, "loss": 0.1147, "num_input_tokens_seen": 252213312, "step": 116885 }, { "epoch": 19.06851549755302, "grad_norm": 0.1534726768732071, "learning_rate": 3.2976990597213817e-07, "loss": 0.013, "num_input_tokens_seen": 252224512, "step": 116890 }, { "epoch": 19.069331158238175, "grad_norm": 2.0294976234436035, "learning_rate": 3.291939994240517e-07, "loss": 0.2035, "num_input_tokens_seen": 252236384, "step": 116895 }, { "epoch": 19.070146818923327, "grad_norm": 0.24025534093379974, "learning_rate": 3.286185928611463e-07, "loss": 0.0277, "num_input_tokens_seen": 252247168, "step": 116900 }, { "epoch": 19.070962479608482, "grad_norm": 1.1599817276000977, "learning_rate": 3.2804368629508473e-07, "loss": 0.0513, "num_input_tokens_seen": 252257152, "step": 116905 }, { "epoch": 19.071778140293638, "grad_norm": 0.3977825343608856, "learning_rate": 3.274692797375134e-07, "loss": 0.0335, "num_input_tokens_seen": 252266624, "step": 116910 }, { "epoch": 19.072593800978794, "grad_norm": 0.5338597893714905, "learning_rate": 3.2689537320007844e-07, "loss": 0.0441, "num_input_tokens_seen": 252278464, "step": 116915 }, { "epoch": 19.07340946166395, "grad_norm": 0.10149301588535309, "learning_rate": 3.2632196669440673e-07, "loss": 0.0529, "num_input_tokens_seen": 252291104, "step": 116920 }, { "epoch": 19.0742251223491, "grad_norm": 2.2892324924468994, "learning_rate": 3.257490602321223e-07, "loss": 0.2517, "num_input_tokens_seen": 252302368, "step": 116925 }, { "epoch": 19.075040783034257, "grad_norm": 1.8788585662841797, "learning_rate": 3.251766538248352e-07, "loss": 0.226, "num_input_tokens_seen": 252313504, "step": 116930 }, { "epoch": 19.075856443719413, "grad_norm": 0.05760541930794716, "learning_rate": 3.246047474841446e-07, "loss": 0.1759, "num_input_tokens_seen": 252324896, "step": 116935 }, { "epoch": 19.07667210440457, "grad_norm": 1.3699430227279663, "learning_rate": 3.24033341221644e-07, "loss": 0.1148, "num_input_tokens_seen": 252334976, "step": 116940 }, { "epoch": 19.07748776508972, "grad_norm": 2.8691458702087402, "learning_rate": 3.2346243504890737e-07, "loss": 0.379, "num_input_tokens_seen": 252344704, "step": 116945 }, { "epoch": 19.078303425774877, "grad_norm": 3.510800361633301, "learning_rate": 3.228920289775117e-07, "loss": 0.0599, "num_input_tokens_seen": 252355072, "step": 116950 }, { "epoch": 19.079119086460032, "grad_norm": 2.0819849967956543, "learning_rate": 3.223221230190143e-07, "loss": 0.3413, "num_input_tokens_seen": 252366432, "step": 116955 }, { "epoch": 19.079934747145188, "grad_norm": 0.18896661698818207, "learning_rate": 3.2175271718496435e-07, "loss": 0.1984, "num_input_tokens_seen": 252377792, "step": 116960 }, { "epoch": 19.080750407830344, "grad_norm": 0.02954123541712761, "learning_rate": 3.2118381148690257e-07, "loss": 0.0329, "num_input_tokens_seen": 252388864, "step": 116965 }, { "epoch": 19.081566068515496, "grad_norm": 0.5301835536956787, "learning_rate": 3.206154059363559e-07, "loss": 0.103, "num_input_tokens_seen": 252399296, "step": 116970 }, { "epoch": 19.08238172920065, "grad_norm": 1.5663866996765137, "learning_rate": 3.200475005448483e-07, "loss": 0.1407, "num_input_tokens_seen": 252409856, "step": 116975 }, { "epoch": 19.083197389885807, "grad_norm": 0.0995897501707077, "learning_rate": 3.1948009532388467e-07, "loss": 0.0369, "num_input_tokens_seen": 252421568, "step": 116980 }, { "epoch": 19.084013050570963, "grad_norm": 2.6384830474853516, "learning_rate": 3.189131902849696e-07, "loss": 0.3025, "num_input_tokens_seen": 252433568, "step": 116985 }, { "epoch": 19.08482871125612, "grad_norm": 0.6008479595184326, "learning_rate": 3.183467854395855e-07, "loss": 0.0284, "num_input_tokens_seen": 252442528, "step": 116990 }, { "epoch": 19.08564437194127, "grad_norm": 0.11570762097835541, "learning_rate": 3.1778088079921777e-07, "loss": 0.0951, "num_input_tokens_seen": 252452448, "step": 116995 }, { "epoch": 19.086460032626427, "grad_norm": 0.0787283182144165, "learning_rate": 3.1721547637532933e-07, "loss": 0.1112, "num_input_tokens_seen": 252463296, "step": 117000 }, { "epoch": 19.087275693311582, "grad_norm": 1.4645615816116333, "learning_rate": 3.166505721793861e-07, "loss": 0.1628, "num_input_tokens_seen": 252471840, "step": 117005 }, { "epoch": 19.088091353996738, "grad_norm": 0.12784206867218018, "learning_rate": 3.160861682228261e-07, "loss": 0.0124, "num_input_tokens_seen": 252482816, "step": 117010 }, { "epoch": 19.088907014681894, "grad_norm": 0.2663303017616272, "learning_rate": 3.1552226451709854e-07, "loss": 0.0789, "num_input_tokens_seen": 252493696, "step": 117015 }, { "epoch": 19.089722675367046, "grad_norm": 2.463198661804199, "learning_rate": 3.149588610736248e-07, "loss": 0.1974, "num_input_tokens_seen": 252504768, "step": 117020 }, { "epoch": 19.0905383360522, "grad_norm": 1.7122653722763062, "learning_rate": 3.143959579038236e-07, "loss": 0.0713, "num_input_tokens_seen": 252515712, "step": 117025 }, { "epoch": 19.091353996737357, "grad_norm": 0.1652657836675644, "learning_rate": 3.138335550191052e-07, "loss": 0.0803, "num_input_tokens_seen": 252526880, "step": 117030 }, { "epoch": 19.092169657422513, "grad_norm": 0.13007883727550507, "learning_rate": 3.132716524308688e-07, "loss": 0.1407, "num_input_tokens_seen": 252537728, "step": 117035 }, { "epoch": 19.09298531810767, "grad_norm": 1.0374126434326172, "learning_rate": 3.1271025015049696e-07, "loss": 0.1833, "num_input_tokens_seen": 252549568, "step": 117040 }, { "epoch": 19.09380097879282, "grad_norm": 0.11119984090328217, "learning_rate": 3.1214934818937223e-07, "loss": 0.2306, "num_input_tokens_seen": 252560864, "step": 117045 }, { "epoch": 19.094616639477977, "grad_norm": 2.0893070697784424, "learning_rate": 3.115889465588578e-07, "loss": 0.1194, "num_input_tokens_seen": 252570816, "step": 117050 }, { "epoch": 19.095432300163132, "grad_norm": 0.1489880383014679, "learning_rate": 3.1102904527031127e-07, "loss": 0.1003, "num_input_tokens_seen": 252581280, "step": 117055 }, { "epoch": 19.096247960848288, "grad_norm": 0.1826489418745041, "learning_rate": 3.104696443350846e-07, "loss": 0.1218, "num_input_tokens_seen": 252592064, "step": 117060 }, { "epoch": 19.097063621533444, "grad_norm": 0.040915753692388535, "learning_rate": 3.0991074376451044e-07, "loss": 0.1095, "num_input_tokens_seen": 252602400, "step": 117065 }, { "epoch": 19.097879282218596, "grad_norm": 1.8004086017608643, "learning_rate": 3.093523435699158e-07, "loss": 0.2332, "num_input_tokens_seen": 252613312, "step": 117070 }, { "epoch": 19.09869494290375, "grad_norm": 0.6441082954406738, "learning_rate": 3.0879444376261947e-07, "loss": 0.1159, "num_input_tokens_seen": 252623872, "step": 117075 }, { "epoch": 19.099510603588907, "grad_norm": 0.09402953833341599, "learning_rate": 3.082370443539262e-07, "loss": 0.0432, "num_input_tokens_seen": 252635168, "step": 117080 }, { "epoch": 19.100326264274063, "grad_norm": 1.2720807790756226, "learning_rate": 3.076801453551298e-07, "loss": 0.1714, "num_input_tokens_seen": 252646080, "step": 117085 }, { "epoch": 19.10114192495922, "grad_norm": 1.335307002067566, "learning_rate": 3.0712374677752674e-07, "loss": 0.0403, "num_input_tokens_seen": 252656512, "step": 117090 }, { "epoch": 19.10195758564437, "grad_norm": 0.3826900124549866, "learning_rate": 3.065678486323803e-07, "loss": 0.0339, "num_input_tokens_seen": 252667776, "step": 117095 }, { "epoch": 19.102773246329527, "grad_norm": 0.11124753206968307, "learning_rate": 3.0601245093096477e-07, "loss": 0.2449, "num_input_tokens_seen": 252678464, "step": 117100 }, { "epoch": 19.103588907014682, "grad_norm": 1.326790452003479, "learning_rate": 3.054575536845322e-07, "loss": 0.0735, "num_input_tokens_seen": 252689440, "step": 117105 }, { "epoch": 19.104404567699838, "grad_norm": 1.1727687120437622, "learning_rate": 3.04903156904332e-07, "loss": 0.0594, "num_input_tokens_seen": 252700960, "step": 117110 }, { "epoch": 19.10522022838499, "grad_norm": 1.8392032384872437, "learning_rate": 3.043492606015941e-07, "loss": 0.2228, "num_input_tokens_seen": 252712000, "step": 117115 }, { "epoch": 19.106035889070146, "grad_norm": 1.5327937602996826, "learning_rate": 3.037958647875483e-07, "loss": 0.1699, "num_input_tokens_seen": 252722880, "step": 117120 }, { "epoch": 19.1068515497553, "grad_norm": 0.4118456244468689, "learning_rate": 3.032429694734079e-07, "loss": 0.0517, "num_input_tokens_seen": 252733248, "step": 117125 }, { "epoch": 19.107667210440457, "grad_norm": 0.0968184843659401, "learning_rate": 3.0269057467038066e-07, "loss": 0.0307, "num_input_tokens_seen": 252743872, "step": 117130 }, { "epoch": 19.108482871125613, "grad_norm": 2.366999387741089, "learning_rate": 3.0213868038965756e-07, "loss": 0.0856, "num_input_tokens_seen": 252754336, "step": 117135 }, { "epoch": 19.109298531810765, "grad_norm": 0.25440171360969543, "learning_rate": 3.0158728664242964e-07, "loss": 0.0182, "num_input_tokens_seen": 252765120, "step": 117140 }, { "epoch": 19.11011419249592, "grad_norm": 1.9813365936279297, "learning_rate": 3.01036393439863e-07, "loss": 0.1449, "num_input_tokens_seen": 252776832, "step": 117145 }, { "epoch": 19.110929853181077, "grad_norm": 0.05818299204111099, "learning_rate": 3.0048600079312926e-07, "loss": 0.0536, "num_input_tokens_seen": 252787040, "step": 117150 }, { "epoch": 19.111745513866232, "grad_norm": 1.2877546548843384, "learning_rate": 2.9993610871337786e-07, "loss": 0.0404, "num_input_tokens_seen": 252797472, "step": 117155 }, { "epoch": 19.112561174551388, "grad_norm": 0.4189968407154083, "learning_rate": 2.9938671721175536e-07, "loss": 0.0167, "num_input_tokens_seen": 252808768, "step": 117160 }, { "epoch": 19.11337683523654, "grad_norm": 2.6852054595947266, "learning_rate": 2.988378262993974e-07, "loss": 0.0691, "num_input_tokens_seen": 252819584, "step": 117165 }, { "epoch": 19.114192495921696, "grad_norm": 0.07083602249622345, "learning_rate": 2.982894359874283e-07, "loss": 0.1444, "num_input_tokens_seen": 252830528, "step": 117170 }, { "epoch": 19.11500815660685, "grad_norm": 0.19592943787574768, "learning_rate": 2.9774154628695874e-07, "loss": 0.0269, "num_input_tokens_seen": 252839936, "step": 117175 }, { "epoch": 19.115823817292007, "grad_norm": 1.18824303150177, "learning_rate": 2.971941572090936e-07, "loss": 0.2189, "num_input_tokens_seen": 252850848, "step": 117180 }, { "epoch": 19.116639477977163, "grad_norm": 0.11659998446702957, "learning_rate": 2.9664726876492686e-07, "loss": 0.0532, "num_input_tokens_seen": 252861408, "step": 117185 }, { "epoch": 19.117455138662315, "grad_norm": 1.4694279432296753, "learning_rate": 2.9610088096554135e-07, "loss": 0.0622, "num_input_tokens_seen": 252871456, "step": 117190 }, { "epoch": 19.11827079934747, "grad_norm": 2.0083048343658447, "learning_rate": 2.955549938220087e-07, "loss": 0.1296, "num_input_tokens_seen": 252883200, "step": 117195 }, { "epoch": 19.119086460032626, "grad_norm": 0.838312566280365, "learning_rate": 2.9500960734539785e-07, "loss": 0.0136, "num_input_tokens_seen": 252892896, "step": 117200 }, { "epoch": 19.119902120717782, "grad_norm": 0.15084055066108704, "learning_rate": 2.944647215467555e-07, "loss": 0.1596, "num_input_tokens_seen": 252903648, "step": 117205 }, { "epoch": 19.120717781402938, "grad_norm": 0.18465475738048553, "learning_rate": 2.939203364371229e-07, "loss": 0.0551, "num_input_tokens_seen": 252914880, "step": 117210 }, { "epoch": 19.12153344208809, "grad_norm": 0.09525101631879807, "learning_rate": 2.933764520275439e-07, "loss": 0.1027, "num_input_tokens_seen": 252925056, "step": 117215 }, { "epoch": 19.122349102773246, "grad_norm": 2.5854337215423584, "learning_rate": 2.9283306832902923e-07, "loss": 0.0754, "num_input_tokens_seen": 252934624, "step": 117220 }, { "epoch": 19.1231647634584, "grad_norm": 0.18205270171165466, "learning_rate": 2.9229018535259775e-07, "loss": 0.0695, "num_input_tokens_seen": 252944800, "step": 117225 }, { "epoch": 19.123980424143557, "grad_norm": 0.048301421105861664, "learning_rate": 2.917478031092463e-07, "loss": 0.0204, "num_input_tokens_seen": 252955200, "step": 117230 }, { "epoch": 19.124796084828713, "grad_norm": 0.2843913733959198, "learning_rate": 2.912059216099744e-07, "loss": 0.288, "num_input_tokens_seen": 252966752, "step": 117235 }, { "epoch": 19.125611745513865, "grad_norm": 2.534383535385132, "learning_rate": 2.9066454086575655e-07, "loss": 0.1661, "num_input_tokens_seen": 252977472, "step": 117240 }, { "epoch": 19.12642740619902, "grad_norm": 0.12414009124040604, "learning_rate": 2.9012366088757295e-07, "loss": 0.11, "num_input_tokens_seen": 252988192, "step": 117245 }, { "epoch": 19.127243066884176, "grad_norm": 0.23424440622329712, "learning_rate": 2.8958328168637305e-07, "loss": 0.0187, "num_input_tokens_seen": 252999488, "step": 117250 }, { "epoch": 19.128058727569332, "grad_norm": 0.07694140821695328, "learning_rate": 2.890434032731204e-07, "loss": 0.1944, "num_input_tokens_seen": 253011104, "step": 117255 }, { "epoch": 19.128874388254488, "grad_norm": 0.9113555550575256, "learning_rate": 2.885040256587479e-07, "loss": 0.0314, "num_input_tokens_seen": 253021888, "step": 117260 }, { "epoch": 19.12969004893964, "grad_norm": 0.6580409407615662, "learning_rate": 2.87965148854194e-07, "loss": 0.1161, "num_input_tokens_seen": 253032352, "step": 117265 }, { "epoch": 19.130505709624796, "grad_norm": 2.8260061740875244, "learning_rate": 2.874267728703722e-07, "loss": 0.0438, "num_input_tokens_seen": 253043744, "step": 117270 }, { "epoch": 19.13132137030995, "grad_norm": 0.09613692760467529, "learning_rate": 2.868888977182016e-07, "loss": 0.2843, "num_input_tokens_seen": 253053856, "step": 117275 }, { "epoch": 19.132137030995107, "grad_norm": 0.8147186040878296, "learning_rate": 2.863515234085734e-07, "loss": 0.0159, "num_input_tokens_seen": 253064288, "step": 117280 }, { "epoch": 19.13295269168026, "grad_norm": 0.03955058380961418, "learning_rate": 2.858146499523845e-07, "loss": 0.0694, "num_input_tokens_seen": 253075104, "step": 117285 }, { "epoch": 19.133768352365415, "grad_norm": 0.055036500096321106, "learning_rate": 2.8527827736051503e-07, "loss": 0.0169, "num_input_tokens_seen": 253085728, "step": 117290 }, { "epoch": 19.13458401305057, "grad_norm": 2.837233543395996, "learning_rate": 2.847424056438341e-07, "loss": 0.1411, "num_input_tokens_seen": 253094816, "step": 117295 }, { "epoch": 19.135399673735726, "grad_norm": 0.4169272184371948, "learning_rate": 2.8420703481319977e-07, "loss": 0.0761, "num_input_tokens_seen": 253105600, "step": 117300 }, { "epoch": 19.136215334420882, "grad_norm": 1.1413158178329468, "learning_rate": 2.8367216487946437e-07, "loss": 0.1421, "num_input_tokens_seen": 253116352, "step": 117305 }, { "epoch": 19.137030995106034, "grad_norm": 1.2844094038009644, "learning_rate": 2.831377958534692e-07, "loss": 0.2261, "num_input_tokens_seen": 253128448, "step": 117310 }, { "epoch": 19.13784665579119, "grad_norm": 0.0920163244009018, "learning_rate": 2.8260392774604185e-07, "loss": 0.2684, "num_input_tokens_seen": 253138208, "step": 117315 }, { "epoch": 19.138662316476346, "grad_norm": 1.7585124969482422, "learning_rate": 2.820705605680013e-07, "loss": 0.1968, "num_input_tokens_seen": 253148928, "step": 117320 }, { "epoch": 19.1394779771615, "grad_norm": 0.36272019147872925, "learning_rate": 2.815376943301584e-07, "loss": 0.06, "num_input_tokens_seen": 253160480, "step": 117325 }, { "epoch": 19.140293637846657, "grad_norm": 2.2087535858154297, "learning_rate": 2.810053290433101e-07, "loss": 0.076, "num_input_tokens_seen": 253171456, "step": 117330 }, { "epoch": 19.14110929853181, "grad_norm": 0.8495621085166931, "learning_rate": 2.8047346471824777e-07, "loss": 0.0684, "num_input_tokens_seen": 253181824, "step": 117335 }, { "epoch": 19.141924959216965, "grad_norm": 0.07264474779367447, "learning_rate": 2.799421013657516e-07, "loss": 0.018, "num_input_tokens_seen": 253192512, "step": 117340 }, { "epoch": 19.14274061990212, "grad_norm": 0.2638968825340271, "learning_rate": 2.794112389965881e-07, "loss": 0.0625, "num_input_tokens_seen": 253202816, "step": 117345 }, { "epoch": 19.143556280587276, "grad_norm": 0.21730320155620575, "learning_rate": 2.7888087762151516e-07, "loss": 0.0087, "num_input_tokens_seen": 253214560, "step": 117350 }, { "epoch": 19.144371941272432, "grad_norm": 0.19242314994335175, "learning_rate": 2.783510172512799e-07, "loss": 0.1352, "num_input_tokens_seen": 253224704, "step": 117355 }, { "epoch": 19.145187601957584, "grad_norm": 2.0451512336730957, "learning_rate": 2.778216578966264e-07, "loss": 0.1834, "num_input_tokens_seen": 253234176, "step": 117360 }, { "epoch": 19.14600326264274, "grad_norm": 2.713585138320923, "learning_rate": 2.772927995682767e-07, "loss": 0.1992, "num_input_tokens_seen": 253244544, "step": 117365 }, { "epoch": 19.146818923327896, "grad_norm": 1.8318281173706055, "learning_rate": 2.7676444227695274e-07, "loss": 0.202, "num_input_tokens_seen": 253254944, "step": 117370 }, { "epoch": 19.14763458401305, "grad_norm": 0.10511509329080582, "learning_rate": 2.762365860333599e-07, "loss": 0.0656, "num_input_tokens_seen": 253266080, "step": 117375 }, { "epoch": 19.148450244698207, "grad_norm": 0.6269886493682861, "learning_rate": 2.7570923084820067e-07, "loss": 0.105, "num_input_tokens_seen": 253276736, "step": 117380 }, { "epoch": 19.14926590538336, "grad_norm": 0.272381454706192, "learning_rate": 2.751823767321554e-07, "loss": 0.1174, "num_input_tokens_seen": 253287072, "step": 117385 }, { "epoch": 19.150081566068515, "grad_norm": 0.15293313562870026, "learning_rate": 2.7465602369590726e-07, "loss": 0.1991, "num_input_tokens_seen": 253297280, "step": 117390 }, { "epoch": 19.15089722675367, "grad_norm": 0.10020040720701218, "learning_rate": 2.7413017175011715e-07, "loss": 0.1706, "num_input_tokens_seen": 253308032, "step": 117395 }, { "epoch": 19.151712887438826, "grad_norm": 2.4678120613098145, "learning_rate": 2.736048209054487e-07, "loss": 0.1403, "num_input_tokens_seen": 253318080, "step": 117400 }, { "epoch": 19.152528548123982, "grad_norm": 0.046212490648031235, "learning_rate": 2.7307997117254623e-07, "loss": 0.0357, "num_input_tokens_seen": 253328032, "step": 117405 }, { "epoch": 19.153344208809134, "grad_norm": 0.10124670714139938, "learning_rate": 2.725556225620457e-07, "loss": 0.1509, "num_input_tokens_seen": 253339392, "step": 117410 }, { "epoch": 19.15415986949429, "grad_norm": 1.7402422428131104, "learning_rate": 2.7203177508457467e-07, "loss": 0.1663, "num_input_tokens_seen": 253349248, "step": 117415 }, { "epoch": 19.154975530179446, "grad_norm": 0.7060520052909851, "learning_rate": 2.7150842875074964e-07, "loss": 0.0433, "num_input_tokens_seen": 253359872, "step": 117420 }, { "epoch": 19.1557911908646, "grad_norm": 1.8772306442260742, "learning_rate": 2.7098558357117887e-07, "loss": 0.1788, "num_input_tokens_seen": 253370336, "step": 117425 }, { "epoch": 19.156606851549757, "grad_norm": 1.289037823677063, "learning_rate": 2.704632395564538e-07, "loss": 0.0979, "num_input_tokens_seen": 253381600, "step": 117430 }, { "epoch": 19.15742251223491, "grad_norm": 1.8125933408737183, "learning_rate": 2.6994139671716324e-07, "loss": 0.0794, "num_input_tokens_seen": 253393184, "step": 117435 }, { "epoch": 19.158238172920065, "grad_norm": 0.6202453374862671, "learning_rate": 2.6942005506388203e-07, "loss": 0.0456, "num_input_tokens_seen": 253404736, "step": 117440 }, { "epoch": 19.15905383360522, "grad_norm": 0.3495720624923706, "learning_rate": 2.688992146071767e-07, "loss": 0.1995, "num_input_tokens_seen": 253415008, "step": 117445 }, { "epoch": 19.159869494290376, "grad_norm": 0.21187126636505127, "learning_rate": 2.6837887535760277e-07, "loss": 0.0258, "num_input_tokens_seen": 253426176, "step": 117450 }, { "epoch": 19.160685154975532, "grad_norm": 0.044325798749923706, "learning_rate": 2.678590373257045e-07, "loss": 0.0428, "num_input_tokens_seen": 253435680, "step": 117455 }, { "epoch": 19.161500815660684, "grad_norm": 2.2781012058258057, "learning_rate": 2.6733970052201797e-07, "loss": 0.0645, "num_input_tokens_seen": 253447200, "step": 117460 }, { "epoch": 19.16231647634584, "grad_norm": 0.03801277279853821, "learning_rate": 2.6682086495706805e-07, "loss": 0.1134, "num_input_tokens_seen": 253458496, "step": 117465 }, { "epoch": 19.163132137030995, "grad_norm": 0.07678389549255371, "learning_rate": 2.6630253064137135e-07, "loss": 0.019, "num_input_tokens_seen": 253467712, "step": 117470 }, { "epoch": 19.16394779771615, "grad_norm": 0.04821491613984108, "learning_rate": 2.657846975854278e-07, "loss": 0.1875, "num_input_tokens_seen": 253478848, "step": 117475 }, { "epoch": 19.164763458401303, "grad_norm": 1.8873562812805176, "learning_rate": 2.6526736579973734e-07, "loss": 0.118, "num_input_tokens_seen": 253487648, "step": 117480 }, { "epoch": 19.16557911908646, "grad_norm": 0.06982660293579102, "learning_rate": 2.6475053529478046e-07, "loss": 0.0286, "num_input_tokens_seen": 253498496, "step": 117485 }, { "epoch": 19.166394779771615, "grad_norm": 2.087033271789551, "learning_rate": 2.642342060810349e-07, "loss": 0.0884, "num_input_tokens_seen": 253510112, "step": 117490 }, { "epoch": 19.16721044045677, "grad_norm": 0.07998278737068176, "learning_rate": 2.637183781689617e-07, "loss": 0.034, "num_input_tokens_seen": 253520448, "step": 117495 }, { "epoch": 19.168026101141926, "grad_norm": 0.1286356896162033, "learning_rate": 2.632030515690137e-07, "loss": 0.386, "num_input_tokens_seen": 253529920, "step": 117500 }, { "epoch": 19.16884176182708, "grad_norm": 0.24457024037837982, "learning_rate": 2.6268822629163805e-07, "loss": 0.0416, "num_input_tokens_seen": 253541216, "step": 117505 }, { "epoch": 19.169657422512234, "grad_norm": 0.30771324038505554, "learning_rate": 2.6217390234726524e-07, "loss": 0.0239, "num_input_tokens_seen": 253552256, "step": 117510 }, { "epoch": 19.17047308319739, "grad_norm": 2.605587959289551, "learning_rate": 2.616600797463231e-07, "loss": 0.1063, "num_input_tokens_seen": 253562688, "step": 117515 }, { "epoch": 19.171288743882545, "grad_norm": 0.16061149537563324, "learning_rate": 2.6114675849922277e-07, "loss": 0.0262, "num_input_tokens_seen": 253573632, "step": 117520 }, { "epoch": 19.1721044045677, "grad_norm": 1.911163330078125, "learning_rate": 2.606339386163642e-07, "loss": 0.0777, "num_input_tokens_seen": 253585120, "step": 117525 }, { "epoch": 19.172920065252853, "grad_norm": 1.7513657808303833, "learning_rate": 2.601216201081447e-07, "loss": 0.0503, "num_input_tokens_seen": 253596736, "step": 117530 }, { "epoch": 19.17373572593801, "grad_norm": 1.253668189048767, "learning_rate": 2.5960980298494474e-07, "loss": 0.0563, "num_input_tokens_seen": 253607616, "step": 117535 }, { "epoch": 19.174551386623165, "grad_norm": 1.2650740146636963, "learning_rate": 2.5909848725713946e-07, "loss": 0.0372, "num_input_tokens_seen": 253617280, "step": 117540 }, { "epoch": 19.17536704730832, "grad_norm": 1.001626968383789, "learning_rate": 2.585876729350872e-07, "loss": 0.0233, "num_input_tokens_seen": 253628096, "step": 117545 }, { "epoch": 19.176182707993476, "grad_norm": 0.022638384252786636, "learning_rate": 2.5807736002914365e-07, "loss": 0.1555, "num_input_tokens_seen": 253639968, "step": 117550 }, { "epoch": 19.17699836867863, "grad_norm": 1.4473682641983032, "learning_rate": 2.5756754854965037e-07, "loss": 0.0977, "num_input_tokens_seen": 253651584, "step": 117555 }, { "epoch": 19.177814029363784, "grad_norm": 0.3421834111213684, "learning_rate": 2.570582385069353e-07, "loss": 0.0408, "num_input_tokens_seen": 253662240, "step": 117560 }, { "epoch": 19.17862969004894, "grad_norm": 0.0885394737124443, "learning_rate": 2.5654942991132635e-07, "loss": 0.2208, "num_input_tokens_seen": 253672800, "step": 117565 }, { "epoch": 19.179445350734095, "grad_norm": 1.1352684497833252, "learning_rate": 2.5604112277313187e-07, "loss": 0.0366, "num_input_tokens_seen": 253682400, "step": 117570 }, { "epoch": 19.18026101141925, "grad_norm": 0.0710463672876358, "learning_rate": 2.555333171026547e-07, "loss": 0.1391, "num_input_tokens_seen": 253692992, "step": 117575 }, { "epoch": 19.181076672104403, "grad_norm": 0.21672728657722473, "learning_rate": 2.550260129101867e-07, "loss": 0.1129, "num_input_tokens_seen": 253703520, "step": 117580 }, { "epoch": 19.18189233278956, "grad_norm": 0.581896185874939, "learning_rate": 2.545192102060057e-07, "loss": 0.0207, "num_input_tokens_seen": 253713376, "step": 117585 }, { "epoch": 19.182707993474715, "grad_norm": 1.3293732404708862, "learning_rate": 2.5401290900038397e-07, "loss": 0.1236, "num_input_tokens_seen": 253723680, "step": 117590 }, { "epoch": 19.18352365415987, "grad_norm": 0.1488991379737854, "learning_rate": 2.5350710930358565e-07, "loss": 0.0853, "num_input_tokens_seen": 253735264, "step": 117595 }, { "epoch": 19.184339314845026, "grad_norm": 0.854313313961029, "learning_rate": 2.530018111258581e-07, "loss": 0.0489, "num_input_tokens_seen": 253746592, "step": 117600 }, { "epoch": 19.18515497553018, "grad_norm": 1.4026027917861938, "learning_rate": 2.524970144774402e-07, "loss": 0.0751, "num_input_tokens_seen": 253757728, "step": 117605 }, { "epoch": 19.185970636215334, "grad_norm": 0.06042877957224846, "learning_rate": 2.5199271936856837e-07, "loss": 0.1434, "num_input_tokens_seen": 253769152, "step": 117610 }, { "epoch": 19.18678629690049, "grad_norm": 0.06605565547943115, "learning_rate": 2.514889258094566e-07, "loss": 0.0076, "num_input_tokens_seen": 253780832, "step": 117615 }, { "epoch": 19.187601957585645, "grad_norm": 0.4578821063041687, "learning_rate": 2.509856338103161e-07, "loss": 0.0644, "num_input_tokens_seen": 253790400, "step": 117620 }, { "epoch": 19.1884176182708, "grad_norm": 0.939652681350708, "learning_rate": 2.5048284338135276e-07, "loss": 0.1185, "num_input_tokens_seen": 253802272, "step": 117625 }, { "epoch": 19.189233278955953, "grad_norm": 0.03725510463118553, "learning_rate": 2.4998055453274716e-07, "loss": 0.1784, "num_input_tokens_seen": 253812960, "step": 117630 }, { "epoch": 19.19004893964111, "grad_norm": 2.3156938552856445, "learning_rate": 2.4947876727468565e-07, "loss": 0.0542, "num_input_tokens_seen": 253824320, "step": 117635 }, { "epoch": 19.190864600326265, "grad_norm": 0.08908866345882416, "learning_rate": 2.4897748161733515e-07, "loss": 0.0095, "num_input_tokens_seen": 253833472, "step": 117640 }, { "epoch": 19.19168026101142, "grad_norm": 1.2219024896621704, "learning_rate": 2.4847669757085414e-07, "loss": 0.0362, "num_input_tokens_seen": 253843648, "step": 117645 }, { "epoch": 19.192495921696572, "grad_norm": 0.42734336853027344, "learning_rate": 2.4797641514539283e-07, "loss": 0.1708, "num_input_tokens_seen": 253853568, "step": 117650 }, { "epoch": 19.193311582381728, "grad_norm": 2.291480541229248, "learning_rate": 2.474766343510876e-07, "loss": 0.1622, "num_input_tokens_seen": 253864416, "step": 117655 }, { "epoch": 19.194127243066884, "grad_norm": 1.785043716430664, "learning_rate": 2.46977355198072e-07, "loss": 0.0964, "num_input_tokens_seen": 253875200, "step": 117660 }, { "epoch": 19.19494290375204, "grad_norm": 0.8086695075035095, "learning_rate": 2.464785776964601e-07, "loss": 0.0343, "num_input_tokens_seen": 253886560, "step": 117665 }, { "epoch": 19.195758564437195, "grad_norm": 0.9012464880943298, "learning_rate": 2.4598030185636335e-07, "loss": 0.0808, "num_input_tokens_seen": 253898752, "step": 117670 }, { "epoch": 19.196574225122347, "grad_norm": 0.4242400527000427, "learning_rate": 2.4548252768787917e-07, "loss": 0.0368, "num_input_tokens_seen": 253910496, "step": 117675 }, { "epoch": 19.197389885807503, "grad_norm": 0.31950536370277405, "learning_rate": 2.4498525520109115e-07, "loss": 0.2082, "num_input_tokens_seen": 253921056, "step": 117680 }, { "epoch": 19.19820554649266, "grad_norm": 2.6183054447174072, "learning_rate": 2.4448848440608575e-07, "loss": 0.0756, "num_input_tokens_seen": 253932928, "step": 117685 }, { "epoch": 19.199021207177815, "grad_norm": 2.2383148670196533, "learning_rate": 2.439922153129215e-07, "loss": 0.1474, "num_input_tokens_seen": 253943776, "step": 117690 }, { "epoch": 19.19983686786297, "grad_norm": 0.2806086838245392, "learning_rate": 2.434964479316626e-07, "loss": 0.0372, "num_input_tokens_seen": 253955936, "step": 117695 }, { "epoch": 19.200652528548122, "grad_norm": 0.9014402031898499, "learning_rate": 2.430011822723538e-07, "loss": 0.2031, "num_input_tokens_seen": 253967520, "step": 117700 }, { "epoch": 19.201468189233278, "grad_norm": 0.2971310317516327, "learning_rate": 2.425064183450315e-07, "loss": 0.1223, "num_input_tokens_seen": 253977824, "step": 117705 }, { "epoch": 19.202283849918434, "grad_norm": 0.912817656993866, "learning_rate": 2.4201215615972386e-07, "loss": 0.1461, "num_input_tokens_seen": 253988960, "step": 117710 }, { "epoch": 19.20309951060359, "grad_norm": 0.2109287977218628, "learning_rate": 2.415183957264505e-07, "loss": 0.0112, "num_input_tokens_seen": 253998784, "step": 117715 }, { "epoch": 19.203915171288745, "grad_norm": 0.0965643897652626, "learning_rate": 2.410251370552119e-07, "loss": 0.143, "num_input_tokens_seen": 254010304, "step": 117720 }, { "epoch": 19.204730831973897, "grad_norm": 0.02044367417693138, "learning_rate": 2.4053238015600833e-07, "loss": 0.0145, "num_input_tokens_seen": 254019616, "step": 117725 }, { "epoch": 19.205546492659053, "grad_norm": 1.1025224924087524, "learning_rate": 2.4004012503882624e-07, "loss": 0.0311, "num_input_tokens_seen": 254030112, "step": 117730 }, { "epoch": 19.20636215334421, "grad_norm": 0.0734790712594986, "learning_rate": 2.39548371713641e-07, "loss": 0.0909, "num_input_tokens_seen": 254039968, "step": 117735 }, { "epoch": 19.207177814029365, "grad_norm": 0.16499273478984833, "learning_rate": 2.390571201904196e-07, "loss": 0.0241, "num_input_tokens_seen": 254050240, "step": 117740 }, { "epoch": 19.20799347471452, "grad_norm": 2.1722123622894287, "learning_rate": 2.3856637047911525e-07, "loss": 0.2119, "num_input_tokens_seen": 254060128, "step": 117745 }, { "epoch": 19.208809135399672, "grad_norm": 1.0557165145874023, "learning_rate": 2.3807612258967825e-07, "loss": 0.0897, "num_input_tokens_seen": 254070976, "step": 117750 }, { "epoch": 19.209624796084828, "grad_norm": 0.49062275886535645, "learning_rate": 2.3758637653203964e-07, "loss": 0.022, "num_input_tokens_seen": 254081312, "step": 117755 }, { "epoch": 19.210440456769984, "grad_norm": 0.06794047355651855, "learning_rate": 2.3709713231612752e-07, "loss": 0.0385, "num_input_tokens_seen": 254092704, "step": 117760 }, { "epoch": 19.21125611745514, "grad_norm": 1.2888435125350952, "learning_rate": 2.366083899518534e-07, "loss": 0.0802, "num_input_tokens_seen": 254103264, "step": 117765 }, { "epoch": 19.212071778140295, "grad_norm": 0.3726148307323456, "learning_rate": 2.3612014944912885e-07, "loss": 0.0582, "num_input_tokens_seen": 254113600, "step": 117770 }, { "epoch": 19.212887438825447, "grad_norm": 0.0717509314417839, "learning_rate": 2.356324108178404e-07, "loss": 0.0222, "num_input_tokens_seen": 254124704, "step": 117775 }, { "epoch": 19.213703099510603, "grad_norm": 1.4948077201843262, "learning_rate": 2.3514517406788006e-07, "loss": 0.1557, "num_input_tokens_seen": 254134784, "step": 117780 }, { "epoch": 19.21451876019576, "grad_norm": 0.6366252899169922, "learning_rate": 2.3465843920911778e-07, "loss": 0.1139, "num_input_tokens_seen": 254146048, "step": 117785 }, { "epoch": 19.215334420880914, "grad_norm": 0.09429292380809784, "learning_rate": 2.3417220625142065e-07, "loss": 0.0672, "num_input_tokens_seen": 254157600, "step": 117790 }, { "epoch": 19.21615008156607, "grad_norm": 0.24050268530845642, "learning_rate": 2.3368647520463916e-07, "loss": 0.0124, "num_input_tokens_seen": 254168128, "step": 117795 }, { "epoch": 19.216965742251222, "grad_norm": 0.027505425736308098, "learning_rate": 2.3320124607862093e-07, "loss": 0.0607, "num_input_tokens_seen": 254179008, "step": 117800 }, { "epoch": 19.217781402936378, "grad_norm": 0.15359561145305634, "learning_rate": 2.3271651888319978e-07, "loss": 0.0397, "num_input_tokens_seen": 254190784, "step": 117805 }, { "epoch": 19.218597063621534, "grad_norm": 0.07345601171255112, "learning_rate": 2.3223229362819565e-07, "loss": 0.0106, "num_input_tokens_seen": 254201024, "step": 117810 }, { "epoch": 19.21941272430669, "grad_norm": 0.37652453780174255, "learning_rate": 2.3174857032342568e-07, "loss": 0.1428, "num_input_tokens_seen": 254212000, "step": 117815 }, { "epoch": 19.22022838499184, "grad_norm": 0.23238427937030792, "learning_rate": 2.3126534897869034e-07, "loss": 0.0716, "num_input_tokens_seen": 254223296, "step": 117820 }, { "epoch": 19.221044045676997, "grad_norm": 0.7789695262908936, "learning_rate": 2.3078262960378739e-07, "loss": 0.0124, "num_input_tokens_seen": 254234912, "step": 117825 }, { "epoch": 19.221859706362153, "grad_norm": 0.1217644140124321, "learning_rate": 2.303004122084923e-07, "loss": 0.0581, "num_input_tokens_seen": 254246080, "step": 117830 }, { "epoch": 19.22267536704731, "grad_norm": 0.146907240152359, "learning_rate": 2.298186968025834e-07, "loss": 0.0378, "num_input_tokens_seen": 254257056, "step": 117835 }, { "epoch": 19.223491027732464, "grad_norm": 0.5775670409202576, "learning_rate": 2.2933748339582227e-07, "loss": 0.223, "num_input_tokens_seen": 254267968, "step": 117840 }, { "epoch": 19.224306688417617, "grad_norm": 0.5515333414077759, "learning_rate": 2.2885677199795951e-07, "loss": 0.0671, "num_input_tokens_seen": 254278144, "step": 117845 }, { "epoch": 19.225122349102772, "grad_norm": 0.3127385079860687, "learning_rate": 2.2837656261874008e-07, "loss": 0.0878, "num_input_tokens_seen": 254288992, "step": 117850 }, { "epoch": 19.225938009787928, "grad_norm": 0.10730236768722534, "learning_rate": 2.2789685526789505e-07, "loss": 0.0802, "num_input_tokens_seen": 254299872, "step": 117855 }, { "epoch": 19.226753670473084, "grad_norm": 0.11089038848876953, "learning_rate": 2.2741764995514448e-07, "loss": 0.1934, "num_input_tokens_seen": 254311328, "step": 117860 }, { "epoch": 19.22756933115824, "grad_norm": 0.1927236020565033, "learning_rate": 2.2693894669020277e-07, "loss": 0.0331, "num_input_tokens_seen": 254322624, "step": 117865 }, { "epoch": 19.22838499184339, "grad_norm": 0.8007044196128845, "learning_rate": 2.2646074548276773e-07, "loss": 0.2349, "num_input_tokens_seen": 254333472, "step": 117870 }, { "epoch": 19.229200652528547, "grad_norm": 0.25708937644958496, "learning_rate": 2.2598304634253719e-07, "loss": 0.0854, "num_input_tokens_seen": 254344000, "step": 117875 }, { "epoch": 19.230016313213703, "grad_norm": 0.3134186863899231, "learning_rate": 2.2550584927918395e-07, "loss": 0.1061, "num_input_tokens_seen": 254355296, "step": 117880 }, { "epoch": 19.23083197389886, "grad_norm": 0.22889482975006104, "learning_rate": 2.2502915430238915e-07, "loss": 0.1442, "num_input_tokens_seen": 254365952, "step": 117885 }, { "epoch": 19.231647634584014, "grad_norm": 1.5911775827407837, "learning_rate": 2.245529614218006e-07, "loss": 0.118, "num_input_tokens_seen": 254376704, "step": 117890 }, { "epoch": 19.232463295269167, "grad_norm": 0.37671950459480286, "learning_rate": 2.2407727064708283e-07, "loss": 0.0477, "num_input_tokens_seen": 254386528, "step": 117895 }, { "epoch": 19.233278955954322, "grad_norm": 1.4858105182647705, "learning_rate": 2.2360208198786427e-07, "loss": 0.2681, "num_input_tokens_seen": 254397280, "step": 117900 }, { "epoch": 19.234094616639478, "grad_norm": 0.17339959740638733, "learning_rate": 2.2312739545378436e-07, "loss": 0.1162, "num_input_tokens_seen": 254408544, "step": 117905 }, { "epoch": 19.234910277324634, "grad_norm": 2.2744433879852295, "learning_rate": 2.2265321105445768e-07, "loss": 0.0956, "num_input_tokens_seen": 254420288, "step": 117910 }, { "epoch": 19.23572593800979, "grad_norm": 0.12468334287405014, "learning_rate": 2.22179528799496e-07, "loss": 0.0073, "num_input_tokens_seen": 254429984, "step": 117915 }, { "epoch": 19.23654159869494, "grad_norm": 0.05536206439137459, "learning_rate": 2.2170634869849993e-07, "loss": 0.0358, "num_input_tokens_seen": 254440128, "step": 117920 }, { "epoch": 19.237357259380097, "grad_norm": 0.024573352187871933, "learning_rate": 2.2123367076105905e-07, "loss": 0.1393, "num_input_tokens_seen": 254450976, "step": 117925 }, { "epoch": 19.238172920065253, "grad_norm": 0.10406262427568436, "learning_rate": 2.2076149499674903e-07, "loss": 0.0202, "num_input_tokens_seen": 254461952, "step": 117930 }, { "epoch": 19.23898858075041, "grad_norm": 2.355272054672241, "learning_rate": 2.2028982141514277e-07, "loss": 0.3882, "num_input_tokens_seen": 254471808, "step": 117935 }, { "epoch": 19.239804241435564, "grad_norm": 0.40705353021621704, "learning_rate": 2.1981865002579927e-07, "loss": 0.0331, "num_input_tokens_seen": 254482624, "step": 117940 }, { "epoch": 19.240619902120716, "grad_norm": 0.07944490760564804, "learning_rate": 2.1934798083826647e-07, "loss": 0.0085, "num_input_tokens_seen": 254492768, "step": 117945 }, { "epoch": 19.241435562805872, "grad_norm": 0.28877443075180054, "learning_rate": 2.1887781386208395e-07, "loss": 0.1164, "num_input_tokens_seen": 254504000, "step": 117950 }, { "epoch": 19.242251223491028, "grad_norm": 0.29584991931915283, "learning_rate": 2.184081491067802e-07, "loss": 0.0642, "num_input_tokens_seen": 254515840, "step": 117955 }, { "epoch": 19.243066884176184, "grad_norm": 0.24356570839881897, "learning_rate": 2.1793898658186984e-07, "loss": 0.0766, "num_input_tokens_seen": 254526432, "step": 117960 }, { "epoch": 19.24388254486134, "grad_norm": 0.5042481422424316, "learning_rate": 2.1747032629686746e-07, "loss": 0.024, "num_input_tokens_seen": 254537952, "step": 117965 }, { "epoch": 19.24469820554649, "grad_norm": 0.12027264386415482, "learning_rate": 2.170021682612655e-07, "loss": 0.0222, "num_input_tokens_seen": 254549024, "step": 117970 }, { "epoch": 19.245513866231647, "grad_norm": 0.8676365613937378, "learning_rate": 2.1653451248455637e-07, "loss": 0.1821, "num_input_tokens_seen": 254559712, "step": 117975 }, { "epoch": 19.246329526916803, "grad_norm": 0.6405666470527649, "learning_rate": 2.1606735897621577e-07, "loss": 0.1013, "num_input_tokens_seen": 254571840, "step": 117980 }, { "epoch": 19.24714518760196, "grad_norm": 0.10537061840295792, "learning_rate": 2.1560070774570562e-07, "loss": 0.0372, "num_input_tokens_seen": 254581536, "step": 117985 }, { "epoch": 19.247960848287114, "grad_norm": 0.19855256378650665, "learning_rate": 2.151345588024961e-07, "loss": 0.0396, "num_input_tokens_seen": 254592416, "step": 117990 }, { "epoch": 19.248776508972266, "grad_norm": 0.10498271137475967, "learning_rate": 2.146689121560186e-07, "loss": 0.0705, "num_input_tokens_seen": 254603392, "step": 117995 }, { "epoch": 19.249592169657422, "grad_norm": 0.08890246599912643, "learning_rate": 2.142037678157238e-07, "loss": 0.0585, "num_input_tokens_seen": 254613536, "step": 118000 }, { "epoch": 19.250407830342578, "grad_norm": 0.44360485672950745, "learning_rate": 2.1373912579102928e-07, "loss": 0.0275, "num_input_tokens_seen": 254624096, "step": 118005 }, { "epoch": 19.251223491027734, "grad_norm": 0.14748738706111908, "learning_rate": 2.13274986091358e-07, "loss": 0.1092, "num_input_tokens_seen": 254634240, "step": 118010 }, { "epoch": 19.252039151712886, "grad_norm": 0.07869995385408401, "learning_rate": 2.1281134872611074e-07, "loss": 0.1619, "num_input_tokens_seen": 254644576, "step": 118015 }, { "epoch": 19.25285481239804, "grad_norm": 0.2039504051208496, "learning_rate": 2.123482137046884e-07, "loss": 0.0172, "num_input_tokens_seen": 254656000, "step": 118020 }, { "epoch": 19.253670473083197, "grad_norm": 0.6007906794548035, "learning_rate": 2.1188558103647505e-07, "loss": 0.1167, "num_input_tokens_seen": 254666976, "step": 118025 }, { "epoch": 19.254486133768353, "grad_norm": 0.9781150817871094, "learning_rate": 2.1142345073084657e-07, "loss": 0.0569, "num_input_tokens_seen": 254676608, "step": 118030 }, { "epoch": 19.25530179445351, "grad_norm": 0.5132603049278259, "learning_rate": 2.109618227971649e-07, "loss": 0.1024, "num_input_tokens_seen": 254688000, "step": 118035 }, { "epoch": 19.25611745513866, "grad_norm": 0.11159976571798325, "learning_rate": 2.1050069724479204e-07, "loss": 0.0956, "num_input_tokens_seen": 254698944, "step": 118040 }, { "epoch": 19.256933115823816, "grad_norm": 0.36629095673561096, "learning_rate": 2.100400740830677e-07, "loss": 0.0309, "num_input_tokens_seen": 254709216, "step": 118045 }, { "epoch": 19.257748776508972, "grad_norm": 0.06966105848550797, "learning_rate": 2.0957995332133163e-07, "loss": 0.0605, "num_input_tokens_seen": 254720832, "step": 118050 }, { "epoch": 19.258564437194128, "grad_norm": 0.1107395812869072, "learning_rate": 2.0912033496890694e-07, "loss": 0.0103, "num_input_tokens_seen": 254732224, "step": 118055 }, { "epoch": 19.259380097879284, "grad_norm": 0.05185611918568611, "learning_rate": 2.0866121903510562e-07, "loss": 0.167, "num_input_tokens_seen": 254742144, "step": 118060 }, { "epoch": 19.260195758564436, "grad_norm": 2.4043071269989014, "learning_rate": 2.0820260552923688e-07, "loss": 0.0885, "num_input_tokens_seen": 254753120, "step": 118065 }, { "epoch": 19.26101141924959, "grad_norm": 4.104417324066162, "learning_rate": 2.077444944605933e-07, "loss": 0.1754, "num_input_tokens_seen": 254763136, "step": 118070 }, { "epoch": 19.261827079934747, "grad_norm": 0.05070294812321663, "learning_rate": 2.0728688583845912e-07, "loss": 0.036, "num_input_tokens_seen": 254774016, "step": 118075 }, { "epoch": 19.262642740619903, "grad_norm": 0.043342240154743195, "learning_rate": 2.068297796721075e-07, "loss": 0.1706, "num_input_tokens_seen": 254785312, "step": 118080 }, { "epoch": 19.26345840130506, "grad_norm": 1.429972767829895, "learning_rate": 2.063731759708004e-07, "loss": 0.1159, "num_input_tokens_seen": 254796608, "step": 118085 }, { "epoch": 19.26427406199021, "grad_norm": 0.5502247214317322, "learning_rate": 2.059170747437972e-07, "loss": 0.0415, "num_input_tokens_seen": 254806240, "step": 118090 }, { "epoch": 19.265089722675366, "grad_norm": 0.06209968030452728, "learning_rate": 2.0546147600033484e-07, "loss": 0.0519, "num_input_tokens_seen": 254817440, "step": 118095 }, { "epoch": 19.265905383360522, "grad_norm": 0.08871649205684662, "learning_rate": 2.050063797496532e-07, "loss": 0.0917, "num_input_tokens_seen": 254828672, "step": 118100 }, { "epoch": 19.266721044045678, "grad_norm": 0.4219987690448761, "learning_rate": 2.0455178600096991e-07, "loss": 0.0227, "num_input_tokens_seen": 254838880, "step": 118105 }, { "epoch": 19.267536704730833, "grad_norm": 0.4269753694534302, "learning_rate": 2.0409769476349984e-07, "loss": 0.0247, "num_input_tokens_seen": 254849856, "step": 118110 }, { "epoch": 19.268352365415986, "grad_norm": 0.3360973298549652, "learning_rate": 2.036441060464467e-07, "loss": 0.1336, "num_input_tokens_seen": 254861472, "step": 118115 }, { "epoch": 19.26916802610114, "grad_norm": 1.124426245689392, "learning_rate": 2.0319101985900036e-07, "loss": 0.1238, "num_input_tokens_seen": 254872128, "step": 118120 }, { "epoch": 19.269983686786297, "grad_norm": 2.4345529079437256, "learning_rate": 2.0273843621034795e-07, "loss": 0.1248, "num_input_tokens_seen": 254883776, "step": 118125 }, { "epoch": 19.270799347471453, "grad_norm": 0.304499089717865, "learning_rate": 2.0228635510965432e-07, "loss": 0.1142, "num_input_tokens_seen": 254895424, "step": 118130 }, { "epoch": 19.27161500815661, "grad_norm": 0.026768870651721954, "learning_rate": 2.0183477656608995e-07, "loss": 0.1794, "num_input_tokens_seen": 254906080, "step": 118135 }, { "epoch": 19.27243066884176, "grad_norm": 0.03341158106923103, "learning_rate": 2.0138370058880028e-07, "loss": 0.0398, "num_input_tokens_seen": 254917312, "step": 118140 }, { "epoch": 19.273246329526916, "grad_norm": 0.1869005262851715, "learning_rate": 2.009331271869308e-07, "loss": 0.0105, "num_input_tokens_seen": 254927392, "step": 118145 }, { "epoch": 19.274061990212072, "grad_norm": 0.8846730589866638, "learning_rate": 2.004830563696103e-07, "loss": 0.1199, "num_input_tokens_seen": 254937664, "step": 118150 }, { "epoch": 19.274877650897228, "grad_norm": 0.05555561184883118, "learning_rate": 2.00033488145962e-07, "loss": 0.1303, "num_input_tokens_seen": 254948032, "step": 118155 }, { "epoch": 19.275693311582383, "grad_norm": 0.2181624323129654, "learning_rate": 1.9958442252509536e-07, "loss": 0.046, "num_input_tokens_seen": 254958208, "step": 118160 }, { "epoch": 19.276508972267536, "grad_norm": 0.09714119136333466, "learning_rate": 1.9913585951611134e-07, "loss": 0.0373, "num_input_tokens_seen": 254968544, "step": 118165 }, { "epoch": 19.27732463295269, "grad_norm": 2.1447033882141113, "learning_rate": 1.9868779912810275e-07, "loss": 0.1687, "num_input_tokens_seen": 254979488, "step": 118170 }, { "epoch": 19.278140293637847, "grad_norm": 2.1407554149627686, "learning_rate": 1.9824024137014562e-07, "loss": 0.0823, "num_input_tokens_seen": 254989440, "step": 118175 }, { "epoch": 19.278955954323003, "grad_norm": 0.38292500376701355, "learning_rate": 1.9779318625131604e-07, "loss": 0.1669, "num_input_tokens_seen": 255002144, "step": 118180 }, { "epoch": 19.27977161500816, "grad_norm": 2.0159380435943604, "learning_rate": 1.9734663378067063e-07, "loss": 0.145, "num_input_tokens_seen": 255014048, "step": 118185 }, { "epoch": 19.28058727569331, "grad_norm": 2.3091511726379395, "learning_rate": 1.9690058396726053e-07, "loss": 0.064, "num_input_tokens_seen": 255025376, "step": 118190 }, { "epoch": 19.281402936378466, "grad_norm": 0.5103996992111206, "learning_rate": 1.9645503682012567e-07, "loss": 0.0118, "num_input_tokens_seen": 255034912, "step": 118195 }, { "epoch": 19.282218597063622, "grad_norm": 2.5427112579345703, "learning_rate": 1.960099923482922e-07, "loss": 0.0799, "num_input_tokens_seen": 255046848, "step": 118200 }, { "epoch": 19.283034257748778, "grad_norm": 0.655647337436676, "learning_rate": 1.9556545056078625e-07, "loss": 0.1088, "num_input_tokens_seen": 255057312, "step": 118205 }, { "epoch": 19.28384991843393, "grad_norm": 1.4470444917678833, "learning_rate": 1.9512141146661168e-07, "loss": 0.1496, "num_input_tokens_seen": 255068064, "step": 118210 }, { "epoch": 19.284665579119086, "grad_norm": 0.0906776562333107, "learning_rate": 1.9467787507476687e-07, "loss": 0.3123, "num_input_tokens_seen": 255079168, "step": 118215 }, { "epoch": 19.28548123980424, "grad_norm": 0.5730345845222473, "learning_rate": 1.9423484139424463e-07, "loss": 0.0811, "num_input_tokens_seen": 255089440, "step": 118220 }, { "epoch": 19.286296900489397, "grad_norm": 0.06896401196718216, "learning_rate": 1.9379231043402112e-07, "loss": 0.0689, "num_input_tokens_seen": 255101184, "step": 118225 }, { "epoch": 19.287112561174553, "grad_norm": 0.03805439919233322, "learning_rate": 1.9335028220306417e-07, "loss": 0.1578, "num_input_tokens_seen": 255113280, "step": 118230 }, { "epoch": 19.287928221859705, "grad_norm": 0.08357656002044678, "learning_rate": 1.9290875671033603e-07, "loss": 0.0458, "num_input_tokens_seen": 255124992, "step": 118235 }, { "epoch": 19.28874388254486, "grad_norm": 0.015145530924201012, "learning_rate": 1.9246773396477956e-07, "loss": 0.0139, "num_input_tokens_seen": 255136032, "step": 118240 }, { "epoch": 19.289559543230016, "grad_norm": 0.018253423273563385, "learning_rate": 1.9202721397533761e-07, "loss": 0.2558, "num_input_tokens_seen": 255146848, "step": 118245 }, { "epoch": 19.290375203915172, "grad_norm": 1.5797523260116577, "learning_rate": 1.9158719675093362e-07, "loss": 0.077, "num_input_tokens_seen": 255156992, "step": 118250 }, { "epoch": 19.291190864600328, "grad_norm": 2.732854127883911, "learning_rate": 1.911476823004854e-07, "loss": 0.2104, "num_input_tokens_seen": 255168608, "step": 118255 }, { "epoch": 19.29200652528548, "grad_norm": 0.14406876266002655, "learning_rate": 1.9070867063290255e-07, "loss": 0.0587, "num_input_tokens_seen": 255179712, "step": 118260 }, { "epoch": 19.292822185970635, "grad_norm": 0.08140246570110321, "learning_rate": 1.9027016175708345e-07, "loss": 0.0866, "num_input_tokens_seen": 255190560, "step": 118265 }, { "epoch": 19.29363784665579, "grad_norm": 1.1729322671890259, "learning_rate": 1.8983215568190992e-07, "loss": 0.1104, "num_input_tokens_seen": 255201504, "step": 118270 }, { "epoch": 19.294453507340947, "grad_norm": 0.5321022868156433, "learning_rate": 1.8939465241626097e-07, "loss": 0.1201, "num_input_tokens_seen": 255212672, "step": 118275 }, { "epoch": 19.295269168026103, "grad_norm": 1.7347702980041504, "learning_rate": 1.8895765196900727e-07, "loss": 0.1225, "num_input_tokens_seen": 255224256, "step": 118280 }, { "epoch": 19.296084828711255, "grad_norm": 0.41972506046295166, "learning_rate": 1.8852115434900007e-07, "loss": 0.0603, "num_input_tokens_seen": 255235680, "step": 118285 }, { "epoch": 19.29690048939641, "grad_norm": 0.5271171927452087, "learning_rate": 1.8808515956508787e-07, "loss": 0.0431, "num_input_tokens_seen": 255246720, "step": 118290 }, { "epoch": 19.297716150081566, "grad_norm": 0.12493055313825607, "learning_rate": 1.8764966762610526e-07, "loss": 0.2209, "num_input_tokens_seen": 255257792, "step": 118295 }, { "epoch": 19.298531810766722, "grad_norm": 0.2872360944747925, "learning_rate": 1.8721467854088125e-07, "loss": 0.0183, "num_input_tokens_seen": 255268768, "step": 118300 }, { "epoch": 19.299347471451878, "grad_norm": 2.2061402797698975, "learning_rate": 1.8678019231822553e-07, "loss": 0.1587, "num_input_tokens_seen": 255280224, "step": 118305 }, { "epoch": 19.30016313213703, "grad_norm": 0.1691485196352005, "learning_rate": 1.8634620896695043e-07, "loss": 0.1175, "num_input_tokens_seen": 255291008, "step": 118310 }, { "epoch": 19.300978792822185, "grad_norm": 1.936841368675232, "learning_rate": 1.8591272849584618e-07, "loss": 0.0905, "num_input_tokens_seen": 255301952, "step": 118315 }, { "epoch": 19.30179445350734, "grad_norm": 0.6127886176109314, "learning_rate": 1.8547975091369742e-07, "loss": 0.0582, "num_input_tokens_seen": 255312096, "step": 118320 }, { "epoch": 19.302610114192497, "grad_norm": 0.8009691834449768, "learning_rate": 1.850472762292832e-07, "loss": 0.1376, "num_input_tokens_seen": 255323424, "step": 118325 }, { "epoch": 19.303425774877653, "grad_norm": 0.5444374084472656, "learning_rate": 1.84615304451366e-07, "loss": 0.1516, "num_input_tokens_seen": 255333280, "step": 118330 }, { "epoch": 19.304241435562805, "grad_norm": 0.21048526465892792, "learning_rate": 1.841838355886999e-07, "loss": 0.1206, "num_input_tokens_seen": 255344544, "step": 118335 }, { "epoch": 19.30505709624796, "grad_norm": 1.10372793674469, "learning_rate": 1.8375286965002793e-07, "loss": 0.163, "num_input_tokens_seen": 255355424, "step": 118340 }, { "epoch": 19.305872756933116, "grad_norm": 0.3457396328449249, "learning_rate": 1.833224066440875e-07, "loss": 0.1494, "num_input_tokens_seen": 255365632, "step": 118345 }, { "epoch": 19.306688417618272, "grad_norm": 0.28746500611305237, "learning_rate": 1.8289244657959947e-07, "loss": 0.0173, "num_input_tokens_seen": 255376672, "step": 118350 }, { "epoch": 19.307504078303428, "grad_norm": 0.058437593281269073, "learning_rate": 1.8246298946528184e-07, "loss": 0.0354, "num_input_tokens_seen": 255387584, "step": 118355 }, { "epoch": 19.30831973898858, "grad_norm": 0.13516956567764282, "learning_rate": 1.8203403530983043e-07, "loss": 0.0611, "num_input_tokens_seen": 255397600, "step": 118360 }, { "epoch": 19.309135399673735, "grad_norm": 1.2092276811599731, "learning_rate": 1.8160558412194383e-07, "loss": 0.0478, "num_input_tokens_seen": 255407680, "step": 118365 }, { "epoch": 19.30995106035889, "grad_norm": 1.4175559282302856, "learning_rate": 1.8117763591030957e-07, "loss": 0.1837, "num_input_tokens_seen": 255418944, "step": 118370 }, { "epoch": 19.310766721044047, "grad_norm": 1.4767794609069824, "learning_rate": 1.807501906835901e-07, "loss": 0.1672, "num_input_tokens_seen": 255429824, "step": 118375 }, { "epoch": 19.3115823817292, "grad_norm": 0.04712728038430214, "learning_rate": 1.8032324845045635e-07, "loss": 0.0976, "num_input_tokens_seen": 255441632, "step": 118380 }, { "epoch": 19.312398042414355, "grad_norm": 0.27403590083122253, "learning_rate": 1.798968092195541e-07, "loss": 0.1735, "num_input_tokens_seen": 255451520, "step": 118385 }, { "epoch": 19.31321370309951, "grad_norm": 3.952288866043091, "learning_rate": 1.794708729995348e-07, "loss": 0.2022, "num_input_tokens_seen": 255461472, "step": 118390 }, { "epoch": 19.314029363784666, "grad_norm": 0.08598506450653076, "learning_rate": 1.7904543979902212e-07, "loss": 0.0611, "num_input_tokens_seen": 255472704, "step": 118395 }, { "epoch": 19.31484502446982, "grad_norm": 0.06172516569495201, "learning_rate": 1.7862050962664245e-07, "loss": 0.261, "num_input_tokens_seen": 255484416, "step": 118400 }, { "epoch": 19.315660685154974, "grad_norm": 0.09814174473285675, "learning_rate": 1.7819608249100562e-07, "loss": 0.0885, "num_input_tokens_seen": 255495616, "step": 118405 }, { "epoch": 19.31647634584013, "grad_norm": 0.9607176184654236, "learning_rate": 1.7777215840071305e-07, "loss": 0.1009, "num_input_tokens_seen": 255506304, "step": 118410 }, { "epoch": 19.317292006525285, "grad_norm": 0.7751756906509399, "learning_rate": 1.7734873736435787e-07, "loss": 0.061, "num_input_tokens_seen": 255518240, "step": 118415 }, { "epoch": 19.31810766721044, "grad_norm": 2.51869535446167, "learning_rate": 1.7692581939051934e-07, "loss": 0.2134, "num_input_tokens_seen": 255529760, "step": 118420 }, { "epoch": 19.318923327895597, "grad_norm": 0.06086082383990288, "learning_rate": 1.7650340448777113e-07, "loss": 0.0842, "num_input_tokens_seen": 255540576, "step": 118425 }, { "epoch": 19.31973898858075, "grad_norm": 2.936136245727539, "learning_rate": 1.7608149266467034e-07, "loss": 0.2577, "num_input_tokens_seen": 255550976, "step": 118430 }, { "epoch": 19.320554649265905, "grad_norm": 0.08846066892147064, "learning_rate": 1.756600839297712e-07, "loss": 0.1235, "num_input_tokens_seen": 255561280, "step": 118435 }, { "epoch": 19.32137030995106, "grad_norm": 0.2475421279668808, "learning_rate": 1.752391782916113e-07, "loss": 0.009, "num_input_tokens_seen": 255571296, "step": 118440 }, { "epoch": 19.322185970636216, "grad_norm": 0.06008722260594368, "learning_rate": 1.7481877575872275e-07, "loss": 0.0047, "num_input_tokens_seen": 255582816, "step": 118445 }, { "epoch": 19.32300163132137, "grad_norm": 1.2036643028259277, "learning_rate": 1.743988763396265e-07, "loss": 0.1516, "num_input_tokens_seen": 255593760, "step": 118450 }, { "epoch": 19.323817292006524, "grad_norm": 0.6422199010848999, "learning_rate": 1.7397948004282683e-07, "loss": 0.0115, "num_input_tokens_seen": 255604256, "step": 118455 }, { "epoch": 19.32463295269168, "grad_norm": 0.39275848865509033, "learning_rate": 1.7356058687682808e-07, "loss": 0.0276, "num_input_tokens_seen": 255614976, "step": 118460 }, { "epoch": 19.325448613376835, "grad_norm": 0.038077134639024734, "learning_rate": 1.7314219685012067e-07, "loss": 0.0089, "num_input_tokens_seen": 255626048, "step": 118465 }, { "epoch": 19.32626427406199, "grad_norm": 0.20337587594985962, "learning_rate": 1.727243099711784e-07, "loss": 0.0631, "num_input_tokens_seen": 255637568, "step": 118470 }, { "epoch": 19.327079934747147, "grad_norm": 0.20837834477424622, "learning_rate": 1.72306926248475e-07, "loss": 0.0391, "num_input_tokens_seen": 255648384, "step": 118475 }, { "epoch": 19.3278955954323, "grad_norm": 1.2246580123901367, "learning_rate": 1.7189004569046763e-07, "loss": 0.1634, "num_input_tokens_seen": 255658240, "step": 118480 }, { "epoch": 19.328711256117455, "grad_norm": 0.11336968839168549, "learning_rate": 1.7147366830560785e-07, "loss": 0.0295, "num_input_tokens_seen": 255669536, "step": 118485 }, { "epoch": 19.32952691680261, "grad_norm": 0.058770280331373215, "learning_rate": 1.7105779410232782e-07, "loss": 0.0834, "num_input_tokens_seen": 255680096, "step": 118490 }, { "epoch": 19.330342577487766, "grad_norm": 2.100450277328491, "learning_rate": 1.7064242308906242e-07, "loss": 0.3179, "num_input_tokens_seen": 255691008, "step": 118495 }, { "epoch": 19.33115823817292, "grad_norm": 2.943194627761841, "learning_rate": 1.702275552742244e-07, "loss": 0.0469, "num_input_tokens_seen": 255702976, "step": 118500 }, { "epoch": 19.331973898858074, "grad_norm": 0.02041197195649147, "learning_rate": 1.6981319066622647e-07, "loss": 0.0152, "num_input_tokens_seen": 255714400, "step": 118505 }, { "epoch": 19.33278955954323, "grad_norm": 0.29405105113983154, "learning_rate": 1.6939932927345913e-07, "loss": 0.0314, "num_input_tokens_seen": 255724736, "step": 118510 }, { "epoch": 19.333605220228385, "grad_norm": 3.3807265758514404, "learning_rate": 1.6898597110431846e-07, "loss": 0.1522, "num_input_tokens_seen": 255735328, "step": 118515 }, { "epoch": 19.33442088091354, "grad_norm": 0.41769564151763916, "learning_rate": 1.6857311616717276e-07, "loss": 0.0393, "num_input_tokens_seen": 255747072, "step": 118520 }, { "epoch": 19.335236541598697, "grad_norm": 0.1284618228673935, "learning_rate": 1.6816076447039865e-07, "loss": 0.0239, "num_input_tokens_seen": 255758304, "step": 118525 }, { "epoch": 19.33605220228385, "grad_norm": 0.32267361879348755, "learning_rate": 1.6774891602234498e-07, "loss": 0.2715, "num_input_tokens_seen": 255769056, "step": 118530 }, { "epoch": 19.336867862969005, "grad_norm": 0.803115963935852, "learning_rate": 1.673375708313635e-07, "loss": 0.0752, "num_input_tokens_seen": 255780832, "step": 118535 }, { "epoch": 19.33768352365416, "grad_norm": 0.6552756428718567, "learning_rate": 1.6692672890578632e-07, "loss": 0.0157, "num_input_tokens_seen": 255790720, "step": 118540 }, { "epoch": 19.338499184339316, "grad_norm": 0.49581095576286316, "learning_rate": 1.665163902539457e-07, "loss": 0.0399, "num_input_tokens_seen": 255801792, "step": 118545 }, { "epoch": 19.339314845024468, "grad_norm": 2.244370222091675, "learning_rate": 1.6610655488414894e-07, "loss": 0.2421, "num_input_tokens_seen": 255813184, "step": 118550 }, { "epoch": 19.340130505709624, "grad_norm": 0.061269793659448624, "learning_rate": 1.6569722280471157e-07, "loss": 0.0952, "num_input_tokens_seen": 255824000, "step": 118555 }, { "epoch": 19.34094616639478, "grad_norm": 0.9245729446411133, "learning_rate": 1.6528839402392137e-07, "loss": 0.2313, "num_input_tokens_seen": 255834400, "step": 118560 }, { "epoch": 19.341761827079935, "grad_norm": 0.6121954321861267, "learning_rate": 1.6488006855006898e-07, "loss": 0.2632, "num_input_tokens_seen": 255845984, "step": 118565 }, { "epoch": 19.34257748776509, "grad_norm": 3.4623301029205322, "learning_rate": 1.6447224639142556e-07, "loss": 0.153, "num_input_tokens_seen": 255855968, "step": 118570 }, { "epoch": 19.343393148450243, "grad_norm": 3.929856300354004, "learning_rate": 1.6406492755625946e-07, "loss": 0.0915, "num_input_tokens_seen": 255866848, "step": 118575 }, { "epoch": 19.3442088091354, "grad_norm": 0.025878533720970154, "learning_rate": 1.6365811205282521e-07, "loss": 0.0851, "num_input_tokens_seen": 255877984, "step": 118580 }, { "epoch": 19.345024469820554, "grad_norm": 3.0518696308135986, "learning_rate": 1.6325179988936344e-07, "loss": 0.2863, "num_input_tokens_seen": 255889408, "step": 118585 }, { "epoch": 19.34584013050571, "grad_norm": 0.5410655736923218, "learning_rate": 1.628459910741148e-07, "loss": 0.1493, "num_input_tokens_seen": 255900640, "step": 118590 }, { "epoch": 19.346655791190866, "grad_norm": 1.5944255590438843, "learning_rate": 1.624406856152977e-07, "loss": 0.0326, "num_input_tokens_seen": 255910880, "step": 118595 }, { "epoch": 19.347471451876018, "grad_norm": 0.34403908252716064, "learning_rate": 1.6203588352113052e-07, "loss": 0.0467, "num_input_tokens_seen": 255921728, "step": 118600 }, { "epoch": 19.348287112561174, "grad_norm": 0.03739530220627785, "learning_rate": 1.6163158479981232e-07, "loss": 0.0336, "num_input_tokens_seen": 255932736, "step": 118605 }, { "epoch": 19.34910277324633, "grad_norm": 0.5989723205566406, "learning_rate": 1.6122778945954486e-07, "loss": 0.0516, "num_input_tokens_seen": 255943968, "step": 118610 }, { "epoch": 19.349918433931485, "grad_norm": 0.4594796895980835, "learning_rate": 1.6082449750850214e-07, "loss": 0.1167, "num_input_tokens_seen": 255954720, "step": 118615 }, { "epoch": 19.35073409461664, "grad_norm": 0.9955466985702515, "learning_rate": 1.6042170895486374e-07, "loss": 0.0795, "num_input_tokens_seen": 255966656, "step": 118620 }, { "epoch": 19.351549755301793, "grad_norm": 0.14740368723869324, "learning_rate": 1.600194238067898e-07, "loss": 0.1582, "num_input_tokens_seen": 255977984, "step": 118625 }, { "epoch": 19.35236541598695, "grad_norm": 0.25036129355430603, "learning_rate": 1.5961764207243767e-07, "loss": 0.0423, "num_input_tokens_seen": 255988640, "step": 118630 }, { "epoch": 19.353181076672104, "grad_norm": 0.03692512959241867, "learning_rate": 1.5921636375993976e-07, "loss": 0.1672, "num_input_tokens_seen": 256000576, "step": 118635 }, { "epoch": 19.35399673735726, "grad_norm": 0.06997328996658325, "learning_rate": 1.5881558887743952e-07, "loss": 0.029, "num_input_tokens_seen": 256011104, "step": 118640 }, { "epoch": 19.354812398042416, "grad_norm": 1.710785984992981, "learning_rate": 1.584153174330527e-07, "loss": 0.1957, "num_input_tokens_seen": 256023200, "step": 118645 }, { "epoch": 19.355628058727568, "grad_norm": 0.33591926097869873, "learning_rate": 1.5801554943489505e-07, "loss": 0.1395, "num_input_tokens_seen": 256034080, "step": 118650 }, { "epoch": 19.356443719412724, "grad_norm": 0.02803216502070427, "learning_rate": 1.576162848910656e-07, "loss": 0.0332, "num_input_tokens_seen": 256043168, "step": 118655 }, { "epoch": 19.35725938009788, "grad_norm": 0.7779743671417236, "learning_rate": 1.5721752380965793e-07, "loss": 0.0789, "num_input_tokens_seen": 256054432, "step": 118660 }, { "epoch": 19.358075040783035, "grad_norm": 0.4171850085258484, "learning_rate": 1.5681926619875166e-07, "loss": 0.1099, "num_input_tokens_seen": 256065472, "step": 118665 }, { "epoch": 19.35889070146819, "grad_norm": 1.7681543827056885, "learning_rate": 1.5642151206641808e-07, "loss": 0.1083, "num_input_tokens_seen": 256075840, "step": 118670 }, { "epoch": 19.359706362153343, "grad_norm": 2.6115305423736572, "learning_rate": 1.5602426142071747e-07, "loss": 0.2213, "num_input_tokens_seen": 256085632, "step": 118675 }, { "epoch": 19.3605220228385, "grad_norm": 0.2842753827571869, "learning_rate": 1.5562751426970723e-07, "loss": 0.068, "num_input_tokens_seen": 256097120, "step": 118680 }, { "epoch": 19.361337683523654, "grad_norm": 0.07708297669887543, "learning_rate": 1.5523127062141706e-07, "loss": 0.0417, "num_input_tokens_seen": 256108256, "step": 118685 }, { "epoch": 19.36215334420881, "grad_norm": 1.957991600036621, "learning_rate": 1.54835530483885e-07, "loss": 0.2218, "num_input_tokens_seen": 256119648, "step": 118690 }, { "epoch": 19.362969004893966, "grad_norm": 0.4293927252292633, "learning_rate": 1.544402938651296e-07, "loss": 0.1543, "num_input_tokens_seen": 256130080, "step": 118695 }, { "epoch": 19.363784665579118, "grad_norm": 1.5507110357284546, "learning_rate": 1.5404556077316113e-07, "loss": 0.1154, "num_input_tokens_seen": 256139680, "step": 118700 }, { "epoch": 19.364600326264274, "grad_norm": 1.1436741352081299, "learning_rate": 1.5365133121597875e-07, "loss": 0.1167, "num_input_tokens_seen": 256149280, "step": 118705 }, { "epoch": 19.36541598694943, "grad_norm": 0.42238715291023254, "learning_rate": 1.532576052015705e-07, "loss": 0.0215, "num_input_tokens_seen": 256160032, "step": 118710 }, { "epoch": 19.366231647634585, "grad_norm": 0.03008614107966423, "learning_rate": 1.5286438273791892e-07, "loss": 0.0183, "num_input_tokens_seen": 256171520, "step": 118715 }, { "epoch": 19.36704730831974, "grad_norm": 0.0776735469698906, "learning_rate": 1.5247166383298984e-07, "loss": 0.0245, "num_input_tokens_seen": 256182208, "step": 118720 }, { "epoch": 19.367862969004893, "grad_norm": 1.3138341903686523, "learning_rate": 1.520794484947463e-07, "loss": 0.1278, "num_input_tokens_seen": 256194080, "step": 118725 }, { "epoch": 19.36867862969005, "grad_norm": 0.1980065256357193, "learning_rate": 1.5168773673113202e-07, "loss": 0.0706, "num_input_tokens_seen": 256204640, "step": 118730 }, { "epoch": 19.369494290375204, "grad_norm": 0.054336074739694595, "learning_rate": 1.512965285500878e-07, "loss": 0.0331, "num_input_tokens_seen": 256215872, "step": 118735 }, { "epoch": 19.37030995106036, "grad_norm": 0.05104942247271538, "learning_rate": 1.5090582395954344e-07, "loss": 0.0126, "num_input_tokens_seen": 256226240, "step": 118740 }, { "epoch": 19.371125611745512, "grad_norm": 0.035823673009872437, "learning_rate": 1.505156229674176e-07, "loss": 0.0126, "num_input_tokens_seen": 256237728, "step": 118745 }, { "epoch": 19.371941272430668, "grad_norm": 0.04185718670487404, "learning_rate": 1.501259255816123e-07, "loss": 0.0059, "num_input_tokens_seen": 256246720, "step": 118750 }, { "epoch": 19.372756933115824, "grad_norm": 0.09390122443437576, "learning_rate": 1.4973673181003234e-07, "loss": 0.0233, "num_input_tokens_seen": 256257984, "step": 118755 }, { "epoch": 19.37357259380098, "grad_norm": 0.15552319586277008, "learning_rate": 1.4934804166056027e-07, "loss": 0.0781, "num_input_tokens_seen": 256268000, "step": 118760 }, { "epoch": 19.374388254486135, "grad_norm": 1.9965543746948242, "learning_rate": 1.489598551410787e-07, "loss": 0.2962, "num_input_tokens_seen": 256278368, "step": 118765 }, { "epoch": 19.375203915171287, "grad_norm": 2.0841076374053955, "learning_rate": 1.4857217225944797e-07, "loss": 0.0896, "num_input_tokens_seen": 256288800, "step": 118770 }, { "epoch": 19.376019575856443, "grad_norm": 0.14230020344257355, "learning_rate": 1.481849930235313e-07, "loss": 0.125, "num_input_tokens_seen": 256299744, "step": 118775 }, { "epoch": 19.3768352365416, "grad_norm": 0.04191659018397331, "learning_rate": 1.4779831744116956e-07, "loss": 0.1366, "num_input_tokens_seen": 256310720, "step": 118780 }, { "epoch": 19.377650897226754, "grad_norm": 0.12604553997516632, "learning_rate": 1.4741214552020655e-07, "loss": 0.1554, "num_input_tokens_seen": 256321504, "step": 118785 }, { "epoch": 19.37846655791191, "grad_norm": 0.04753655567765236, "learning_rate": 1.4702647726846097e-07, "loss": 0.0165, "num_input_tokens_seen": 256331904, "step": 118790 }, { "epoch": 19.379282218597062, "grad_norm": 2.7563717365264893, "learning_rate": 1.4664131269375158e-07, "loss": 0.1693, "num_input_tokens_seen": 256342848, "step": 118795 }, { "epoch": 19.380097879282218, "grad_norm": 0.060773756355047226, "learning_rate": 1.4625665180388603e-07, "loss": 0.0325, "num_input_tokens_seen": 256353856, "step": 118800 }, { "epoch": 19.380913539967374, "grad_norm": 0.14584453403949738, "learning_rate": 1.458724946066581e-07, "loss": 0.0307, "num_input_tokens_seen": 256364416, "step": 118805 }, { "epoch": 19.38172920065253, "grad_norm": 0.543487548828125, "learning_rate": 1.4548884110985318e-07, "loss": 0.1354, "num_input_tokens_seen": 256373472, "step": 118810 }, { "epoch": 19.382544861337685, "grad_norm": 0.060386646538972855, "learning_rate": 1.451056913212484e-07, "loss": 0.1169, "num_input_tokens_seen": 256385216, "step": 118815 }, { "epoch": 19.383360522022837, "grad_norm": 1.1558798551559448, "learning_rate": 1.4472304524860702e-07, "loss": 0.0975, "num_input_tokens_seen": 256395648, "step": 118820 }, { "epoch": 19.384176182707993, "grad_norm": 0.3125946521759033, "learning_rate": 1.4434090289968393e-07, "loss": 0.0196, "num_input_tokens_seen": 256407360, "step": 118825 }, { "epoch": 19.38499184339315, "grad_norm": 0.8522194027900696, "learning_rate": 1.4395926428222572e-07, "loss": 0.0284, "num_input_tokens_seen": 256417984, "step": 118830 }, { "epoch": 19.385807504078304, "grad_norm": 0.9993022084236145, "learning_rate": 1.435781294039623e-07, "loss": 0.098, "num_input_tokens_seen": 256428608, "step": 118835 }, { "epoch": 19.38662316476346, "grad_norm": 2.4591684341430664, "learning_rate": 1.4319749827262363e-07, "loss": 0.1205, "num_input_tokens_seen": 256440192, "step": 118840 }, { "epoch": 19.387438825448612, "grad_norm": 0.637285053730011, "learning_rate": 1.428173708959174e-07, "loss": 0.1034, "num_input_tokens_seen": 256452224, "step": 118845 }, { "epoch": 19.388254486133768, "grad_norm": 0.6885797381401062, "learning_rate": 1.4243774728155413e-07, "loss": 0.1828, "num_input_tokens_seen": 256462944, "step": 118850 }, { "epoch": 19.389070146818923, "grad_norm": 0.09161798655986786, "learning_rate": 1.420586274372221e-07, "loss": 0.1938, "num_input_tokens_seen": 256473440, "step": 118855 }, { "epoch": 19.38988580750408, "grad_norm": 0.5878241062164307, "learning_rate": 1.4168001137060682e-07, "loss": 0.3004, "num_input_tokens_seen": 256483104, "step": 118860 }, { "epoch": 19.390701468189235, "grad_norm": 1.4701406955718994, "learning_rate": 1.4130189908937996e-07, "loss": 0.2192, "num_input_tokens_seen": 256494016, "step": 118865 }, { "epoch": 19.391517128874387, "grad_norm": 0.10844212770462036, "learning_rate": 1.409242906012076e-07, "loss": 0.0415, "num_input_tokens_seen": 256503968, "step": 118870 }, { "epoch": 19.392332789559543, "grad_norm": 0.7475206255912781, "learning_rate": 1.4054718591373918e-07, "loss": 0.0469, "num_input_tokens_seen": 256515136, "step": 118875 }, { "epoch": 19.3931484502447, "grad_norm": 1.3760420083999634, "learning_rate": 1.4017058503462133e-07, "loss": 0.1058, "num_input_tokens_seen": 256525984, "step": 118880 }, { "epoch": 19.393964110929854, "grad_norm": 1.5994514226913452, "learning_rate": 1.397944879714813e-07, "loss": 0.3494, "num_input_tokens_seen": 256537248, "step": 118885 }, { "epoch": 19.39477977161501, "grad_norm": 0.36608433723449707, "learning_rate": 1.394188947319408e-07, "loss": 0.0694, "num_input_tokens_seen": 256547552, "step": 118890 }, { "epoch": 19.395595432300162, "grad_norm": 3.0609917640686035, "learning_rate": 1.3904380532361594e-07, "loss": 0.1147, "num_input_tokens_seen": 256557728, "step": 118895 }, { "epoch": 19.396411092985318, "grad_norm": 0.038104377686977386, "learning_rate": 1.3866921975410897e-07, "loss": 0.0503, "num_input_tokens_seen": 256568800, "step": 118900 }, { "epoch": 19.397226753670473, "grad_norm": 0.06404092907905579, "learning_rate": 1.3829513803100824e-07, "loss": 0.0272, "num_input_tokens_seen": 256577888, "step": 118905 }, { "epoch": 19.39804241435563, "grad_norm": 0.10560498386621475, "learning_rate": 1.3792156016189383e-07, "loss": 0.0094, "num_input_tokens_seen": 256588512, "step": 118910 }, { "epoch": 19.39885807504078, "grad_norm": 0.7280520796775818, "learning_rate": 1.3754848615434023e-07, "loss": 0.0299, "num_input_tokens_seen": 256598400, "step": 118915 }, { "epoch": 19.399673735725937, "grad_norm": 0.3863482177257538, "learning_rate": 1.3717591601590807e-07, "loss": 0.2431, "num_input_tokens_seen": 256608928, "step": 118920 }, { "epoch": 19.400489396411093, "grad_norm": 0.054270293563604355, "learning_rate": 1.3680384975414405e-07, "loss": 0.0522, "num_input_tokens_seen": 256618048, "step": 118925 }, { "epoch": 19.40130505709625, "grad_norm": 0.9043380618095398, "learning_rate": 1.364322873765922e-07, "loss": 0.0239, "num_input_tokens_seen": 256630432, "step": 118930 }, { "epoch": 19.402120717781404, "grad_norm": 1.4198180437088013, "learning_rate": 1.3606122889078254e-07, "loss": 0.229, "num_input_tokens_seen": 256639680, "step": 118935 }, { "epoch": 19.402936378466556, "grad_norm": 1.294343113899231, "learning_rate": 1.356906743042341e-07, "loss": 0.2595, "num_input_tokens_seen": 256650688, "step": 118940 }, { "epoch": 19.403752039151712, "grad_norm": 2.392075300216675, "learning_rate": 1.3532062362445752e-07, "loss": 0.1821, "num_input_tokens_seen": 256660256, "step": 118945 }, { "epoch": 19.404567699836868, "grad_norm": 0.03168617561459541, "learning_rate": 1.3495107685894958e-07, "loss": 0.0102, "num_input_tokens_seen": 256671328, "step": 118950 }, { "epoch": 19.405383360522023, "grad_norm": 0.0378035344183445, "learning_rate": 1.3458203401520432e-07, "loss": 0.1172, "num_input_tokens_seen": 256682816, "step": 118955 }, { "epoch": 19.40619902120718, "grad_norm": 0.6407273411750793, "learning_rate": 1.3421349510069626e-07, "loss": 0.0201, "num_input_tokens_seen": 256694080, "step": 118960 }, { "epoch": 19.40701468189233, "grad_norm": 0.1724127233028412, "learning_rate": 1.3384546012289444e-07, "loss": 0.1489, "num_input_tokens_seen": 256704864, "step": 118965 }, { "epoch": 19.407830342577487, "grad_norm": 0.15590336918830872, "learning_rate": 1.3347792908926238e-07, "loss": 0.1529, "num_input_tokens_seen": 256715136, "step": 118970 }, { "epoch": 19.408646003262643, "grad_norm": 0.05888601392507553, "learning_rate": 1.3311090200724409e-07, "loss": 0.0101, "num_input_tokens_seen": 256725120, "step": 118975 }, { "epoch": 19.4094616639478, "grad_norm": 1.5928025245666504, "learning_rate": 1.327443788842808e-07, "loss": 0.1021, "num_input_tokens_seen": 256734720, "step": 118980 }, { "epoch": 19.410277324632954, "grad_norm": 0.07924876362085342, "learning_rate": 1.323783597277972e-07, "loss": 0.0564, "num_input_tokens_seen": 256745344, "step": 118985 }, { "epoch": 19.411092985318106, "grad_norm": 0.04756234213709831, "learning_rate": 1.3201284454521234e-07, "loss": 0.042, "num_input_tokens_seen": 256757248, "step": 118990 }, { "epoch": 19.411908646003262, "grad_norm": 0.776591956615448, "learning_rate": 1.3164783334393693e-07, "loss": 0.2274, "num_input_tokens_seen": 256768000, "step": 118995 }, { "epoch": 19.412724306688418, "grad_norm": 2.4033446311950684, "learning_rate": 1.3128332613136506e-07, "loss": 0.1112, "num_input_tokens_seen": 256778176, "step": 119000 }, { "epoch": 19.413539967373573, "grad_norm": 0.2561802864074707, "learning_rate": 1.3091932291488252e-07, "loss": 0.01, "num_input_tokens_seen": 256789376, "step": 119005 }, { "epoch": 19.41435562805873, "grad_norm": 1.435275673866272, "learning_rate": 1.3055582370187224e-07, "loss": 0.1356, "num_input_tokens_seen": 256799136, "step": 119010 }, { "epoch": 19.41517128874388, "grad_norm": 1.4772669076919556, "learning_rate": 1.3019282849969506e-07, "loss": 0.092, "num_input_tokens_seen": 256810592, "step": 119015 }, { "epoch": 19.415986949429037, "grad_norm": 0.18888887763023376, "learning_rate": 1.298303373157117e-07, "loss": 0.0304, "num_input_tokens_seen": 256821184, "step": 119020 }, { "epoch": 19.416802610114193, "grad_norm": 0.07852033525705338, "learning_rate": 1.2946835015726356e-07, "loss": 0.1146, "num_input_tokens_seen": 256832160, "step": 119025 }, { "epoch": 19.41761827079935, "grad_norm": 0.7481980323791504, "learning_rate": 1.2910686703169194e-07, "loss": 0.1485, "num_input_tokens_seen": 256841408, "step": 119030 }, { "epoch": 19.418433931484504, "grad_norm": 0.04481193423271179, "learning_rate": 1.287458879463188e-07, "loss": 0.0144, "num_input_tokens_seen": 256853024, "step": 119035 }, { "epoch": 19.419249592169656, "grad_norm": 0.1362031102180481, "learning_rate": 1.2838541290846328e-07, "loss": 0.1568, "num_input_tokens_seen": 256865120, "step": 119040 }, { "epoch": 19.420065252854812, "grad_norm": 0.008101830258965492, "learning_rate": 1.2802544192542788e-07, "loss": 0.021, "num_input_tokens_seen": 256876768, "step": 119045 }, { "epoch": 19.420880913539968, "grad_norm": 0.08112131804227829, "learning_rate": 1.2766597500451227e-07, "loss": 0.0873, "num_input_tokens_seen": 256888544, "step": 119050 }, { "epoch": 19.421696574225123, "grad_norm": 1.2757350206375122, "learning_rate": 1.273070121529968e-07, "loss": 0.1113, "num_input_tokens_seen": 256899168, "step": 119055 }, { "epoch": 19.42251223491028, "grad_norm": 0.07678970694541931, "learning_rate": 1.2694855337815614e-07, "loss": 0.0068, "num_input_tokens_seen": 256908384, "step": 119060 }, { "epoch": 19.42332789559543, "grad_norm": 4.928576469421387, "learning_rate": 1.2659059868725953e-07, "loss": 0.2366, "num_input_tokens_seen": 256920672, "step": 119065 }, { "epoch": 19.424143556280587, "grad_norm": 1.9669609069824219, "learning_rate": 1.262331480875567e-07, "loss": 0.1526, "num_input_tokens_seen": 256932000, "step": 119070 }, { "epoch": 19.424959216965743, "grad_norm": 0.06527654081583023, "learning_rate": 1.2587620158629466e-07, "loss": 0.0658, "num_input_tokens_seen": 256943488, "step": 119075 }, { "epoch": 19.4257748776509, "grad_norm": 0.7269445061683655, "learning_rate": 1.2551975919070648e-07, "loss": 0.1349, "num_input_tokens_seen": 256955904, "step": 119080 }, { "epoch": 19.42659053833605, "grad_norm": 0.19160640239715576, "learning_rate": 1.2516382090801416e-07, "loss": 0.0508, "num_input_tokens_seen": 256967968, "step": 119085 }, { "epoch": 19.427406199021206, "grad_norm": 0.036859311163425446, "learning_rate": 1.2480838674543416e-07, "loss": 0.0762, "num_input_tokens_seen": 256978368, "step": 119090 }, { "epoch": 19.428221859706362, "grad_norm": 1.499250054359436, "learning_rate": 1.2445345671016907e-07, "loss": 0.0681, "num_input_tokens_seen": 256989856, "step": 119095 }, { "epoch": 19.429037520391518, "grad_norm": 0.4310036599636078, "learning_rate": 1.240990308094131e-07, "loss": 0.0186, "num_input_tokens_seen": 257000768, "step": 119100 }, { "epoch": 19.429853181076673, "grad_norm": 0.2925342321395874, "learning_rate": 1.2374510905034388e-07, "loss": 0.0968, "num_input_tokens_seen": 257011840, "step": 119105 }, { "epoch": 19.430668841761825, "grad_norm": 0.9663375616073608, "learning_rate": 1.2339169144013895e-07, "loss": 0.0256, "num_input_tokens_seen": 257022464, "step": 119110 }, { "epoch": 19.43148450244698, "grad_norm": 2.2545299530029297, "learning_rate": 1.2303877798596208e-07, "loss": 0.184, "num_input_tokens_seen": 257032960, "step": 119115 }, { "epoch": 19.432300163132137, "grad_norm": 0.0475943349301815, "learning_rate": 1.226863686949603e-07, "loss": 0.0415, "num_input_tokens_seen": 257043168, "step": 119120 }, { "epoch": 19.433115823817293, "grad_norm": 0.05516574904322624, "learning_rate": 1.223344635742807e-07, "loss": 0.1045, "num_input_tokens_seen": 257053920, "step": 119125 }, { "epoch": 19.43393148450245, "grad_norm": 0.0587860606610775, "learning_rate": 1.2198306263105085e-07, "loss": 0.0576, "num_input_tokens_seen": 257066592, "step": 119130 }, { "epoch": 19.4347471451876, "grad_norm": 0.8524600267410278, "learning_rate": 1.2163216587239568e-07, "loss": 0.0858, "num_input_tokens_seen": 257077024, "step": 119135 }, { "epoch": 19.435562805872756, "grad_norm": 0.23151546716690063, "learning_rate": 1.2128177330542334e-07, "loss": 0.161, "num_input_tokens_seen": 257087456, "step": 119140 }, { "epoch": 19.436378466557912, "grad_norm": 0.07021848112344742, "learning_rate": 1.2093188493723927e-07, "loss": 0.0985, "num_input_tokens_seen": 257097216, "step": 119145 }, { "epoch": 19.437194127243067, "grad_norm": 0.0854434221982956, "learning_rate": 1.205825007749295e-07, "loss": 0.131, "num_input_tokens_seen": 257108192, "step": 119150 }, { "epoch": 19.438009787928223, "grad_norm": 3.5884525775909424, "learning_rate": 1.2023362082557997e-07, "loss": 0.1882, "num_input_tokens_seen": 257118560, "step": 119155 }, { "epoch": 19.438825448613375, "grad_norm": 0.10011127591133118, "learning_rate": 1.1988524509625453e-07, "loss": 0.0582, "num_input_tokens_seen": 257129216, "step": 119160 }, { "epoch": 19.43964110929853, "grad_norm": 0.1422126144170761, "learning_rate": 1.1953737359401973e-07, "loss": 0.0466, "num_input_tokens_seen": 257137760, "step": 119165 }, { "epoch": 19.440456769983687, "grad_norm": 0.11156832426786423, "learning_rate": 1.1919000632592269e-07, "loss": 0.0119, "num_input_tokens_seen": 257147808, "step": 119170 }, { "epoch": 19.441272430668842, "grad_norm": 1.7064207792282104, "learning_rate": 1.1884314329900503e-07, "loss": 0.155, "num_input_tokens_seen": 257159136, "step": 119175 }, { "epoch": 19.442088091353998, "grad_norm": 1.5958431959152222, "learning_rate": 1.1849678452029167e-07, "loss": 0.1548, "num_input_tokens_seen": 257169824, "step": 119180 }, { "epoch": 19.44290375203915, "grad_norm": 0.1429663598537445, "learning_rate": 1.1815092999680754e-07, "loss": 0.0864, "num_input_tokens_seen": 257181504, "step": 119185 }, { "epoch": 19.443719412724306, "grad_norm": 0.5866190195083618, "learning_rate": 1.1780557973555817e-07, "loss": 0.0081, "num_input_tokens_seen": 257191488, "step": 119190 }, { "epoch": 19.44453507340946, "grad_norm": 3.7977192401885986, "learning_rate": 1.1746073374354627e-07, "loss": 0.1495, "num_input_tokens_seen": 257203392, "step": 119195 }, { "epoch": 19.445350734094617, "grad_norm": 0.5695561766624451, "learning_rate": 1.1711639202775793e-07, "loss": 0.1484, "num_input_tokens_seen": 257214656, "step": 119200 }, { "epoch": 19.446166394779773, "grad_norm": 1.432039737701416, "learning_rate": 1.167725545951709e-07, "loss": 0.0381, "num_input_tokens_seen": 257224576, "step": 119205 }, { "epoch": 19.446982055464925, "grad_norm": 0.9316999316215515, "learning_rate": 1.1642922145275459e-07, "loss": 0.0333, "num_input_tokens_seen": 257236288, "step": 119210 }, { "epoch": 19.44779771615008, "grad_norm": 0.329375296831131, "learning_rate": 1.1608639260747012e-07, "loss": 0.2062, "num_input_tokens_seen": 257245952, "step": 119215 }, { "epoch": 19.448613376835237, "grad_norm": 0.23570136725902557, "learning_rate": 1.1574406806625914e-07, "loss": 0.0966, "num_input_tokens_seen": 257257280, "step": 119220 }, { "epoch": 19.449429037520392, "grad_norm": 0.026606852188706398, "learning_rate": 1.1540224783606335e-07, "loss": 0.0676, "num_input_tokens_seen": 257268736, "step": 119225 }, { "epoch": 19.450244698205548, "grad_norm": 0.30926430225372314, "learning_rate": 1.1506093192381052e-07, "loss": 0.0299, "num_input_tokens_seen": 257279872, "step": 119230 }, { "epoch": 19.4510603588907, "grad_norm": 1.4312397241592407, "learning_rate": 1.1472012033641455e-07, "loss": 0.0799, "num_input_tokens_seen": 257290112, "step": 119235 }, { "epoch": 19.451876019575856, "grad_norm": 0.0354197695851326, "learning_rate": 1.1437981308078661e-07, "loss": 0.0076, "num_input_tokens_seen": 257300608, "step": 119240 }, { "epoch": 19.45269168026101, "grad_norm": 0.07851660251617432, "learning_rate": 1.1404001016381837e-07, "loss": 0.0591, "num_input_tokens_seen": 257312320, "step": 119245 }, { "epoch": 19.453507340946167, "grad_norm": 0.1637926548719406, "learning_rate": 1.1370071159240437e-07, "loss": 0.2751, "num_input_tokens_seen": 257322784, "step": 119250 }, { "epoch": 19.454323001631323, "grad_norm": 0.04356047883629799, "learning_rate": 1.1336191737341128e-07, "loss": 0.0737, "num_input_tokens_seen": 257333824, "step": 119255 }, { "epoch": 19.455138662316475, "grad_norm": 1.5420936346054077, "learning_rate": 1.1302362751371143e-07, "loss": 0.0957, "num_input_tokens_seen": 257342976, "step": 119260 }, { "epoch": 19.45595432300163, "grad_norm": 0.39659249782562256, "learning_rate": 1.1268584202016042e-07, "loss": 0.0337, "num_input_tokens_seen": 257355072, "step": 119265 }, { "epoch": 19.456769983686787, "grad_norm": 0.1529998779296875, "learning_rate": 1.1234856089960278e-07, "loss": 0.0754, "num_input_tokens_seen": 257364832, "step": 119270 }, { "epoch": 19.457585644371942, "grad_norm": 1.9006619453430176, "learning_rate": 1.1201178415887193e-07, "loss": 0.0886, "num_input_tokens_seen": 257374976, "step": 119275 }, { "epoch": 19.458401305057095, "grad_norm": 0.0929538756608963, "learning_rate": 1.1167551180479574e-07, "loss": 0.0364, "num_input_tokens_seen": 257386848, "step": 119280 }, { "epoch": 19.45921696574225, "grad_norm": 1.4380558729171753, "learning_rate": 1.1133974384418821e-07, "loss": 0.0549, "num_input_tokens_seen": 257397920, "step": 119285 }, { "epoch": 19.460032626427406, "grad_norm": 0.24200470745563507, "learning_rate": 1.1100448028385502e-07, "loss": 0.0194, "num_input_tokens_seen": 257410048, "step": 119290 }, { "epoch": 19.46084828711256, "grad_norm": 0.2075984627008438, "learning_rate": 1.1066972113058793e-07, "loss": 0.0321, "num_input_tokens_seen": 257421280, "step": 119295 }, { "epoch": 19.461663947797717, "grad_norm": 1.3970859050750732, "learning_rate": 1.1033546639117597e-07, "loss": 0.1358, "num_input_tokens_seen": 257433024, "step": 119300 }, { "epoch": 19.46247960848287, "grad_norm": 2.400787353515625, "learning_rate": 1.1000171607238874e-07, "loss": 0.03, "num_input_tokens_seen": 257444672, "step": 119305 }, { "epoch": 19.463295269168025, "grad_norm": 0.1013457179069519, "learning_rate": 1.0966847018099302e-07, "loss": 0.0484, "num_input_tokens_seen": 257455744, "step": 119310 }, { "epoch": 19.46411092985318, "grad_norm": 1.2576215267181396, "learning_rate": 1.0933572872373899e-07, "loss": 0.1385, "num_input_tokens_seen": 257465920, "step": 119315 }, { "epoch": 19.464926590538337, "grad_norm": 0.6602963805198669, "learning_rate": 1.0900349170737678e-07, "loss": 0.162, "num_input_tokens_seen": 257475392, "step": 119320 }, { "epoch": 19.465742251223492, "grad_norm": 1.732026219367981, "learning_rate": 1.0867175913863159e-07, "loss": 0.0514, "num_input_tokens_seen": 257487808, "step": 119325 }, { "epoch": 19.466557911908644, "grad_norm": 0.1588408201932907, "learning_rate": 1.0834053102423136e-07, "loss": 0.1276, "num_input_tokens_seen": 257498208, "step": 119330 }, { "epoch": 19.4673735725938, "grad_norm": 0.8723531365394592, "learning_rate": 1.0800980737088739e-07, "loss": 0.0498, "num_input_tokens_seen": 257509376, "step": 119335 }, { "epoch": 19.468189233278956, "grad_norm": 0.040985025465488434, "learning_rate": 1.0767958818530265e-07, "loss": 0.0334, "num_input_tokens_seen": 257520032, "step": 119340 }, { "epoch": 19.46900489396411, "grad_norm": 0.5903245806694031, "learning_rate": 1.0734987347416902e-07, "loss": 0.0439, "num_input_tokens_seen": 257531072, "step": 119345 }, { "epoch": 19.469820554649267, "grad_norm": 0.02707243338227272, "learning_rate": 1.0702066324417004e-07, "loss": 0.0196, "num_input_tokens_seen": 257541024, "step": 119350 }, { "epoch": 19.47063621533442, "grad_norm": 2.368941068649292, "learning_rate": 1.0669195750197259e-07, "loss": 0.2664, "num_input_tokens_seen": 257551264, "step": 119355 }, { "epoch": 19.471451876019575, "grad_norm": 2.5522572994232178, "learning_rate": 1.0636375625424632e-07, "loss": 0.0769, "num_input_tokens_seen": 257561248, "step": 119360 }, { "epoch": 19.47226753670473, "grad_norm": 0.12662972509860992, "learning_rate": 1.0603605950763595e-07, "loss": 0.0381, "num_input_tokens_seen": 257572640, "step": 119365 }, { "epoch": 19.473083197389887, "grad_norm": 0.5729917287826538, "learning_rate": 1.0570886726878615e-07, "loss": 0.0111, "num_input_tokens_seen": 257583328, "step": 119370 }, { "epoch": 19.473898858075042, "grad_norm": 2.654414415359497, "learning_rate": 1.0538217954432494e-07, "loss": 0.2919, "num_input_tokens_seen": 257594016, "step": 119375 }, { "epoch": 19.474714518760194, "grad_norm": 2.2059566974639893, "learning_rate": 1.0505599634087759e-07, "loss": 0.1931, "num_input_tokens_seen": 257602464, "step": 119380 }, { "epoch": 19.47553017944535, "grad_norm": 0.05355695262551308, "learning_rate": 1.0473031766504993e-07, "loss": 0.0427, "num_input_tokens_seen": 257612640, "step": 119385 }, { "epoch": 19.476345840130506, "grad_norm": 0.542310357093811, "learning_rate": 1.0440514352344499e-07, "loss": 0.1278, "num_input_tokens_seen": 257623456, "step": 119390 }, { "epoch": 19.47716150081566, "grad_norm": 1.6298433542251587, "learning_rate": 1.0408047392265197e-07, "loss": 0.0713, "num_input_tokens_seen": 257633664, "step": 119395 }, { "epoch": 19.477977161500817, "grad_norm": 0.21142369508743286, "learning_rate": 1.0375630886925169e-07, "loss": 0.2009, "num_input_tokens_seen": 257644832, "step": 119400 }, { "epoch": 19.47879282218597, "grad_norm": 0.4620598554611206, "learning_rate": 1.0343264836981393e-07, "loss": 0.1271, "num_input_tokens_seen": 257655008, "step": 119405 }, { "epoch": 19.479608482871125, "grad_norm": 0.36338233947753906, "learning_rate": 1.0310949243089451e-07, "loss": 0.0184, "num_input_tokens_seen": 257665984, "step": 119410 }, { "epoch": 19.48042414355628, "grad_norm": 0.8657607436180115, "learning_rate": 1.0278684105904657e-07, "loss": 0.0836, "num_input_tokens_seen": 257676384, "step": 119415 }, { "epoch": 19.481239804241437, "grad_norm": 1.5618994235992432, "learning_rate": 1.0246469426080651e-07, "loss": 0.2154, "num_input_tokens_seen": 257686528, "step": 119420 }, { "epoch": 19.482055464926592, "grad_norm": 1.7736674547195435, "learning_rate": 1.0214305204270525e-07, "loss": 0.0576, "num_input_tokens_seen": 257697696, "step": 119425 }, { "epoch": 19.482871125611744, "grad_norm": 0.17729303240776062, "learning_rate": 1.0182191441125976e-07, "loss": 0.0265, "num_input_tokens_seen": 257708960, "step": 119430 }, { "epoch": 19.4836867862969, "grad_norm": 0.15029750764369965, "learning_rate": 1.0150128137297876e-07, "loss": 0.1835, "num_input_tokens_seen": 257718976, "step": 119435 }, { "epoch": 19.484502446982056, "grad_norm": 0.13161614537239075, "learning_rate": 1.0118115293435981e-07, "loss": 0.0251, "num_input_tokens_seen": 257730432, "step": 119440 }, { "epoch": 19.48531810766721, "grad_norm": 1.7866827249526978, "learning_rate": 1.0086152910189217e-07, "loss": 0.2917, "num_input_tokens_seen": 257740960, "step": 119445 }, { "epoch": 19.486133768352367, "grad_norm": 0.24375902116298676, "learning_rate": 1.0054240988205121e-07, "loss": 0.0265, "num_input_tokens_seen": 257752000, "step": 119450 }, { "epoch": 19.48694942903752, "grad_norm": 2.651777744293213, "learning_rate": 1.0022379528130677e-07, "loss": 0.2382, "num_input_tokens_seen": 257763456, "step": 119455 }, { "epoch": 19.487765089722675, "grad_norm": 0.23198071122169495, "learning_rate": 9.990568530611477e-08, "loss": 0.0481, "num_input_tokens_seen": 257774112, "step": 119460 }, { "epoch": 19.48858075040783, "grad_norm": 0.05237904191017151, "learning_rate": 9.958807996292008e-08, "loss": 0.0096, "num_input_tokens_seen": 257784672, "step": 119465 }, { "epoch": 19.489396411092986, "grad_norm": 0.49693751335144043, "learning_rate": 9.927097925816476e-08, "loss": 0.0241, "num_input_tokens_seen": 257794720, "step": 119470 }, { "epoch": 19.49021207177814, "grad_norm": 0.12963776290416718, "learning_rate": 9.895438319826867e-08, "loss": 0.0386, "num_input_tokens_seen": 257806400, "step": 119475 }, { "epoch": 19.491027732463294, "grad_norm": 1.0187046527862549, "learning_rate": 9.863829178965444e-08, "loss": 0.0471, "num_input_tokens_seen": 257816256, "step": 119480 }, { "epoch": 19.49184339314845, "grad_norm": 0.8233152627944946, "learning_rate": 9.832270503872254e-08, "loss": 0.1013, "num_input_tokens_seen": 257828000, "step": 119485 }, { "epoch": 19.492659053833606, "grad_norm": 1.328521490097046, "learning_rate": 9.80076229518706e-08, "loss": 0.0625, "num_input_tokens_seen": 257837312, "step": 119490 }, { "epoch": 19.49347471451876, "grad_norm": 0.03259563073515892, "learning_rate": 9.76930455354852e-08, "loss": 0.2092, "num_input_tokens_seen": 257847552, "step": 119495 }, { "epoch": 19.494290375203914, "grad_norm": 0.14137457311153412, "learning_rate": 9.737897279594177e-08, "loss": 0.23, "num_input_tokens_seen": 257858336, "step": 119500 }, { "epoch": 19.49510603588907, "grad_norm": 0.33901557326316833, "learning_rate": 9.706540473960468e-08, "loss": 0.102, "num_input_tokens_seen": 257868864, "step": 119505 }, { "epoch": 19.495921696574225, "grad_norm": 1.6783708333969116, "learning_rate": 9.675234137282719e-08, "loss": 0.1595, "num_input_tokens_seen": 257878816, "step": 119510 }, { "epoch": 19.49673735725938, "grad_norm": 1.4362720251083374, "learning_rate": 9.643978270195697e-08, "loss": 0.1144, "num_input_tokens_seen": 257889056, "step": 119515 }, { "epoch": 19.497553017944536, "grad_norm": 0.04401761293411255, "learning_rate": 9.612772873332787e-08, "loss": 0.2124, "num_input_tokens_seen": 257899360, "step": 119520 }, { "epoch": 19.49836867862969, "grad_norm": 1.7311687469482422, "learning_rate": 9.581617947325982e-08, "loss": 0.2303, "num_input_tokens_seen": 257911264, "step": 119525 }, { "epoch": 19.499184339314844, "grad_norm": 0.08133582025766373, "learning_rate": 9.550513492807278e-08, "loss": 0.1548, "num_input_tokens_seen": 257922432, "step": 119530 }, { "epoch": 19.5, "grad_norm": 0.465688556432724, "learning_rate": 9.519459510406725e-08, "loss": 0.1032, "num_input_tokens_seen": 257932448, "step": 119535 }, { "epoch": 19.500815660685156, "grad_norm": 0.04498860985040665, "learning_rate": 9.488456000753543e-08, "loss": 0.0872, "num_input_tokens_seen": 257943776, "step": 119540 }, { "epoch": 19.50163132137031, "grad_norm": 0.18820448219776154, "learning_rate": 9.457502964476672e-08, "loss": 0.0227, "num_input_tokens_seen": 257954880, "step": 119545 }, { "epoch": 19.502446982055464, "grad_norm": 0.3115454316139221, "learning_rate": 9.426600402202556e-08, "loss": 0.0129, "num_input_tokens_seen": 257966208, "step": 119550 }, { "epoch": 19.50326264274062, "grad_norm": 0.1564156413078308, "learning_rate": 9.395748314558195e-08, "loss": 0.1019, "num_input_tokens_seen": 257975872, "step": 119555 }, { "epoch": 19.504078303425775, "grad_norm": 0.35302382707595825, "learning_rate": 9.364946702168364e-08, "loss": 0.0178, "num_input_tokens_seen": 257986272, "step": 119560 }, { "epoch": 19.50489396411093, "grad_norm": 0.7322589159011841, "learning_rate": 9.334195565657567e-08, "loss": 0.0179, "num_input_tokens_seen": 257996032, "step": 119565 }, { "epoch": 19.505709624796086, "grad_norm": 2.420563220977783, "learning_rate": 9.303494905648913e-08, "loss": 0.0483, "num_input_tokens_seen": 258006976, "step": 119570 }, { "epoch": 19.50652528548124, "grad_norm": 1.0729188919067383, "learning_rate": 9.272844722764406e-08, "loss": 0.1227, "num_input_tokens_seen": 258017152, "step": 119575 }, { "epoch": 19.507340946166394, "grad_norm": 2.1829326152801514, "learning_rate": 9.242245017625772e-08, "loss": 0.1166, "num_input_tokens_seen": 258027232, "step": 119580 }, { "epoch": 19.50815660685155, "grad_norm": 1.8223541975021362, "learning_rate": 9.211695790852515e-08, "loss": 0.1215, "num_input_tokens_seen": 258038560, "step": 119585 }, { "epoch": 19.508972267536706, "grad_norm": 0.13003158569335938, "learning_rate": 9.181197043064138e-08, "loss": 0.0208, "num_input_tokens_seen": 258049984, "step": 119590 }, { "epoch": 19.50978792822186, "grad_norm": 0.5162330865859985, "learning_rate": 9.150748774878759e-08, "loss": 0.0199, "num_input_tokens_seen": 258059680, "step": 119595 }, { "epoch": 19.510603588907014, "grad_norm": 1.9357539415359497, "learning_rate": 9.120350986913106e-08, "loss": 0.0853, "num_input_tokens_seen": 258071136, "step": 119600 }, { "epoch": 19.51141924959217, "grad_norm": 0.024063246324658394, "learning_rate": 9.090003679783632e-08, "loss": 0.0832, "num_input_tokens_seen": 258081312, "step": 119605 }, { "epoch": 19.512234910277325, "grad_norm": 2.9220385551452637, "learning_rate": 9.059706854105121e-08, "loss": 0.0815, "num_input_tokens_seen": 258092384, "step": 119610 }, { "epoch": 19.51305057096248, "grad_norm": 1.0220680236816406, "learning_rate": 9.029460510491527e-08, "loss": 0.0202, "num_input_tokens_seen": 258103328, "step": 119615 }, { "epoch": 19.513866231647633, "grad_norm": 0.7271577715873718, "learning_rate": 8.999264649555971e-08, "loss": 0.0533, "num_input_tokens_seen": 258114464, "step": 119620 }, { "epoch": 19.51468189233279, "grad_norm": 0.4366215765476227, "learning_rate": 8.969119271910465e-08, "loss": 0.0856, "num_input_tokens_seen": 258124992, "step": 119625 }, { "epoch": 19.515497553017944, "grad_norm": 0.21350732445716858, "learning_rate": 8.939024378165627e-08, "loss": 0.0484, "num_input_tokens_seen": 258135904, "step": 119630 }, { "epoch": 19.5163132137031, "grad_norm": 2.4199373722076416, "learning_rate": 8.908979968931807e-08, "loss": 0.0937, "num_input_tokens_seen": 258146848, "step": 119635 }, { "epoch": 19.517128874388256, "grad_norm": 0.36092904210090637, "learning_rate": 8.878986044817683e-08, "loss": 0.031, "num_input_tokens_seen": 258157920, "step": 119640 }, { "epoch": 19.517944535073408, "grad_norm": 0.5683991312980652, "learning_rate": 8.849042606431102e-08, "loss": 0.1398, "num_input_tokens_seen": 258169280, "step": 119645 }, { "epoch": 19.518760195758563, "grad_norm": 1.5321012735366821, "learning_rate": 8.8191496543788e-08, "loss": 0.2114, "num_input_tokens_seen": 258179488, "step": 119650 }, { "epoch": 19.51957585644372, "grad_norm": 0.6580101251602173, "learning_rate": 8.789307189266682e-08, "loss": 0.029, "num_input_tokens_seen": 258190272, "step": 119655 }, { "epoch": 19.520391517128875, "grad_norm": 0.07762105762958527, "learning_rate": 8.75951521169982e-08, "loss": 0.0519, "num_input_tokens_seen": 258201280, "step": 119660 }, { "epoch": 19.52120717781403, "grad_norm": 0.6161643862724304, "learning_rate": 8.72977372228162e-08, "loss": 0.0366, "num_input_tokens_seen": 258211424, "step": 119665 }, { "epoch": 19.522022838499183, "grad_norm": 2.324378728866577, "learning_rate": 8.700082721614933e-08, "loss": 0.0731, "num_input_tokens_seen": 258222688, "step": 119670 }, { "epoch": 19.52283849918434, "grad_norm": 2.0209484100341797, "learning_rate": 8.6704422103015e-08, "loss": 0.1626, "num_input_tokens_seen": 258232256, "step": 119675 }, { "epoch": 19.523654159869494, "grad_norm": 0.7043356895446777, "learning_rate": 8.640852188942227e-08, "loss": 0.1063, "num_input_tokens_seen": 258243936, "step": 119680 }, { "epoch": 19.52446982055465, "grad_norm": 1.268171787261963, "learning_rate": 8.611312658136361e-08, "loss": 0.1344, "num_input_tokens_seen": 258252480, "step": 119685 }, { "epoch": 19.525285481239806, "grad_norm": 0.1699720174074173, "learning_rate": 8.581823618482865e-08, "loss": 0.2358, "num_input_tokens_seen": 258262976, "step": 119690 }, { "epoch": 19.526101141924958, "grad_norm": 0.0768313929438591, "learning_rate": 8.552385070579316e-08, "loss": 0.0127, "num_input_tokens_seen": 258273312, "step": 119695 }, { "epoch": 19.526916802610113, "grad_norm": 0.10217474400997162, "learning_rate": 8.52299701502246e-08, "loss": 0.0191, "num_input_tokens_seen": 258284384, "step": 119700 }, { "epoch": 19.52773246329527, "grad_norm": 1.89658784866333, "learning_rate": 8.493659452407376e-08, "loss": 0.0933, "num_input_tokens_seen": 258295936, "step": 119705 }, { "epoch": 19.528548123980425, "grad_norm": 0.07415401935577393, "learning_rate": 8.464372383329422e-08, "loss": 0.1355, "num_input_tokens_seen": 258307392, "step": 119710 }, { "epoch": 19.52936378466558, "grad_norm": 1.527915596961975, "learning_rate": 8.435135808381456e-08, "loss": 0.4149, "num_input_tokens_seen": 258319936, "step": 119715 }, { "epoch": 19.530179445350733, "grad_norm": 0.026699548587203026, "learning_rate": 8.40594972815606e-08, "loss": 0.0139, "num_input_tokens_seen": 258331168, "step": 119720 }, { "epoch": 19.53099510603589, "grad_norm": 0.1515720933675766, "learning_rate": 8.37681414324526e-08, "loss": 0.0846, "num_input_tokens_seen": 258341472, "step": 119725 }, { "epoch": 19.531810766721044, "grad_norm": 0.40841004252433777, "learning_rate": 8.347729054238862e-08, "loss": 0.0561, "num_input_tokens_seen": 258353504, "step": 119730 }, { "epoch": 19.5326264274062, "grad_norm": 2.357335329055786, "learning_rate": 8.31869446172695e-08, "loss": 0.1677, "num_input_tokens_seen": 258364992, "step": 119735 }, { "epoch": 19.533442088091356, "grad_norm": 0.17649568617343903, "learning_rate": 8.289710366297387e-08, "loss": 0.0425, "num_input_tokens_seen": 258375520, "step": 119740 }, { "epoch": 19.534257748776508, "grad_norm": 1.2314120531082153, "learning_rate": 8.260776768537759e-08, "loss": 0.0633, "num_input_tokens_seen": 258384992, "step": 119745 }, { "epoch": 19.535073409461663, "grad_norm": 0.5294865965843201, "learning_rate": 8.231893669034541e-08, "loss": 0.0528, "num_input_tokens_seen": 258395648, "step": 119750 }, { "epoch": 19.53588907014682, "grad_norm": 0.2608337998390198, "learning_rate": 8.203061068373097e-08, "loss": 0.013, "num_input_tokens_seen": 258405248, "step": 119755 }, { "epoch": 19.536704730831975, "grad_norm": 0.052452851086854935, "learning_rate": 8.174278967137406e-08, "loss": 0.1596, "num_input_tokens_seen": 258416192, "step": 119760 }, { "epoch": 19.53752039151713, "grad_norm": 0.2409874051809311, "learning_rate": 8.145547365911443e-08, "loss": 0.1039, "num_input_tokens_seen": 258426272, "step": 119765 }, { "epoch": 19.538336052202283, "grad_norm": 0.02415737695991993, "learning_rate": 8.116866265276968e-08, "loss": 0.0416, "num_input_tokens_seen": 258437152, "step": 119770 }, { "epoch": 19.53915171288744, "grad_norm": 0.07877097278833389, "learning_rate": 8.088235665815458e-08, "loss": 0.1221, "num_input_tokens_seen": 258447776, "step": 119775 }, { "epoch": 19.539967373572594, "grad_norm": 0.03986343368887901, "learning_rate": 8.059655568106727e-08, "loss": 0.0195, "num_input_tokens_seen": 258458400, "step": 119780 }, { "epoch": 19.54078303425775, "grad_norm": 0.16265030205249786, "learning_rate": 8.031125972730869e-08, "loss": 0.1184, "num_input_tokens_seen": 258467776, "step": 119785 }, { "epoch": 19.541598694942905, "grad_norm": 0.08554327487945557, "learning_rate": 8.002646880265196e-08, "loss": 0.0221, "num_input_tokens_seen": 258477312, "step": 119790 }, { "epoch": 19.542414355628058, "grad_norm": 0.10014414042234421, "learning_rate": 7.974218291287306e-08, "loss": 0.0882, "num_input_tokens_seen": 258487392, "step": 119795 }, { "epoch": 19.543230016313213, "grad_norm": 0.10500039160251617, "learning_rate": 7.945840206373123e-08, "loss": 0.1391, "num_input_tokens_seen": 258497568, "step": 119800 }, { "epoch": 19.54404567699837, "grad_norm": 0.17500385642051697, "learning_rate": 7.917512626098022e-08, "loss": 0.1313, "num_input_tokens_seen": 258507808, "step": 119805 }, { "epoch": 19.544861337683525, "grad_norm": 1.6689218282699585, "learning_rate": 7.889235551035712e-08, "loss": 0.2145, "num_input_tokens_seen": 258519104, "step": 119810 }, { "epoch": 19.545676998368677, "grad_norm": 0.11798031628131866, "learning_rate": 7.861008981759622e-08, "loss": 0.1289, "num_input_tokens_seen": 258529824, "step": 119815 }, { "epoch": 19.546492659053833, "grad_norm": 0.1652778536081314, "learning_rate": 7.832832918841793e-08, "loss": 0.0926, "num_input_tokens_seen": 258541184, "step": 119820 }, { "epoch": 19.54730831973899, "grad_norm": 1.920267939567566, "learning_rate": 7.804707362853158e-08, "loss": 0.0487, "num_input_tokens_seen": 258552672, "step": 119825 }, { "epoch": 19.548123980424144, "grad_norm": 0.09829552471637726, "learning_rate": 7.776632314363542e-08, "loss": 0.0103, "num_input_tokens_seen": 258564032, "step": 119830 }, { "epoch": 19.5489396411093, "grad_norm": 2.052924871444702, "learning_rate": 7.748607773942207e-08, "loss": 0.1528, "num_input_tokens_seen": 258575296, "step": 119835 }, { "epoch": 19.549755301794452, "grad_norm": 0.03292516618967056, "learning_rate": 7.720633742157035e-08, "loss": 0.1186, "num_input_tokens_seen": 258585568, "step": 119840 }, { "epoch": 19.550570962479608, "grad_norm": 1.4110496044158936, "learning_rate": 7.692710219574795e-08, "loss": 0.2935, "num_input_tokens_seen": 258596224, "step": 119845 }, { "epoch": 19.551386623164763, "grad_norm": 0.31068214774131775, "learning_rate": 7.664837206761422e-08, "loss": 0.1335, "num_input_tokens_seen": 258606304, "step": 119850 }, { "epoch": 19.55220228384992, "grad_norm": 0.025979774072766304, "learning_rate": 7.63701470428202e-08, "loss": 0.0771, "num_input_tokens_seen": 258616960, "step": 119855 }, { "epoch": 19.553017944535075, "grad_norm": 0.018471576273441315, "learning_rate": 7.609242712700304e-08, "loss": 0.0794, "num_input_tokens_seen": 258628672, "step": 119860 }, { "epoch": 19.553833605220227, "grad_norm": 1.7365739345550537, "learning_rate": 7.581521232578881e-08, "loss": 0.1819, "num_input_tokens_seen": 258640192, "step": 119865 }, { "epoch": 19.554649265905383, "grad_norm": 0.10067118704319, "learning_rate": 7.553850264480078e-08, "loss": 0.0688, "num_input_tokens_seen": 258652288, "step": 119870 }, { "epoch": 19.55546492659054, "grad_norm": 0.10834342986345291, "learning_rate": 7.52622980896428e-08, "loss": 0.043, "num_input_tokens_seen": 258662880, "step": 119875 }, { "epoch": 19.556280587275694, "grad_norm": 0.1021018773317337, "learning_rate": 7.498659866591318e-08, "loss": 0.1014, "num_input_tokens_seen": 258674112, "step": 119880 }, { "epoch": 19.55709624796085, "grad_norm": 0.9926788210868835, "learning_rate": 7.47114043791991e-08, "loss": 0.0895, "num_input_tokens_seen": 258685152, "step": 119885 }, { "epoch": 19.557911908646002, "grad_norm": 1.6372054815292358, "learning_rate": 7.443671523508222e-08, "loss": 0.1689, "num_input_tokens_seen": 258696288, "step": 119890 }, { "epoch": 19.558727569331158, "grad_norm": 0.04712746664881706, "learning_rate": 7.416253123912197e-08, "loss": 0.2132, "num_input_tokens_seen": 258707040, "step": 119895 }, { "epoch": 19.559543230016313, "grad_norm": 0.7426547408103943, "learning_rate": 7.38888523968806e-08, "loss": 0.1265, "num_input_tokens_seen": 258717984, "step": 119900 }, { "epoch": 19.56035889070147, "grad_norm": 1.1194124221801758, "learning_rate": 7.361567871390085e-08, "loss": 0.2636, "num_input_tokens_seen": 258728960, "step": 119905 }, { "epoch": 19.561174551386625, "grad_norm": 1.732566475868225, "learning_rate": 7.334301019572276e-08, "loss": 0.1028, "num_input_tokens_seen": 258739648, "step": 119910 }, { "epoch": 19.561990212071777, "grad_norm": 0.03693385422229767, "learning_rate": 7.307084684786691e-08, "loss": 0.012, "num_input_tokens_seen": 258750560, "step": 119915 }, { "epoch": 19.562805872756933, "grad_norm": 1.1387377977371216, "learning_rate": 7.279918867585666e-08, "loss": 0.0371, "num_input_tokens_seen": 258760608, "step": 119920 }, { "epoch": 19.563621533442088, "grad_norm": 0.13713467121124268, "learning_rate": 7.25280356851904e-08, "loss": 0.0637, "num_input_tokens_seen": 258772736, "step": 119925 }, { "epoch": 19.564437194127244, "grad_norm": 0.12934327125549316, "learning_rate": 7.225738788136649e-08, "loss": 0.0092, "num_input_tokens_seen": 258783008, "step": 119930 }, { "epoch": 19.5652528548124, "grad_norm": 1.1205830574035645, "learning_rate": 7.198724526986945e-08, "loss": 0.0635, "num_input_tokens_seen": 258793728, "step": 119935 }, { "epoch": 19.56606851549755, "grad_norm": 0.02926565147936344, "learning_rate": 7.171760785617543e-08, "loss": 0.0321, "num_input_tokens_seen": 258804416, "step": 119940 }, { "epoch": 19.566884176182707, "grad_norm": 0.39052051305770874, "learning_rate": 7.144847564574675e-08, "loss": 0.0968, "num_input_tokens_seen": 258816320, "step": 119945 }, { "epoch": 19.567699836867863, "grad_norm": 2.6485910415649414, "learning_rate": 7.117984864404015e-08, "loss": 0.1192, "num_input_tokens_seen": 258827616, "step": 119950 }, { "epoch": 19.56851549755302, "grad_norm": 0.4841826260089874, "learning_rate": 7.091172685649849e-08, "loss": 0.0161, "num_input_tokens_seen": 258839456, "step": 119955 }, { "epoch": 19.569331158238175, "grad_norm": 0.5771685242652893, "learning_rate": 7.064411028855356e-08, "loss": 0.016, "num_input_tokens_seen": 258850240, "step": 119960 }, { "epoch": 19.570146818923327, "grad_norm": 1.6424686908721924, "learning_rate": 7.037699894563154e-08, "loss": 0.1037, "num_input_tokens_seen": 258861440, "step": 119965 }, { "epoch": 19.570962479608482, "grad_norm": 0.166341170668602, "learning_rate": 7.011039283314758e-08, "loss": 0.0469, "num_input_tokens_seen": 258871520, "step": 119970 }, { "epoch": 19.571778140293638, "grad_norm": 0.9719927310943604, "learning_rate": 6.984429195650011e-08, "loss": 0.0227, "num_input_tokens_seen": 258882368, "step": 119975 }, { "epoch": 19.572593800978794, "grad_norm": 0.28381040692329407, "learning_rate": 6.957869632108482e-08, "loss": 0.1641, "num_input_tokens_seen": 258893152, "step": 119980 }, { "epoch": 19.57340946166395, "grad_norm": 0.07611414045095444, "learning_rate": 6.931360593228354e-08, "loss": 0.1424, "num_input_tokens_seen": 258903648, "step": 119985 }, { "epoch": 19.5742251223491, "grad_norm": 0.07825674116611481, "learning_rate": 6.904902079546694e-08, "loss": 0.016, "num_input_tokens_seen": 258914432, "step": 119990 }, { "epoch": 19.575040783034257, "grad_norm": 1.8679873943328857, "learning_rate": 6.878494091600018e-08, "loss": 0.1973, "num_input_tokens_seen": 258924736, "step": 119995 }, { "epoch": 19.575856443719413, "grad_norm": 0.9330843091011047, "learning_rate": 6.852136629923734e-08, "loss": 0.0335, "num_input_tokens_seen": 258935776, "step": 120000 }, { "epoch": 19.57667210440457, "grad_norm": 0.03843369334936142, "learning_rate": 6.825829695051301e-08, "loss": 0.0351, "num_input_tokens_seen": 258948480, "step": 120005 }, { "epoch": 19.57748776508972, "grad_norm": 0.40421146154403687, "learning_rate": 6.799573287516182e-08, "loss": 0.0166, "num_input_tokens_seen": 258959712, "step": 120010 }, { "epoch": 19.578303425774877, "grad_norm": 0.2580637037754059, "learning_rate": 6.773367407851005e-08, "loss": 0.0236, "num_input_tokens_seen": 258970432, "step": 120015 }, { "epoch": 19.579119086460032, "grad_norm": 0.23593072593212128, "learning_rate": 6.747212056585906e-08, "loss": 0.0582, "num_input_tokens_seen": 258981792, "step": 120020 }, { "epoch": 19.579934747145188, "grad_norm": 0.24649657309055328, "learning_rate": 6.721107234251845e-08, "loss": 0.1068, "num_input_tokens_seen": 258992192, "step": 120025 }, { "epoch": 19.580750407830344, "grad_norm": 2.2923948764801025, "learning_rate": 6.69505294137729e-08, "loss": 0.3241, "num_input_tokens_seen": 259003168, "step": 120030 }, { "epoch": 19.581566068515496, "grad_norm": 0.08559122681617737, "learning_rate": 6.669049178490706e-08, "loss": 0.0553, "num_input_tokens_seen": 259013856, "step": 120035 }, { "epoch": 19.58238172920065, "grad_norm": 0.05365551635622978, "learning_rate": 6.643095946118616e-08, "loss": 0.1978, "num_input_tokens_seen": 259024960, "step": 120040 }, { "epoch": 19.583197389885807, "grad_norm": 1.3277339935302734, "learning_rate": 6.617193244787546e-08, "loss": 0.1956, "num_input_tokens_seen": 259036096, "step": 120045 }, { "epoch": 19.584013050570963, "grad_norm": 0.24198035895824432, "learning_rate": 6.591341075021795e-08, "loss": 0.0726, "num_input_tokens_seen": 259046432, "step": 120050 }, { "epoch": 19.58482871125612, "grad_norm": 1.4237282276153564, "learning_rate": 6.56553943734567e-08, "loss": 0.1448, "num_input_tokens_seen": 259056640, "step": 120055 }, { "epoch": 19.58564437194127, "grad_norm": 0.041378509253263474, "learning_rate": 6.539788332282081e-08, "loss": 0.0845, "num_input_tokens_seen": 259068800, "step": 120060 }, { "epoch": 19.586460032626427, "grad_norm": 2.234380006790161, "learning_rate": 6.514087760353116e-08, "loss": 0.1468, "num_input_tokens_seen": 259080672, "step": 120065 }, { "epoch": 19.587275693311582, "grad_norm": 2.384345531463623, "learning_rate": 6.488437722078911e-08, "loss": 0.1541, "num_input_tokens_seen": 259091968, "step": 120070 }, { "epoch": 19.588091353996738, "grad_norm": 1.0951443910598755, "learning_rate": 6.46283821798016e-08, "loss": 0.0919, "num_input_tokens_seen": 259103328, "step": 120075 }, { "epoch": 19.588907014681894, "grad_norm": 0.023917069658637047, "learning_rate": 6.437289248575063e-08, "loss": 0.0566, "num_input_tokens_seen": 259114528, "step": 120080 }, { "epoch": 19.589722675367046, "grad_norm": 1.4490548372268677, "learning_rate": 6.411790814381536e-08, "loss": 0.1036, "num_input_tokens_seen": 259125664, "step": 120085 }, { "epoch": 19.5905383360522, "grad_norm": 0.2449958622455597, "learning_rate": 6.386342915916665e-08, "loss": 0.0282, "num_input_tokens_seen": 259136768, "step": 120090 }, { "epoch": 19.591353996737357, "grad_norm": 2.3599064350128174, "learning_rate": 6.360945553695596e-08, "loss": 0.0989, "num_input_tokens_seen": 259148064, "step": 120095 }, { "epoch": 19.592169657422513, "grad_norm": 0.5050599575042725, "learning_rate": 6.335598728233749e-08, "loss": 0.0375, "num_input_tokens_seen": 259158144, "step": 120100 }, { "epoch": 19.59298531810767, "grad_norm": 1.933156132698059, "learning_rate": 6.310302440044046e-08, "loss": 0.059, "num_input_tokens_seen": 259169568, "step": 120105 }, { "epoch": 19.59380097879282, "grad_norm": 1.9712563753128052, "learning_rate": 6.285056689639968e-08, "loss": 0.2424, "num_input_tokens_seen": 259178400, "step": 120110 }, { "epoch": 19.594616639477977, "grad_norm": 0.6254565119743347, "learning_rate": 6.259861477532492e-08, "loss": 0.1165, "num_input_tokens_seen": 259189696, "step": 120115 }, { "epoch": 19.595432300163132, "grad_norm": 1.9789986610412598, "learning_rate": 6.234716804232322e-08, "loss": 0.1721, "num_input_tokens_seen": 259201024, "step": 120120 }, { "epoch": 19.596247960848288, "grad_norm": 0.036184825003147125, "learning_rate": 6.209622670249326e-08, "loss": 0.0535, "num_input_tokens_seen": 259210944, "step": 120125 }, { "epoch": 19.597063621533444, "grad_norm": 0.2881450653076172, "learning_rate": 6.18457907609199e-08, "loss": 0.1426, "num_input_tokens_seen": 259221888, "step": 120130 }, { "epoch": 19.597879282218596, "grad_norm": 0.07808113098144531, "learning_rate": 6.159586022267682e-08, "loss": 0.206, "num_input_tokens_seen": 259232992, "step": 120135 }, { "epoch": 19.59869494290375, "grad_norm": 0.5445345044136047, "learning_rate": 6.134643509282945e-08, "loss": 0.0366, "num_input_tokens_seen": 259242912, "step": 120140 }, { "epoch": 19.599510603588907, "grad_norm": 2.5222628116607666, "learning_rate": 6.109751537643482e-08, "loss": 0.3726, "num_input_tokens_seen": 259253568, "step": 120145 }, { "epoch": 19.600326264274063, "grad_norm": 0.08651773631572723, "learning_rate": 6.084910107853614e-08, "loss": 0.027, "num_input_tokens_seen": 259264640, "step": 120150 }, { "epoch": 19.601141924959215, "grad_norm": 0.43082666397094727, "learning_rate": 6.060119220416826e-08, "loss": 0.0733, "num_input_tokens_seen": 259276448, "step": 120155 }, { "epoch": 19.60195758564437, "grad_norm": 0.8170081973075867, "learning_rate": 6.035378875835496e-08, "loss": 0.1926, "num_input_tokens_seen": 259288064, "step": 120160 }, { "epoch": 19.602773246329527, "grad_norm": 0.027276024222373962, "learning_rate": 6.010689074610887e-08, "loss": 0.0445, "num_input_tokens_seen": 259298880, "step": 120165 }, { "epoch": 19.603588907014682, "grad_norm": 0.0718470886349678, "learning_rate": 5.98604981724371e-08, "loss": 0.0195, "num_input_tokens_seen": 259309888, "step": 120170 }, { "epoch": 19.604404567699838, "grad_norm": 0.04763038828969002, "learning_rate": 5.961461104233013e-08, "loss": 0.0084, "num_input_tokens_seen": 259320288, "step": 120175 }, { "epoch": 19.605220228384994, "grad_norm": 1.931679129600525, "learning_rate": 5.936922936077283e-08, "loss": 0.0862, "num_input_tokens_seen": 259329888, "step": 120180 }, { "epoch": 19.606035889070146, "grad_norm": 0.7456988096237183, "learning_rate": 5.9124353132736234e-08, "loss": 0.0467, "num_input_tokens_seen": 259342176, "step": 120185 }, { "epoch": 19.6068515497553, "grad_norm": 0.1234646812081337, "learning_rate": 5.8879982363185813e-08, "loss": 0.1005, "num_input_tokens_seen": 259352768, "step": 120190 }, { "epoch": 19.607667210440457, "grad_norm": 0.4020390808582306, "learning_rate": 5.86361170570704e-08, "loss": 0.1061, "num_input_tokens_seen": 259363552, "step": 120195 }, { "epoch": 19.608482871125613, "grad_norm": 2.2056314945220947, "learning_rate": 5.8392757219336016e-08, "loss": 0.1775, "num_input_tokens_seen": 259374688, "step": 120200 }, { "epoch": 19.609298531810765, "grad_norm": 0.10093231499195099, "learning_rate": 5.814990285491484e-08, "loss": 0.0087, "num_input_tokens_seen": 259385856, "step": 120205 }, { "epoch": 19.61011419249592, "grad_norm": 0.36843234300613403, "learning_rate": 5.7907553968725156e-08, "loss": 0.1719, "num_input_tokens_seen": 259396640, "step": 120210 }, { "epoch": 19.610929853181077, "grad_norm": 0.9335181713104248, "learning_rate": 5.7665710565679706e-08, "loss": 0.2613, "num_input_tokens_seen": 259407904, "step": 120215 }, { "epoch": 19.611745513866232, "grad_norm": 0.5112674832344055, "learning_rate": 5.742437265068013e-08, "loss": 0.1776, "num_input_tokens_seen": 259419968, "step": 120220 }, { "epoch": 19.612561174551388, "grad_norm": 0.7007908225059509, "learning_rate": 5.7183540228616935e-08, "loss": 0.1111, "num_input_tokens_seen": 259430272, "step": 120225 }, { "epoch": 19.61337683523654, "grad_norm": 0.47261151671409607, "learning_rate": 5.694321330437513e-08, "loss": 0.0874, "num_input_tokens_seen": 259441824, "step": 120230 }, { "epoch": 19.614192495921696, "grad_norm": 0.10070919990539551, "learning_rate": 5.670339188281748e-08, "loss": 0.2424, "num_input_tokens_seen": 259450848, "step": 120235 }, { "epoch": 19.61500815660685, "grad_norm": 2.2680530548095703, "learning_rate": 5.646407596880954e-08, "loss": 0.2377, "num_input_tokens_seen": 259462912, "step": 120240 }, { "epoch": 19.615823817292007, "grad_norm": 0.11001936346292496, "learning_rate": 5.622526556720298e-08, "loss": 0.1076, "num_input_tokens_seen": 259473536, "step": 120245 }, { "epoch": 19.616639477977163, "grad_norm": 0.0924912840127945, "learning_rate": 5.598696068283282e-08, "loss": 0.1925, "num_input_tokens_seen": 259483744, "step": 120250 }, { "epoch": 19.617455138662315, "grad_norm": 0.04165790602564812, "learning_rate": 5.5749161320528544e-08, "loss": 0.0382, "num_input_tokens_seen": 259494752, "step": 120255 }, { "epoch": 19.61827079934747, "grad_norm": 1.9146945476531982, "learning_rate": 5.5511867485114055e-08, "loss": 0.0392, "num_input_tokens_seen": 259506240, "step": 120260 }, { "epoch": 19.619086460032626, "grad_norm": 0.03059176355600357, "learning_rate": 5.5275079181396624e-08, "loss": 0.1563, "num_input_tokens_seen": 259517536, "step": 120265 }, { "epoch": 19.619902120717782, "grad_norm": 0.03586047887802124, "learning_rate": 5.5038796414172425e-08, "loss": 0.1124, "num_input_tokens_seen": 259529024, "step": 120270 }, { "epoch": 19.620717781402938, "grad_norm": 3.1104605197906494, "learning_rate": 5.480301918823205e-08, "loss": 0.0985, "num_input_tokens_seen": 259539936, "step": 120275 }, { "epoch": 19.62153344208809, "grad_norm": 1.2260899543762207, "learning_rate": 5.456774750835503e-08, "loss": 0.0232, "num_input_tokens_seen": 259551456, "step": 120280 }, { "epoch": 19.622349102773246, "grad_norm": 0.04842658340930939, "learning_rate": 5.433298137930698e-08, "loss": 0.1056, "num_input_tokens_seen": 259563328, "step": 120285 }, { "epoch": 19.6231647634584, "grad_norm": 0.05441288650035858, "learning_rate": 5.409872080584799e-08, "loss": 0.0142, "num_input_tokens_seen": 259573216, "step": 120290 }, { "epoch": 19.623980424143557, "grad_norm": 1.8358922004699707, "learning_rate": 5.386496579272149e-08, "loss": 0.1008, "num_input_tokens_seen": 259583776, "step": 120295 }, { "epoch": 19.624796084828713, "grad_norm": 2.0598082542419434, "learning_rate": 5.3631716344670904e-08, "loss": 0.1432, "num_input_tokens_seen": 259594560, "step": 120300 }, { "epoch": 19.625611745513865, "grad_norm": 2.6908276081085205, "learning_rate": 5.3398972466420226e-08, "loss": 0.0675, "num_input_tokens_seen": 259604512, "step": 120305 }, { "epoch": 19.62642740619902, "grad_norm": 0.38753700256347656, "learning_rate": 5.316673416268514e-08, "loss": 0.0327, "num_input_tokens_seen": 259614848, "step": 120310 }, { "epoch": 19.627243066884176, "grad_norm": 0.32096654176712036, "learning_rate": 5.2935001438172983e-08, "loss": 0.1328, "num_input_tokens_seen": 259624800, "step": 120315 }, { "epoch": 19.628058727569332, "grad_norm": 0.3269042372703552, "learning_rate": 5.270377429758277e-08, "loss": 0.0216, "num_input_tokens_seen": 259635200, "step": 120320 }, { "epoch": 19.628874388254488, "grad_norm": 0.10111971944570541, "learning_rate": 5.247305274559689e-08, "loss": 0.1308, "num_input_tokens_seen": 259646880, "step": 120325 }, { "epoch": 19.62969004893964, "grad_norm": 0.05089841037988663, "learning_rate": 5.2242836786892126e-08, "loss": 0.0444, "num_input_tokens_seen": 259658048, "step": 120330 }, { "epoch": 19.630505709624796, "grad_norm": 0.05732046812772751, "learning_rate": 5.201312642613698e-08, "loss": 0.1741, "num_input_tokens_seen": 259668576, "step": 120335 }, { "epoch": 19.63132137030995, "grad_norm": 2.926544427871704, "learning_rate": 5.178392166798329e-08, "loss": 0.0474, "num_input_tokens_seen": 259679392, "step": 120340 }, { "epoch": 19.632137030995107, "grad_norm": 0.16008177399635315, "learning_rate": 5.155522251707734e-08, "loss": 0.0576, "num_input_tokens_seen": 259689440, "step": 120345 }, { "epoch": 19.63295269168026, "grad_norm": 0.07934984564781189, "learning_rate": 5.13270289780543e-08, "loss": 0.0257, "num_input_tokens_seen": 259698784, "step": 120350 }, { "epoch": 19.633768352365415, "grad_norm": 0.659115731716156, "learning_rate": 5.109934105554104e-08, "loss": 0.1361, "num_input_tokens_seen": 259710304, "step": 120355 }, { "epoch": 19.63458401305057, "grad_norm": 0.1076749786734581, "learning_rate": 5.087215875414497e-08, "loss": 0.0249, "num_input_tokens_seen": 259720992, "step": 120360 }, { "epoch": 19.635399673735726, "grad_norm": 0.061976898461580276, "learning_rate": 5.064548207847908e-08, "loss": 0.0141, "num_input_tokens_seen": 259731712, "step": 120365 }, { "epoch": 19.636215334420882, "grad_norm": 0.030530283227562904, "learning_rate": 5.0419311033131356e-08, "loss": 0.0536, "num_input_tokens_seen": 259741600, "step": 120370 }, { "epoch": 19.637030995106034, "grad_norm": 0.09506402164697647, "learning_rate": 5.019364562268702e-08, "loss": 0.0303, "num_input_tokens_seen": 259752608, "step": 120375 }, { "epoch": 19.63784665579119, "grad_norm": 0.3871196210384369, "learning_rate": 4.996848585171743e-08, "loss": 0.1271, "num_input_tokens_seen": 259762176, "step": 120380 }, { "epoch": 19.638662316476346, "grad_norm": 0.8489508032798767, "learning_rate": 4.974383172479113e-08, "loss": 0.0184, "num_input_tokens_seen": 259772928, "step": 120385 }, { "epoch": 19.6394779771615, "grad_norm": 0.8884273171424866, "learning_rate": 4.9519683246454504e-08, "loss": 0.1122, "num_input_tokens_seen": 259783808, "step": 120390 }, { "epoch": 19.640293637846657, "grad_norm": 0.3708791732788086, "learning_rate": 4.929604042125668e-08, "loss": 0.155, "num_input_tokens_seen": 259793952, "step": 120395 }, { "epoch": 19.64110929853181, "grad_norm": 0.6051025390625, "learning_rate": 4.90729032537246e-08, "loss": 0.1842, "num_input_tokens_seen": 259804672, "step": 120400 }, { "epoch": 19.641924959216965, "grad_norm": 0.05868436023592949, "learning_rate": 4.885027174838519e-08, "loss": 0.1041, "num_input_tokens_seen": 259815456, "step": 120405 }, { "epoch": 19.64274061990212, "grad_norm": 0.404121071100235, "learning_rate": 4.862814590974596e-08, "loss": 0.0848, "num_input_tokens_seen": 259825216, "step": 120410 }, { "epoch": 19.643556280587276, "grad_norm": 0.2797270715236664, "learning_rate": 4.840652574231164e-08, "loss": 0.0476, "num_input_tokens_seen": 259836576, "step": 120415 }, { "epoch": 19.644371941272432, "grad_norm": 0.02969221957027912, "learning_rate": 4.81854112505703e-08, "loss": 0.0753, "num_input_tokens_seen": 259846592, "step": 120420 }, { "epoch": 19.645187601957584, "grad_norm": 1.0852108001708984, "learning_rate": 4.796480243900725e-08, "loss": 0.0494, "num_input_tokens_seen": 259857728, "step": 120425 }, { "epoch": 19.64600326264274, "grad_norm": 0.07608522474765778, "learning_rate": 4.774469931208836e-08, "loss": 0.0184, "num_input_tokens_seen": 259867712, "step": 120430 }, { "epoch": 19.646818923327896, "grad_norm": 0.1222764328122139, "learning_rate": 4.7525101874279495e-08, "loss": 0.1327, "num_input_tokens_seen": 259879872, "step": 120435 }, { "epoch": 19.64763458401305, "grad_norm": 0.2782529294490814, "learning_rate": 4.730601013002989e-08, "loss": 0.0613, "num_input_tokens_seen": 259891072, "step": 120440 }, { "epoch": 19.648450244698207, "grad_norm": 0.025901570916175842, "learning_rate": 4.708742408377764e-08, "loss": 0.2118, "num_input_tokens_seen": 259901440, "step": 120445 }, { "epoch": 19.64926590538336, "grad_norm": 0.06991752982139587, "learning_rate": 4.686934373995255e-08, "loss": 0.0672, "num_input_tokens_seen": 259911968, "step": 120450 }, { "epoch": 19.650081566068515, "grad_norm": 0.031081123277544975, "learning_rate": 4.665176910297608e-08, "loss": 0.0429, "num_input_tokens_seen": 259922496, "step": 120455 }, { "epoch": 19.65089722675367, "grad_norm": 1.7687530517578125, "learning_rate": 4.6434700177258594e-08, "loss": 0.1245, "num_input_tokens_seen": 259934272, "step": 120460 }, { "epoch": 19.651712887438826, "grad_norm": 0.17032308876514435, "learning_rate": 4.621813696719657e-08, "loss": 0.3265, "num_input_tokens_seen": 259944608, "step": 120465 }, { "epoch": 19.652528548123982, "grad_norm": 1.047698974609375, "learning_rate": 4.600207947718094e-08, "loss": 0.1592, "num_input_tokens_seen": 259954368, "step": 120470 }, { "epoch": 19.653344208809134, "grad_norm": 0.12752746045589447, "learning_rate": 4.5786527711588756e-08, "loss": 0.0339, "num_input_tokens_seen": 259965568, "step": 120475 }, { "epoch": 19.65415986949429, "grad_norm": 0.5985997915267944, "learning_rate": 4.5571481674788754e-08, "loss": 0.1832, "num_input_tokens_seen": 259975392, "step": 120480 }, { "epoch": 19.654975530179446, "grad_norm": 1.6340768337249756, "learning_rate": 4.535694137114133e-08, "loss": 0.25, "num_input_tokens_seen": 259986144, "step": 120485 }, { "epoch": 19.6557911908646, "grad_norm": 0.46068087220191956, "learning_rate": 4.514290680499023e-08, "loss": 0.0268, "num_input_tokens_seen": 259996640, "step": 120490 }, { "epoch": 19.656606851549757, "grad_norm": 0.2969217896461487, "learning_rate": 4.4929377980676425e-08, "loss": 0.0743, "num_input_tokens_seen": 260007904, "step": 120495 }, { "epoch": 19.65742251223491, "grad_norm": 0.0764157697558403, "learning_rate": 4.471635490252979e-08, "loss": 0.0043, "num_input_tokens_seen": 260017952, "step": 120500 }, { "epoch": 19.658238172920065, "grad_norm": 0.5062439441680908, "learning_rate": 4.4503837574860764e-08, "loss": 0.0164, "num_input_tokens_seen": 260028544, "step": 120505 }, { "epoch": 19.65905383360522, "grad_norm": 0.15082110464572906, "learning_rate": 4.429182600197979e-08, "loss": 0.0954, "num_input_tokens_seen": 260037760, "step": 120510 }, { "epoch": 19.659869494290376, "grad_norm": 0.18597938120365143, "learning_rate": 4.408032018818342e-08, "loss": 0.0717, "num_input_tokens_seen": 260047968, "step": 120515 }, { "epoch": 19.660685154975532, "grad_norm": 0.06650611013174057, "learning_rate": 4.3869320137759907e-08, "loss": 0.0877, "num_input_tokens_seen": 260059264, "step": 120520 }, { "epoch": 19.661500815660684, "grad_norm": 0.1709548532962799, "learning_rate": 4.36588258549836e-08, "loss": 0.1116, "num_input_tokens_seen": 260070176, "step": 120525 }, { "epoch": 19.66231647634584, "grad_norm": 0.07162460684776306, "learning_rate": 4.344883734412053e-08, "loss": 0.0344, "num_input_tokens_seen": 260081120, "step": 120530 }, { "epoch": 19.663132137030995, "grad_norm": 1.5839707851409912, "learning_rate": 4.323935460942563e-08, "loss": 0.0341, "num_input_tokens_seen": 260092576, "step": 120535 }, { "epoch": 19.66394779771615, "grad_norm": 0.34914296865463257, "learning_rate": 4.30303776551455e-08, "loss": 0.0798, "num_input_tokens_seen": 260103392, "step": 120540 }, { "epoch": 19.664763458401303, "grad_norm": 0.0379275307059288, "learning_rate": 4.2821906485512874e-08, "loss": 0.0123, "num_input_tokens_seen": 260114048, "step": 120545 }, { "epoch": 19.66557911908646, "grad_norm": 0.08150219172239304, "learning_rate": 4.261394110475769e-08, "loss": 0.1662, "num_input_tokens_seen": 260125152, "step": 120550 }, { "epoch": 19.666394779771615, "grad_norm": 0.21998441219329834, "learning_rate": 4.240648151709048e-08, "loss": 0.0299, "num_input_tokens_seen": 260136192, "step": 120555 }, { "epoch": 19.66721044045677, "grad_norm": 0.07751118391752243, "learning_rate": 4.219952772671898e-08, "loss": 0.0437, "num_input_tokens_seen": 260147424, "step": 120560 }, { "epoch": 19.668026101141926, "grad_norm": 0.02763032726943493, "learning_rate": 4.199307973783151e-08, "loss": 0.0146, "num_input_tokens_seen": 260156448, "step": 120565 }, { "epoch": 19.66884176182708, "grad_norm": 3.3782846927642822, "learning_rate": 4.178713755461916e-08, "loss": 0.2098, "num_input_tokens_seen": 260165408, "step": 120570 }, { "epoch": 19.669657422512234, "grad_norm": 1.1815146207809448, "learning_rate": 4.1581701181248046e-08, "loss": 0.1713, "num_input_tokens_seen": 260176224, "step": 120575 }, { "epoch": 19.67047308319739, "grad_norm": 1.965391755104065, "learning_rate": 4.137677062188983e-08, "loss": 0.3287, "num_input_tokens_seen": 260187008, "step": 120580 }, { "epoch": 19.671288743882545, "grad_norm": 0.04438489302992821, "learning_rate": 4.1172345880691185e-08, "loss": 0.0236, "num_input_tokens_seen": 260197440, "step": 120585 }, { "epoch": 19.6721044045677, "grad_norm": 0.057047683745622635, "learning_rate": 4.0968426961798815e-08, "loss": 0.0706, "num_input_tokens_seen": 260208160, "step": 120590 }, { "epoch": 19.672920065252853, "grad_norm": 0.5143575072288513, "learning_rate": 4.0765013869342725e-08, "loss": 0.0503, "num_input_tokens_seen": 260219072, "step": 120595 }, { "epoch": 19.67373572593801, "grad_norm": 0.04158926382660866, "learning_rate": 4.056210660744741e-08, "loss": 0.0871, "num_input_tokens_seen": 260228320, "step": 120600 }, { "epoch": 19.674551386623165, "grad_norm": 0.0258739423006773, "learning_rate": 4.035970518022625e-08, "loss": 0.0062, "num_input_tokens_seen": 260240576, "step": 120605 }, { "epoch": 19.67536704730832, "grad_norm": 0.2504585385322571, "learning_rate": 4.015780959177595e-08, "loss": 0.1948, "num_input_tokens_seen": 260250144, "step": 120610 }, { "epoch": 19.676182707993476, "grad_norm": 0.03982679918408394, "learning_rate": 3.995641984619325e-08, "loss": 0.0101, "num_input_tokens_seen": 260261248, "step": 120615 }, { "epoch": 19.67699836867863, "grad_norm": 0.06055925413966179, "learning_rate": 3.975553594755821e-08, "loss": 0.2251, "num_input_tokens_seen": 260271232, "step": 120620 }, { "epoch": 19.677814029363784, "grad_norm": 1.2737659215927124, "learning_rate": 3.9555157899939796e-08, "loss": 0.0501, "num_input_tokens_seen": 260282656, "step": 120625 }, { "epoch": 19.67862969004894, "grad_norm": 0.6635410785675049, "learning_rate": 3.935528570740144e-08, "loss": 0.0441, "num_input_tokens_seen": 260293408, "step": 120630 }, { "epoch": 19.679445350734095, "grad_norm": 0.16215330362319946, "learning_rate": 3.9155919373992655e-08, "loss": 0.0599, "num_input_tokens_seen": 260305856, "step": 120635 }, { "epoch": 19.68026101141925, "grad_norm": 0.09907793253660202, "learning_rate": 3.8957058903754675e-08, "loss": 0.015, "num_input_tokens_seen": 260315648, "step": 120640 }, { "epoch": 19.681076672104403, "grad_norm": 0.5449886918067932, "learning_rate": 3.8758704300717596e-08, "loss": 0.0779, "num_input_tokens_seen": 260325408, "step": 120645 }, { "epoch": 19.68189233278956, "grad_norm": 0.06506723165512085, "learning_rate": 3.8560855568900435e-08, "loss": 0.014, "num_input_tokens_seen": 260335360, "step": 120650 }, { "epoch": 19.682707993474715, "grad_norm": 0.1474182903766632, "learning_rate": 3.836351271231387e-08, "loss": 0.0367, "num_input_tokens_seen": 260345888, "step": 120655 }, { "epoch": 19.68352365415987, "grad_norm": 0.0724688172340393, "learning_rate": 3.816667573495747e-08, "loss": 0.1255, "num_input_tokens_seen": 260356000, "step": 120660 }, { "epoch": 19.684339314845026, "grad_norm": 1.8142530918121338, "learning_rate": 3.7970344640819725e-08, "loss": 0.1415, "num_input_tokens_seen": 260366624, "step": 120665 }, { "epoch": 19.68515497553018, "grad_norm": 0.10832662135362625, "learning_rate": 3.7774519433878e-08, "loss": 0.0184, "num_input_tokens_seen": 260377280, "step": 120670 }, { "epoch": 19.685970636215334, "grad_norm": 0.2455078363418579, "learning_rate": 3.757920011810412e-08, "loss": 0.0931, "num_input_tokens_seen": 260387520, "step": 120675 }, { "epoch": 19.68678629690049, "grad_norm": 0.5441701412200928, "learning_rate": 3.738438669745326e-08, "loss": 0.0516, "num_input_tokens_seen": 260399264, "step": 120680 }, { "epoch": 19.687601957585645, "grad_norm": 1.0233451128005981, "learning_rate": 3.719007917587502e-08, "loss": 0.1988, "num_input_tokens_seen": 260410144, "step": 120685 }, { "epoch": 19.6884176182708, "grad_norm": 1.1912071704864502, "learning_rate": 3.699627755730794e-08, "loss": 0.0453, "num_input_tokens_seen": 260421728, "step": 120690 }, { "epoch": 19.689233278955953, "grad_norm": 0.9483962655067444, "learning_rate": 3.680298184567943e-08, "loss": 0.1452, "num_input_tokens_seen": 260433056, "step": 120695 }, { "epoch": 19.69004893964111, "grad_norm": 0.2663557827472687, "learning_rate": 3.6610192044905786e-08, "loss": 0.1456, "num_input_tokens_seen": 260443328, "step": 120700 }, { "epoch": 19.690864600326265, "grad_norm": 0.23127569258213043, "learning_rate": 3.641790815889501e-08, "loss": 0.1493, "num_input_tokens_seen": 260454816, "step": 120705 }, { "epoch": 19.69168026101142, "grad_norm": 0.047050103545188904, "learning_rate": 3.622613019154397e-08, "loss": 0.1075, "num_input_tokens_seen": 260466208, "step": 120710 }, { "epoch": 19.692495921696576, "grad_norm": 0.12362682819366455, "learning_rate": 3.603485814674124e-08, "loss": 0.1275, "num_input_tokens_seen": 260477920, "step": 120715 }, { "epoch": 19.693311582381728, "grad_norm": 0.23617371916770935, "learning_rate": 3.58440920283587e-08, "loss": 0.0979, "num_input_tokens_seen": 260490592, "step": 120720 }, { "epoch": 19.694127243066884, "grad_norm": 0.20079071819782257, "learning_rate": 3.5653831840265497e-08, "loss": 0.0587, "num_input_tokens_seen": 260501312, "step": 120725 }, { "epoch": 19.69494290375204, "grad_norm": 0.07341620326042175, "learning_rate": 3.546407758631687e-08, "loss": 0.0557, "num_input_tokens_seen": 260511328, "step": 120730 }, { "epoch": 19.695758564437195, "grad_norm": 2.4796597957611084, "learning_rate": 3.5274829270359746e-08, "loss": 0.2333, "num_input_tokens_seen": 260521728, "step": 120735 }, { "epoch": 19.696574225122347, "grad_norm": 2.0747852325439453, "learning_rate": 3.5086086896227167e-08, "loss": 0.0706, "num_input_tokens_seen": 260531136, "step": 120740 }, { "epoch": 19.697389885807503, "grad_norm": 0.8496473431587219, "learning_rate": 3.4897850467743853e-08, "loss": 0.14, "num_input_tokens_seen": 260541184, "step": 120745 }, { "epoch": 19.69820554649266, "grad_norm": 1.67948317527771, "learning_rate": 3.471011998872897e-08, "loss": 0.0873, "num_input_tokens_seen": 260551552, "step": 120750 }, { "epoch": 19.699021207177815, "grad_norm": 0.07316499948501587, "learning_rate": 3.452289546298226e-08, "loss": 0.0686, "num_input_tokens_seen": 260562112, "step": 120755 }, { "epoch": 19.69983686786297, "grad_norm": 0.017729388549923897, "learning_rate": 3.433617689430069e-08, "loss": 0.0979, "num_input_tokens_seen": 260571776, "step": 120760 }, { "epoch": 19.700652528548122, "grad_norm": 0.12108980864286423, "learning_rate": 3.414996428646733e-08, "loss": 0.1495, "num_input_tokens_seen": 260584032, "step": 120765 }, { "epoch": 19.701468189233278, "grad_norm": 1.833533763885498, "learning_rate": 3.396425764325695e-08, "loss": 0.2292, "num_input_tokens_seen": 260594688, "step": 120770 }, { "epoch": 19.702283849918434, "grad_norm": 0.07824623584747314, "learning_rate": 3.377905696843042e-08, "loss": 0.0603, "num_input_tokens_seen": 260605248, "step": 120775 }, { "epoch": 19.70309951060359, "grad_norm": 0.057084839791059494, "learning_rate": 3.359436226574586e-08, "loss": 0.1389, "num_input_tokens_seen": 260617376, "step": 120780 }, { "epoch": 19.703915171288745, "grad_norm": 3.6858580112457275, "learning_rate": 3.3410173538941936e-08, "loss": 0.2156, "num_input_tokens_seen": 260628384, "step": 120785 }, { "epoch": 19.704730831973897, "grad_norm": 0.033114030957221985, "learning_rate": 3.322649079175455e-08, "loss": 0.0765, "num_input_tokens_seen": 260640320, "step": 120790 }, { "epoch": 19.705546492659053, "grad_norm": 0.11588653922080994, "learning_rate": 3.304331402790295e-08, "loss": 0.0278, "num_input_tokens_seen": 260651488, "step": 120795 }, { "epoch": 19.70636215334421, "grad_norm": 0.055376455187797546, "learning_rate": 3.2860643251103606e-08, "loss": 0.1317, "num_input_tokens_seen": 260661184, "step": 120800 }, { "epoch": 19.707177814029365, "grad_norm": 0.03526926785707474, "learning_rate": 3.2678478465056336e-08, "loss": 0.0308, "num_input_tokens_seen": 260673056, "step": 120805 }, { "epoch": 19.70799347471452, "grad_norm": 1.4509341716766357, "learning_rate": 3.249681967345264e-08, "loss": 0.1153, "num_input_tokens_seen": 260683712, "step": 120810 }, { "epoch": 19.708809135399672, "grad_norm": 0.11203952878713608, "learning_rate": 3.2315666879972896e-08, "loss": 0.0172, "num_input_tokens_seen": 260695168, "step": 120815 }, { "epoch": 19.709624796084828, "grad_norm": 0.20390962064266205, "learning_rate": 3.2135020088291944e-08, "loss": 0.0385, "num_input_tokens_seen": 260705888, "step": 120820 }, { "epoch": 19.710440456769984, "grad_norm": 0.022482657805085182, "learning_rate": 3.195487930206798e-08, "loss": 0.0331, "num_input_tokens_seen": 260717376, "step": 120825 }, { "epoch": 19.71125611745514, "grad_norm": 1.8386656045913696, "learning_rate": 3.1775244524953616e-08, "loss": 0.1763, "num_input_tokens_seen": 260728896, "step": 120830 }, { "epoch": 19.712071778140295, "grad_norm": 1.040657877922058, "learning_rate": 3.159611576058763e-08, "loss": 0.1037, "num_input_tokens_seen": 260740224, "step": 120835 }, { "epoch": 19.712887438825447, "grad_norm": 0.2104744017124176, "learning_rate": 3.141749301260044e-08, "loss": 0.0748, "num_input_tokens_seen": 260751168, "step": 120840 }, { "epoch": 19.713703099510603, "grad_norm": 0.31879040598869324, "learning_rate": 3.1239376284611375e-08, "loss": 0.1119, "num_input_tokens_seen": 260762528, "step": 120845 }, { "epoch": 19.71451876019576, "grad_norm": 1.3316264152526855, "learning_rate": 3.106176558023422e-08, "loss": 0.3571, "num_input_tokens_seen": 260772832, "step": 120850 }, { "epoch": 19.715334420880914, "grad_norm": 1.8885360956192017, "learning_rate": 3.08846609030633e-08, "loss": 0.3921, "num_input_tokens_seen": 260782720, "step": 120855 }, { "epoch": 19.71615008156607, "grad_norm": 0.1790401190519333, "learning_rate": 3.070806225669298e-08, "loss": 0.0057, "num_input_tokens_seen": 260792448, "step": 120860 }, { "epoch": 19.716965742251222, "grad_norm": 3.2431564331054688, "learning_rate": 3.0531969644698175e-08, "loss": 0.1311, "num_input_tokens_seen": 260802688, "step": 120865 }, { "epoch": 19.717781402936378, "grad_norm": 0.030869686976075172, "learning_rate": 3.0356383070648256e-08, "loss": 0.0556, "num_input_tokens_seen": 260813568, "step": 120870 }, { "epoch": 19.718597063621534, "grad_norm": 1.1025059223175049, "learning_rate": 3.018130253810425e-08, "loss": 0.0271, "num_input_tokens_seen": 260825152, "step": 120875 }, { "epoch": 19.71941272430669, "grad_norm": 0.1717909723520279, "learning_rate": 3.0006728050610554e-08, "loss": 0.1847, "num_input_tokens_seen": 260836288, "step": 120880 }, { "epoch": 19.72022838499184, "grad_norm": 0.6602253317832947, "learning_rate": 2.983265961170878e-08, "loss": 0.1612, "num_input_tokens_seen": 260846400, "step": 120885 }, { "epoch": 19.721044045676997, "grad_norm": 0.06764359027147293, "learning_rate": 2.965909722492388e-08, "loss": 0.0589, "num_input_tokens_seen": 260856224, "step": 120890 }, { "epoch": 19.721859706362153, "grad_norm": 0.9061763286590576, "learning_rate": 2.948604089377527e-08, "loss": 0.1085, "num_input_tokens_seen": 260867104, "step": 120895 }, { "epoch": 19.72267536704731, "grad_norm": 1.1938203573226929, "learning_rate": 2.931349062176847e-08, "loss": 0.0966, "num_input_tokens_seen": 260875072, "step": 120900 }, { "epoch": 19.723491027732464, "grad_norm": 0.10807834565639496, "learning_rate": 2.914144641240346e-08, "loss": 0.0498, "num_input_tokens_seen": 260885728, "step": 120905 }, { "epoch": 19.724306688417617, "grad_norm": 0.3167218267917633, "learning_rate": 2.896990826916357e-08, "loss": 0.079, "num_input_tokens_seen": 260897536, "step": 120910 }, { "epoch": 19.725122349102772, "grad_norm": 0.4825938940048218, "learning_rate": 2.879887619552657e-08, "loss": 0.0329, "num_input_tokens_seen": 260908832, "step": 120915 }, { "epoch": 19.725938009787928, "grad_norm": 0.9245253801345825, "learning_rate": 2.8628350194956355e-08, "loss": 0.0353, "num_input_tokens_seen": 260919328, "step": 120920 }, { "epoch": 19.726753670473084, "grad_norm": 1.8893663883209229, "learning_rate": 2.8458330270914047e-08, "loss": 0.2226, "num_input_tokens_seen": 260930016, "step": 120925 }, { "epoch": 19.72756933115824, "grad_norm": 0.03517075628042221, "learning_rate": 2.8288816426841336e-08, "loss": 0.0797, "num_input_tokens_seen": 260941632, "step": 120930 }, { "epoch": 19.72838499184339, "grad_norm": 0.16923195123672485, "learning_rate": 2.8119808666174365e-08, "loss": 0.0868, "num_input_tokens_seen": 260951680, "step": 120935 }, { "epoch": 19.729200652528547, "grad_norm": 0.028723059222102165, "learning_rate": 2.7951306992338165e-08, "loss": 0.1124, "num_input_tokens_seen": 260962432, "step": 120940 }, { "epoch": 19.730016313213703, "grad_norm": 0.05151870846748352, "learning_rate": 2.7783311408749458e-08, "loss": 0.0102, "num_input_tokens_seen": 260974016, "step": 120945 }, { "epoch": 19.73083197389886, "grad_norm": 0.04834376275539398, "learning_rate": 2.7615821918811068e-08, "loss": 0.2402, "num_input_tokens_seen": 260985536, "step": 120950 }, { "epoch": 19.731647634584014, "grad_norm": 0.3048883378505707, "learning_rate": 2.7448838525917507e-08, "loss": 0.0713, "num_input_tokens_seen": 260995872, "step": 120955 }, { "epoch": 19.732463295269167, "grad_norm": 0.052099332213401794, "learning_rate": 2.728236123345218e-08, "loss": 0.1404, "num_input_tokens_seen": 261004960, "step": 120960 }, { "epoch": 19.733278955954322, "grad_norm": 0.10518886148929596, "learning_rate": 2.7116390044790164e-08, "loss": 0.0428, "num_input_tokens_seen": 261015616, "step": 120965 }, { "epoch": 19.734094616639478, "grad_norm": 0.3105810582637787, "learning_rate": 2.6950924963298208e-08, "loss": 0.1073, "num_input_tokens_seen": 261025728, "step": 120970 }, { "epoch": 19.734910277324634, "grad_norm": 0.07243768125772476, "learning_rate": 2.6785965992323636e-08, "loss": 0.1255, "num_input_tokens_seen": 261037120, "step": 120975 }, { "epoch": 19.73572593800979, "grad_norm": 0.0693022608757019, "learning_rate": 2.6621513135210995e-08, "loss": 0.2474, "num_input_tokens_seen": 261048608, "step": 120980 }, { "epoch": 19.73654159869494, "grad_norm": 0.8301425576210022, "learning_rate": 2.6457566395296507e-08, "loss": 0.0259, "num_input_tokens_seen": 261060000, "step": 120985 }, { "epoch": 19.737357259380097, "grad_norm": 2.126703977584839, "learning_rate": 2.6294125775899736e-08, "loss": 0.1345, "num_input_tokens_seen": 261069568, "step": 120990 }, { "epoch": 19.738172920065253, "grad_norm": 0.5681048631668091, "learning_rate": 2.6131191280331922e-08, "loss": 0.1111, "num_input_tokens_seen": 261080992, "step": 120995 }, { "epoch": 19.73898858075041, "grad_norm": 2.276679277420044, "learning_rate": 2.596876291189876e-08, "loss": 0.2579, "num_input_tokens_seen": 261092928, "step": 121000 }, { "epoch": 19.739804241435564, "grad_norm": 0.7443335652351379, "learning_rate": 2.5806840673892054e-08, "loss": 0.1017, "num_input_tokens_seen": 261104064, "step": 121005 }, { "epoch": 19.740619902120716, "grad_norm": 0.09224314242601395, "learning_rate": 2.564542456958974e-08, "loss": 0.079, "num_input_tokens_seen": 261115072, "step": 121010 }, { "epoch": 19.741435562805872, "grad_norm": 1.9741612672805786, "learning_rate": 2.5484514602266975e-08, "loss": 0.1563, "num_input_tokens_seen": 261125664, "step": 121015 }, { "epoch": 19.742251223491028, "grad_norm": 0.360342413187027, "learning_rate": 2.5324110775182263e-08, "loss": 0.1615, "num_input_tokens_seen": 261136288, "step": 121020 }, { "epoch": 19.743066884176184, "grad_norm": 0.04135699197649956, "learning_rate": 2.5164213091585786e-08, "loss": 0.0253, "num_input_tokens_seen": 261146432, "step": 121025 }, { "epoch": 19.74388254486134, "grad_norm": 0.1958724707365036, "learning_rate": 2.5004821554719393e-08, "loss": 0.0247, "num_input_tokens_seen": 261156576, "step": 121030 }, { "epoch": 19.74469820554649, "grad_norm": 4.050878047943115, "learning_rate": 2.4845936167813832e-08, "loss": 0.1527, "num_input_tokens_seen": 261167936, "step": 121035 }, { "epoch": 19.745513866231647, "grad_norm": 0.292462021112442, "learning_rate": 2.468755693408875e-08, "loss": 0.0547, "num_input_tokens_seen": 261179744, "step": 121040 }, { "epoch": 19.746329526916803, "grad_norm": 1.3749922513961792, "learning_rate": 2.452968385675547e-08, "loss": 0.0916, "num_input_tokens_seen": 261191616, "step": 121045 }, { "epoch": 19.74714518760196, "grad_norm": 0.2330084890127182, "learning_rate": 2.4372316939008655e-08, "loss": 0.0582, "num_input_tokens_seen": 261203072, "step": 121050 }, { "epoch": 19.747960848287114, "grad_norm": 1.1144722700119019, "learning_rate": 2.4215456184042973e-08, "loss": 0.0424, "num_input_tokens_seen": 261213984, "step": 121055 }, { "epoch": 19.748776508972266, "grad_norm": 0.3317478895187378, "learning_rate": 2.405910159503366e-08, "loss": 0.1625, "num_input_tokens_seen": 261224672, "step": 121060 }, { "epoch": 19.749592169657422, "grad_norm": 0.2863028347492218, "learning_rate": 2.390325317515041e-08, "loss": 0.0537, "num_input_tokens_seen": 261235232, "step": 121065 }, { "epoch": 19.750407830342578, "grad_norm": 0.18220154941082, "learning_rate": 2.3747910927554572e-08, "loss": 0.0079, "num_input_tokens_seen": 261245152, "step": 121070 }, { "epoch": 19.751223491027734, "grad_norm": 0.20012328028678894, "learning_rate": 2.359307485539086e-08, "loss": 0.06, "num_input_tokens_seen": 261254592, "step": 121075 }, { "epoch": 19.752039151712886, "grad_norm": 0.06351850181818008, "learning_rate": 2.343874496179843e-08, "loss": 0.0625, "num_input_tokens_seen": 261264448, "step": 121080 }, { "epoch": 19.75285481239804, "grad_norm": 0.2471948117017746, "learning_rate": 2.3284921249902557e-08, "loss": 0.0155, "num_input_tokens_seen": 261276256, "step": 121085 }, { "epoch": 19.753670473083197, "grad_norm": 0.05612612143158913, "learning_rate": 2.3131603722825744e-08, "loss": 0.2154, "num_input_tokens_seen": 261287360, "step": 121090 }, { "epoch": 19.754486133768353, "grad_norm": 2.3451950550079346, "learning_rate": 2.2978792383671068e-08, "loss": 0.0949, "num_input_tokens_seen": 261298976, "step": 121095 }, { "epoch": 19.75530179445351, "grad_norm": 1.8107922077178955, "learning_rate": 2.2826487235538818e-08, "loss": 0.0814, "num_input_tokens_seen": 261310816, "step": 121100 }, { "epoch": 19.75611745513866, "grad_norm": 0.02457215078175068, "learning_rate": 2.267468828150987e-08, "loss": 0.0082, "num_input_tokens_seen": 261321536, "step": 121105 }, { "epoch": 19.756933115823816, "grad_norm": 0.018123093992471695, "learning_rate": 2.2523395524667867e-08, "loss": 0.0684, "num_input_tokens_seen": 261331456, "step": 121110 }, { "epoch": 19.757748776508972, "grad_norm": 0.11949112266302109, "learning_rate": 2.237260896807425e-08, "loss": 0.0162, "num_input_tokens_seen": 261343392, "step": 121115 }, { "epoch": 19.758564437194128, "grad_norm": 0.10219444334506989, "learning_rate": 2.2222328614784905e-08, "loss": 0.0375, "num_input_tokens_seen": 261352608, "step": 121120 }, { "epoch": 19.759380097879284, "grad_norm": 1.2909313440322876, "learning_rate": 2.2072554467847394e-08, "loss": 0.3082, "num_input_tokens_seen": 261362688, "step": 121125 }, { "epoch": 19.760195758564436, "grad_norm": 1.8936916589736938, "learning_rate": 2.192328653029818e-08, "loss": 0.0432, "num_input_tokens_seen": 261374432, "step": 121130 }, { "epoch": 19.76101141924959, "grad_norm": 0.14691247045993805, "learning_rate": 2.177452480515707e-08, "loss": 0.0937, "num_input_tokens_seen": 261385984, "step": 121135 }, { "epoch": 19.761827079934747, "grad_norm": 0.043339479714632034, "learning_rate": 2.1626269295446645e-08, "loss": 0.0353, "num_input_tokens_seen": 261396000, "step": 121140 }, { "epoch": 19.762642740619903, "grad_norm": 0.9951147437095642, "learning_rate": 2.147852000416173e-08, "loss": 0.1696, "num_input_tokens_seen": 261406400, "step": 121145 }, { "epoch": 19.76345840130506, "grad_norm": 0.06327535212039948, "learning_rate": 2.1331276934305478e-08, "loss": 0.0739, "num_input_tokens_seen": 261417344, "step": 121150 }, { "epoch": 19.76427406199021, "grad_norm": 0.14565002918243408, "learning_rate": 2.1184540088858838e-08, "loss": 0.0926, "num_input_tokens_seen": 261426752, "step": 121155 }, { "epoch": 19.765089722675366, "grad_norm": 0.6941319704055786, "learning_rate": 2.1038309470794437e-08, "loss": 0.0641, "num_input_tokens_seen": 261438336, "step": 121160 }, { "epoch": 19.765905383360522, "grad_norm": 0.7852985858917236, "learning_rate": 2.0892585083076565e-08, "loss": 0.1114, "num_input_tokens_seen": 261448928, "step": 121165 }, { "epoch": 19.766721044045678, "grad_norm": 2.1848769187927246, "learning_rate": 2.0747366928661193e-08, "loss": 0.3068, "num_input_tokens_seen": 261460256, "step": 121170 }, { "epoch": 19.767536704730833, "grad_norm": 3.239229679107666, "learning_rate": 2.060265501048486e-08, "loss": 0.0887, "num_input_tokens_seen": 261472000, "step": 121175 }, { "epoch": 19.768352365415986, "grad_norm": 0.030132707208395004, "learning_rate": 2.0458449331486884e-08, "loss": 0.0217, "num_input_tokens_seen": 261483104, "step": 121180 }, { "epoch": 19.76916802610114, "grad_norm": 0.02744828350841999, "learning_rate": 2.0314749894587147e-08, "loss": 0.1966, "num_input_tokens_seen": 261493856, "step": 121185 }, { "epoch": 19.769983686786297, "grad_norm": 3.3845837116241455, "learning_rate": 2.0171556702697214e-08, "loss": 0.2288, "num_input_tokens_seen": 261506272, "step": 121190 }, { "epoch": 19.770799347471453, "grad_norm": 0.022116826847195625, "learning_rate": 2.0028869758720314e-08, "loss": 0.0085, "num_input_tokens_seen": 261517088, "step": 121195 }, { "epoch": 19.77161500815661, "grad_norm": 0.09974789619445801, "learning_rate": 1.988668906554858e-08, "loss": 0.0508, "num_input_tokens_seen": 261528640, "step": 121200 }, { "epoch": 19.77243066884176, "grad_norm": 0.17066460847854614, "learning_rate": 1.9745014626060266e-08, "loss": 0.0265, "num_input_tokens_seen": 261539904, "step": 121205 }, { "epoch": 19.773246329526916, "grad_norm": 0.33674386143684387, "learning_rate": 1.9603846443130847e-08, "loss": 0.0895, "num_input_tokens_seen": 261550496, "step": 121210 }, { "epoch": 19.774061990212072, "grad_norm": 0.09006605297327042, "learning_rate": 1.9463184519619147e-08, "loss": 0.1012, "num_input_tokens_seen": 261560032, "step": 121215 }, { "epoch": 19.774877650897228, "grad_norm": 2.7763772010803223, "learning_rate": 1.932302885837567e-08, "loss": 0.0646, "num_input_tokens_seen": 261571360, "step": 121220 }, { "epoch": 19.775693311582383, "grad_norm": 0.06374063342809677, "learning_rate": 1.9183379462239805e-08, "loss": 0.1265, "num_input_tokens_seen": 261580896, "step": 121225 }, { "epoch": 19.776508972267536, "grad_norm": 0.05818534642457962, "learning_rate": 1.9044236334045397e-08, "loss": 0.0151, "num_input_tokens_seen": 261591648, "step": 121230 }, { "epoch": 19.77732463295269, "grad_norm": 0.8296601176261902, "learning_rate": 1.8905599476609636e-08, "loss": 0.0256, "num_input_tokens_seen": 261601408, "step": 121235 }, { "epoch": 19.778140293637847, "grad_norm": 1.8949699401855469, "learning_rate": 1.8767468892741392e-08, "loss": 0.2916, "num_input_tokens_seen": 261611648, "step": 121240 }, { "epoch": 19.778955954323003, "grad_norm": 0.05641309544444084, "learning_rate": 1.862984458524397e-08, "loss": 0.0912, "num_input_tokens_seen": 261623552, "step": 121245 }, { "epoch": 19.77977161500816, "grad_norm": 2.3899717330932617, "learning_rate": 1.8492726556901264e-08, "loss": 0.0789, "num_input_tokens_seen": 261634720, "step": 121250 }, { "epoch": 19.78058727569331, "grad_norm": 0.33279430866241455, "learning_rate": 1.8356114810494374e-08, "loss": 0.0985, "num_input_tokens_seen": 261646080, "step": 121255 }, { "epoch": 19.781402936378466, "grad_norm": 0.0262431837618351, "learning_rate": 1.822000934879331e-08, "loss": 0.1234, "num_input_tokens_seen": 261656640, "step": 121260 }, { "epoch": 19.782218597063622, "grad_norm": 0.03950538486242294, "learning_rate": 1.8084410174556977e-08, "loss": 0.0235, "num_input_tokens_seen": 261668352, "step": 121265 }, { "epoch": 19.783034257748778, "grad_norm": 0.35428550839424133, "learning_rate": 1.7949317290530398e-08, "loss": 0.1569, "num_input_tokens_seen": 261678624, "step": 121270 }, { "epoch": 19.78384991843393, "grad_norm": 0.0513625405728817, "learning_rate": 1.781473069945305e-08, "loss": 0.0358, "num_input_tokens_seen": 261690240, "step": 121275 }, { "epoch": 19.784665579119086, "grad_norm": 0.3149155378341675, "learning_rate": 1.7680650404053313e-08, "loss": 0.077, "num_input_tokens_seen": 261701536, "step": 121280 }, { "epoch": 19.78548123980424, "grad_norm": 0.04757058247923851, "learning_rate": 1.754707640704567e-08, "loss": 0.1476, "num_input_tokens_seen": 261712512, "step": 121285 }, { "epoch": 19.786296900489397, "grad_norm": 0.08322060108184814, "learning_rate": 1.7414008711139073e-08, "loss": 0.2683, "num_input_tokens_seen": 261723680, "step": 121290 }, { "epoch": 19.787112561174553, "grad_norm": 0.5076606273651123, "learning_rate": 1.7281447319031364e-08, "loss": 0.0256, "num_input_tokens_seen": 261734368, "step": 121295 }, { "epoch": 19.787928221859705, "grad_norm": 0.06871487200260162, "learning_rate": 1.714939223340928e-08, "loss": 0.0575, "num_input_tokens_seen": 261744512, "step": 121300 }, { "epoch": 19.78874388254486, "grad_norm": 1.9757604598999023, "learning_rate": 1.701784345694568e-08, "loss": 0.1819, "num_input_tokens_seen": 261756416, "step": 121305 }, { "epoch": 19.789559543230016, "grad_norm": 2.2550840377807617, "learning_rate": 1.6886800992307882e-08, "loss": 0.1733, "num_input_tokens_seen": 261766656, "step": 121310 }, { "epoch": 19.790375203915172, "grad_norm": 1.2577519416809082, "learning_rate": 1.6756264842154867e-08, "loss": 0.0377, "num_input_tokens_seen": 261777792, "step": 121315 }, { "epoch": 19.791190864600328, "grad_norm": 1.5587332248687744, "learning_rate": 1.6626235009128965e-08, "loss": 0.1889, "num_input_tokens_seen": 261789344, "step": 121320 }, { "epoch": 19.79200652528548, "grad_norm": 0.05612488463521004, "learning_rate": 1.649671149586418e-08, "loss": 0.0609, "num_input_tokens_seen": 261800544, "step": 121325 }, { "epoch": 19.792822185970635, "grad_norm": 0.27767738699913025, "learning_rate": 1.6367694304988966e-08, "loss": 0.0217, "num_input_tokens_seen": 261811616, "step": 121330 }, { "epoch": 19.79363784665579, "grad_norm": 0.39192596077919006, "learning_rate": 1.62391834391179e-08, "loss": 0.2702, "num_input_tokens_seen": 261822592, "step": 121335 }, { "epoch": 19.794453507340947, "grad_norm": 0.12601962685585022, "learning_rate": 1.6111178900851675e-08, "loss": 0.2583, "num_input_tokens_seen": 261834400, "step": 121340 }, { "epoch": 19.795269168026103, "grad_norm": 0.2026178538799286, "learning_rate": 1.5983680692785442e-08, "loss": 0.0114, "num_input_tokens_seen": 261844640, "step": 121345 }, { "epoch": 19.796084828711255, "grad_norm": 1.6784861087799072, "learning_rate": 1.5856688817506015e-08, "loss": 0.0994, "num_input_tokens_seen": 261854432, "step": 121350 }, { "epoch": 19.79690048939641, "grad_norm": 0.10461166501045227, "learning_rate": 1.5730203277586343e-08, "loss": 0.0115, "num_input_tokens_seen": 261862784, "step": 121355 }, { "epoch": 19.797716150081566, "grad_norm": 0.14240464568138123, "learning_rate": 1.5604224075588258e-08, "loss": 0.0154, "num_input_tokens_seen": 261874336, "step": 121360 }, { "epoch": 19.798531810766722, "grad_norm": 0.013996239751577377, "learning_rate": 1.547875121406528e-08, "loss": 0.0397, "num_input_tokens_seen": 261884416, "step": 121365 }, { "epoch": 19.799347471451878, "grad_norm": 0.07860195636749268, "learning_rate": 1.5353784695559812e-08, "loss": 0.089, "num_input_tokens_seen": 261895360, "step": 121370 }, { "epoch": 19.80016313213703, "grad_norm": 0.03990057110786438, "learning_rate": 1.522932452260595e-08, "loss": 0.1699, "num_input_tokens_seen": 261905472, "step": 121375 }, { "epoch": 19.800978792822185, "grad_norm": 0.07464180886745453, "learning_rate": 1.5105370697726662e-08, "loss": 0.0629, "num_input_tokens_seen": 261915680, "step": 121380 }, { "epoch": 19.80179445350734, "grad_norm": 0.10678240656852722, "learning_rate": 1.498192322343106e-08, "loss": 0.1457, "num_input_tokens_seen": 261925440, "step": 121385 }, { "epoch": 19.802610114192497, "grad_norm": 0.026096442714333534, "learning_rate": 1.4858982102222696e-08, "loss": 0.0144, "num_input_tokens_seen": 261936000, "step": 121390 }, { "epoch": 19.803425774877653, "grad_norm": 0.1528959721326828, "learning_rate": 1.473654733659402e-08, "loss": 0.1598, "num_input_tokens_seen": 261948128, "step": 121395 }, { "epoch": 19.804241435562805, "grad_norm": 0.17017383873462677, "learning_rate": 1.4614618929023605e-08, "loss": 0.1542, "num_input_tokens_seen": 261957568, "step": 121400 }, { "epoch": 19.80505709624796, "grad_norm": 1.8944908380508423, "learning_rate": 1.4493196881984472e-08, "loss": 0.182, "num_input_tokens_seen": 261969120, "step": 121405 }, { "epoch": 19.805872756933116, "grad_norm": 1.8019174337387085, "learning_rate": 1.4372281197938542e-08, "loss": 0.1281, "num_input_tokens_seen": 261979136, "step": 121410 }, { "epoch": 19.806688417618272, "grad_norm": 1.0490552186965942, "learning_rate": 1.425187187933108e-08, "loss": 0.0242, "num_input_tokens_seen": 261989824, "step": 121415 }, { "epoch": 19.807504078303424, "grad_norm": 4.789123058319092, "learning_rate": 1.4131968928610128e-08, "loss": 0.3131, "num_input_tokens_seen": 262000544, "step": 121420 }, { "epoch": 19.80831973898858, "grad_norm": 0.27851852774620056, "learning_rate": 1.401257234819875e-08, "loss": 0.0838, "num_input_tokens_seen": 262010912, "step": 121425 }, { "epoch": 19.809135399673735, "grad_norm": 2.3423361778259277, "learning_rate": 1.3893682140522779e-08, "loss": 0.1481, "num_input_tokens_seen": 262022080, "step": 121430 }, { "epoch": 19.80995106035889, "grad_norm": 0.6813262104988098, "learning_rate": 1.3775298307985852e-08, "loss": 0.036, "num_input_tokens_seen": 262033216, "step": 121435 }, { "epoch": 19.810766721044047, "grad_norm": 0.17860959470272064, "learning_rate": 1.3657420852991598e-08, "loss": 0.0202, "num_input_tokens_seen": 262044576, "step": 121440 }, { "epoch": 19.8115823817292, "grad_norm": 0.18810927867889404, "learning_rate": 1.3540049777924224e-08, "loss": 0.0119, "num_input_tokens_seen": 262054752, "step": 121445 }, { "epoch": 19.812398042414355, "grad_norm": 2.7392544746398926, "learning_rate": 1.3423185085167934e-08, "loss": 0.1403, "num_input_tokens_seen": 262066464, "step": 121450 }, { "epoch": 19.81321370309951, "grad_norm": 0.2745525538921356, "learning_rate": 1.33068267770875e-08, "loss": 0.0321, "num_input_tokens_seen": 262077856, "step": 121455 }, { "epoch": 19.814029363784666, "grad_norm": 0.2135656327009201, "learning_rate": 1.3190974856044925e-08, "loss": 0.0468, "num_input_tokens_seen": 262087840, "step": 121460 }, { "epoch": 19.81484502446982, "grad_norm": 0.11545306444168091, "learning_rate": 1.3075629324382776e-08, "loss": 0.076, "num_input_tokens_seen": 262099968, "step": 121465 }, { "epoch": 19.815660685154974, "grad_norm": 2.5311105251312256, "learning_rate": 1.2960790184440852e-08, "loss": 0.2152, "num_input_tokens_seen": 262111040, "step": 121470 }, { "epoch": 19.81647634584013, "grad_norm": 0.7471633553504944, "learning_rate": 1.2846457438545068e-08, "loss": 0.1303, "num_input_tokens_seen": 262122080, "step": 121475 }, { "epoch": 19.817292006525285, "grad_norm": 0.08685802668333054, "learning_rate": 1.2732631089018566e-08, "loss": 0.0511, "num_input_tokens_seen": 262132640, "step": 121480 }, { "epoch": 19.81810766721044, "grad_norm": 0.04472506046295166, "learning_rate": 1.2619311138159506e-08, "loss": 0.0682, "num_input_tokens_seen": 262143840, "step": 121485 }, { "epoch": 19.818923327895597, "grad_norm": 0.24460139870643616, "learning_rate": 1.2506497588271604e-08, "loss": 0.0485, "num_input_tokens_seen": 262155040, "step": 121490 }, { "epoch": 19.81973898858075, "grad_norm": 0.16051194071769714, "learning_rate": 1.2394190441639142e-08, "loss": 0.0126, "num_input_tokens_seen": 262166816, "step": 121495 }, { "epoch": 19.820554649265905, "grad_norm": 1.129638433456421, "learning_rate": 1.2282389700535302e-08, "loss": 0.0533, "num_input_tokens_seen": 262177312, "step": 121500 }, { "epoch": 19.82137030995106, "grad_norm": 0.38899266719818115, "learning_rate": 1.2171095367227714e-08, "loss": 0.0295, "num_input_tokens_seen": 262188096, "step": 121505 }, { "epoch": 19.822185970636216, "grad_norm": 0.19045056402683258, "learning_rate": 1.2060307443975683e-08, "loss": 0.0855, "num_input_tokens_seen": 262198304, "step": 121510 }, { "epoch": 19.82300163132137, "grad_norm": 0.04656248912215233, "learning_rate": 1.1950025933016306e-08, "loss": 0.1772, "num_input_tokens_seen": 262209664, "step": 121515 }, { "epoch": 19.823817292006524, "grad_norm": 1.8354309797286987, "learning_rate": 1.1840250836592238e-08, "loss": 0.135, "num_input_tokens_seen": 262220672, "step": 121520 }, { "epoch": 19.82463295269168, "grad_norm": 0.11930087953805923, "learning_rate": 1.173098215692392e-08, "loss": 0.1615, "num_input_tokens_seen": 262231776, "step": 121525 }, { "epoch": 19.825448613376835, "grad_norm": 0.05769001692533493, "learning_rate": 1.1622219896229025e-08, "loss": 0.0943, "num_input_tokens_seen": 262242880, "step": 121530 }, { "epoch": 19.82626427406199, "grad_norm": 0.25211548805236816, "learning_rate": 1.1513964056708571e-08, "loss": 0.0561, "num_input_tokens_seen": 262254304, "step": 121535 }, { "epoch": 19.827079934747147, "grad_norm": 0.06997331976890564, "learning_rate": 1.1406214640558022e-08, "loss": 0.0366, "num_input_tokens_seen": 262264640, "step": 121540 }, { "epoch": 19.8278955954323, "grad_norm": 0.02864750660955906, "learning_rate": 1.1298971649961742e-08, "loss": 0.1196, "num_input_tokens_seen": 262276256, "step": 121545 }, { "epoch": 19.828711256117455, "grad_norm": 0.1756308376789093, "learning_rate": 1.119223508709022e-08, "loss": 0.1452, "num_input_tokens_seen": 262287008, "step": 121550 }, { "epoch": 19.82952691680261, "grad_norm": 0.7150293588638306, "learning_rate": 1.1086004954108386e-08, "loss": 0.02, "num_input_tokens_seen": 262297920, "step": 121555 }, { "epoch": 19.830342577487766, "grad_norm": 0.45903950929641724, "learning_rate": 1.0980281253170078e-08, "loss": 0.0188, "num_input_tokens_seen": 262307200, "step": 121560 }, { "epoch": 19.83115823817292, "grad_norm": 0.3886467218399048, "learning_rate": 1.0875063986418022e-08, "loss": 0.1335, "num_input_tokens_seen": 262317024, "step": 121565 }, { "epoch": 19.831973898858074, "grad_norm": 0.32778868079185486, "learning_rate": 1.0770353155983848e-08, "loss": 0.0463, "num_input_tokens_seen": 262327456, "step": 121570 }, { "epoch": 19.83278955954323, "grad_norm": 2.3478081226348877, "learning_rate": 1.0666148763988082e-08, "loss": 0.1266, "num_input_tokens_seen": 262338976, "step": 121575 }, { "epoch": 19.833605220228385, "grad_norm": 0.06823649257421494, "learning_rate": 1.0562450812545698e-08, "loss": 0.0258, "num_input_tokens_seen": 262349408, "step": 121580 }, { "epoch": 19.83442088091354, "grad_norm": 1.296412467956543, "learning_rate": 1.045925930375502e-08, "loss": 0.0942, "num_input_tokens_seen": 262359904, "step": 121585 }, { "epoch": 19.835236541598697, "grad_norm": 0.09736718237400055, "learning_rate": 1.0356574239708817e-08, "loss": 0.0657, "num_input_tokens_seen": 262371200, "step": 121590 }, { "epoch": 19.83605220228385, "grad_norm": 0.22729437053203583, "learning_rate": 1.0254395622491531e-08, "loss": 0.0324, "num_input_tokens_seen": 262382624, "step": 121595 }, { "epoch": 19.836867862969005, "grad_norm": 0.8803805708885193, "learning_rate": 1.0152723454168178e-08, "loss": 0.0373, "num_input_tokens_seen": 262393152, "step": 121600 }, { "epoch": 19.83768352365416, "grad_norm": 0.040282975882291794, "learning_rate": 1.0051557736800998e-08, "loss": 0.0924, "num_input_tokens_seen": 262404192, "step": 121605 }, { "epoch": 19.838499184339316, "grad_norm": 0.8902567625045776, "learning_rate": 9.950898472441128e-09, "loss": 0.0248, "num_input_tokens_seen": 262414528, "step": 121610 }, { "epoch": 19.839314845024468, "grad_norm": 0.2718485891819, "learning_rate": 9.850745663128603e-09, "loss": 0.0972, "num_input_tokens_seen": 262426144, "step": 121615 }, { "epoch": 19.840130505709624, "grad_norm": 0.10542832314968109, "learning_rate": 9.75109931089513e-09, "loss": 0.1765, "num_input_tokens_seen": 262437696, "step": 121620 }, { "epoch": 19.84094616639478, "grad_norm": 1.986027479171753, "learning_rate": 9.651959417755762e-09, "loss": 0.0958, "num_input_tokens_seen": 262447872, "step": 121625 }, { "epoch": 19.841761827079935, "grad_norm": 0.24865606427192688, "learning_rate": 9.553325985722782e-09, "loss": 0.0861, "num_input_tokens_seen": 262458496, "step": 121630 }, { "epoch": 19.84257748776509, "grad_norm": 0.5630926489830017, "learning_rate": 9.455199016794592e-09, "loss": 0.1683, "num_input_tokens_seen": 262470464, "step": 121635 }, { "epoch": 19.843393148450243, "grad_norm": 2.2199273109436035, "learning_rate": 9.357578512958487e-09, "loss": 0.126, "num_input_tokens_seen": 262480000, "step": 121640 }, { "epoch": 19.8442088091354, "grad_norm": 0.3017628788948059, "learning_rate": 9.260464476193442e-09, "loss": 0.1295, "num_input_tokens_seen": 262491424, "step": 121645 }, { "epoch": 19.845024469820554, "grad_norm": 0.026859484612941742, "learning_rate": 9.163856908470104e-09, "loss": 0.0401, "num_input_tokens_seen": 262501568, "step": 121650 }, { "epoch": 19.84584013050571, "grad_norm": 0.8337011337280273, "learning_rate": 9.067755811742463e-09, "loss": 0.0408, "num_input_tokens_seen": 262511616, "step": 121655 }, { "epoch": 19.846655791190866, "grad_norm": 0.4652291238307953, "learning_rate": 8.972161187961735e-09, "loss": 0.0669, "num_input_tokens_seen": 262522880, "step": 121660 }, { "epoch": 19.847471451876018, "grad_norm": 0.47268304228782654, "learning_rate": 8.877073039062489e-09, "loss": 0.0189, "num_input_tokens_seen": 262535008, "step": 121665 }, { "epoch": 19.848287112561174, "grad_norm": 0.5447670817375183, "learning_rate": 8.782491366973734e-09, "loss": 0.2305, "num_input_tokens_seen": 262546144, "step": 121670 }, { "epoch": 19.84910277324633, "grad_norm": 0.4415687918663025, "learning_rate": 8.688416173610603e-09, "loss": 0.0313, "num_input_tokens_seen": 262556736, "step": 121675 }, { "epoch": 19.849918433931485, "grad_norm": 0.05667281150817871, "learning_rate": 8.594847460879907e-09, "loss": 0.1519, "num_input_tokens_seen": 262566560, "step": 121680 }, { "epoch": 19.85073409461664, "grad_norm": 0.04641318321228027, "learning_rate": 8.501785230680126e-09, "loss": 0.1349, "num_input_tokens_seen": 262577440, "step": 121685 }, { "epoch": 19.851549755301793, "grad_norm": 1.7425249814987183, "learning_rate": 8.40922948489309e-09, "loss": 0.2291, "num_input_tokens_seen": 262588224, "step": 121690 }, { "epoch": 19.85236541598695, "grad_norm": 0.3299284875392914, "learning_rate": 8.317180225397847e-09, "loss": 0.0261, "num_input_tokens_seen": 262598240, "step": 121695 }, { "epoch": 19.853181076672104, "grad_norm": 0.03951938822865486, "learning_rate": 8.225637454062352e-09, "loss": 0.1319, "num_input_tokens_seen": 262608288, "step": 121700 }, { "epoch": 19.85399673735726, "grad_norm": 2.413130044937134, "learning_rate": 8.134601172735123e-09, "loss": 0.1415, "num_input_tokens_seen": 262619616, "step": 121705 }, { "epoch": 19.854812398042416, "grad_norm": 0.06489048898220062, "learning_rate": 8.044071383264684e-09, "loss": 0.0873, "num_input_tokens_seen": 262630560, "step": 121710 }, { "epoch": 19.855628058727568, "grad_norm": 0.041891198605298996, "learning_rate": 7.954048087488453e-09, "loss": 0.0217, "num_input_tokens_seen": 262642240, "step": 121715 }, { "epoch": 19.856443719412724, "grad_norm": 2.191417694091797, "learning_rate": 7.86453128722442e-09, "loss": 0.0553, "num_input_tokens_seen": 262653952, "step": 121720 }, { "epoch": 19.85725938009788, "grad_norm": 2.1951050758361816, "learning_rate": 7.77552098429335e-09, "loss": 0.1718, "num_input_tokens_seen": 262664704, "step": 121725 }, { "epoch": 19.858075040783035, "grad_norm": 1.293967366218567, "learning_rate": 7.68701718049658e-09, "loss": 0.126, "num_input_tokens_seen": 262675936, "step": 121730 }, { "epoch": 19.85889070146819, "grad_norm": 0.022372564300894737, "learning_rate": 7.599019877624346e-09, "loss": 0.0988, "num_input_tokens_seen": 262686624, "step": 121735 }, { "epoch": 19.859706362153343, "grad_norm": 0.1627684235572815, "learning_rate": 7.511529077464108e-09, "loss": 0.0363, "num_input_tokens_seen": 262696608, "step": 121740 }, { "epoch": 19.8605220228385, "grad_norm": 0.040404461324214935, "learning_rate": 7.424544781789444e-09, "loss": 0.1275, "num_input_tokens_seen": 262707520, "step": 121745 }, { "epoch": 19.861337683523654, "grad_norm": 0.2285115271806717, "learning_rate": 7.338066992362836e-09, "loss": 0.1043, "num_input_tokens_seen": 262719008, "step": 121750 }, { "epoch": 19.86215334420881, "grad_norm": 0.03301723673939705, "learning_rate": 7.252095710932883e-09, "loss": 0.0618, "num_input_tokens_seen": 262728800, "step": 121755 }, { "epoch": 19.862969004893966, "grad_norm": 0.06865900754928589, "learning_rate": 7.166630939245411e-09, "loss": 0.1163, "num_input_tokens_seen": 262739200, "step": 121760 }, { "epoch": 19.863784665579118, "grad_norm": 0.059812191873788834, "learning_rate": 7.0816726790323695e-09, "loss": 0.1331, "num_input_tokens_seen": 262750144, "step": 121765 }, { "epoch": 19.864600326264274, "grad_norm": 1.4327867031097412, "learning_rate": 6.9972209320146035e-09, "loss": 0.0442, "num_input_tokens_seen": 262760832, "step": 121770 }, { "epoch": 19.86541598694943, "grad_norm": 1.7242789268493652, "learning_rate": 6.913275699904631e-09, "loss": 0.1197, "num_input_tokens_seen": 262771104, "step": 121775 }, { "epoch": 19.866231647634585, "grad_norm": 0.1187661811709404, "learning_rate": 6.829836984401095e-09, "loss": 0.0447, "num_input_tokens_seen": 262782976, "step": 121780 }, { "epoch": 19.86704730831974, "grad_norm": 1.9454994201660156, "learning_rate": 6.746904787199859e-09, "loss": 0.1876, "num_input_tokens_seen": 262794112, "step": 121785 }, { "epoch": 19.867862969004893, "grad_norm": 1.3476886749267578, "learning_rate": 6.664479109977361e-09, "loss": 0.2456, "num_input_tokens_seen": 262805728, "step": 121790 }, { "epoch": 19.86867862969005, "grad_norm": 1.4973421096801758, "learning_rate": 6.582559954404488e-09, "loss": 0.2505, "num_input_tokens_seen": 262815168, "step": 121795 }, { "epoch": 19.869494290375204, "grad_norm": 1.5727022886276245, "learning_rate": 6.501147322143797e-09, "loss": 0.0749, "num_input_tokens_seen": 262826816, "step": 121800 }, { "epoch": 19.87030995106036, "grad_norm": 0.055202215909957886, "learning_rate": 6.420241214843969e-09, "loss": 0.0244, "num_input_tokens_seen": 262837440, "step": 121805 }, { "epoch": 19.871125611745512, "grad_norm": 1.3748406171798706, "learning_rate": 6.3398416341453606e-09, "loss": 0.0295, "num_input_tokens_seen": 262847904, "step": 121810 }, { "epoch": 19.871941272430668, "grad_norm": 1.8804869651794434, "learning_rate": 6.259948581674446e-09, "loss": 0.141, "num_input_tokens_seen": 262858368, "step": 121815 }, { "epoch": 19.872756933115824, "grad_norm": 0.1874294877052307, "learning_rate": 6.180562059054929e-09, "loss": 0.0976, "num_input_tokens_seen": 262869408, "step": 121820 }, { "epoch": 19.87357259380098, "grad_norm": 0.02210954762995243, "learning_rate": 6.101682067891079e-09, "loss": 0.0899, "num_input_tokens_seen": 262881024, "step": 121825 }, { "epoch": 19.874388254486135, "grad_norm": 0.11565112322568893, "learning_rate": 6.023308609784395e-09, "loss": 0.0557, "num_input_tokens_seen": 262891264, "step": 121830 }, { "epoch": 19.875203915171287, "grad_norm": 1.8602725267410278, "learning_rate": 5.945441686322495e-09, "loss": 0.0419, "num_input_tokens_seen": 262901888, "step": 121835 }, { "epoch": 19.876019575856443, "grad_norm": 1.4949678182601929, "learning_rate": 5.868081299081896e-09, "loss": 0.0394, "num_input_tokens_seen": 262914336, "step": 121840 }, { "epoch": 19.8768352365416, "grad_norm": 0.9575582146644592, "learning_rate": 5.791227449633562e-09, "loss": 0.1416, "num_input_tokens_seen": 262924352, "step": 121845 }, { "epoch": 19.877650897226754, "grad_norm": 0.2865937054157257, "learning_rate": 5.714880139534584e-09, "loss": 0.029, "num_input_tokens_seen": 262935264, "step": 121850 }, { "epoch": 19.87846655791191, "grad_norm": 3.194875478744507, "learning_rate": 5.63903937032817e-09, "loss": 0.1249, "num_input_tokens_seen": 262945920, "step": 121855 }, { "epoch": 19.879282218597062, "grad_norm": 0.15277951955795288, "learning_rate": 5.563705143557529e-09, "loss": 0.077, "num_input_tokens_seen": 262957376, "step": 121860 }, { "epoch": 19.880097879282218, "grad_norm": 0.05614316463470459, "learning_rate": 5.4888774607436685e-09, "loss": 0.0247, "num_input_tokens_seen": 262967168, "step": 121865 }, { "epoch": 19.880913539967374, "grad_norm": 1.4233661890029907, "learning_rate": 5.414556323404818e-09, "loss": 0.1149, "num_input_tokens_seen": 262976832, "step": 121870 }, { "epoch": 19.88172920065253, "grad_norm": 0.05131173133850098, "learning_rate": 5.34074173305088e-09, "loss": 0.0083, "num_input_tokens_seen": 262988512, "step": 121875 }, { "epoch": 19.882544861337685, "grad_norm": 0.08220246434211731, "learning_rate": 5.2674336911723296e-09, "loss": 0.0379, "num_input_tokens_seen": 262999840, "step": 121880 }, { "epoch": 19.883360522022837, "grad_norm": 0.4570997357368469, "learning_rate": 5.194632199259642e-09, "loss": 0.0443, "num_input_tokens_seen": 263010912, "step": 121885 }, { "epoch": 19.884176182707993, "grad_norm": 0.9969030022621155, "learning_rate": 5.12233725878386e-09, "loss": 0.1346, "num_input_tokens_seen": 263020992, "step": 121890 }, { "epoch": 19.88499184339315, "grad_norm": 1.313612699508667, "learning_rate": 5.050548871213256e-09, "loss": 0.0417, "num_input_tokens_seen": 263031456, "step": 121895 }, { "epoch": 19.885807504078304, "grad_norm": 0.020313313230872154, "learning_rate": 4.979267037999447e-09, "loss": 0.0993, "num_input_tokens_seen": 263043136, "step": 121900 }, { "epoch": 19.88662316476346, "grad_norm": 0.3852814733982086, "learning_rate": 4.9084917605912714e-09, "loss": 0.151, "num_input_tokens_seen": 263053728, "step": 121905 }, { "epoch": 19.887438825448612, "grad_norm": 0.15856340527534485, "learning_rate": 4.838223040420919e-09, "loss": 0.0115, "num_input_tokens_seen": 263064832, "step": 121910 }, { "epoch": 19.888254486133768, "grad_norm": 0.017367210239171982, "learning_rate": 4.768460878912251e-09, "loss": 0.0195, "num_input_tokens_seen": 263075360, "step": 121915 }, { "epoch": 19.889070146818923, "grad_norm": 0.04969681799411774, "learning_rate": 4.699205277478025e-09, "loss": 0.0446, "num_input_tokens_seen": 263086880, "step": 121920 }, { "epoch": 19.88988580750408, "grad_norm": 0.08153853565454483, "learning_rate": 4.630456237525449e-09, "loss": 0.0304, "num_input_tokens_seen": 263097696, "step": 121925 }, { "epoch": 19.890701468189235, "grad_norm": 0.12007607519626617, "learning_rate": 4.562213760445078e-09, "loss": 0.1697, "num_input_tokens_seen": 263108384, "step": 121930 }, { "epoch": 19.891517128874387, "grad_norm": 3.76242733001709, "learning_rate": 4.494477847619138e-09, "loss": 0.183, "num_input_tokens_seen": 263117984, "step": 121935 }, { "epoch": 19.892332789559543, "grad_norm": 0.33296093344688416, "learning_rate": 4.427248500421532e-09, "loss": 0.088, "num_input_tokens_seen": 263128544, "step": 121940 }, { "epoch": 19.8931484502447, "grad_norm": 0.1922794133424759, "learning_rate": 4.360525720217834e-09, "loss": 0.0281, "num_input_tokens_seen": 263139136, "step": 121945 }, { "epoch": 19.893964110929854, "grad_norm": 0.06533188372850418, "learning_rate": 4.294309508354189e-09, "loss": 0.0972, "num_input_tokens_seen": 263149536, "step": 121950 }, { "epoch": 19.894779771615006, "grad_norm": 0.07562132179737091, "learning_rate": 4.228599866176741e-09, "loss": 0.068, "num_input_tokens_seen": 263159808, "step": 121955 }, { "epoch": 19.895595432300162, "grad_norm": 0.2826738655567169, "learning_rate": 4.16339679501776e-09, "loss": 0.283, "num_input_tokens_seen": 263170752, "step": 121960 }, { "epoch": 19.896411092985318, "grad_norm": 2.0484845638275146, "learning_rate": 4.098700296195635e-09, "loss": 0.101, "num_input_tokens_seen": 263180736, "step": 121965 }, { "epoch": 19.897226753670473, "grad_norm": 0.6317391395568848, "learning_rate": 4.034510371020428e-09, "loss": 0.1545, "num_input_tokens_seen": 263190464, "step": 121970 }, { "epoch": 19.89804241435563, "grad_norm": 1.8356554508209229, "learning_rate": 3.970827020799428e-09, "loss": 0.0319, "num_input_tokens_seen": 263201344, "step": 121975 }, { "epoch": 19.898858075040785, "grad_norm": 1.680661678314209, "learning_rate": 3.907650246817718e-09, "loss": 0.1202, "num_input_tokens_seen": 263212672, "step": 121980 }, { "epoch": 19.899673735725937, "grad_norm": 0.03269829601049423, "learning_rate": 3.844980050357605e-09, "loss": 0.1934, "num_input_tokens_seen": 263223136, "step": 121985 }, { "epoch": 19.900489396411093, "grad_norm": 0.029648929834365845, "learning_rate": 3.782816432687519e-09, "loss": 0.1571, "num_input_tokens_seen": 263233632, "step": 121990 }, { "epoch": 19.90130505709625, "grad_norm": 0.1952977329492569, "learning_rate": 3.721159395070339e-09, "loss": 0.0088, "num_input_tokens_seen": 263243136, "step": 121995 }, { "epoch": 19.902120717781404, "grad_norm": 0.9690614938735962, "learning_rate": 3.6600089387522906e-09, "loss": 0.1565, "num_input_tokens_seen": 263255008, "step": 122000 }, { "epoch": 19.902936378466556, "grad_norm": 0.33401167392730713, "learning_rate": 3.599365064974047e-09, "loss": 0.1415, "num_input_tokens_seen": 263266656, "step": 122005 }, { "epoch": 19.903752039151712, "grad_norm": 0.9615111351013184, "learning_rate": 3.5392277749679568e-09, "loss": 0.1143, "num_input_tokens_seen": 263277120, "step": 122010 }, { "epoch": 19.904567699836868, "grad_norm": 1.7485337257385254, "learning_rate": 3.4795970699469383e-09, "loss": 0.0349, "num_input_tokens_seen": 263288640, "step": 122015 }, { "epoch": 19.905383360522023, "grad_norm": 0.06527096778154373, "learning_rate": 3.420472951121134e-09, "loss": 0.0426, "num_input_tokens_seen": 263300256, "step": 122020 }, { "epoch": 19.90619902120718, "grad_norm": 0.10665667802095413, "learning_rate": 3.3618554196923613e-09, "loss": 0.1066, "num_input_tokens_seen": 263311552, "step": 122025 }, { "epoch": 19.90701468189233, "grad_norm": 1.658725619316101, "learning_rate": 3.3037444768457826e-09, "loss": 0.0927, "num_input_tokens_seen": 263322112, "step": 122030 }, { "epoch": 19.907830342577487, "grad_norm": 0.03634597361087799, "learning_rate": 3.2461401237582346e-09, "loss": 0.1392, "num_input_tokens_seen": 263331680, "step": 122035 }, { "epoch": 19.908646003262643, "grad_norm": 1.372179388999939, "learning_rate": 3.189042361598227e-09, "loss": 0.1141, "num_input_tokens_seen": 263342784, "step": 122040 }, { "epoch": 19.9094616639478, "grad_norm": 0.1819254755973816, "learning_rate": 3.1324511915259423e-09, "loss": 0.0929, "num_input_tokens_seen": 263354336, "step": 122045 }, { "epoch": 19.910277324632954, "grad_norm": 0.15652428567409515, "learning_rate": 3.0763666146821357e-09, "loss": 0.027, "num_input_tokens_seen": 263365408, "step": 122050 }, { "epoch": 19.911092985318106, "grad_norm": 0.08151589334011078, "learning_rate": 3.0207886322075608e-09, "loss": 0.1097, "num_input_tokens_seen": 263376096, "step": 122055 }, { "epoch": 19.911908646003262, "grad_norm": 1.9418085813522339, "learning_rate": 2.965717245229094e-09, "loss": 0.0475, "num_input_tokens_seen": 263386944, "step": 122060 }, { "epoch": 19.912724306688418, "grad_norm": 0.06855130940675735, "learning_rate": 2.9111524548597334e-09, "loss": 0.1793, "num_input_tokens_seen": 263396768, "step": 122065 }, { "epoch": 19.913539967373573, "grad_norm": 1.79339599609375, "learning_rate": 2.8570942622069274e-09, "loss": 0.1067, "num_input_tokens_seen": 263407104, "step": 122070 }, { "epoch": 19.91435562805873, "grad_norm": 0.09806057065725327, "learning_rate": 2.803542668364245e-09, "loss": 0.0534, "num_input_tokens_seen": 263417184, "step": 122075 }, { "epoch": 19.91517128874388, "grad_norm": 1.6939568519592285, "learning_rate": 2.7504976744224807e-09, "loss": 0.0441, "num_input_tokens_seen": 263428608, "step": 122080 }, { "epoch": 19.915986949429037, "grad_norm": 0.37778690457344055, "learning_rate": 2.697959281450224e-09, "loss": 0.0822, "num_input_tokens_seen": 263439360, "step": 122085 }, { "epoch": 19.916802610114193, "grad_norm": 2.190861701965332, "learning_rate": 2.6459274905188403e-09, "loss": 0.1107, "num_input_tokens_seen": 263449664, "step": 122090 }, { "epoch": 19.91761827079935, "grad_norm": 0.026324797421693802, "learning_rate": 2.5944023026747145e-09, "loss": 0.1262, "num_input_tokens_seen": 263460512, "step": 122095 }, { "epoch": 19.918433931484504, "grad_norm": 0.7591081261634827, "learning_rate": 2.543383718969783e-09, "loss": 0.0621, "num_input_tokens_seen": 263470976, "step": 122100 }, { "epoch": 19.919249592169656, "grad_norm": 0.08725504577159882, "learning_rate": 2.4928717404337774e-09, "loss": 0.1975, "num_input_tokens_seen": 263481600, "step": 122105 }, { "epoch": 19.920065252854812, "grad_norm": 0.08151744306087494, "learning_rate": 2.442866368090879e-09, "loss": 0.0311, "num_input_tokens_seen": 263492896, "step": 122110 }, { "epoch": 19.920880913539968, "grad_norm": 0.3245460093021393, "learning_rate": 2.393367602956942e-09, "loss": 0.0264, "num_input_tokens_seen": 263504480, "step": 122115 }, { "epoch": 19.921696574225123, "grad_norm": 0.026916628703475, "learning_rate": 2.3443754460311662e-09, "loss": 0.0214, "num_input_tokens_seen": 263515872, "step": 122120 }, { "epoch": 19.92251223491028, "grad_norm": 0.8574168682098389, "learning_rate": 2.2958898983072017e-09, "loss": 0.0605, "num_input_tokens_seen": 263526656, "step": 122125 }, { "epoch": 19.92332789559543, "grad_norm": 0.03087957389652729, "learning_rate": 2.247910960770372e-09, "loss": 0.1326, "num_input_tokens_seen": 263537920, "step": 122130 }, { "epoch": 19.924143556280587, "grad_norm": 0.1489318460226059, "learning_rate": 2.2004386343921212e-09, "loss": 0.1323, "num_input_tokens_seen": 263549984, "step": 122135 }, { "epoch": 19.924959216965743, "grad_norm": 0.14789476990699768, "learning_rate": 2.153472920132793e-09, "loss": 0.1292, "num_input_tokens_seen": 263559392, "step": 122140 }, { "epoch": 19.9257748776509, "grad_norm": 0.04197624325752258, "learning_rate": 2.1070138189471788e-09, "loss": 0.0273, "num_input_tokens_seen": 263569920, "step": 122145 }, { "epoch": 19.92659053833605, "grad_norm": 0.16040568053722382, "learning_rate": 2.061061331773417e-09, "loss": 0.0353, "num_input_tokens_seen": 263580352, "step": 122150 }, { "epoch": 19.927406199021206, "grad_norm": 0.03285755589604378, "learning_rate": 2.015615459544096e-09, "loss": 0.0723, "num_input_tokens_seen": 263591104, "step": 122155 }, { "epoch": 19.928221859706362, "grad_norm": 2.5062832832336426, "learning_rate": 1.970676203180699e-09, "loss": 0.0629, "num_input_tokens_seen": 263602336, "step": 122160 }, { "epoch": 19.929037520391518, "grad_norm": 0.3353431522846222, "learning_rate": 1.9262435635936104e-09, "loss": 0.0394, "num_input_tokens_seen": 263613472, "step": 122165 }, { "epoch": 19.929853181076673, "grad_norm": 0.03684893250465393, "learning_rate": 1.882317541684886e-09, "loss": 0.1694, "num_input_tokens_seen": 263623104, "step": 122170 }, { "epoch": 19.930668841761825, "grad_norm": 1.4661005735397339, "learning_rate": 1.838898138342704e-09, "loss": 0.2531, "num_input_tokens_seen": 263634592, "step": 122175 }, { "epoch": 19.93148450244698, "grad_norm": 0.7821072340011597, "learning_rate": 1.7959853544469162e-09, "loss": 0.1135, "num_input_tokens_seen": 263643808, "step": 122180 }, { "epoch": 19.932300163132137, "grad_norm": 1.1093024015426636, "learning_rate": 1.753579190869048e-09, "loss": 0.1194, "num_input_tokens_seen": 263654336, "step": 122185 }, { "epoch": 19.933115823817293, "grad_norm": 0.17084777355194092, "learning_rate": 1.7116796484667464e-09, "loss": 0.0393, "num_input_tokens_seen": 263665440, "step": 122190 }, { "epoch": 19.93393148450245, "grad_norm": 1.1940195560455322, "learning_rate": 1.6702867280921074e-09, "loss": 0.1425, "num_input_tokens_seen": 263676224, "step": 122195 }, { "epoch": 19.9347471451876, "grad_norm": 0.1884172558784485, "learning_rate": 1.6294004305777987e-09, "loss": 0.1081, "num_input_tokens_seen": 263687040, "step": 122200 }, { "epoch": 19.935562805872756, "grad_norm": 0.1252693384885788, "learning_rate": 1.589020756759263e-09, "loss": 0.0338, "num_input_tokens_seen": 263697504, "step": 122205 }, { "epoch": 19.936378466557912, "grad_norm": 0.5475061535835266, "learning_rate": 1.5491477074497384e-09, "loss": 0.0248, "num_input_tokens_seen": 263708672, "step": 122210 }, { "epoch": 19.937194127243067, "grad_norm": 0.11603488773107529, "learning_rate": 1.5097812834624635e-09, "loss": 0.1208, "num_input_tokens_seen": 263718688, "step": 122215 }, { "epoch": 19.938009787928223, "grad_norm": 0.1267077773809433, "learning_rate": 1.4709214855884722e-09, "loss": 0.2291, "num_input_tokens_seen": 263730464, "step": 122220 }, { "epoch": 19.938825448613375, "grad_norm": 0.1087704449892044, "learning_rate": 1.432568314621574e-09, "loss": 0.0106, "num_input_tokens_seen": 263741696, "step": 122225 }, { "epoch": 19.93964110929853, "grad_norm": 1.3558018207550049, "learning_rate": 1.3947217713361493e-09, "loss": 0.0927, "num_input_tokens_seen": 263752448, "step": 122230 }, { "epoch": 19.940456769983687, "grad_norm": 0.3790532052516937, "learning_rate": 1.3573818565010276e-09, "loss": 0.1979, "num_input_tokens_seen": 263763232, "step": 122235 }, { "epoch": 19.941272430668842, "grad_norm": 0.050111137330532074, "learning_rate": 1.3205485708711608e-09, "loss": 0.011, "num_input_tokens_seen": 263774624, "step": 122240 }, { "epoch": 19.942088091353998, "grad_norm": 0.038422055542469025, "learning_rate": 1.2842219151903978e-09, "loss": 0.1093, "num_input_tokens_seen": 263784992, "step": 122245 }, { "epoch": 19.94290375203915, "grad_norm": 0.035215891897678375, "learning_rate": 1.2484018902025884e-09, "loss": 0.0168, "num_input_tokens_seen": 263796576, "step": 122250 }, { "epoch": 19.943719412724306, "grad_norm": 1.7406100034713745, "learning_rate": 1.213088496626602e-09, "loss": 0.0852, "num_input_tokens_seen": 263806752, "step": 122255 }, { "epoch": 19.94453507340946, "grad_norm": 0.5623646974563599, "learning_rate": 1.1782817351785324e-09, "loss": 0.122, "num_input_tokens_seen": 263817344, "step": 122260 }, { "epoch": 19.945350734094617, "grad_norm": 0.13766300678253174, "learning_rate": 1.1439816065689224e-09, "loss": 0.0195, "num_input_tokens_seen": 263828416, "step": 122265 }, { "epoch": 19.946166394779773, "grad_norm": 2.548496723175049, "learning_rate": 1.1101881114888857e-09, "loss": 0.1369, "num_input_tokens_seen": 263838976, "step": 122270 }, { "epoch": 19.946982055464925, "grad_norm": 0.10973536968231201, "learning_rate": 1.076901250623985e-09, "loss": 0.0477, "num_input_tokens_seen": 263848800, "step": 122275 }, { "epoch": 19.94779771615008, "grad_norm": 0.11582442373037338, "learning_rate": 1.0441210246514565e-09, "loss": 0.0347, "num_input_tokens_seen": 263859520, "step": 122280 }, { "epoch": 19.948613376835237, "grad_norm": 1.0323830842971802, "learning_rate": 1.0118474342291074e-09, "loss": 0.0915, "num_input_tokens_seen": 263869792, "step": 122285 }, { "epoch": 19.949429037520392, "grad_norm": 0.2097722291946411, "learning_rate": 9.800804800175202e-10, "loss": 0.015, "num_input_tokens_seen": 263880160, "step": 122290 }, { "epoch": 19.950244698205548, "grad_norm": 1.2969844341278076, "learning_rate": 9.488201626578485e-10, "loss": 0.0724, "num_input_tokens_seen": 263889824, "step": 122295 }, { "epoch": 19.9510603588907, "grad_norm": 0.06781689077615738, "learning_rate": 9.180664827856955e-10, "loss": 0.0357, "num_input_tokens_seen": 263901472, "step": 122300 }, { "epoch": 19.951876019575856, "grad_norm": 0.12021990865468979, "learning_rate": 8.878194410200102e-10, "loss": 0.2167, "num_input_tokens_seen": 263911808, "step": 122305 }, { "epoch": 19.95269168026101, "grad_norm": 0.3322199881076813, "learning_rate": 8.580790379769665e-10, "loss": 0.1764, "num_input_tokens_seen": 263922784, "step": 122310 }, { "epoch": 19.953507340946167, "grad_norm": 0.0646962970495224, "learning_rate": 8.288452742588604e-10, "loss": 0.0421, "num_input_tokens_seen": 263932768, "step": 122315 }, { "epoch": 19.954323001631323, "grad_norm": 2.0265605449676514, "learning_rate": 8.001181504568855e-10, "loss": 0.1007, "num_input_tokens_seen": 263944320, "step": 122320 }, { "epoch": 19.955138662316475, "grad_norm": 1.062121868133545, "learning_rate": 7.718976671539091e-10, "loss": 0.057, "num_input_tokens_seen": 263955456, "step": 122325 }, { "epoch": 19.95595432300163, "grad_norm": 0.05890626087784767, "learning_rate": 7.441838249244715e-10, "loss": 0.0347, "num_input_tokens_seen": 263965664, "step": 122330 }, { "epoch": 19.956769983686787, "grad_norm": 0.03743197023868561, "learning_rate": 7.169766243264597e-10, "loss": 0.0132, "num_input_tokens_seen": 263976960, "step": 122335 }, { "epoch": 19.957585644371942, "grad_norm": 0.05002625286579132, "learning_rate": 6.902760659122099e-10, "loss": 0.2786, "num_input_tokens_seen": 263987776, "step": 122340 }, { "epoch": 19.958401305057095, "grad_norm": 0.14696939289569855, "learning_rate": 6.640821502257311e-10, "loss": 0.0238, "num_input_tokens_seen": 263997536, "step": 122345 }, { "epoch": 19.95921696574225, "grad_norm": 0.034686848521232605, "learning_rate": 6.383948777943794e-10, "loss": 0.2211, "num_input_tokens_seen": 264008864, "step": 122350 }, { "epoch": 19.960032626427406, "grad_norm": 0.15277336537837982, "learning_rate": 6.132142491371839e-10, "loss": 0.1226, "num_input_tokens_seen": 264018784, "step": 122355 }, { "epoch": 19.96084828711256, "grad_norm": 0.644124448299408, "learning_rate": 5.885402647703986e-10, "loss": 0.0282, "num_input_tokens_seen": 264029312, "step": 122360 }, { "epoch": 19.961663947797717, "grad_norm": 1.7306236028671265, "learning_rate": 5.64372925190848e-10, "loss": 0.1954, "num_input_tokens_seen": 264040672, "step": 122365 }, { "epoch": 19.96247960848287, "grad_norm": 0.023943183943629265, "learning_rate": 5.407122308870305e-10, "loss": 0.1353, "num_input_tokens_seen": 264050432, "step": 122370 }, { "epoch": 19.963295269168025, "grad_norm": 0.1032557338476181, "learning_rate": 5.175581823391173e-10, "loss": 0.0486, "num_input_tokens_seen": 264061280, "step": 122375 }, { "epoch": 19.96411092985318, "grad_norm": 0.7333072423934937, "learning_rate": 4.949107800189535e-10, "loss": 0.0735, "num_input_tokens_seen": 264072000, "step": 122380 }, { "epoch": 19.964926590538337, "grad_norm": 0.7718019485473633, "learning_rate": 4.727700243817301e-10, "loss": 0.1858, "num_input_tokens_seen": 264083296, "step": 122385 }, { "epoch": 19.965742251223492, "grad_norm": 0.6279097199440002, "learning_rate": 4.511359158798634e-10, "loss": 0.1413, "num_input_tokens_seen": 264093568, "step": 122390 }, { "epoch": 19.966557911908644, "grad_norm": 1.9523394107818604, "learning_rate": 4.3000845494911566e-10, "loss": 0.0979, "num_input_tokens_seen": 264105248, "step": 122395 }, { "epoch": 19.9673735725938, "grad_norm": 0.5654808878898621, "learning_rate": 4.093876420169229e-10, "loss": 0.0847, "num_input_tokens_seen": 264115232, "step": 122400 }, { "epoch": 19.968189233278956, "grad_norm": 0.9443913698196411, "learning_rate": 3.8927347750516985e-10, "loss": 0.1187, "num_input_tokens_seen": 264126912, "step": 122405 }, { "epoch": 19.96900489396411, "grad_norm": 0.2222462296485901, "learning_rate": 3.696659618163123e-10, "loss": 0.0338, "num_input_tokens_seen": 264138176, "step": 122410 }, { "epoch": 19.969820554649267, "grad_norm": 0.26853299140930176, "learning_rate": 3.505650953528061e-10, "loss": 0.0239, "num_input_tokens_seen": 264149888, "step": 122415 }, { "epoch": 19.97063621533442, "grad_norm": 0.05927435681223869, "learning_rate": 3.3197087850045383e-10, "loss": 0.0269, "num_input_tokens_seen": 264160064, "step": 122420 }, { "epoch": 19.971451876019575, "grad_norm": 0.2130175679922104, "learning_rate": 3.138833116311801e-10, "loss": 0.0675, "num_input_tokens_seen": 264170304, "step": 122425 }, { "epoch": 19.97226753670473, "grad_norm": 0.04535222798585892, "learning_rate": 2.9630239511968525e-10, "loss": 0.1657, "num_input_tokens_seen": 264181344, "step": 122430 }, { "epoch": 19.973083197389887, "grad_norm": 0.03471606224775314, "learning_rate": 2.7922812931568953e-10, "loss": 0.0823, "num_input_tokens_seen": 264192736, "step": 122435 }, { "epoch": 19.973898858075042, "grad_norm": 0.20828357338905334, "learning_rate": 2.626605145661376e-10, "loss": 0.094, "num_input_tokens_seen": 264204096, "step": 122440 }, { "epoch": 19.974714518760194, "grad_norm": 0.38146528601646423, "learning_rate": 2.465995512096475e-10, "loss": 0.0227, "num_input_tokens_seen": 264215520, "step": 122445 }, { "epoch": 19.97553017944535, "grad_norm": 3.2851734161376953, "learning_rate": 2.3104523957095947e-10, "loss": 0.1367, "num_input_tokens_seen": 264226464, "step": 122450 }, { "epoch": 19.976345840130506, "grad_norm": 0.264967679977417, "learning_rate": 2.159975799637115e-10, "loss": 0.0693, "num_input_tokens_seen": 264237760, "step": 122455 }, { "epoch": 19.97716150081566, "grad_norm": 0.4229033589363098, "learning_rate": 2.0145657269321494e-10, "loss": 0.1247, "num_input_tokens_seen": 264248096, "step": 122460 }, { "epoch": 19.977977161500817, "grad_norm": 0.04761987179517746, "learning_rate": 1.8742221805645444e-10, "loss": 0.0056, "num_input_tokens_seen": 264258752, "step": 122465 }, { "epoch": 19.97879282218597, "grad_norm": 0.12940838932991028, "learning_rate": 1.738945163337613e-10, "loss": 0.0944, "num_input_tokens_seen": 264268864, "step": 122470 }, { "epoch": 19.979608482871125, "grad_norm": 0.08404816687107086, "learning_rate": 1.608734678026913e-10, "loss": 0.0165, "num_input_tokens_seen": 264280096, "step": 122475 }, { "epoch": 19.98042414355628, "grad_norm": 0.16252419352531433, "learning_rate": 1.483590727269224e-10, "loss": 0.08, "num_input_tokens_seen": 264291424, "step": 122480 }, { "epoch": 19.981239804241437, "grad_norm": 1.4342366456985474, "learning_rate": 1.3635133135903033e-10, "loss": 0.1289, "num_input_tokens_seen": 264302528, "step": 122485 }, { "epoch": 19.982055464926592, "grad_norm": 0.24660566449165344, "learning_rate": 1.2485024394326417e-10, "loss": 0.0946, "num_input_tokens_seen": 264312960, "step": 122490 }, { "epoch": 19.982871125611744, "grad_norm": 3.160301685333252, "learning_rate": 1.1385581070999518e-10, "loss": 0.0993, "num_input_tokens_seen": 264322368, "step": 122495 }, { "epoch": 19.9836867862969, "grad_norm": 0.02933911792933941, "learning_rate": 1.0336803188404354e-10, "loss": 0.0897, "num_input_tokens_seen": 264332800, "step": 122500 }, { "epoch": 19.984502446982056, "grad_norm": 0.03227359801530838, "learning_rate": 9.338690767912717e-11, "loss": 0.0497, "num_input_tokens_seen": 264342176, "step": 122505 }, { "epoch": 19.98531810766721, "grad_norm": 1.5050967931747437, "learning_rate": 8.391243829786178e-11, "loss": 0.2302, "num_input_tokens_seen": 264352576, "step": 122510 }, { "epoch": 19.986133768352367, "grad_norm": 3.0288193225860596, "learning_rate": 7.494462392898527e-11, "loss": 0.0349, "num_input_tokens_seen": 264364192, "step": 122515 }, { "epoch": 19.98694942903752, "grad_norm": 1.5076173543930054, "learning_rate": 6.648346475568445e-11, "loss": 0.3224, "num_input_tokens_seen": 264375104, "step": 122520 }, { "epoch": 19.987765089722675, "grad_norm": 0.042695071548223495, "learning_rate": 5.852896095004389e-11, "loss": 0.1434, "num_input_tokens_seen": 264385600, "step": 122525 }, { "epoch": 19.98858075040783, "grad_norm": 1.8874894380569458, "learning_rate": 5.108111267304594e-11, "loss": 0.1456, "num_input_tokens_seen": 264396864, "step": 122530 }, { "epoch": 19.989396411092986, "grad_norm": 0.7193653583526611, "learning_rate": 4.413992007457068e-11, "loss": 0.028, "num_input_tokens_seen": 264408416, "step": 122535 }, { "epoch": 19.99021207177814, "grad_norm": 0.04919110983610153, "learning_rate": 3.770538329617157e-11, "loss": 0.0274, "num_input_tokens_seen": 264418656, "step": 122540 }, { "epoch": 19.991027732463294, "grad_norm": 1.3630719184875488, "learning_rate": 3.177750247107536e-11, "loss": 0.1662, "num_input_tokens_seen": 264430432, "step": 122545 }, { "epoch": 19.99184339314845, "grad_norm": 0.4269044101238251, "learning_rate": 2.6356277715855472e-11, "loss": 0.0387, "num_input_tokens_seen": 264441248, "step": 122550 }, { "epoch": 19.992659053833606, "grad_norm": 0.056920748203992844, "learning_rate": 2.1441709138758646e-11, "loss": 0.103, "num_input_tokens_seen": 264451520, "step": 122555 }, { "epoch": 19.99347471451876, "grad_norm": 1.5892914533615112, "learning_rate": 1.7033796845256078e-11, "loss": 0.0915, "num_input_tokens_seen": 264461696, "step": 122560 }, { "epoch": 19.994290375203914, "grad_norm": 0.8725868463516235, "learning_rate": 1.3132540918614489e-11, "loss": 0.1335, "num_input_tokens_seen": 264474336, "step": 122565 }, { "epoch": 19.99510603588907, "grad_norm": 1.417527675628662, "learning_rate": 9.737941442100607e-12, "loss": 0.1333, "num_input_tokens_seen": 264486016, "step": 122570 }, { "epoch": 19.995921696574225, "grad_norm": 0.7625856995582581, "learning_rate": 6.849998482327813e-12, "loss": 0.1176, "num_input_tokens_seen": 264496320, "step": 122575 }, { "epoch": 19.99673735725938, "grad_norm": 0.8121002316474915, "learning_rate": 4.4687120975828165e-12, "loss": 0.0824, "num_input_tokens_seen": 264505856, "step": 122580 }, { "epoch": 19.997553017944536, "grad_norm": 0.09873341768980026, "learning_rate": 2.594082337825654e-12, "loss": 0.1198, "num_input_tokens_seen": 264515744, "step": 122585 }, { "epoch": 19.99836867862969, "grad_norm": 3.1286568641662598, "learning_rate": 1.2261092419141307e-12, "loss": 0.1871, "num_input_tokens_seen": 264526496, "step": 122590 }, { "epoch": 19.999184339314844, "grad_norm": 0.27642419934272766, "learning_rate": 3.6479283482826476e-13, "loss": 0.1029, "num_input_tokens_seen": 264537760, "step": 122595 }, { "epoch": 20.0, "grad_norm": 0.13737432658672333, "learning_rate": 1.0133133221401636e-14, "loss": 0.06, "num_input_tokens_seen": 264547520, "step": 122600 }, { "epoch": 20.0, "eval_loss": 0.14502523839473724, "eval_runtime": 90.9308, "eval_samples_per_second": 29.968, "eval_steps_per_second": 7.5, "num_input_tokens_seen": 264547520, "step": 122600 }, { "epoch": 20.0, "num_input_tokens_seen": 264547520, "step": 122600, "total_flos": 1.1912767331539354e+19, "train_loss": 0.12777723142658443, "train_runtime": 55024.6242, "train_samples_per_second": 8.912, "train_steps_per_second": 2.228 } ], "logging_steps": 5, "max_steps": 122600, "num_input_tokens_seen": 264547520, "num_train_epochs": 20, "save_steps": 6130, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1912767331539354e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }