{ "best_global_step": 12260, "best_metric": 0.14923527836799622, "best_model_checkpoint": "saves/lntuning/llama-3-8b-instruct/train_multirc_1753094164/checkpoint-12260", "epoch": 10.0, "eval_steps": 3065, "global_step": 61300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008156606851549756, "grad_norm": 3.3118317127227783, "learning_rate": 3.262642740619902e-08, "loss": 1.8213, "num_input_tokens_seen": 8928, "step": 5 }, { "epoch": 0.0016313213703099511, "grad_norm": 18.186201095581055, "learning_rate": 7.34094616639478e-08, "loss": 1.1136, "num_input_tokens_seen": 20448, "step": 10 }, { "epoch": 0.0024469820554649264, "grad_norm": 7.66547155380249, "learning_rate": 1.1419249592169658e-07, "loss": 0.4922, "num_input_tokens_seen": 31072, "step": 15 }, { "epoch": 0.0032626427406199023, "grad_norm": 26.939163208007812, "learning_rate": 1.5497553017944535e-07, "loss": 1.0222, "num_input_tokens_seen": 42528, "step": 20 }, { "epoch": 0.004078303425774877, "grad_norm": 10.637971878051758, "learning_rate": 1.9575856443719413e-07, "loss": 1.2831, "num_input_tokens_seen": 52576, "step": 25 }, { "epoch": 0.004893964110929853, "grad_norm": 16.892108917236328, "learning_rate": 2.365415986949429e-07, "loss": 0.7375, "num_input_tokens_seen": 62944, "step": 30 }, { "epoch": 0.005709624796084829, "grad_norm": 5.437825679779053, "learning_rate": 2.773246329526917e-07, "loss": 0.7678, "num_input_tokens_seen": 73504, "step": 35 }, { "epoch": 0.0065252854812398045, "grad_norm": 17.547048568725586, "learning_rate": 3.1810766721044045e-07, "loss": 0.994, "num_input_tokens_seen": 84640, "step": 40 }, { "epoch": 0.00734094616639478, "grad_norm": 4.541830062866211, "learning_rate": 3.5889070146818926e-07, "loss": 0.7076, "num_input_tokens_seen": 96288, "step": 45 }, { "epoch": 0.008156606851549755, "grad_norm": 13.801979064941406, "learning_rate": 3.99673735725938e-07, "loss": 0.9782, "num_input_tokens_seen": 107360, "step": 50 }, { "epoch": 0.00897226753670473, "grad_norm": 14.718707084655762, "learning_rate": 4.4045676998368683e-07, "loss": 0.83, "num_input_tokens_seen": 118432, "step": 55 }, { "epoch": 0.009787928221859706, "grad_norm": 5.254812717437744, "learning_rate": 4.812398042414356e-07, "loss": 0.9871, "num_input_tokens_seen": 128416, "step": 60 }, { "epoch": 0.010603588907014683, "grad_norm": 4.80765962600708, "learning_rate": 5.220228384991843e-07, "loss": 0.4167, "num_input_tokens_seen": 138432, "step": 65 }, { "epoch": 0.011419249592169658, "grad_norm": 18.233137130737305, "learning_rate": 5.628058727569332e-07, "loss": 0.6041, "num_input_tokens_seen": 149504, "step": 70 }, { "epoch": 0.012234910277324634, "grad_norm": 9.67054557800293, "learning_rate": 6.03588907014682e-07, "loss": 0.6387, "num_input_tokens_seen": 160192, "step": 75 }, { "epoch": 0.013050570962479609, "grad_norm": 5.773282051086426, "learning_rate": 6.443719412724307e-07, "loss": 0.7717, "num_input_tokens_seen": 170432, "step": 80 }, { "epoch": 0.013866231647634585, "grad_norm": 5.768527030944824, "learning_rate": 6.851549755301795e-07, "loss": 0.6047, "num_input_tokens_seen": 180672, "step": 85 }, { "epoch": 0.01468189233278956, "grad_norm": 8.126620292663574, "learning_rate": 7.259380097879283e-07, "loss": 0.7343, "num_input_tokens_seen": 192352, "step": 90 }, { "epoch": 0.015497553017944535, "grad_norm": 23.64755630493164, "learning_rate": 7.66721044045677e-07, "loss": 0.7276, "num_input_tokens_seen": 203040, "step": 95 }, { "epoch": 0.01631321370309951, "grad_norm": 13.55368709564209, "learning_rate": 8.075040783034258e-07, "loss": 0.9729, "num_input_tokens_seen": 214272, "step": 100 }, { "epoch": 0.017128874388254486, "grad_norm": 10.714468955993652, "learning_rate": 8.482871125611746e-07, "loss": 0.6276, "num_input_tokens_seen": 225664, "step": 105 }, { "epoch": 0.01794453507340946, "grad_norm": 7.814089775085449, "learning_rate": 8.890701468189233e-07, "loss": 1.608, "num_input_tokens_seen": 237216, "step": 110 }, { "epoch": 0.018760195758564437, "grad_norm": 18.210569381713867, "learning_rate": 9.298531810766722e-07, "loss": 0.6829, "num_input_tokens_seen": 246560, "step": 115 }, { "epoch": 0.01957585644371941, "grad_norm": 22.19220733642578, "learning_rate": 9.70636215334421e-07, "loss": 1.3417, "num_input_tokens_seen": 257184, "step": 120 }, { "epoch": 0.020391517128874388, "grad_norm": 16.29233169555664, "learning_rate": 1.0114192495921699e-06, "loss": 0.6808, "num_input_tokens_seen": 268384, "step": 125 }, { "epoch": 0.021207177814029365, "grad_norm": 6.1285319328308105, "learning_rate": 1.0522022838499183e-06, "loss": 0.5364, "num_input_tokens_seen": 279680, "step": 130 }, { "epoch": 0.02202283849918434, "grad_norm": 14.859761238098145, "learning_rate": 1.0929853181076673e-06, "loss": 1.211, "num_input_tokens_seen": 290688, "step": 135 }, { "epoch": 0.022838499184339316, "grad_norm": 17.23931312561035, "learning_rate": 1.133768352365416e-06, "loss": 0.8324, "num_input_tokens_seen": 301408, "step": 140 }, { "epoch": 0.02365415986949429, "grad_norm": 5.048580646514893, "learning_rate": 1.1745513866231649e-06, "loss": 1.0377, "num_input_tokens_seen": 311968, "step": 145 }, { "epoch": 0.024469820554649267, "grad_norm": 13.512179374694824, "learning_rate": 1.2153344208809136e-06, "loss": 1.3006, "num_input_tokens_seen": 322080, "step": 150 }, { "epoch": 0.02528548123980424, "grad_norm": 12.56611442565918, "learning_rate": 1.2561174551386625e-06, "loss": 1.224, "num_input_tokens_seen": 333824, "step": 155 }, { "epoch": 0.026101141924959218, "grad_norm": 6.651486873626709, "learning_rate": 1.296900489396411e-06, "loss": 1.5912, "num_input_tokens_seen": 344896, "step": 160 }, { "epoch": 0.026916802610114192, "grad_norm": 5.611935138702393, "learning_rate": 1.33768352365416e-06, "loss": 0.8486, "num_input_tokens_seen": 356192, "step": 165 }, { "epoch": 0.02773246329526917, "grad_norm": 6.0018630027771, "learning_rate": 1.3784665579119086e-06, "loss": 0.5632, "num_input_tokens_seen": 367136, "step": 170 }, { "epoch": 0.028548123980424143, "grad_norm": 3.167198419570923, "learning_rate": 1.4192495921696575e-06, "loss": 0.6989, "num_input_tokens_seen": 376832, "step": 175 }, { "epoch": 0.02936378466557912, "grad_norm": 4.618821144104004, "learning_rate": 1.4600326264274062e-06, "loss": 0.4143, "num_input_tokens_seen": 387552, "step": 180 }, { "epoch": 0.030179445350734094, "grad_norm": 23.31751251220703, "learning_rate": 1.5008156606851552e-06, "loss": 0.9319, "num_input_tokens_seen": 398528, "step": 185 }, { "epoch": 0.03099510603588907, "grad_norm": 17.619123458862305, "learning_rate": 1.5415986949429036e-06, "loss": 0.8091, "num_input_tokens_seen": 408896, "step": 190 }, { "epoch": 0.03181076672104405, "grad_norm": 3.5539722442626953, "learning_rate": 1.5823817292006523e-06, "loss": 0.6515, "num_input_tokens_seen": 420416, "step": 195 }, { "epoch": 0.03262642740619902, "grad_norm": 18.352197647094727, "learning_rate": 1.6231647634584013e-06, "loss": 0.4857, "num_input_tokens_seen": 430912, "step": 200 }, { "epoch": 0.033442088091353996, "grad_norm": 26.030839920043945, "learning_rate": 1.6639477977161502e-06, "loss": 0.8654, "num_input_tokens_seen": 442176, "step": 205 }, { "epoch": 0.03425774877650897, "grad_norm": 11.81512451171875, "learning_rate": 1.704730831973899e-06, "loss": 0.5263, "num_input_tokens_seen": 454496, "step": 210 }, { "epoch": 0.03507340946166395, "grad_norm": 12.172980308532715, "learning_rate": 1.7455138662316478e-06, "loss": 0.3995, "num_input_tokens_seen": 464896, "step": 215 }, { "epoch": 0.03588907014681892, "grad_norm": 9.735648155212402, "learning_rate": 1.7862969004893963e-06, "loss": 1.0462, "num_input_tokens_seen": 477024, "step": 220 }, { "epoch": 0.0367047308319739, "grad_norm": 16.229707717895508, "learning_rate": 1.8270799347471452e-06, "loss": 0.5669, "num_input_tokens_seen": 487776, "step": 225 }, { "epoch": 0.037520391517128875, "grad_norm": 1.8517504930496216, "learning_rate": 1.8678629690048941e-06, "loss": 0.6044, "num_input_tokens_seen": 498528, "step": 230 }, { "epoch": 0.03833605220228385, "grad_norm": 5.680665493011475, "learning_rate": 1.908646003262643e-06, "loss": 0.2991, "num_input_tokens_seen": 509184, "step": 235 }, { "epoch": 0.03915171288743882, "grad_norm": 4.058143615722656, "learning_rate": 1.9494290375203913e-06, "loss": 0.5362, "num_input_tokens_seen": 520224, "step": 240 }, { "epoch": 0.0399673735725938, "grad_norm": 19.857254028320312, "learning_rate": 1.9902120717781402e-06, "loss": 1.5284, "num_input_tokens_seen": 530464, "step": 245 }, { "epoch": 0.040783034257748776, "grad_norm": 15.53018569946289, "learning_rate": 2.030995106035889e-06, "loss": 1.1205, "num_input_tokens_seen": 541952, "step": 250 }, { "epoch": 0.041598694942903754, "grad_norm": 3.2097983360290527, "learning_rate": 2.071778140293638e-06, "loss": 0.3677, "num_input_tokens_seen": 551424, "step": 255 }, { "epoch": 0.04241435562805873, "grad_norm": 10.602266311645508, "learning_rate": 2.1125611745513866e-06, "loss": 1.2435, "num_input_tokens_seen": 561440, "step": 260 }, { "epoch": 0.0432300163132137, "grad_norm": 15.465632438659668, "learning_rate": 2.1533442088091355e-06, "loss": 0.9176, "num_input_tokens_seen": 573440, "step": 265 }, { "epoch": 0.04404567699836868, "grad_norm": 6.000158309936523, "learning_rate": 2.1941272430668844e-06, "loss": 0.3343, "num_input_tokens_seen": 584032, "step": 270 }, { "epoch": 0.044861337683523655, "grad_norm": 22.78145980834961, "learning_rate": 2.2349102773246333e-06, "loss": 0.6821, "num_input_tokens_seen": 595328, "step": 275 }, { "epoch": 0.04567699836867863, "grad_norm": 4.328393936157227, "learning_rate": 2.275693311582382e-06, "loss": 1.0264, "num_input_tokens_seen": 605760, "step": 280 }, { "epoch": 0.0464926590538336, "grad_norm": 8.35729694366455, "learning_rate": 2.3164763458401307e-06, "loss": 0.5896, "num_input_tokens_seen": 616384, "step": 285 }, { "epoch": 0.04730831973898858, "grad_norm": 7.698673248291016, "learning_rate": 2.357259380097879e-06, "loss": 0.6132, "num_input_tokens_seen": 626880, "step": 290 }, { "epoch": 0.04812398042414356, "grad_norm": 1.6100306510925293, "learning_rate": 2.398042414355628e-06, "loss": 0.7909, "num_input_tokens_seen": 637536, "step": 295 }, { "epoch": 0.048939641109298535, "grad_norm": 18.588539123535156, "learning_rate": 2.4388254486133766e-06, "loss": 0.6532, "num_input_tokens_seen": 648000, "step": 300 }, { "epoch": 0.049755301794453505, "grad_norm": 8.66322135925293, "learning_rate": 2.4796084828711255e-06, "loss": 1.227, "num_input_tokens_seen": 659648, "step": 305 }, { "epoch": 0.05057096247960848, "grad_norm": 9.428548812866211, "learning_rate": 2.5203915171288745e-06, "loss": 0.5282, "num_input_tokens_seen": 669920, "step": 310 }, { "epoch": 0.05138662316476346, "grad_norm": 4.988115310668945, "learning_rate": 2.5611745513866234e-06, "loss": 0.7282, "num_input_tokens_seen": 681216, "step": 315 }, { "epoch": 0.052202283849918436, "grad_norm": 8.257129669189453, "learning_rate": 2.6019575856443723e-06, "loss": 1.0687, "num_input_tokens_seen": 691872, "step": 320 }, { "epoch": 0.05301794453507341, "grad_norm": 15.32027816772461, "learning_rate": 2.6427406199021208e-06, "loss": 0.3635, "num_input_tokens_seen": 702208, "step": 325 }, { "epoch": 0.053833605220228384, "grad_norm": 13.268648147583008, "learning_rate": 2.6835236541598697e-06, "loss": 0.6009, "num_input_tokens_seen": 712288, "step": 330 }, { "epoch": 0.05464926590538336, "grad_norm": 11.272858619689941, "learning_rate": 2.7243066884176186e-06, "loss": 1.0324, "num_input_tokens_seen": 722976, "step": 335 }, { "epoch": 0.05546492659053834, "grad_norm": 2.6093666553497314, "learning_rate": 2.7650897226753675e-06, "loss": 0.5323, "num_input_tokens_seen": 733760, "step": 340 }, { "epoch": 0.05628058727569331, "grad_norm": 2.8714146614074707, "learning_rate": 2.805872756933116e-06, "loss": 0.7561, "num_input_tokens_seen": 743520, "step": 345 }, { "epoch": 0.057096247960848286, "grad_norm": 3.42722225189209, "learning_rate": 2.8466557911908645e-06, "loss": 0.0811, "num_input_tokens_seen": 753984, "step": 350 }, { "epoch": 0.05791190864600326, "grad_norm": 2.2392630577087402, "learning_rate": 2.8874388254486134e-06, "loss": 0.3957, "num_input_tokens_seen": 765376, "step": 355 }, { "epoch": 0.05872756933115824, "grad_norm": 4.543656349182129, "learning_rate": 2.9282218597063623e-06, "loss": 0.7068, "num_input_tokens_seen": 775904, "step": 360 }, { "epoch": 0.05954323001631321, "grad_norm": 1.3153177499771118, "learning_rate": 2.969004893964111e-06, "loss": 0.7697, "num_input_tokens_seen": 788000, "step": 365 }, { "epoch": 0.06035889070146819, "grad_norm": 13.002714157104492, "learning_rate": 3.0097879282218597e-06, "loss": 0.9799, "num_input_tokens_seen": 798816, "step": 370 }, { "epoch": 0.061174551386623165, "grad_norm": 1.001260757446289, "learning_rate": 3.0505709624796087e-06, "loss": 0.365, "num_input_tokens_seen": 809856, "step": 375 }, { "epoch": 0.06199021207177814, "grad_norm": 1.4286184310913086, "learning_rate": 3.0913539967373576e-06, "loss": 0.523, "num_input_tokens_seen": 820448, "step": 380 }, { "epoch": 0.06280587275693311, "grad_norm": 18.706470489501953, "learning_rate": 3.132137030995106e-06, "loss": 1.088, "num_input_tokens_seen": 830624, "step": 385 }, { "epoch": 0.0636215334420881, "grad_norm": 1.0630874633789062, "learning_rate": 3.1729200652528554e-06, "loss": 0.5391, "num_input_tokens_seen": 840224, "step": 390 }, { "epoch": 0.06443719412724307, "grad_norm": 17.3529109954834, "learning_rate": 3.213703099510604e-06, "loss": 0.5071, "num_input_tokens_seen": 850176, "step": 395 }, { "epoch": 0.06525285481239804, "grad_norm": 3.985969066619873, "learning_rate": 3.2544861337683524e-06, "loss": 0.3553, "num_input_tokens_seen": 861056, "step": 400 }, { "epoch": 0.06606851549755302, "grad_norm": 8.156689643859863, "learning_rate": 3.2952691680261013e-06, "loss": 0.8092, "num_input_tokens_seen": 871360, "step": 405 }, { "epoch": 0.06688417618270799, "grad_norm": 31.989702224731445, "learning_rate": 3.33605220228385e-06, "loss": 1.3592, "num_input_tokens_seen": 882656, "step": 410 }, { "epoch": 0.06769983686786298, "grad_norm": 11.47252368927002, "learning_rate": 3.3768352365415987e-06, "loss": 0.5707, "num_input_tokens_seen": 893952, "step": 415 }, { "epoch": 0.06851549755301795, "grad_norm": 16.895809173583984, "learning_rate": 3.4176182707993476e-06, "loss": 0.3123, "num_input_tokens_seen": 905472, "step": 420 }, { "epoch": 0.06933115823817292, "grad_norm": 0.7314866781234741, "learning_rate": 3.458401305057096e-06, "loss": 0.5118, "num_input_tokens_seen": 916000, "step": 425 }, { "epoch": 0.0701468189233279, "grad_norm": 7.430657863616943, "learning_rate": 3.4991843393148455e-06, "loss": 0.594, "num_input_tokens_seen": 926944, "step": 430 }, { "epoch": 0.07096247960848287, "grad_norm": 2.6516807079315186, "learning_rate": 3.539967373572594e-06, "loss": 0.6643, "num_input_tokens_seen": 938496, "step": 435 }, { "epoch": 0.07177814029363784, "grad_norm": 0.5453417301177979, "learning_rate": 3.5807504078303425e-06, "loss": 0.7201, "num_input_tokens_seen": 949472, "step": 440 }, { "epoch": 0.07259380097879282, "grad_norm": 17.167049407958984, "learning_rate": 3.621533442088092e-06, "loss": 0.8084, "num_input_tokens_seen": 959712, "step": 445 }, { "epoch": 0.0734094616639478, "grad_norm": 19.318767547607422, "learning_rate": 3.6623164763458403e-06, "loss": 0.963, "num_input_tokens_seen": 969888, "step": 450 }, { "epoch": 0.07422512234910278, "grad_norm": 1.3418043851852417, "learning_rate": 3.7030995106035896e-06, "loss": 0.7009, "num_input_tokens_seen": 981024, "step": 455 }, { "epoch": 0.07504078303425775, "grad_norm": 13.274270057678223, "learning_rate": 3.743882544861338e-06, "loss": 0.3469, "num_input_tokens_seen": 991680, "step": 460 }, { "epoch": 0.07585644371941272, "grad_norm": 1.2422374486923218, "learning_rate": 3.7846655791190866e-06, "loss": 0.7731, "num_input_tokens_seen": 1003008, "step": 465 }, { "epoch": 0.0766721044045677, "grad_norm": 3.4855055809020996, "learning_rate": 3.8254486133768355e-06, "loss": 0.2859, "num_input_tokens_seen": 1014336, "step": 470 }, { "epoch": 0.07748776508972267, "grad_norm": 12.034229278564453, "learning_rate": 3.866231647634584e-06, "loss": 0.7338, "num_input_tokens_seen": 1024960, "step": 475 }, { "epoch": 0.07830342577487764, "grad_norm": 0.8161830306053162, "learning_rate": 3.9070146818923325e-06, "loss": 0.2315, "num_input_tokens_seen": 1034880, "step": 480 }, { "epoch": 0.07911908646003263, "grad_norm": 7.478862285614014, "learning_rate": 3.947797716150082e-06, "loss": 0.8651, "num_input_tokens_seen": 1046112, "step": 485 }, { "epoch": 0.0799347471451876, "grad_norm": 22.26531219482422, "learning_rate": 3.98858075040783e-06, "loss": 0.8098, "num_input_tokens_seen": 1056864, "step": 490 }, { "epoch": 0.08075040783034258, "grad_norm": 0.40392860770225525, "learning_rate": 4.02936378466558e-06, "loss": 0.5404, "num_input_tokens_seen": 1067072, "step": 495 }, { "epoch": 0.08156606851549755, "grad_norm": 12.599085807800293, "learning_rate": 4.070146818923328e-06, "loss": 0.498, "num_input_tokens_seen": 1079136, "step": 500 }, { "epoch": 0.08238172920065252, "grad_norm": 0.784478485584259, "learning_rate": 4.110929853181077e-06, "loss": 0.9097, "num_input_tokens_seen": 1089408, "step": 505 }, { "epoch": 0.08319738988580751, "grad_norm": 10.010594367980957, "learning_rate": 4.151712887438826e-06, "loss": 0.2794, "num_input_tokens_seen": 1099488, "step": 510 }, { "epoch": 0.08401305057096248, "grad_norm": 4.048063278198242, "learning_rate": 4.1924959216965745e-06, "loss": 0.675, "num_input_tokens_seen": 1110464, "step": 515 }, { "epoch": 0.08482871125611746, "grad_norm": 19.317440032958984, "learning_rate": 4.233278955954323e-06, "loss": 0.3273, "num_input_tokens_seen": 1121216, "step": 520 }, { "epoch": 0.08564437194127243, "grad_norm": 18.039342880249023, "learning_rate": 4.274061990212072e-06, "loss": 0.493, "num_input_tokens_seen": 1132192, "step": 525 }, { "epoch": 0.0864600326264274, "grad_norm": 11.444881439208984, "learning_rate": 4.314845024469821e-06, "loss": 0.6106, "num_input_tokens_seen": 1142944, "step": 530 }, { "epoch": 0.08727569331158239, "grad_norm": 9.746691703796387, "learning_rate": 4.35562805872757e-06, "loss": 0.6221, "num_input_tokens_seen": 1154080, "step": 535 }, { "epoch": 0.08809135399673736, "grad_norm": 13.027948379516602, "learning_rate": 4.396411092985319e-06, "loss": 0.488, "num_input_tokens_seen": 1165120, "step": 540 }, { "epoch": 0.08890701468189233, "grad_norm": 3.0586557388305664, "learning_rate": 4.437194127243067e-06, "loss": 0.2867, "num_input_tokens_seen": 1176352, "step": 545 }, { "epoch": 0.08972267536704731, "grad_norm": 8.584217071533203, "learning_rate": 4.477977161500816e-06, "loss": 0.2178, "num_input_tokens_seen": 1188288, "step": 550 }, { "epoch": 0.09053833605220228, "grad_norm": 0.5211539268493652, "learning_rate": 4.518760195758565e-06, "loss": 0.4756, "num_input_tokens_seen": 1200800, "step": 555 }, { "epoch": 0.09135399673735727, "grad_norm": 11.939558029174805, "learning_rate": 4.5595432300163135e-06, "loss": 0.5071, "num_input_tokens_seen": 1211360, "step": 560 }, { "epoch": 0.09216965742251224, "grad_norm": 5.799455165863037, "learning_rate": 4.600326264274062e-06, "loss": 0.3327, "num_input_tokens_seen": 1221184, "step": 565 }, { "epoch": 0.0929853181076672, "grad_norm": 4.998054504394531, "learning_rate": 4.6411092985318105e-06, "loss": 0.651, "num_input_tokens_seen": 1231808, "step": 570 }, { "epoch": 0.09380097879282219, "grad_norm": 12.160737037658691, "learning_rate": 4.68189233278956e-06, "loss": 0.3156, "num_input_tokens_seen": 1242560, "step": 575 }, { "epoch": 0.09461663947797716, "grad_norm": 3.426234245300293, "learning_rate": 4.722675367047308e-06, "loss": 0.183, "num_input_tokens_seen": 1253440, "step": 580 }, { "epoch": 0.09543230016313213, "grad_norm": 17.633731842041016, "learning_rate": 4.763458401305057e-06, "loss": 0.8472, "num_input_tokens_seen": 1263616, "step": 585 }, { "epoch": 0.09624796084828711, "grad_norm": 14.605329513549805, "learning_rate": 4.804241435562806e-06, "loss": 0.244, "num_input_tokens_seen": 1275488, "step": 590 }, { "epoch": 0.09706362153344208, "grad_norm": 0.626120388507843, "learning_rate": 4.845024469820555e-06, "loss": 0.2679, "num_input_tokens_seen": 1287200, "step": 595 }, { "epoch": 0.09787928221859707, "grad_norm": 7.383834362030029, "learning_rate": 4.885807504078304e-06, "loss": 0.7493, "num_input_tokens_seen": 1298336, "step": 600 }, { "epoch": 0.09869494290375204, "grad_norm": 4.6556620597839355, "learning_rate": 4.9265905383360524e-06, "loss": 0.2895, "num_input_tokens_seen": 1309056, "step": 605 }, { "epoch": 0.09951060358890701, "grad_norm": 1.5424631834030151, "learning_rate": 4.967373572593801e-06, "loss": 0.333, "num_input_tokens_seen": 1320864, "step": 610 }, { "epoch": 0.100326264274062, "grad_norm": 10.238938331604004, "learning_rate": 5.00815660685155e-06, "loss": 0.7882, "num_input_tokens_seen": 1332448, "step": 615 }, { "epoch": 0.10114192495921696, "grad_norm": 9.649983406066895, "learning_rate": 5.048939641109299e-06, "loss": 0.5933, "num_input_tokens_seen": 1342368, "step": 620 }, { "epoch": 0.10195758564437195, "grad_norm": 14.703304290771484, "learning_rate": 5.089722675367047e-06, "loss": 0.3742, "num_input_tokens_seen": 1353024, "step": 625 }, { "epoch": 0.10277324632952692, "grad_norm": 12.713678359985352, "learning_rate": 5.130505709624797e-06, "loss": 0.2437, "num_input_tokens_seen": 1364000, "step": 630 }, { "epoch": 0.10358890701468189, "grad_norm": 6.326504230499268, "learning_rate": 5.171288743882545e-06, "loss": 0.9262, "num_input_tokens_seen": 1374784, "step": 635 }, { "epoch": 0.10440456769983687, "grad_norm": 0.16197632253170013, "learning_rate": 5.2120717781402944e-06, "loss": 0.4605, "num_input_tokens_seen": 1384192, "step": 640 }, { "epoch": 0.10522022838499184, "grad_norm": 0.30914539098739624, "learning_rate": 5.252854812398043e-06, "loss": 0.1618, "num_input_tokens_seen": 1395392, "step": 645 }, { "epoch": 0.10603588907014681, "grad_norm": 6.694571018218994, "learning_rate": 5.293637846655791e-06, "loss": 0.29, "num_input_tokens_seen": 1406208, "step": 650 }, { "epoch": 0.1068515497553018, "grad_norm": 0.6397295594215393, "learning_rate": 5.334420880913541e-06, "loss": 0.2999, "num_input_tokens_seen": 1417024, "step": 655 }, { "epoch": 0.10766721044045677, "grad_norm": 0.3513702154159546, "learning_rate": 5.375203915171289e-06, "loss": 0.5852, "num_input_tokens_seen": 1427808, "step": 660 }, { "epoch": 0.10848287112561175, "grad_norm": 16.48516273498535, "learning_rate": 5.415986949429038e-06, "loss": 0.1626, "num_input_tokens_seen": 1437344, "step": 665 }, { "epoch": 0.10929853181076672, "grad_norm": 0.9608420729637146, "learning_rate": 5.456769983686786e-06, "loss": 0.3224, "num_input_tokens_seen": 1448352, "step": 670 }, { "epoch": 0.11011419249592169, "grad_norm": 2.4646878242492676, "learning_rate": 5.4975530179445356e-06, "loss": 0.1999, "num_input_tokens_seen": 1459520, "step": 675 }, { "epoch": 0.11092985318107668, "grad_norm": 1.0080846548080444, "learning_rate": 5.538336052202284e-06, "loss": 0.3119, "num_input_tokens_seen": 1470432, "step": 680 }, { "epoch": 0.11174551386623165, "grad_norm": 7.099843978881836, "learning_rate": 5.5791190864600326e-06, "loss": 0.3477, "num_input_tokens_seen": 1480544, "step": 685 }, { "epoch": 0.11256117455138662, "grad_norm": 9.918968200683594, "learning_rate": 5.619902120717781e-06, "loss": 0.3175, "num_input_tokens_seen": 1490752, "step": 690 }, { "epoch": 0.1133768352365416, "grad_norm": 13.043498039245605, "learning_rate": 5.66068515497553e-06, "loss": 0.2591, "num_input_tokens_seen": 1502240, "step": 695 }, { "epoch": 0.11419249592169657, "grad_norm": 9.64283275604248, "learning_rate": 5.701468189233279e-06, "loss": 0.4058, "num_input_tokens_seen": 1512256, "step": 700 }, { "epoch": 0.11500815660685156, "grad_norm": 0.20353975892066956, "learning_rate": 5.742251223491028e-06, "loss": 0.4454, "num_input_tokens_seen": 1523168, "step": 705 }, { "epoch": 0.11582381729200653, "grad_norm": 5.264739990234375, "learning_rate": 5.783034257748777e-06, "loss": 0.1964, "num_input_tokens_seen": 1534976, "step": 710 }, { "epoch": 0.1166394779771615, "grad_norm": 4.052374839782715, "learning_rate": 5.823817292006525e-06, "loss": 0.4494, "num_input_tokens_seen": 1545856, "step": 715 }, { "epoch": 0.11745513866231648, "grad_norm": 2.4262053966522217, "learning_rate": 5.8646003262642745e-06, "loss": 0.0953, "num_input_tokens_seen": 1555680, "step": 720 }, { "epoch": 0.11827079934747145, "grad_norm": 0.06375903636217117, "learning_rate": 5.905383360522023e-06, "loss": 0.0273, "num_input_tokens_seen": 1567328, "step": 725 }, { "epoch": 0.11908646003262642, "grad_norm": 11.60741901397705, "learning_rate": 5.9461663947797715e-06, "loss": 0.3622, "num_input_tokens_seen": 1577184, "step": 730 }, { "epoch": 0.1199021207177814, "grad_norm": 14.03055477142334, "learning_rate": 5.986949429037521e-06, "loss": 0.5287, "num_input_tokens_seen": 1588544, "step": 735 }, { "epoch": 0.12071778140293637, "grad_norm": 7.811862945556641, "learning_rate": 6.027732463295269e-06, "loss": 0.3598, "num_input_tokens_seen": 1599648, "step": 740 }, { "epoch": 0.12153344208809136, "grad_norm": 4.788828372955322, "learning_rate": 6.068515497553019e-06, "loss": 0.5577, "num_input_tokens_seen": 1612096, "step": 745 }, { "epoch": 0.12234910277324633, "grad_norm": 1.1032073497772217, "learning_rate": 6.109298531810767e-06, "loss": 0.0652, "num_input_tokens_seen": 1623584, "step": 750 }, { "epoch": 0.1231647634584013, "grad_norm": 11.049327850341797, "learning_rate": 6.150081566068516e-06, "loss": 0.2363, "num_input_tokens_seen": 1634688, "step": 755 }, { "epoch": 0.12398042414355628, "grad_norm": 10.284933090209961, "learning_rate": 6.190864600326265e-06, "loss": 0.517, "num_input_tokens_seen": 1645600, "step": 760 }, { "epoch": 0.12479608482871125, "grad_norm": 1.0161199569702148, "learning_rate": 6.2316476345840135e-06, "loss": 0.1868, "num_input_tokens_seen": 1655968, "step": 765 }, { "epoch": 0.12561174551386622, "grad_norm": 2.1625585556030273, "learning_rate": 6.272430668841763e-06, "loss": 0.3482, "num_input_tokens_seen": 1666848, "step": 770 }, { "epoch": 0.1264274061990212, "grad_norm": 13.45174789428711, "learning_rate": 6.3132137030995105e-06, "loss": 0.3038, "num_input_tokens_seen": 1676512, "step": 775 }, { "epoch": 0.1272430668841762, "grad_norm": 0.13592009246349335, "learning_rate": 6.35399673735726e-06, "loss": 0.614, "num_input_tokens_seen": 1687488, "step": 780 }, { "epoch": 0.12805872756933115, "grad_norm": 0.35469624400138855, "learning_rate": 6.394779771615008e-06, "loss": 0.5397, "num_input_tokens_seen": 1697312, "step": 785 }, { "epoch": 0.12887438825448613, "grad_norm": 10.634716033935547, "learning_rate": 6.435562805872757e-06, "loss": 0.5566, "num_input_tokens_seen": 1708096, "step": 790 }, { "epoch": 0.12969004893964112, "grad_norm": 0.12184014916419983, "learning_rate": 6.476345840130506e-06, "loss": 0.211, "num_input_tokens_seen": 1717824, "step": 795 }, { "epoch": 0.13050570962479607, "grad_norm": 5.945420265197754, "learning_rate": 6.517128874388255e-06, "loss": 0.7566, "num_input_tokens_seen": 1728608, "step": 800 }, { "epoch": 0.13132137030995106, "grad_norm": 4.466879367828369, "learning_rate": 6.557911908646004e-06, "loss": 0.1479, "num_input_tokens_seen": 1739456, "step": 805 }, { "epoch": 0.13213703099510604, "grad_norm": 7.0077948570251465, "learning_rate": 6.598694942903752e-06, "loss": 0.2857, "num_input_tokens_seen": 1750304, "step": 810 }, { "epoch": 0.132952691680261, "grad_norm": 6.266101837158203, "learning_rate": 6.639477977161501e-06, "loss": 0.0794, "num_input_tokens_seen": 1762496, "step": 815 }, { "epoch": 0.13376835236541598, "grad_norm": 7.936112880706787, "learning_rate": 6.68026101141925e-06, "loss": 0.2907, "num_input_tokens_seen": 1772640, "step": 820 }, { "epoch": 0.13458401305057097, "grad_norm": 0.10595320165157318, "learning_rate": 6.721044045676998e-06, "loss": 0.3284, "num_input_tokens_seen": 1782272, "step": 825 }, { "epoch": 0.13539967373572595, "grad_norm": 0.06858377903699875, "learning_rate": 6.761827079934747e-06, "loss": 0.2335, "num_input_tokens_seen": 1793120, "step": 830 }, { "epoch": 0.1362153344208809, "grad_norm": 10.44363784790039, "learning_rate": 6.802610114192497e-06, "loss": 0.4125, "num_input_tokens_seen": 1803264, "step": 835 }, { "epoch": 0.1370309951060359, "grad_norm": 3.860588312149048, "learning_rate": 6.843393148450244e-06, "loss": 0.137, "num_input_tokens_seen": 1813088, "step": 840 }, { "epoch": 0.13784665579119088, "grad_norm": 0.650309681892395, "learning_rate": 6.884176182707994e-06, "loss": 0.4879, "num_input_tokens_seen": 1824032, "step": 845 }, { "epoch": 0.13866231647634583, "grad_norm": 0.4944702982902527, "learning_rate": 6.924959216965743e-06, "loss": 0.3511, "num_input_tokens_seen": 1836512, "step": 850 }, { "epoch": 0.13947797716150082, "grad_norm": 0.20982003211975098, "learning_rate": 6.965742251223491e-06, "loss": 0.1203, "num_input_tokens_seen": 1848256, "step": 855 }, { "epoch": 0.1402936378466558, "grad_norm": 4.680217742919922, "learning_rate": 7.00652528548124e-06, "loss": 0.2072, "num_input_tokens_seen": 1859040, "step": 860 }, { "epoch": 0.14110929853181076, "grad_norm": 7.44214391708374, "learning_rate": 7.047308319738989e-06, "loss": 0.2866, "num_input_tokens_seen": 1869248, "step": 865 }, { "epoch": 0.14192495921696574, "grad_norm": 7.6567487716674805, "learning_rate": 7.088091353996739e-06, "loss": 0.5006, "num_input_tokens_seen": 1881184, "step": 870 }, { "epoch": 0.14274061990212072, "grad_norm": 0.6429446935653687, "learning_rate": 7.128874388254486e-06, "loss": 0.291, "num_input_tokens_seen": 1893760, "step": 875 }, { "epoch": 0.14355628058727568, "grad_norm": 4.556582927703857, "learning_rate": 7.169657422512236e-06, "loss": 0.1467, "num_input_tokens_seen": 1904800, "step": 880 }, { "epoch": 0.14437194127243066, "grad_norm": 0.15669970214366913, "learning_rate": 7.210440456769985e-06, "loss": 0.3282, "num_input_tokens_seen": 1915296, "step": 885 }, { "epoch": 0.14518760195758565, "grad_norm": 14.141194343566895, "learning_rate": 7.251223491027733e-06, "loss": 0.6234, "num_input_tokens_seen": 1925152, "step": 890 }, { "epoch": 0.14600326264274063, "grad_norm": 1.2011229991912842, "learning_rate": 7.292006525285482e-06, "loss": 0.0766, "num_input_tokens_seen": 1935200, "step": 895 }, { "epoch": 0.1468189233278956, "grad_norm": 0.25357726216316223, "learning_rate": 7.3327895595432304e-06, "loss": 0.4431, "num_input_tokens_seen": 1945632, "step": 900 }, { "epoch": 0.14763458401305057, "grad_norm": 11.212824821472168, "learning_rate": 7.373572593800979e-06, "loss": 0.2857, "num_input_tokens_seen": 1956992, "step": 905 }, { "epoch": 0.14845024469820556, "grad_norm": 0.09775493294000626, "learning_rate": 7.414355628058728e-06, "loss": 0.4232, "num_input_tokens_seen": 1968800, "step": 910 }, { "epoch": 0.14926590538336051, "grad_norm": 5.193148136138916, "learning_rate": 7.455138662316477e-06, "loss": 0.1417, "num_input_tokens_seen": 1979424, "step": 915 }, { "epoch": 0.1500815660685155, "grad_norm": 0.10857293009757996, "learning_rate": 7.495921696574225e-06, "loss": 0.4734, "num_input_tokens_seen": 1990976, "step": 920 }, { "epoch": 0.15089722675367048, "grad_norm": 9.459717750549316, "learning_rate": 7.536704730831974e-06, "loss": 0.1337, "num_input_tokens_seen": 2002432, "step": 925 }, { "epoch": 0.15171288743882544, "grad_norm": 0.15402376651763916, "learning_rate": 7.577487765089723e-06, "loss": 0.026, "num_input_tokens_seen": 2013632, "step": 930 }, { "epoch": 0.15252854812398042, "grad_norm": 12.633416175842285, "learning_rate": 7.6182707993474724e-06, "loss": 0.5885, "num_input_tokens_seen": 2024960, "step": 935 }, { "epoch": 0.1533442088091354, "grad_norm": 7.057833671569824, "learning_rate": 7.659053833605221e-06, "loss": 0.1533, "num_input_tokens_seen": 2035712, "step": 940 }, { "epoch": 0.15415986949429036, "grad_norm": 4.4467034339904785, "learning_rate": 7.69983686786297e-06, "loss": 0.1709, "num_input_tokens_seen": 2046656, "step": 945 }, { "epoch": 0.15497553017944535, "grad_norm": 5.337577819824219, "learning_rate": 7.740619902120718e-06, "loss": 0.2743, "num_input_tokens_seen": 2058112, "step": 950 }, { "epoch": 0.15579119086460033, "grad_norm": 10.638042449951172, "learning_rate": 7.781402936378467e-06, "loss": 0.1755, "num_input_tokens_seen": 2068768, "step": 955 }, { "epoch": 0.1566068515497553, "grad_norm": 5.158874034881592, "learning_rate": 7.822185970636217e-06, "loss": 0.1715, "num_input_tokens_seen": 2080928, "step": 960 }, { "epoch": 0.15742251223491027, "grad_norm": 0.08856307715177536, "learning_rate": 7.862969004893964e-06, "loss": 0.132, "num_input_tokens_seen": 2090784, "step": 965 }, { "epoch": 0.15823817292006526, "grad_norm": 2.6662380695343018, "learning_rate": 7.903752039151714e-06, "loss": 0.1575, "num_input_tokens_seen": 2100416, "step": 970 }, { "epoch": 0.15905383360522024, "grad_norm": 0.36138996481895447, "learning_rate": 7.944535073409461e-06, "loss": 0.0974, "num_input_tokens_seen": 2111424, "step": 975 }, { "epoch": 0.1598694942903752, "grad_norm": 4.972231864929199, "learning_rate": 7.98531810766721e-06, "loss": 0.178, "num_input_tokens_seen": 2122176, "step": 980 }, { "epoch": 0.16068515497553018, "grad_norm": 0.049096524715423584, "learning_rate": 8.026101141924958e-06, "loss": 0.1116, "num_input_tokens_seen": 2132928, "step": 985 }, { "epoch": 0.16150081566068517, "grad_norm": 6.180870532989502, "learning_rate": 8.066884176182708e-06, "loss": 0.1025, "num_input_tokens_seen": 2144480, "step": 990 }, { "epoch": 0.16231647634584012, "grad_norm": 9.751792907714844, "learning_rate": 8.107667210440457e-06, "loss": 0.172, "num_input_tokens_seen": 2154272, "step": 995 }, { "epoch": 0.1631321370309951, "grad_norm": 0.19507698714733124, "learning_rate": 8.148450244698205e-06, "loss": 0.008, "num_input_tokens_seen": 2163936, "step": 1000 }, { "epoch": 0.1639477977161501, "grad_norm": 0.43871402740478516, "learning_rate": 8.189233278955954e-06, "loss": 0.289, "num_input_tokens_seen": 2175392, "step": 1005 }, { "epoch": 0.16476345840130505, "grad_norm": 0.06107112392783165, "learning_rate": 8.230016313213703e-06, "loss": 0.2199, "num_input_tokens_seen": 2185120, "step": 1010 }, { "epoch": 0.16557911908646003, "grad_norm": 9.946898460388184, "learning_rate": 8.270799347471453e-06, "loss": 0.4255, "num_input_tokens_seen": 2196864, "step": 1015 }, { "epoch": 0.16639477977161501, "grad_norm": 1.8367321491241455, "learning_rate": 8.3115823817292e-06, "loss": 0.2934, "num_input_tokens_seen": 2207744, "step": 1020 }, { "epoch": 0.16721044045676997, "grad_norm": 8.737253189086914, "learning_rate": 8.35236541598695e-06, "loss": 0.3628, "num_input_tokens_seen": 2218528, "step": 1025 }, { "epoch": 0.16802610114192496, "grad_norm": 3.1235134601593018, "learning_rate": 8.393148450244699e-06, "loss": 0.0845, "num_input_tokens_seen": 2228320, "step": 1030 }, { "epoch": 0.16884176182707994, "grad_norm": 9.189127922058105, "learning_rate": 8.433931484502447e-06, "loss": 0.2927, "num_input_tokens_seen": 2238464, "step": 1035 }, { "epoch": 0.16965742251223492, "grad_norm": 0.46165475249290466, "learning_rate": 8.474714518760196e-06, "loss": 0.1265, "num_input_tokens_seen": 2249440, "step": 1040 }, { "epoch": 0.17047308319738988, "grad_norm": 0.327440083026886, "learning_rate": 8.515497553017945e-06, "loss": 0.2192, "num_input_tokens_seen": 2260576, "step": 1045 }, { "epoch": 0.17128874388254486, "grad_norm": 0.4591752886772156, "learning_rate": 8.556280587275693e-06, "loss": 0.1878, "num_input_tokens_seen": 2271904, "step": 1050 }, { "epoch": 0.17210440456769985, "grad_norm": 5.08633279800415, "learning_rate": 8.597063621533442e-06, "loss": 0.0593, "num_input_tokens_seen": 2281568, "step": 1055 }, { "epoch": 0.1729200652528548, "grad_norm": 2.9506537914276123, "learning_rate": 8.637846655791192e-06, "loss": 0.2285, "num_input_tokens_seen": 2292704, "step": 1060 }, { "epoch": 0.1737357259380098, "grad_norm": 0.40769487619400024, "learning_rate": 8.67862969004894e-06, "loss": 0.4531, "num_input_tokens_seen": 2303904, "step": 1065 }, { "epoch": 0.17455138662316477, "grad_norm": 10.724939346313477, "learning_rate": 8.719412724306688e-06, "loss": 0.1291, "num_input_tokens_seen": 2314336, "step": 1070 }, { "epoch": 0.17536704730831973, "grad_norm": 0.10260462760925293, "learning_rate": 8.760195758564438e-06, "loss": 0.0487, "num_input_tokens_seen": 2325024, "step": 1075 }, { "epoch": 0.1761827079934747, "grad_norm": 1.2693135738372803, "learning_rate": 8.800978792822187e-06, "loss": 0.3189, "num_input_tokens_seen": 2336224, "step": 1080 }, { "epoch": 0.1769983686786297, "grad_norm": 0.11829324811697006, "learning_rate": 8.841761827079935e-06, "loss": 0.0737, "num_input_tokens_seen": 2346880, "step": 1085 }, { "epoch": 0.17781402936378465, "grad_norm": 0.0629054456949234, "learning_rate": 8.882544861337684e-06, "loss": 0.1405, "num_input_tokens_seen": 2357632, "step": 1090 }, { "epoch": 0.17862969004893964, "grad_norm": 4.260016441345215, "learning_rate": 8.923327895595434e-06, "loss": 0.1017, "num_input_tokens_seen": 2370592, "step": 1095 }, { "epoch": 0.17944535073409462, "grad_norm": 7.259707450866699, "learning_rate": 8.964110929853181e-06, "loss": 0.4943, "num_input_tokens_seen": 2381024, "step": 1100 }, { "epoch": 0.1802610114192496, "grad_norm": 0.5672919154167175, "learning_rate": 9.00489396411093e-06, "loss": 0.0259, "num_input_tokens_seen": 2392160, "step": 1105 }, { "epoch": 0.18107667210440456, "grad_norm": 6.852181434631348, "learning_rate": 9.04567699836868e-06, "loss": 0.6329, "num_input_tokens_seen": 2403616, "step": 1110 }, { "epoch": 0.18189233278955955, "grad_norm": 5.7968573570251465, "learning_rate": 9.086460032626427e-06, "loss": 0.5964, "num_input_tokens_seen": 2416032, "step": 1115 }, { "epoch": 0.18270799347471453, "grad_norm": 0.06541386991739273, "learning_rate": 9.127243066884177e-06, "loss": 0.1879, "num_input_tokens_seen": 2426112, "step": 1120 }, { "epoch": 0.1835236541598695, "grad_norm": 6.544291973114014, "learning_rate": 9.168026101141926e-06, "loss": 0.2227, "num_input_tokens_seen": 2437344, "step": 1125 }, { "epoch": 0.18433931484502447, "grad_norm": 0.17347821593284607, "learning_rate": 9.208809135399674e-06, "loss": 0.2871, "num_input_tokens_seen": 2448672, "step": 1130 }, { "epoch": 0.18515497553017946, "grad_norm": 0.08097555488348007, "learning_rate": 9.249592169657423e-06, "loss": 0.1802, "num_input_tokens_seen": 2459552, "step": 1135 }, { "epoch": 0.1859706362153344, "grad_norm": 5.536982536315918, "learning_rate": 9.290375203915172e-06, "loss": 0.0671, "num_input_tokens_seen": 2469184, "step": 1140 }, { "epoch": 0.1867862969004894, "grad_norm": 2.1592328548431396, "learning_rate": 9.33115823817292e-06, "loss": 0.0956, "num_input_tokens_seen": 2481120, "step": 1145 }, { "epoch": 0.18760195758564438, "grad_norm": 7.730838775634766, "learning_rate": 9.37194127243067e-06, "loss": 0.1967, "num_input_tokens_seen": 2490976, "step": 1150 }, { "epoch": 0.18841761827079934, "grad_norm": 8.524580001831055, "learning_rate": 9.412724306688419e-06, "loss": 0.2055, "num_input_tokens_seen": 2501024, "step": 1155 }, { "epoch": 0.18923327895595432, "grad_norm": 8.385376930236816, "learning_rate": 9.453507340946168e-06, "loss": 0.4198, "num_input_tokens_seen": 2511264, "step": 1160 }, { "epoch": 0.1900489396411093, "grad_norm": 0.6890153288841248, "learning_rate": 9.494290375203916e-06, "loss": 0.1255, "num_input_tokens_seen": 2521920, "step": 1165 }, { "epoch": 0.19086460032626426, "grad_norm": 4.921489715576172, "learning_rate": 9.535073409461665e-06, "loss": 0.4664, "num_input_tokens_seen": 2533056, "step": 1170 }, { "epoch": 0.19168026101141925, "grad_norm": 8.706521034240723, "learning_rate": 9.575856443719414e-06, "loss": 0.5346, "num_input_tokens_seen": 2544320, "step": 1175 }, { "epoch": 0.19249592169657423, "grad_norm": 3.1115901470184326, "learning_rate": 9.616639477977162e-06, "loss": 0.1906, "num_input_tokens_seen": 2555168, "step": 1180 }, { "epoch": 0.1933115823817292, "grad_norm": 0.234554722905159, "learning_rate": 9.657422512234911e-06, "loss": 0.0873, "num_input_tokens_seen": 2566304, "step": 1185 }, { "epoch": 0.19412724306688417, "grad_norm": 0.4108467698097229, "learning_rate": 9.69820554649266e-06, "loss": 0.3393, "num_input_tokens_seen": 2575936, "step": 1190 }, { "epoch": 0.19494290375203915, "grad_norm": 9.332695007324219, "learning_rate": 9.738988580750408e-06, "loss": 0.2337, "num_input_tokens_seen": 2586528, "step": 1195 }, { "epoch": 0.19575856443719414, "grad_norm": 3.612908124923706, "learning_rate": 9.779771615008158e-06, "loss": 0.4979, "num_input_tokens_seen": 2596064, "step": 1200 }, { "epoch": 0.1965742251223491, "grad_norm": 0.4483790099620819, "learning_rate": 9.820554649265905e-06, "loss": 0.4875, "num_input_tokens_seen": 2606464, "step": 1205 }, { "epoch": 0.19738988580750408, "grad_norm": 3.2798922061920166, "learning_rate": 9.861337683523655e-06, "loss": 0.2635, "num_input_tokens_seen": 2617568, "step": 1210 }, { "epoch": 0.19820554649265906, "grad_norm": 0.25610870122909546, "learning_rate": 9.902120717781402e-06, "loss": 0.0868, "num_input_tokens_seen": 2628928, "step": 1215 }, { "epoch": 0.19902120717781402, "grad_norm": 7.029544830322266, "learning_rate": 9.942903752039152e-06, "loss": 0.2243, "num_input_tokens_seen": 2641344, "step": 1220 }, { "epoch": 0.199836867862969, "grad_norm": 14.858476638793945, "learning_rate": 9.983686786296901e-06, "loss": 0.214, "num_input_tokens_seen": 2651584, "step": 1225 }, { "epoch": 0.200652528548124, "grad_norm": 6.908661365509033, "learning_rate": 1.0024469820554649e-05, "loss": 0.2425, "num_input_tokens_seen": 2663360, "step": 1230 }, { "epoch": 0.20146818923327894, "grad_norm": 4.568848133087158, "learning_rate": 1.0065252854812398e-05, "loss": 0.256, "num_input_tokens_seen": 2675168, "step": 1235 }, { "epoch": 0.20228384991843393, "grad_norm": 9.485286712646484, "learning_rate": 1.0106035889070147e-05, "loss": 0.5909, "num_input_tokens_seen": 2685952, "step": 1240 }, { "epoch": 0.2030995106035889, "grad_norm": 6.773355960845947, "learning_rate": 1.0146818923327895e-05, "loss": 0.1391, "num_input_tokens_seen": 2695488, "step": 1245 }, { "epoch": 0.2039151712887439, "grad_norm": 5.654582977294922, "learning_rate": 1.0187601957585644e-05, "loss": 0.2623, "num_input_tokens_seen": 2706880, "step": 1250 }, { "epoch": 0.20473083197389885, "grad_norm": 5.590231418609619, "learning_rate": 1.0228384991843394e-05, "loss": 0.5785, "num_input_tokens_seen": 2717184, "step": 1255 }, { "epoch": 0.20554649265905384, "grad_norm": 0.3414728343486786, "learning_rate": 1.0269168026101141e-05, "loss": 0.1314, "num_input_tokens_seen": 2729184, "step": 1260 }, { "epoch": 0.20636215334420882, "grad_norm": 4.966456890106201, "learning_rate": 1.030995106035889e-05, "loss": 0.2237, "num_input_tokens_seen": 2740768, "step": 1265 }, { "epoch": 0.20717781402936378, "grad_norm": 3.5390045642852783, "learning_rate": 1.035073409461664e-05, "loss": 0.3007, "num_input_tokens_seen": 2751328, "step": 1270 }, { "epoch": 0.20799347471451876, "grad_norm": 0.3213955760002136, "learning_rate": 1.0391517128874388e-05, "loss": 0.3432, "num_input_tokens_seen": 2762464, "step": 1275 }, { "epoch": 0.20880913539967375, "grad_norm": 1.0880823135375977, "learning_rate": 1.0432300163132137e-05, "loss": 0.0912, "num_input_tokens_seen": 2773824, "step": 1280 }, { "epoch": 0.2096247960848287, "grad_norm": 5.550802230834961, "learning_rate": 1.0473083197389886e-05, "loss": 0.3858, "num_input_tokens_seen": 2784544, "step": 1285 }, { "epoch": 0.21044045676998369, "grad_norm": 0.036135949194431305, "learning_rate": 1.0513866231647634e-05, "loss": 0.1073, "num_input_tokens_seen": 2796032, "step": 1290 }, { "epoch": 0.21125611745513867, "grad_norm": 0.06197676435112953, "learning_rate": 1.0554649265905383e-05, "loss": 0.2201, "num_input_tokens_seen": 2808096, "step": 1295 }, { "epoch": 0.21207177814029363, "grad_norm": 6.072765350341797, "learning_rate": 1.0595432300163133e-05, "loss": 0.1024, "num_input_tokens_seen": 2818080, "step": 1300 }, { "epoch": 0.2128874388254486, "grad_norm": 1.302298665046692, "learning_rate": 1.0636215334420882e-05, "loss": 0.2044, "num_input_tokens_seen": 2829248, "step": 1305 }, { "epoch": 0.2137030995106036, "grad_norm": 0.19654478132724762, "learning_rate": 1.067699836867863e-05, "loss": 0.1302, "num_input_tokens_seen": 2840288, "step": 1310 }, { "epoch": 0.21451876019575855, "grad_norm": 5.693123817443848, "learning_rate": 1.0717781402936379e-05, "loss": 0.3089, "num_input_tokens_seen": 2849440, "step": 1315 }, { "epoch": 0.21533442088091354, "grad_norm": 0.02976483479142189, "learning_rate": 1.0758564437194128e-05, "loss": 0.1977, "num_input_tokens_seen": 2861056, "step": 1320 }, { "epoch": 0.21615008156606852, "grad_norm": 2.9492530822753906, "learning_rate": 1.0799347471451876e-05, "loss": 0.1258, "num_input_tokens_seen": 2872832, "step": 1325 }, { "epoch": 0.2169657422512235, "grad_norm": 8.189480781555176, "learning_rate": 1.0840130505709625e-05, "loss": 0.5312, "num_input_tokens_seen": 2883968, "step": 1330 }, { "epoch": 0.21778140293637846, "grad_norm": 0.1490117758512497, "learning_rate": 1.0880913539967375e-05, "loss": 0.1234, "num_input_tokens_seen": 2894880, "step": 1335 }, { "epoch": 0.21859706362153344, "grad_norm": 6.5888671875, "learning_rate": 1.0921696574225122e-05, "loss": 0.4302, "num_input_tokens_seen": 2905216, "step": 1340 }, { "epoch": 0.21941272430668843, "grad_norm": 8.062369346618652, "learning_rate": 1.0962479608482872e-05, "loss": 0.1458, "num_input_tokens_seen": 2917152, "step": 1345 }, { "epoch": 0.22022838499184338, "grad_norm": 5.160257339477539, "learning_rate": 1.1003262642740621e-05, "loss": 0.1481, "num_input_tokens_seen": 2928448, "step": 1350 }, { "epoch": 0.22104404567699837, "grad_norm": 2.4276535511016846, "learning_rate": 1.1044045676998369e-05, "loss": 0.4326, "num_input_tokens_seen": 2938336, "step": 1355 }, { "epoch": 0.22185970636215335, "grad_norm": 7.487486362457275, "learning_rate": 1.1084828711256118e-05, "loss": 0.1281, "num_input_tokens_seen": 2950496, "step": 1360 }, { "epoch": 0.2226753670473083, "grad_norm": 11.373209953308105, "learning_rate": 1.1125611745513867e-05, "loss": 0.2648, "num_input_tokens_seen": 2961024, "step": 1365 }, { "epoch": 0.2234910277324633, "grad_norm": 7.982120037078857, "learning_rate": 1.1166394779771617e-05, "loss": 0.1207, "num_input_tokens_seen": 2971744, "step": 1370 }, { "epoch": 0.22430668841761828, "grad_norm": 5.5169758796691895, "learning_rate": 1.1207177814029364e-05, "loss": 0.1653, "num_input_tokens_seen": 2984192, "step": 1375 }, { "epoch": 0.22512234910277323, "grad_norm": 6.303249835968018, "learning_rate": 1.1247960848287114e-05, "loss": 0.2739, "num_input_tokens_seen": 2994816, "step": 1380 }, { "epoch": 0.22593800978792822, "grad_norm": 0.6037583947181702, "learning_rate": 1.1288743882544863e-05, "loss": 0.178, "num_input_tokens_seen": 3005568, "step": 1385 }, { "epoch": 0.2267536704730832, "grad_norm": 0.6912223696708679, "learning_rate": 1.132952691680261e-05, "loss": 0.3771, "num_input_tokens_seen": 3015680, "step": 1390 }, { "epoch": 0.2275693311582382, "grad_norm": 1.4500184059143066, "learning_rate": 1.137030995106036e-05, "loss": 0.1616, "num_input_tokens_seen": 3026752, "step": 1395 }, { "epoch": 0.22838499184339314, "grad_norm": 0.18793430924415588, "learning_rate": 1.141109298531811e-05, "loss": 0.1681, "num_input_tokens_seen": 3036704, "step": 1400 }, { "epoch": 0.22920065252854813, "grad_norm": 5.307665824890137, "learning_rate": 1.1451876019575857e-05, "loss": 0.3579, "num_input_tokens_seen": 3048960, "step": 1405 }, { "epoch": 0.2300163132137031, "grad_norm": 6.192534446716309, "learning_rate": 1.1492659053833606e-05, "loss": 0.4311, "num_input_tokens_seen": 3059456, "step": 1410 }, { "epoch": 0.23083197389885807, "grad_norm": 2.5860745906829834, "learning_rate": 1.1533442088091356e-05, "loss": 0.141, "num_input_tokens_seen": 3070112, "step": 1415 }, { "epoch": 0.23164763458401305, "grad_norm": 0.14588327705860138, "learning_rate": 1.1574225122349103e-05, "loss": 0.1593, "num_input_tokens_seen": 3080224, "step": 1420 }, { "epoch": 0.23246329526916804, "grad_norm": 4.942193984985352, "learning_rate": 1.1615008156606853e-05, "loss": 0.461, "num_input_tokens_seen": 3090304, "step": 1425 }, { "epoch": 0.233278955954323, "grad_norm": 7.982557773590088, "learning_rate": 1.1655791190864602e-05, "loss": 0.283, "num_input_tokens_seen": 3099232, "step": 1430 }, { "epoch": 0.23409461663947798, "grad_norm": 6.3187947273254395, "learning_rate": 1.169657422512235e-05, "loss": 0.4601, "num_input_tokens_seen": 3109696, "step": 1435 }, { "epoch": 0.23491027732463296, "grad_norm": 6.894016265869141, "learning_rate": 1.1737357259380099e-05, "loss": 0.1709, "num_input_tokens_seen": 3120160, "step": 1440 }, { "epoch": 0.23572593800978792, "grad_norm": 4.156856060028076, "learning_rate": 1.1778140293637847e-05, "loss": 0.1095, "num_input_tokens_seen": 3130784, "step": 1445 }, { "epoch": 0.2365415986949429, "grad_norm": 2.375866413116455, "learning_rate": 1.1818923327895596e-05, "loss": 0.3756, "num_input_tokens_seen": 3141536, "step": 1450 }, { "epoch": 0.23735725938009788, "grad_norm": 0.34968551993370056, "learning_rate": 1.1859706362153344e-05, "loss": 0.2573, "num_input_tokens_seen": 3152160, "step": 1455 }, { "epoch": 0.23817292006525284, "grad_norm": 1.3204271793365479, "learning_rate": 1.1900489396411093e-05, "loss": 0.116, "num_input_tokens_seen": 3162560, "step": 1460 }, { "epoch": 0.23898858075040783, "grad_norm": 1.7406924962997437, "learning_rate": 1.1941272430668842e-05, "loss": 0.1799, "num_input_tokens_seen": 3175360, "step": 1465 }, { "epoch": 0.2398042414355628, "grad_norm": 3.952451705932617, "learning_rate": 1.198205546492659e-05, "loss": 0.1108, "num_input_tokens_seen": 3187936, "step": 1470 }, { "epoch": 0.2406199021207178, "grad_norm": 8.951556205749512, "learning_rate": 1.202283849918434e-05, "loss": 0.508, "num_input_tokens_seen": 3199424, "step": 1475 }, { "epoch": 0.24143556280587275, "grad_norm": 0.3572978079319, "learning_rate": 1.2063621533442089e-05, "loss": 0.3565, "num_input_tokens_seen": 3210272, "step": 1480 }, { "epoch": 0.24225122349102773, "grad_norm": 4.634958744049072, "learning_rate": 1.2104404567699836e-05, "loss": 0.1031, "num_input_tokens_seen": 3219680, "step": 1485 }, { "epoch": 0.24306688417618272, "grad_norm": 0.9866248369216919, "learning_rate": 1.2145187601957586e-05, "loss": 0.149, "num_input_tokens_seen": 3229376, "step": 1490 }, { "epoch": 0.24388254486133767, "grad_norm": 1.492843747138977, "learning_rate": 1.2185970636215335e-05, "loss": 0.1961, "num_input_tokens_seen": 3240896, "step": 1495 }, { "epoch": 0.24469820554649266, "grad_norm": 4.87559700012207, "learning_rate": 1.2226753670473083e-05, "loss": 0.4337, "num_input_tokens_seen": 3251808, "step": 1500 }, { "epoch": 0.24551386623164764, "grad_norm": 1.7679897546768188, "learning_rate": 1.2267536704730832e-05, "loss": 0.1422, "num_input_tokens_seen": 3262688, "step": 1505 }, { "epoch": 0.2463295269168026, "grad_norm": 3.7211174964904785, "learning_rate": 1.2308319738988581e-05, "loss": 0.0588, "num_input_tokens_seen": 3273344, "step": 1510 }, { "epoch": 0.24714518760195758, "grad_norm": 5.718593120574951, "learning_rate": 1.234910277324633e-05, "loss": 0.2599, "num_input_tokens_seen": 3283520, "step": 1515 }, { "epoch": 0.24796084828711257, "grad_norm": 0.31124433875083923, "learning_rate": 1.2389885807504078e-05, "loss": 0.3094, "num_input_tokens_seen": 3294816, "step": 1520 }, { "epoch": 0.24877650897226752, "grad_norm": 3.2396998405456543, "learning_rate": 1.2430668841761828e-05, "loss": 0.3018, "num_input_tokens_seen": 3305152, "step": 1525 }, { "epoch": 0.2495921696574225, "grad_norm": 0.19118860363960266, "learning_rate": 1.2471451876019577e-05, "loss": 0.2159, "num_input_tokens_seen": 3315264, "step": 1530 }, { "epoch": 0.25040783034257746, "grad_norm": 3.2154130935668945, "learning_rate": 1.2512234910277326e-05, "loss": 0.304, "num_input_tokens_seen": 3326208, "step": 1535 }, { "epoch": 0.25122349102773245, "grad_norm": 5.489169597625732, "learning_rate": 1.2553017944535072e-05, "loss": 0.3482, "num_input_tokens_seen": 3337472, "step": 1540 }, { "epoch": 0.25203915171288743, "grad_norm": 1.1349680423736572, "learning_rate": 1.2593800978792822e-05, "loss": 0.1839, "num_input_tokens_seen": 3348576, "step": 1545 }, { "epoch": 0.2528548123980424, "grad_norm": 9.128485679626465, "learning_rate": 1.2634584013050571e-05, "loss": 0.3182, "num_input_tokens_seen": 3359200, "step": 1550 }, { "epoch": 0.2536704730831974, "grad_norm": 0.4159963130950928, "learning_rate": 1.267536704730832e-05, "loss": 0.1003, "num_input_tokens_seen": 3369696, "step": 1555 }, { "epoch": 0.2544861337683524, "grad_norm": 5.541415214538574, "learning_rate": 1.271615008156607e-05, "loss": 0.2741, "num_input_tokens_seen": 3381088, "step": 1560 }, { "epoch": 0.2553017944535073, "grad_norm": 5.9848432540893555, "learning_rate": 1.2756933115823819e-05, "loss": 0.1997, "num_input_tokens_seen": 3392704, "step": 1565 }, { "epoch": 0.2561174551386623, "grad_norm": 1.0845085382461548, "learning_rate": 1.2797716150081568e-05, "loss": 0.447, "num_input_tokens_seen": 3403136, "step": 1570 }, { "epoch": 0.2569331158238173, "grad_norm": 0.04691677913069725, "learning_rate": 1.2838499184339314e-05, "loss": 0.1089, "num_input_tokens_seen": 3411968, "step": 1575 }, { "epoch": 0.25774877650897227, "grad_norm": 0.24104487895965576, "learning_rate": 1.2879282218597064e-05, "loss": 0.1357, "num_input_tokens_seen": 3421216, "step": 1580 }, { "epoch": 0.25856443719412725, "grad_norm": 6.295036792755127, "learning_rate": 1.2920065252854813e-05, "loss": 0.3222, "num_input_tokens_seen": 3433120, "step": 1585 }, { "epoch": 0.25938009787928223, "grad_norm": 4.436316967010498, "learning_rate": 1.2960848287112562e-05, "loss": 0.2229, "num_input_tokens_seen": 3444960, "step": 1590 }, { "epoch": 0.2601957585644372, "grad_norm": 1.5245418548583984, "learning_rate": 1.3001631321370312e-05, "loss": 0.088, "num_input_tokens_seen": 3456512, "step": 1595 }, { "epoch": 0.26101141924959215, "grad_norm": 3.516801595687866, "learning_rate": 1.3042414355628061e-05, "loss": 0.1196, "num_input_tokens_seen": 3466880, "step": 1600 }, { "epoch": 0.26182707993474713, "grad_norm": 0.6709426641464233, "learning_rate": 1.3083197389885807e-05, "loss": 0.1292, "num_input_tokens_seen": 3478528, "step": 1605 }, { "epoch": 0.2626427406199021, "grad_norm": 0.7419503331184387, "learning_rate": 1.3123980424143556e-05, "loss": 0.184, "num_input_tokens_seen": 3490240, "step": 1610 }, { "epoch": 0.2634584013050571, "grad_norm": 4.619818210601807, "learning_rate": 1.3164763458401306e-05, "loss": 0.2783, "num_input_tokens_seen": 3501152, "step": 1615 }, { "epoch": 0.2642740619902121, "grad_norm": 0.15130358934402466, "learning_rate": 1.3205546492659055e-05, "loss": 0.2706, "num_input_tokens_seen": 3511360, "step": 1620 }, { "epoch": 0.26508972267536707, "grad_norm": 3.8601553440093994, "learning_rate": 1.3246329526916804e-05, "loss": 0.1139, "num_input_tokens_seen": 3522880, "step": 1625 }, { "epoch": 0.265905383360522, "grad_norm": 0.13603436946868896, "learning_rate": 1.3287112561174554e-05, "loss": 0.1081, "num_input_tokens_seen": 3534720, "step": 1630 }, { "epoch": 0.266721044045677, "grad_norm": 2.212265729904175, "learning_rate": 1.3327895595432303e-05, "loss": 0.1012, "num_input_tokens_seen": 3544128, "step": 1635 }, { "epoch": 0.26753670473083196, "grad_norm": 4.783210754394531, "learning_rate": 1.3368678629690049e-05, "loss": 0.164, "num_input_tokens_seen": 3555552, "step": 1640 }, { "epoch": 0.26835236541598695, "grad_norm": 0.38099732995033264, "learning_rate": 1.3409461663947798e-05, "loss": 0.106, "num_input_tokens_seen": 3565824, "step": 1645 }, { "epoch": 0.26916802610114193, "grad_norm": 4.968090057373047, "learning_rate": 1.3450244698205548e-05, "loss": 0.1144, "num_input_tokens_seen": 3576864, "step": 1650 }, { "epoch": 0.2699836867862969, "grad_norm": 6.693702220916748, "learning_rate": 1.3491027732463297e-05, "loss": 0.109, "num_input_tokens_seen": 3588032, "step": 1655 }, { "epoch": 0.2707993474714519, "grad_norm": 6.759777069091797, "learning_rate": 1.3531810766721044e-05, "loss": 0.0746, "num_input_tokens_seen": 3599488, "step": 1660 }, { "epoch": 0.27161500815660683, "grad_norm": 0.8844873905181885, "learning_rate": 1.3572593800978794e-05, "loss": 0.0663, "num_input_tokens_seen": 3610368, "step": 1665 }, { "epoch": 0.2724306688417618, "grad_norm": 6.606276988983154, "learning_rate": 1.3613376835236541e-05, "loss": 0.2757, "num_input_tokens_seen": 3620128, "step": 1670 }, { "epoch": 0.2732463295269168, "grad_norm": 2.6373190879821777, "learning_rate": 1.365415986949429e-05, "loss": 0.1603, "num_input_tokens_seen": 3631872, "step": 1675 }, { "epoch": 0.2740619902120718, "grad_norm": 12.086174011230469, "learning_rate": 1.369494290375204e-05, "loss": 0.3898, "num_input_tokens_seen": 3643296, "step": 1680 }, { "epoch": 0.27487765089722677, "grad_norm": 2.731724500656128, "learning_rate": 1.3735725938009788e-05, "loss": 0.2679, "num_input_tokens_seen": 3654592, "step": 1685 }, { "epoch": 0.27569331158238175, "grad_norm": 10.402414321899414, "learning_rate": 1.3776508972267537e-05, "loss": 0.5287, "num_input_tokens_seen": 3665312, "step": 1690 }, { "epoch": 0.2765089722675367, "grad_norm": 5.5846428871154785, "learning_rate": 1.3817292006525286e-05, "loss": 0.1602, "num_input_tokens_seen": 3677024, "step": 1695 }, { "epoch": 0.27732463295269166, "grad_norm": 1.3808804750442505, "learning_rate": 1.3858075040783036e-05, "loss": 0.371, "num_input_tokens_seen": 3687904, "step": 1700 }, { "epoch": 0.27814029363784665, "grad_norm": 2.9026827812194824, "learning_rate": 1.3898858075040783e-05, "loss": 0.3399, "num_input_tokens_seen": 3698848, "step": 1705 }, { "epoch": 0.27895595432300163, "grad_norm": 6.748539447784424, "learning_rate": 1.3939641109298531e-05, "loss": 0.1927, "num_input_tokens_seen": 3709856, "step": 1710 }, { "epoch": 0.2797716150081566, "grad_norm": 0.5773642063140869, "learning_rate": 1.398042414355628e-05, "loss": 0.1874, "num_input_tokens_seen": 3720736, "step": 1715 }, { "epoch": 0.2805872756933116, "grad_norm": 3.2085487842559814, "learning_rate": 1.402120717781403e-05, "loss": 0.3319, "num_input_tokens_seen": 3731264, "step": 1720 }, { "epoch": 0.2814029363784666, "grad_norm": 5.882781028747559, "learning_rate": 1.4061990212071779e-05, "loss": 0.2759, "num_input_tokens_seen": 3743072, "step": 1725 }, { "epoch": 0.2822185970636215, "grad_norm": 2.6723382472991943, "learning_rate": 1.4102773246329528e-05, "loss": 0.1888, "num_input_tokens_seen": 3754656, "step": 1730 }, { "epoch": 0.2830342577487765, "grad_norm": 4.209661960601807, "learning_rate": 1.4143556280587274e-05, "loss": 0.1748, "num_input_tokens_seen": 3765248, "step": 1735 }, { "epoch": 0.2838499184339315, "grad_norm": 8.234864234924316, "learning_rate": 1.4184339314845024e-05, "loss": 0.4846, "num_input_tokens_seen": 3774464, "step": 1740 }, { "epoch": 0.28466557911908646, "grad_norm": 7.3376054763793945, "learning_rate": 1.4225122349102773e-05, "loss": 0.3614, "num_input_tokens_seen": 3784608, "step": 1745 }, { "epoch": 0.28548123980424145, "grad_norm": 2.9826624393463135, "learning_rate": 1.4265905383360522e-05, "loss": 0.2459, "num_input_tokens_seen": 3796672, "step": 1750 }, { "epoch": 0.28629690048939643, "grad_norm": 1.7873778343200684, "learning_rate": 1.4306688417618272e-05, "loss": 0.1198, "num_input_tokens_seen": 3807712, "step": 1755 }, { "epoch": 0.28711256117455136, "grad_norm": 3.915682315826416, "learning_rate": 1.4347471451876021e-05, "loss": 0.1048, "num_input_tokens_seen": 3817984, "step": 1760 }, { "epoch": 0.28792822185970635, "grad_norm": 2.6385345458984375, "learning_rate": 1.438825448613377e-05, "loss": 0.0767, "num_input_tokens_seen": 3829152, "step": 1765 }, { "epoch": 0.28874388254486133, "grad_norm": 6.35852575302124, "learning_rate": 1.4429037520391516e-05, "loss": 0.4122, "num_input_tokens_seen": 3840352, "step": 1770 }, { "epoch": 0.2895595432300163, "grad_norm": 0.30161672830581665, "learning_rate": 1.4469820554649266e-05, "loss": 0.2518, "num_input_tokens_seen": 3851456, "step": 1775 }, { "epoch": 0.2903752039151713, "grad_norm": 1.889992594718933, "learning_rate": 1.4510603588907015e-05, "loss": 0.0475, "num_input_tokens_seen": 3863040, "step": 1780 }, { "epoch": 0.2911908646003263, "grad_norm": 2.7470626831054688, "learning_rate": 1.4551386623164764e-05, "loss": 0.0746, "num_input_tokens_seen": 3874176, "step": 1785 }, { "epoch": 0.29200652528548127, "grad_norm": 0.05625578388571739, "learning_rate": 1.4592169657422514e-05, "loss": 0.2891, "num_input_tokens_seen": 3884736, "step": 1790 }, { "epoch": 0.2928221859706362, "grad_norm": 5.239809513092041, "learning_rate": 1.4632952691680263e-05, "loss": 0.2012, "num_input_tokens_seen": 3895584, "step": 1795 }, { "epoch": 0.2936378466557912, "grad_norm": 0.3846181333065033, "learning_rate": 1.4673735725938009e-05, "loss": 0.1749, "num_input_tokens_seen": 3907584, "step": 1800 }, { "epoch": 0.29445350734094616, "grad_norm": 2.8441622257232666, "learning_rate": 1.4714518760195758e-05, "loss": 0.1626, "num_input_tokens_seen": 3919424, "step": 1805 }, { "epoch": 0.29526916802610115, "grad_norm": 4.441000461578369, "learning_rate": 1.4755301794453508e-05, "loss": 0.1205, "num_input_tokens_seen": 3929152, "step": 1810 }, { "epoch": 0.29608482871125613, "grad_norm": 3.806591749191284, "learning_rate": 1.4796084828711257e-05, "loss": 0.1792, "num_input_tokens_seen": 3940928, "step": 1815 }, { "epoch": 0.2969004893964111, "grad_norm": 4.7424774169921875, "learning_rate": 1.4836867862969006e-05, "loss": 0.1372, "num_input_tokens_seen": 3949568, "step": 1820 }, { "epoch": 0.29771615008156604, "grad_norm": 1.578133225440979, "learning_rate": 1.4877650897226756e-05, "loss": 0.0935, "num_input_tokens_seen": 3961088, "step": 1825 }, { "epoch": 0.29853181076672103, "grad_norm": 0.3075026571750641, "learning_rate": 1.4918433931484505e-05, "loss": 0.3372, "num_input_tokens_seen": 3973088, "step": 1830 }, { "epoch": 0.299347471451876, "grad_norm": 0.20886602997779846, "learning_rate": 1.4959216965742251e-05, "loss": 0.1815, "num_input_tokens_seen": 3982976, "step": 1835 }, { "epoch": 0.300163132137031, "grad_norm": 5.134726524353027, "learning_rate": 1.5e-05, "loss": 0.3164, "num_input_tokens_seen": 3993440, "step": 1840 }, { "epoch": 0.300978792822186, "grad_norm": 0.7109973430633545, "learning_rate": 1.504078303425775e-05, "loss": 0.0632, "num_input_tokens_seen": 4002720, "step": 1845 }, { "epoch": 0.30179445350734097, "grad_norm": 2.32041072845459, "learning_rate": 1.5081566068515499e-05, "loss": 0.4135, "num_input_tokens_seen": 4012896, "step": 1850 }, { "epoch": 0.30261011419249595, "grad_norm": 2.2132656574249268, "learning_rate": 1.5122349102773248e-05, "loss": 0.1208, "num_input_tokens_seen": 4023680, "step": 1855 }, { "epoch": 0.3034257748776509, "grad_norm": 0.8771421909332275, "learning_rate": 1.5163132137030998e-05, "loss": 0.0557, "num_input_tokens_seen": 4033760, "step": 1860 }, { "epoch": 0.30424143556280586, "grad_norm": 5.692502975463867, "learning_rate": 1.5203915171288744e-05, "loss": 0.1944, "num_input_tokens_seen": 4044192, "step": 1865 }, { "epoch": 0.30505709624796085, "grad_norm": 3.3692212104797363, "learning_rate": 1.5244698205546493e-05, "loss": 0.2023, "num_input_tokens_seen": 4055520, "step": 1870 }, { "epoch": 0.30587275693311583, "grad_norm": 0.5607348084449768, "learning_rate": 1.5285481239804242e-05, "loss": 0.1193, "num_input_tokens_seen": 4067264, "step": 1875 }, { "epoch": 0.3066884176182708, "grad_norm": 5.53071403503418, "learning_rate": 1.532626427406199e-05, "loss": 0.2618, "num_input_tokens_seen": 4078016, "step": 1880 }, { "epoch": 0.3075040783034258, "grad_norm": 0.22230537235736847, "learning_rate": 1.536704730831974e-05, "loss": 0.2164, "num_input_tokens_seen": 4088352, "step": 1885 }, { "epoch": 0.3083197389885807, "grad_norm": 6.9999566078186035, "learning_rate": 1.540783034257749e-05, "loss": 0.1746, "num_input_tokens_seen": 4100416, "step": 1890 }, { "epoch": 0.3091353996737357, "grad_norm": 3.544222354888916, "learning_rate": 1.5448613376835236e-05, "loss": 0.2752, "num_input_tokens_seen": 4110080, "step": 1895 }, { "epoch": 0.3099510603588907, "grad_norm": 5.308750629425049, "learning_rate": 1.5489396411092984e-05, "loss": 0.4527, "num_input_tokens_seen": 4120736, "step": 1900 }, { "epoch": 0.3107667210440457, "grad_norm": 0.9366431832313538, "learning_rate": 1.5530179445350735e-05, "loss": 0.16, "num_input_tokens_seen": 4132736, "step": 1905 }, { "epoch": 0.31158238172920066, "grad_norm": 6.9811224937438965, "learning_rate": 1.5570962479608483e-05, "loss": 0.2702, "num_input_tokens_seen": 4143808, "step": 1910 }, { "epoch": 0.31239804241435565, "grad_norm": 4.50189733505249, "learning_rate": 1.5611745513866234e-05, "loss": 0.2213, "num_input_tokens_seen": 4154976, "step": 1915 }, { "epoch": 0.3132137030995106, "grad_norm": 2.1178994178771973, "learning_rate": 1.565252854812398e-05, "loss": 0.3245, "num_input_tokens_seen": 4165504, "step": 1920 }, { "epoch": 0.31402936378466556, "grad_norm": 9.411482810974121, "learning_rate": 1.5693311582381732e-05, "loss": 0.2616, "num_input_tokens_seen": 4177184, "step": 1925 }, { "epoch": 0.31484502446982054, "grad_norm": 6.287456512451172, "learning_rate": 1.5734094616639477e-05, "loss": 0.1775, "num_input_tokens_seen": 4189088, "step": 1930 }, { "epoch": 0.31566068515497553, "grad_norm": 0.42288240790367126, "learning_rate": 1.5774877650897228e-05, "loss": 0.0773, "num_input_tokens_seen": 4200064, "step": 1935 }, { "epoch": 0.3164763458401305, "grad_norm": 0.4060116708278656, "learning_rate": 1.5815660685154975e-05, "loss": 0.2332, "num_input_tokens_seen": 4211936, "step": 1940 }, { "epoch": 0.3172920065252855, "grad_norm": 1.5796598196029663, "learning_rate": 1.5856443719412726e-05, "loss": 0.1302, "num_input_tokens_seen": 4221952, "step": 1945 }, { "epoch": 0.3181076672104405, "grad_norm": 0.36358773708343506, "learning_rate": 1.5897226753670474e-05, "loss": 0.3447, "num_input_tokens_seen": 4233024, "step": 1950 }, { "epoch": 0.3189233278955954, "grad_norm": 2.5228257179260254, "learning_rate": 1.5938009787928225e-05, "loss": 0.0938, "num_input_tokens_seen": 4244480, "step": 1955 }, { "epoch": 0.3197389885807504, "grad_norm": 0.09291154146194458, "learning_rate": 1.597879282218597e-05, "loss": 0.1656, "num_input_tokens_seen": 4255904, "step": 1960 }, { "epoch": 0.3205546492659054, "grad_norm": 3.384127616882324, "learning_rate": 1.601957585644372e-05, "loss": 0.4983, "num_input_tokens_seen": 4265792, "step": 1965 }, { "epoch": 0.32137030995106036, "grad_norm": 0.04204917326569557, "learning_rate": 1.6060358890701468e-05, "loss": 0.1385, "num_input_tokens_seen": 4276160, "step": 1970 }, { "epoch": 0.32218597063621535, "grad_norm": 4.716171741485596, "learning_rate": 1.610114192495922e-05, "loss": 0.1854, "num_input_tokens_seen": 4286624, "step": 1975 }, { "epoch": 0.32300163132137033, "grad_norm": 1.6673164367675781, "learning_rate": 1.6141924959216967e-05, "loss": 0.0894, "num_input_tokens_seen": 4297600, "step": 1980 }, { "epoch": 0.32381729200652526, "grad_norm": 7.22198486328125, "learning_rate": 1.6182707993474718e-05, "loss": 0.2282, "num_input_tokens_seen": 4308352, "step": 1985 }, { "epoch": 0.32463295269168024, "grad_norm": 5.862957000732422, "learning_rate": 1.6223491027732465e-05, "loss": 0.2771, "num_input_tokens_seen": 4319360, "step": 1990 }, { "epoch": 0.3254486133768352, "grad_norm": 3.0516316890716553, "learning_rate": 1.6264274061990213e-05, "loss": 0.0749, "num_input_tokens_seen": 4331552, "step": 1995 }, { "epoch": 0.3262642740619902, "grad_norm": 0.3817313015460968, "learning_rate": 1.630505709624796e-05, "loss": 0.3125, "num_input_tokens_seen": 4343744, "step": 2000 }, { "epoch": 0.3270799347471452, "grad_norm": 0.12780296802520752, "learning_rate": 1.634584013050571e-05, "loss": 0.2445, "num_input_tokens_seen": 4354304, "step": 2005 }, { "epoch": 0.3278955954323002, "grad_norm": 4.317539691925049, "learning_rate": 1.638662316476346e-05, "loss": 0.1835, "num_input_tokens_seen": 4365696, "step": 2010 }, { "epoch": 0.32871125611745516, "grad_norm": 15.590194702148438, "learning_rate": 1.6427406199021207e-05, "loss": 0.3673, "num_input_tokens_seen": 4376736, "step": 2015 }, { "epoch": 0.3295269168026101, "grad_norm": 0.751114547252655, "learning_rate": 1.6468189233278958e-05, "loss": 0.2193, "num_input_tokens_seen": 4388064, "step": 2020 }, { "epoch": 0.3303425774877651, "grad_norm": 9.185699462890625, "learning_rate": 1.6508972267536706e-05, "loss": 0.2041, "num_input_tokens_seen": 4398912, "step": 2025 }, { "epoch": 0.33115823817292006, "grad_norm": 7.339178085327148, "learning_rate": 1.6549755301794453e-05, "loss": 0.2251, "num_input_tokens_seen": 4409856, "step": 2030 }, { "epoch": 0.33197389885807504, "grad_norm": 2.1822028160095215, "learning_rate": 1.6590538336052204e-05, "loss": 0.1465, "num_input_tokens_seen": 4419552, "step": 2035 }, { "epoch": 0.33278955954323003, "grad_norm": 0.5807727575302124, "learning_rate": 1.6631321370309952e-05, "loss": 0.156, "num_input_tokens_seen": 4430080, "step": 2040 }, { "epoch": 0.333605220228385, "grad_norm": 6.257948875427246, "learning_rate": 1.66721044045677e-05, "loss": 0.2496, "num_input_tokens_seen": 4441568, "step": 2045 }, { "epoch": 0.33442088091353994, "grad_norm": 6.7260050773620605, "learning_rate": 1.671288743882545e-05, "loss": 0.2738, "num_input_tokens_seen": 4451200, "step": 2050 }, { "epoch": 0.3352365415986949, "grad_norm": 1.4698982238769531, "learning_rate": 1.6753670473083198e-05, "loss": 0.2373, "num_input_tokens_seen": 4461888, "step": 2055 }, { "epoch": 0.3360522022838499, "grad_norm": 3.716040849685669, "learning_rate": 1.6794453507340946e-05, "loss": 0.1157, "num_input_tokens_seen": 4471392, "step": 2060 }, { "epoch": 0.3368678629690049, "grad_norm": 2.1151514053344727, "learning_rate": 1.6835236541598694e-05, "loss": 0.1184, "num_input_tokens_seen": 4482656, "step": 2065 }, { "epoch": 0.3376835236541599, "grad_norm": 4.903298854827881, "learning_rate": 1.6876019575856445e-05, "loss": 0.1514, "num_input_tokens_seen": 4494336, "step": 2070 }, { "epoch": 0.33849918433931486, "grad_norm": 5.810368537902832, "learning_rate": 1.6916802610114192e-05, "loss": 0.3192, "num_input_tokens_seen": 4505760, "step": 2075 }, { "epoch": 0.33931484502446985, "grad_norm": 5.632075786590576, "learning_rate": 1.6957585644371943e-05, "loss": 0.2564, "num_input_tokens_seen": 4514976, "step": 2080 }, { "epoch": 0.3401305057096248, "grad_norm": 0.09141606837511063, "learning_rate": 1.699836867862969e-05, "loss": 0.1058, "num_input_tokens_seen": 4524928, "step": 2085 }, { "epoch": 0.34094616639477976, "grad_norm": 4.968491077423096, "learning_rate": 1.703915171288744e-05, "loss": 0.2507, "num_input_tokens_seen": 4536192, "step": 2090 }, { "epoch": 0.34176182707993474, "grad_norm": 0.6799984574317932, "learning_rate": 1.7079934747145186e-05, "loss": 0.0741, "num_input_tokens_seen": 4548160, "step": 2095 }, { "epoch": 0.3425774877650897, "grad_norm": 2.8747897148132324, "learning_rate": 1.7120717781402937e-05, "loss": 0.2407, "num_input_tokens_seen": 4558560, "step": 2100 }, { "epoch": 0.3433931484502447, "grad_norm": 6.427094459533691, "learning_rate": 1.7161500815660685e-05, "loss": 0.2703, "num_input_tokens_seen": 4570272, "step": 2105 }, { "epoch": 0.3442088091353997, "grad_norm": 1.3053245544433594, "learning_rate": 1.7202283849918436e-05, "loss": 0.2167, "num_input_tokens_seen": 4580608, "step": 2110 }, { "epoch": 0.3450244698205546, "grad_norm": 1.7092195749282837, "learning_rate": 1.7243066884176184e-05, "loss": 0.0917, "num_input_tokens_seen": 4591072, "step": 2115 }, { "epoch": 0.3458401305057096, "grad_norm": 1.0552736520767212, "learning_rate": 1.7283849918433935e-05, "loss": 0.1147, "num_input_tokens_seen": 4602368, "step": 2120 }, { "epoch": 0.3466557911908646, "grad_norm": 4.107524394989014, "learning_rate": 1.732463295269168e-05, "loss": 0.156, "num_input_tokens_seen": 4613600, "step": 2125 }, { "epoch": 0.3474714518760196, "grad_norm": 0.09314478933811188, "learning_rate": 1.736541598694943e-05, "loss": 0.1241, "num_input_tokens_seen": 4623872, "step": 2130 }, { "epoch": 0.34828711256117456, "grad_norm": 5.320088863372803, "learning_rate": 1.7406199021207178e-05, "loss": 0.3021, "num_input_tokens_seen": 4635136, "step": 2135 }, { "epoch": 0.34910277324632955, "grad_norm": 0.19724801182746887, "learning_rate": 1.744698205546493e-05, "loss": 0.0691, "num_input_tokens_seen": 4645152, "step": 2140 }, { "epoch": 0.34991843393148453, "grad_norm": 4.815561294555664, "learning_rate": 1.7487765089722676e-05, "loss": 0.1682, "num_input_tokens_seen": 4654240, "step": 2145 }, { "epoch": 0.35073409461663946, "grad_norm": 1.5736823081970215, "learning_rate": 1.7528548123980427e-05, "loss": 0.1974, "num_input_tokens_seen": 4666240, "step": 2150 }, { "epoch": 0.35154975530179444, "grad_norm": 1.9475501775741577, "learning_rate": 1.756933115823817e-05, "loss": 0.1333, "num_input_tokens_seen": 4678528, "step": 2155 }, { "epoch": 0.3523654159869494, "grad_norm": 1.0774749517440796, "learning_rate": 1.7610114192495923e-05, "loss": 0.2698, "num_input_tokens_seen": 4688768, "step": 2160 }, { "epoch": 0.3531810766721044, "grad_norm": 2.190877676010132, "learning_rate": 1.765089722675367e-05, "loss": 0.1058, "num_input_tokens_seen": 4698496, "step": 2165 }, { "epoch": 0.3539967373572594, "grad_norm": 0.6246117949485779, "learning_rate": 1.769168026101142e-05, "loss": 0.0693, "num_input_tokens_seen": 4710208, "step": 2170 }, { "epoch": 0.3548123980424144, "grad_norm": 2.2666268348693848, "learning_rate": 1.773246329526917e-05, "loss": 0.2784, "num_input_tokens_seen": 4720480, "step": 2175 }, { "epoch": 0.3556280587275693, "grad_norm": 0.6565126776695251, "learning_rate": 1.777324632952692e-05, "loss": 0.3135, "num_input_tokens_seen": 4730560, "step": 2180 }, { "epoch": 0.3564437194127243, "grad_norm": 1.3437923192977905, "learning_rate": 1.7814029363784668e-05, "loss": 0.0311, "num_input_tokens_seen": 4740768, "step": 2185 }, { "epoch": 0.3572593800978793, "grad_norm": 2.0712358951568604, "learning_rate": 1.7854812398042415e-05, "loss": 0.5265, "num_input_tokens_seen": 4751040, "step": 2190 }, { "epoch": 0.35807504078303426, "grad_norm": 2.5632283687591553, "learning_rate": 1.7895595432300163e-05, "loss": 0.0912, "num_input_tokens_seen": 4762016, "step": 2195 }, { "epoch": 0.35889070146818924, "grad_norm": 2.5864036083221436, "learning_rate": 1.7936378466557914e-05, "loss": 0.1039, "num_input_tokens_seen": 4771648, "step": 2200 }, { "epoch": 0.35970636215334423, "grad_norm": 6.183559894561768, "learning_rate": 1.797716150081566e-05, "loss": 0.1572, "num_input_tokens_seen": 4781440, "step": 2205 }, { "epoch": 0.3605220228384992, "grad_norm": 1.8455876111984253, "learning_rate": 1.8017944535073413e-05, "loss": 0.1543, "num_input_tokens_seen": 4793312, "step": 2210 }, { "epoch": 0.36133768352365414, "grad_norm": 7.173401832580566, "learning_rate": 1.805872756933116e-05, "loss": 0.1579, "num_input_tokens_seen": 4804288, "step": 2215 }, { "epoch": 0.3621533442088091, "grad_norm": 9.608040809631348, "learning_rate": 1.8099510603588908e-05, "loss": 0.4598, "num_input_tokens_seen": 4814784, "step": 2220 }, { "epoch": 0.3629690048939641, "grad_norm": 0.30811965465545654, "learning_rate": 1.8140293637846655e-05, "loss": 0.4575, "num_input_tokens_seen": 4825152, "step": 2225 }, { "epoch": 0.3637846655791191, "grad_norm": 1.2179059982299805, "learning_rate": 1.8181076672104407e-05, "loss": 0.2849, "num_input_tokens_seen": 4835776, "step": 2230 }, { "epoch": 0.3646003262642741, "grad_norm": 1.9670007228851318, "learning_rate": 1.8221859706362154e-05, "loss": 0.1188, "num_input_tokens_seen": 4846848, "step": 2235 }, { "epoch": 0.36541598694942906, "grad_norm": 0.5373203158378601, "learning_rate": 1.8262642740619905e-05, "loss": 0.0217, "num_input_tokens_seen": 4857664, "step": 2240 }, { "epoch": 0.366231647634584, "grad_norm": 8.48862361907959, "learning_rate": 1.8303425774877653e-05, "loss": 0.3324, "num_input_tokens_seen": 4868800, "step": 2245 }, { "epoch": 0.367047308319739, "grad_norm": 0.15726402401924133, "learning_rate": 1.83442088091354e-05, "loss": 0.2802, "num_input_tokens_seen": 4879776, "step": 2250 }, { "epoch": 0.36786296900489396, "grad_norm": 0.5811845660209656, "learning_rate": 1.8384991843393148e-05, "loss": 0.2547, "num_input_tokens_seen": 4890816, "step": 2255 }, { "epoch": 0.36867862969004894, "grad_norm": 5.363942623138428, "learning_rate": 1.84257748776509e-05, "loss": 0.2685, "num_input_tokens_seen": 4899136, "step": 2260 }, { "epoch": 0.3694942903752039, "grad_norm": 0.11295244842767715, "learning_rate": 1.8466557911908647e-05, "loss": 0.144, "num_input_tokens_seen": 4910112, "step": 2265 }, { "epoch": 0.3703099510603589, "grad_norm": 3.5267744064331055, "learning_rate": 1.8507340946166394e-05, "loss": 0.2599, "num_input_tokens_seen": 4920032, "step": 2270 }, { "epoch": 0.37112561174551384, "grad_norm": 4.316346645355225, "learning_rate": 1.8548123980424145e-05, "loss": 0.161, "num_input_tokens_seen": 4930752, "step": 2275 }, { "epoch": 0.3719412724306688, "grad_norm": 0.11088819056749344, "learning_rate": 1.8588907014681893e-05, "loss": 0.1533, "num_input_tokens_seen": 4940960, "step": 2280 }, { "epoch": 0.3727569331158238, "grad_norm": 3.6862525939941406, "learning_rate": 1.862969004893964e-05, "loss": 0.2039, "num_input_tokens_seen": 4952128, "step": 2285 }, { "epoch": 0.3735725938009788, "grad_norm": 1.4289133548736572, "learning_rate": 1.867047308319739e-05, "loss": 0.3226, "num_input_tokens_seen": 4962848, "step": 2290 }, { "epoch": 0.3743882544861338, "grad_norm": 2.987999200820923, "learning_rate": 1.871125611745514e-05, "loss": 0.0703, "num_input_tokens_seen": 4972448, "step": 2295 }, { "epoch": 0.37520391517128876, "grad_norm": 0.7900047898292542, "learning_rate": 1.8752039151712887e-05, "loss": 0.184, "num_input_tokens_seen": 4982432, "step": 2300 }, { "epoch": 0.37601957585644374, "grad_norm": 0.06719323247671127, "learning_rate": 1.8792822185970638e-05, "loss": 0.0839, "num_input_tokens_seen": 4992800, "step": 2305 }, { "epoch": 0.3768352365415987, "grad_norm": 1.453745722770691, "learning_rate": 1.8833605220228386e-05, "loss": 0.0334, "num_input_tokens_seen": 5004352, "step": 2310 }, { "epoch": 0.37765089722675366, "grad_norm": 0.4515019357204437, "learning_rate": 1.8874388254486133e-05, "loss": 0.1465, "num_input_tokens_seen": 5015968, "step": 2315 }, { "epoch": 0.37846655791190864, "grad_norm": 4.4329304695129395, "learning_rate": 1.891517128874388e-05, "loss": 0.0731, "num_input_tokens_seen": 5026624, "step": 2320 }, { "epoch": 0.3792822185970636, "grad_norm": 6.45882511138916, "learning_rate": 1.8955954323001632e-05, "loss": 0.1383, "num_input_tokens_seen": 5037312, "step": 2325 }, { "epoch": 0.3800978792822186, "grad_norm": 1.0029453039169312, "learning_rate": 1.899673735725938e-05, "loss": 0.1604, "num_input_tokens_seen": 5048544, "step": 2330 }, { "epoch": 0.3809135399673736, "grad_norm": 1.8722848892211914, "learning_rate": 1.903752039151713e-05, "loss": 0.1606, "num_input_tokens_seen": 5059584, "step": 2335 }, { "epoch": 0.3817292006525285, "grad_norm": 2.014350175857544, "learning_rate": 1.907830342577488e-05, "loss": 0.2206, "num_input_tokens_seen": 5072736, "step": 2340 }, { "epoch": 0.3825448613376835, "grad_norm": 3.0931100845336914, "learning_rate": 1.911908646003263e-05, "loss": 0.5229, "num_input_tokens_seen": 5083456, "step": 2345 }, { "epoch": 0.3833605220228385, "grad_norm": 1.8392821550369263, "learning_rate": 1.9159869494290374e-05, "loss": 0.0961, "num_input_tokens_seen": 5094240, "step": 2350 }, { "epoch": 0.3841761827079935, "grad_norm": 6.834700107574463, "learning_rate": 1.9200652528548125e-05, "loss": 0.2011, "num_input_tokens_seen": 5103744, "step": 2355 }, { "epoch": 0.38499184339314846, "grad_norm": 5.237668037414551, "learning_rate": 1.9241435562805872e-05, "loss": 0.2381, "num_input_tokens_seen": 5114048, "step": 2360 }, { "epoch": 0.38580750407830344, "grad_norm": 6.375014305114746, "learning_rate": 1.9282218597063623e-05, "loss": 0.3084, "num_input_tokens_seen": 5123744, "step": 2365 }, { "epoch": 0.3866231647634584, "grad_norm": 6.529644966125488, "learning_rate": 1.932300163132137e-05, "loss": 0.2536, "num_input_tokens_seen": 5133888, "step": 2370 }, { "epoch": 0.38743882544861336, "grad_norm": 0.03799702227115631, "learning_rate": 1.9363784665579122e-05, "loss": 0.157, "num_input_tokens_seen": 5145024, "step": 2375 }, { "epoch": 0.38825448613376834, "grad_norm": 2.366711139678955, "learning_rate": 1.9404567699836866e-05, "loss": 0.0423, "num_input_tokens_seen": 5156544, "step": 2380 }, { "epoch": 0.3890701468189233, "grad_norm": 4.6945600509643555, "learning_rate": 1.9445350734094617e-05, "loss": 0.2764, "num_input_tokens_seen": 5168192, "step": 2385 }, { "epoch": 0.3898858075040783, "grad_norm": 2.4937102794647217, "learning_rate": 1.9486133768352365e-05, "loss": 0.1952, "num_input_tokens_seen": 5178240, "step": 2390 }, { "epoch": 0.3907014681892333, "grad_norm": 0.42788416147232056, "learning_rate": 1.9526916802610116e-05, "loss": 0.0542, "num_input_tokens_seen": 5189248, "step": 2395 }, { "epoch": 0.3915171288743883, "grad_norm": 1.277098298072815, "learning_rate": 1.9567699836867864e-05, "loss": 0.0796, "num_input_tokens_seen": 5199392, "step": 2400 }, { "epoch": 0.3923327895595432, "grad_norm": 5.205860614776611, "learning_rate": 1.9608482871125615e-05, "loss": 0.2279, "num_input_tokens_seen": 5210240, "step": 2405 }, { "epoch": 0.3931484502446982, "grad_norm": 8.374488830566406, "learning_rate": 1.9649265905383362e-05, "loss": 0.2954, "num_input_tokens_seen": 5220000, "step": 2410 }, { "epoch": 0.3939641109298532, "grad_norm": 3.9300363063812256, "learning_rate": 1.969004893964111e-05, "loss": 0.0821, "num_input_tokens_seen": 5231520, "step": 2415 }, { "epoch": 0.39477977161500816, "grad_norm": 4.3988776206970215, "learning_rate": 1.9730831973898858e-05, "loss": 0.3133, "num_input_tokens_seen": 5243200, "step": 2420 }, { "epoch": 0.39559543230016314, "grad_norm": 3.794332504272461, "learning_rate": 1.977161500815661e-05, "loss": 0.139, "num_input_tokens_seen": 5253632, "step": 2425 }, { "epoch": 0.3964110929853181, "grad_norm": 0.4077603816986084, "learning_rate": 1.9812398042414356e-05, "loss": 0.1638, "num_input_tokens_seen": 5264736, "step": 2430 }, { "epoch": 0.3972267536704731, "grad_norm": 1.15377938747406, "learning_rate": 1.9853181076672107e-05, "loss": 0.193, "num_input_tokens_seen": 5276192, "step": 2435 }, { "epoch": 0.39804241435562804, "grad_norm": 0.22548221051692963, "learning_rate": 1.9893964110929855e-05, "loss": 0.1289, "num_input_tokens_seen": 5287296, "step": 2440 }, { "epoch": 0.398858075040783, "grad_norm": 0.8492670655250549, "learning_rate": 1.9934747145187603e-05, "loss": 0.1942, "num_input_tokens_seen": 5298336, "step": 2445 }, { "epoch": 0.399673735725938, "grad_norm": 0.9038564562797546, "learning_rate": 1.997553017944535e-05, "loss": 0.0822, "num_input_tokens_seen": 5309248, "step": 2450 }, { "epoch": 0.400489396411093, "grad_norm": 0.5144060254096985, "learning_rate": 2.00163132137031e-05, "loss": 0.3752, "num_input_tokens_seen": 5320384, "step": 2455 }, { "epoch": 0.401305057096248, "grad_norm": 0.5081189274787903, "learning_rate": 2.005709624796085e-05, "loss": 0.0487, "num_input_tokens_seen": 5331904, "step": 2460 }, { "epoch": 0.40212071778140296, "grad_norm": 0.2553884983062744, "learning_rate": 2.00978792822186e-05, "loss": 0.1636, "num_input_tokens_seen": 5343904, "step": 2465 }, { "epoch": 0.4029363784665579, "grad_norm": 1.0112682580947876, "learning_rate": 2.0138662316476348e-05, "loss": 0.0555, "num_input_tokens_seen": 5354336, "step": 2470 }, { "epoch": 0.40375203915171287, "grad_norm": 0.5315742492675781, "learning_rate": 2.0179445350734095e-05, "loss": 0.2583, "num_input_tokens_seen": 5366144, "step": 2475 }, { "epoch": 0.40456769983686786, "grad_norm": 4.658558368682861, "learning_rate": 2.0220228384991843e-05, "loss": 0.1024, "num_input_tokens_seen": 5377344, "step": 2480 }, { "epoch": 0.40538336052202284, "grad_norm": 4.860203266143799, "learning_rate": 2.0261011419249594e-05, "loss": 0.0958, "num_input_tokens_seen": 5386752, "step": 2485 }, { "epoch": 0.4061990212071778, "grad_norm": 0.3144293427467346, "learning_rate": 2.030179445350734e-05, "loss": 0.0502, "num_input_tokens_seen": 5396384, "step": 2490 }, { "epoch": 0.4070146818923328, "grad_norm": 5.124461650848389, "learning_rate": 2.034257748776509e-05, "loss": 0.2127, "num_input_tokens_seen": 5408256, "step": 2495 }, { "epoch": 0.4078303425774878, "grad_norm": 0.35638633370399475, "learning_rate": 2.038336052202284e-05, "loss": 0.2208, "num_input_tokens_seen": 5418752, "step": 2500 }, { "epoch": 0.4086460032626427, "grad_norm": 0.1471187174320221, "learning_rate": 2.0424143556280588e-05, "loss": 0.1479, "num_input_tokens_seen": 5429792, "step": 2505 }, { "epoch": 0.4094616639477977, "grad_norm": 6.38442850112915, "learning_rate": 2.0464926590538336e-05, "loss": 0.3057, "num_input_tokens_seen": 5440608, "step": 2510 }, { "epoch": 0.4102773246329527, "grad_norm": 2.963207483291626, "learning_rate": 2.0505709624796087e-05, "loss": 0.2826, "num_input_tokens_seen": 5450016, "step": 2515 }, { "epoch": 0.4110929853181077, "grad_norm": 12.700345039367676, "learning_rate": 2.0546492659053834e-05, "loss": 0.5221, "num_input_tokens_seen": 5461120, "step": 2520 }, { "epoch": 0.41190864600326266, "grad_norm": 8.157638549804688, "learning_rate": 2.0587275693311582e-05, "loss": 0.0987, "num_input_tokens_seen": 5471648, "step": 2525 }, { "epoch": 0.41272430668841764, "grad_norm": 0.29782620072364807, "learning_rate": 2.0628058727569333e-05, "loss": 0.2887, "num_input_tokens_seen": 5482656, "step": 2530 }, { "epoch": 0.41353996737357257, "grad_norm": 2.9385223388671875, "learning_rate": 2.066884176182708e-05, "loss": 0.0505, "num_input_tokens_seen": 5491168, "step": 2535 }, { "epoch": 0.41435562805872755, "grad_norm": 1.7700849771499634, "learning_rate": 2.070962479608483e-05, "loss": 0.2606, "num_input_tokens_seen": 5501792, "step": 2540 }, { "epoch": 0.41517128874388254, "grad_norm": 5.325303077697754, "learning_rate": 2.0750407830342576e-05, "loss": 0.3087, "num_input_tokens_seen": 5513152, "step": 2545 }, { "epoch": 0.4159869494290375, "grad_norm": 0.27992990612983704, "learning_rate": 2.0791190864600327e-05, "loss": 0.031, "num_input_tokens_seen": 5524480, "step": 2550 }, { "epoch": 0.4168026101141925, "grad_norm": 0.08975686132907867, "learning_rate": 2.0831973898858075e-05, "loss": 0.1271, "num_input_tokens_seen": 5535008, "step": 2555 }, { "epoch": 0.4176182707993475, "grad_norm": 0.9344058036804199, "learning_rate": 2.0872756933115826e-05, "loss": 0.0887, "num_input_tokens_seen": 5546048, "step": 2560 }, { "epoch": 0.4184339314845024, "grad_norm": 7.105401039123535, "learning_rate": 2.0913539967373573e-05, "loss": 0.3133, "num_input_tokens_seen": 5557760, "step": 2565 }, { "epoch": 0.4192495921696574, "grad_norm": 0.6101921200752258, "learning_rate": 2.0954323001631324e-05, "loss": 0.2166, "num_input_tokens_seen": 5567488, "step": 2570 }, { "epoch": 0.4200652528548124, "grad_norm": 4.419879913330078, "learning_rate": 2.099510603588907e-05, "loss": 0.2825, "num_input_tokens_seen": 5578592, "step": 2575 }, { "epoch": 0.42088091353996737, "grad_norm": 3.2440834045410156, "learning_rate": 2.103588907014682e-05, "loss": 0.0648, "num_input_tokens_seen": 5588576, "step": 2580 }, { "epoch": 0.42169657422512236, "grad_norm": 1.5354163646697998, "learning_rate": 2.1076672104404567e-05, "loss": 0.078, "num_input_tokens_seen": 5599264, "step": 2585 }, { "epoch": 0.42251223491027734, "grad_norm": 0.42529988288879395, "learning_rate": 2.1117455138662318e-05, "loss": 0.1078, "num_input_tokens_seen": 5609504, "step": 2590 }, { "epoch": 0.4233278955954323, "grad_norm": 10.483762741088867, "learning_rate": 2.1158238172920066e-05, "loss": 0.2525, "num_input_tokens_seen": 5620256, "step": 2595 }, { "epoch": 0.42414355628058725, "grad_norm": 5.660029888153076, "learning_rate": 2.1199021207177817e-05, "loss": 0.5248, "num_input_tokens_seen": 5630592, "step": 2600 }, { "epoch": 0.42495921696574224, "grad_norm": 6.411313056945801, "learning_rate": 2.123980424143556e-05, "loss": 0.3363, "num_input_tokens_seen": 5641536, "step": 2605 }, { "epoch": 0.4257748776508972, "grad_norm": 0.7426936626434326, "learning_rate": 2.1280587275693312e-05, "loss": 0.0497, "num_input_tokens_seen": 5652416, "step": 2610 }, { "epoch": 0.4265905383360522, "grad_norm": 5.806601047515869, "learning_rate": 2.132137030995106e-05, "loss": 0.2456, "num_input_tokens_seen": 5662528, "step": 2615 }, { "epoch": 0.4274061990212072, "grad_norm": 3.508190631866455, "learning_rate": 2.136215334420881e-05, "loss": 0.1547, "num_input_tokens_seen": 5672288, "step": 2620 }, { "epoch": 0.4282218597063622, "grad_norm": 1.4045436382293701, "learning_rate": 2.140293637846656e-05, "loss": 0.1169, "num_input_tokens_seen": 5682976, "step": 2625 }, { "epoch": 0.4290375203915171, "grad_norm": 0.7463783025741577, "learning_rate": 2.144371941272431e-05, "loss": 0.082, "num_input_tokens_seen": 5694240, "step": 2630 }, { "epoch": 0.4298531810766721, "grad_norm": 0.21157114207744598, "learning_rate": 2.1484502446982057e-05, "loss": 0.1706, "num_input_tokens_seen": 5704192, "step": 2635 }, { "epoch": 0.43066884176182707, "grad_norm": 0.380730003118515, "learning_rate": 2.1525285481239805e-05, "loss": 0.0482, "num_input_tokens_seen": 5715968, "step": 2640 }, { "epoch": 0.43148450244698205, "grad_norm": 2.2118470668792725, "learning_rate": 2.1566068515497553e-05, "loss": 0.1042, "num_input_tokens_seen": 5726784, "step": 2645 }, { "epoch": 0.43230016313213704, "grad_norm": 9.747725486755371, "learning_rate": 2.1606851549755304e-05, "loss": 0.5689, "num_input_tokens_seen": 5737952, "step": 2650 }, { "epoch": 0.433115823817292, "grad_norm": 1.891772985458374, "learning_rate": 2.164763458401305e-05, "loss": 0.1042, "num_input_tokens_seen": 5748512, "step": 2655 }, { "epoch": 0.433931484502447, "grad_norm": 0.3178868591785431, "learning_rate": 2.1688417618270802e-05, "loss": 0.2221, "num_input_tokens_seen": 5758944, "step": 2660 }, { "epoch": 0.43474714518760194, "grad_norm": 1.9077990055084229, "learning_rate": 2.172920065252855e-05, "loss": 0.1318, "num_input_tokens_seen": 5769632, "step": 2665 }, { "epoch": 0.4355628058727569, "grad_norm": 3.1999354362487793, "learning_rate": 2.1769983686786298e-05, "loss": 0.0573, "num_input_tokens_seen": 5781248, "step": 2670 }, { "epoch": 0.4363784665579119, "grad_norm": 1.6499738693237305, "learning_rate": 2.1810766721044045e-05, "loss": 0.2059, "num_input_tokens_seen": 5792192, "step": 2675 }, { "epoch": 0.4371941272430669, "grad_norm": 3.8542420864105225, "learning_rate": 2.1851549755301796e-05, "loss": 0.0722, "num_input_tokens_seen": 5803264, "step": 2680 }, { "epoch": 0.43800978792822187, "grad_norm": 0.095824234187603, "learning_rate": 2.1892332789559544e-05, "loss": 0.0717, "num_input_tokens_seen": 5815072, "step": 2685 }, { "epoch": 0.43882544861337686, "grad_norm": 0.39370542764663696, "learning_rate": 2.1933115823817295e-05, "loss": 0.2932, "num_input_tokens_seen": 5825248, "step": 2690 }, { "epoch": 0.4396411092985318, "grad_norm": 7.779138088226318, "learning_rate": 2.1973898858075043e-05, "loss": 0.2817, "num_input_tokens_seen": 5835904, "step": 2695 }, { "epoch": 0.44045676998368677, "grad_norm": 2.862678050994873, "learning_rate": 2.201468189233279e-05, "loss": 0.1971, "num_input_tokens_seen": 5844736, "step": 2700 }, { "epoch": 0.44127243066884175, "grad_norm": 1.2629541158676147, "learning_rate": 2.2055464926590538e-05, "loss": 0.3501, "num_input_tokens_seen": 5856672, "step": 2705 }, { "epoch": 0.44208809135399674, "grad_norm": 0.3131316602230072, "learning_rate": 2.209624796084829e-05, "loss": 0.1294, "num_input_tokens_seen": 5866848, "step": 2710 }, { "epoch": 0.4429037520391517, "grad_norm": 0.14001265168190002, "learning_rate": 2.2137030995106037e-05, "loss": 0.0316, "num_input_tokens_seen": 5877856, "step": 2715 }, { "epoch": 0.4437194127243067, "grad_norm": 0.40793848037719727, "learning_rate": 2.2177814029363788e-05, "loss": 0.2193, "num_input_tokens_seen": 5888896, "step": 2720 }, { "epoch": 0.4445350734094617, "grad_norm": 0.7146620154380798, "learning_rate": 2.2218597063621535e-05, "loss": 0.2333, "num_input_tokens_seen": 5899872, "step": 2725 }, { "epoch": 0.4453507340946166, "grad_norm": 6.070364475250244, "learning_rate": 2.2259380097879283e-05, "loss": 0.1109, "num_input_tokens_seen": 5910272, "step": 2730 }, { "epoch": 0.4461663947797716, "grad_norm": 6.409831523895264, "learning_rate": 2.230016313213703e-05, "loss": 0.2467, "num_input_tokens_seen": 5921312, "step": 2735 }, { "epoch": 0.4469820554649266, "grad_norm": 1.4456238746643066, "learning_rate": 2.234094616639478e-05, "loss": 0.0908, "num_input_tokens_seen": 5932032, "step": 2740 }, { "epoch": 0.44779771615008157, "grad_norm": 0.36780887842178345, "learning_rate": 2.238172920065253e-05, "loss": 0.0621, "num_input_tokens_seen": 5942176, "step": 2745 }, { "epoch": 0.44861337683523655, "grad_norm": 7.774724006652832, "learning_rate": 2.2422512234910277e-05, "loss": 0.4973, "num_input_tokens_seen": 5954336, "step": 2750 }, { "epoch": 0.44942903752039154, "grad_norm": 8.789905548095703, "learning_rate": 2.2463295269168028e-05, "loss": 0.37, "num_input_tokens_seen": 5965152, "step": 2755 }, { "epoch": 0.45024469820554647, "grad_norm": 2.8583152294158936, "learning_rate": 2.2504078303425776e-05, "loss": 0.3133, "num_input_tokens_seen": 5977216, "step": 2760 }, { "epoch": 0.45106035889070145, "grad_norm": 0.6330432891845703, "learning_rate": 2.2544861337683527e-05, "loss": 0.201, "num_input_tokens_seen": 5988000, "step": 2765 }, { "epoch": 0.45187601957585644, "grad_norm": 5.926965713500977, "learning_rate": 2.258564437194127e-05, "loss": 0.1667, "num_input_tokens_seen": 5998400, "step": 2770 }, { "epoch": 0.4526916802610114, "grad_norm": 4.193154811859131, "learning_rate": 2.2626427406199022e-05, "loss": 0.2632, "num_input_tokens_seen": 6010176, "step": 2775 }, { "epoch": 0.4535073409461664, "grad_norm": 5.64955997467041, "learning_rate": 2.266721044045677e-05, "loss": 0.178, "num_input_tokens_seen": 6021312, "step": 2780 }, { "epoch": 0.4543230016313214, "grad_norm": 5.58404016494751, "learning_rate": 2.270799347471452e-05, "loss": 0.1576, "num_input_tokens_seen": 6033024, "step": 2785 }, { "epoch": 0.4551386623164764, "grad_norm": 0.048357389867305756, "learning_rate": 2.2748776508972268e-05, "loss": 0.0812, "num_input_tokens_seen": 6042432, "step": 2790 }, { "epoch": 0.4559543230016313, "grad_norm": 3.361707925796509, "learning_rate": 2.278955954323002e-05, "loss": 0.205, "num_input_tokens_seen": 6054304, "step": 2795 }, { "epoch": 0.4567699836867863, "grad_norm": 2.470402956008911, "learning_rate": 2.2830342577487763e-05, "loss": 0.2135, "num_input_tokens_seen": 6065312, "step": 2800 }, { "epoch": 0.45758564437194127, "grad_norm": 4.321540832519531, "learning_rate": 2.2871125611745514e-05, "loss": 0.4454, "num_input_tokens_seen": 6075072, "step": 2805 }, { "epoch": 0.45840130505709625, "grad_norm": 6.536118507385254, "learning_rate": 2.2911908646003262e-05, "loss": 0.0626, "num_input_tokens_seen": 6084992, "step": 2810 }, { "epoch": 0.45921696574225124, "grad_norm": 8.955811500549316, "learning_rate": 2.2952691680261013e-05, "loss": 0.0782, "num_input_tokens_seen": 6095424, "step": 2815 }, { "epoch": 0.4600326264274062, "grad_norm": 3.7599077224731445, "learning_rate": 2.299347471451876e-05, "loss": 0.0874, "num_input_tokens_seen": 6107328, "step": 2820 }, { "epoch": 0.46084828711256115, "grad_norm": 2.6868855953216553, "learning_rate": 2.3034257748776512e-05, "loss": 0.0272, "num_input_tokens_seen": 6118048, "step": 2825 }, { "epoch": 0.46166394779771613, "grad_norm": 5.307155132293701, "learning_rate": 2.307504078303426e-05, "loss": 0.4782, "num_input_tokens_seen": 6128512, "step": 2830 }, { "epoch": 0.4624796084828711, "grad_norm": 0.8071622848510742, "learning_rate": 2.3115823817292007e-05, "loss": 0.0383, "num_input_tokens_seen": 6138720, "step": 2835 }, { "epoch": 0.4632952691680261, "grad_norm": 4.2207417488098145, "learning_rate": 2.3156606851549755e-05, "loss": 0.2491, "num_input_tokens_seen": 6149600, "step": 2840 }, { "epoch": 0.4641109298531811, "grad_norm": 8.52420425415039, "learning_rate": 2.3197389885807506e-05, "loss": 0.1157, "num_input_tokens_seen": 6160608, "step": 2845 }, { "epoch": 0.46492659053833607, "grad_norm": 3.6866848468780518, "learning_rate": 2.3238172920065253e-05, "loss": 0.2482, "num_input_tokens_seen": 6171968, "step": 2850 }, { "epoch": 0.46574225122349105, "grad_norm": 4.755484104156494, "learning_rate": 2.3278955954323004e-05, "loss": 0.2233, "num_input_tokens_seen": 6183360, "step": 2855 }, { "epoch": 0.466557911908646, "grad_norm": 2.6169004440307617, "learning_rate": 2.3319738988580752e-05, "loss": 0.1574, "num_input_tokens_seen": 6193664, "step": 2860 }, { "epoch": 0.46737357259380097, "grad_norm": 0.6334134340286255, "learning_rate": 2.33605220228385e-05, "loss": 0.0197, "num_input_tokens_seen": 6204320, "step": 2865 }, { "epoch": 0.46818923327895595, "grad_norm": 6.50516939163208, "learning_rate": 2.3401305057096247e-05, "loss": 0.4034, "num_input_tokens_seen": 6214560, "step": 2870 }, { "epoch": 0.46900489396411094, "grad_norm": 2.0981626510620117, "learning_rate": 2.3442088091354e-05, "loss": 0.3495, "num_input_tokens_seen": 6226176, "step": 2875 }, { "epoch": 0.4698205546492659, "grad_norm": 2.2720494270324707, "learning_rate": 2.3482871125611746e-05, "loss": 0.5628, "num_input_tokens_seen": 6236768, "step": 2880 }, { "epoch": 0.4706362153344209, "grad_norm": 7.052456855773926, "learning_rate": 2.3523654159869497e-05, "loss": 0.1432, "num_input_tokens_seen": 6247424, "step": 2885 }, { "epoch": 0.47145187601957583, "grad_norm": 0.3121151924133301, "learning_rate": 2.3564437194127245e-05, "loss": 0.0287, "num_input_tokens_seen": 6257280, "step": 2890 }, { "epoch": 0.4722675367047308, "grad_norm": 2.9958417415618896, "learning_rate": 2.3605220228384996e-05, "loss": 0.1357, "num_input_tokens_seen": 6268160, "step": 2895 }, { "epoch": 0.4730831973898858, "grad_norm": 0.3367376923561096, "learning_rate": 2.364600326264274e-05, "loss": 0.0691, "num_input_tokens_seen": 6278976, "step": 2900 }, { "epoch": 0.4738988580750408, "grad_norm": 0.07608085870742798, "learning_rate": 2.368678629690049e-05, "loss": 0.1561, "num_input_tokens_seen": 6290144, "step": 2905 }, { "epoch": 0.47471451876019577, "grad_norm": 2.0899765491485596, "learning_rate": 2.372756933115824e-05, "loss": 0.1741, "num_input_tokens_seen": 6301760, "step": 2910 }, { "epoch": 0.47553017944535075, "grad_norm": 0.1436070054769516, "learning_rate": 2.376835236541599e-05, "loss": 0.4465, "num_input_tokens_seen": 6312416, "step": 2915 }, { "epoch": 0.4763458401305057, "grad_norm": 0.6537477970123291, "learning_rate": 2.3809135399673737e-05, "loss": 0.0616, "num_input_tokens_seen": 6323488, "step": 2920 }, { "epoch": 0.47716150081566067, "grad_norm": 5.123737335205078, "learning_rate": 2.384991843393149e-05, "loss": 0.2836, "num_input_tokens_seen": 6334496, "step": 2925 }, { "epoch": 0.47797716150081565, "grad_norm": 2.81748104095459, "learning_rate": 2.3890701468189233e-05, "loss": 0.2396, "num_input_tokens_seen": 6344320, "step": 2930 }, { "epoch": 0.47879282218597063, "grad_norm": 5.903765678405762, "learning_rate": 2.3931484502446984e-05, "loss": 0.2596, "num_input_tokens_seen": 6355904, "step": 2935 }, { "epoch": 0.4796084828711256, "grad_norm": 4.702868461608887, "learning_rate": 2.397226753670473e-05, "loss": 0.3952, "num_input_tokens_seen": 6366016, "step": 2940 }, { "epoch": 0.4804241435562806, "grad_norm": 3.574524402618408, "learning_rate": 2.4013050570962482e-05, "loss": 0.0749, "num_input_tokens_seen": 6375712, "step": 2945 }, { "epoch": 0.4812398042414356, "grad_norm": 0.4243795871734619, "learning_rate": 2.405383360522023e-05, "loss": 0.1481, "num_input_tokens_seen": 6386720, "step": 2950 }, { "epoch": 0.4820554649265905, "grad_norm": 0.09564966708421707, "learning_rate": 2.4094616639477978e-05, "loss": 0.0398, "num_input_tokens_seen": 6397536, "step": 2955 }, { "epoch": 0.4828711256117455, "grad_norm": 4.892819404602051, "learning_rate": 2.4135399673735725e-05, "loss": 0.2196, "num_input_tokens_seen": 6408384, "step": 2960 }, { "epoch": 0.4836867862969005, "grad_norm": 3.464731454849243, "learning_rate": 2.4176182707993476e-05, "loss": 0.0864, "num_input_tokens_seen": 6420640, "step": 2965 }, { "epoch": 0.48450244698205547, "grad_norm": 0.6103954315185547, "learning_rate": 2.4216965742251224e-05, "loss": 0.1325, "num_input_tokens_seen": 6431680, "step": 2970 }, { "epoch": 0.48531810766721045, "grad_norm": 0.8500288724899292, "learning_rate": 2.425774877650897e-05, "loss": 0.1004, "num_input_tokens_seen": 6442368, "step": 2975 }, { "epoch": 0.48613376835236544, "grad_norm": 0.02255425974726677, "learning_rate": 2.4298531810766723e-05, "loss": 0.1682, "num_input_tokens_seen": 6453856, "step": 2980 }, { "epoch": 0.48694942903752036, "grad_norm": 2.5796749591827393, "learning_rate": 2.433931484502447e-05, "loss": 0.3037, "num_input_tokens_seen": 6464640, "step": 2985 }, { "epoch": 0.48776508972267535, "grad_norm": 11.368684768676758, "learning_rate": 2.438009787928222e-05, "loss": 0.1652, "num_input_tokens_seen": 6475392, "step": 2990 }, { "epoch": 0.48858075040783033, "grad_norm": 3.247875690460205, "learning_rate": 2.442088091353997e-05, "loss": 0.1504, "num_input_tokens_seen": 6486880, "step": 2995 }, { "epoch": 0.4893964110929853, "grad_norm": 0.5905230045318604, "learning_rate": 2.4461663947797717e-05, "loss": 0.2028, "num_input_tokens_seen": 6497216, "step": 3000 }, { "epoch": 0.4902120717781403, "grad_norm": 5.586061954498291, "learning_rate": 2.4502446982055464e-05, "loss": 0.248, "num_input_tokens_seen": 6508320, "step": 3005 }, { "epoch": 0.4910277324632953, "grad_norm": 5.63607120513916, "learning_rate": 2.4543230016313215e-05, "loss": 0.1885, "num_input_tokens_seen": 6519520, "step": 3010 }, { "epoch": 0.49184339314845027, "grad_norm": 3.0965116024017334, "learning_rate": 2.4584013050570963e-05, "loss": 0.2348, "num_input_tokens_seen": 6530240, "step": 3015 }, { "epoch": 0.4926590538336052, "grad_norm": 4.9088664054870605, "learning_rate": 2.4624796084828714e-05, "loss": 0.3563, "num_input_tokens_seen": 6540192, "step": 3020 }, { "epoch": 0.4934747145187602, "grad_norm": 0.8939807415008545, "learning_rate": 2.466557911908646e-05, "loss": 0.1468, "num_input_tokens_seen": 6550016, "step": 3025 }, { "epoch": 0.49429037520391517, "grad_norm": 0.1896563619375229, "learning_rate": 2.470636215334421e-05, "loss": 0.1385, "num_input_tokens_seen": 6560672, "step": 3030 }, { "epoch": 0.49510603588907015, "grad_norm": 0.6451339721679688, "learning_rate": 2.4747145187601957e-05, "loss": 0.086, "num_input_tokens_seen": 6571808, "step": 3035 }, { "epoch": 0.49592169657422513, "grad_norm": 2.1462085247039795, "learning_rate": 2.4787928221859708e-05, "loss": 0.292, "num_input_tokens_seen": 6582624, "step": 3040 }, { "epoch": 0.4967373572593801, "grad_norm": 0.92559814453125, "learning_rate": 2.4828711256117456e-05, "loss": 0.2414, "num_input_tokens_seen": 6595008, "step": 3045 }, { "epoch": 0.49755301794453505, "grad_norm": 3.88887882232666, "learning_rate": 2.4869494290375207e-05, "loss": 0.1398, "num_input_tokens_seen": 6606976, "step": 3050 }, { "epoch": 0.49836867862969003, "grad_norm": 4.913876056671143, "learning_rate": 2.4910277324632954e-05, "loss": 0.1024, "num_input_tokens_seen": 6617760, "step": 3055 }, { "epoch": 0.499184339314845, "grad_norm": 6.368566513061523, "learning_rate": 2.4951060358890702e-05, "loss": 0.1389, "num_input_tokens_seen": 6628896, "step": 3060 }, { "epoch": 0.5, "grad_norm": 6.820760250091553, "learning_rate": 2.499184339314845e-05, "loss": 0.2959, "num_input_tokens_seen": 6639424, "step": 3065 }, { "epoch": 0.5, "eval_loss": 0.176969513297081, "eval_runtime": 133.1919, "eval_samples_per_second": 20.459, "eval_steps_per_second": 5.12, "num_input_tokens_seen": 6639424, "step": 3065 }, { "epoch": 0.5008156606851549, "grad_norm": 2.821260690689087, "learning_rate": 2.5032626427406204e-05, "loss": 0.2044, "num_input_tokens_seen": 6649280, "step": 3070 }, { "epoch": 0.50163132137031, "grad_norm": 0.5129310488700867, "learning_rate": 2.507340946166395e-05, "loss": 0.0932, "num_input_tokens_seen": 6659008, "step": 3075 }, { "epoch": 0.5024469820554649, "grad_norm": 5.1510539054870605, "learning_rate": 2.5114192495921696e-05, "loss": 0.1632, "num_input_tokens_seen": 6669312, "step": 3080 }, { "epoch": 0.5032626427406199, "grad_norm": 4.602128028869629, "learning_rate": 2.5154975530179447e-05, "loss": 0.303, "num_input_tokens_seen": 6679968, "step": 3085 }, { "epoch": 0.5040783034257749, "grad_norm": 5.982812404632568, "learning_rate": 2.5195758564437195e-05, "loss": 0.2011, "num_input_tokens_seen": 6689728, "step": 3090 }, { "epoch": 0.5048939641109299, "grad_norm": 4.853014945983887, "learning_rate": 2.5236541598694946e-05, "loss": 0.5461, "num_input_tokens_seen": 6700448, "step": 3095 }, { "epoch": 0.5057096247960848, "grad_norm": 3.3327314853668213, "learning_rate": 2.5277324632952693e-05, "loss": 0.1175, "num_input_tokens_seen": 6711424, "step": 3100 }, { "epoch": 0.5065252854812398, "grad_norm": 2.074833869934082, "learning_rate": 2.5318107667210438e-05, "loss": 0.0559, "num_input_tokens_seen": 6722752, "step": 3105 }, { "epoch": 0.5073409461663948, "grad_norm": 1.116888403892517, "learning_rate": 2.5358890701468192e-05, "loss": 0.3728, "num_input_tokens_seen": 6734080, "step": 3110 }, { "epoch": 0.5081566068515497, "grad_norm": 0.8630831241607666, "learning_rate": 2.5399673735725936e-05, "loss": 0.0812, "num_input_tokens_seen": 6744672, "step": 3115 }, { "epoch": 0.5089722675367048, "grad_norm": 5.993602275848389, "learning_rate": 2.544045676998369e-05, "loss": 0.2033, "num_input_tokens_seen": 6755104, "step": 3120 }, { "epoch": 0.5097879282218597, "grad_norm": 4.010820388793945, "learning_rate": 2.5481239804241435e-05, "loss": 0.1895, "num_input_tokens_seen": 6767168, "step": 3125 }, { "epoch": 0.5106035889070146, "grad_norm": 0.18873050808906555, "learning_rate": 2.552202283849919e-05, "loss": 0.2547, "num_input_tokens_seen": 6777472, "step": 3130 }, { "epoch": 0.5114192495921697, "grad_norm": 9.538300514221191, "learning_rate": 2.5562805872756934e-05, "loss": 0.2986, "num_input_tokens_seen": 6787776, "step": 3135 }, { "epoch": 0.5122349102773246, "grad_norm": 3.935617208480835, "learning_rate": 2.560358890701468e-05, "loss": 0.3441, "num_input_tokens_seen": 6799008, "step": 3140 }, { "epoch": 0.5130505709624796, "grad_norm": 0.1453946828842163, "learning_rate": 2.5644371941272432e-05, "loss": 0.0793, "num_input_tokens_seen": 6809440, "step": 3145 }, { "epoch": 0.5138662316476346, "grad_norm": 0.26558801531791687, "learning_rate": 2.568515497553018e-05, "loss": 0.0611, "num_input_tokens_seen": 6819456, "step": 3150 }, { "epoch": 0.5146818923327896, "grad_norm": 6.354449272155762, "learning_rate": 2.572593800978793e-05, "loss": 0.1466, "num_input_tokens_seen": 6829312, "step": 3155 }, { "epoch": 0.5154975530179445, "grad_norm": 3.53004789352417, "learning_rate": 2.576672104404568e-05, "loss": 0.223, "num_input_tokens_seen": 6839456, "step": 3160 }, { "epoch": 0.5163132137030995, "grad_norm": 0.7306339740753174, "learning_rate": 2.580750407830343e-05, "loss": 0.2472, "num_input_tokens_seen": 6850752, "step": 3165 }, { "epoch": 0.5171288743882545, "grad_norm": 1.7089464664459229, "learning_rate": 2.5848287112561177e-05, "loss": 0.1606, "num_input_tokens_seen": 6862144, "step": 3170 }, { "epoch": 0.5179445350734094, "grad_norm": 6.444469451904297, "learning_rate": 2.588907014681892e-05, "loss": 0.2617, "num_input_tokens_seen": 6872704, "step": 3175 }, { "epoch": 0.5187601957585645, "grad_norm": 7.05857515335083, "learning_rate": 2.5929853181076673e-05, "loss": 0.1505, "num_input_tokens_seen": 6883232, "step": 3180 }, { "epoch": 0.5195758564437194, "grad_norm": 5.446662425994873, "learning_rate": 2.597063621533442e-05, "loss": 0.3952, "num_input_tokens_seen": 6894368, "step": 3185 }, { "epoch": 0.5203915171288744, "grad_norm": 4.710870265960693, "learning_rate": 2.601141924959217e-05, "loss": 0.2189, "num_input_tokens_seen": 6906336, "step": 3190 }, { "epoch": 0.5212071778140294, "grad_norm": 0.2568054497241974, "learning_rate": 2.605220228384992e-05, "loss": 0.0819, "num_input_tokens_seen": 6917440, "step": 3195 }, { "epoch": 0.5220228384991843, "grad_norm": 4.117044448852539, "learning_rate": 2.609298531810767e-05, "loss": 0.4198, "num_input_tokens_seen": 6928736, "step": 3200 }, { "epoch": 0.5228384991843393, "grad_norm": 3.110626697540283, "learning_rate": 2.6133768352365418e-05, "loss": 0.1727, "num_input_tokens_seen": 6939488, "step": 3205 }, { "epoch": 0.5236541598694943, "grad_norm": 0.6773119568824768, "learning_rate": 2.6174551386623165e-05, "loss": 0.0335, "num_input_tokens_seen": 6950016, "step": 3210 }, { "epoch": 0.5244698205546493, "grad_norm": 5.615257263183594, "learning_rate": 2.6215334420880916e-05, "loss": 0.2907, "num_input_tokens_seen": 6961280, "step": 3215 }, { "epoch": 0.5252854812398042, "grad_norm": 0.9965311288833618, "learning_rate": 2.6256117455138664e-05, "loss": 0.0736, "num_input_tokens_seen": 6971360, "step": 3220 }, { "epoch": 0.5261011419249593, "grad_norm": 4.916421413421631, "learning_rate": 2.6296900489396415e-05, "loss": 0.4214, "num_input_tokens_seen": 6982528, "step": 3225 }, { "epoch": 0.5269168026101142, "grad_norm": 1.3702571392059326, "learning_rate": 2.633768352365416e-05, "loss": 0.063, "num_input_tokens_seen": 6992704, "step": 3230 }, { "epoch": 0.5277324632952691, "grad_norm": 4.162288665771484, "learning_rate": 2.6378466557911907e-05, "loss": 0.2509, "num_input_tokens_seen": 7003072, "step": 3235 }, { "epoch": 0.5285481239804242, "grad_norm": 2.886589527130127, "learning_rate": 2.6419249592169658e-05, "loss": 0.1931, "num_input_tokens_seen": 7013728, "step": 3240 }, { "epoch": 0.5293637846655791, "grad_norm": 2.574683904647827, "learning_rate": 2.6460032626427406e-05, "loss": 0.0575, "num_input_tokens_seen": 7024256, "step": 3245 }, { "epoch": 0.5301794453507341, "grad_norm": 3.243692636489868, "learning_rate": 2.6500815660685157e-05, "loss": 0.4423, "num_input_tokens_seen": 7034688, "step": 3250 }, { "epoch": 0.5309951060358891, "grad_norm": 4.145303249359131, "learning_rate": 2.6541598694942904e-05, "loss": 0.1589, "num_input_tokens_seen": 7046080, "step": 3255 }, { "epoch": 0.531810766721044, "grad_norm": 5.3809027671813965, "learning_rate": 2.6582381729200655e-05, "loss": 0.2613, "num_input_tokens_seen": 7056032, "step": 3260 }, { "epoch": 0.532626427406199, "grad_norm": 3.7870163917541504, "learning_rate": 2.6623164763458403e-05, "loss": 0.2471, "num_input_tokens_seen": 7066880, "step": 3265 }, { "epoch": 0.533442088091354, "grad_norm": 5.299981594085693, "learning_rate": 2.666394779771615e-05, "loss": 0.3057, "num_input_tokens_seen": 7077664, "step": 3270 }, { "epoch": 0.534257748776509, "grad_norm": 6.679792404174805, "learning_rate": 2.67047308319739e-05, "loss": 0.1684, "num_input_tokens_seen": 7088928, "step": 3275 }, { "epoch": 0.5350734094616639, "grad_norm": 0.7039870023727417, "learning_rate": 2.6745513866231646e-05, "loss": 0.2108, "num_input_tokens_seen": 7100800, "step": 3280 }, { "epoch": 0.535889070146819, "grad_norm": 2.9606547355651855, "learning_rate": 2.67862969004894e-05, "loss": 0.0965, "num_input_tokens_seen": 7112160, "step": 3285 }, { "epoch": 0.5367047308319739, "grad_norm": 1.262087106704712, "learning_rate": 2.6827079934747145e-05, "loss": 0.1558, "num_input_tokens_seen": 7122848, "step": 3290 }, { "epoch": 0.5375203915171288, "grad_norm": 3.0593302249908447, "learning_rate": 2.68678629690049e-05, "loss": 0.1694, "num_input_tokens_seen": 7132800, "step": 3295 }, { "epoch": 0.5383360522022839, "grad_norm": 0.8122572898864746, "learning_rate": 2.6908646003262643e-05, "loss": 0.1244, "num_input_tokens_seen": 7144064, "step": 3300 }, { "epoch": 0.5391517128874388, "grad_norm": 1.4844942092895508, "learning_rate": 2.694942903752039e-05, "loss": 0.1656, "num_input_tokens_seen": 7155744, "step": 3305 }, { "epoch": 0.5399673735725938, "grad_norm": 0.482521653175354, "learning_rate": 2.6990212071778142e-05, "loss": 0.0457, "num_input_tokens_seen": 7166336, "step": 3310 }, { "epoch": 0.5407830342577488, "grad_norm": 0.0839756652712822, "learning_rate": 2.703099510603589e-05, "loss": 0.1977, "num_input_tokens_seen": 7176832, "step": 3315 }, { "epoch": 0.5415986949429038, "grad_norm": 0.9602549076080322, "learning_rate": 2.707177814029364e-05, "loss": 0.1921, "num_input_tokens_seen": 7187616, "step": 3320 }, { "epoch": 0.5424143556280587, "grad_norm": 0.056319817900657654, "learning_rate": 2.7112561174551388e-05, "loss": 0.1743, "num_input_tokens_seen": 7198144, "step": 3325 }, { "epoch": 0.5432300163132137, "grad_norm": 2.8947079181671143, "learning_rate": 2.715334420880914e-05, "loss": 0.0901, "num_input_tokens_seen": 7208960, "step": 3330 }, { "epoch": 0.5440456769983687, "grad_norm": 0.40086036920547485, "learning_rate": 2.7194127243066887e-05, "loss": 0.2511, "num_input_tokens_seen": 7220064, "step": 3335 }, { "epoch": 0.5448613376835236, "grad_norm": 0.3649408519268036, "learning_rate": 2.723491027732463e-05, "loss": 0.1671, "num_input_tokens_seen": 7231008, "step": 3340 }, { "epoch": 0.5456769983686787, "grad_norm": 2.579338550567627, "learning_rate": 2.7275693311582386e-05, "loss": 0.3123, "num_input_tokens_seen": 7242144, "step": 3345 }, { "epoch": 0.5464926590538336, "grad_norm": 4.868326663970947, "learning_rate": 2.731647634584013e-05, "loss": 0.2258, "num_input_tokens_seen": 7252448, "step": 3350 }, { "epoch": 0.5473083197389886, "grad_norm": 0.2523765563964844, "learning_rate": 2.7357259380097884e-05, "loss": 0.1712, "num_input_tokens_seen": 7261856, "step": 3355 }, { "epoch": 0.5481239804241436, "grad_norm": 0.5494678020477295, "learning_rate": 2.739804241435563e-05, "loss": 0.1164, "num_input_tokens_seen": 7272800, "step": 3360 }, { "epoch": 0.5489396411092985, "grad_norm": 4.799815654754639, "learning_rate": 2.7438825448613376e-05, "loss": 0.2332, "num_input_tokens_seen": 7283808, "step": 3365 }, { "epoch": 0.5497553017944535, "grad_norm": 0.9906727075576782, "learning_rate": 2.7479608482871127e-05, "loss": 0.0323, "num_input_tokens_seen": 7294976, "step": 3370 }, { "epoch": 0.5505709624796085, "grad_norm": 2.8852052688598633, "learning_rate": 2.7520391517128875e-05, "loss": 0.1403, "num_input_tokens_seen": 7306560, "step": 3375 }, { "epoch": 0.5513866231647635, "grad_norm": 0.6081190705299377, "learning_rate": 2.7561174551386626e-05, "loss": 0.0219, "num_input_tokens_seen": 7317600, "step": 3380 }, { "epoch": 0.5522022838499184, "grad_norm": 3.466200590133667, "learning_rate": 2.7601957585644373e-05, "loss": 0.0817, "num_input_tokens_seen": 7330144, "step": 3385 }, { "epoch": 0.5530179445350734, "grad_norm": 8.591158866882324, "learning_rate": 2.7642740619902125e-05, "loss": 0.2721, "num_input_tokens_seen": 7341248, "step": 3390 }, { "epoch": 0.5538336052202284, "grad_norm": 4.575144290924072, "learning_rate": 2.7683523654159872e-05, "loss": 0.2259, "num_input_tokens_seen": 7351200, "step": 3395 }, { "epoch": 0.5546492659053833, "grad_norm": 0.4166700839996338, "learning_rate": 2.7724306688417616e-05, "loss": 0.0269, "num_input_tokens_seen": 7361792, "step": 3400 }, { "epoch": 0.5554649265905384, "grad_norm": 4.073925018310547, "learning_rate": 2.776508972267537e-05, "loss": 0.2099, "num_input_tokens_seen": 7372160, "step": 3405 }, { "epoch": 0.5562805872756933, "grad_norm": 0.2443922460079193, "learning_rate": 2.7805872756933115e-05, "loss": 0.06, "num_input_tokens_seen": 7381408, "step": 3410 }, { "epoch": 0.5570962479608483, "grad_norm": 7.976419925689697, "learning_rate": 2.7846655791190866e-05, "loss": 0.2754, "num_input_tokens_seen": 7392512, "step": 3415 }, { "epoch": 0.5579119086460033, "grad_norm": 4.312510967254639, "learning_rate": 2.7887438825448614e-05, "loss": 0.3246, "num_input_tokens_seen": 7404192, "step": 3420 }, { "epoch": 0.5587275693311582, "grad_norm": 0.02727222442626953, "learning_rate": 2.7928221859706365e-05, "loss": 0.1545, "num_input_tokens_seen": 7415008, "step": 3425 }, { "epoch": 0.5595432300163132, "grad_norm": 2.2358570098876953, "learning_rate": 2.7969004893964112e-05, "loss": 0.0869, "num_input_tokens_seen": 7425088, "step": 3430 }, { "epoch": 0.5603588907014682, "grad_norm": 3.2919957637786865, "learning_rate": 2.800978792822186e-05, "loss": 0.3297, "num_input_tokens_seen": 7436608, "step": 3435 }, { "epoch": 0.5611745513866232, "grad_norm": 0.15448737144470215, "learning_rate": 2.805057096247961e-05, "loss": 0.1976, "num_input_tokens_seen": 7446784, "step": 3440 }, { "epoch": 0.5619902120717781, "grad_norm": 0.3430439233779907, "learning_rate": 2.809135399673736e-05, "loss": 0.1111, "num_input_tokens_seen": 7456640, "step": 3445 }, { "epoch": 0.5628058727569332, "grad_norm": 0.5385096669197083, "learning_rate": 2.813213703099511e-05, "loss": 0.1322, "num_input_tokens_seen": 7468096, "step": 3450 }, { "epoch": 0.5636215334420881, "grad_norm": 3.0548486709594727, "learning_rate": 2.8172920065252857e-05, "loss": 0.2489, "num_input_tokens_seen": 7480096, "step": 3455 }, { "epoch": 0.564437194127243, "grad_norm": 0.20929400622844696, "learning_rate": 2.8213703099510602e-05, "loss": 0.1348, "num_input_tokens_seen": 7491136, "step": 3460 }, { "epoch": 0.5652528548123981, "grad_norm": 0.22163662314414978, "learning_rate": 2.8254486133768353e-05, "loss": 0.1038, "num_input_tokens_seen": 7502304, "step": 3465 }, { "epoch": 0.566068515497553, "grad_norm": 0.5673784017562866, "learning_rate": 2.82952691680261e-05, "loss": 0.2097, "num_input_tokens_seen": 7512928, "step": 3470 }, { "epoch": 0.566884176182708, "grad_norm": 0.227246955037117, "learning_rate": 2.833605220228385e-05, "loss": 0.2233, "num_input_tokens_seen": 7523904, "step": 3475 }, { "epoch": 0.567699836867863, "grad_norm": 5.121859073638916, "learning_rate": 2.83768352365416e-05, "loss": 0.4761, "num_input_tokens_seen": 7534496, "step": 3480 }, { "epoch": 0.5685154975530179, "grad_norm": 5.115076065063477, "learning_rate": 2.841761827079935e-05, "loss": 0.4245, "num_input_tokens_seen": 7545120, "step": 3485 }, { "epoch": 0.5693311582381729, "grad_norm": 3.4532887935638428, "learning_rate": 2.8458401305057098e-05, "loss": 0.0717, "num_input_tokens_seen": 7555712, "step": 3490 }, { "epoch": 0.5701468189233279, "grad_norm": 0.19635164737701416, "learning_rate": 2.8499184339314845e-05, "loss": 0.0957, "num_input_tokens_seen": 7567200, "step": 3495 }, { "epoch": 0.5709624796084829, "grad_norm": 5.896024227142334, "learning_rate": 2.8539967373572596e-05, "loss": 0.3407, "num_input_tokens_seen": 7578464, "step": 3500 }, { "epoch": 0.5717781402936378, "grad_norm": 1.1312222480773926, "learning_rate": 2.858075040783034e-05, "loss": 0.1151, "num_input_tokens_seen": 7588256, "step": 3505 }, { "epoch": 0.5725938009787929, "grad_norm": 1.881733775138855, "learning_rate": 2.8621533442088095e-05, "loss": 0.302, "num_input_tokens_seen": 7598848, "step": 3510 }, { "epoch": 0.5734094616639478, "grad_norm": 1.0623258352279663, "learning_rate": 2.866231647634584e-05, "loss": 0.0913, "num_input_tokens_seen": 7609696, "step": 3515 }, { "epoch": 0.5742251223491027, "grad_norm": 1.1519025564193726, "learning_rate": 2.8703099510603594e-05, "loss": 0.0698, "num_input_tokens_seen": 7620864, "step": 3520 }, { "epoch": 0.5750407830342578, "grad_norm": 4.33655309677124, "learning_rate": 2.8743882544861338e-05, "loss": 0.0705, "num_input_tokens_seen": 7631392, "step": 3525 }, { "epoch": 0.5758564437194127, "grad_norm": 0.47238653898239136, "learning_rate": 2.8784665579119086e-05, "loss": 0.0643, "num_input_tokens_seen": 7643232, "step": 3530 }, { "epoch": 0.5766721044045677, "grad_norm": 0.4578634798526764, "learning_rate": 2.8825448613376837e-05, "loss": 0.1905, "num_input_tokens_seen": 7654944, "step": 3535 }, { "epoch": 0.5774877650897227, "grad_norm": 4.807013988494873, "learning_rate": 2.8866231647634584e-05, "loss": 0.1149, "num_input_tokens_seen": 7665760, "step": 3540 }, { "epoch": 0.5783034257748777, "grad_norm": 0.3435484766960144, "learning_rate": 2.8907014681892335e-05, "loss": 0.041, "num_input_tokens_seen": 7676576, "step": 3545 }, { "epoch": 0.5791190864600326, "grad_norm": 1.3135573863983154, "learning_rate": 2.8947797716150083e-05, "loss": 0.2718, "num_input_tokens_seen": 7687104, "step": 3550 }, { "epoch": 0.5799347471451876, "grad_norm": 5.1496195793151855, "learning_rate": 2.8988580750407834e-05, "loss": 0.0551, "num_input_tokens_seen": 7698080, "step": 3555 }, { "epoch": 0.5807504078303426, "grad_norm": 3.7905707359313965, "learning_rate": 2.9029363784665582e-05, "loss": 0.2317, "num_input_tokens_seen": 7709632, "step": 3560 }, { "epoch": 0.5815660685154975, "grad_norm": 0.5441896915435791, "learning_rate": 2.9070146818923326e-05, "loss": 0.0329, "num_input_tokens_seen": 7719936, "step": 3565 }, { "epoch": 0.5823817292006526, "grad_norm": 0.4525549113750458, "learning_rate": 2.911092985318108e-05, "loss": 0.0733, "num_input_tokens_seen": 7730080, "step": 3570 }, { "epoch": 0.5831973898858075, "grad_norm": 4.433751583099365, "learning_rate": 2.9151712887438825e-05, "loss": 0.2059, "num_input_tokens_seen": 7740576, "step": 3575 }, { "epoch": 0.5840130505709625, "grad_norm": 0.03426478058099747, "learning_rate": 2.919249592169658e-05, "loss": 0.1512, "num_input_tokens_seen": 7751040, "step": 3580 }, { "epoch": 0.5848287112561175, "grad_norm": 0.027612321078777313, "learning_rate": 2.9233278955954323e-05, "loss": 0.0044, "num_input_tokens_seen": 7760768, "step": 3585 }, { "epoch": 0.5856443719412724, "grad_norm": 0.19678345322608948, "learning_rate": 2.927406199021207e-05, "loss": 0.0735, "num_input_tokens_seen": 7771680, "step": 3590 }, { "epoch": 0.5864600326264274, "grad_norm": 0.29991891980171204, "learning_rate": 2.9314845024469822e-05, "loss": 0.1521, "num_input_tokens_seen": 7783072, "step": 3595 }, { "epoch": 0.5872756933115824, "grad_norm": 0.11637599021196365, "learning_rate": 2.935562805872757e-05, "loss": 0.082, "num_input_tokens_seen": 7792608, "step": 3600 }, { "epoch": 0.5880913539967374, "grad_norm": 0.23875682055950165, "learning_rate": 2.939641109298532e-05, "loss": 0.3333, "num_input_tokens_seen": 7803744, "step": 3605 }, { "epoch": 0.5889070146818923, "grad_norm": 0.7318027019500732, "learning_rate": 2.943719412724307e-05, "loss": 0.1406, "num_input_tokens_seen": 7815008, "step": 3610 }, { "epoch": 0.5897226753670473, "grad_norm": 3.634417772293091, "learning_rate": 2.947797716150082e-05, "loss": 0.2403, "num_input_tokens_seen": 7825536, "step": 3615 }, { "epoch": 0.5905383360522023, "grad_norm": 0.05685891583561897, "learning_rate": 2.9518760195758567e-05, "loss": 0.2194, "num_input_tokens_seen": 7836160, "step": 3620 }, { "epoch": 0.5913539967373572, "grad_norm": 1.4172544479370117, "learning_rate": 2.955954323001631e-05, "loss": 0.0216, "num_input_tokens_seen": 7848864, "step": 3625 }, { "epoch": 0.5921696574225123, "grad_norm": 8.229720115661621, "learning_rate": 2.9600326264274066e-05, "loss": 0.2237, "num_input_tokens_seen": 7859488, "step": 3630 }, { "epoch": 0.5929853181076672, "grad_norm": 2.7573013305664062, "learning_rate": 2.964110929853181e-05, "loss": 0.1209, "num_input_tokens_seen": 7869600, "step": 3635 }, { "epoch": 0.5938009787928222, "grad_norm": 4.573496341705322, "learning_rate": 2.968189233278956e-05, "loss": 0.2684, "num_input_tokens_seen": 7879840, "step": 3640 }, { "epoch": 0.5946166394779772, "grad_norm": 0.07046195864677429, "learning_rate": 2.972267536704731e-05, "loss": 0.0752, "num_input_tokens_seen": 7890272, "step": 3645 }, { "epoch": 0.5954323001631321, "grad_norm": 9.139192581176758, "learning_rate": 2.976345840130506e-05, "loss": 0.3903, "num_input_tokens_seen": 7901120, "step": 3650 }, { "epoch": 0.5962479608482871, "grad_norm": 2.8443400859832764, "learning_rate": 2.9804241435562807e-05, "loss": 0.0952, "num_input_tokens_seen": 7912608, "step": 3655 }, { "epoch": 0.5970636215334421, "grad_norm": 2.2369651794433594, "learning_rate": 2.9845024469820555e-05, "loss": 0.2448, "num_input_tokens_seen": 7923936, "step": 3660 }, { "epoch": 0.5978792822185971, "grad_norm": 4.761455535888672, "learning_rate": 2.9885807504078306e-05, "loss": 0.352, "num_input_tokens_seen": 7934496, "step": 3665 }, { "epoch": 0.598694942903752, "grad_norm": 3.5061333179473877, "learning_rate": 2.9926590538336054e-05, "loss": 0.1885, "num_input_tokens_seen": 7945728, "step": 3670 }, { "epoch": 0.5995106035889071, "grad_norm": 0.19609789550304413, "learning_rate": 2.9967373572593805e-05, "loss": 0.0977, "num_input_tokens_seen": 7956960, "step": 3675 }, { "epoch": 0.600326264274062, "grad_norm": 5.6819915771484375, "learning_rate": 3.0008156606851552e-05, "loss": 0.1786, "num_input_tokens_seen": 7969440, "step": 3680 }, { "epoch": 0.6011419249592169, "grad_norm": 4.5360798835754395, "learning_rate": 3.0048939641109303e-05, "loss": 0.1381, "num_input_tokens_seen": 7978560, "step": 3685 }, { "epoch": 0.601957585644372, "grad_norm": 2.839482307434082, "learning_rate": 3.0089722675367048e-05, "loss": 0.2044, "num_input_tokens_seen": 7990400, "step": 3690 }, { "epoch": 0.6027732463295269, "grad_norm": 5.692098617553711, "learning_rate": 3.0130505709624795e-05, "loss": 0.1635, "num_input_tokens_seen": 8000128, "step": 3695 }, { "epoch": 0.6035889070146819, "grad_norm": 0.029796523973345757, "learning_rate": 3.0171288743882546e-05, "loss": 0.1286, "num_input_tokens_seen": 8010592, "step": 3700 }, { "epoch": 0.6044045676998369, "grad_norm": 1.200098991394043, "learning_rate": 3.0212071778140294e-05, "loss": 0.1984, "num_input_tokens_seen": 8022240, "step": 3705 }, { "epoch": 0.6052202283849919, "grad_norm": 0.27159032225608826, "learning_rate": 3.0252854812398045e-05, "loss": 0.1891, "num_input_tokens_seen": 8032096, "step": 3710 }, { "epoch": 0.6060358890701468, "grad_norm": 7.265265941619873, "learning_rate": 3.0293637846655793e-05, "loss": 0.2965, "num_input_tokens_seen": 8042400, "step": 3715 }, { "epoch": 0.6068515497553018, "grad_norm": 0.26513463258743286, "learning_rate": 3.033442088091354e-05, "loss": 0.085, "num_input_tokens_seen": 8052672, "step": 3720 }, { "epoch": 0.6076672104404568, "grad_norm": 2.2898662090301514, "learning_rate": 3.037520391517129e-05, "loss": 0.1487, "num_input_tokens_seen": 8063712, "step": 3725 }, { "epoch": 0.6084828711256117, "grad_norm": 3.571632146835327, "learning_rate": 3.041598694942904e-05, "loss": 0.1305, "num_input_tokens_seen": 8073600, "step": 3730 }, { "epoch": 0.6092985318107668, "grad_norm": 0.5955801606178284, "learning_rate": 3.045676998368679e-05, "loss": 0.2931, "num_input_tokens_seen": 8085664, "step": 3735 }, { "epoch": 0.6101141924959217, "grad_norm": 4.0143351554870605, "learning_rate": 3.0497553017944534e-05, "loss": 0.161, "num_input_tokens_seen": 8097152, "step": 3740 }, { "epoch": 0.6109298531810766, "grad_norm": 3.4268391132354736, "learning_rate": 3.0538336052202285e-05, "loss": 0.0641, "num_input_tokens_seen": 8107648, "step": 3745 }, { "epoch": 0.6117455138662317, "grad_norm": 4.198644638061523, "learning_rate": 3.057911908646003e-05, "loss": 0.3436, "num_input_tokens_seen": 8118272, "step": 3750 }, { "epoch": 0.6125611745513866, "grad_norm": 4.588216304779053, "learning_rate": 3.061990212071778e-05, "loss": 0.1209, "num_input_tokens_seen": 8129120, "step": 3755 }, { "epoch": 0.6133768352365416, "grad_norm": 0.33906522393226624, "learning_rate": 3.0660685154975535e-05, "loss": 0.1412, "num_input_tokens_seen": 8139200, "step": 3760 }, { "epoch": 0.6141924959216966, "grad_norm": 0.1643534004688263, "learning_rate": 3.0701468189233276e-05, "loss": 0.0695, "num_input_tokens_seen": 8150272, "step": 3765 }, { "epoch": 0.6150081566068516, "grad_norm": 5.936880588531494, "learning_rate": 3.074225122349103e-05, "loss": 0.3042, "num_input_tokens_seen": 8162496, "step": 3770 }, { "epoch": 0.6158238172920065, "grad_norm": 1.3060961961746216, "learning_rate": 3.078303425774878e-05, "loss": 0.1619, "num_input_tokens_seen": 8173664, "step": 3775 }, { "epoch": 0.6166394779771615, "grad_norm": 0.3549754321575165, "learning_rate": 3.082381729200653e-05, "loss": 0.2705, "num_input_tokens_seen": 8184512, "step": 3780 }, { "epoch": 0.6174551386623165, "grad_norm": 5.725491523742676, "learning_rate": 3.086460032626427e-05, "loss": 0.2719, "num_input_tokens_seen": 8194432, "step": 3785 }, { "epoch": 0.6182707993474714, "grad_norm": 5.004452705383301, "learning_rate": 3.090538336052202e-05, "loss": 0.095, "num_input_tokens_seen": 8205152, "step": 3790 }, { "epoch": 0.6190864600326265, "grad_norm": 0.28644075989723206, "learning_rate": 3.0946166394779775e-05, "loss": 0.1491, "num_input_tokens_seen": 8215360, "step": 3795 }, { "epoch": 0.6199021207177814, "grad_norm": 0.6890418529510498, "learning_rate": 3.098694942903752e-05, "loss": 0.0998, "num_input_tokens_seen": 8225216, "step": 3800 }, { "epoch": 0.6207177814029364, "grad_norm": 6.75236701965332, "learning_rate": 3.102773246329527e-05, "loss": 0.1457, "num_input_tokens_seen": 8235488, "step": 3805 }, { "epoch": 0.6215334420880914, "grad_norm": 1.7343748807907104, "learning_rate": 3.106851549755302e-05, "loss": 0.1449, "num_input_tokens_seen": 8246208, "step": 3810 }, { "epoch": 0.6223491027732463, "grad_norm": 2.9255847930908203, "learning_rate": 3.1109298531810766e-05, "loss": 0.1294, "num_input_tokens_seen": 8258048, "step": 3815 }, { "epoch": 0.6231647634584013, "grad_norm": 5.216608047485352, "learning_rate": 3.115008156606852e-05, "loss": 0.0629, "num_input_tokens_seen": 8269472, "step": 3820 }, { "epoch": 0.6239804241435563, "grad_norm": 7.773916721343994, "learning_rate": 3.119086460032626e-05, "loss": 0.4178, "num_input_tokens_seen": 8280512, "step": 3825 }, { "epoch": 0.6247960848287113, "grad_norm": 0.6229372620582581, "learning_rate": 3.1231647634584016e-05, "loss": 0.1706, "num_input_tokens_seen": 8291360, "step": 3830 }, { "epoch": 0.6256117455138662, "grad_norm": 0.06130954995751381, "learning_rate": 3.127243066884176e-05, "loss": 0.0877, "num_input_tokens_seen": 8301440, "step": 3835 }, { "epoch": 0.6264274061990212, "grad_norm": 5.62656831741333, "learning_rate": 3.131321370309952e-05, "loss": 0.3183, "num_input_tokens_seen": 8313472, "step": 3840 }, { "epoch": 0.6272430668841762, "grad_norm": 0.0833306834101677, "learning_rate": 3.135399673735726e-05, "loss": 0.0129, "num_input_tokens_seen": 8323488, "step": 3845 }, { "epoch": 0.6280587275693311, "grad_norm": 0.41827625036239624, "learning_rate": 3.1394779771615006e-05, "loss": 0.0189, "num_input_tokens_seen": 8336096, "step": 3850 }, { "epoch": 0.6288743882544862, "grad_norm": 6.341390132904053, "learning_rate": 3.143556280587276e-05, "loss": 0.2367, "num_input_tokens_seen": 8344736, "step": 3855 }, { "epoch": 0.6296900489396411, "grad_norm": 0.191243514418602, "learning_rate": 3.147634584013051e-05, "loss": 0.0972, "num_input_tokens_seen": 8356416, "step": 3860 }, { "epoch": 0.6305057096247961, "grad_norm": 0.08787572383880615, "learning_rate": 3.1517128874388256e-05, "loss": 0.1867, "num_input_tokens_seen": 8367712, "step": 3865 }, { "epoch": 0.6313213703099511, "grad_norm": 0.05379727482795715, "learning_rate": 3.1557911908646004e-05, "loss": 0.2884, "num_input_tokens_seen": 8378752, "step": 3870 }, { "epoch": 0.632137030995106, "grad_norm": 2.846280097961426, "learning_rate": 3.159869494290376e-05, "loss": 0.2594, "num_input_tokens_seen": 8389472, "step": 3875 }, { "epoch": 0.632952691680261, "grad_norm": 0.4946746528148651, "learning_rate": 3.1639477977161506e-05, "loss": 0.1907, "num_input_tokens_seen": 8400064, "step": 3880 }, { "epoch": 0.633768352365416, "grad_norm": 1.0731377601623535, "learning_rate": 3.1680261011419246e-05, "loss": 0.1228, "num_input_tokens_seen": 8410176, "step": 3885 }, { "epoch": 0.634584013050571, "grad_norm": 2.3902738094329834, "learning_rate": 3.1721044045677e-05, "loss": 0.4067, "num_input_tokens_seen": 8420256, "step": 3890 }, { "epoch": 0.6353996737357259, "grad_norm": 0.821153461933136, "learning_rate": 3.176182707993475e-05, "loss": 0.1574, "num_input_tokens_seen": 8431808, "step": 3895 }, { "epoch": 0.636215334420881, "grad_norm": 1.441624641418457, "learning_rate": 3.1802610114192496e-05, "loss": 0.0372, "num_input_tokens_seen": 8444064, "step": 3900 }, { "epoch": 0.6370309951060359, "grad_norm": 3.8657689094543457, "learning_rate": 3.1843393148450244e-05, "loss": 0.2786, "num_input_tokens_seen": 8454944, "step": 3905 }, { "epoch": 0.6378466557911908, "grad_norm": 0.2954348921775818, "learning_rate": 3.1884176182708e-05, "loss": 0.0424, "num_input_tokens_seen": 8466208, "step": 3910 }, { "epoch": 0.6386623164763459, "grad_norm": 0.3389508128166199, "learning_rate": 3.1924959216965746e-05, "loss": 0.1062, "num_input_tokens_seen": 8476480, "step": 3915 }, { "epoch": 0.6394779771615008, "grad_norm": 4.907018661499023, "learning_rate": 3.1965742251223494e-05, "loss": 0.1585, "num_input_tokens_seen": 8486688, "step": 3920 }, { "epoch": 0.6402936378466558, "grad_norm": 0.17697042226791382, "learning_rate": 3.200652528548124e-05, "loss": 0.1982, "num_input_tokens_seen": 8499200, "step": 3925 }, { "epoch": 0.6411092985318108, "grad_norm": 1.9519888162612915, "learning_rate": 3.204730831973899e-05, "loss": 0.1257, "num_input_tokens_seen": 8509728, "step": 3930 }, { "epoch": 0.6419249592169658, "grad_norm": 7.020340442657471, "learning_rate": 3.208809135399674e-05, "loss": 0.1737, "num_input_tokens_seen": 8520928, "step": 3935 }, { "epoch": 0.6427406199021207, "grad_norm": 0.11750974506139755, "learning_rate": 3.2128874388254484e-05, "loss": 0.1452, "num_input_tokens_seen": 8532448, "step": 3940 }, { "epoch": 0.6435562805872757, "grad_norm": 0.19156263768672943, "learning_rate": 3.216965742251223e-05, "loss": 0.1432, "num_input_tokens_seen": 8543360, "step": 3945 }, { "epoch": 0.6443719412724307, "grad_norm": 1.1783024072647095, "learning_rate": 3.2210440456769986e-05, "loss": 0.3555, "num_input_tokens_seen": 8554176, "step": 3950 }, { "epoch": 0.6451876019575856, "grad_norm": 3.0158650875091553, "learning_rate": 3.2251223491027734e-05, "loss": 0.1813, "num_input_tokens_seen": 8565376, "step": 3955 }, { "epoch": 0.6460032626427407, "grad_norm": 0.21972206234931946, "learning_rate": 3.229200652528548e-05, "loss": 0.333, "num_input_tokens_seen": 8576480, "step": 3960 }, { "epoch": 0.6468189233278956, "grad_norm": 0.046773120760917664, "learning_rate": 3.233278955954323e-05, "loss": 0.1075, "num_input_tokens_seen": 8586944, "step": 3965 }, { "epoch": 0.6476345840130505, "grad_norm": 0.25712352991104126, "learning_rate": 3.2373572593800984e-05, "loss": 0.1852, "num_input_tokens_seen": 8596640, "step": 3970 }, { "epoch": 0.6484502446982056, "grad_norm": 0.24655215442180634, "learning_rate": 3.241435562805873e-05, "loss": 0.0594, "num_input_tokens_seen": 8608256, "step": 3975 }, { "epoch": 0.6492659053833605, "grad_norm": 1.8580498695373535, "learning_rate": 3.245513866231648e-05, "loss": 0.182, "num_input_tokens_seen": 8618752, "step": 3980 }, { "epoch": 0.6500815660685155, "grad_norm": 0.043817225843667984, "learning_rate": 3.2495921696574226e-05, "loss": 0.2053, "num_input_tokens_seen": 8629824, "step": 3985 }, { "epoch": 0.6508972267536705, "grad_norm": 0.09651307761669159, "learning_rate": 3.2536704730831974e-05, "loss": 0.2543, "num_input_tokens_seen": 8640832, "step": 3990 }, { "epoch": 0.6517128874388255, "grad_norm": 0.22825945913791656, "learning_rate": 3.257748776508973e-05, "loss": 0.25, "num_input_tokens_seen": 8651104, "step": 3995 }, { "epoch": 0.6525285481239804, "grad_norm": 6.691067218780518, "learning_rate": 3.261827079934747e-05, "loss": 0.2699, "num_input_tokens_seen": 8662112, "step": 4000 }, { "epoch": 0.6533442088091354, "grad_norm": 3.487319231033325, "learning_rate": 3.2659053833605224e-05, "loss": 0.0776, "num_input_tokens_seen": 8672960, "step": 4005 }, { "epoch": 0.6541598694942904, "grad_norm": 0.07446221262216568, "learning_rate": 3.269983686786297e-05, "loss": 0.0553, "num_input_tokens_seen": 8683968, "step": 4010 }, { "epoch": 0.6549755301794453, "grad_norm": 5.4846696853637695, "learning_rate": 3.274061990212072e-05, "loss": 0.1836, "num_input_tokens_seen": 8695200, "step": 4015 }, { "epoch": 0.6557911908646004, "grad_norm": 1.5700244903564453, "learning_rate": 3.278140293637847e-05, "loss": 0.2142, "num_input_tokens_seen": 8705408, "step": 4020 }, { "epoch": 0.6566068515497553, "grad_norm": 6.220165729522705, "learning_rate": 3.2822185970636214e-05, "loss": 0.175, "num_input_tokens_seen": 8716128, "step": 4025 }, { "epoch": 0.6574225122349103, "grad_norm": 1.2835204601287842, "learning_rate": 3.286296900489397e-05, "loss": 0.3228, "num_input_tokens_seen": 8727744, "step": 4030 }, { "epoch": 0.6582381729200653, "grad_norm": 1.2002179622650146, "learning_rate": 3.2903752039151716e-05, "loss": 0.221, "num_input_tokens_seen": 8740128, "step": 4035 }, { "epoch": 0.6590538336052202, "grad_norm": 9.06164836883545, "learning_rate": 3.2944535073409464e-05, "loss": 0.2626, "num_input_tokens_seen": 8752320, "step": 4040 }, { "epoch": 0.6598694942903752, "grad_norm": 0.5982420444488525, "learning_rate": 3.298531810766721e-05, "loss": 0.2252, "num_input_tokens_seen": 8762784, "step": 4045 }, { "epoch": 0.6606851549755302, "grad_norm": 0.3971848785877228, "learning_rate": 3.302610114192496e-05, "loss": 0.1462, "num_input_tokens_seen": 8773312, "step": 4050 }, { "epoch": 0.6615008156606852, "grad_norm": 2.4643943309783936, "learning_rate": 3.3066884176182714e-05, "loss": 0.161, "num_input_tokens_seen": 8783744, "step": 4055 }, { "epoch": 0.6623164763458401, "grad_norm": 0.3730051517486572, "learning_rate": 3.3107667210440455e-05, "loss": 0.036, "num_input_tokens_seen": 8795104, "step": 4060 }, { "epoch": 0.6631321370309952, "grad_norm": 0.6188569068908691, "learning_rate": 3.314845024469821e-05, "loss": 0.1303, "num_input_tokens_seen": 8805760, "step": 4065 }, { "epoch": 0.6639477977161501, "grad_norm": 5.813721656799316, "learning_rate": 3.318923327895596e-05, "loss": 0.1185, "num_input_tokens_seen": 8817536, "step": 4070 }, { "epoch": 0.664763458401305, "grad_norm": 2.0859711170196533, "learning_rate": 3.3230016313213704e-05, "loss": 0.0693, "num_input_tokens_seen": 8829184, "step": 4075 }, { "epoch": 0.6655791190864601, "grad_norm": 0.031587276607751846, "learning_rate": 3.327079934747145e-05, "loss": 0.0916, "num_input_tokens_seen": 8839328, "step": 4080 }, { "epoch": 0.666394779771615, "grad_norm": 0.9004303216934204, "learning_rate": 3.33115823817292e-05, "loss": 0.2124, "num_input_tokens_seen": 8850336, "step": 4085 }, { "epoch": 0.66721044045677, "grad_norm": 0.3769245147705078, "learning_rate": 3.3352365415986954e-05, "loss": 0.1446, "num_input_tokens_seen": 8859392, "step": 4090 }, { "epoch": 0.668026101141925, "grad_norm": 0.039969563484191895, "learning_rate": 3.33931484502447e-05, "loss": 0.0521, "num_input_tokens_seen": 8871008, "step": 4095 }, { "epoch": 0.6688417618270799, "grad_norm": 0.047942303121089935, "learning_rate": 3.343393148450245e-05, "loss": 0.1959, "num_input_tokens_seen": 8881760, "step": 4100 }, { "epoch": 0.6696574225122349, "grad_norm": 3.715879201889038, "learning_rate": 3.34747145187602e-05, "loss": 0.3242, "num_input_tokens_seen": 8892544, "step": 4105 }, { "epoch": 0.6704730831973899, "grad_norm": 8.826482772827148, "learning_rate": 3.3515497553017945e-05, "loss": 0.4686, "num_input_tokens_seen": 8903936, "step": 4110 }, { "epoch": 0.6712887438825449, "grad_norm": 0.09291672706604004, "learning_rate": 3.35562805872757e-05, "loss": 0.227, "num_input_tokens_seen": 8913952, "step": 4115 }, { "epoch": 0.6721044045676998, "grad_norm": 3.907045602798462, "learning_rate": 3.359706362153344e-05, "loss": 0.1729, "num_input_tokens_seen": 8925120, "step": 4120 }, { "epoch": 0.6729200652528549, "grad_norm": 0.5516533255577087, "learning_rate": 3.3637846655791194e-05, "loss": 0.199, "num_input_tokens_seen": 8935872, "step": 4125 }, { "epoch": 0.6737357259380098, "grad_norm": 0.08874432742595673, "learning_rate": 3.367862969004894e-05, "loss": 0.1666, "num_input_tokens_seen": 8946624, "step": 4130 }, { "epoch": 0.6745513866231647, "grad_norm": 0.21990741789340973, "learning_rate": 3.371941272430669e-05, "loss": 0.1323, "num_input_tokens_seen": 8957568, "step": 4135 }, { "epoch": 0.6753670473083198, "grad_norm": 4.526270389556885, "learning_rate": 3.376019575856444e-05, "loss": 0.2651, "num_input_tokens_seen": 8968160, "step": 4140 }, { "epoch": 0.6761827079934747, "grad_norm": 0.9621414542198181, "learning_rate": 3.3800978792822185e-05, "loss": 0.1044, "num_input_tokens_seen": 8978560, "step": 4145 }, { "epoch": 0.6769983686786297, "grad_norm": 8.423593521118164, "learning_rate": 3.384176182707994e-05, "loss": 0.2068, "num_input_tokens_seen": 8988896, "step": 4150 }, { "epoch": 0.6778140293637847, "grad_norm": 6.477504730224609, "learning_rate": 3.388254486133769e-05, "loss": 0.2755, "num_input_tokens_seen": 9000928, "step": 4155 }, { "epoch": 0.6786296900489397, "grad_norm": 0.6715103983879089, "learning_rate": 3.3923327895595435e-05, "loss": 0.0682, "num_input_tokens_seen": 9011712, "step": 4160 }, { "epoch": 0.6794453507340946, "grad_norm": 6.027068614959717, "learning_rate": 3.396411092985318e-05, "loss": 0.1917, "num_input_tokens_seen": 9022624, "step": 4165 }, { "epoch": 0.6802610114192496, "grad_norm": 0.04681265354156494, "learning_rate": 3.400489396411093e-05, "loss": 0.2185, "num_input_tokens_seen": 9033088, "step": 4170 }, { "epoch": 0.6810766721044046, "grad_norm": 8.601092338562012, "learning_rate": 3.404567699836868e-05, "loss": 0.3408, "num_input_tokens_seen": 9043680, "step": 4175 }, { "epoch": 0.6818923327895595, "grad_norm": 2.7318994998931885, "learning_rate": 3.4086460032626425e-05, "loss": 0.1675, "num_input_tokens_seen": 9055424, "step": 4180 }, { "epoch": 0.6827079934747146, "grad_norm": 4.463717937469482, "learning_rate": 3.412724306688418e-05, "loss": 0.0983, "num_input_tokens_seen": 9066816, "step": 4185 }, { "epoch": 0.6835236541598695, "grad_norm": 0.1789654791355133, "learning_rate": 3.416802610114193e-05, "loss": 0.2268, "num_input_tokens_seen": 9077792, "step": 4190 }, { "epoch": 0.6843393148450244, "grad_norm": 0.3223755359649658, "learning_rate": 3.4208809135399675e-05, "loss": 0.3277, "num_input_tokens_seen": 9088160, "step": 4195 }, { "epoch": 0.6851549755301795, "grad_norm": 0.13708998262882233, "learning_rate": 3.424959216965742e-05, "loss": 0.0267, "num_input_tokens_seen": 9099264, "step": 4200 }, { "epoch": 0.6859706362153344, "grad_norm": 4.07029914855957, "learning_rate": 3.429037520391517e-05, "loss": 0.2568, "num_input_tokens_seen": 9110336, "step": 4205 }, { "epoch": 0.6867862969004894, "grad_norm": 7.171268939971924, "learning_rate": 3.4331158238172925e-05, "loss": 0.0953, "num_input_tokens_seen": 9121248, "step": 4210 }, { "epoch": 0.6876019575856444, "grad_norm": 5.684694290161133, "learning_rate": 3.4371941272430666e-05, "loss": 0.0785, "num_input_tokens_seen": 9132896, "step": 4215 }, { "epoch": 0.6884176182707994, "grad_norm": 0.05936404690146446, "learning_rate": 3.441272430668842e-05, "loss": 0.0886, "num_input_tokens_seen": 9143968, "step": 4220 }, { "epoch": 0.6892332789559543, "grad_norm": 0.088518887758255, "learning_rate": 3.445350734094617e-05, "loss": 0.1126, "num_input_tokens_seen": 9154848, "step": 4225 }, { "epoch": 0.6900489396411092, "grad_norm": 0.05748211219906807, "learning_rate": 3.449429037520392e-05, "loss": 0.1716, "num_input_tokens_seen": 9167264, "step": 4230 }, { "epoch": 0.6908646003262643, "grad_norm": 0.12895634770393372, "learning_rate": 3.453507340946166e-05, "loss": 0.0454, "num_input_tokens_seen": 9178016, "step": 4235 }, { "epoch": 0.6916802610114192, "grad_norm": 0.31009072065353394, "learning_rate": 3.457585644371941e-05, "loss": 0.1187, "num_input_tokens_seen": 9188224, "step": 4240 }, { "epoch": 0.6924959216965743, "grad_norm": 0.01949448138475418, "learning_rate": 3.4616639477977165e-05, "loss": 0.0353, "num_input_tokens_seen": 9198208, "step": 4245 }, { "epoch": 0.6933115823817292, "grad_norm": 0.3380926251411438, "learning_rate": 3.465742251223491e-05, "loss": 0.1166, "num_input_tokens_seen": 9208128, "step": 4250 }, { "epoch": 0.6941272430668842, "grad_norm": 0.0255452711135149, "learning_rate": 3.469820554649266e-05, "loss": 0.0684, "num_input_tokens_seen": 9218080, "step": 4255 }, { "epoch": 0.6949429037520392, "grad_norm": 0.8182644844055176, "learning_rate": 3.473898858075041e-05, "loss": 0.0126, "num_input_tokens_seen": 9228416, "step": 4260 }, { "epoch": 0.6957585644371941, "grad_norm": 5.6626715660095215, "learning_rate": 3.477977161500816e-05, "loss": 0.3426, "num_input_tokens_seen": 9239296, "step": 4265 }, { "epoch": 0.6965742251223491, "grad_norm": 0.26666080951690674, "learning_rate": 3.482055464926591e-05, "loss": 0.3668, "num_input_tokens_seen": 9249472, "step": 4270 }, { "epoch": 0.697389885807504, "grad_norm": 4.0731682777404785, "learning_rate": 3.486133768352365e-05, "loss": 0.1809, "num_input_tokens_seen": 9259616, "step": 4275 }, { "epoch": 0.6982055464926591, "grad_norm": 0.7779442071914673, "learning_rate": 3.4902120717781405e-05, "loss": 0.0951, "num_input_tokens_seen": 9271360, "step": 4280 }, { "epoch": 0.699021207177814, "grad_norm": 0.9017668962478638, "learning_rate": 3.494290375203915e-05, "loss": 0.1988, "num_input_tokens_seen": 9281952, "step": 4285 }, { "epoch": 0.6998368678629691, "grad_norm": 8.216322898864746, "learning_rate": 3.498368678629691e-05, "loss": 0.2337, "num_input_tokens_seen": 9292480, "step": 4290 }, { "epoch": 0.700652528548124, "grad_norm": 0.034616608172655106, "learning_rate": 3.502446982055465e-05, "loss": 0.422, "num_input_tokens_seen": 9301696, "step": 4295 }, { "epoch": 0.7014681892332789, "grad_norm": 1.7564533948898315, "learning_rate": 3.5065252854812396e-05, "loss": 0.063, "num_input_tokens_seen": 9311360, "step": 4300 }, { "epoch": 0.702283849918434, "grad_norm": 5.38205623626709, "learning_rate": 3.510603588907015e-05, "loss": 0.2506, "num_input_tokens_seen": 9321504, "step": 4305 }, { "epoch": 0.7030995106035889, "grad_norm": 1.9961726665496826, "learning_rate": 3.51468189233279e-05, "loss": 0.2344, "num_input_tokens_seen": 9333024, "step": 4310 }, { "epoch": 0.7039151712887439, "grad_norm": 4.119628429412842, "learning_rate": 3.5187601957585646e-05, "loss": 0.1086, "num_input_tokens_seen": 9343424, "step": 4315 }, { "epoch": 0.7047308319738989, "grad_norm": 0.02915322594344616, "learning_rate": 3.522838499184339e-05, "loss": 0.0216, "num_input_tokens_seen": 9354016, "step": 4320 }, { "epoch": 0.7055464926590538, "grad_norm": 0.06092199683189392, "learning_rate": 3.526916802610115e-05, "loss": 0.1912, "num_input_tokens_seen": 9365600, "step": 4325 }, { "epoch": 0.7063621533442088, "grad_norm": 0.24240359663963318, "learning_rate": 3.5309951060358895e-05, "loss": 0.2048, "num_input_tokens_seen": 9377216, "step": 4330 }, { "epoch": 0.7071778140293637, "grad_norm": 1.4081376791000366, "learning_rate": 3.5350734094616636e-05, "loss": 0.0669, "num_input_tokens_seen": 9388000, "step": 4335 }, { "epoch": 0.7079934747145188, "grad_norm": 4.129550933837891, "learning_rate": 3.539151712887439e-05, "loss": 0.2263, "num_input_tokens_seen": 9399616, "step": 4340 }, { "epoch": 0.7088091353996737, "grad_norm": 3.214871883392334, "learning_rate": 3.543230016313214e-05, "loss": 0.1523, "num_input_tokens_seen": 9411872, "step": 4345 }, { "epoch": 0.7096247960848288, "grad_norm": 1.9875812530517578, "learning_rate": 3.5473083197389886e-05, "loss": 0.2107, "num_input_tokens_seen": 9422912, "step": 4350 }, { "epoch": 0.7104404567699837, "grad_norm": 2.3759799003601074, "learning_rate": 3.5513866231647634e-05, "loss": 0.0238, "num_input_tokens_seen": 9434176, "step": 4355 }, { "epoch": 0.7112561174551386, "grad_norm": 1.6661633253097534, "learning_rate": 3.555464926590539e-05, "loss": 0.1262, "num_input_tokens_seen": 9443936, "step": 4360 }, { "epoch": 0.7120717781402937, "grad_norm": 0.21446746587753296, "learning_rate": 3.5595432300163136e-05, "loss": 0.1062, "num_input_tokens_seen": 9453984, "step": 4365 }, { "epoch": 0.7128874388254486, "grad_norm": 0.376142680644989, "learning_rate": 3.563621533442088e-05, "loss": 0.2228, "num_input_tokens_seen": 9465408, "step": 4370 }, { "epoch": 0.7137030995106036, "grad_norm": 0.8069067001342773, "learning_rate": 3.567699836867863e-05, "loss": 0.175, "num_input_tokens_seen": 9475392, "step": 4375 }, { "epoch": 0.7145187601957586, "grad_norm": 4.707369327545166, "learning_rate": 3.571778140293638e-05, "loss": 0.2439, "num_input_tokens_seen": 9485280, "step": 4380 }, { "epoch": 0.7153344208809136, "grad_norm": 1.5703744888305664, "learning_rate": 3.575856443719413e-05, "loss": 0.0592, "num_input_tokens_seen": 9496224, "step": 4385 }, { "epoch": 0.7161500815660685, "grad_norm": 1.8300573825836182, "learning_rate": 3.579934747145188e-05, "loss": 0.1236, "num_input_tokens_seen": 9507456, "step": 4390 }, { "epoch": 0.7169657422512234, "grad_norm": 6.576985836029053, "learning_rate": 3.584013050570963e-05, "loss": 0.3141, "num_input_tokens_seen": 9517952, "step": 4395 }, { "epoch": 0.7177814029363785, "grad_norm": 4.5055718421936035, "learning_rate": 3.5880913539967376e-05, "loss": 0.1647, "num_input_tokens_seen": 9528672, "step": 4400 }, { "epoch": 0.7185970636215334, "grad_norm": 0.5434504151344299, "learning_rate": 3.5921696574225124e-05, "loss": 0.0497, "num_input_tokens_seen": 9539552, "step": 4405 }, { "epoch": 0.7194127243066885, "grad_norm": 3.5970358848571777, "learning_rate": 3.596247960848287e-05, "loss": 0.3283, "num_input_tokens_seen": 9550112, "step": 4410 }, { "epoch": 0.7202283849918434, "grad_norm": 0.43845248222351074, "learning_rate": 3.600326264274062e-05, "loss": 0.2055, "num_input_tokens_seen": 9560544, "step": 4415 }, { "epoch": 0.7210440456769984, "grad_norm": 9.78809928894043, "learning_rate": 3.604404567699837e-05, "loss": 0.3818, "num_input_tokens_seen": 9571232, "step": 4420 }, { "epoch": 0.7218597063621534, "grad_norm": 0.12565630674362183, "learning_rate": 3.608482871125612e-05, "loss": 0.0643, "num_input_tokens_seen": 9581920, "step": 4425 }, { "epoch": 0.7226753670473083, "grad_norm": 0.7135125994682312, "learning_rate": 3.612561174551387e-05, "loss": 0.0689, "num_input_tokens_seen": 9592448, "step": 4430 }, { "epoch": 0.7234910277324633, "grad_norm": 0.40397709608078003, "learning_rate": 3.6166394779771616e-05, "loss": 0.1868, "num_input_tokens_seen": 9603360, "step": 4435 }, { "epoch": 0.7243066884176182, "grad_norm": 2.0303561687469482, "learning_rate": 3.6207177814029364e-05, "loss": 0.1742, "num_input_tokens_seen": 9614336, "step": 4440 }, { "epoch": 0.7251223491027733, "grad_norm": 0.05959249660372734, "learning_rate": 3.624796084828712e-05, "loss": 0.065, "num_input_tokens_seen": 9624512, "step": 4445 }, { "epoch": 0.7259380097879282, "grad_norm": 0.4840633273124695, "learning_rate": 3.628874388254486e-05, "loss": 0.1762, "num_input_tokens_seen": 9635168, "step": 4450 }, { "epoch": 0.7267536704730831, "grad_norm": 4.319450855255127, "learning_rate": 3.6329526916802614e-05, "loss": 0.2741, "num_input_tokens_seen": 9645280, "step": 4455 }, { "epoch": 0.7275693311582382, "grad_norm": 3.6525118350982666, "learning_rate": 3.637030995106036e-05, "loss": 0.0815, "num_input_tokens_seen": 9656032, "step": 4460 }, { "epoch": 0.7283849918433931, "grad_norm": 0.30015629529953003, "learning_rate": 3.641109298531811e-05, "loss": 0.2921, "num_input_tokens_seen": 9667968, "step": 4465 }, { "epoch": 0.7292006525285482, "grad_norm": 1.6846249103546143, "learning_rate": 3.6451876019575856e-05, "loss": 0.0801, "num_input_tokens_seen": 9680032, "step": 4470 }, { "epoch": 0.7300163132137031, "grad_norm": 4.6199116706848145, "learning_rate": 3.6492659053833604e-05, "loss": 0.2839, "num_input_tokens_seen": 9691424, "step": 4475 }, { "epoch": 0.7308319738988581, "grad_norm": 4.570083141326904, "learning_rate": 3.653344208809136e-05, "loss": 0.2675, "num_input_tokens_seen": 9702400, "step": 4480 }, { "epoch": 0.731647634584013, "grad_norm": 0.24840693175792694, "learning_rate": 3.6574225122349106e-05, "loss": 0.0725, "num_input_tokens_seen": 9712928, "step": 4485 }, { "epoch": 0.732463295269168, "grad_norm": 5.6841630935668945, "learning_rate": 3.6615008156606854e-05, "loss": 0.5455, "num_input_tokens_seen": 9723360, "step": 4490 }, { "epoch": 0.733278955954323, "grad_norm": 1.1254421472549438, "learning_rate": 3.66557911908646e-05, "loss": 0.0408, "num_input_tokens_seen": 9734496, "step": 4495 }, { "epoch": 0.734094616639478, "grad_norm": 5.724114418029785, "learning_rate": 3.669657422512235e-05, "loss": 0.2773, "num_input_tokens_seen": 9744288, "step": 4500 }, { "epoch": 0.734910277324633, "grad_norm": 5.264193534851074, "learning_rate": 3.6737357259380104e-05, "loss": 0.2471, "num_input_tokens_seen": 9755072, "step": 4505 }, { "epoch": 0.7357259380097879, "grad_norm": 0.7798420786857605, "learning_rate": 3.6778140293637844e-05, "loss": 0.101, "num_input_tokens_seen": 9764192, "step": 4510 }, { "epoch": 0.736541598694943, "grad_norm": 6.110246658325195, "learning_rate": 3.68189233278956e-05, "loss": 0.3526, "num_input_tokens_seen": 9775392, "step": 4515 }, { "epoch": 0.7373572593800979, "grad_norm": 1.7233092784881592, "learning_rate": 3.6859706362153346e-05, "loss": 0.0899, "num_input_tokens_seen": 9784864, "step": 4520 }, { "epoch": 0.7381729200652528, "grad_norm": 1.033625602722168, "learning_rate": 3.6900489396411094e-05, "loss": 0.067, "num_input_tokens_seen": 9796512, "step": 4525 }, { "epoch": 0.7389885807504079, "grad_norm": 1.708029866218567, "learning_rate": 3.694127243066884e-05, "loss": 0.0855, "num_input_tokens_seen": 9808416, "step": 4530 }, { "epoch": 0.7398042414355628, "grad_norm": 5.670646667480469, "learning_rate": 3.698205546492659e-05, "loss": 0.1154, "num_input_tokens_seen": 9819552, "step": 4535 }, { "epoch": 0.7406199021207178, "grad_norm": 0.8500028252601624, "learning_rate": 3.7022838499184344e-05, "loss": 0.0779, "num_input_tokens_seen": 9831168, "step": 4540 }, { "epoch": 0.7414355628058727, "grad_norm": 5.93217134475708, "learning_rate": 3.706362153344209e-05, "loss": 0.0827, "num_input_tokens_seen": 9841280, "step": 4545 }, { "epoch": 0.7422512234910277, "grad_norm": 1.6935220956802368, "learning_rate": 3.710440456769984e-05, "loss": 0.166, "num_input_tokens_seen": 9852224, "step": 4550 }, { "epoch": 0.7430668841761827, "grad_norm": 0.8768059015274048, "learning_rate": 3.714518760195759e-05, "loss": 0.3636, "num_input_tokens_seen": 9862656, "step": 4555 }, { "epoch": 0.7438825448613376, "grad_norm": 0.26617231965065, "learning_rate": 3.7185970636215334e-05, "loss": 0.0367, "num_input_tokens_seen": 9874304, "step": 4560 }, { "epoch": 0.7446982055464927, "grad_norm": 0.44966986775398254, "learning_rate": 3.722675367047309e-05, "loss": 0.0693, "num_input_tokens_seen": 9884128, "step": 4565 }, { "epoch": 0.7455138662316476, "grad_norm": 7.610336780548096, "learning_rate": 3.726753670473083e-05, "loss": 0.173, "num_input_tokens_seen": 9894208, "step": 4570 }, { "epoch": 0.7463295269168027, "grad_norm": 4.594542026519775, "learning_rate": 3.7308319738988584e-05, "loss": 0.1771, "num_input_tokens_seen": 9904864, "step": 4575 }, { "epoch": 0.7471451876019576, "grad_norm": 0.7763423323631287, "learning_rate": 3.734910277324633e-05, "loss": 0.17, "num_input_tokens_seen": 9915904, "step": 4580 }, { "epoch": 0.7479608482871125, "grad_norm": 0.045461248606443405, "learning_rate": 3.738988580750408e-05, "loss": 0.2818, "num_input_tokens_seen": 9926048, "step": 4585 }, { "epoch": 0.7487765089722676, "grad_norm": 0.1534917950630188, "learning_rate": 3.743066884176183e-05, "loss": 0.0633, "num_input_tokens_seen": 9936736, "step": 4590 }, { "epoch": 0.7495921696574225, "grad_norm": 3.768371820449829, "learning_rate": 3.7471451876019575e-05, "loss": 0.1892, "num_input_tokens_seen": 9949408, "step": 4595 }, { "epoch": 0.7504078303425775, "grad_norm": 3.5366828441619873, "learning_rate": 3.751223491027733e-05, "loss": 0.1483, "num_input_tokens_seen": 9960160, "step": 4600 }, { "epoch": 0.7512234910277324, "grad_norm": 5.063387393951416, "learning_rate": 3.755301794453508e-05, "loss": 0.2159, "num_input_tokens_seen": 9970624, "step": 4605 }, { "epoch": 0.7520391517128875, "grad_norm": 1.4474318027496338, "learning_rate": 3.7593800978792824e-05, "loss": 0.0649, "num_input_tokens_seen": 9982080, "step": 4610 }, { "epoch": 0.7528548123980424, "grad_norm": 0.2901652157306671, "learning_rate": 3.763458401305057e-05, "loss": 0.005, "num_input_tokens_seen": 9992480, "step": 4615 }, { "epoch": 0.7536704730831973, "grad_norm": 0.274383544921875, "learning_rate": 3.7675367047308326e-05, "loss": 0.1726, "num_input_tokens_seen": 10002496, "step": 4620 }, { "epoch": 0.7544861337683524, "grad_norm": 6.004702091217041, "learning_rate": 3.771615008156607e-05, "loss": 0.3612, "num_input_tokens_seen": 10013408, "step": 4625 }, { "epoch": 0.7553017944535073, "grad_norm": 1.2919461727142334, "learning_rate": 3.7756933115823815e-05, "loss": 0.1936, "num_input_tokens_seen": 10022880, "step": 4630 }, { "epoch": 0.7561174551386624, "grad_norm": 4.892871379852295, "learning_rate": 3.779771615008157e-05, "loss": 0.1224, "num_input_tokens_seen": 10034208, "step": 4635 }, { "epoch": 0.7569331158238173, "grad_norm": 2.343761682510376, "learning_rate": 3.783849918433932e-05, "loss": 0.1395, "num_input_tokens_seen": 10044832, "step": 4640 }, { "epoch": 0.7577487765089723, "grad_norm": 0.14088532328605652, "learning_rate": 3.7879282218597065e-05, "loss": 0.2024, "num_input_tokens_seen": 10055712, "step": 4645 }, { "epoch": 0.7585644371941273, "grad_norm": 3.9574947357177734, "learning_rate": 3.792006525285481e-05, "loss": 0.2083, "num_input_tokens_seen": 10066976, "step": 4650 }, { "epoch": 0.7593800978792822, "grad_norm": 2.6159284114837646, "learning_rate": 3.796084828711256e-05, "loss": 0.2117, "num_input_tokens_seen": 10078304, "step": 4655 }, { "epoch": 0.7601957585644372, "grad_norm": 9.170280456542969, "learning_rate": 3.8001631321370314e-05, "loss": 0.4091, "num_input_tokens_seen": 10089056, "step": 4660 }, { "epoch": 0.7610114192495921, "grad_norm": 0.13579274713993073, "learning_rate": 3.804241435562806e-05, "loss": 0.2127, "num_input_tokens_seen": 10099904, "step": 4665 }, { "epoch": 0.7618270799347472, "grad_norm": 2.140343189239502, "learning_rate": 3.808319738988581e-05, "loss": 0.0377, "num_input_tokens_seen": 10111552, "step": 4670 }, { "epoch": 0.7626427406199021, "grad_norm": 4.381788730621338, "learning_rate": 3.812398042414356e-05, "loss": 0.331, "num_input_tokens_seen": 10121568, "step": 4675 }, { "epoch": 0.763458401305057, "grad_norm": 1.919498085975647, "learning_rate": 3.816476345840131e-05, "loss": 0.2048, "num_input_tokens_seen": 10132000, "step": 4680 }, { "epoch": 0.7642740619902121, "grad_norm": 4.956511497497559, "learning_rate": 3.820554649265905e-05, "loss": 0.3375, "num_input_tokens_seen": 10141952, "step": 4685 }, { "epoch": 0.765089722675367, "grad_norm": 0.4133996069431305, "learning_rate": 3.82463295269168e-05, "loss": 0.1261, "num_input_tokens_seen": 10151776, "step": 4690 }, { "epoch": 0.765905383360522, "grad_norm": 1.9626580476760864, "learning_rate": 3.8287112561174555e-05, "loss": 0.1644, "num_input_tokens_seen": 10162688, "step": 4695 }, { "epoch": 0.766721044045677, "grad_norm": 0.1816297173500061, "learning_rate": 3.83278955954323e-05, "loss": 0.0105, "num_input_tokens_seen": 10174624, "step": 4700 }, { "epoch": 0.767536704730832, "grad_norm": 0.3867439031600952, "learning_rate": 3.836867862969005e-05, "loss": 0.0274, "num_input_tokens_seen": 10185248, "step": 4705 }, { "epoch": 0.768352365415987, "grad_norm": 0.06129593402147293, "learning_rate": 3.84094616639478e-05, "loss": 0.0324, "num_input_tokens_seen": 10196768, "step": 4710 }, { "epoch": 0.7691680261011419, "grad_norm": 0.291681170463562, "learning_rate": 3.845024469820555e-05, "loss": 0.0494, "num_input_tokens_seen": 10206816, "step": 4715 }, { "epoch": 0.7699836867862969, "grad_norm": 0.4279186725616455, "learning_rate": 3.84910277324633e-05, "loss": 0.061, "num_input_tokens_seen": 10217600, "step": 4720 }, { "epoch": 0.7707993474714518, "grad_norm": 0.767796516418457, "learning_rate": 3.853181076672104e-05, "loss": 0.2075, "num_input_tokens_seen": 10228544, "step": 4725 }, { "epoch": 0.7716150081566069, "grad_norm": 0.10908225923776627, "learning_rate": 3.8572593800978795e-05, "loss": 0.102, "num_input_tokens_seen": 10239168, "step": 4730 }, { "epoch": 0.7724306688417618, "grad_norm": 1.484430193901062, "learning_rate": 3.861337683523654e-05, "loss": 0.1301, "num_input_tokens_seen": 10250784, "step": 4735 }, { "epoch": 0.7732463295269169, "grad_norm": 4.065375328063965, "learning_rate": 3.86541598694943e-05, "loss": 0.1236, "num_input_tokens_seen": 10260480, "step": 4740 }, { "epoch": 0.7740619902120718, "grad_norm": 4.384725570678711, "learning_rate": 3.869494290375204e-05, "loss": 0.2311, "num_input_tokens_seen": 10272064, "step": 4745 }, { "epoch": 0.7748776508972267, "grad_norm": 0.06715980172157288, "learning_rate": 3.873572593800979e-05, "loss": 0.0408, "num_input_tokens_seen": 10281472, "step": 4750 }, { "epoch": 0.7756933115823818, "grad_norm": 1.4746594429016113, "learning_rate": 3.877650897226754e-05, "loss": 0.1715, "num_input_tokens_seen": 10293536, "step": 4755 }, { "epoch": 0.7765089722675367, "grad_norm": 0.1626254916191101, "learning_rate": 3.881729200652529e-05, "loss": 0.1153, "num_input_tokens_seen": 10303968, "step": 4760 }, { "epoch": 0.7773246329526917, "grad_norm": 0.15786142647266388, "learning_rate": 3.8858075040783035e-05, "loss": 0.1055, "num_input_tokens_seen": 10316064, "step": 4765 }, { "epoch": 0.7781402936378466, "grad_norm": 0.27948567271232605, "learning_rate": 3.889885807504078e-05, "loss": 0.1204, "num_input_tokens_seen": 10327328, "step": 4770 }, { "epoch": 0.7789559543230016, "grad_norm": 6.373471736907959, "learning_rate": 3.893964110929854e-05, "loss": 0.3351, "num_input_tokens_seen": 10337088, "step": 4775 }, { "epoch": 0.7797716150081566, "grad_norm": 5.559701442718506, "learning_rate": 3.8980424143556285e-05, "loss": 0.311, "num_input_tokens_seen": 10347488, "step": 4780 }, { "epoch": 0.7805872756933115, "grad_norm": 5.775311470031738, "learning_rate": 3.9021207177814026e-05, "loss": 0.2612, "num_input_tokens_seen": 10358656, "step": 4785 }, { "epoch": 0.7814029363784666, "grad_norm": 2.5803449153900146, "learning_rate": 3.906199021207178e-05, "loss": 0.2102, "num_input_tokens_seen": 10370816, "step": 4790 }, { "epoch": 0.7822185970636215, "grad_norm": 4.448340892791748, "learning_rate": 3.910277324632953e-05, "loss": 0.0847, "num_input_tokens_seen": 10381856, "step": 4795 }, { "epoch": 0.7830342577487766, "grad_norm": 5.02500581741333, "learning_rate": 3.914355628058728e-05, "loss": 0.2397, "num_input_tokens_seen": 10392320, "step": 4800 }, { "epoch": 0.7838499184339315, "grad_norm": 4.225478172302246, "learning_rate": 3.918433931484502e-05, "loss": 0.3016, "num_input_tokens_seen": 10401568, "step": 4805 }, { "epoch": 0.7846655791190864, "grad_norm": 1.1861470937728882, "learning_rate": 3.922512234910278e-05, "loss": 0.2616, "num_input_tokens_seen": 10413248, "step": 4810 }, { "epoch": 0.7854812398042414, "grad_norm": 5.154033660888672, "learning_rate": 3.9265905383360525e-05, "loss": 0.0759, "num_input_tokens_seen": 10423168, "step": 4815 }, { "epoch": 0.7862969004893964, "grad_norm": 0.817079484462738, "learning_rate": 3.930668841761827e-05, "loss": 0.1227, "num_input_tokens_seen": 10433792, "step": 4820 }, { "epoch": 0.7871125611745514, "grad_norm": 3.3402276039123535, "learning_rate": 3.934747145187602e-05, "loss": 0.1032, "num_input_tokens_seen": 10445120, "step": 4825 }, { "epoch": 0.7879282218597063, "grad_norm": 2.4098806381225586, "learning_rate": 3.938825448613377e-05, "loss": 0.1869, "num_input_tokens_seen": 10455584, "step": 4830 }, { "epoch": 0.7887438825448614, "grad_norm": 0.1025315448641777, "learning_rate": 3.942903752039152e-05, "loss": 0.0962, "num_input_tokens_seen": 10466752, "step": 4835 }, { "epoch": 0.7895595432300163, "grad_norm": 3.8288652896881104, "learning_rate": 3.946982055464927e-05, "loss": 0.1866, "num_input_tokens_seen": 10477376, "step": 4840 }, { "epoch": 0.7903752039151712, "grad_norm": 6.3380231857299805, "learning_rate": 3.951060358890702e-05, "loss": 0.1946, "num_input_tokens_seen": 10487392, "step": 4845 }, { "epoch": 0.7911908646003263, "grad_norm": 3.5893802642822266, "learning_rate": 3.9551386623164766e-05, "loss": 0.3604, "num_input_tokens_seen": 10498208, "step": 4850 }, { "epoch": 0.7920065252854812, "grad_norm": 1.9882179498672485, "learning_rate": 3.959216965742251e-05, "loss": 0.1252, "num_input_tokens_seen": 10507200, "step": 4855 }, { "epoch": 0.7928221859706363, "grad_norm": 0.5306270122528076, "learning_rate": 3.963295269168026e-05, "loss": 0.2172, "num_input_tokens_seen": 10516416, "step": 4860 }, { "epoch": 0.7936378466557912, "grad_norm": 3.275817632675171, "learning_rate": 3.967373572593801e-05, "loss": 0.3197, "num_input_tokens_seen": 10527648, "step": 4865 }, { "epoch": 0.7944535073409462, "grad_norm": 1.6505345106124878, "learning_rate": 3.971451876019576e-05, "loss": 0.0284, "num_input_tokens_seen": 10537952, "step": 4870 }, { "epoch": 0.7952691680261011, "grad_norm": 3.008704662322998, "learning_rate": 3.975530179445351e-05, "loss": 0.1331, "num_input_tokens_seen": 10548320, "step": 4875 }, { "epoch": 0.7960848287112561, "grad_norm": 0.39359229803085327, "learning_rate": 3.979608482871126e-05, "loss": 0.0615, "num_input_tokens_seen": 10557824, "step": 4880 }, { "epoch": 0.7969004893964111, "grad_norm": 0.1988229602575302, "learning_rate": 3.9836867862969006e-05, "loss": 0.0253, "num_input_tokens_seen": 10568896, "step": 4885 }, { "epoch": 0.797716150081566, "grad_norm": 3.483168601989746, "learning_rate": 3.9877650897226754e-05, "loss": 0.3658, "num_input_tokens_seen": 10580064, "step": 4890 }, { "epoch": 0.7985318107667211, "grad_norm": 0.05558078736066818, "learning_rate": 3.991843393148451e-05, "loss": 0.158, "num_input_tokens_seen": 10590976, "step": 4895 }, { "epoch": 0.799347471451876, "grad_norm": 4.738973140716553, "learning_rate": 3.995921696574225e-05, "loss": 0.1316, "num_input_tokens_seen": 10600448, "step": 4900 }, { "epoch": 0.8001631321370309, "grad_norm": 2.7792470455169678, "learning_rate": 4e-05, "loss": 0.1654, "num_input_tokens_seen": 10611904, "step": 4905 }, { "epoch": 0.800978792822186, "grad_norm": 1.0185917615890503, "learning_rate": 4.004078303425775e-05, "loss": 0.0977, "num_input_tokens_seen": 10622240, "step": 4910 }, { "epoch": 0.8017944535073409, "grad_norm": 4.466019153594971, "learning_rate": 4.00815660685155e-05, "loss": 0.2974, "num_input_tokens_seen": 10632864, "step": 4915 }, { "epoch": 0.802610114192496, "grad_norm": 0.8997367024421692, "learning_rate": 4.0122349102773246e-05, "loss": 0.0422, "num_input_tokens_seen": 10642464, "step": 4920 }, { "epoch": 0.8034257748776509, "grad_norm": 1.2520920038223267, "learning_rate": 4.0163132137030994e-05, "loss": 0.1862, "num_input_tokens_seen": 10653024, "step": 4925 }, { "epoch": 0.8042414355628059, "grad_norm": 1.0238492488861084, "learning_rate": 4.020391517128875e-05, "loss": 0.1163, "num_input_tokens_seen": 10664384, "step": 4930 }, { "epoch": 0.8050570962479608, "grad_norm": 5.637166976928711, "learning_rate": 4.0244698205546496e-05, "loss": 0.3622, "num_input_tokens_seen": 10675360, "step": 4935 }, { "epoch": 0.8058727569331158, "grad_norm": 0.8354208469390869, "learning_rate": 4.0285481239804244e-05, "loss": 0.1535, "num_input_tokens_seen": 10685824, "step": 4940 }, { "epoch": 0.8066884176182708, "grad_norm": 4.137977600097656, "learning_rate": 4.032626427406199e-05, "loss": 0.2299, "num_input_tokens_seen": 10697536, "step": 4945 }, { "epoch": 0.8075040783034257, "grad_norm": 1.5421990156173706, "learning_rate": 4.036704730831974e-05, "loss": 0.0498, "num_input_tokens_seen": 10709888, "step": 4950 }, { "epoch": 0.8083197389885808, "grad_norm": 0.06120558828115463, "learning_rate": 4.040783034257749e-05, "loss": 0.0596, "num_input_tokens_seen": 10721216, "step": 4955 }, { "epoch": 0.8091353996737357, "grad_norm": 0.17592903971672058, "learning_rate": 4.0448613376835234e-05, "loss": 0.102, "num_input_tokens_seen": 10729952, "step": 4960 }, { "epoch": 0.8099510603588908, "grad_norm": 4.301577568054199, "learning_rate": 4.048939641109299e-05, "loss": 0.1266, "num_input_tokens_seen": 10741216, "step": 4965 }, { "epoch": 0.8107667210440457, "grad_norm": 0.1696639209985733, "learning_rate": 4.0530179445350736e-05, "loss": 0.1888, "num_input_tokens_seen": 10752544, "step": 4970 }, { "epoch": 0.8115823817292006, "grad_norm": 0.501167893409729, "learning_rate": 4.057096247960849e-05, "loss": 0.2189, "num_input_tokens_seen": 10764960, "step": 4975 }, { "epoch": 0.8123980424143556, "grad_norm": 1.5304832458496094, "learning_rate": 4.061174551386623e-05, "loss": 0.0462, "num_input_tokens_seen": 10775296, "step": 4980 }, { "epoch": 0.8132137030995106, "grad_norm": 3.5793774127960205, "learning_rate": 4.065252854812398e-05, "loss": 0.1042, "num_input_tokens_seen": 10786112, "step": 4985 }, { "epoch": 0.8140293637846656, "grad_norm": 0.9823907017707825, "learning_rate": 4.0693311582381734e-05, "loss": 0.1694, "num_input_tokens_seen": 10797152, "step": 4990 }, { "epoch": 0.8148450244698205, "grad_norm": 5.591163635253906, "learning_rate": 4.073409461663948e-05, "loss": 0.1854, "num_input_tokens_seen": 10808480, "step": 4995 }, { "epoch": 0.8156606851549756, "grad_norm": 4.029349327087402, "learning_rate": 4.077487765089723e-05, "loss": 0.0428, "num_input_tokens_seen": 10819424, "step": 5000 }, { "epoch": 0.8164763458401305, "grad_norm": 0.7126393914222717, "learning_rate": 4.0815660685154977e-05, "loss": 0.1138, "num_input_tokens_seen": 10830048, "step": 5005 }, { "epoch": 0.8172920065252854, "grad_norm": 1.9477592706680298, "learning_rate": 4.0856443719412724e-05, "loss": 0.2361, "num_input_tokens_seen": 10840768, "step": 5010 }, { "epoch": 0.8181076672104405, "grad_norm": 0.12592485547065735, "learning_rate": 4.089722675367048e-05, "loss": 0.2728, "num_input_tokens_seen": 10849792, "step": 5015 }, { "epoch": 0.8189233278955954, "grad_norm": 0.0797116607427597, "learning_rate": 4.093800978792822e-05, "loss": 0.0368, "num_input_tokens_seen": 10861472, "step": 5020 }, { "epoch": 0.8197389885807504, "grad_norm": 2.3102195262908936, "learning_rate": 4.0978792822185974e-05, "loss": 0.0623, "num_input_tokens_seen": 10871968, "step": 5025 }, { "epoch": 0.8205546492659054, "grad_norm": 3.467373847961426, "learning_rate": 4.101957585644372e-05, "loss": 0.1971, "num_input_tokens_seen": 10882560, "step": 5030 }, { "epoch": 0.8213703099510603, "grad_norm": 7.26324462890625, "learning_rate": 4.106035889070147e-05, "loss": 0.2299, "num_input_tokens_seen": 10894048, "step": 5035 }, { "epoch": 0.8221859706362153, "grad_norm": 6.0892438888549805, "learning_rate": 4.110114192495922e-05, "loss": 0.3053, "num_input_tokens_seen": 10905408, "step": 5040 }, { "epoch": 0.8230016313213703, "grad_norm": 3.208545684814453, "learning_rate": 4.1141924959216964e-05, "loss": 0.0953, "num_input_tokens_seen": 10915648, "step": 5045 }, { "epoch": 0.8238172920065253, "grad_norm": 2.9109947681427, "learning_rate": 4.118270799347472e-05, "loss": 0.1152, "num_input_tokens_seen": 10926880, "step": 5050 }, { "epoch": 0.8246329526916802, "grad_norm": 0.8327919840812683, "learning_rate": 4.1223491027732467e-05, "loss": 0.2608, "num_input_tokens_seen": 10938432, "step": 5055 }, { "epoch": 0.8254486133768353, "grad_norm": 1.2478437423706055, "learning_rate": 4.1264274061990214e-05, "loss": 0.2698, "num_input_tokens_seen": 10948992, "step": 5060 }, { "epoch": 0.8262642740619902, "grad_norm": 8.861740112304688, "learning_rate": 4.130505709624796e-05, "loss": 0.3661, "num_input_tokens_seen": 10958272, "step": 5065 }, { "epoch": 0.8270799347471451, "grad_norm": 0.43016546964645386, "learning_rate": 4.1345840130505716e-05, "loss": 0.1065, "num_input_tokens_seen": 10968512, "step": 5070 }, { "epoch": 0.8278955954323002, "grad_norm": 0.11255605518817902, "learning_rate": 4.1386623164763464e-05, "loss": 0.0838, "num_input_tokens_seen": 10979072, "step": 5075 }, { "epoch": 0.8287112561174551, "grad_norm": 0.8912560343742371, "learning_rate": 4.1427406199021205e-05, "loss": 0.1326, "num_input_tokens_seen": 10988736, "step": 5080 }, { "epoch": 0.8295269168026101, "grad_norm": 4.577517986297607, "learning_rate": 4.146818923327896e-05, "loss": 0.1894, "num_input_tokens_seen": 11000512, "step": 5085 }, { "epoch": 0.8303425774877651, "grad_norm": 1.794396162033081, "learning_rate": 4.150897226753671e-05, "loss": 0.2606, "num_input_tokens_seen": 11010240, "step": 5090 }, { "epoch": 0.8311582381729201, "grad_norm": 0.0777493417263031, "learning_rate": 4.1549755301794454e-05, "loss": 0.169, "num_input_tokens_seen": 11020672, "step": 5095 }, { "epoch": 0.831973898858075, "grad_norm": 0.19618628919124603, "learning_rate": 4.15905383360522e-05, "loss": 0.0844, "num_input_tokens_seen": 11031808, "step": 5100 }, { "epoch": 0.83278955954323, "grad_norm": 6.898108959197998, "learning_rate": 4.1631321370309957e-05, "loss": 0.2086, "num_input_tokens_seen": 11042144, "step": 5105 }, { "epoch": 0.833605220228385, "grad_norm": 0.18968331813812256, "learning_rate": 4.1672104404567704e-05, "loss": 0.1027, "num_input_tokens_seen": 11053728, "step": 5110 }, { "epoch": 0.8344208809135399, "grad_norm": 2.2396914958953857, "learning_rate": 4.171288743882545e-05, "loss": 0.1606, "num_input_tokens_seen": 11064032, "step": 5115 }, { "epoch": 0.835236541598695, "grad_norm": 4.880401134490967, "learning_rate": 4.17536704730832e-05, "loss": 0.2587, "num_input_tokens_seen": 11073792, "step": 5120 }, { "epoch": 0.8360522022838499, "grad_norm": 0.6538611650466919, "learning_rate": 4.179445350734095e-05, "loss": 0.0975, "num_input_tokens_seen": 11085216, "step": 5125 }, { "epoch": 0.8368678629690048, "grad_norm": 0.5744665861129761, "learning_rate": 4.18352365415987e-05, "loss": 0.2902, "num_input_tokens_seen": 11096288, "step": 5130 }, { "epoch": 0.8376835236541599, "grad_norm": 0.2094990760087967, "learning_rate": 4.187601957585644e-05, "loss": 0.127, "num_input_tokens_seen": 11107584, "step": 5135 }, { "epoch": 0.8384991843393148, "grad_norm": 0.8373374342918396, "learning_rate": 4.191680261011419e-05, "loss": 0.0899, "num_input_tokens_seen": 11118752, "step": 5140 }, { "epoch": 0.8393148450244698, "grad_norm": 2.5657176971435547, "learning_rate": 4.1957585644371944e-05, "loss": 0.1069, "num_input_tokens_seen": 11127840, "step": 5145 }, { "epoch": 0.8401305057096248, "grad_norm": 0.07352052628993988, "learning_rate": 4.199836867862969e-05, "loss": 0.1223, "num_input_tokens_seen": 11137952, "step": 5150 }, { "epoch": 0.8409461663947798, "grad_norm": 0.14405055344104767, "learning_rate": 4.203915171288744e-05, "loss": 0.2233, "num_input_tokens_seen": 11147008, "step": 5155 }, { "epoch": 0.8417618270799347, "grad_norm": 5.144182205200195, "learning_rate": 4.207993474714519e-05, "loss": 0.0808, "num_input_tokens_seen": 11158912, "step": 5160 }, { "epoch": 0.8425774877650897, "grad_norm": 1.5371413230895996, "learning_rate": 4.212071778140294e-05, "loss": 0.0364, "num_input_tokens_seen": 11169440, "step": 5165 }, { "epoch": 0.8433931484502447, "grad_norm": 2.367783784866333, "learning_rate": 4.216150081566069e-05, "loss": 0.1658, "num_input_tokens_seen": 11179488, "step": 5170 }, { "epoch": 0.8442088091353996, "grad_norm": 0.971149742603302, "learning_rate": 4.220228384991843e-05, "loss": 0.2325, "num_input_tokens_seen": 11190912, "step": 5175 }, { "epoch": 0.8450244698205547, "grad_norm": 2.1906208992004395, "learning_rate": 4.2243066884176185e-05, "loss": 0.1955, "num_input_tokens_seen": 11202208, "step": 5180 }, { "epoch": 0.8458401305057096, "grad_norm": 5.115178108215332, "learning_rate": 4.228384991843393e-05, "loss": 0.3306, "num_input_tokens_seen": 11212448, "step": 5185 }, { "epoch": 0.8466557911908646, "grad_norm": 1.3455501794815063, "learning_rate": 4.232463295269169e-05, "loss": 0.1725, "num_input_tokens_seen": 11223648, "step": 5190 }, { "epoch": 0.8474714518760196, "grad_norm": 0.13641172647476196, "learning_rate": 4.236541598694943e-05, "loss": 0.2347, "num_input_tokens_seen": 11234880, "step": 5195 }, { "epoch": 0.8482871125611745, "grad_norm": 1.0995267629623413, "learning_rate": 4.240619902120718e-05, "loss": 0.0977, "num_input_tokens_seen": 11245184, "step": 5200 }, { "epoch": 0.8491027732463295, "grad_norm": 3.476732015609741, "learning_rate": 4.244698205546493e-05, "loss": 0.066, "num_input_tokens_seen": 11256288, "step": 5205 }, { "epoch": 0.8499184339314845, "grad_norm": 0.5097530484199524, "learning_rate": 4.248776508972268e-05, "loss": 0.1736, "num_input_tokens_seen": 11265728, "step": 5210 }, { "epoch": 0.8507340946166395, "grad_norm": 0.16053344309329987, "learning_rate": 4.2528548123980425e-05, "loss": 0.0833, "num_input_tokens_seen": 11275712, "step": 5215 }, { "epoch": 0.8515497553017944, "grad_norm": 0.5644147992134094, "learning_rate": 4.256933115823817e-05, "loss": 0.1389, "num_input_tokens_seen": 11287776, "step": 5220 }, { "epoch": 0.8523654159869495, "grad_norm": 4.689136505126953, "learning_rate": 4.261011419249593e-05, "loss": 0.2571, "num_input_tokens_seen": 11298528, "step": 5225 }, { "epoch": 0.8531810766721044, "grad_norm": 1.5279276371002197, "learning_rate": 4.2650897226753675e-05, "loss": 0.2784, "num_input_tokens_seen": 11309536, "step": 5230 }, { "epoch": 0.8539967373572593, "grad_norm": 1.378790020942688, "learning_rate": 4.2691680261011416e-05, "loss": 0.3113, "num_input_tokens_seen": 11319200, "step": 5235 }, { "epoch": 0.8548123980424144, "grad_norm": 0.20116767287254333, "learning_rate": 4.273246329526917e-05, "loss": 0.0569, "num_input_tokens_seen": 11329120, "step": 5240 }, { "epoch": 0.8556280587275693, "grad_norm": 0.06263948231935501, "learning_rate": 4.277324632952692e-05, "loss": 0.1385, "num_input_tokens_seen": 11339360, "step": 5245 }, { "epoch": 0.8564437194127243, "grad_norm": 6.8799214363098145, "learning_rate": 4.281402936378467e-05, "loss": 0.2868, "num_input_tokens_seen": 11350848, "step": 5250 }, { "epoch": 0.8572593800978793, "grad_norm": 0.3296712040901184, "learning_rate": 4.285481239804241e-05, "loss": 0.1441, "num_input_tokens_seen": 11360704, "step": 5255 }, { "epoch": 0.8580750407830342, "grad_norm": 1.6185336112976074, "learning_rate": 4.289559543230017e-05, "loss": 0.0794, "num_input_tokens_seen": 11370816, "step": 5260 }, { "epoch": 0.8588907014681892, "grad_norm": 3.606036901473999, "learning_rate": 4.2936378466557915e-05, "loss": 0.1085, "num_input_tokens_seen": 11382080, "step": 5265 }, { "epoch": 0.8597063621533442, "grad_norm": 0.3723049461841583, "learning_rate": 4.297716150081566e-05, "loss": 0.2821, "num_input_tokens_seen": 11392736, "step": 5270 }, { "epoch": 0.8605220228384992, "grad_norm": 0.08519739657640457, "learning_rate": 4.301794453507341e-05, "loss": 0.1957, "num_input_tokens_seen": 11402720, "step": 5275 }, { "epoch": 0.8613376835236541, "grad_norm": 0.9225186109542847, "learning_rate": 4.305872756933116e-05, "loss": 0.0355, "num_input_tokens_seen": 11413984, "step": 5280 }, { "epoch": 0.8621533442088092, "grad_norm": 5.518332004547119, "learning_rate": 4.309951060358891e-05, "loss": 0.2303, "num_input_tokens_seen": 11424832, "step": 5285 }, { "epoch": 0.8629690048939641, "grad_norm": 2.6699180603027344, "learning_rate": 4.314029363784666e-05, "loss": 0.0447, "num_input_tokens_seen": 11436128, "step": 5290 }, { "epoch": 0.863784665579119, "grad_norm": 0.0447678305208683, "learning_rate": 4.318107667210441e-05, "loss": 0.2048, "num_input_tokens_seen": 11448448, "step": 5295 }, { "epoch": 0.8646003262642741, "grad_norm": 1.1556092500686646, "learning_rate": 4.3221859706362155e-05, "loss": 0.1352, "num_input_tokens_seen": 11458176, "step": 5300 }, { "epoch": 0.865415986949429, "grad_norm": 1.437555193901062, "learning_rate": 4.32626427406199e-05, "loss": 0.2217, "num_input_tokens_seen": 11468480, "step": 5305 }, { "epoch": 0.866231647634584, "grad_norm": 4.650927543640137, "learning_rate": 4.330342577487765e-05, "loss": 0.1743, "num_input_tokens_seen": 11478880, "step": 5310 }, { "epoch": 0.867047308319739, "grad_norm": 5.042733669281006, "learning_rate": 4.33442088091354e-05, "loss": 0.1615, "num_input_tokens_seen": 11490176, "step": 5315 }, { "epoch": 0.867862969004894, "grad_norm": 0.22132863104343414, "learning_rate": 4.338499184339315e-05, "loss": 0.1746, "num_input_tokens_seen": 11500992, "step": 5320 }, { "epoch": 0.8686786296900489, "grad_norm": 0.5395056009292603, "learning_rate": 4.34257748776509e-05, "loss": 0.0294, "num_input_tokens_seen": 11512288, "step": 5325 }, { "epoch": 0.8694942903752039, "grad_norm": 4.689180850982666, "learning_rate": 4.346655791190865e-05, "loss": 0.163, "num_input_tokens_seen": 11521664, "step": 5330 }, { "epoch": 0.8703099510603589, "grad_norm": 2.488198757171631, "learning_rate": 4.3507340946166396e-05, "loss": 0.0325, "num_input_tokens_seen": 11532608, "step": 5335 }, { "epoch": 0.8711256117455138, "grad_norm": 7.0070037841796875, "learning_rate": 4.354812398042414e-05, "loss": 0.2382, "num_input_tokens_seen": 11543072, "step": 5340 }, { "epoch": 0.8719412724306689, "grad_norm": 4.199175834655762, "learning_rate": 4.35889070146819e-05, "loss": 0.1242, "num_input_tokens_seen": 11554144, "step": 5345 }, { "epoch": 0.8727569331158238, "grad_norm": 0.19933462142944336, "learning_rate": 4.3629690048939645e-05, "loss": 0.1015, "num_input_tokens_seen": 11565376, "step": 5350 }, { "epoch": 0.8735725938009788, "grad_norm": 4.612356185913086, "learning_rate": 4.367047308319739e-05, "loss": 0.2884, "num_input_tokens_seen": 11576128, "step": 5355 }, { "epoch": 0.8743882544861338, "grad_norm": 0.09427787363529205, "learning_rate": 4.371125611745514e-05, "loss": 0.0866, "num_input_tokens_seen": 11588000, "step": 5360 }, { "epoch": 0.8752039151712887, "grad_norm": 0.5477355718612671, "learning_rate": 4.375203915171289e-05, "loss": 0.071, "num_input_tokens_seen": 11599520, "step": 5365 }, { "epoch": 0.8760195758564437, "grad_norm": 0.9199375510215759, "learning_rate": 4.3792822185970636e-05, "loss": 0.1351, "num_input_tokens_seen": 11611168, "step": 5370 }, { "epoch": 0.8768352365415987, "grad_norm": 0.3509751260280609, "learning_rate": 4.3833605220228384e-05, "loss": 0.0722, "num_input_tokens_seen": 11621632, "step": 5375 }, { "epoch": 0.8776508972267537, "grad_norm": 4.951541900634766, "learning_rate": 4.387438825448614e-05, "loss": 0.4848, "num_input_tokens_seen": 11632160, "step": 5380 }, { "epoch": 0.8784665579119086, "grad_norm": 0.33898553252220154, "learning_rate": 4.3915171288743886e-05, "loss": 0.1484, "num_input_tokens_seen": 11641344, "step": 5385 }, { "epoch": 0.8792822185970636, "grad_norm": 0.5188796520233154, "learning_rate": 4.395595432300163e-05, "loss": 0.1587, "num_input_tokens_seen": 11652128, "step": 5390 }, { "epoch": 0.8800978792822186, "grad_norm": 0.25006893277168274, "learning_rate": 4.399673735725938e-05, "loss": 0.0804, "num_input_tokens_seen": 11662528, "step": 5395 }, { "epoch": 0.8809135399673735, "grad_norm": 0.10641758143901825, "learning_rate": 4.403752039151713e-05, "loss": 0.1148, "num_input_tokens_seen": 11672544, "step": 5400 }, { "epoch": 0.8817292006525286, "grad_norm": 4.050087928771973, "learning_rate": 4.407830342577488e-05, "loss": 0.1352, "num_input_tokens_seen": 11683552, "step": 5405 }, { "epoch": 0.8825448613376835, "grad_norm": 2.528367042541504, "learning_rate": 4.4119086460032624e-05, "loss": 0.0982, "num_input_tokens_seen": 11693408, "step": 5410 }, { "epoch": 0.8833605220228385, "grad_norm": 0.33357101678848267, "learning_rate": 4.415986949429038e-05, "loss": 0.1551, "num_input_tokens_seen": 11704160, "step": 5415 }, { "epoch": 0.8841761827079935, "grad_norm": 0.32154837250709534, "learning_rate": 4.4200652528548126e-05, "loss": 0.1043, "num_input_tokens_seen": 11714752, "step": 5420 }, { "epoch": 0.8849918433931484, "grad_norm": 0.09626504778862, "learning_rate": 4.424143556280588e-05, "loss": 0.024, "num_input_tokens_seen": 11725408, "step": 5425 }, { "epoch": 0.8858075040783034, "grad_norm": 0.7662316560745239, "learning_rate": 4.428221859706362e-05, "loss": 0.1123, "num_input_tokens_seen": 11736640, "step": 5430 }, { "epoch": 0.8866231647634584, "grad_norm": 2.163783311843872, "learning_rate": 4.432300163132137e-05, "loss": 0.1954, "num_input_tokens_seen": 11747008, "step": 5435 }, { "epoch": 0.8874388254486134, "grad_norm": 3.4351084232330322, "learning_rate": 4.436378466557912e-05, "loss": 0.1193, "num_input_tokens_seen": 11757728, "step": 5440 }, { "epoch": 0.8882544861337683, "grad_norm": 0.2823643386363983, "learning_rate": 4.440456769983687e-05, "loss": 0.2107, "num_input_tokens_seen": 11768704, "step": 5445 }, { "epoch": 0.8890701468189234, "grad_norm": 11.116686820983887, "learning_rate": 4.444535073409462e-05, "loss": 0.1374, "num_input_tokens_seen": 11780288, "step": 5450 }, { "epoch": 0.8898858075040783, "grad_norm": 7.736528396606445, "learning_rate": 4.4486133768352366e-05, "loss": 0.5311, "num_input_tokens_seen": 11790080, "step": 5455 }, { "epoch": 0.8907014681892332, "grad_norm": 0.37017178535461426, "learning_rate": 4.4526916802610114e-05, "loss": 0.4017, "num_input_tokens_seen": 11801952, "step": 5460 }, { "epoch": 0.8915171288743883, "grad_norm": 0.2874157726764679, "learning_rate": 4.456769983686787e-05, "loss": 0.0954, "num_input_tokens_seen": 11811520, "step": 5465 }, { "epoch": 0.8923327895595432, "grad_norm": 0.4490336775779724, "learning_rate": 4.460848287112561e-05, "loss": 0.1741, "num_input_tokens_seen": 11822240, "step": 5470 }, { "epoch": 0.8931484502446982, "grad_norm": 5.030572414398193, "learning_rate": 4.4649265905383364e-05, "loss": 0.1663, "num_input_tokens_seen": 11833632, "step": 5475 }, { "epoch": 0.8939641109298532, "grad_norm": 0.028783371672034264, "learning_rate": 4.469004893964111e-05, "loss": 0.042, "num_input_tokens_seen": 11844640, "step": 5480 }, { "epoch": 0.8947797716150081, "grad_norm": 4.203812122344971, "learning_rate": 4.4730831973898866e-05, "loss": 0.1365, "num_input_tokens_seen": 11855776, "step": 5485 }, { "epoch": 0.8955954323001631, "grad_norm": 1.6102211475372314, "learning_rate": 4.4771615008156607e-05, "loss": 0.3933, "num_input_tokens_seen": 11865856, "step": 5490 }, { "epoch": 0.8964110929853181, "grad_norm": 2.1297247409820557, "learning_rate": 4.4812398042414354e-05, "loss": 0.0667, "num_input_tokens_seen": 11877504, "step": 5495 }, { "epoch": 0.8972267536704731, "grad_norm": 0.13318589329719543, "learning_rate": 4.485318107667211e-05, "loss": 0.2619, "num_input_tokens_seen": 11888320, "step": 5500 }, { "epoch": 0.898042414355628, "grad_norm": 0.3416684567928314, "learning_rate": 4.4893964110929856e-05, "loss": 0.1459, "num_input_tokens_seen": 11899456, "step": 5505 }, { "epoch": 0.8988580750407831, "grad_norm": 6.452507019042969, "learning_rate": 4.4934747145187604e-05, "loss": 0.1818, "num_input_tokens_seen": 11910240, "step": 5510 }, { "epoch": 0.899673735725938, "grad_norm": 0.9621750712394714, "learning_rate": 4.497553017944535e-05, "loss": 0.0758, "num_input_tokens_seen": 11920512, "step": 5515 }, { "epoch": 0.9004893964110929, "grad_norm": 0.2789660096168518, "learning_rate": 4.5016313213703106e-05, "loss": 0.1328, "num_input_tokens_seen": 11930592, "step": 5520 }, { "epoch": 0.901305057096248, "grad_norm": 2.8978145122528076, "learning_rate": 4.5057096247960854e-05, "loss": 0.371, "num_input_tokens_seen": 11941088, "step": 5525 }, { "epoch": 0.9021207177814029, "grad_norm": 1.2749956846237183, "learning_rate": 4.5097879282218594e-05, "loss": 0.1228, "num_input_tokens_seen": 11952320, "step": 5530 }, { "epoch": 0.9029363784665579, "grad_norm": 1.1564136743545532, "learning_rate": 4.513866231647635e-05, "loss": 0.1583, "num_input_tokens_seen": 11962880, "step": 5535 }, { "epoch": 0.9037520391517129, "grad_norm": 0.5101920962333679, "learning_rate": 4.5179445350734097e-05, "loss": 0.0189, "num_input_tokens_seen": 11973472, "step": 5540 }, { "epoch": 0.9045676998368679, "grad_norm": 4.70657205581665, "learning_rate": 4.5220228384991844e-05, "loss": 0.4949, "num_input_tokens_seen": 11984096, "step": 5545 }, { "epoch": 0.9053833605220228, "grad_norm": 3.420771360397339, "learning_rate": 4.526101141924959e-05, "loss": 0.1192, "num_input_tokens_seen": 11995488, "step": 5550 }, { "epoch": 0.9061990212071778, "grad_norm": 8.305743217468262, "learning_rate": 4.5301794453507346e-05, "loss": 0.1186, "num_input_tokens_seen": 12007104, "step": 5555 }, { "epoch": 0.9070146818923328, "grad_norm": 4.708660125732422, "learning_rate": 4.5342577487765094e-05, "loss": 0.195, "num_input_tokens_seen": 12018688, "step": 5560 }, { "epoch": 0.9078303425774877, "grad_norm": 0.42921167612075806, "learning_rate": 4.538336052202284e-05, "loss": 0.0933, "num_input_tokens_seen": 12028096, "step": 5565 }, { "epoch": 0.9086460032626428, "grad_norm": 3.1014652252197266, "learning_rate": 4.542414355628059e-05, "loss": 0.1672, "num_input_tokens_seen": 12038976, "step": 5570 }, { "epoch": 0.9094616639477977, "grad_norm": 2.6501834392547607, "learning_rate": 4.546492659053834e-05, "loss": 0.1629, "num_input_tokens_seen": 12050816, "step": 5575 }, { "epoch": 0.9102773246329527, "grad_norm": 0.2537744641304016, "learning_rate": 4.550570962479609e-05, "loss": 0.2324, "num_input_tokens_seen": 12062336, "step": 5580 }, { "epoch": 0.9110929853181077, "grad_norm": 6.535691261291504, "learning_rate": 4.554649265905383e-05, "loss": 0.2013, "num_input_tokens_seen": 12072096, "step": 5585 }, { "epoch": 0.9119086460032626, "grad_norm": 0.7936147451400757, "learning_rate": 4.558727569331158e-05, "loss": 0.0583, "num_input_tokens_seen": 12083136, "step": 5590 }, { "epoch": 0.9127243066884176, "grad_norm": 0.3955764174461365, "learning_rate": 4.5628058727569334e-05, "loss": 0.2451, "num_input_tokens_seen": 12094272, "step": 5595 }, { "epoch": 0.9135399673735726, "grad_norm": 0.2736872136592865, "learning_rate": 4.566884176182708e-05, "loss": 0.261, "num_input_tokens_seen": 12105280, "step": 5600 }, { "epoch": 0.9143556280587276, "grad_norm": 0.49092239141464233, "learning_rate": 4.570962479608483e-05, "loss": 0.1982, "num_input_tokens_seen": 12115968, "step": 5605 }, { "epoch": 0.9151712887438825, "grad_norm": 3.235626697540283, "learning_rate": 4.575040783034258e-05, "loss": 0.1825, "num_input_tokens_seen": 12127136, "step": 5610 }, { "epoch": 0.9159869494290375, "grad_norm": 5.939905166625977, "learning_rate": 4.579119086460033e-05, "loss": 0.0918, "num_input_tokens_seen": 12136704, "step": 5615 }, { "epoch": 0.9168026101141925, "grad_norm": 0.28519207239151, "learning_rate": 4.583197389885808e-05, "loss": 0.0813, "num_input_tokens_seen": 12146784, "step": 5620 }, { "epoch": 0.9176182707993474, "grad_norm": 2.29233980178833, "learning_rate": 4.587275693311583e-05, "loss": 0.0455, "num_input_tokens_seen": 12157248, "step": 5625 }, { "epoch": 0.9184339314845025, "grad_norm": 1.8638032674789429, "learning_rate": 4.5913539967373574e-05, "loss": 0.2188, "num_input_tokens_seen": 12168256, "step": 5630 }, { "epoch": 0.9192495921696574, "grad_norm": 3.1932172775268555, "learning_rate": 4.595432300163132e-05, "loss": 0.0907, "num_input_tokens_seen": 12177760, "step": 5635 }, { "epoch": 0.9200652528548124, "grad_norm": 2.8444621562957764, "learning_rate": 4.5995106035889077e-05, "loss": 0.0658, "num_input_tokens_seen": 12188192, "step": 5640 }, { "epoch": 0.9208809135399674, "grad_norm": 4.4717278480529785, "learning_rate": 4.603588907014682e-05, "loss": 0.2657, "num_input_tokens_seen": 12198880, "step": 5645 }, { "epoch": 0.9216965742251223, "grad_norm": 1.8809369802474976, "learning_rate": 4.607667210440457e-05, "loss": 0.2194, "num_input_tokens_seen": 12208800, "step": 5650 }, { "epoch": 0.9225122349102773, "grad_norm": 0.24707771837711334, "learning_rate": 4.611745513866232e-05, "loss": 0.1215, "num_input_tokens_seen": 12219360, "step": 5655 }, { "epoch": 0.9233278955954323, "grad_norm": 3.440296173095703, "learning_rate": 4.615823817292007e-05, "loss": 0.1499, "num_input_tokens_seen": 12229376, "step": 5660 }, { "epoch": 0.9241435562805873, "grad_norm": 4.585781574249268, "learning_rate": 4.6199021207177815e-05, "loss": 0.2547, "num_input_tokens_seen": 12241152, "step": 5665 }, { "epoch": 0.9249592169657422, "grad_norm": 0.6023845076560974, "learning_rate": 4.623980424143556e-05, "loss": 0.2379, "num_input_tokens_seen": 12252192, "step": 5670 }, { "epoch": 0.9257748776508973, "grad_norm": 0.4795178472995758, "learning_rate": 4.628058727569332e-05, "loss": 0.2099, "num_input_tokens_seen": 12262848, "step": 5675 }, { "epoch": 0.9265905383360522, "grad_norm": 3.0352981090545654, "learning_rate": 4.6321370309951064e-05, "loss": 0.1927, "num_input_tokens_seen": 12273728, "step": 5680 }, { "epoch": 0.9274061990212071, "grad_norm": 2.9030098915100098, "learning_rate": 4.636215334420881e-05, "loss": 0.1262, "num_input_tokens_seen": 12284512, "step": 5685 }, { "epoch": 0.9282218597063622, "grad_norm": 5.807152271270752, "learning_rate": 4.640293637846656e-05, "loss": 0.2835, "num_input_tokens_seen": 12295968, "step": 5690 }, { "epoch": 0.9290375203915171, "grad_norm": 0.8265318274497986, "learning_rate": 4.644371941272431e-05, "loss": 0.1558, "num_input_tokens_seen": 12307616, "step": 5695 }, { "epoch": 0.9298531810766721, "grad_norm": 3.8171863555908203, "learning_rate": 4.648450244698206e-05, "loss": 0.1775, "num_input_tokens_seen": 12318400, "step": 5700 }, { "epoch": 0.9306688417618271, "grad_norm": 0.7892698049545288, "learning_rate": 4.65252854812398e-05, "loss": 0.1465, "num_input_tokens_seen": 12329472, "step": 5705 }, { "epoch": 0.9314845024469821, "grad_norm": 2.0436034202575684, "learning_rate": 4.656606851549756e-05, "loss": 0.0662, "num_input_tokens_seen": 12340032, "step": 5710 }, { "epoch": 0.932300163132137, "grad_norm": 0.7949777841567993, "learning_rate": 4.6606851549755305e-05, "loss": 0.0445, "num_input_tokens_seen": 12351584, "step": 5715 }, { "epoch": 0.933115823817292, "grad_norm": 4.621256351470947, "learning_rate": 4.664763458401305e-05, "loss": 0.3786, "num_input_tokens_seen": 12362752, "step": 5720 }, { "epoch": 0.933931484502447, "grad_norm": 2.6562387943267822, "learning_rate": 4.66884176182708e-05, "loss": 0.2519, "num_input_tokens_seen": 12374720, "step": 5725 }, { "epoch": 0.9347471451876019, "grad_norm": 0.8015565276145935, "learning_rate": 4.672920065252855e-05, "loss": 0.1531, "num_input_tokens_seen": 12384896, "step": 5730 }, { "epoch": 0.935562805872757, "grad_norm": 1.8839294910430908, "learning_rate": 4.67699836867863e-05, "loss": 0.099, "num_input_tokens_seen": 12395808, "step": 5735 }, { "epoch": 0.9363784665579119, "grad_norm": 2.145501136779785, "learning_rate": 4.681076672104405e-05, "loss": 0.1181, "num_input_tokens_seen": 12406880, "step": 5740 }, { "epoch": 0.9371941272430668, "grad_norm": 1.8614461421966553, "learning_rate": 4.68515497553018e-05, "loss": 0.2624, "num_input_tokens_seen": 12418240, "step": 5745 }, { "epoch": 0.9380097879282219, "grad_norm": 4.2035346031188965, "learning_rate": 4.6892332789559545e-05, "loss": 0.2653, "num_input_tokens_seen": 12428896, "step": 5750 }, { "epoch": 0.9388254486133768, "grad_norm": 0.3168288767337799, "learning_rate": 4.693311582381729e-05, "loss": 0.0554, "num_input_tokens_seen": 12439872, "step": 5755 }, { "epoch": 0.9396411092985318, "grad_norm": 0.09010463953018188, "learning_rate": 4.697389885807505e-05, "loss": 0.0137, "num_input_tokens_seen": 12451840, "step": 5760 }, { "epoch": 0.9404567699836868, "grad_norm": 0.5660171508789062, "learning_rate": 4.701468189233279e-05, "loss": 0.1133, "num_input_tokens_seen": 12463328, "step": 5765 }, { "epoch": 0.9412724306688418, "grad_norm": 0.30117490887641907, "learning_rate": 4.705546492659054e-05, "loss": 0.2971, "num_input_tokens_seen": 12474080, "step": 5770 }, { "epoch": 0.9420880913539967, "grad_norm": 0.11573529243469238, "learning_rate": 4.709624796084829e-05, "loss": 0.4065, "num_input_tokens_seen": 12484960, "step": 5775 }, { "epoch": 0.9429037520391517, "grad_norm": 0.47013425827026367, "learning_rate": 4.713703099510604e-05, "loss": 0.0553, "num_input_tokens_seen": 12495584, "step": 5780 }, { "epoch": 0.9437194127243067, "grad_norm": 0.387765109539032, "learning_rate": 4.7177814029363785e-05, "loss": 0.157, "num_input_tokens_seen": 12505568, "step": 5785 }, { "epoch": 0.9445350734094616, "grad_norm": 0.29751497507095337, "learning_rate": 4.721859706362153e-05, "loss": 0.3058, "num_input_tokens_seen": 12516480, "step": 5790 }, { "epoch": 0.9453507340946167, "grad_norm": 5.351414203643799, "learning_rate": 4.725938009787929e-05, "loss": 0.2212, "num_input_tokens_seen": 12527136, "step": 5795 }, { "epoch": 0.9461663947797716, "grad_norm": 0.13309234380722046, "learning_rate": 4.7300163132137035e-05, "loss": 0.3603, "num_input_tokens_seen": 12537568, "step": 5800 }, { "epoch": 0.9469820554649266, "grad_norm": 0.9149518609046936, "learning_rate": 4.734094616639478e-05, "loss": 0.0854, "num_input_tokens_seen": 12548736, "step": 5805 }, { "epoch": 0.9477977161500816, "grad_norm": 0.08186270296573639, "learning_rate": 4.738172920065253e-05, "loss": 0.1006, "num_input_tokens_seen": 12560192, "step": 5810 }, { "epoch": 0.9486133768352365, "grad_norm": 0.45442628860473633, "learning_rate": 4.742251223491028e-05, "loss": 0.0944, "num_input_tokens_seen": 12570816, "step": 5815 }, { "epoch": 0.9494290375203915, "grad_norm": 8.15549087524414, "learning_rate": 4.7463295269168026e-05, "loss": 0.3167, "num_input_tokens_seen": 12582272, "step": 5820 }, { "epoch": 0.9502446982055465, "grad_norm": 0.2367493063211441, "learning_rate": 4.750407830342577e-05, "loss": 0.2528, "num_input_tokens_seen": 12594432, "step": 5825 }, { "epoch": 0.9510603588907015, "grad_norm": 0.2754059135913849, "learning_rate": 4.754486133768353e-05, "loss": 0.0117, "num_input_tokens_seen": 12606048, "step": 5830 }, { "epoch": 0.9518760195758564, "grad_norm": 0.514473557472229, "learning_rate": 4.7585644371941275e-05, "loss": 0.2696, "num_input_tokens_seen": 12617088, "step": 5835 }, { "epoch": 0.9526916802610114, "grad_norm": 4.245872974395752, "learning_rate": 4.762642740619902e-05, "loss": 0.2253, "num_input_tokens_seen": 12627808, "step": 5840 }, { "epoch": 0.9535073409461664, "grad_norm": 0.13982197642326355, "learning_rate": 4.766721044045677e-05, "loss": 0.0519, "num_input_tokens_seen": 12638720, "step": 5845 }, { "epoch": 0.9543230016313213, "grad_norm": 2.4407217502593994, "learning_rate": 4.770799347471452e-05, "loss": 0.1421, "num_input_tokens_seen": 12648608, "step": 5850 }, { "epoch": 0.9551386623164764, "grad_norm": 1.4845774173736572, "learning_rate": 4.774877650897227e-05, "loss": 0.0421, "num_input_tokens_seen": 12658944, "step": 5855 }, { "epoch": 0.9559543230016313, "grad_norm": 5.853550434112549, "learning_rate": 4.7789559543230014e-05, "loss": 0.165, "num_input_tokens_seen": 12669856, "step": 5860 }, { "epoch": 0.9567699836867863, "grad_norm": 1.4343255758285522, "learning_rate": 4.783034257748777e-05, "loss": 0.198, "num_input_tokens_seen": 12680096, "step": 5865 }, { "epoch": 0.9575856443719413, "grad_norm": 0.08989223092794418, "learning_rate": 4.7871125611745516e-05, "loss": 0.0122, "num_input_tokens_seen": 12691040, "step": 5870 }, { "epoch": 0.9584013050570962, "grad_norm": 0.16980361938476562, "learning_rate": 4.791190864600327e-05, "loss": 0.1666, "num_input_tokens_seen": 12701728, "step": 5875 }, { "epoch": 0.9592169657422512, "grad_norm": 0.14723515510559082, "learning_rate": 4.795269168026101e-05, "loss": 0.0835, "num_input_tokens_seen": 12712224, "step": 5880 }, { "epoch": 0.9600326264274062, "grad_norm": 2.7446348667144775, "learning_rate": 4.799347471451876e-05, "loss": 0.215, "num_input_tokens_seen": 12723264, "step": 5885 }, { "epoch": 0.9608482871125612, "grad_norm": 0.6583641171455383, "learning_rate": 4.803425774877651e-05, "loss": 0.2323, "num_input_tokens_seen": 12734400, "step": 5890 }, { "epoch": 0.9616639477977161, "grad_norm": 0.018399178981781006, "learning_rate": 4.807504078303426e-05, "loss": 0.1507, "num_input_tokens_seen": 12745504, "step": 5895 }, { "epoch": 0.9624796084828712, "grad_norm": 4.040611743927002, "learning_rate": 4.811582381729201e-05, "loss": 0.2672, "num_input_tokens_seen": 12755520, "step": 5900 }, { "epoch": 0.9632952691680261, "grad_norm": 0.750511884689331, "learning_rate": 4.8156606851549756e-05, "loss": 0.0282, "num_input_tokens_seen": 12766336, "step": 5905 }, { "epoch": 0.964110929853181, "grad_norm": 3.4301741123199463, "learning_rate": 4.819738988580751e-05, "loss": 0.1245, "num_input_tokens_seen": 12776704, "step": 5910 }, { "epoch": 0.9649265905383361, "grad_norm": 0.17444053292274475, "learning_rate": 4.823817292006526e-05, "loss": 0.296, "num_input_tokens_seen": 12787872, "step": 5915 }, { "epoch": 0.965742251223491, "grad_norm": 0.4860982298851013, "learning_rate": 4.8278955954323e-05, "loss": 0.0428, "num_input_tokens_seen": 12798880, "step": 5920 }, { "epoch": 0.966557911908646, "grad_norm": 5.006972789764404, "learning_rate": 4.831973898858075e-05, "loss": 0.283, "num_input_tokens_seen": 12809696, "step": 5925 }, { "epoch": 0.967373572593801, "grad_norm": 4.0059709548950195, "learning_rate": 4.83605220228385e-05, "loss": 0.1566, "num_input_tokens_seen": 12820192, "step": 5930 }, { "epoch": 0.968189233278956, "grad_norm": 5.839953422546387, "learning_rate": 4.8401305057096255e-05, "loss": 0.0971, "num_input_tokens_seen": 12832384, "step": 5935 }, { "epoch": 0.9690048939641109, "grad_norm": 0.6276607513427734, "learning_rate": 4.8442088091353996e-05, "loss": 0.076, "num_input_tokens_seen": 12843936, "step": 5940 }, { "epoch": 0.9698205546492659, "grad_norm": 0.4621135890483856, "learning_rate": 4.8482871125611744e-05, "loss": 0.1962, "num_input_tokens_seen": 12854752, "step": 5945 }, { "epoch": 0.9706362153344209, "grad_norm": 2.2090275287628174, "learning_rate": 4.85236541598695e-05, "loss": 0.0434, "num_input_tokens_seen": 12865248, "step": 5950 }, { "epoch": 0.9714518760195758, "grad_norm": 5.447160720825195, "learning_rate": 4.8564437194127246e-05, "loss": 0.1976, "num_input_tokens_seen": 12876128, "step": 5955 }, { "epoch": 0.9722675367047309, "grad_norm": 2.3920841217041016, "learning_rate": 4.8605220228384994e-05, "loss": 0.2233, "num_input_tokens_seen": 12887008, "step": 5960 }, { "epoch": 0.9730831973898858, "grad_norm": 0.03698760271072388, "learning_rate": 4.864600326264274e-05, "loss": 0.0925, "num_input_tokens_seen": 12898624, "step": 5965 }, { "epoch": 0.9738988580750407, "grad_norm": 0.3625604212284088, "learning_rate": 4.8686786296900496e-05, "loss": 0.0934, "num_input_tokens_seen": 12909888, "step": 5970 }, { "epoch": 0.9747145187601958, "grad_norm": 10.285679817199707, "learning_rate": 4.872756933115824e-05, "loss": 0.4727, "num_input_tokens_seen": 12920032, "step": 5975 }, { "epoch": 0.9755301794453507, "grad_norm": 7.591075897216797, "learning_rate": 4.8768352365415984e-05, "loss": 0.4998, "num_input_tokens_seen": 12931392, "step": 5980 }, { "epoch": 0.9763458401305057, "grad_norm": 4.060094833374023, "learning_rate": 4.880913539967374e-05, "loss": 0.2346, "num_input_tokens_seen": 12941472, "step": 5985 }, { "epoch": 0.9771615008156607, "grad_norm": 2.5794951915740967, "learning_rate": 4.8849918433931486e-05, "loss": 0.1454, "num_input_tokens_seen": 12952192, "step": 5990 }, { "epoch": 0.9779771615008157, "grad_norm": 3.3665897846221924, "learning_rate": 4.8890701468189234e-05, "loss": 0.2409, "num_input_tokens_seen": 12963200, "step": 5995 }, { "epoch": 0.9787928221859706, "grad_norm": 0.7922720909118652, "learning_rate": 4.893148450244698e-05, "loss": 0.2427, "num_input_tokens_seen": 12972768, "step": 6000 }, { "epoch": 0.9796084828711256, "grad_norm": 2.926109790802002, "learning_rate": 4.8972267536704736e-05, "loss": 0.0478, "num_input_tokens_seen": 12984064, "step": 6005 }, { "epoch": 0.9804241435562806, "grad_norm": 0.7572949528694153, "learning_rate": 4.9013050570962484e-05, "loss": 0.1912, "num_input_tokens_seen": 12994336, "step": 6010 }, { "epoch": 0.9812398042414355, "grad_norm": 0.14401741325855255, "learning_rate": 4.905383360522023e-05, "loss": 0.0152, "num_input_tokens_seen": 13004096, "step": 6015 }, { "epoch": 0.9820554649265906, "grad_norm": 0.6650245189666748, "learning_rate": 4.909461663947798e-05, "loss": 0.0563, "num_input_tokens_seen": 13015168, "step": 6020 }, { "epoch": 0.9828711256117455, "grad_norm": 0.5241230130195618, "learning_rate": 4.9135399673735727e-05, "loss": 0.0966, "num_input_tokens_seen": 13026080, "step": 6025 }, { "epoch": 0.9836867862969005, "grad_norm": 5.424735069274902, "learning_rate": 4.917618270799348e-05, "loss": 0.3274, "num_input_tokens_seen": 13037888, "step": 6030 }, { "epoch": 0.9845024469820555, "grad_norm": 6.97016716003418, "learning_rate": 4.921696574225123e-05, "loss": 0.4111, "num_input_tokens_seen": 13049440, "step": 6035 }, { "epoch": 0.9853181076672104, "grad_norm": 0.3491359055042267, "learning_rate": 4.9257748776508976e-05, "loss": 0.0414, "num_input_tokens_seen": 13060480, "step": 6040 }, { "epoch": 0.9861337683523654, "grad_norm": 0.2605774402618408, "learning_rate": 4.9298531810766724e-05, "loss": 0.0565, "num_input_tokens_seen": 13071584, "step": 6045 }, { "epoch": 0.9869494290375204, "grad_norm": 3.8022637367248535, "learning_rate": 4.933931484502447e-05, "loss": 0.1964, "num_input_tokens_seen": 13081376, "step": 6050 }, { "epoch": 0.9877650897226754, "grad_norm": 0.08020448684692383, "learning_rate": 4.938009787928222e-05, "loss": 0.0841, "num_input_tokens_seen": 13092768, "step": 6055 }, { "epoch": 0.9885807504078303, "grad_norm": 0.8128753900527954, "learning_rate": 4.942088091353997e-05, "loss": 0.2018, "num_input_tokens_seen": 13104096, "step": 6060 }, { "epoch": 0.9893964110929854, "grad_norm": 0.16580741107463837, "learning_rate": 4.946166394779772e-05, "loss": 0.1163, "num_input_tokens_seen": 13115488, "step": 6065 }, { "epoch": 0.9902120717781403, "grad_norm": 0.20980337262153625, "learning_rate": 4.950244698205547e-05, "loss": 0.0744, "num_input_tokens_seen": 13126752, "step": 6070 }, { "epoch": 0.9910277324632952, "grad_norm": 0.9464369416236877, "learning_rate": 4.9543230016313217e-05, "loss": 0.1535, "num_input_tokens_seen": 13137920, "step": 6075 }, { "epoch": 0.9918433931484503, "grad_norm": 3.200751781463623, "learning_rate": 4.9584013050570964e-05, "loss": 0.2442, "num_input_tokens_seen": 13148416, "step": 6080 }, { "epoch": 0.9926590538336052, "grad_norm": 5.150273323059082, "learning_rate": 4.962479608482871e-05, "loss": 0.1137, "num_input_tokens_seen": 13157920, "step": 6085 }, { "epoch": 0.9934747145187602, "grad_norm": 0.9907219409942627, "learning_rate": 4.9665579119086466e-05, "loss": 0.0343, "num_input_tokens_seen": 13167968, "step": 6090 }, { "epoch": 0.9942903752039152, "grad_norm": 5.316732883453369, "learning_rate": 4.970636215334421e-05, "loss": 0.2873, "num_input_tokens_seen": 13179648, "step": 6095 }, { "epoch": 0.9951060358890701, "grad_norm": 0.18861132860183716, "learning_rate": 4.974714518760196e-05, "loss": 0.1942, "num_input_tokens_seen": 13190656, "step": 6100 }, { "epoch": 0.9959216965742251, "grad_norm": 0.1188916265964508, "learning_rate": 4.978792822185971e-05, "loss": 0.1605, "num_input_tokens_seen": 13201120, "step": 6105 }, { "epoch": 0.9967373572593801, "grad_norm": 1.468907117843628, "learning_rate": 4.982871125611746e-05, "loss": 0.102, "num_input_tokens_seen": 13211744, "step": 6110 }, { "epoch": 0.9975530179445351, "grad_norm": 0.04991580545902252, "learning_rate": 4.9869494290375205e-05, "loss": 0.1987, "num_input_tokens_seen": 13222752, "step": 6115 }, { "epoch": 0.99836867862969, "grad_norm": 0.611285924911499, "learning_rate": 4.991027732463295e-05, "loss": 0.2471, "num_input_tokens_seen": 13233696, "step": 6120 }, { "epoch": 0.9991843393148451, "grad_norm": 0.6825065612792969, "learning_rate": 4.9951060358890707e-05, "loss": 0.1463, "num_input_tokens_seen": 13245504, "step": 6125 }, { "epoch": 1.0, "grad_norm": 0.14032478630542755, "learning_rate": 4.9991843393148454e-05, "loss": 0.1984, "num_input_tokens_seen": 13255424, "step": 6130 }, { "epoch": 1.0, "eval_loss": 0.15720832347869873, "eval_runtime": 133.1334, "eval_samples_per_second": 20.468, "eval_steps_per_second": 5.123, "num_input_tokens_seen": 13255424, "step": 6130 }, { "epoch": 1.000815660685155, "grad_norm": 3.067218780517578, "learning_rate": 4.999999935147941e-05, "loss": 0.1373, "num_input_tokens_seen": 13265952, "step": 6135 }, { "epoch": 1.0016313213703099, "grad_norm": 0.8992579579353333, "learning_rate": 4.999999671686456e-05, "loss": 0.0564, "num_input_tokens_seen": 13276768, "step": 6140 }, { "epoch": 1.002446982055465, "grad_norm": 1.2270188331604004, "learning_rate": 4.999999205562312e-05, "loss": 0.1712, "num_input_tokens_seen": 13287776, "step": 6145 }, { "epoch": 1.00326264274062, "grad_norm": 0.5810588598251343, "learning_rate": 4.999998536775549e-05, "loss": 0.3012, "num_input_tokens_seen": 13299488, "step": 6150 }, { "epoch": 1.004078303425775, "grad_norm": 2.3066442012786865, "learning_rate": 4.9999976653262184e-05, "loss": 0.1016, "num_input_tokens_seen": 13310208, "step": 6155 }, { "epoch": 1.0048939641109298, "grad_norm": 0.11014427244663239, "learning_rate": 4.999996591214392e-05, "loss": 0.0257, "num_input_tokens_seen": 13319904, "step": 6160 }, { "epoch": 1.0057096247960848, "grad_norm": 0.6082314252853394, "learning_rate": 4.999995314440158e-05, "loss": 0.1063, "num_input_tokens_seen": 13330528, "step": 6165 }, { "epoch": 1.0065252854812399, "grad_norm": 0.16786378622055054, "learning_rate": 4.999993835003618e-05, "loss": 0.1076, "num_input_tokens_seen": 13341760, "step": 6170 }, { "epoch": 1.0073409461663947, "grad_norm": 0.052277714014053345, "learning_rate": 4.9999921529048945e-05, "loss": 0.2745, "num_input_tokens_seen": 13352448, "step": 6175 }, { "epoch": 1.0081566068515497, "grad_norm": 3.4511992931365967, "learning_rate": 4.9999902681441205e-05, "loss": 0.2549, "num_input_tokens_seen": 13363296, "step": 6180 }, { "epoch": 1.0089722675367048, "grad_norm": 1.0652965307235718, "learning_rate": 4.9999881807214515e-05, "loss": 0.0328, "num_input_tokens_seen": 13374368, "step": 6185 }, { "epoch": 1.0097879282218598, "grad_norm": 3.025557518005371, "learning_rate": 4.9999858906370553e-05, "loss": 0.0905, "num_input_tokens_seen": 13385184, "step": 6190 }, { "epoch": 1.0106035889070146, "grad_norm": 3.7201688289642334, "learning_rate": 4.9999833978911185e-05, "loss": 0.2842, "num_input_tokens_seen": 13395104, "step": 6195 }, { "epoch": 1.0114192495921697, "grad_norm": 0.9407456517219543, "learning_rate": 4.999980702483842e-05, "loss": 0.0479, "num_input_tokens_seen": 13406080, "step": 6200 }, { "epoch": 1.0122349102773247, "grad_norm": 1.9295414686203003, "learning_rate": 4.999977804415446e-05, "loss": 0.2064, "num_input_tokens_seen": 13417952, "step": 6205 }, { "epoch": 1.0130505709624795, "grad_norm": 5.365414619445801, "learning_rate": 4.999974703686164e-05, "loss": 0.1248, "num_input_tokens_seen": 13428512, "step": 6210 }, { "epoch": 1.0138662316476346, "grad_norm": 0.3215469419956207, "learning_rate": 4.9999714002962474e-05, "loss": 0.0202, "num_input_tokens_seen": 13439648, "step": 6215 }, { "epoch": 1.0146818923327896, "grad_norm": 0.15902908146381378, "learning_rate": 4.999967894245965e-05, "loss": 0.1377, "num_input_tokens_seen": 13451520, "step": 6220 }, { "epoch": 1.0154975530179446, "grad_norm": 3.213712215423584, "learning_rate": 4.9999641855355995e-05, "loss": 0.1352, "num_input_tokens_seen": 13460480, "step": 6225 }, { "epoch": 1.0163132137030995, "grad_norm": 2.431142568588257, "learning_rate": 4.999960274165453e-05, "loss": 0.0963, "num_input_tokens_seen": 13471328, "step": 6230 }, { "epoch": 1.0171288743882545, "grad_norm": 7.266343593597412, "learning_rate": 4.999956160135842e-05, "loss": 0.2119, "num_input_tokens_seen": 13482048, "step": 6235 }, { "epoch": 1.0179445350734095, "grad_norm": 1.8660143613815308, "learning_rate": 4.999951843447099e-05, "loss": 0.1378, "num_input_tokens_seen": 13492512, "step": 6240 }, { "epoch": 1.0187601957585644, "grad_norm": 3.638964891433716, "learning_rate": 4.999947324099576e-05, "loss": 0.1171, "num_input_tokens_seen": 13504000, "step": 6245 }, { "epoch": 1.0195758564437194, "grad_norm": 0.7262020111083984, "learning_rate": 4.999942602093638e-05, "loss": 0.1194, "num_input_tokens_seen": 13514784, "step": 6250 }, { "epoch": 1.0203915171288744, "grad_norm": 0.6040750741958618, "learning_rate": 4.999937677429669e-05, "loss": 0.1189, "num_input_tokens_seen": 13525376, "step": 6255 }, { "epoch": 1.0212071778140293, "grad_norm": 0.39444753527641296, "learning_rate": 4.9999325501080666e-05, "loss": 0.2655, "num_input_tokens_seen": 13535872, "step": 6260 }, { "epoch": 1.0220228384991843, "grad_norm": 1.2012454271316528, "learning_rate": 4.999927220129247e-05, "loss": 0.1076, "num_input_tokens_seen": 13546464, "step": 6265 }, { "epoch": 1.0228384991843393, "grad_norm": 0.8436634540557861, "learning_rate": 4.9999216874936426e-05, "loss": 0.2635, "num_input_tokens_seen": 13555936, "step": 6270 }, { "epoch": 1.0236541598694944, "grad_norm": 0.17298643290996552, "learning_rate": 4.9999159522017015e-05, "loss": 0.0112, "num_input_tokens_seen": 13566528, "step": 6275 }, { "epoch": 1.0244698205546492, "grad_norm": 0.07080504298210144, "learning_rate": 4.999910014253889e-05, "loss": 0.2112, "num_input_tokens_seen": 13577152, "step": 6280 }, { "epoch": 1.0252854812398042, "grad_norm": 4.6885528564453125, "learning_rate": 4.999903873650687e-05, "loss": 0.1048, "num_input_tokens_seen": 13588704, "step": 6285 }, { "epoch": 1.0261011419249593, "grad_norm": 0.9719001650810242, "learning_rate": 4.999897530392591e-05, "loss": 0.0153, "num_input_tokens_seen": 13599552, "step": 6290 }, { "epoch": 1.026916802610114, "grad_norm": 0.26410770416259766, "learning_rate": 4.9998909844801176e-05, "loss": 0.0158, "num_input_tokens_seen": 13611264, "step": 6295 }, { "epoch": 1.0277324632952691, "grad_norm": 3.8483128547668457, "learning_rate": 4.999884235913797e-05, "loss": 0.2487, "num_input_tokens_seen": 13621760, "step": 6300 }, { "epoch": 1.0285481239804242, "grad_norm": 3.1497063636779785, "learning_rate": 4.999877284694177e-05, "loss": 0.1964, "num_input_tokens_seen": 13632672, "step": 6305 }, { "epoch": 1.0293637846655792, "grad_norm": 0.11703000217676163, "learning_rate": 4.999870130821818e-05, "loss": 0.0095, "num_input_tokens_seen": 13643584, "step": 6310 }, { "epoch": 1.030179445350734, "grad_norm": 0.10111640393733978, "learning_rate": 4.9998627742973025e-05, "loss": 0.3219, "num_input_tokens_seen": 13655488, "step": 6315 }, { "epoch": 1.030995106035889, "grad_norm": 2.653560161590576, "learning_rate": 4.9998552151212276e-05, "loss": 0.1615, "num_input_tokens_seen": 13666368, "step": 6320 }, { "epoch": 1.031810766721044, "grad_norm": 1.1174497604370117, "learning_rate": 4.999847453294204e-05, "loss": 0.2452, "num_input_tokens_seen": 13678048, "step": 6325 }, { "epoch": 1.032626427406199, "grad_norm": 2.7836174964904785, "learning_rate": 4.999839488816861e-05, "loss": 0.0305, "num_input_tokens_seen": 13689024, "step": 6330 }, { "epoch": 1.033442088091354, "grad_norm": 0.1570463925600052, "learning_rate": 4.9998313216898454e-05, "loss": 0.1406, "num_input_tokens_seen": 13699744, "step": 6335 }, { "epoch": 1.034257748776509, "grad_norm": 0.2226400524377823, "learning_rate": 4.99982295191382e-05, "loss": 0.0638, "num_input_tokens_seen": 13710016, "step": 6340 }, { "epoch": 1.035073409461664, "grad_norm": 2.299910306930542, "learning_rate": 4.99981437948946e-05, "loss": 0.1252, "num_input_tokens_seen": 13721664, "step": 6345 }, { "epoch": 1.0358890701468189, "grad_norm": 3.804659605026245, "learning_rate": 4.999805604417464e-05, "loss": 0.0408, "num_input_tokens_seen": 13734752, "step": 6350 }, { "epoch": 1.036704730831974, "grad_norm": 0.15337339043617249, "learning_rate": 4.999796626698542e-05, "loss": 0.0818, "num_input_tokens_seen": 13745504, "step": 6355 }, { "epoch": 1.037520391517129, "grad_norm": 6.057324409484863, "learning_rate": 4.999787446333421e-05, "loss": 0.1707, "num_input_tokens_seen": 13755968, "step": 6360 }, { "epoch": 1.0383360522022838, "grad_norm": 3.1725149154663086, "learning_rate": 4.999778063322846e-05, "loss": 0.193, "num_input_tokens_seen": 13767296, "step": 6365 }, { "epoch": 1.0391517128874388, "grad_norm": 0.1678391546010971, "learning_rate": 4.9997684776675775e-05, "loss": 0.2717, "num_input_tokens_seen": 13778656, "step": 6370 }, { "epoch": 1.0399673735725938, "grad_norm": 6.550046920776367, "learning_rate": 4.999758689368392e-05, "loss": 0.1537, "num_input_tokens_seen": 13789216, "step": 6375 }, { "epoch": 1.0407830342577489, "grad_norm": 4.02315616607666, "learning_rate": 4.999748698426084e-05, "loss": 0.0584, "num_input_tokens_seen": 13799680, "step": 6380 }, { "epoch": 1.0415986949429037, "grad_norm": 0.07721056789159775, "learning_rate": 4.9997385048414624e-05, "loss": 0.1017, "num_input_tokens_seen": 13810720, "step": 6385 }, { "epoch": 1.0424143556280587, "grad_norm": 0.5864396691322327, "learning_rate": 4.999728108615355e-05, "loss": 0.1158, "num_input_tokens_seen": 13821856, "step": 6390 }, { "epoch": 1.0432300163132138, "grad_norm": 0.8936943411827087, "learning_rate": 4.9997175097486026e-05, "loss": 0.2071, "num_input_tokens_seen": 13833984, "step": 6395 }, { "epoch": 1.0440456769983686, "grad_norm": 3.2895848751068115, "learning_rate": 4.9997067082420655e-05, "loss": 0.3046, "num_input_tokens_seen": 13844928, "step": 6400 }, { "epoch": 1.0448613376835236, "grad_norm": 7.387818813323975, "learning_rate": 4.9996957040966205e-05, "loss": 0.1894, "num_input_tokens_seen": 13856384, "step": 6405 }, { "epoch": 1.0456769983686787, "grad_norm": 0.20572587847709656, "learning_rate": 4.999684497313157e-05, "loss": 0.1028, "num_input_tokens_seen": 13867456, "step": 6410 }, { "epoch": 1.0464926590538337, "grad_norm": 1.095292329788208, "learning_rate": 4.9996730878925856e-05, "loss": 0.0209, "num_input_tokens_seen": 13877216, "step": 6415 }, { "epoch": 1.0473083197389885, "grad_norm": 4.3961181640625, "learning_rate": 4.99966147583583e-05, "loss": 0.3864, "num_input_tokens_seen": 13888192, "step": 6420 }, { "epoch": 1.0481239804241436, "grad_norm": 0.32123053073883057, "learning_rate": 4.9996496611438326e-05, "loss": 0.0085, "num_input_tokens_seen": 13899072, "step": 6425 }, { "epoch": 1.0489396411092986, "grad_norm": 7.540583610534668, "learning_rate": 4.99963764381755e-05, "loss": 0.2563, "num_input_tokens_seen": 13909792, "step": 6430 }, { "epoch": 1.0497553017944534, "grad_norm": 0.44093719124794006, "learning_rate": 4.999625423857958e-05, "loss": 0.0607, "num_input_tokens_seen": 13920800, "step": 6435 }, { "epoch": 1.0505709624796085, "grad_norm": 1.7876123189926147, "learning_rate": 4.999613001266045e-05, "loss": 0.0533, "num_input_tokens_seen": 13930976, "step": 6440 }, { "epoch": 1.0513866231647635, "grad_norm": 0.38826173543930054, "learning_rate": 4.999600376042819e-05, "loss": 0.0381, "num_input_tokens_seen": 13942080, "step": 6445 }, { "epoch": 1.0522022838499185, "grad_norm": 5.167582035064697, "learning_rate": 4.999587548189305e-05, "loss": 0.0599, "num_input_tokens_seen": 13952736, "step": 6450 }, { "epoch": 1.0530179445350734, "grad_norm": 3.2204172611236572, "learning_rate": 4.99957451770654e-05, "loss": 0.1036, "num_input_tokens_seen": 13962400, "step": 6455 }, { "epoch": 1.0538336052202284, "grad_norm": 0.04041092470288277, "learning_rate": 4.999561284595583e-05, "loss": 0.0162, "num_input_tokens_seen": 13974304, "step": 6460 }, { "epoch": 1.0546492659053834, "grad_norm": 3.1175785064697266, "learning_rate": 4.9995478488575054e-05, "loss": 0.1, "num_input_tokens_seen": 13985248, "step": 6465 }, { "epoch": 1.0554649265905383, "grad_norm": 5.50757360458374, "learning_rate": 4.999534210493396e-05, "loss": 0.0569, "num_input_tokens_seen": 13995840, "step": 6470 }, { "epoch": 1.0562805872756933, "grad_norm": 0.15792645514011383, "learning_rate": 4.999520369504362e-05, "loss": 0.1343, "num_input_tokens_seen": 14007104, "step": 6475 }, { "epoch": 1.0570962479608483, "grad_norm": 4.697031497955322, "learning_rate": 4.9995063258915235e-05, "loss": 0.0593, "num_input_tokens_seen": 14018144, "step": 6480 }, { "epoch": 1.0579119086460032, "grad_norm": 1.4745863676071167, "learning_rate": 4.9994920796560205e-05, "loss": 0.0781, "num_input_tokens_seen": 14029024, "step": 6485 }, { "epoch": 1.0587275693311582, "grad_norm": 0.05364038795232773, "learning_rate": 4.999477630799007e-05, "loss": 0.1515, "num_input_tokens_seen": 14040480, "step": 6490 }, { "epoch": 1.0595432300163132, "grad_norm": 0.04595933482050896, "learning_rate": 4.999462979321654e-05, "loss": 0.0023, "num_input_tokens_seen": 14050976, "step": 6495 }, { "epoch": 1.0603588907014683, "grad_norm": 1.792112112045288, "learning_rate": 4.9994481252251506e-05, "loss": 0.0109, "num_input_tokens_seen": 14060352, "step": 6500 }, { "epoch": 1.061174551386623, "grad_norm": 19.34042739868164, "learning_rate": 4.999433068510699e-05, "loss": 0.1274, "num_input_tokens_seen": 14071680, "step": 6505 }, { "epoch": 1.0619902120717781, "grad_norm": 2.285905599594116, "learning_rate": 4.999417809179523e-05, "loss": 0.0847, "num_input_tokens_seen": 14083296, "step": 6510 }, { "epoch": 1.0628058727569332, "grad_norm": 5.264870643615723, "learning_rate": 4.9994023472328555e-05, "loss": 0.1433, "num_input_tokens_seen": 14094752, "step": 6515 }, { "epoch": 1.0636215334420882, "grad_norm": 7.430415630340576, "learning_rate": 4.999386682671953e-05, "loss": 0.372, "num_input_tokens_seen": 14105088, "step": 6520 }, { "epoch": 1.064437194127243, "grad_norm": 5.810910224914551, "learning_rate": 4.9993708154980836e-05, "loss": 0.1906, "num_input_tokens_seen": 14114976, "step": 6525 }, { "epoch": 1.065252854812398, "grad_norm": 4.199947834014893, "learning_rate": 4.999354745712534e-05, "loss": 0.2266, "num_input_tokens_seen": 14126592, "step": 6530 }, { "epoch": 1.066068515497553, "grad_norm": 0.7832189798355103, "learning_rate": 4.999338473316607e-05, "loss": 0.1949, "num_input_tokens_seen": 14138048, "step": 6535 }, { "epoch": 1.066884176182708, "grad_norm": 0.1015256941318512, "learning_rate": 4.9993219983116223e-05, "loss": 0.0962, "num_input_tokens_seen": 14148128, "step": 6540 }, { "epoch": 1.067699836867863, "grad_norm": 3.800353765487671, "learning_rate": 4.999305320698915e-05, "loss": 0.1037, "num_input_tokens_seen": 14160096, "step": 6545 }, { "epoch": 1.068515497553018, "grad_norm": 0.3930117189884186, "learning_rate": 4.999288440479837e-05, "loss": 0.0725, "num_input_tokens_seen": 14170752, "step": 6550 }, { "epoch": 1.0693311582381728, "grad_norm": 0.09467582404613495, "learning_rate": 4.999271357655757e-05, "loss": 0.3594, "num_input_tokens_seen": 14181952, "step": 6555 }, { "epoch": 1.0701468189233279, "grad_norm": 0.18013601005077362, "learning_rate": 4.999254072228059e-05, "loss": 0.1734, "num_input_tokens_seen": 14192384, "step": 6560 }, { "epoch": 1.070962479608483, "grad_norm": 0.04747767001390457, "learning_rate": 4.9992365841981456e-05, "loss": 0.0124, "num_input_tokens_seen": 14204384, "step": 6565 }, { "epoch": 1.071778140293638, "grad_norm": 0.08544167876243591, "learning_rate": 4.9992188935674335e-05, "loss": 0.1268, "num_input_tokens_seen": 14215776, "step": 6570 }, { "epoch": 1.0725938009787928, "grad_norm": 0.8479664921760559, "learning_rate": 4.999201000337356e-05, "loss": 0.1506, "num_input_tokens_seen": 14227520, "step": 6575 }, { "epoch": 1.0734094616639478, "grad_norm": 0.21483114361763, "learning_rate": 4.999182904509366e-05, "loss": 0.2186, "num_input_tokens_seen": 14238976, "step": 6580 }, { "epoch": 1.0742251223491028, "grad_norm": 0.05625781789422035, "learning_rate": 4.9991646060849285e-05, "loss": 0.1296, "num_input_tokens_seen": 14249344, "step": 6585 }, { "epoch": 1.0750407830342577, "grad_norm": 0.15453799068927765, "learning_rate": 4.9991461050655264e-05, "loss": 0.0069, "num_input_tokens_seen": 14259840, "step": 6590 }, { "epoch": 1.0758564437194127, "grad_norm": 0.21485309302806854, "learning_rate": 4.999127401452662e-05, "loss": 0.2197, "num_input_tokens_seen": 14271456, "step": 6595 }, { "epoch": 1.0766721044045677, "grad_norm": 4.267156600952148, "learning_rate": 4.999108495247849e-05, "loss": 0.1725, "num_input_tokens_seen": 14282688, "step": 6600 }, { "epoch": 1.0774877650897228, "grad_norm": 4.1854681968688965, "learning_rate": 4.9990893864526214e-05, "loss": 0.1068, "num_input_tokens_seen": 14293920, "step": 6605 }, { "epoch": 1.0783034257748776, "grad_norm": 2.9711430072784424, "learning_rate": 4.9990700750685276e-05, "loss": 0.0176, "num_input_tokens_seen": 14304512, "step": 6610 }, { "epoch": 1.0791190864600326, "grad_norm": 1.481642246246338, "learning_rate": 4.999050561097134e-05, "loss": 0.0216, "num_input_tokens_seen": 14315616, "step": 6615 }, { "epoch": 1.0799347471451877, "grad_norm": 1.1094454526901245, "learning_rate": 4.999030844540021e-05, "loss": 0.0354, "num_input_tokens_seen": 14326400, "step": 6620 }, { "epoch": 1.0807504078303425, "grad_norm": 0.04047788307070732, "learning_rate": 4.999010925398788e-05, "loss": 0.2067, "num_input_tokens_seen": 14336288, "step": 6625 }, { "epoch": 1.0815660685154975, "grad_norm": 0.31624218821525574, "learning_rate": 4.99899080367505e-05, "loss": 0.1622, "num_input_tokens_seen": 14347616, "step": 6630 }, { "epoch": 1.0823817292006526, "grad_norm": 0.08543675392866135, "learning_rate": 4.9989704793704374e-05, "loss": 0.1119, "num_input_tokens_seen": 14357664, "step": 6635 }, { "epoch": 1.0831973898858076, "grad_norm": 5.039618968963623, "learning_rate": 4.998949952486598e-05, "loss": 0.4289, "num_input_tokens_seen": 14367232, "step": 6640 }, { "epoch": 1.0840130505709624, "grad_norm": 0.1791653037071228, "learning_rate": 4.998929223025196e-05, "loss": 0.1644, "num_input_tokens_seen": 14378048, "step": 6645 }, { "epoch": 1.0848287112561175, "grad_norm": 1.6573693752288818, "learning_rate": 4.998908290987913e-05, "loss": 0.1897, "num_input_tokens_seen": 14389248, "step": 6650 }, { "epoch": 1.0856443719412725, "grad_norm": 0.6262145042419434, "learning_rate": 4.998887156376443e-05, "loss": 0.0761, "num_input_tokens_seen": 14399616, "step": 6655 }, { "epoch": 1.0864600326264273, "grad_norm": 7.90919828414917, "learning_rate": 4.998865819192501e-05, "loss": 0.3444, "num_input_tokens_seen": 14412000, "step": 6660 }, { "epoch": 1.0872756933115824, "grad_norm": 0.22059565782546997, "learning_rate": 4.9988442794378166e-05, "loss": 0.2168, "num_input_tokens_seen": 14423392, "step": 6665 }, { "epoch": 1.0880913539967374, "grad_norm": 2.1804144382476807, "learning_rate": 4.998822537114136e-05, "loss": 0.2435, "num_input_tokens_seen": 14433792, "step": 6670 }, { "epoch": 1.0889070146818924, "grad_norm": 0.1316068172454834, "learning_rate": 4.998800592223222e-05, "loss": 0.0807, "num_input_tokens_seen": 14445152, "step": 6675 }, { "epoch": 1.0897226753670473, "grad_norm": 0.2838009297847748, "learning_rate": 4.9987784447668526e-05, "loss": 0.125, "num_input_tokens_seen": 14455968, "step": 6680 }, { "epoch": 1.0905383360522023, "grad_norm": 0.4855414032936096, "learning_rate": 4.9987560947468245e-05, "loss": 0.047, "num_input_tokens_seen": 14465824, "step": 6685 }, { "epoch": 1.0913539967373573, "grad_norm": 6.95814847946167, "learning_rate": 4.998733542164948e-05, "loss": 0.2047, "num_input_tokens_seen": 14476480, "step": 6690 }, { "epoch": 1.0921696574225122, "grad_norm": 0.47371330857276917, "learning_rate": 4.998710787023053e-05, "loss": 0.1393, "num_input_tokens_seen": 14486144, "step": 6695 }, { "epoch": 1.0929853181076672, "grad_norm": 1.3137682676315308, "learning_rate": 4.998687829322983e-05, "loss": 0.2674, "num_input_tokens_seen": 14497664, "step": 6700 }, { "epoch": 1.0938009787928222, "grad_norm": 4.352360725402832, "learning_rate": 4.9986646690665996e-05, "loss": 0.1401, "num_input_tokens_seen": 14509888, "step": 6705 }, { "epoch": 1.094616639477977, "grad_norm": 1.3058325052261353, "learning_rate": 4.998641306255779e-05, "loss": 0.1986, "num_input_tokens_seen": 14521664, "step": 6710 }, { "epoch": 1.095432300163132, "grad_norm": 1.6888068914413452, "learning_rate": 4.998617740892417e-05, "loss": 0.1728, "num_input_tokens_seen": 14532768, "step": 6715 }, { "epoch": 1.0962479608482871, "grad_norm": 1.2989392280578613, "learning_rate": 4.998593972978423e-05, "loss": 0.1047, "num_input_tokens_seen": 14544160, "step": 6720 }, { "epoch": 1.0970636215334422, "grad_norm": 0.5585448145866394, "learning_rate": 4.9985700025157236e-05, "loss": 0.0322, "num_input_tokens_seen": 14554112, "step": 6725 }, { "epoch": 1.097879282218597, "grad_norm": 1.6414457559585571, "learning_rate": 4.998545829506263e-05, "loss": 0.0519, "num_input_tokens_seen": 14564096, "step": 6730 }, { "epoch": 1.098694942903752, "grad_norm": 0.15382681787014008, "learning_rate": 4.998521453951999e-05, "loss": 0.1204, "num_input_tokens_seen": 14574592, "step": 6735 }, { "epoch": 1.099510603588907, "grad_norm": 0.043996699154376984, "learning_rate": 4.998496875854908e-05, "loss": 0.015, "num_input_tokens_seen": 14585728, "step": 6740 }, { "epoch": 1.100326264274062, "grad_norm": 2.8113901615142822, "learning_rate": 4.998472095216984e-05, "loss": 0.2483, "num_input_tokens_seen": 14594464, "step": 6745 }, { "epoch": 1.101141924959217, "grad_norm": 3.563047409057617, "learning_rate": 4.998447112040235e-05, "loss": 0.1717, "num_input_tokens_seen": 14604480, "step": 6750 }, { "epoch": 1.101957585644372, "grad_norm": 3.7106142044067383, "learning_rate": 4.998421926326685e-05, "loss": 0.0621, "num_input_tokens_seen": 14616128, "step": 6755 }, { "epoch": 1.102773246329527, "grad_norm": 0.08062312752008438, "learning_rate": 4.998396538078378e-05, "loss": 0.0147, "num_input_tokens_seen": 14627008, "step": 6760 }, { "epoch": 1.1035889070146818, "grad_norm": 1.1190317869186401, "learning_rate": 4.99837094729737e-05, "loss": 0.2473, "num_input_tokens_seen": 14637280, "step": 6765 }, { "epoch": 1.1044045676998369, "grad_norm": 5.4377264976501465, "learning_rate": 4.998345153985738e-05, "loss": 0.1444, "num_input_tokens_seen": 14646976, "step": 6770 }, { "epoch": 1.105220228384992, "grad_norm": 19.340038299560547, "learning_rate": 4.998319158145569e-05, "loss": 0.0803, "num_input_tokens_seen": 14657152, "step": 6775 }, { "epoch": 1.1060358890701467, "grad_norm": 0.9310303926467896, "learning_rate": 4.998292959778974e-05, "loss": 0.3306, "num_input_tokens_seen": 14667488, "step": 6780 }, { "epoch": 1.1068515497553018, "grad_norm": 7.0268168449401855, "learning_rate": 4.9982665588880753e-05, "loss": 0.2986, "num_input_tokens_seen": 14678112, "step": 6785 }, { "epoch": 1.1076672104404568, "grad_norm": 0.04533948376774788, "learning_rate": 4.9982399554750136e-05, "loss": 0.0561, "num_input_tokens_seen": 14689312, "step": 6790 }, { "epoch": 1.1084828711256118, "grad_norm": 0.22335876524448395, "learning_rate": 4.998213149541945e-05, "loss": 0.3764, "num_input_tokens_seen": 14700032, "step": 6795 }, { "epoch": 1.1092985318107667, "grad_norm": 2.8416543006896973, "learning_rate": 4.9981861410910424e-05, "loss": 0.2424, "num_input_tokens_seen": 14711008, "step": 6800 }, { "epoch": 1.1101141924959217, "grad_norm": 0.06697509437799454, "learning_rate": 4.9981589301244956e-05, "loss": 0.0252, "num_input_tokens_seen": 14720448, "step": 6805 }, { "epoch": 1.1109298531810767, "grad_norm": 0.13166627287864685, "learning_rate": 4.99813151664451e-05, "loss": 0.0869, "num_input_tokens_seen": 14732128, "step": 6810 }, { "epoch": 1.1117455138662315, "grad_norm": 0.06577765941619873, "learning_rate": 4.998103900653309e-05, "loss": 0.1219, "num_input_tokens_seen": 14743200, "step": 6815 }, { "epoch": 1.1125611745513866, "grad_norm": 4.53542947769165, "learning_rate": 4.9980760821531304e-05, "loss": 0.3091, "num_input_tokens_seen": 14754464, "step": 6820 }, { "epoch": 1.1133768352365416, "grad_norm": 3.502213716506958, "learning_rate": 4.998048061146229e-05, "loss": 0.2777, "num_input_tokens_seen": 14763840, "step": 6825 }, { "epoch": 1.1141924959216967, "grad_norm": 0.13414250314235687, "learning_rate": 4.9980198376348774e-05, "loss": 0.0437, "num_input_tokens_seen": 14774304, "step": 6830 }, { "epoch": 1.1150081566068515, "grad_norm": 6.36775541305542, "learning_rate": 4.997991411621362e-05, "loss": 0.0415, "num_input_tokens_seen": 14783456, "step": 6835 }, { "epoch": 1.1158238172920065, "grad_norm": 0.055438391864299774, "learning_rate": 4.9979627831079894e-05, "loss": 0.0065, "num_input_tokens_seen": 14793664, "step": 6840 }, { "epoch": 1.1166394779771616, "grad_norm": 2.260453462600708, "learning_rate": 4.997933952097078e-05, "loss": 0.0955, "num_input_tokens_seen": 14804256, "step": 6845 }, { "epoch": 1.1174551386623164, "grad_norm": 0.5315571427345276, "learning_rate": 4.997904918590966e-05, "loss": 0.2423, "num_input_tokens_seen": 14815744, "step": 6850 }, { "epoch": 1.1182707993474714, "grad_norm": 0.12336158007383347, "learning_rate": 4.997875682592008e-05, "loss": 0.0575, "num_input_tokens_seen": 14826304, "step": 6855 }, { "epoch": 1.1190864600326265, "grad_norm": 0.6235188841819763, "learning_rate": 4.997846244102573e-05, "loss": 0.2058, "num_input_tokens_seen": 14838016, "step": 6860 }, { "epoch": 1.1199021207177815, "grad_norm": 2.3068888187408447, "learning_rate": 4.997816603125047e-05, "loss": 0.0434, "num_input_tokens_seen": 14849344, "step": 6865 }, { "epoch": 1.1207177814029363, "grad_norm": 3.278022527694702, "learning_rate": 4.9977867596618333e-05, "loss": 0.2902, "num_input_tokens_seen": 14860608, "step": 6870 }, { "epoch": 1.1215334420880914, "grad_norm": 5.233441352844238, "learning_rate": 4.997756713715352e-05, "loss": 0.0885, "num_input_tokens_seen": 14870976, "step": 6875 }, { "epoch": 1.1223491027732464, "grad_norm": 0.9062994718551636, "learning_rate": 4.997726465288037e-05, "loss": 0.0712, "num_input_tokens_seen": 14880960, "step": 6880 }, { "epoch": 1.1231647634584012, "grad_norm": 0.045212745666503906, "learning_rate": 4.997696014382341e-05, "loss": 0.1472, "num_input_tokens_seen": 14892256, "step": 6885 }, { "epoch": 1.1239804241435563, "grad_norm": 1.470868468284607, "learning_rate": 4.997665361000735e-05, "loss": 0.3008, "num_input_tokens_seen": 14902304, "step": 6890 }, { "epoch": 1.1247960848287113, "grad_norm": 0.13788996636867523, "learning_rate": 4.9976345051456995e-05, "loss": 0.132, "num_input_tokens_seen": 14913536, "step": 6895 }, { "epoch": 1.1256117455138663, "grad_norm": 5.891739368438721, "learning_rate": 4.99760344681974e-05, "loss": 0.2338, "num_input_tokens_seen": 14924480, "step": 6900 }, { "epoch": 1.1264274061990212, "grad_norm": 2.3570706844329834, "learning_rate": 4.997572186025371e-05, "loss": 0.1666, "num_input_tokens_seen": 14935264, "step": 6905 }, { "epoch": 1.1272430668841762, "grad_norm": 5.320467948913574, "learning_rate": 4.997540722765128e-05, "loss": 0.1228, "num_input_tokens_seen": 14947072, "step": 6910 }, { "epoch": 1.1280587275693312, "grad_norm": 2.584219217300415, "learning_rate": 4.997509057041563e-05, "loss": 0.1896, "num_input_tokens_seen": 14957888, "step": 6915 }, { "epoch": 1.128874388254486, "grad_norm": 2.8470618724823, "learning_rate": 4.99747718885724e-05, "loss": 0.2429, "num_input_tokens_seen": 14967424, "step": 6920 }, { "epoch": 1.129690048939641, "grad_norm": 2.975456953048706, "learning_rate": 4.9974451182147456e-05, "loss": 0.1532, "num_input_tokens_seen": 14977536, "step": 6925 }, { "epoch": 1.1305057096247961, "grad_norm": 0.3898094594478607, "learning_rate": 4.997412845116677e-05, "loss": 0.0593, "num_input_tokens_seen": 14987008, "step": 6930 }, { "epoch": 1.131321370309951, "grad_norm": 0.1392330378293991, "learning_rate": 4.997380369565652e-05, "loss": 0.39, "num_input_tokens_seen": 14998272, "step": 6935 }, { "epoch": 1.132137030995106, "grad_norm": 4.5014495849609375, "learning_rate": 4.9973476915643015e-05, "loss": 0.2747, "num_input_tokens_seen": 15007392, "step": 6940 }, { "epoch": 1.132952691680261, "grad_norm": 3.5496082305908203, "learning_rate": 4.997314811115277e-05, "loss": 0.312, "num_input_tokens_seen": 15016608, "step": 6945 }, { "epoch": 1.133768352365416, "grad_norm": 2.735983371734619, "learning_rate": 4.997281728221242e-05, "loss": 0.1643, "num_input_tokens_seen": 15027712, "step": 6950 }, { "epoch": 1.1345840130505709, "grad_norm": 1.7396841049194336, "learning_rate": 4.997248442884879e-05, "loss": 0.1043, "num_input_tokens_seen": 15038080, "step": 6955 }, { "epoch": 1.135399673735726, "grad_norm": 0.6531755328178406, "learning_rate": 4.997214955108887e-05, "loss": 0.1422, "num_input_tokens_seen": 15050432, "step": 6960 }, { "epoch": 1.136215334420881, "grad_norm": 0.45595988631248474, "learning_rate": 4.9971812648959796e-05, "loss": 0.1733, "num_input_tokens_seen": 15060992, "step": 6965 }, { "epoch": 1.137030995106036, "grad_norm": 1.885270595550537, "learning_rate": 4.997147372248887e-05, "loss": 0.0791, "num_input_tokens_seen": 15070816, "step": 6970 }, { "epoch": 1.1378466557911908, "grad_norm": 0.7401260137557983, "learning_rate": 4.99711327717036e-05, "loss": 0.1806, "num_input_tokens_seen": 15080128, "step": 6975 }, { "epoch": 1.1386623164763459, "grad_norm": 0.2424100637435913, "learning_rate": 4.997078979663159e-05, "loss": 0.1129, "num_input_tokens_seen": 15090464, "step": 6980 }, { "epoch": 1.139477977161501, "grad_norm": 3.2046449184417725, "learning_rate": 4.997044479730067e-05, "loss": 0.1906, "num_input_tokens_seen": 15101472, "step": 6985 }, { "epoch": 1.1402936378466557, "grad_norm": 3.129856824874878, "learning_rate": 4.997009777373879e-05, "loss": 0.1128, "num_input_tokens_seen": 15111616, "step": 6990 }, { "epoch": 1.1411092985318108, "grad_norm": 6.59491491317749, "learning_rate": 4.9969748725974085e-05, "loss": 0.2529, "num_input_tokens_seen": 15122496, "step": 6995 }, { "epoch": 1.1419249592169658, "grad_norm": 0.2636934816837311, "learning_rate": 4.996939765403486e-05, "loss": 0.1272, "num_input_tokens_seen": 15134112, "step": 7000 }, { "epoch": 1.1427406199021206, "grad_norm": 5.670815467834473, "learning_rate": 4.996904455794956e-05, "loss": 0.0784, "num_input_tokens_seen": 15144352, "step": 7005 }, { "epoch": 1.1435562805872757, "grad_norm": 0.15952295064926147, "learning_rate": 4.996868943774683e-05, "loss": 0.1361, "num_input_tokens_seen": 15154304, "step": 7010 }, { "epoch": 1.1443719412724307, "grad_norm": 1.9397414922714233, "learning_rate": 4.9968332293455433e-05, "loss": 0.0549, "num_input_tokens_seen": 15165024, "step": 7015 }, { "epoch": 1.1451876019575857, "grad_norm": 2.2744736671447754, "learning_rate": 4.996797312510433e-05, "loss": 0.1571, "num_input_tokens_seen": 15176224, "step": 7020 }, { "epoch": 1.1460032626427405, "grad_norm": 5.310516834259033, "learning_rate": 4.9967611932722645e-05, "loss": 0.1318, "num_input_tokens_seen": 15186464, "step": 7025 }, { "epoch": 1.1468189233278956, "grad_norm": 4.836538314819336, "learning_rate": 4.9967248716339656e-05, "loss": 0.2183, "num_input_tokens_seen": 15197408, "step": 7030 }, { "epoch": 1.1476345840130506, "grad_norm": 0.4939217269420624, "learning_rate": 4.9966883475984796e-05, "loss": 0.0798, "num_input_tokens_seen": 15209504, "step": 7035 }, { "epoch": 1.1484502446982057, "grad_norm": 3.923380136489868, "learning_rate": 4.996651621168768e-05, "loss": 0.2964, "num_input_tokens_seen": 15220576, "step": 7040 }, { "epoch": 1.1492659053833605, "grad_norm": 0.5404402017593384, "learning_rate": 4.9966146923478086e-05, "loss": 0.1547, "num_input_tokens_seen": 15230752, "step": 7045 }, { "epoch": 1.1500815660685155, "grad_norm": 6.2645440101623535, "learning_rate": 4.996577561138594e-05, "loss": 0.0831, "num_input_tokens_seen": 15241312, "step": 7050 }, { "epoch": 1.1508972267536706, "grad_norm": 4.551450729370117, "learning_rate": 4.996540227544136e-05, "loss": 0.1611, "num_input_tokens_seen": 15252512, "step": 7055 }, { "epoch": 1.1517128874388254, "grad_norm": 0.6982122659683228, "learning_rate": 4.9965026915674584e-05, "loss": 0.0937, "num_input_tokens_seen": 15262400, "step": 7060 }, { "epoch": 1.1525285481239804, "grad_norm": 5.603863716125488, "learning_rate": 4.9964649532116065e-05, "loss": 0.2171, "num_input_tokens_seen": 15274816, "step": 7065 }, { "epoch": 1.1533442088091355, "grad_norm": 0.09071218222379684, "learning_rate": 4.996427012479638e-05, "loss": 0.0906, "num_input_tokens_seen": 15285248, "step": 7070 }, { "epoch": 1.1541598694942903, "grad_norm": 1.8232356309890747, "learning_rate": 4.9963888693746294e-05, "loss": 0.018, "num_input_tokens_seen": 15295552, "step": 7075 }, { "epoch": 1.1549755301794453, "grad_norm": 0.33773085474967957, "learning_rate": 4.996350523899672e-05, "loss": 0.0267, "num_input_tokens_seen": 15305248, "step": 7080 }, { "epoch": 1.1557911908646004, "grad_norm": 0.06964617222547531, "learning_rate": 4.9963119760578756e-05, "loss": 0.0268, "num_input_tokens_seen": 15316576, "step": 7085 }, { "epoch": 1.1566068515497552, "grad_norm": 0.03703215718269348, "learning_rate": 4.996273225852364e-05, "loss": 0.1948, "num_input_tokens_seen": 15328256, "step": 7090 }, { "epoch": 1.1574225122349102, "grad_norm": 3.359741449356079, "learning_rate": 4.996234273286278e-05, "loss": 0.2215, "num_input_tokens_seen": 15338752, "step": 7095 }, { "epoch": 1.1582381729200653, "grad_norm": 0.04670816659927368, "learning_rate": 4.996195118362777e-05, "loss": 0.0109, "num_input_tokens_seen": 15348224, "step": 7100 }, { "epoch": 1.1590538336052203, "grad_norm": 0.11876621842384338, "learning_rate": 4.996155761085034e-05, "loss": 0.0895, "num_input_tokens_seen": 15359008, "step": 7105 }, { "epoch": 1.1598694942903751, "grad_norm": 0.1864207237958908, "learning_rate": 4.996116201456239e-05, "loss": 0.0529, "num_input_tokens_seen": 15369408, "step": 7110 }, { "epoch": 1.1606851549755302, "grad_norm": 0.23362979292869568, "learning_rate": 4.9960764394796e-05, "loss": 0.0038, "num_input_tokens_seen": 15379392, "step": 7115 }, { "epoch": 1.1615008156606852, "grad_norm": 0.9414898157119751, "learning_rate": 4.99603647515834e-05, "loss": 0.0729, "num_input_tokens_seen": 15389920, "step": 7120 }, { "epoch": 1.1623164763458402, "grad_norm": 0.13409867882728577, "learning_rate": 4.9959963084956986e-05, "loss": 0.2052, "num_input_tokens_seen": 15400896, "step": 7125 }, { "epoch": 1.163132137030995, "grad_norm": 0.18150252103805542, "learning_rate": 4.9959559394949315e-05, "loss": 0.0223, "num_input_tokens_seen": 15412320, "step": 7130 }, { "epoch": 1.16394779771615, "grad_norm": 0.06716419756412506, "learning_rate": 4.9959153681593114e-05, "loss": 0.102, "num_input_tokens_seen": 15423680, "step": 7135 }, { "epoch": 1.1647634584013051, "grad_norm": 0.2635498046875, "learning_rate": 4.9958745944921275e-05, "loss": 0.0494, "num_input_tokens_seen": 15433952, "step": 7140 }, { "epoch": 1.16557911908646, "grad_norm": 0.7040302753448486, "learning_rate": 4.995833618496685e-05, "loss": 0.1906, "num_input_tokens_seen": 15444896, "step": 7145 }, { "epoch": 1.166394779771615, "grad_norm": 0.14256872236728668, "learning_rate": 4.9957924401763065e-05, "loss": 0.0646, "num_input_tokens_seen": 15456192, "step": 7150 }, { "epoch": 1.16721044045677, "grad_norm": 3.789982557296753, "learning_rate": 4.9957510595343285e-05, "loss": 0.2344, "num_input_tokens_seen": 15465312, "step": 7155 }, { "epoch": 1.1680261011419248, "grad_norm": 0.07367774099111557, "learning_rate": 4.995709476574106e-05, "loss": 0.1398, "num_input_tokens_seen": 15474496, "step": 7160 }, { "epoch": 1.1688417618270799, "grad_norm": 5.558032989501953, "learning_rate": 4.9956676912990105e-05, "loss": 0.1262, "num_input_tokens_seen": 15484992, "step": 7165 }, { "epoch": 1.169657422512235, "grad_norm": 0.6465906500816345, "learning_rate": 4.99562570371243e-05, "loss": 0.0429, "num_input_tokens_seen": 15495232, "step": 7170 }, { "epoch": 1.17047308319739, "grad_norm": 0.06183154881000519, "learning_rate": 4.9955835138177667e-05, "loss": 0.1305, "num_input_tokens_seen": 15506304, "step": 7175 }, { "epoch": 1.1712887438825448, "grad_norm": 0.1421712338924408, "learning_rate": 4.9955411216184414e-05, "loss": 0.2294, "num_input_tokens_seen": 15517248, "step": 7180 }, { "epoch": 1.1721044045676998, "grad_norm": 0.3086666464805603, "learning_rate": 4.9954985271178903e-05, "loss": 0.0355, "num_input_tokens_seen": 15527040, "step": 7185 }, { "epoch": 1.1729200652528549, "grad_norm": 6.889306545257568, "learning_rate": 4.995455730319566e-05, "loss": 0.3019, "num_input_tokens_seen": 15538368, "step": 7190 }, { "epoch": 1.17373572593801, "grad_norm": 0.12419155240058899, "learning_rate": 4.9954127312269386e-05, "loss": 0.0041, "num_input_tokens_seen": 15548608, "step": 7195 }, { "epoch": 1.1745513866231647, "grad_norm": 0.6525631546974182, "learning_rate": 4.9953695298434944e-05, "loss": 0.2912, "num_input_tokens_seen": 15560032, "step": 7200 }, { "epoch": 1.1753670473083198, "grad_norm": 1.1164226531982422, "learning_rate": 4.9953261261727334e-05, "loss": 0.188, "num_input_tokens_seen": 15571616, "step": 7205 }, { "epoch": 1.1761827079934748, "grad_norm": 0.09184513241052628, "learning_rate": 4.9952825202181766e-05, "loss": 0.1342, "num_input_tokens_seen": 15583392, "step": 7210 }, { "epoch": 1.1769983686786296, "grad_norm": 0.11937636137008667, "learning_rate": 4.995238711983358e-05, "loss": 0.2329, "num_input_tokens_seen": 15593504, "step": 7215 }, { "epoch": 1.1778140293637847, "grad_norm": 2.529798746109009, "learning_rate": 4.995194701471828e-05, "loss": 0.0952, "num_input_tokens_seen": 15604928, "step": 7220 }, { "epoch": 1.1786296900489397, "grad_norm": 2.5741701126098633, "learning_rate": 4.9951504886871545e-05, "loss": 0.1982, "num_input_tokens_seen": 15615200, "step": 7225 }, { "epoch": 1.1794453507340945, "grad_norm": 1.6471775770187378, "learning_rate": 4.995106073632924e-05, "loss": 0.0458, "num_input_tokens_seen": 15626176, "step": 7230 }, { "epoch": 1.1802610114192496, "grad_norm": 2.7898664474487305, "learning_rate": 4.995061456312733e-05, "loss": 0.1147, "num_input_tokens_seen": 15636768, "step": 7235 }, { "epoch": 1.1810766721044046, "grad_norm": 0.9005841612815857, "learning_rate": 4.995016636730202e-05, "loss": 0.195, "num_input_tokens_seen": 15648512, "step": 7240 }, { "epoch": 1.1818923327895596, "grad_norm": 6.270269393920898, "learning_rate": 4.994971614888962e-05, "loss": 0.097, "num_input_tokens_seen": 15659328, "step": 7245 }, { "epoch": 1.1827079934747144, "grad_norm": 0.20680290460586548, "learning_rate": 4.994926390792664e-05, "loss": 0.0193, "num_input_tokens_seen": 15669920, "step": 7250 }, { "epoch": 1.1835236541598695, "grad_norm": 3.018476724624634, "learning_rate": 4.9948809644449734e-05, "loss": 0.1868, "num_input_tokens_seen": 15681152, "step": 7255 }, { "epoch": 1.1843393148450245, "grad_norm": 4.8543853759765625, "learning_rate": 4.994835335849573e-05, "loss": 0.1502, "num_input_tokens_seen": 15690912, "step": 7260 }, { "epoch": 1.1851549755301796, "grad_norm": 3.1689562797546387, "learning_rate": 4.994789505010161e-05, "loss": 0.1375, "num_input_tokens_seen": 15701216, "step": 7265 }, { "epoch": 1.1859706362153344, "grad_norm": 2.3359832763671875, "learning_rate": 4.994743471930454e-05, "loss": 0.1836, "num_input_tokens_seen": 15712000, "step": 7270 }, { "epoch": 1.1867862969004894, "grad_norm": 3.6418848037719727, "learning_rate": 4.994697236614183e-05, "loss": 0.2498, "num_input_tokens_seen": 15722848, "step": 7275 }, { "epoch": 1.1876019575856445, "grad_norm": 1.657184362411499, "learning_rate": 4.994650799065096e-05, "loss": 0.2006, "num_input_tokens_seen": 15734816, "step": 7280 }, { "epoch": 1.1884176182707993, "grad_norm": 0.7946965098381042, "learning_rate": 4.9946041592869576e-05, "loss": 0.3594, "num_input_tokens_seen": 15745664, "step": 7285 }, { "epoch": 1.1892332789559543, "grad_norm": 8.222297668457031, "learning_rate": 4.994557317283548e-05, "loss": 0.2051, "num_input_tokens_seen": 15756896, "step": 7290 }, { "epoch": 1.1900489396411094, "grad_norm": 0.49199816584587097, "learning_rate": 4.9945102730586655e-05, "loss": 0.1951, "num_input_tokens_seen": 15766656, "step": 7295 }, { "epoch": 1.1908646003262642, "grad_norm": 3.079603672027588, "learning_rate": 4.994463026616123e-05, "loss": 0.3055, "num_input_tokens_seen": 15777792, "step": 7300 }, { "epoch": 1.1916802610114192, "grad_norm": 1.5396080017089844, "learning_rate": 4.994415577959751e-05, "loss": 0.2226, "num_input_tokens_seen": 15787968, "step": 7305 }, { "epoch": 1.1924959216965743, "grad_norm": 2.956415891647339, "learning_rate": 4.9943679270933954e-05, "loss": 0.3819, "num_input_tokens_seen": 15797920, "step": 7310 }, { "epoch": 1.1933115823817293, "grad_norm": 0.6605135798454285, "learning_rate": 4.99432007402092e-05, "loss": 0.148, "num_input_tokens_seen": 15809376, "step": 7315 }, { "epoch": 1.1941272430668841, "grad_norm": 6.03127384185791, "learning_rate": 4.9942720187462025e-05, "loss": 0.1243, "num_input_tokens_seen": 15821408, "step": 7320 }, { "epoch": 1.1949429037520392, "grad_norm": 0.9249451160430908, "learning_rate": 4.9942237612731395e-05, "loss": 0.1065, "num_input_tokens_seen": 15830816, "step": 7325 }, { "epoch": 1.1957585644371942, "grad_norm": 0.7879658341407776, "learning_rate": 4.994175301605644e-05, "loss": 0.0357, "num_input_tokens_seen": 15841792, "step": 7330 }, { "epoch": 1.196574225122349, "grad_norm": 0.23040802776813507, "learning_rate": 4.9941266397476414e-05, "loss": 0.2713, "num_input_tokens_seen": 15852768, "step": 7335 }, { "epoch": 1.197389885807504, "grad_norm": 0.11664178222417831, "learning_rate": 4.9940777757030796e-05, "loss": 0.055, "num_input_tokens_seen": 15863680, "step": 7340 }, { "epoch": 1.198205546492659, "grad_norm": 5.903824806213379, "learning_rate": 4.994028709475917e-05, "loss": 0.1763, "num_input_tokens_seen": 15874976, "step": 7345 }, { "epoch": 1.1990212071778141, "grad_norm": 0.06786380708217621, "learning_rate": 4.993979441070135e-05, "loss": 0.0218, "num_input_tokens_seen": 15885408, "step": 7350 }, { "epoch": 1.199836867862969, "grad_norm": 6.23204231262207, "learning_rate": 4.9939299704897236e-05, "loss": 0.2324, "num_input_tokens_seen": 15895168, "step": 7355 }, { "epoch": 1.200652528548124, "grad_norm": 3.4383184909820557, "learning_rate": 4.993880297738694e-05, "loss": 0.1526, "num_input_tokens_seen": 15906240, "step": 7360 }, { "epoch": 1.201468189233279, "grad_norm": 4.5740580558776855, "learning_rate": 4.9938304228210754e-05, "loss": 0.1911, "num_input_tokens_seen": 15917024, "step": 7365 }, { "epoch": 1.2022838499184338, "grad_norm": 6.418374538421631, "learning_rate": 4.9937803457409084e-05, "loss": 0.4276, "num_input_tokens_seen": 15926496, "step": 7370 }, { "epoch": 1.2030995106035889, "grad_norm": 4.022270679473877, "learning_rate": 4.9937300665022535e-05, "loss": 0.1046, "num_input_tokens_seen": 15937600, "step": 7375 }, { "epoch": 1.203915171288744, "grad_norm": 3.2668960094451904, "learning_rate": 4.9936795851091854e-05, "loss": 0.1221, "num_input_tokens_seen": 15948064, "step": 7380 }, { "epoch": 1.2047308319738987, "grad_norm": 0.07322722673416138, "learning_rate": 4.993628901565799e-05, "loss": 0.1286, "num_input_tokens_seen": 15958752, "step": 7385 }, { "epoch": 1.2055464926590538, "grad_norm": 6.813504695892334, "learning_rate": 4.9935780158762e-05, "loss": 0.2462, "num_input_tokens_seen": 15970080, "step": 7390 }, { "epoch": 1.2063621533442088, "grad_norm": 1.1422133445739746, "learning_rate": 4.993526928044515e-05, "loss": 0.0705, "num_input_tokens_seen": 15982080, "step": 7395 }, { "epoch": 1.2071778140293639, "grad_norm": 0.5997249484062195, "learning_rate": 4.9934756380748846e-05, "loss": 0.0465, "num_input_tokens_seen": 15993056, "step": 7400 }, { "epoch": 1.2079934747145187, "grad_norm": 3.280303478240967, "learning_rate": 4.993424145971468e-05, "loss": 0.1448, "num_input_tokens_seen": 16003584, "step": 7405 }, { "epoch": 1.2088091353996737, "grad_norm": 2.7594776153564453, "learning_rate": 4.993372451738439e-05, "loss": 0.2384, "num_input_tokens_seen": 16014912, "step": 7410 }, { "epoch": 1.2096247960848288, "grad_norm": 0.6693934202194214, "learning_rate": 4.993320555379987e-05, "loss": 0.0899, "num_input_tokens_seen": 16026304, "step": 7415 }, { "epoch": 1.2104404567699838, "grad_norm": 1.054826021194458, "learning_rate": 4.9932684569003205e-05, "loss": 0.0551, "num_input_tokens_seen": 16038464, "step": 7420 }, { "epoch": 1.2112561174551386, "grad_norm": 5.335202217102051, "learning_rate": 4.993216156303662e-05, "loss": 0.1442, "num_input_tokens_seen": 16049728, "step": 7425 }, { "epoch": 1.2120717781402937, "grad_norm": 3.579210042953491, "learning_rate": 4.9931636535942506e-05, "loss": 0.3986, "num_input_tokens_seen": 16060032, "step": 7430 }, { "epoch": 1.2128874388254487, "grad_norm": 4.611842632293701, "learning_rate": 4.993110948776344e-05, "loss": 0.1536, "num_input_tokens_seen": 16072064, "step": 7435 }, { "epoch": 1.2137030995106035, "grad_norm": 0.4529060125350952, "learning_rate": 4.993058041854214e-05, "loss": 0.1385, "num_input_tokens_seen": 16082112, "step": 7440 }, { "epoch": 1.2145187601957586, "grad_norm": 0.110162153840065, "learning_rate": 4.9930049328321495e-05, "loss": 0.2334, "num_input_tokens_seen": 16093280, "step": 7445 }, { "epoch": 1.2153344208809136, "grad_norm": 4.0999369621276855, "learning_rate": 4.9929516217144554e-05, "loss": 0.2506, "num_input_tokens_seen": 16104352, "step": 7450 }, { "epoch": 1.2161500815660684, "grad_norm": 0.1871415227651596, "learning_rate": 4.992898108505454e-05, "loss": 0.1421, "num_input_tokens_seen": 16114784, "step": 7455 }, { "epoch": 1.2169657422512234, "grad_norm": 1.4177812337875366, "learning_rate": 4.992844393209483e-05, "loss": 0.0688, "num_input_tokens_seen": 16126080, "step": 7460 }, { "epoch": 1.2177814029363785, "grad_norm": 1.9091269969940186, "learning_rate": 4.992790475830896e-05, "loss": 0.2458, "num_input_tokens_seen": 16136480, "step": 7465 }, { "epoch": 1.2185970636215335, "grad_norm": 2.707463026046753, "learning_rate": 4.992736356374066e-05, "loss": 0.3477, "num_input_tokens_seen": 16147424, "step": 7470 }, { "epoch": 1.2194127243066883, "grad_norm": 0.41100895404815674, "learning_rate": 4.992682034843379e-05, "loss": 0.2539, "num_input_tokens_seen": 16157280, "step": 7475 }, { "epoch": 1.2202283849918434, "grad_norm": 4.53908634185791, "learning_rate": 4.992627511243238e-05, "loss": 0.1339, "num_input_tokens_seen": 16168320, "step": 7480 }, { "epoch": 1.2210440456769984, "grad_norm": 0.13700874149799347, "learning_rate": 4.992572785578063e-05, "loss": 0.076, "num_input_tokens_seen": 16178976, "step": 7485 }, { "epoch": 1.2218597063621535, "grad_norm": 0.1897386610507965, "learning_rate": 4.9925178578522914e-05, "loss": 0.1635, "num_input_tokens_seen": 16190208, "step": 7490 }, { "epoch": 1.2226753670473083, "grad_norm": 1.5856431722640991, "learning_rate": 4.992462728070375e-05, "loss": 0.2558, "num_input_tokens_seen": 16199488, "step": 7495 }, { "epoch": 1.2234910277324633, "grad_norm": 0.24407242238521576, "learning_rate": 4.992407396236784e-05, "loss": 0.2119, "num_input_tokens_seen": 16210432, "step": 7500 }, { "epoch": 1.2243066884176184, "grad_norm": 0.5603540539741516, "learning_rate": 4.992351862356003e-05, "loss": 0.0716, "num_input_tokens_seen": 16221664, "step": 7505 }, { "epoch": 1.2251223491027732, "grad_norm": 0.23095354437828064, "learning_rate": 4.992296126432533e-05, "loss": 0.1511, "num_input_tokens_seen": 16232064, "step": 7510 }, { "epoch": 1.2259380097879282, "grad_norm": 0.13533274829387665, "learning_rate": 4.992240188470894e-05, "loss": 0.0716, "num_input_tokens_seen": 16242976, "step": 7515 }, { "epoch": 1.2267536704730833, "grad_norm": 0.14358443021774292, "learning_rate": 4.99218404847562e-05, "loss": 0.1667, "num_input_tokens_seen": 16253440, "step": 7520 }, { "epoch": 1.227569331158238, "grad_norm": 1.9528138637542725, "learning_rate": 4.9921277064512614e-05, "loss": 0.0796, "num_input_tokens_seen": 16263680, "step": 7525 }, { "epoch": 1.2283849918433931, "grad_norm": 0.19445714354515076, "learning_rate": 4.992071162402386e-05, "loss": 0.2385, "num_input_tokens_seen": 16274944, "step": 7530 }, { "epoch": 1.2292006525285482, "grad_norm": 1.89347505569458, "learning_rate": 4.992014416333577e-05, "loss": 0.0619, "num_input_tokens_seen": 16285376, "step": 7535 }, { "epoch": 1.2300163132137032, "grad_norm": 2.8959848880767822, "learning_rate": 4.991957468249436e-05, "loss": 0.0882, "num_input_tokens_seen": 16296544, "step": 7540 }, { "epoch": 1.230831973898858, "grad_norm": 0.16721372306346893, "learning_rate": 4.991900318154578e-05, "loss": 0.2256, "num_input_tokens_seen": 16306752, "step": 7545 }, { "epoch": 1.231647634584013, "grad_norm": 4.97653341293335, "learning_rate": 4.991842966053637e-05, "loss": 0.1995, "num_input_tokens_seen": 16317536, "step": 7550 }, { "epoch": 1.232463295269168, "grad_norm": 7.182518482208252, "learning_rate": 4.991785411951261e-05, "loss": 0.17, "num_input_tokens_seen": 16328640, "step": 7555 }, { "epoch": 1.233278955954323, "grad_norm": 3.598841428756714, "learning_rate": 4.9917276558521164e-05, "loss": 0.3151, "num_input_tokens_seen": 16340032, "step": 7560 }, { "epoch": 1.234094616639478, "grad_norm": 3.606041193008423, "learning_rate": 4.9916696977608855e-05, "loss": 0.2259, "num_input_tokens_seen": 16351328, "step": 7565 }, { "epoch": 1.234910277324633, "grad_norm": 0.5559537410736084, "learning_rate": 4.991611537682266e-05, "loss": 0.1566, "num_input_tokens_seen": 16360832, "step": 7570 }, { "epoch": 1.235725938009788, "grad_norm": 0.22433821856975555, "learning_rate": 4.991553175620973e-05, "loss": 0.0131, "num_input_tokens_seen": 16372640, "step": 7575 }, { "epoch": 1.2365415986949428, "grad_norm": 0.6391193270683289, "learning_rate": 4.991494611581738e-05, "loss": 0.0338, "num_input_tokens_seen": 16383360, "step": 7580 }, { "epoch": 1.2373572593800979, "grad_norm": 0.726963222026825, "learning_rate": 4.9914358455693076e-05, "loss": 0.1841, "num_input_tokens_seen": 16393728, "step": 7585 }, { "epoch": 1.238172920065253, "grad_norm": 4.856930255889893, "learning_rate": 4.991376877588446e-05, "loss": 0.269, "num_input_tokens_seen": 16405440, "step": 7590 }, { "epoch": 1.2389885807504077, "grad_norm": 3.5513672828674316, "learning_rate": 4.991317707643934e-05, "loss": 0.22, "num_input_tokens_seen": 16416672, "step": 7595 }, { "epoch": 1.2398042414355628, "grad_norm": 3.371920585632324, "learning_rate": 4.991258335740568e-05, "loss": 0.2127, "num_input_tokens_seen": 16427136, "step": 7600 }, { "epoch": 1.2406199021207178, "grad_norm": 1.6889550685882568, "learning_rate": 4.99119876188316e-05, "loss": 0.109, "num_input_tokens_seen": 16437984, "step": 7605 }, { "epoch": 1.2414355628058726, "grad_norm": 1.0969815254211426, "learning_rate": 4.9911389860765406e-05, "loss": 0.1772, "num_input_tokens_seen": 16448640, "step": 7610 }, { "epoch": 1.2422512234910277, "grad_norm": 3.5157084465026855, "learning_rate": 4.9910790083255555e-05, "loss": 0.3096, "num_input_tokens_seen": 16457856, "step": 7615 }, { "epoch": 1.2430668841761827, "grad_norm": 0.1653311550617218, "learning_rate": 4.991018828635066e-05, "loss": 0.059, "num_input_tokens_seen": 16469056, "step": 7620 }, { "epoch": 1.2438825448613378, "grad_norm": 0.17270918190479279, "learning_rate": 4.99095844700995e-05, "loss": 0.1477, "num_input_tokens_seen": 16479616, "step": 7625 }, { "epoch": 1.2446982055464926, "grad_norm": 0.31358715891838074, "learning_rate": 4.9908978634551045e-05, "loss": 0.0557, "num_input_tokens_seen": 16489952, "step": 7630 }, { "epoch": 1.2455138662316476, "grad_norm": 0.26833394169807434, "learning_rate": 4.990837077975439e-05, "loss": 0.0935, "num_input_tokens_seen": 16501152, "step": 7635 }, { "epoch": 1.2463295269168027, "grad_norm": 0.24005572497844696, "learning_rate": 4.990776090575881e-05, "loss": 0.0116, "num_input_tokens_seen": 16512608, "step": 7640 }, { "epoch": 1.2471451876019577, "grad_norm": 5.5196533203125, "learning_rate": 4.990714901261376e-05, "loss": 0.1512, "num_input_tokens_seen": 16524064, "step": 7645 }, { "epoch": 1.2479608482871125, "grad_norm": 3.4011459350585938, "learning_rate": 4.990653510036883e-05, "loss": 0.1, "num_input_tokens_seen": 16534688, "step": 7650 }, { "epoch": 1.2487765089722676, "grad_norm": 3.6264588832855225, "learning_rate": 4.99059191690738e-05, "loss": 0.0947, "num_input_tokens_seen": 16544928, "step": 7655 }, { "epoch": 1.2495921696574226, "grad_norm": 3.0403060913085938, "learning_rate": 4.9905301218778575e-05, "loss": 0.2576, "num_input_tokens_seen": 16555200, "step": 7660 }, { "epoch": 1.2504078303425774, "grad_norm": 5.403607368469238, "learning_rate": 4.990468124953328e-05, "loss": 0.2939, "num_input_tokens_seen": 16566304, "step": 7665 }, { "epoch": 1.2512234910277324, "grad_norm": 3.9963810443878174, "learning_rate": 4.990405926138815e-05, "loss": 0.3114, "num_input_tokens_seen": 16578560, "step": 7670 }, { "epoch": 1.2520391517128875, "grad_norm": 1.2811282873153687, "learning_rate": 4.9903435254393616e-05, "loss": 0.1692, "num_input_tokens_seen": 16588704, "step": 7675 }, { "epoch": 1.2528548123980423, "grad_norm": 3.366016387939453, "learning_rate": 4.990280922860026e-05, "loss": 0.05, "num_input_tokens_seen": 16599776, "step": 7680 }, { "epoch": 1.2536704730831973, "grad_norm": 0.13389497995376587, "learning_rate": 4.990218118405883e-05, "loss": 0.1158, "num_input_tokens_seen": 16609824, "step": 7685 }, { "epoch": 1.2544861337683524, "grad_norm": 4.384880542755127, "learning_rate": 4.990155112082024e-05, "loss": 0.2521, "num_input_tokens_seen": 16621280, "step": 7690 }, { "epoch": 1.2553017944535072, "grad_norm": 0.6415671110153198, "learning_rate": 4.9900919038935564e-05, "loss": 0.107, "num_input_tokens_seen": 16631776, "step": 7695 }, { "epoch": 1.2561174551386622, "grad_norm": 6.010123252868652, "learning_rate": 4.9900284938456056e-05, "loss": 0.1302, "num_input_tokens_seen": 16643584, "step": 7700 }, { "epoch": 1.2569331158238173, "grad_norm": 0.11299262195825577, "learning_rate": 4.98996488194331e-05, "loss": 0.0749, "num_input_tokens_seen": 16654432, "step": 7705 }, { "epoch": 1.2577487765089723, "grad_norm": 0.10881415754556656, "learning_rate": 4.989901068191828e-05, "loss": 0.074, "num_input_tokens_seen": 16664064, "step": 7710 }, { "epoch": 1.2585644371941274, "grad_norm": 0.6600273251533508, "learning_rate": 4.9898370525963314e-05, "loss": 0.1275, "num_input_tokens_seen": 16674432, "step": 7715 }, { "epoch": 1.2593800978792822, "grad_norm": 2.8763315677642822, "learning_rate": 4.9897728351620085e-05, "loss": 0.0988, "num_input_tokens_seen": 16685920, "step": 7720 }, { "epoch": 1.2601957585644372, "grad_norm": 3.1279098987579346, "learning_rate": 4.989708415894069e-05, "loss": 0.1397, "num_input_tokens_seen": 16696320, "step": 7725 }, { "epoch": 1.2610114192495923, "grad_norm": 0.3222804665565491, "learning_rate": 4.9896437947977306e-05, "loss": 0.0757, "num_input_tokens_seen": 16707040, "step": 7730 }, { "epoch": 1.261827079934747, "grad_norm": 3.6984167098999023, "learning_rate": 4.989578971878235e-05, "loss": 0.3237, "num_input_tokens_seen": 16718208, "step": 7735 }, { "epoch": 1.2626427406199021, "grad_norm": 3.510631561279297, "learning_rate": 4.9895139471408356e-05, "loss": 0.0439, "num_input_tokens_seen": 16728480, "step": 7740 }, { "epoch": 1.2634584013050572, "grad_norm": 0.07595990598201752, "learning_rate": 4.9894487205908044e-05, "loss": 0.1572, "num_input_tokens_seen": 16740384, "step": 7745 }, { "epoch": 1.264274061990212, "grad_norm": 3.165645122528076, "learning_rate": 4.9893832922334285e-05, "loss": 0.282, "num_input_tokens_seen": 16749888, "step": 7750 }, { "epoch": 1.265089722675367, "grad_norm": 5.2043633460998535, "learning_rate": 4.989317662074011e-05, "loss": 0.2996, "num_input_tokens_seen": 16761024, "step": 7755 }, { "epoch": 1.265905383360522, "grad_norm": 0.07771420478820801, "learning_rate": 4.989251830117874e-05, "loss": 0.0128, "num_input_tokens_seen": 16771296, "step": 7760 }, { "epoch": 1.2667210440456769, "grad_norm": 2.6050169467926025, "learning_rate": 4.9891857963703535e-05, "loss": 0.1125, "num_input_tokens_seen": 16781984, "step": 7765 }, { "epoch": 1.267536704730832, "grad_norm": 4.233067989349365, "learning_rate": 4.989119560836802e-05, "loss": 0.1228, "num_input_tokens_seen": 16793056, "step": 7770 }, { "epoch": 1.268352365415987, "grad_norm": 0.8741741180419922, "learning_rate": 4.989053123522589e-05, "loss": 0.061, "num_input_tokens_seen": 16805568, "step": 7775 }, { "epoch": 1.269168026101142, "grad_norm": 0.4571402072906494, "learning_rate": 4.988986484433101e-05, "loss": 0.0753, "num_input_tokens_seen": 16815808, "step": 7780 }, { "epoch": 1.269983686786297, "grad_norm": 0.1190493032336235, "learning_rate": 4.988919643573739e-05, "loss": 0.1496, "num_input_tokens_seen": 16826272, "step": 7785 }, { "epoch": 1.2707993474714518, "grad_norm": 12.23763370513916, "learning_rate": 4.9888526009499223e-05, "loss": 0.2057, "num_input_tokens_seen": 16836576, "step": 7790 }, { "epoch": 1.2716150081566069, "grad_norm": 0.23459982872009277, "learning_rate": 4.9887853565670854e-05, "loss": 0.0345, "num_input_tokens_seen": 16846720, "step": 7795 }, { "epoch": 1.272430668841762, "grad_norm": 0.4631684124469757, "learning_rate": 4.9887179104306796e-05, "loss": 0.104, "num_input_tokens_seen": 16856960, "step": 7800 }, { "epoch": 1.2732463295269167, "grad_norm": 3.170642614364624, "learning_rate": 4.988650262546173e-05, "loss": 0.2514, "num_input_tokens_seen": 16868512, "step": 7805 }, { "epoch": 1.2740619902120718, "grad_norm": 0.30952563881874084, "learning_rate": 4.9885824129190476e-05, "loss": 0.1314, "num_input_tokens_seen": 16879744, "step": 7810 }, { "epoch": 1.2748776508972268, "grad_norm": 2.387211799621582, "learning_rate": 4.988514361554806e-05, "loss": 0.1511, "num_input_tokens_seen": 16889440, "step": 7815 }, { "epoch": 1.2756933115823816, "grad_norm": 1.0796942710876465, "learning_rate": 4.988446108458963e-05, "loss": 0.1575, "num_input_tokens_seen": 16900768, "step": 7820 }, { "epoch": 1.2765089722675367, "grad_norm": 1.151346206665039, "learning_rate": 4.988377653637052e-05, "loss": 0.1886, "num_input_tokens_seen": 16910240, "step": 7825 }, { "epoch": 1.2773246329526917, "grad_norm": 0.6544666290283203, "learning_rate": 4.988308997094623e-05, "loss": 0.0358, "num_input_tokens_seen": 16921600, "step": 7830 }, { "epoch": 1.2781402936378465, "grad_norm": 6.242870807647705, "learning_rate": 4.988240138837241e-05, "loss": 0.1791, "num_input_tokens_seen": 16933344, "step": 7835 }, { "epoch": 1.2789559543230016, "grad_norm": 4.08312463760376, "learning_rate": 4.988171078870488e-05, "loss": 0.2494, "num_input_tokens_seen": 16944416, "step": 7840 }, { "epoch": 1.2797716150081566, "grad_norm": 5.498296737670898, "learning_rate": 4.988101817199963e-05, "loss": 0.2828, "num_input_tokens_seen": 16954816, "step": 7845 }, { "epoch": 1.2805872756933117, "grad_norm": 4.145997524261475, "learning_rate": 4.988032353831279e-05, "loss": 0.1137, "num_input_tokens_seen": 16964992, "step": 7850 }, { "epoch": 1.2814029363784667, "grad_norm": 0.0852891206741333, "learning_rate": 4.9879626887700694e-05, "loss": 0.0351, "num_input_tokens_seen": 16975200, "step": 7855 }, { "epoch": 1.2822185970636215, "grad_norm": 2.0257375240325928, "learning_rate": 4.98789282202198e-05, "loss": 0.0681, "num_input_tokens_seen": 16985984, "step": 7860 }, { "epoch": 1.2830342577487766, "grad_norm": 0.12425827234983444, "learning_rate": 4.9878227535926745e-05, "loss": 0.0915, "num_input_tokens_seen": 16997504, "step": 7865 }, { "epoch": 1.2838499184339316, "grad_norm": 0.049235470592975616, "learning_rate": 4.987752483487834e-05, "loss": 0.0288, "num_input_tokens_seen": 17007296, "step": 7870 }, { "epoch": 1.2846655791190864, "grad_norm": 3.61763072013855, "learning_rate": 4.987682011713155e-05, "loss": 0.22, "num_input_tokens_seen": 17017792, "step": 7875 }, { "epoch": 1.2854812398042414, "grad_norm": 0.8922204375267029, "learning_rate": 4.9876113382743496e-05, "loss": 0.1027, "num_input_tokens_seen": 17029472, "step": 7880 }, { "epoch": 1.2862969004893965, "grad_norm": 0.139808788895607, "learning_rate": 4.987540463177147e-05, "loss": 0.0355, "num_input_tokens_seen": 17039808, "step": 7885 }, { "epoch": 1.2871125611745513, "grad_norm": 0.26891791820526123, "learning_rate": 4.987469386427292e-05, "loss": 0.0768, "num_input_tokens_seen": 17050720, "step": 7890 }, { "epoch": 1.2879282218597063, "grad_norm": 0.3657711148262024, "learning_rate": 4.987398108030548e-05, "loss": 0.2023, "num_input_tokens_seen": 17061792, "step": 7895 }, { "epoch": 1.2887438825448614, "grad_norm": 0.6014899611473083, "learning_rate": 4.987326627992692e-05, "loss": 0.1014, "num_input_tokens_seen": 17072512, "step": 7900 }, { "epoch": 1.2895595432300162, "grad_norm": 0.4137114882469177, "learning_rate": 4.98725494631952e-05, "loss": 0.1236, "num_input_tokens_seen": 17084288, "step": 7905 }, { "epoch": 1.2903752039151712, "grad_norm": 6.638491630554199, "learning_rate": 4.9871830630168404e-05, "loss": 0.4086, "num_input_tokens_seen": 17095648, "step": 7910 }, { "epoch": 1.2911908646003263, "grad_norm": 3.11419677734375, "learning_rate": 4.987110978090482e-05, "loss": 0.1669, "num_input_tokens_seen": 17106976, "step": 7915 }, { "epoch": 1.2920065252854813, "grad_norm": 5.048390865325928, "learning_rate": 4.9870386915462894e-05, "loss": 0.0826, "num_input_tokens_seen": 17116640, "step": 7920 }, { "epoch": 1.2928221859706361, "grad_norm": 0.12708722054958344, "learning_rate": 4.986966203390121e-05, "loss": 0.2733, "num_input_tokens_seen": 17127584, "step": 7925 }, { "epoch": 1.2936378466557912, "grad_norm": 0.03831891342997551, "learning_rate": 4.986893513627853e-05, "loss": 0.0283, "num_input_tokens_seen": 17137760, "step": 7930 }, { "epoch": 1.2944535073409462, "grad_norm": 3.6218161582946777, "learning_rate": 4.9868206222653785e-05, "loss": 0.2629, "num_input_tokens_seen": 17149056, "step": 7935 }, { "epoch": 1.2952691680261013, "grad_norm": 0.09029592573642731, "learning_rate": 4.9867475293086066e-05, "loss": 0.1806, "num_input_tokens_seen": 17160096, "step": 7940 }, { "epoch": 1.296084828711256, "grad_norm": 4.290560722351074, "learning_rate": 4.9866742347634624e-05, "loss": 0.1409, "num_input_tokens_seen": 17170592, "step": 7945 }, { "epoch": 1.2969004893964111, "grad_norm": 3.9079580307006836, "learning_rate": 4.986600738635887e-05, "loss": 0.3408, "num_input_tokens_seen": 17181760, "step": 7950 }, { "epoch": 1.2977161500815662, "grad_norm": 3.26395320892334, "learning_rate": 4.986527040931839e-05, "loss": 0.2637, "num_input_tokens_seen": 17192416, "step": 7955 }, { "epoch": 1.298531810766721, "grad_norm": 4.412578105926514, "learning_rate": 4.9864531416572926e-05, "loss": 0.0548, "num_input_tokens_seen": 17204256, "step": 7960 }, { "epoch": 1.299347471451876, "grad_norm": 3.5438807010650635, "learning_rate": 4.986379040818239e-05, "loss": 0.2506, "num_input_tokens_seen": 17215520, "step": 7965 }, { "epoch": 1.300163132137031, "grad_norm": 4.044760227203369, "learning_rate": 4.9863047384206835e-05, "loss": 0.2148, "num_input_tokens_seen": 17226560, "step": 7970 }, { "epoch": 1.3009787928221859, "grad_norm": 0.22716321051120758, "learning_rate": 4.986230234470651e-05, "loss": 0.0901, "num_input_tokens_seen": 17236864, "step": 7975 }, { "epoch": 1.301794453507341, "grad_norm": 2.31801700592041, "learning_rate": 4.986155528974181e-05, "loss": 0.1649, "num_input_tokens_seen": 17247424, "step": 7980 }, { "epoch": 1.302610114192496, "grad_norm": 3.5804312229156494, "learning_rate": 4.986080621937329e-05, "loss": 0.06, "num_input_tokens_seen": 17258208, "step": 7985 }, { "epoch": 1.3034257748776508, "grad_norm": 3.015270233154297, "learning_rate": 4.9860055133661675e-05, "loss": 0.0771, "num_input_tokens_seen": 17269280, "step": 7990 }, { "epoch": 1.3042414355628058, "grad_norm": 0.5983902215957642, "learning_rate": 4.985930203266785e-05, "loss": 0.059, "num_input_tokens_seen": 17278816, "step": 7995 }, { "epoch": 1.3050570962479608, "grad_norm": 2.452056884765625, "learning_rate": 4.985854691645287e-05, "loss": 0.0612, "num_input_tokens_seen": 17289824, "step": 8000 }, { "epoch": 1.3058727569331159, "grad_norm": 0.5834968090057373, "learning_rate": 4.985778978507795e-05, "loss": 0.0479, "num_input_tokens_seen": 17300448, "step": 8005 }, { "epoch": 1.306688417618271, "grad_norm": 2.934300184249878, "learning_rate": 4.9857030638604454e-05, "loss": 0.2972, "num_input_tokens_seen": 17310592, "step": 8010 }, { "epoch": 1.3075040783034257, "grad_norm": 0.10457460582256317, "learning_rate": 4.985626947709393e-05, "loss": 0.1711, "num_input_tokens_seen": 17321632, "step": 8015 }, { "epoch": 1.3083197389885808, "grad_norm": 1.1078598499298096, "learning_rate": 4.985550630060809e-05, "loss": 0.1833, "num_input_tokens_seen": 17332960, "step": 8020 }, { "epoch": 1.3091353996737358, "grad_norm": 5.162299156188965, "learning_rate": 4.985474110920879e-05, "loss": 0.3693, "num_input_tokens_seen": 17343808, "step": 8025 }, { "epoch": 1.3099510603588906, "grad_norm": 0.14242039620876312, "learning_rate": 4.985397390295807e-05, "loss": 0.2391, "num_input_tokens_seen": 17353696, "step": 8030 }, { "epoch": 1.3107667210440457, "grad_norm": 1.9484858512878418, "learning_rate": 4.985320468191811e-05, "loss": 0.1691, "num_input_tokens_seen": 17364000, "step": 8035 }, { "epoch": 1.3115823817292007, "grad_norm": 1.8402628898620605, "learning_rate": 4.985243344615128e-05, "loss": 0.1106, "num_input_tokens_seen": 17371936, "step": 8040 }, { "epoch": 1.3123980424143555, "grad_norm": 3.3673973083496094, "learning_rate": 4.9851660195720095e-05, "loss": 0.1843, "num_input_tokens_seen": 17382304, "step": 8045 }, { "epoch": 1.3132137030995106, "grad_norm": 0.8606354594230652, "learning_rate": 4.985088493068724e-05, "loss": 0.0396, "num_input_tokens_seen": 17392000, "step": 8050 }, { "epoch": 1.3140293637846656, "grad_norm": 0.5301800966262817, "learning_rate": 4.985010765111555e-05, "loss": 0.1012, "num_input_tokens_seen": 17402528, "step": 8055 }, { "epoch": 1.3148450244698204, "grad_norm": 0.20276932418346405, "learning_rate": 4.984932835706805e-05, "loss": 0.24, "num_input_tokens_seen": 17412480, "step": 8060 }, { "epoch": 1.3156606851549755, "grad_norm": 0.7233949303627014, "learning_rate": 4.984854704860791e-05, "loss": 0.0774, "num_input_tokens_seen": 17423520, "step": 8065 }, { "epoch": 1.3164763458401305, "grad_norm": 2.157273530960083, "learning_rate": 4.984776372579847e-05, "loss": 0.0993, "num_input_tokens_seen": 17434176, "step": 8070 }, { "epoch": 1.3172920065252856, "grad_norm": 3.237471580505371, "learning_rate": 4.984697838870322e-05, "loss": 0.1383, "num_input_tokens_seen": 17445504, "step": 8075 }, { "epoch": 1.3181076672104406, "grad_norm": 1.3257803916931152, "learning_rate": 4.984619103738584e-05, "loss": 0.0582, "num_input_tokens_seen": 17456544, "step": 8080 }, { "epoch": 1.3189233278955954, "grad_norm": 1.4488815069198608, "learning_rate": 4.984540167191014e-05, "loss": 0.0891, "num_input_tokens_seen": 17466432, "step": 8085 }, { "epoch": 1.3197389885807504, "grad_norm": 4.252408504486084, "learning_rate": 4.984461029234011e-05, "loss": 0.0422, "num_input_tokens_seen": 17477472, "step": 8090 }, { "epoch": 1.3205546492659055, "grad_norm": 0.4625602662563324, "learning_rate": 4.9843816898739913e-05, "loss": 0.0391, "num_input_tokens_seen": 17487360, "step": 8095 }, { "epoch": 1.3213703099510603, "grad_norm": 0.09763139486312866, "learning_rate": 4.984302149117387e-05, "loss": 0.0207, "num_input_tokens_seen": 17497568, "step": 8100 }, { "epoch": 1.3221859706362153, "grad_norm": 1.8961961269378662, "learning_rate": 4.984222406970644e-05, "loss": 0.0359, "num_input_tokens_seen": 17508768, "step": 8105 }, { "epoch": 1.3230016313213704, "grad_norm": 0.11448470503091812, "learning_rate": 4.984142463440229e-05, "loss": 0.1468, "num_input_tokens_seen": 17518368, "step": 8110 }, { "epoch": 1.3238172920065252, "grad_norm": 8.590022087097168, "learning_rate": 4.984062318532621e-05, "loss": 0.4463, "num_input_tokens_seen": 17528416, "step": 8115 }, { "epoch": 1.3246329526916802, "grad_norm": 0.28208616375923157, "learning_rate": 4.983981972254317e-05, "loss": 0.3348, "num_input_tokens_seen": 17537600, "step": 8120 }, { "epoch": 1.3254486133768353, "grad_norm": 0.3865681290626526, "learning_rate": 4.983901424611832e-05, "loss": 0.0758, "num_input_tokens_seen": 17548736, "step": 8125 }, { "epoch": 1.32626427406199, "grad_norm": 3.612586259841919, "learning_rate": 4.9838206756116926e-05, "loss": 0.3078, "num_input_tokens_seen": 17558336, "step": 8130 }, { "epoch": 1.3270799347471451, "grad_norm": 2.4190359115600586, "learning_rate": 4.983739725260448e-05, "loss": 0.0441, "num_input_tokens_seen": 17570272, "step": 8135 }, { "epoch": 1.3278955954323002, "grad_norm": 0.7069585919380188, "learning_rate": 4.983658573564658e-05, "loss": 0.0208, "num_input_tokens_seen": 17580960, "step": 8140 }, { "epoch": 1.3287112561174552, "grad_norm": 0.09782170504331589, "learning_rate": 4.983577220530902e-05, "loss": 0.1728, "num_input_tokens_seen": 17590528, "step": 8145 }, { "epoch": 1.32952691680261, "grad_norm": 3.0552656650543213, "learning_rate": 4.983495666165775e-05, "loss": 0.1243, "num_input_tokens_seen": 17600672, "step": 8150 }, { "epoch": 1.330342577487765, "grad_norm": 1.2478984594345093, "learning_rate": 4.983413910475889e-05, "loss": 0.3207, "num_input_tokens_seen": 17611648, "step": 8155 }, { "epoch": 1.3311582381729201, "grad_norm": 1.831872582435608, "learning_rate": 4.98333195346787e-05, "loss": 0.2454, "num_input_tokens_seen": 17622656, "step": 8160 }, { "epoch": 1.3319738988580752, "grad_norm": 0.2922876179218292, "learning_rate": 4.983249795148363e-05, "loss": 0.0801, "num_input_tokens_seen": 17632864, "step": 8165 }, { "epoch": 1.33278955954323, "grad_norm": 0.2000967562198639, "learning_rate": 4.983167435524027e-05, "loss": 0.1674, "num_input_tokens_seen": 17643968, "step": 8170 }, { "epoch": 1.333605220228385, "grad_norm": 0.5970892310142517, "learning_rate": 4.98308487460154e-05, "loss": 0.1559, "num_input_tokens_seen": 17654272, "step": 8175 }, { "epoch": 1.33442088091354, "grad_norm": 0.967530369758606, "learning_rate": 4.983002112387594e-05, "loss": 0.0755, "num_input_tokens_seen": 17665728, "step": 8180 }, { "epoch": 1.3352365415986949, "grad_norm": 0.3429315686225891, "learning_rate": 4.982919148888897e-05, "loss": 0.196, "num_input_tokens_seen": 17677856, "step": 8185 }, { "epoch": 1.33605220228385, "grad_norm": 0.09351611882448196, "learning_rate": 4.982835984112177e-05, "loss": 0.1281, "num_input_tokens_seen": 17688672, "step": 8190 }, { "epoch": 1.336867862969005, "grad_norm": 0.23307408392429352, "learning_rate": 4.982752618064174e-05, "loss": 0.0638, "num_input_tokens_seen": 17699456, "step": 8195 }, { "epoch": 1.3376835236541598, "grad_norm": 6.512901782989502, "learning_rate": 4.982669050751646e-05, "loss": 0.1444, "num_input_tokens_seen": 17710976, "step": 8200 }, { "epoch": 1.3384991843393148, "grad_norm": 1.9145153760910034, "learning_rate": 4.982585282181368e-05, "loss": 0.099, "num_input_tokens_seen": 17721152, "step": 8205 }, { "epoch": 1.3393148450244698, "grad_norm": 2.3821253776550293, "learning_rate": 4.9825013123601305e-05, "loss": 0.1493, "num_input_tokens_seen": 17731200, "step": 8210 }, { "epoch": 1.3401305057096247, "grad_norm": 5.011270523071289, "learning_rate": 4.9824171412947404e-05, "loss": 0.2082, "num_input_tokens_seen": 17741760, "step": 8215 }, { "epoch": 1.3409461663947797, "grad_norm": 4.1288604736328125, "learning_rate": 4.982332768992021e-05, "loss": 0.1214, "num_input_tokens_seen": 17754368, "step": 8220 }, { "epoch": 1.3417618270799347, "grad_norm": 2.3313276767730713, "learning_rate": 4.982248195458812e-05, "loss": 0.2546, "num_input_tokens_seen": 17764096, "step": 8225 }, { "epoch": 1.3425774877650898, "grad_norm": 2.1330060958862305, "learning_rate": 4.98216342070197e-05, "loss": 0.1471, "num_input_tokens_seen": 17774784, "step": 8230 }, { "epoch": 1.3433931484502448, "grad_norm": 0.05763570964336395, "learning_rate": 4.982078444728367e-05, "loss": 0.0151, "num_input_tokens_seen": 17786112, "step": 8235 }, { "epoch": 1.3442088091353996, "grad_norm": 5.828211307525635, "learning_rate": 4.981993267544891e-05, "loss": 0.1202, "num_input_tokens_seen": 17797120, "step": 8240 }, { "epoch": 1.3450244698205547, "grad_norm": 3.2131574153900146, "learning_rate": 4.9819078891584467e-05, "loss": 0.1813, "num_input_tokens_seen": 17808032, "step": 8245 }, { "epoch": 1.3458401305057097, "grad_norm": 0.8757337927818298, "learning_rate": 4.981822309575956e-05, "loss": 0.0207, "num_input_tokens_seen": 17818144, "step": 8250 }, { "epoch": 1.3466557911908645, "grad_norm": 0.15001745522022247, "learning_rate": 4.981736528804357e-05, "loss": 0.0431, "num_input_tokens_seen": 17828032, "step": 8255 }, { "epoch": 1.3474714518760196, "grad_norm": 1.268739938735962, "learning_rate": 4.9816505468506026e-05, "loss": 0.0701, "num_input_tokens_seen": 17839232, "step": 8260 }, { "epoch": 1.3482871125611746, "grad_norm": 2.76004958152771, "learning_rate": 4.981564363721663e-05, "loss": 0.0458, "num_input_tokens_seen": 17849088, "step": 8265 }, { "epoch": 1.3491027732463294, "grad_norm": 0.21174126863479614, "learning_rate": 4.981477979424524e-05, "loss": 0.0495, "num_input_tokens_seen": 17860512, "step": 8270 }, { "epoch": 1.3499184339314845, "grad_norm": 0.1321340948343277, "learning_rate": 4.98139139396619e-05, "loss": 0.3906, "num_input_tokens_seen": 17872000, "step": 8275 }, { "epoch": 1.3507340946166395, "grad_norm": 0.21824656426906586, "learning_rate": 4.981304607353678e-05, "loss": 0.1646, "num_input_tokens_seen": 17883712, "step": 8280 }, { "epoch": 1.3515497553017943, "grad_norm": 5.227387428283691, "learning_rate": 4.981217619594026e-05, "loss": 0.1417, "num_input_tokens_seen": 17894240, "step": 8285 }, { "epoch": 1.3523654159869494, "grad_norm": 0.08399225771427155, "learning_rate": 4.981130430694283e-05, "loss": 0.2309, "num_input_tokens_seen": 17903904, "step": 8290 }, { "epoch": 1.3531810766721044, "grad_norm": 0.37167224287986755, "learning_rate": 4.9810430406615194e-05, "loss": 0.0779, "num_input_tokens_seen": 17914208, "step": 8295 }, { "epoch": 1.3539967373572595, "grad_norm": 0.03977183252573013, "learning_rate": 4.980955449502818e-05, "loss": 0.0061, "num_input_tokens_seen": 17924640, "step": 8300 }, { "epoch": 1.3548123980424145, "grad_norm": 0.07032673805952072, "learning_rate": 4.980867657225279e-05, "loss": 0.0751, "num_input_tokens_seen": 17935520, "step": 8305 }, { "epoch": 1.3556280587275693, "grad_norm": 0.7361053824424744, "learning_rate": 4.980779663836019e-05, "loss": 0.1407, "num_input_tokens_seen": 17946848, "step": 8310 }, { "epoch": 1.3564437194127243, "grad_norm": 1.1821690797805786, "learning_rate": 4.980691469342174e-05, "loss": 0.0675, "num_input_tokens_seen": 17957568, "step": 8315 }, { "epoch": 1.3572593800978794, "grad_norm": 4.284687519073486, "learning_rate": 4.98060307375089e-05, "loss": 0.0545, "num_input_tokens_seen": 17967776, "step": 8320 }, { "epoch": 1.3580750407830342, "grad_norm": 4.312143802642822, "learning_rate": 4.980514477069336e-05, "loss": 0.1381, "num_input_tokens_seen": 17978688, "step": 8325 }, { "epoch": 1.3588907014681892, "grad_norm": 0.1006174311041832, "learning_rate": 4.980425679304691e-05, "loss": 0.1376, "num_input_tokens_seen": 17990432, "step": 8330 }, { "epoch": 1.3597063621533443, "grad_norm": 2.875581979751587, "learning_rate": 4.9803366804641556e-05, "loss": 0.1652, "num_input_tokens_seen": 18001888, "step": 8335 }, { "epoch": 1.360522022838499, "grad_norm": 0.1079099252820015, "learning_rate": 4.980247480554944e-05, "loss": 0.0801, "num_input_tokens_seen": 18013536, "step": 8340 }, { "epoch": 1.3613376835236541, "grad_norm": 5.862875461578369, "learning_rate": 4.980158079584286e-05, "loss": 0.4018, "num_input_tokens_seen": 18024864, "step": 8345 }, { "epoch": 1.3621533442088092, "grad_norm": 0.4484040439128876, "learning_rate": 4.9800684775594306e-05, "loss": 0.0091, "num_input_tokens_seen": 18036192, "step": 8350 }, { "epoch": 1.362969004893964, "grad_norm": 5.443084239959717, "learning_rate": 4.979978674487641e-05, "loss": 0.2277, "num_input_tokens_seen": 18044864, "step": 8355 }, { "epoch": 1.363784665579119, "grad_norm": 0.09923567622900009, "learning_rate": 4.979888670376196e-05, "loss": 0.2332, "num_input_tokens_seen": 18056192, "step": 8360 }, { "epoch": 1.364600326264274, "grad_norm": 0.11489620059728622, "learning_rate": 4.979798465232393e-05, "loss": 0.0727, "num_input_tokens_seen": 18066592, "step": 8365 }, { "epoch": 1.3654159869494291, "grad_norm": 4.0995683670043945, "learning_rate": 4.9797080590635434e-05, "loss": 0.1438, "num_input_tokens_seen": 18077408, "step": 8370 }, { "epoch": 1.366231647634584, "grad_norm": 2.340662956237793, "learning_rate": 4.979617451876978e-05, "loss": 0.0526, "num_input_tokens_seen": 18088544, "step": 8375 }, { "epoch": 1.367047308319739, "grad_norm": 4.116942882537842, "learning_rate": 4.979526643680039e-05, "loss": 0.262, "num_input_tokens_seen": 18100192, "step": 8380 }, { "epoch": 1.367862969004894, "grad_norm": 0.5059496164321899, "learning_rate": 4.9794356344800894e-05, "loss": 0.2898, "num_input_tokens_seen": 18110432, "step": 8385 }, { "epoch": 1.368678629690049, "grad_norm": 0.1756363958120346, "learning_rate": 4.9793444242845075e-05, "loss": 0.197, "num_input_tokens_seen": 18120928, "step": 8390 }, { "epoch": 1.3694942903752039, "grad_norm": 0.18464797735214233, "learning_rate": 4.979253013100686e-05, "loss": 0.1876, "num_input_tokens_seen": 18131104, "step": 8395 }, { "epoch": 1.370309951060359, "grad_norm": 0.05068230628967285, "learning_rate": 4.979161400936036e-05, "loss": 0.0163, "num_input_tokens_seen": 18142720, "step": 8400 }, { "epoch": 1.371125611745514, "grad_norm": 0.08261064440011978, "learning_rate": 4.979069587797984e-05, "loss": 0.1064, "num_input_tokens_seen": 18153664, "step": 8405 }, { "epoch": 1.3719412724306688, "grad_norm": 2.9787333011627197, "learning_rate": 4.978977573693972e-05, "loss": 0.5077, "num_input_tokens_seen": 18164512, "step": 8410 }, { "epoch": 1.3727569331158238, "grad_norm": 0.3857441544532776, "learning_rate": 4.97888535863146e-05, "loss": 0.3214, "num_input_tokens_seen": 18174368, "step": 8415 }, { "epoch": 1.3735725938009788, "grad_norm": 3.607125759124756, "learning_rate": 4.9787929426179224e-05, "loss": 0.1154, "num_input_tokens_seen": 18185216, "step": 8420 }, { "epoch": 1.3743882544861337, "grad_norm": 0.5385351777076721, "learning_rate": 4.978700325660852e-05, "loss": 0.1098, "num_input_tokens_seen": 18196256, "step": 8425 }, { "epoch": 1.3752039151712887, "grad_norm": 3.186066150665283, "learning_rate": 4.978607507767757e-05, "loss": 0.0432, "num_input_tokens_seen": 18207264, "step": 8430 }, { "epoch": 1.3760195758564437, "grad_norm": 3.8766305446624756, "learning_rate": 4.9785144889461606e-05, "loss": 0.3158, "num_input_tokens_seen": 18217760, "step": 8435 }, { "epoch": 1.3768352365415986, "grad_norm": 0.1176619753241539, "learning_rate": 4.978421269203604e-05, "loss": 0.125, "num_input_tokens_seen": 18228608, "step": 8440 }, { "epoch": 1.3776508972267536, "grad_norm": 5.708771705627441, "learning_rate": 4.9783278485476434e-05, "loss": 0.2388, "num_input_tokens_seen": 18239200, "step": 8445 }, { "epoch": 1.3784665579119086, "grad_norm": 4.8118367195129395, "learning_rate": 4.978234226985853e-05, "loss": 0.2544, "num_input_tokens_seen": 18251200, "step": 8450 }, { "epoch": 1.3792822185970637, "grad_norm": 1.8131279945373535, "learning_rate": 4.978140404525822e-05, "loss": 0.0307, "num_input_tokens_seen": 18261856, "step": 8455 }, { "epoch": 1.3800978792822187, "grad_norm": 0.2119222730398178, "learning_rate": 4.978046381175155e-05, "loss": 0.0128, "num_input_tokens_seen": 18273152, "step": 8460 }, { "epoch": 1.3809135399673735, "grad_norm": 0.21933406591415405, "learning_rate": 4.977952156941476e-05, "loss": 0.0756, "num_input_tokens_seen": 18284896, "step": 8465 }, { "epoch": 1.3817292006525286, "grad_norm": 0.1431412696838379, "learning_rate": 4.977857731832421e-05, "loss": 0.1027, "num_input_tokens_seen": 18294912, "step": 8470 }, { "epoch": 1.3825448613376836, "grad_norm": 0.31807583570480347, "learning_rate": 4.977763105855646e-05, "loss": 0.0976, "num_input_tokens_seen": 18304672, "step": 8475 }, { "epoch": 1.3833605220228384, "grad_norm": 0.24125567078590393, "learning_rate": 4.9776682790188225e-05, "loss": 0.1622, "num_input_tokens_seen": 18316256, "step": 8480 }, { "epoch": 1.3841761827079935, "grad_norm": 0.12713797390460968, "learning_rate": 4.977573251329636e-05, "loss": 0.2192, "num_input_tokens_seen": 18327904, "step": 8485 }, { "epoch": 1.3849918433931485, "grad_norm": 0.1462138593196869, "learning_rate": 4.97747802279579e-05, "loss": 0.0199, "num_input_tokens_seen": 18338752, "step": 8490 }, { "epoch": 1.3858075040783033, "grad_norm": 0.19010719656944275, "learning_rate": 4.9773825934250056e-05, "loss": 0.0993, "num_input_tokens_seen": 18350464, "step": 8495 }, { "epoch": 1.3866231647634584, "grad_norm": 2.17090106010437, "learning_rate": 4.977286963225018e-05, "loss": 0.2162, "num_input_tokens_seen": 18360832, "step": 8500 }, { "epoch": 1.3874388254486134, "grad_norm": 4.011931896209717, "learning_rate": 4.9771911322035794e-05, "loss": 0.1839, "num_input_tokens_seen": 18370976, "step": 8505 }, { "epoch": 1.3882544861337682, "grad_norm": 0.12120926380157471, "learning_rate": 4.977095100368459e-05, "loss": 0.0996, "num_input_tokens_seen": 18381920, "step": 8510 }, { "epoch": 1.3890701468189233, "grad_norm": 1.0785771608352661, "learning_rate": 4.9769988677274405e-05, "loss": 0.1777, "num_input_tokens_seen": 18393152, "step": 8515 }, { "epoch": 1.3898858075040783, "grad_norm": 0.7257936596870422, "learning_rate": 4.976902434288326e-05, "loss": 0.0733, "num_input_tokens_seen": 18403168, "step": 8520 }, { "epoch": 1.3907014681892333, "grad_norm": 0.11688599735498428, "learning_rate": 4.9768058000589325e-05, "loss": 0.1568, "num_input_tokens_seen": 18413792, "step": 8525 }, { "epoch": 1.3915171288743884, "grad_norm": 4.304495334625244, "learning_rate": 4.976708965047093e-05, "loss": 0.1298, "num_input_tokens_seen": 18424864, "step": 8530 }, { "epoch": 1.3923327895595432, "grad_norm": 3.4943485260009766, "learning_rate": 4.976611929260659e-05, "loss": 0.0414, "num_input_tokens_seen": 18434144, "step": 8535 }, { "epoch": 1.3931484502446982, "grad_norm": 1.9871363639831543, "learning_rate": 4.976514692707496e-05, "loss": 0.1636, "num_input_tokens_seen": 18444576, "step": 8540 }, { "epoch": 1.3939641109298533, "grad_norm": 0.3116692900657654, "learning_rate": 4.9764172553954855e-05, "loss": 0.2656, "num_input_tokens_seen": 18455808, "step": 8545 }, { "epoch": 1.394779771615008, "grad_norm": 2.6198909282684326, "learning_rate": 4.976319617332527e-05, "loss": 0.0423, "num_input_tokens_seen": 18466848, "step": 8550 }, { "epoch": 1.3955954323001631, "grad_norm": 0.45830538868904114, "learning_rate": 4.9762217785265356e-05, "loss": 0.2964, "num_input_tokens_seen": 18479040, "step": 8555 }, { "epoch": 1.3964110929853182, "grad_norm": 3.4091854095458984, "learning_rate": 4.976123738985443e-05, "loss": 0.1384, "num_input_tokens_seen": 18490080, "step": 8560 }, { "epoch": 1.397226753670473, "grad_norm": 5.418155193328857, "learning_rate": 4.976025498717196e-05, "loss": 0.3336, "num_input_tokens_seen": 18501632, "step": 8565 }, { "epoch": 1.398042414355628, "grad_norm": 0.10266997665166855, "learning_rate": 4.9759270577297603e-05, "loss": 0.1686, "num_input_tokens_seen": 18513056, "step": 8570 }, { "epoch": 1.398858075040783, "grad_norm": 0.42906904220581055, "learning_rate": 4.975828416031113e-05, "loss": 0.009, "num_input_tokens_seen": 18523616, "step": 8575 }, { "epoch": 1.399673735725938, "grad_norm": 0.7605011463165283, "learning_rate": 4.975729573629252e-05, "loss": 0.237, "num_input_tokens_seen": 18534688, "step": 8580 }, { "epoch": 1.400489396411093, "grad_norm": 0.226216658949852, "learning_rate": 4.9756305305321906e-05, "loss": 0.2454, "num_input_tokens_seen": 18545120, "step": 8585 }, { "epoch": 1.401305057096248, "grad_norm": 0.17236967384815216, "learning_rate": 4.975531286747958e-05, "loss": 0.0725, "num_input_tokens_seen": 18556224, "step": 8590 }, { "epoch": 1.402120717781403, "grad_norm": 3.434866428375244, "learning_rate": 4.975431842284597e-05, "loss": 0.4768, "num_input_tokens_seen": 18566272, "step": 8595 }, { "epoch": 1.4029363784665578, "grad_norm": 0.6368977427482605, "learning_rate": 4.975332197150171e-05, "loss": 0.1642, "num_input_tokens_seen": 18577216, "step": 8600 }, { "epoch": 1.4037520391517129, "grad_norm": 0.1678442656993866, "learning_rate": 4.975232351352758e-05, "loss": 0.3473, "num_input_tokens_seen": 18587072, "step": 8605 }, { "epoch": 1.404567699836868, "grad_norm": 4.314147472381592, "learning_rate": 4.975132304900451e-05, "loss": 0.1323, "num_input_tokens_seen": 18598112, "step": 8610 }, { "epoch": 1.405383360522023, "grad_norm": 0.5971012711524963, "learning_rate": 4.975032057801361e-05, "loss": 0.1713, "num_input_tokens_seen": 18608384, "step": 8615 }, { "epoch": 1.4061990212071778, "grad_norm": 4.2186713218688965, "learning_rate": 4.974931610063613e-05, "loss": 0.0658, "num_input_tokens_seen": 18618944, "step": 8620 }, { "epoch": 1.4070146818923328, "grad_norm": 0.15565381944179535, "learning_rate": 4.974830961695353e-05, "loss": 0.1091, "num_input_tokens_seen": 18629248, "step": 8625 }, { "epoch": 1.4078303425774878, "grad_norm": 0.4324505627155304, "learning_rate": 4.9747301127047366e-05, "loss": 0.032, "num_input_tokens_seen": 18641248, "step": 8630 }, { "epoch": 1.4086460032626427, "grad_norm": 0.2560870945453644, "learning_rate": 4.974629063099942e-05, "loss": 0.0226, "num_input_tokens_seen": 18651520, "step": 8635 }, { "epoch": 1.4094616639477977, "grad_norm": 1.649951457977295, "learning_rate": 4.974527812889158e-05, "loss": 0.1574, "num_input_tokens_seen": 18662304, "step": 8640 }, { "epoch": 1.4102773246329527, "grad_norm": 0.06403063982725143, "learning_rate": 4.974426362080594e-05, "loss": 0.1435, "num_input_tokens_seen": 18672416, "step": 8645 }, { "epoch": 1.4110929853181076, "grad_norm": 3.4695379734039307, "learning_rate": 4.974324710682474e-05, "loss": 0.0544, "num_input_tokens_seen": 18683136, "step": 8650 }, { "epoch": 1.4119086460032626, "grad_norm": 0.19298793375492096, "learning_rate": 4.974222858703039e-05, "loss": 0.2589, "num_input_tokens_seen": 18693472, "step": 8655 }, { "epoch": 1.4127243066884176, "grad_norm": 0.20529305934906006, "learning_rate": 4.9741208061505454e-05, "loss": 0.0915, "num_input_tokens_seen": 18704128, "step": 8660 }, { "epoch": 1.4135399673735725, "grad_norm": 0.9639427661895752, "learning_rate": 4.974018553033264e-05, "loss": 0.0374, "num_input_tokens_seen": 18715456, "step": 8665 }, { "epoch": 1.4143556280587275, "grad_norm": 0.4283669590950012, "learning_rate": 4.973916099359487e-05, "loss": 0.2163, "num_input_tokens_seen": 18726240, "step": 8670 }, { "epoch": 1.4151712887438825, "grad_norm": 0.44654086232185364, "learning_rate": 4.973813445137518e-05, "loss": 0.0779, "num_input_tokens_seen": 18736992, "step": 8675 }, { "epoch": 1.4159869494290376, "grad_norm": 0.32161450386047363, "learning_rate": 4.9737105903756794e-05, "loss": 0.0227, "num_input_tokens_seen": 18749216, "step": 8680 }, { "epoch": 1.4168026101141926, "grad_norm": 0.15897774696350098, "learning_rate": 4.973607535082309e-05, "loss": 0.0631, "num_input_tokens_seen": 18760352, "step": 8685 }, { "epoch": 1.4176182707993474, "grad_norm": 0.13181594014167786, "learning_rate": 4.97350427926576e-05, "loss": 0.0557, "num_input_tokens_seen": 18769792, "step": 8690 }, { "epoch": 1.4184339314845025, "grad_norm": 0.13483324646949768, "learning_rate": 4.973400822934404e-05, "loss": 0.2174, "num_input_tokens_seen": 18781696, "step": 8695 }, { "epoch": 1.4192495921696575, "grad_norm": 1.089842438697815, "learning_rate": 4.973297166096628e-05, "loss": 0.2138, "num_input_tokens_seen": 18790912, "step": 8700 }, { "epoch": 1.4200652528548123, "grad_norm": 4.54611349105835, "learning_rate": 4.9731933087608334e-05, "loss": 0.2273, "num_input_tokens_seen": 18800672, "step": 8705 }, { "epoch": 1.4208809135399674, "grad_norm": 4.534762382507324, "learning_rate": 4.973089250935441e-05, "loss": 0.1754, "num_input_tokens_seen": 18812032, "step": 8710 }, { "epoch": 1.4216965742251224, "grad_norm": 4.2552361488342285, "learning_rate": 4.972984992628885e-05, "loss": 0.3013, "num_input_tokens_seen": 18823328, "step": 8715 }, { "epoch": 1.4225122349102772, "grad_norm": 4.679597854614258, "learning_rate": 4.972880533849619e-05, "loss": 0.3067, "num_input_tokens_seen": 18834048, "step": 8720 }, { "epoch": 1.4233278955954323, "grad_norm": 1.3409359455108643, "learning_rate": 4.9727758746061084e-05, "loss": 0.1296, "num_input_tokens_seen": 18845216, "step": 8725 }, { "epoch": 1.4241435562805873, "grad_norm": 0.1467946171760559, "learning_rate": 4.972671014906839e-05, "loss": 0.1887, "num_input_tokens_seen": 18854848, "step": 8730 }, { "epoch": 1.4249592169657421, "grad_norm": 1.7204428911209106, "learning_rate": 4.972565954760311e-05, "loss": 0.1263, "num_input_tokens_seen": 18866368, "step": 8735 }, { "epoch": 1.4257748776508972, "grad_norm": 3.4446494579315186, "learning_rate": 4.9724606941750406e-05, "loss": 0.3014, "num_input_tokens_seen": 18878464, "step": 8740 }, { "epoch": 1.4265905383360522, "grad_norm": 0.11606528609991074, "learning_rate": 4.972355233159562e-05, "loss": 0.144, "num_input_tokens_seen": 18889856, "step": 8745 }, { "epoch": 1.4274061990212072, "grad_norm": 0.30330905318260193, "learning_rate": 4.972249571722423e-05, "loss": 0.188, "num_input_tokens_seen": 18900352, "step": 8750 }, { "epoch": 1.4282218597063623, "grad_norm": 0.20018945634365082, "learning_rate": 4.97214370987219e-05, "loss": 0.1408, "num_input_tokens_seen": 18911936, "step": 8755 }, { "epoch": 1.429037520391517, "grad_norm": 0.1850886046886444, "learning_rate": 4.972037647617444e-05, "loss": 0.0465, "num_input_tokens_seen": 18922048, "step": 8760 }, { "epoch": 1.4298531810766721, "grad_norm": 3.108037233352661, "learning_rate": 4.9719313849667835e-05, "loss": 0.1357, "num_input_tokens_seen": 18932288, "step": 8765 }, { "epoch": 1.4306688417618272, "grad_norm": 1.8283677101135254, "learning_rate": 4.9718249219288226e-05, "loss": 0.1841, "num_input_tokens_seen": 18943008, "step": 8770 }, { "epoch": 1.431484502446982, "grad_norm": 6.500990867614746, "learning_rate": 4.971718258512191e-05, "loss": 0.1459, "num_input_tokens_seen": 18953376, "step": 8775 }, { "epoch": 1.432300163132137, "grad_norm": 1.0945346355438232, "learning_rate": 4.971611394725537e-05, "loss": 0.0602, "num_input_tokens_seen": 18965600, "step": 8780 }, { "epoch": 1.433115823817292, "grad_norm": 0.162503182888031, "learning_rate": 4.971504330577521e-05, "loss": 0.0703, "num_input_tokens_seen": 18975936, "step": 8785 }, { "epoch": 1.433931484502447, "grad_norm": 0.03908466547727585, "learning_rate": 4.971397066076825e-05, "loss": 0.0595, "num_input_tokens_seen": 18986048, "step": 8790 }, { "epoch": 1.434747145187602, "grad_norm": 0.5271322727203369, "learning_rate": 4.971289601232143e-05, "loss": 0.1402, "num_input_tokens_seen": 18997056, "step": 8795 }, { "epoch": 1.435562805872757, "grad_norm": 0.08785668015480042, "learning_rate": 4.971181936052186e-05, "loss": 0.0255, "num_input_tokens_seen": 19007744, "step": 8800 }, { "epoch": 1.4363784665579118, "grad_norm": 4.008883953094482, "learning_rate": 4.971074070545684e-05, "loss": 0.1603, "num_input_tokens_seen": 19018784, "step": 8805 }, { "epoch": 1.4371941272430668, "grad_norm": 3.2489356994628906, "learning_rate": 4.970966004721378e-05, "loss": 0.112, "num_input_tokens_seen": 19030176, "step": 8810 }, { "epoch": 1.4380097879282219, "grad_norm": 0.3383443057537079, "learning_rate": 4.970857738588031e-05, "loss": 0.0913, "num_input_tokens_seen": 19041088, "step": 8815 }, { "epoch": 1.438825448613377, "grad_norm": 0.21210889518260956, "learning_rate": 4.9707492721544185e-05, "loss": 0.0082, "num_input_tokens_seen": 19052352, "step": 8820 }, { "epoch": 1.4396411092985317, "grad_norm": 1.9470065832138062, "learning_rate": 4.970640605429334e-05, "loss": 0.0698, "num_input_tokens_seen": 19062688, "step": 8825 }, { "epoch": 1.4404567699836868, "grad_norm": 6.853029251098633, "learning_rate": 4.970531738421585e-05, "loss": 0.1794, "num_input_tokens_seen": 19073504, "step": 8830 }, { "epoch": 1.4412724306688418, "grad_norm": 3.3483693599700928, "learning_rate": 4.970422671139999e-05, "loss": 0.3258, "num_input_tokens_seen": 19085792, "step": 8835 }, { "epoch": 1.4420880913539968, "grad_norm": 0.28674644231796265, "learning_rate": 4.970313403593416e-05, "loss": 0.1313, "num_input_tokens_seen": 19096512, "step": 8840 }, { "epoch": 1.4429037520391517, "grad_norm": 4.106522560119629, "learning_rate": 4.970203935790695e-05, "loss": 0.2026, "num_input_tokens_seen": 19106336, "step": 8845 }, { "epoch": 1.4437194127243067, "grad_norm": 4.873129367828369, "learning_rate": 4.970094267740708e-05, "loss": 0.1281, "num_input_tokens_seen": 19116800, "step": 8850 }, { "epoch": 1.4445350734094617, "grad_norm": 5.79468297958374, "learning_rate": 4.969984399452347e-05, "loss": 0.2079, "num_input_tokens_seen": 19127488, "step": 8855 }, { "epoch": 1.4453507340946166, "grad_norm": 0.08337918668985367, "learning_rate": 4.9698743309345184e-05, "loss": 0.2117, "num_input_tokens_seen": 19135264, "step": 8860 }, { "epoch": 1.4461663947797716, "grad_norm": 5.34727668762207, "learning_rate": 4.969764062196145e-05, "loss": 0.179, "num_input_tokens_seen": 19146656, "step": 8865 }, { "epoch": 1.4469820554649266, "grad_norm": 0.050858285278081894, "learning_rate": 4.969653593246164e-05, "loss": 0.0158, "num_input_tokens_seen": 19158880, "step": 8870 }, { "epoch": 1.4477977161500815, "grad_norm": 2.2910377979278564, "learning_rate": 4.9695429240935335e-05, "loss": 0.1317, "num_input_tokens_seen": 19170240, "step": 8875 }, { "epoch": 1.4486133768352365, "grad_norm": 0.17886285483837128, "learning_rate": 4.9694320547472215e-05, "loss": 0.0451, "num_input_tokens_seen": 19180960, "step": 8880 }, { "epoch": 1.4494290375203915, "grad_norm": 2.9980437755584717, "learning_rate": 4.9693209852162184e-05, "loss": 0.4063, "num_input_tokens_seen": 19192576, "step": 8885 }, { "epoch": 1.4502446982055464, "grad_norm": 1.1452898979187012, "learning_rate": 4.969209715509526e-05, "loss": 0.0696, "num_input_tokens_seen": 19202752, "step": 8890 }, { "epoch": 1.4510603588907014, "grad_norm": 3.7562623023986816, "learning_rate": 4.969098245636167e-05, "loss": 0.0542, "num_input_tokens_seen": 19214048, "step": 8895 }, { "epoch": 1.4518760195758564, "grad_norm": 6.766478061676025, "learning_rate": 4.968986575605175e-05, "loss": 0.2298, "num_input_tokens_seen": 19225056, "step": 8900 }, { "epoch": 1.4526916802610115, "grad_norm": 12.138822555541992, "learning_rate": 4.968874705425604e-05, "loss": 0.2153, "num_input_tokens_seen": 19236480, "step": 8905 }, { "epoch": 1.4535073409461665, "grad_norm": 5.017162799835205, "learning_rate": 4.968762635106522e-05, "loss": 0.24, "num_input_tokens_seen": 19248192, "step": 8910 }, { "epoch": 1.4543230016313213, "grad_norm": 0.35115402936935425, "learning_rate": 4.9686503646570146e-05, "loss": 0.1287, "num_input_tokens_seen": 19258368, "step": 8915 }, { "epoch": 1.4551386623164764, "grad_norm": 0.2280079871416092, "learning_rate": 4.9685378940861826e-05, "loss": 0.0872, "num_input_tokens_seen": 19269088, "step": 8920 }, { "epoch": 1.4559543230016314, "grad_norm": 2.5575084686279297, "learning_rate": 4.9684252234031446e-05, "loss": 0.1059, "num_input_tokens_seen": 19280384, "step": 8925 }, { "epoch": 1.4567699836867862, "grad_norm": 0.19328927993774414, "learning_rate": 4.968312352617033e-05, "loss": 0.2862, "num_input_tokens_seen": 19292192, "step": 8930 }, { "epoch": 1.4575856443719413, "grad_norm": 0.379935085773468, "learning_rate": 4.968199281736997e-05, "loss": 0.1058, "num_input_tokens_seen": 19303904, "step": 8935 }, { "epoch": 1.4584013050570963, "grad_norm": 0.6336538195610046, "learning_rate": 4.968086010772205e-05, "loss": 0.0835, "num_input_tokens_seen": 19315712, "step": 8940 }, { "epoch": 1.4592169657422511, "grad_norm": 0.13827356696128845, "learning_rate": 4.9679725397318375e-05, "loss": 0.2034, "num_input_tokens_seen": 19326848, "step": 8945 }, { "epoch": 1.4600326264274062, "grad_norm": 8.300313949584961, "learning_rate": 4.967858868625094e-05, "loss": 0.3573, "num_input_tokens_seen": 19338496, "step": 8950 }, { "epoch": 1.4608482871125612, "grad_norm": 6.544249057769775, "learning_rate": 4.967744997461188e-05, "loss": 0.3305, "num_input_tokens_seen": 19348992, "step": 8955 }, { "epoch": 1.461663947797716, "grad_norm": 2.2257773876190186, "learning_rate": 4.9676309262493513e-05, "loss": 0.1591, "num_input_tokens_seen": 19359680, "step": 8960 }, { "epoch": 1.462479608482871, "grad_norm": 3.6891024112701416, "learning_rate": 4.9675166549988314e-05, "loss": 0.076, "num_input_tokens_seen": 19370848, "step": 8965 }, { "epoch": 1.463295269168026, "grad_norm": 0.443911612033844, "learning_rate": 4.9674021837188917e-05, "loss": 0.1519, "num_input_tokens_seen": 19381888, "step": 8970 }, { "epoch": 1.4641109298531811, "grad_norm": 0.08385307341814041, "learning_rate": 4.967287512418811e-05, "loss": 0.0991, "num_input_tokens_seen": 19392768, "step": 8975 }, { "epoch": 1.4649265905383362, "grad_norm": 0.12367914617061615, "learning_rate": 4.9671726411078864e-05, "loss": 0.0911, "num_input_tokens_seen": 19403488, "step": 8980 }, { "epoch": 1.465742251223491, "grad_norm": 0.9014115333557129, "learning_rate": 4.967057569795428e-05, "loss": 0.2221, "num_input_tokens_seen": 19414656, "step": 8985 }, { "epoch": 1.466557911908646, "grad_norm": 0.43794381618499756, "learning_rate": 4.966942298490767e-05, "loss": 0.2248, "num_input_tokens_seen": 19424960, "step": 8990 }, { "epoch": 1.467373572593801, "grad_norm": 5.037932872772217, "learning_rate": 4.966826827203245e-05, "loss": 0.0941, "num_input_tokens_seen": 19436512, "step": 8995 }, { "epoch": 1.468189233278956, "grad_norm": 4.7633209228515625, "learning_rate": 4.966711155942223e-05, "loss": 0.1554, "num_input_tokens_seen": 19447808, "step": 9000 }, { "epoch": 1.469004893964111, "grad_norm": 0.4131033718585968, "learning_rate": 4.966595284717081e-05, "loss": 0.0731, "num_input_tokens_seen": 19457984, "step": 9005 }, { "epoch": 1.469820554649266, "grad_norm": 0.31178730726242065, "learning_rate": 4.966479213537207e-05, "loss": 0.0987, "num_input_tokens_seen": 19467680, "step": 9010 }, { "epoch": 1.4706362153344208, "grad_norm": 0.5420904159545898, "learning_rate": 4.966362942412015e-05, "loss": 0.16, "num_input_tokens_seen": 19475968, "step": 9015 }, { "epoch": 1.4714518760195758, "grad_norm": 3.456433057785034, "learning_rate": 4.9662464713509285e-05, "loss": 0.2759, "num_input_tokens_seen": 19487328, "step": 9020 }, { "epoch": 1.4722675367047309, "grad_norm": 0.2179572433233261, "learning_rate": 4.966129800363389e-05, "loss": 0.0207, "num_input_tokens_seen": 19497280, "step": 9025 }, { "epoch": 1.4730831973898857, "grad_norm": 0.4606347680091858, "learning_rate": 4.9660129294588554e-05, "loss": 0.1661, "num_input_tokens_seen": 19507264, "step": 9030 }, { "epoch": 1.4738988580750407, "grad_norm": 1.0950851440429688, "learning_rate": 4.965895858646801e-05, "loss": 0.1189, "num_input_tokens_seen": 19518080, "step": 9035 }, { "epoch": 1.4747145187601958, "grad_norm": 3.8063807487487793, "learning_rate": 4.9657785879367166e-05, "loss": 0.0902, "num_input_tokens_seen": 19528288, "step": 9040 }, { "epoch": 1.4755301794453508, "grad_norm": 7.3399786949157715, "learning_rate": 4.965661117338108e-05, "loss": 0.2115, "num_input_tokens_seen": 19539072, "step": 9045 }, { "epoch": 1.4763458401305056, "grad_norm": 3.9826085567474365, "learning_rate": 4.9655434468605e-05, "loss": 0.256, "num_input_tokens_seen": 19550496, "step": 9050 }, { "epoch": 1.4771615008156607, "grad_norm": 0.15139615535736084, "learning_rate": 4.9654255765134294e-05, "loss": 0.0415, "num_input_tokens_seen": 19562752, "step": 9055 }, { "epoch": 1.4779771615008157, "grad_norm": 0.25424888730049133, "learning_rate": 4.965307506306452e-05, "loss": 0.2528, "num_input_tokens_seen": 19572512, "step": 9060 }, { "epoch": 1.4787928221859707, "grad_norm": 0.16260232031345367, "learning_rate": 4.965189236249139e-05, "loss": 0.1952, "num_input_tokens_seen": 19582752, "step": 9065 }, { "epoch": 1.4796084828711256, "grad_norm": 4.861161231994629, "learning_rate": 4.9650707663510785e-05, "loss": 0.1756, "num_input_tokens_seen": 19593184, "step": 9070 }, { "epoch": 1.4804241435562806, "grad_norm": 1.71065092086792, "learning_rate": 4.9649520966218744e-05, "loss": 0.0373, "num_input_tokens_seen": 19604768, "step": 9075 }, { "epoch": 1.4812398042414356, "grad_norm": 0.21857862174510956, "learning_rate": 4.9648332270711463e-05, "loss": 0.1268, "num_input_tokens_seen": 19616576, "step": 9080 }, { "epoch": 1.4820554649265905, "grad_norm": 0.23377980291843414, "learning_rate": 4.96471415770853e-05, "loss": 0.0381, "num_input_tokens_seen": 19628384, "step": 9085 }, { "epoch": 1.4828711256117455, "grad_norm": 2.2004427909851074, "learning_rate": 4.964594888543678e-05, "loss": 0.1432, "num_input_tokens_seen": 19639136, "step": 9090 }, { "epoch": 1.4836867862969005, "grad_norm": 4.055203437805176, "learning_rate": 4.9644754195862597e-05, "loss": 0.1213, "num_input_tokens_seen": 19649856, "step": 9095 }, { "epoch": 1.4845024469820554, "grad_norm": 0.4328365623950958, "learning_rate": 4.964355750845959e-05, "loss": 0.1327, "num_input_tokens_seen": 19661120, "step": 9100 }, { "epoch": 1.4853181076672104, "grad_norm": 4.144128322601318, "learning_rate": 4.9642358823324776e-05, "loss": 0.1021, "num_input_tokens_seen": 19671648, "step": 9105 }, { "epoch": 1.4861337683523654, "grad_norm": 0.16398490965366364, "learning_rate": 4.964115814055531e-05, "loss": 0.0928, "num_input_tokens_seen": 19683584, "step": 9110 }, { "epoch": 1.4869494290375203, "grad_norm": 0.2402666062116623, "learning_rate": 4.963995546024854e-05, "loss": 0.292, "num_input_tokens_seen": 19694080, "step": 9115 }, { "epoch": 1.4877650897226753, "grad_norm": 3.6252024173736572, "learning_rate": 4.963875078250197e-05, "loss": 0.3565, "num_input_tokens_seen": 19705824, "step": 9120 }, { "epoch": 1.4885807504078303, "grad_norm": 3.403533697128296, "learning_rate": 4.963754410741324e-05, "loss": 0.3943, "num_input_tokens_seen": 19717056, "step": 9125 }, { "epoch": 1.4893964110929854, "grad_norm": 1.4570884704589844, "learning_rate": 4.9636335435080174e-05, "loss": 0.0448, "num_input_tokens_seen": 19728512, "step": 9130 }, { "epoch": 1.4902120717781404, "grad_norm": 4.06797981262207, "learning_rate": 4.963512476560075e-05, "loss": 0.0556, "num_input_tokens_seen": 19740768, "step": 9135 }, { "epoch": 1.4910277324632952, "grad_norm": 5.106830596923828, "learning_rate": 4.963391209907312e-05, "loss": 0.1786, "num_input_tokens_seen": 19752576, "step": 9140 }, { "epoch": 1.4918433931484503, "grad_norm": 0.10509225726127625, "learning_rate": 4.9632697435595585e-05, "loss": 0.015, "num_input_tokens_seen": 19764064, "step": 9145 }, { "epoch": 1.4926590538336053, "grad_norm": 0.033262308686971664, "learning_rate": 4.96314807752666e-05, "loss": 0.0454, "num_input_tokens_seen": 19776608, "step": 9150 }, { "epoch": 1.4934747145187601, "grad_norm": 0.31555286049842834, "learning_rate": 4.963026211818482e-05, "loss": 0.232, "num_input_tokens_seen": 19787488, "step": 9155 }, { "epoch": 1.4942903752039152, "grad_norm": 0.533806324005127, "learning_rate": 4.962904146444901e-05, "loss": 0.178, "num_input_tokens_seen": 19798240, "step": 9160 }, { "epoch": 1.4951060358890702, "grad_norm": 0.05178459733724594, "learning_rate": 4.962781881415814e-05, "loss": 0.4078, "num_input_tokens_seen": 19808384, "step": 9165 }, { "epoch": 1.495921696574225, "grad_norm": 1.5524178743362427, "learning_rate": 4.962659416741131e-05, "loss": 0.1784, "num_input_tokens_seen": 19817696, "step": 9170 }, { "epoch": 1.49673735725938, "grad_norm": 0.8013507723808289, "learning_rate": 4.962536752430781e-05, "loss": 0.1411, "num_input_tokens_seen": 19828384, "step": 9175 }, { "epoch": 1.497553017944535, "grad_norm": 0.268471360206604, "learning_rate": 4.962413888494706e-05, "loss": 0.0817, "num_input_tokens_seen": 19840704, "step": 9180 }, { "epoch": 1.49836867862969, "grad_norm": 1.3799679279327393, "learning_rate": 4.9622908249428676e-05, "loss": 0.1637, "num_input_tokens_seen": 19851648, "step": 9185 }, { "epoch": 1.499184339314845, "grad_norm": 4.304414749145508, "learning_rate": 4.962167561785241e-05, "loss": 0.1251, "num_input_tokens_seen": 19861568, "step": 9190 }, { "epoch": 1.5, "grad_norm": 0.08585814386606216, "learning_rate": 4.962044099031819e-05, "loss": 0.0889, "num_input_tokens_seen": 19871232, "step": 9195 }, { "epoch": 1.5, "eval_loss": 0.1519882082939148, "eval_runtime": 133.2806, "eval_samples_per_second": 20.446, "eval_steps_per_second": 5.117, "num_input_tokens_seen": 19871232, "step": 9195 }, { "epoch": 1.5008156606851548, "grad_norm": 1.3533514738082886, "learning_rate": 4.9619204366926106e-05, "loss": 0.0204, "num_input_tokens_seen": 19882400, "step": 9200 }, { "epoch": 1.50163132137031, "grad_norm": 0.6818300485610962, "learning_rate": 4.96179657477764e-05, "loss": 0.1166, "num_input_tokens_seen": 19893824, "step": 9205 }, { "epoch": 1.502446982055465, "grad_norm": 0.19116778671741486, "learning_rate": 4.961672513296948e-05, "loss": 0.1069, "num_input_tokens_seen": 19905280, "step": 9210 }, { "epoch": 1.50326264274062, "grad_norm": 0.24817152321338654, "learning_rate": 4.9615482522605915e-05, "loss": 0.0113, "num_input_tokens_seen": 19916032, "step": 9215 }, { "epoch": 1.504078303425775, "grad_norm": 0.788390576839447, "learning_rate": 4.9614237916786434e-05, "loss": 0.1029, "num_input_tokens_seen": 19927232, "step": 9220 }, { "epoch": 1.5048939641109298, "grad_norm": 0.06866984814405441, "learning_rate": 4.961299131561194e-05, "loss": 0.0033, "num_input_tokens_seen": 19936992, "step": 9225 }, { "epoch": 1.5057096247960848, "grad_norm": 2.1778368949890137, "learning_rate": 4.961174271918349e-05, "loss": 0.158, "num_input_tokens_seen": 19948256, "step": 9230 }, { "epoch": 1.5065252854812399, "grad_norm": 1.7241088151931763, "learning_rate": 4.961049212760229e-05, "loss": 0.2009, "num_input_tokens_seen": 19960416, "step": 9235 }, { "epoch": 1.5073409461663947, "grad_norm": 5.672590255737305, "learning_rate": 4.960923954096972e-05, "loss": 0.1006, "num_input_tokens_seen": 19970656, "step": 9240 }, { "epoch": 1.5081566068515497, "grad_norm": 0.04307292029261589, "learning_rate": 4.960798495938734e-05, "loss": 0.1079, "num_input_tokens_seen": 19982176, "step": 9245 }, { "epoch": 1.5089722675367048, "grad_norm": 4.042102336883545, "learning_rate": 4.960672838295683e-05, "loss": 0.2614, "num_input_tokens_seen": 19991968, "step": 9250 }, { "epoch": 1.5097879282218596, "grad_norm": 2.628999948501587, "learning_rate": 4.960546981178007e-05, "loss": 0.2371, "num_input_tokens_seen": 20003968, "step": 9255 }, { "epoch": 1.5106035889070146, "grad_norm": 3.945488929748535, "learning_rate": 4.9604209245959076e-05, "loss": 0.1066, "num_input_tokens_seen": 20014784, "step": 9260 }, { "epoch": 1.5114192495921697, "grad_norm": 1.1156165599822998, "learning_rate": 4.960294668559604e-05, "loss": 0.289, "num_input_tokens_seen": 20024192, "step": 9265 }, { "epoch": 1.5122349102773245, "grad_norm": 0.1157669648528099, "learning_rate": 4.960168213079331e-05, "loss": 0.0745, "num_input_tokens_seen": 20034368, "step": 9270 }, { "epoch": 1.5130505709624797, "grad_norm": 8.24000072479248, "learning_rate": 4.9600415581653406e-05, "loss": 0.2667, "num_input_tokens_seen": 20045216, "step": 9275 }, { "epoch": 1.5138662316476346, "grad_norm": 0.4580290913581848, "learning_rate": 4.9599147038278984e-05, "loss": 0.0723, "num_input_tokens_seen": 20056928, "step": 9280 }, { "epoch": 1.5146818923327896, "grad_norm": 4.009837627410889, "learning_rate": 4.95978765007729e-05, "loss": 0.2275, "num_input_tokens_seen": 20067520, "step": 9285 }, { "epoch": 1.5154975530179446, "grad_norm": 0.13445672392845154, "learning_rate": 4.959660396923813e-05, "loss": 0.0468, "num_input_tokens_seen": 20077568, "step": 9290 }, { "epoch": 1.5163132137030995, "grad_norm": 2.3956339359283447, "learning_rate": 4.9595329443777836e-05, "loss": 0.3213, "num_input_tokens_seen": 20087584, "step": 9295 }, { "epoch": 1.5171288743882545, "grad_norm": 1.0057824850082397, "learning_rate": 4.959405292449535e-05, "loss": 0.1141, "num_input_tokens_seen": 20098880, "step": 9300 }, { "epoch": 1.5179445350734095, "grad_norm": 7.3586907386779785, "learning_rate": 4.959277441149415e-05, "loss": 0.2152, "num_input_tokens_seen": 20109568, "step": 9305 }, { "epoch": 1.5187601957585644, "grad_norm": 0.20948190987110138, "learning_rate": 4.959149390487786e-05, "loss": 0.0594, "num_input_tokens_seen": 20121184, "step": 9310 }, { "epoch": 1.5195758564437194, "grad_norm": 0.2613030970096588, "learning_rate": 4.959021140475031e-05, "loss": 0.1212, "num_input_tokens_seen": 20132608, "step": 9315 }, { "epoch": 1.5203915171288744, "grad_norm": 0.5939708948135376, "learning_rate": 4.958892691121545e-05, "loss": 0.0263, "num_input_tokens_seen": 20143200, "step": 9320 }, { "epoch": 1.5212071778140293, "grad_norm": 0.059133075177669525, "learning_rate": 4.958764042437741e-05, "loss": 0.1101, "num_input_tokens_seen": 20153344, "step": 9325 }, { "epoch": 1.5220228384991843, "grad_norm": 0.09682247042655945, "learning_rate": 4.958635194434048e-05, "loss": 0.1299, "num_input_tokens_seen": 20164288, "step": 9330 }, { "epoch": 1.5228384991843393, "grad_norm": 0.4423978924751282, "learning_rate": 4.958506147120912e-05, "loss": 0.0717, "num_input_tokens_seen": 20174880, "step": 9335 }, { "epoch": 1.5236541598694942, "grad_norm": 0.1162685751914978, "learning_rate": 4.958376900508792e-05, "loss": 0.1498, "num_input_tokens_seen": 20184960, "step": 9340 }, { "epoch": 1.5244698205546494, "grad_norm": 0.1906140297651291, "learning_rate": 4.958247454608167e-05, "loss": 0.0792, "num_input_tokens_seen": 20194464, "step": 9345 }, { "epoch": 1.5252854812398042, "grad_norm": 0.6163058876991272, "learning_rate": 4.958117809429531e-05, "loss": 0.4561, "num_input_tokens_seen": 20204352, "step": 9350 }, { "epoch": 1.5261011419249593, "grad_norm": 0.1939542442560196, "learning_rate": 4.9579879649833925e-05, "loss": 0.0988, "num_input_tokens_seen": 20214816, "step": 9355 }, { "epoch": 1.5269168026101143, "grad_norm": 3.6777944564819336, "learning_rate": 4.957857921280279e-05, "loss": 0.288, "num_input_tokens_seen": 20225664, "step": 9360 }, { "epoch": 1.5277324632952691, "grad_norm": 4.889492034912109, "learning_rate": 4.9577276783307296e-05, "loss": 0.1006, "num_input_tokens_seen": 20236672, "step": 9365 }, { "epoch": 1.5285481239804242, "grad_norm": 4.427472114562988, "learning_rate": 4.9575972361453046e-05, "loss": 0.2351, "num_input_tokens_seen": 20246784, "step": 9370 }, { "epoch": 1.5293637846655792, "grad_norm": 0.09633055329322815, "learning_rate": 4.957466594734579e-05, "loss": 0.0995, "num_input_tokens_seen": 20257088, "step": 9375 }, { "epoch": 1.530179445350734, "grad_norm": 0.11631061881780624, "learning_rate": 4.9573357541091414e-05, "loss": 0.0408, "num_input_tokens_seen": 20267520, "step": 9380 }, { "epoch": 1.530995106035889, "grad_norm": 0.1659749150276184, "learning_rate": 4.957204714279599e-05, "loss": 0.0962, "num_input_tokens_seen": 20277568, "step": 9385 }, { "epoch": 1.531810766721044, "grad_norm": 3.294208526611328, "learning_rate": 4.957073475256575e-05, "loss": 0.0976, "num_input_tokens_seen": 20288768, "step": 9390 }, { "epoch": 1.532626427406199, "grad_norm": 3.342176675796509, "learning_rate": 4.9569420370507087e-05, "loss": 0.0922, "num_input_tokens_seen": 20299968, "step": 9395 }, { "epoch": 1.533442088091354, "grad_norm": 2.932504892349243, "learning_rate": 4.956810399672653e-05, "loss": 0.3596, "num_input_tokens_seen": 20310592, "step": 9400 }, { "epoch": 1.534257748776509, "grad_norm": 4.083658218383789, "learning_rate": 4.956678563133082e-05, "loss": 0.4003, "num_input_tokens_seen": 20321088, "step": 9405 }, { "epoch": 1.5350734094616638, "grad_norm": 0.15010380744934082, "learning_rate": 4.956546527442681e-05, "loss": 0.1766, "num_input_tokens_seen": 20331136, "step": 9410 }, { "epoch": 1.535889070146819, "grad_norm": 2.304518222808838, "learning_rate": 4.956414292612154e-05, "loss": 0.1801, "num_input_tokens_seen": 20343136, "step": 9415 }, { "epoch": 1.536704730831974, "grad_norm": 0.20615914463996887, "learning_rate": 4.9562818586522206e-05, "loss": 0.1304, "num_input_tokens_seen": 20354656, "step": 9420 }, { "epoch": 1.5375203915171287, "grad_norm": 0.20106583833694458, "learning_rate": 4.9561492255736175e-05, "loss": 0.0132, "num_input_tokens_seen": 20365856, "step": 9425 }, { "epoch": 1.538336052202284, "grad_norm": 3.1261301040649414, "learning_rate": 4.9560163933870954e-05, "loss": 0.2497, "num_input_tokens_seen": 20376064, "step": 9430 }, { "epoch": 1.5391517128874388, "grad_norm": 1.1069012880325317, "learning_rate": 4.9558833621034224e-05, "loss": 0.1004, "num_input_tokens_seen": 20387104, "step": 9435 }, { "epoch": 1.5399673735725938, "grad_norm": 4.373126029968262, "learning_rate": 4.955750131733383e-05, "loss": 0.1297, "num_input_tokens_seen": 20398112, "step": 9440 }, { "epoch": 1.5407830342577489, "grad_norm": 0.38516080379486084, "learning_rate": 4.955616702287778e-05, "loss": 0.0138, "num_input_tokens_seen": 20409152, "step": 9445 }, { "epoch": 1.5415986949429037, "grad_norm": 1.4748661518096924, "learning_rate": 4.9554830737774226e-05, "loss": 0.086, "num_input_tokens_seen": 20420704, "step": 9450 }, { "epoch": 1.5424143556280587, "grad_norm": 0.3413572907447815, "learning_rate": 4.955349246213151e-05, "loss": 0.1354, "num_input_tokens_seen": 20431456, "step": 9455 }, { "epoch": 1.5432300163132138, "grad_norm": 3.291419744491577, "learning_rate": 4.9552152196058114e-05, "loss": 0.1367, "num_input_tokens_seen": 20442880, "step": 9460 }, { "epoch": 1.5440456769983686, "grad_norm": 1.0140867233276367, "learning_rate": 4.955080993966268e-05, "loss": 0.0152, "num_input_tokens_seen": 20454624, "step": 9465 }, { "epoch": 1.5448613376835236, "grad_norm": 1.005288004875183, "learning_rate": 4.954946569305402e-05, "loss": 0.1254, "num_input_tokens_seen": 20465824, "step": 9470 }, { "epoch": 1.5456769983686787, "grad_norm": 0.6372134685516357, "learning_rate": 4.9548119456341114e-05, "loss": 0.1936, "num_input_tokens_seen": 20477088, "step": 9475 }, { "epoch": 1.5464926590538335, "grad_norm": 0.03720410540699959, "learning_rate": 4.954677122963309e-05, "loss": 0.0782, "num_input_tokens_seen": 20487296, "step": 9480 }, { "epoch": 1.5473083197389887, "grad_norm": 0.13508373498916626, "learning_rate": 4.954542101303924e-05, "loss": 0.2571, "num_input_tokens_seen": 20498080, "step": 9485 }, { "epoch": 1.5481239804241436, "grad_norm": 3.5957140922546387, "learning_rate": 4.954406880666902e-05, "loss": 0.1838, "num_input_tokens_seen": 20509824, "step": 9490 }, { "epoch": 1.5489396411092984, "grad_norm": 1.9207464456558228, "learning_rate": 4.954271461063204e-05, "loss": 0.2569, "num_input_tokens_seen": 20520928, "step": 9495 }, { "epoch": 1.5497553017944536, "grad_norm": 1.5969135761260986, "learning_rate": 4.9541358425038095e-05, "loss": 0.2794, "num_input_tokens_seen": 20532288, "step": 9500 }, { "epoch": 1.5505709624796085, "grad_norm": 2.585832118988037, "learning_rate": 4.954000024999711e-05, "loss": 0.2777, "num_input_tokens_seen": 20543456, "step": 9505 }, { "epoch": 1.5513866231647635, "grad_norm": 1.9454797506332397, "learning_rate": 4.9538640085619184e-05, "loss": 0.0811, "num_input_tokens_seen": 20554592, "step": 9510 }, { "epoch": 1.5522022838499185, "grad_norm": 2.7306134700775146, "learning_rate": 4.953727793201459e-05, "loss": 0.0692, "num_input_tokens_seen": 20565504, "step": 9515 }, { "epoch": 1.5530179445350734, "grad_norm": 5.1417236328125, "learning_rate": 4.953591378929375e-05, "loss": 0.2984, "num_input_tokens_seen": 20576896, "step": 9520 }, { "epoch": 1.5538336052202284, "grad_norm": 1.9761908054351807, "learning_rate": 4.953454765756724e-05, "loss": 0.0693, "num_input_tokens_seen": 20586720, "step": 9525 }, { "epoch": 1.5546492659053834, "grad_norm": 3.7507734298706055, "learning_rate": 4.953317953694582e-05, "loss": 0.1395, "num_input_tokens_seen": 20597376, "step": 9530 }, { "epoch": 1.5554649265905383, "grad_norm": 0.5698208212852478, "learning_rate": 4.953180942754037e-05, "loss": 0.0368, "num_input_tokens_seen": 20608608, "step": 9535 }, { "epoch": 1.5562805872756933, "grad_norm": 2.6742403507232666, "learning_rate": 4.9530437329461987e-05, "loss": 0.0987, "num_input_tokens_seen": 20620128, "step": 9540 }, { "epoch": 1.5570962479608483, "grad_norm": 0.09604479372501373, "learning_rate": 4.952906324282188e-05, "loss": 0.0658, "num_input_tokens_seen": 20631200, "step": 9545 }, { "epoch": 1.5579119086460032, "grad_norm": 0.10088922083377838, "learning_rate": 4.952768716773145e-05, "loss": 0.0292, "num_input_tokens_seen": 20640352, "step": 9550 }, { "epoch": 1.5587275693311582, "grad_norm": 2.8340446949005127, "learning_rate": 4.9526309104302246e-05, "loss": 0.2607, "num_input_tokens_seen": 20652832, "step": 9555 }, { "epoch": 1.5595432300163132, "grad_norm": 0.8983645439147949, "learning_rate": 4.952492905264599e-05, "loss": 0.0425, "num_input_tokens_seen": 20663488, "step": 9560 }, { "epoch": 1.560358890701468, "grad_norm": 0.3439129590988159, "learning_rate": 4.9523547012874524e-05, "loss": 0.0462, "num_input_tokens_seen": 20674464, "step": 9565 }, { "epoch": 1.5611745513866233, "grad_norm": 1.29847252368927, "learning_rate": 4.952216298509993e-05, "loss": 0.1046, "num_input_tokens_seen": 20685760, "step": 9570 }, { "epoch": 1.5619902120717781, "grad_norm": 6.125744342803955, "learning_rate": 4.952077696943437e-05, "loss": 0.2323, "num_input_tokens_seen": 20697280, "step": 9575 }, { "epoch": 1.5628058727569332, "grad_norm": 0.07258804887533188, "learning_rate": 4.951938896599021e-05, "loss": 0.2327, "num_input_tokens_seen": 20708000, "step": 9580 }, { "epoch": 1.5636215334420882, "grad_norm": 0.2027614712715149, "learning_rate": 4.951799897487997e-05, "loss": 0.1106, "num_input_tokens_seen": 20719424, "step": 9585 }, { "epoch": 1.564437194127243, "grad_norm": 2.6210386753082275, "learning_rate": 4.951660699621633e-05, "loss": 0.1683, "num_input_tokens_seen": 20728096, "step": 9590 }, { "epoch": 1.565252854812398, "grad_norm": 3.2500882148742676, "learning_rate": 4.9515213030112135e-05, "loss": 0.1551, "num_input_tokens_seen": 20738816, "step": 9595 }, { "epoch": 1.566068515497553, "grad_norm": 0.1480618715286255, "learning_rate": 4.951381707668038e-05, "loss": 0.1608, "num_input_tokens_seen": 20749408, "step": 9600 }, { "epoch": 1.566884176182708, "grad_norm": 0.06155373901128769, "learning_rate": 4.951241913603423e-05, "loss": 0.0466, "num_input_tokens_seen": 20759200, "step": 9605 }, { "epoch": 1.567699836867863, "grad_norm": 1.1635830402374268, "learning_rate": 4.9511019208287014e-05, "loss": 0.0979, "num_input_tokens_seen": 20770560, "step": 9610 }, { "epoch": 1.568515497553018, "grad_norm": 2.791290521621704, "learning_rate": 4.9509617293552215e-05, "loss": 0.1495, "num_input_tokens_seen": 20781056, "step": 9615 }, { "epoch": 1.5693311582381728, "grad_norm": 1.9688714742660522, "learning_rate": 4.9508213391943467e-05, "loss": 0.0452, "num_input_tokens_seen": 20791648, "step": 9620 }, { "epoch": 1.5701468189233279, "grad_norm": 4.5324320793151855, "learning_rate": 4.950680750357459e-05, "loss": 0.1624, "num_input_tokens_seen": 20804128, "step": 9625 }, { "epoch": 1.570962479608483, "grad_norm": 7.816247463226318, "learning_rate": 4.950539962855956e-05, "loss": 0.0982, "num_input_tokens_seen": 20814752, "step": 9630 }, { "epoch": 1.5717781402936377, "grad_norm": 0.8073441982269287, "learning_rate": 4.9503989767012493e-05, "loss": 0.168, "num_input_tokens_seen": 20826720, "step": 9635 }, { "epoch": 1.572593800978793, "grad_norm": 4.1045379638671875, "learning_rate": 4.950257791904768e-05, "loss": 0.09, "num_input_tokens_seen": 20837536, "step": 9640 }, { "epoch": 1.5734094616639478, "grad_norm": 6.973872184753418, "learning_rate": 4.950116408477958e-05, "loss": 0.2424, "num_input_tokens_seen": 20847744, "step": 9645 }, { "epoch": 1.5742251223491026, "grad_norm": 4.554234504699707, "learning_rate": 4.94997482643228e-05, "loss": 0.3488, "num_input_tokens_seen": 20859136, "step": 9650 }, { "epoch": 1.5750407830342579, "grad_norm": 6.059474468231201, "learning_rate": 4.949833045779212e-05, "loss": 0.1507, "num_input_tokens_seen": 20868832, "step": 9655 }, { "epoch": 1.5758564437194127, "grad_norm": 0.741489589214325, "learning_rate": 4.9496910665302467e-05, "loss": 0.0825, "num_input_tokens_seen": 20878080, "step": 9660 }, { "epoch": 1.5766721044045677, "grad_norm": 1.6913024187088013, "learning_rate": 4.949548888696893e-05, "loss": 0.1598, "num_input_tokens_seen": 20889600, "step": 9665 }, { "epoch": 1.5774877650897228, "grad_norm": 5.594682216644287, "learning_rate": 4.9494065122906787e-05, "loss": 0.2739, "num_input_tokens_seen": 20899392, "step": 9670 }, { "epoch": 1.5783034257748776, "grad_norm": 0.30009037256240845, "learning_rate": 4.9492639373231436e-05, "loss": 0.2161, "num_input_tokens_seen": 20909888, "step": 9675 }, { "epoch": 1.5791190864600326, "grad_norm": 0.20817957818508148, "learning_rate": 4.949121163805847e-05, "loss": 0.0206, "num_input_tokens_seen": 20920032, "step": 9680 }, { "epoch": 1.5799347471451877, "grad_norm": 0.060204293578863144, "learning_rate": 4.948978191750362e-05, "loss": 0.1828, "num_input_tokens_seen": 20931136, "step": 9685 }, { "epoch": 1.5807504078303425, "grad_norm": 0.053723905235528946, "learning_rate": 4.948835021168278e-05, "loss": 0.0399, "num_input_tokens_seen": 20941792, "step": 9690 }, { "epoch": 1.5815660685154975, "grad_norm": 0.6115943193435669, "learning_rate": 4.9486916520712026e-05, "loss": 0.1158, "num_input_tokens_seen": 20952672, "step": 9695 }, { "epoch": 1.5823817292006526, "grad_norm": 0.3317805528640747, "learning_rate": 4.948548084470757e-05, "loss": 0.0291, "num_input_tokens_seen": 20964480, "step": 9700 }, { "epoch": 1.5831973898858074, "grad_norm": 0.36165598034858704, "learning_rate": 4.94840431837858e-05, "loss": 0.0418, "num_input_tokens_seen": 20975392, "step": 9705 }, { "epoch": 1.5840130505709626, "grad_norm": 0.12291105091571808, "learning_rate": 4.948260353806326e-05, "loss": 0.2428, "num_input_tokens_seen": 20985344, "step": 9710 }, { "epoch": 1.5848287112561175, "grad_norm": 2.6017065048217773, "learning_rate": 4.948116190765665e-05, "loss": 0.1741, "num_input_tokens_seen": 20995456, "step": 9715 }, { "epoch": 1.5856443719412723, "grad_norm": 0.10842294991016388, "learning_rate": 4.9479718292682846e-05, "loss": 0.0578, "num_input_tokens_seen": 21006816, "step": 9720 }, { "epoch": 1.5864600326264275, "grad_norm": 3.0721845626831055, "learning_rate": 4.9478272693258866e-05, "loss": 0.1575, "num_input_tokens_seen": 21017344, "step": 9725 }, { "epoch": 1.5872756933115824, "grad_norm": 4.885010242462158, "learning_rate": 4.94768251095019e-05, "loss": 0.2057, "num_input_tokens_seen": 21027040, "step": 9730 }, { "epoch": 1.5880913539967374, "grad_norm": 1.0548969507217407, "learning_rate": 4.9475375541529294e-05, "loss": 0.0189, "num_input_tokens_seen": 21037472, "step": 9735 }, { "epoch": 1.5889070146818924, "grad_norm": 7.931169509887695, "learning_rate": 4.947392398945856e-05, "loss": 0.2874, "num_input_tokens_seen": 21048800, "step": 9740 }, { "epoch": 1.5897226753670473, "grad_norm": 2.7945945262908936, "learning_rate": 4.9472470453407374e-05, "loss": 0.3119, "num_input_tokens_seen": 21060672, "step": 9745 }, { "epoch": 1.5905383360522023, "grad_norm": 0.10913848876953125, "learning_rate": 4.947101493349355e-05, "loss": 0.274, "num_input_tokens_seen": 21070880, "step": 9750 }, { "epoch": 1.5913539967373573, "grad_norm": 1.3329194784164429, "learning_rate": 4.94695574298351e-05, "loss": 0.323, "num_input_tokens_seen": 21081792, "step": 9755 }, { "epoch": 1.5921696574225122, "grad_norm": 0.2509414851665497, "learning_rate": 4.946809794255016e-05, "loss": 0.0835, "num_input_tokens_seen": 21093280, "step": 9760 }, { "epoch": 1.5929853181076672, "grad_norm": 2.833569049835205, "learning_rate": 4.946663647175706e-05, "loss": 0.0485, "num_input_tokens_seen": 21103264, "step": 9765 }, { "epoch": 1.5938009787928222, "grad_norm": 0.09655069559812546, "learning_rate": 4.946517301757426e-05, "loss": 0.1494, "num_input_tokens_seen": 21113632, "step": 9770 }, { "epoch": 1.594616639477977, "grad_norm": 2.8146326541900635, "learning_rate": 4.94637075801204e-05, "loss": 0.1869, "num_input_tokens_seen": 21124448, "step": 9775 }, { "epoch": 1.595432300163132, "grad_norm": 8.07801342010498, "learning_rate": 4.946224015951427e-05, "loss": 0.1695, "num_input_tokens_seen": 21135776, "step": 9780 }, { "epoch": 1.5962479608482871, "grad_norm": 0.2656671702861786, "learning_rate": 4.946077075587484e-05, "loss": 0.2517, "num_input_tokens_seen": 21147072, "step": 9785 }, { "epoch": 1.597063621533442, "grad_norm": 0.2358125001192093, "learning_rate": 4.945929936932122e-05, "loss": 0.1235, "num_input_tokens_seen": 21158848, "step": 9790 }, { "epoch": 1.5978792822185972, "grad_norm": 2.6165988445281982, "learning_rate": 4.945782599997269e-05, "loss": 0.1093, "num_input_tokens_seen": 21168448, "step": 9795 }, { "epoch": 1.598694942903752, "grad_norm": 5.36940860748291, "learning_rate": 4.945635064794869e-05, "loss": 0.2194, "num_input_tokens_seen": 21179744, "step": 9800 }, { "epoch": 1.599510603588907, "grad_norm": 3.525139570236206, "learning_rate": 4.94548733133688e-05, "loss": 0.1811, "num_input_tokens_seen": 21191904, "step": 9805 }, { "epoch": 1.600326264274062, "grad_norm": 0.09005577862262726, "learning_rate": 4.945339399635281e-05, "loss": 0.124, "num_input_tokens_seen": 21202848, "step": 9810 }, { "epoch": 1.601141924959217, "grad_norm": 0.1309748888015747, "learning_rate": 4.945191269702062e-05, "loss": 0.0408, "num_input_tokens_seen": 21214304, "step": 9815 }, { "epoch": 1.601957585644372, "grad_norm": 2.700035810470581, "learning_rate": 4.945042941549233e-05, "loss": 0.0976, "num_input_tokens_seen": 21224448, "step": 9820 }, { "epoch": 1.602773246329527, "grad_norm": 4.2569260597229, "learning_rate": 4.944894415188815e-05, "loss": 0.2857, "num_input_tokens_seen": 21235392, "step": 9825 }, { "epoch": 1.6035889070146818, "grad_norm": 4.408572196960449, "learning_rate": 4.944745690632852e-05, "loss": 0.1675, "num_input_tokens_seen": 21246656, "step": 9830 }, { "epoch": 1.6044045676998369, "grad_norm": 5.793549060821533, "learning_rate": 4.944596767893399e-05, "loss": 0.1217, "num_input_tokens_seen": 21257376, "step": 9835 }, { "epoch": 1.605220228384992, "grad_norm": 0.5074617862701416, "learning_rate": 4.944447646982529e-05, "loss": 0.134, "num_input_tokens_seen": 21267456, "step": 9840 }, { "epoch": 1.6060358890701467, "grad_norm": 3.0906171798706055, "learning_rate": 4.9442983279123276e-05, "loss": 0.2018, "num_input_tokens_seen": 21279104, "step": 9845 }, { "epoch": 1.6068515497553018, "grad_norm": 0.09473452717065811, "learning_rate": 4.944148810694903e-05, "loss": 0.2745, "num_input_tokens_seen": 21291328, "step": 9850 }, { "epoch": 1.6076672104404568, "grad_norm": 3.595895290374756, "learning_rate": 4.9439990953423735e-05, "loss": 0.0922, "num_input_tokens_seen": 21302016, "step": 9855 }, { "epoch": 1.6084828711256116, "grad_norm": 1.9205716848373413, "learning_rate": 4.943849181866876e-05, "loss": 0.0972, "num_input_tokens_seen": 21312448, "step": 9860 }, { "epoch": 1.6092985318107669, "grad_norm": 0.738785982131958, "learning_rate": 4.943699070280565e-05, "loss": 0.0264, "num_input_tokens_seen": 21323136, "step": 9865 }, { "epoch": 1.6101141924959217, "grad_norm": 0.8316954970359802, "learning_rate": 4.9435487605956084e-05, "loss": 0.0611, "num_input_tokens_seen": 21334368, "step": 9870 }, { "epoch": 1.6109298531810765, "grad_norm": 1.3046718835830688, "learning_rate": 4.94339825282419e-05, "loss": 0.0991, "num_input_tokens_seen": 21344992, "step": 9875 }, { "epoch": 1.6117455138662318, "grad_norm": 0.14773312211036682, "learning_rate": 4.943247546978512e-05, "loss": 0.0802, "num_input_tokens_seen": 21354688, "step": 9880 }, { "epoch": 1.6125611745513866, "grad_norm": 3.677513837814331, "learning_rate": 4.943096643070791e-05, "loss": 0.281, "num_input_tokens_seen": 21365696, "step": 9885 }, { "epoch": 1.6133768352365416, "grad_norm": 2.2047548294067383, "learning_rate": 4.9429455411132596e-05, "loss": 0.2397, "num_input_tokens_seen": 21378240, "step": 9890 }, { "epoch": 1.6141924959216967, "grad_norm": 7.5614013671875, "learning_rate": 4.942794241118167e-05, "loss": 0.4745, "num_input_tokens_seen": 21388992, "step": 9895 }, { "epoch": 1.6150081566068515, "grad_norm": 1.1326520442962646, "learning_rate": 4.9426427430977796e-05, "loss": 0.1331, "num_input_tokens_seen": 21399808, "step": 9900 }, { "epoch": 1.6158238172920065, "grad_norm": 1.2015715837478638, "learning_rate": 4.942491047064377e-05, "loss": 0.0292, "num_input_tokens_seen": 21409696, "step": 9905 }, { "epoch": 1.6166394779771616, "grad_norm": 0.10749071836471558, "learning_rate": 4.942339153030257e-05, "loss": 0.0778, "num_input_tokens_seen": 21420416, "step": 9910 }, { "epoch": 1.6174551386623164, "grad_norm": 3.0217397212982178, "learning_rate": 4.942187061007732e-05, "loss": 0.3587, "num_input_tokens_seen": 21431648, "step": 9915 }, { "epoch": 1.6182707993474714, "grad_norm": 1.1433262825012207, "learning_rate": 4.942034771009134e-05, "loss": 0.2492, "num_input_tokens_seen": 21441856, "step": 9920 }, { "epoch": 1.6190864600326265, "grad_norm": 2.3705992698669434, "learning_rate": 4.941882283046806e-05, "loss": 0.1498, "num_input_tokens_seen": 21452736, "step": 9925 }, { "epoch": 1.6199021207177813, "grad_norm": 0.23972788453102112, "learning_rate": 4.94172959713311e-05, "loss": 0.1306, "num_input_tokens_seen": 21463552, "step": 9930 }, { "epoch": 1.6207177814029365, "grad_norm": 2.9236345291137695, "learning_rate": 4.941576713280424e-05, "loss": 0.2271, "num_input_tokens_seen": 21475136, "step": 9935 }, { "epoch": 1.6215334420880914, "grad_norm": 0.05357769504189491, "learning_rate": 4.941423631501141e-05, "loss": 0.1754, "num_input_tokens_seen": 21485536, "step": 9940 }, { "epoch": 1.6223491027732462, "grad_norm": 0.1600756049156189, "learning_rate": 4.941270351807671e-05, "loss": 0.058, "num_input_tokens_seen": 21496320, "step": 9945 }, { "epoch": 1.6231647634584014, "grad_norm": 3.389401435852051, "learning_rate": 4.941116874212439e-05, "loss": 0.1327, "num_input_tokens_seen": 21506688, "step": 9950 }, { "epoch": 1.6239804241435563, "grad_norm": 5.759918689727783, "learning_rate": 4.940963198727887e-05, "loss": 0.3432, "num_input_tokens_seen": 21517184, "step": 9955 }, { "epoch": 1.6247960848287113, "grad_norm": 0.1930082142353058, "learning_rate": 4.940809325366473e-05, "loss": 0.0287, "num_input_tokens_seen": 21527104, "step": 9960 }, { "epoch": 1.6256117455138663, "grad_norm": 6.6077470779418945, "learning_rate": 4.9406552541406707e-05, "loss": 0.2845, "num_input_tokens_seen": 21537024, "step": 9965 }, { "epoch": 1.6264274061990212, "grad_norm": 4.8132004737854, "learning_rate": 4.94050098506297e-05, "loss": 0.112, "num_input_tokens_seen": 21547648, "step": 9970 }, { "epoch": 1.6272430668841762, "grad_norm": 5.695581912994385, "learning_rate": 4.940346518145876e-05, "loss": 0.1869, "num_input_tokens_seen": 21558944, "step": 9975 }, { "epoch": 1.6280587275693312, "grad_norm": 6.937102317810059, "learning_rate": 4.940191853401911e-05, "loss": 0.3996, "num_input_tokens_seen": 21569408, "step": 9980 }, { "epoch": 1.628874388254486, "grad_norm": 2.9301321506500244, "learning_rate": 4.940036990843613e-05, "loss": 0.1656, "num_input_tokens_seen": 21580384, "step": 9985 }, { "epoch": 1.629690048939641, "grad_norm": 2.733616590499878, "learning_rate": 4.9398819304835364e-05, "loss": 0.0979, "num_input_tokens_seen": 21591328, "step": 9990 }, { "epoch": 1.6305057096247961, "grad_norm": 1.712278962135315, "learning_rate": 4.93972667233425e-05, "loss": 0.0576, "num_input_tokens_seen": 21603456, "step": 9995 }, { "epoch": 1.631321370309951, "grad_norm": 1.8469611406326294, "learning_rate": 4.9395712164083406e-05, "loss": 0.162, "num_input_tokens_seen": 21613248, "step": 10000 }, { "epoch": 1.632137030995106, "grad_norm": 0.2925925552845001, "learning_rate": 4.93941556271841e-05, "loss": 0.0475, "num_input_tokens_seen": 21624256, "step": 10005 }, { "epoch": 1.632952691680261, "grad_norm": 0.38624608516693115, "learning_rate": 4.9392597112770765e-05, "loss": 0.1058, "num_input_tokens_seen": 21636032, "step": 10010 }, { "epoch": 1.6337683523654158, "grad_norm": 3.0619940757751465, "learning_rate": 4.939103662096974e-05, "loss": 0.1101, "num_input_tokens_seen": 21647264, "step": 10015 }, { "epoch": 1.634584013050571, "grad_norm": 4.746956825256348, "learning_rate": 4.938947415190754e-05, "loss": 0.0825, "num_input_tokens_seen": 21658496, "step": 10020 }, { "epoch": 1.635399673735726, "grad_norm": 1.4105510711669922, "learning_rate": 4.93879097057108e-05, "loss": 0.1926, "num_input_tokens_seen": 21669248, "step": 10025 }, { "epoch": 1.636215334420881, "grad_norm": 2.56160044670105, "learning_rate": 4.938634328250636e-05, "loss": 0.0852, "num_input_tokens_seen": 21679840, "step": 10030 }, { "epoch": 1.637030995106036, "grad_norm": 0.12285134196281433, "learning_rate": 4.93847748824212e-05, "loss": 0.1782, "num_input_tokens_seen": 21691904, "step": 10035 }, { "epoch": 1.6378466557911908, "grad_norm": 0.17458020150661469, "learning_rate": 4.938320450558246e-05, "loss": 0.0126, "num_input_tokens_seen": 21701408, "step": 10040 }, { "epoch": 1.6386623164763459, "grad_norm": 0.1861964613199234, "learning_rate": 4.938163215211745e-05, "loss": 0.0405, "num_input_tokens_seen": 21712256, "step": 10045 }, { "epoch": 1.639477977161501, "grad_norm": 0.37605103850364685, "learning_rate": 4.938005782215362e-05, "loss": 0.0657, "num_input_tokens_seen": 21722304, "step": 10050 }, { "epoch": 1.6402936378466557, "grad_norm": 0.7383480072021484, "learning_rate": 4.93784815158186e-05, "loss": 0.0866, "num_input_tokens_seen": 21733152, "step": 10055 }, { "epoch": 1.6411092985318108, "grad_norm": 2.417325973510742, "learning_rate": 4.937690323324017e-05, "loss": 0.157, "num_input_tokens_seen": 21744160, "step": 10060 }, { "epoch": 1.6419249592169658, "grad_norm": 3.986969232559204, "learning_rate": 4.9375322974546285e-05, "loss": 0.0965, "num_input_tokens_seen": 21754432, "step": 10065 }, { "epoch": 1.6427406199021206, "grad_norm": 0.06366409361362457, "learning_rate": 4.937374073986504e-05, "loss": 0.0182, "num_input_tokens_seen": 21764928, "step": 10070 }, { "epoch": 1.6435562805872757, "grad_norm": 2.6650710105895996, "learning_rate": 4.937215652932469e-05, "loss": 0.2547, "num_input_tokens_seen": 21775008, "step": 10075 }, { "epoch": 1.6443719412724307, "grad_norm": 4.918404579162598, "learning_rate": 4.937057034305368e-05, "loss": 0.1259, "num_input_tokens_seen": 21785792, "step": 10080 }, { "epoch": 1.6451876019575855, "grad_norm": 0.11018738150596619, "learning_rate": 4.9368982181180576e-05, "loss": 0.1627, "num_input_tokens_seen": 21796704, "step": 10085 }, { "epoch": 1.6460032626427408, "grad_norm": 0.10577882081270218, "learning_rate": 4.936739204383413e-05, "loss": 0.3467, "num_input_tokens_seen": 21807840, "step": 10090 }, { "epoch": 1.6468189233278956, "grad_norm": 3.6516923904418945, "learning_rate": 4.936579993114324e-05, "loss": 0.1327, "num_input_tokens_seen": 21818528, "step": 10095 }, { "epoch": 1.6476345840130504, "grad_norm": 0.052826497703790665, "learning_rate": 4.936420584323699e-05, "loss": 0.0168, "num_input_tokens_seen": 21829472, "step": 10100 }, { "epoch": 1.6484502446982057, "grad_norm": 5.709690093994141, "learning_rate": 4.936260978024458e-05, "loss": 0.5152, "num_input_tokens_seen": 21841024, "step": 10105 }, { "epoch": 1.6492659053833605, "grad_norm": 1.8346400260925293, "learning_rate": 4.936101174229541e-05, "loss": 0.4441, "num_input_tokens_seen": 21852096, "step": 10110 }, { "epoch": 1.6500815660685155, "grad_norm": 4.051974773406982, "learning_rate": 4.935941172951902e-05, "loss": 0.1744, "num_input_tokens_seen": 21863904, "step": 10115 }, { "epoch": 1.6508972267536706, "grad_norm": 1.8480687141418457, "learning_rate": 4.9357809742045126e-05, "loss": 0.1047, "num_input_tokens_seen": 21874048, "step": 10120 }, { "epoch": 1.6517128874388254, "grad_norm": 4.244266986846924, "learning_rate": 4.935620578000358e-05, "loss": 0.1778, "num_input_tokens_seen": 21885632, "step": 10125 }, { "epoch": 1.6525285481239804, "grad_norm": 0.2436949461698532, "learning_rate": 4.935459984352441e-05, "loss": 0.2817, "num_input_tokens_seen": 21896864, "step": 10130 }, { "epoch": 1.6533442088091355, "grad_norm": 2.2461154460906982, "learning_rate": 4.93529919327378e-05, "loss": 0.0961, "num_input_tokens_seen": 21907712, "step": 10135 }, { "epoch": 1.6541598694942903, "grad_norm": 0.9836466908454895, "learning_rate": 4.9351382047774095e-05, "loss": 0.1588, "num_input_tokens_seen": 21918464, "step": 10140 }, { "epoch": 1.6549755301794453, "grad_norm": 0.1777764856815338, "learning_rate": 4.934977018876381e-05, "loss": 0.2104, "num_input_tokens_seen": 21929664, "step": 10145 }, { "epoch": 1.6557911908646004, "grad_norm": 0.8241901397705078, "learning_rate": 4.93481563558376e-05, "loss": 0.1703, "num_input_tokens_seen": 21940288, "step": 10150 }, { "epoch": 1.6566068515497552, "grad_norm": 3.3961079120635986, "learning_rate": 4.9346540549126305e-05, "loss": 0.044, "num_input_tokens_seen": 21950464, "step": 10155 }, { "epoch": 1.6574225122349104, "grad_norm": 1.6570359468460083, "learning_rate": 4.934492276876089e-05, "loss": 0.1409, "num_input_tokens_seen": 21961696, "step": 10160 }, { "epoch": 1.6582381729200653, "grad_norm": 0.07733599096536636, "learning_rate": 4.934330301487251e-05, "loss": 0.1334, "num_input_tokens_seen": 21972192, "step": 10165 }, { "epoch": 1.65905383360522, "grad_norm": 0.19854383170604706, "learning_rate": 4.934168128759248e-05, "loss": 0.2047, "num_input_tokens_seen": 21982112, "step": 10170 }, { "epoch": 1.6598694942903753, "grad_norm": 4.915229320526123, "learning_rate": 4.9340057587052245e-05, "loss": 0.1606, "num_input_tokens_seen": 21993568, "step": 10175 }, { "epoch": 1.6606851549755302, "grad_norm": 1.7136536836624146, "learning_rate": 4.9338431913383444e-05, "loss": 0.0769, "num_input_tokens_seen": 22003296, "step": 10180 }, { "epoch": 1.6615008156606852, "grad_norm": 0.07920566201210022, "learning_rate": 4.9336804266717864e-05, "loss": 0.062, "num_input_tokens_seen": 22013344, "step": 10185 }, { "epoch": 1.6623164763458402, "grad_norm": 0.15559551119804382, "learning_rate": 4.933517464718744e-05, "loss": 0.2693, "num_input_tokens_seen": 22022112, "step": 10190 }, { "epoch": 1.663132137030995, "grad_norm": 0.16380707919597626, "learning_rate": 4.933354305492429e-05, "loss": 0.1376, "num_input_tokens_seen": 22032704, "step": 10195 }, { "epoch": 1.66394779771615, "grad_norm": 0.12119174748659134, "learning_rate": 4.933190949006068e-05, "loss": 0.0976, "num_input_tokens_seen": 22044320, "step": 10200 }, { "epoch": 1.6647634584013051, "grad_norm": 0.12529213726520538, "learning_rate": 4.933027395272901e-05, "loss": 0.0086, "num_input_tokens_seen": 22055008, "step": 10205 }, { "epoch": 1.66557911908646, "grad_norm": 0.555073618888855, "learning_rate": 4.9328636443061894e-05, "loss": 0.133, "num_input_tokens_seen": 22065920, "step": 10210 }, { "epoch": 1.666394779771615, "grad_norm": 0.13457554578781128, "learning_rate": 4.932699696119207e-05, "loss": 0.0583, "num_input_tokens_seen": 22077024, "step": 10215 }, { "epoch": 1.66721044045677, "grad_norm": 3.605102777481079, "learning_rate": 4.932535550725243e-05, "loss": 0.3744, "num_input_tokens_seen": 22088320, "step": 10220 }, { "epoch": 1.6680261011419248, "grad_norm": 0.05995646119117737, "learning_rate": 4.932371208137605e-05, "loss": 0.0263, "num_input_tokens_seen": 22098528, "step": 10225 }, { "epoch": 1.6688417618270799, "grad_norm": 2.473446846008301, "learning_rate": 4.932206668369615e-05, "loss": 0.157, "num_input_tokens_seen": 22109792, "step": 10230 }, { "epoch": 1.669657422512235, "grad_norm": 0.20219111442565918, "learning_rate": 4.932041931434611e-05, "loss": 0.2797, "num_input_tokens_seen": 22120064, "step": 10235 }, { "epoch": 1.6704730831973897, "grad_norm": 0.13476024568080902, "learning_rate": 4.931876997345949e-05, "loss": 0.0701, "num_input_tokens_seen": 22131200, "step": 10240 }, { "epoch": 1.671288743882545, "grad_norm": 0.13390684127807617, "learning_rate": 4.931711866116998e-05, "loss": 0.1255, "num_input_tokens_seen": 22142880, "step": 10245 }, { "epoch": 1.6721044045676998, "grad_norm": 0.10579252988100052, "learning_rate": 4.9315465377611445e-05, "loss": 0.2971, "num_input_tokens_seen": 22151744, "step": 10250 }, { "epoch": 1.6729200652528549, "grad_norm": 5.694382190704346, "learning_rate": 4.9313810122917914e-05, "loss": 0.1125, "num_input_tokens_seen": 22162432, "step": 10255 }, { "epoch": 1.67373572593801, "grad_norm": 2.870382785797119, "learning_rate": 4.931215289722357e-05, "loss": 0.067, "num_input_tokens_seen": 22174432, "step": 10260 }, { "epoch": 1.6745513866231647, "grad_norm": 2.902642250061035, "learning_rate": 4.931049370066275e-05, "loss": 0.2172, "num_input_tokens_seen": 22185184, "step": 10265 }, { "epoch": 1.6753670473083198, "grad_norm": 0.22015023231506348, "learning_rate": 4.930883253336996e-05, "loss": 0.1038, "num_input_tokens_seen": 22196480, "step": 10270 }, { "epoch": 1.6761827079934748, "grad_norm": 0.15334255993366241, "learning_rate": 4.930716939547986e-05, "loss": 0.1083, "num_input_tokens_seen": 22207392, "step": 10275 }, { "epoch": 1.6769983686786296, "grad_norm": 4.0195231437683105, "learning_rate": 4.930550428712728e-05, "loss": 0.1289, "num_input_tokens_seen": 22218880, "step": 10280 }, { "epoch": 1.6778140293637847, "grad_norm": 0.15493622422218323, "learning_rate": 4.93038372084472e-05, "loss": 0.2316, "num_input_tokens_seen": 22229184, "step": 10285 }, { "epoch": 1.6786296900489397, "grad_norm": 5.129248142242432, "learning_rate": 4.9302168159574756e-05, "loss": 0.2169, "num_input_tokens_seen": 22240096, "step": 10290 }, { "epoch": 1.6794453507340945, "grad_norm": 0.1276528239250183, "learning_rate": 4.930049714064525e-05, "loss": 0.3012, "num_input_tokens_seen": 22251616, "step": 10295 }, { "epoch": 1.6802610114192496, "grad_norm": 1.9392294883728027, "learning_rate": 4.9298824151794154e-05, "loss": 0.1245, "num_input_tokens_seen": 22262464, "step": 10300 }, { "epoch": 1.6810766721044046, "grad_norm": 3.2291712760925293, "learning_rate": 4.9297149193157075e-05, "loss": 0.1465, "num_input_tokens_seen": 22272640, "step": 10305 }, { "epoch": 1.6818923327895594, "grad_norm": 5.780815124511719, "learning_rate": 4.9295472264869804e-05, "loss": 0.1781, "num_input_tokens_seen": 22283264, "step": 10310 }, { "epoch": 1.6827079934747147, "grad_norm": 0.17058567702770233, "learning_rate": 4.929379336706827e-05, "loss": 0.1163, "num_input_tokens_seen": 22294112, "step": 10315 }, { "epoch": 1.6835236541598695, "grad_norm": 0.38892847299575806, "learning_rate": 4.9292112499888584e-05, "loss": 0.177, "num_input_tokens_seen": 22305184, "step": 10320 }, { "epoch": 1.6843393148450243, "grad_norm": 0.9305853843688965, "learning_rate": 4.929042966346701e-05, "loss": 0.123, "num_input_tokens_seen": 22315840, "step": 10325 }, { "epoch": 1.6851549755301796, "grad_norm": 2.4178287982940674, "learning_rate": 4.928874485793995e-05, "loss": 0.1348, "num_input_tokens_seen": 22325728, "step": 10330 }, { "epoch": 1.6859706362153344, "grad_norm": 3.133922815322876, "learning_rate": 4.928705808344399e-05, "loss": 0.1868, "num_input_tokens_seen": 22335904, "step": 10335 }, { "epoch": 1.6867862969004894, "grad_norm": 0.4173702597618103, "learning_rate": 4.928536934011587e-05, "loss": 0.2283, "num_input_tokens_seen": 22345184, "step": 10340 }, { "epoch": 1.6876019575856445, "grad_norm": 9.072600364685059, "learning_rate": 4.92836786280925e-05, "loss": 0.2349, "num_input_tokens_seen": 22356736, "step": 10345 }, { "epoch": 1.6884176182707993, "grad_norm": 1.4616029262542725, "learning_rate": 4.9281985947510915e-05, "loss": 0.1325, "num_input_tokens_seen": 22368704, "step": 10350 }, { "epoch": 1.6892332789559543, "grad_norm": 0.3134356439113617, "learning_rate": 4.9280291298508355e-05, "loss": 0.2812, "num_input_tokens_seen": 22379904, "step": 10355 }, { "epoch": 1.6900489396411094, "grad_norm": 5.039915561676025, "learning_rate": 4.927859468122217e-05, "loss": 0.164, "num_input_tokens_seen": 22391776, "step": 10360 }, { "epoch": 1.6908646003262642, "grad_norm": 2.0959596633911133, "learning_rate": 4.9276896095789924e-05, "loss": 0.0899, "num_input_tokens_seen": 22401792, "step": 10365 }, { "epoch": 1.6916802610114192, "grad_norm": 2.153296947479248, "learning_rate": 4.927519554234929e-05, "loss": 0.1166, "num_input_tokens_seen": 22411968, "step": 10370 }, { "epoch": 1.6924959216965743, "grad_norm": 3.7540318965911865, "learning_rate": 4.9273493021038146e-05, "loss": 0.1372, "num_input_tokens_seen": 22424000, "step": 10375 }, { "epoch": 1.693311582381729, "grad_norm": 3.609611749649048, "learning_rate": 4.927178853199449e-05, "loss": 0.1268, "num_input_tokens_seen": 22434048, "step": 10380 }, { "epoch": 1.6941272430668843, "grad_norm": 0.1421138048171997, "learning_rate": 4.927008207535651e-05, "loss": 0.0819, "num_input_tokens_seen": 22446176, "step": 10385 }, { "epoch": 1.6949429037520392, "grad_norm": 0.2003869116306305, "learning_rate": 4.9268373651262515e-05, "loss": 0.0414, "num_input_tokens_seen": 22457600, "step": 10390 }, { "epoch": 1.695758564437194, "grad_norm": 0.43225133419036865, "learning_rate": 4.9266663259851025e-05, "loss": 0.0819, "num_input_tokens_seen": 22468768, "step": 10395 }, { "epoch": 1.6965742251223492, "grad_norm": 5.00507926940918, "learning_rate": 4.926495090126068e-05, "loss": 0.2094, "num_input_tokens_seen": 22480448, "step": 10400 }, { "epoch": 1.697389885807504, "grad_norm": 2.335838556289673, "learning_rate": 4.92632365756303e-05, "loss": 0.0683, "num_input_tokens_seen": 22490880, "step": 10405 }, { "epoch": 1.698205546492659, "grad_norm": 0.15659040212631226, "learning_rate": 4.926152028309885e-05, "loss": 0.3744, "num_input_tokens_seen": 22500704, "step": 10410 }, { "epoch": 1.6990212071778141, "grad_norm": 0.08425473421812057, "learning_rate": 4.9259802023805466e-05, "loss": 0.0573, "num_input_tokens_seen": 22511040, "step": 10415 }, { "epoch": 1.699836867862969, "grad_norm": 0.16577233374118805, "learning_rate": 4.9258081797889434e-05, "loss": 0.1264, "num_input_tokens_seen": 22521664, "step": 10420 }, { "epoch": 1.700652528548124, "grad_norm": 0.4766976833343506, "learning_rate": 4.925635960549021e-05, "loss": 0.1119, "num_input_tokens_seen": 22532192, "step": 10425 }, { "epoch": 1.701468189233279, "grad_norm": 0.28510862588882446, "learning_rate": 4.92546354467474e-05, "loss": 0.1663, "num_input_tokens_seen": 22543712, "step": 10430 }, { "epoch": 1.7022838499184338, "grad_norm": 0.7656769156455994, "learning_rate": 4.9252909321800775e-05, "loss": 0.2079, "num_input_tokens_seen": 22555232, "step": 10435 }, { "epoch": 1.7030995106035889, "grad_norm": 0.7301608920097351, "learning_rate": 4.925118123079026e-05, "loss": 0.0141, "num_input_tokens_seen": 22566720, "step": 10440 }, { "epoch": 1.703915171288744, "grad_norm": 0.6367702484130859, "learning_rate": 4.924945117385594e-05, "loss": 0.0552, "num_input_tokens_seen": 22577376, "step": 10445 }, { "epoch": 1.7047308319738987, "grad_norm": 3.5113401412963867, "learning_rate": 4.9247719151138086e-05, "loss": 0.2347, "num_input_tokens_seen": 22589568, "step": 10450 }, { "epoch": 1.7055464926590538, "grad_norm": 1.2844570875167847, "learning_rate": 4.924598516277707e-05, "loss": 0.1636, "num_input_tokens_seen": 22600672, "step": 10455 }, { "epoch": 1.7063621533442088, "grad_norm": 0.269253671169281, "learning_rate": 4.924424920891347e-05, "loss": 0.28, "num_input_tokens_seen": 22611200, "step": 10460 }, { "epoch": 1.7071778140293636, "grad_norm": 0.36347147822380066, "learning_rate": 4.9242511289688024e-05, "loss": 0.1163, "num_input_tokens_seen": 22620608, "step": 10465 }, { "epoch": 1.707993474714519, "grad_norm": 0.0721467137336731, "learning_rate": 4.924077140524161e-05, "loss": 0.1647, "num_input_tokens_seen": 22631168, "step": 10470 }, { "epoch": 1.7088091353996737, "grad_norm": 0.4332468807697296, "learning_rate": 4.9239029555715264e-05, "loss": 0.2585, "num_input_tokens_seen": 22640640, "step": 10475 }, { "epoch": 1.7096247960848288, "grad_norm": 0.36143365502357483, "learning_rate": 4.92372857412502e-05, "loss": 0.1593, "num_input_tokens_seen": 22650976, "step": 10480 }, { "epoch": 1.7104404567699838, "grad_norm": 1.6781527996063232, "learning_rate": 4.9235539961987766e-05, "loss": 0.2733, "num_input_tokens_seen": 22661472, "step": 10485 }, { "epoch": 1.7112561174551386, "grad_norm": 0.6840170621871948, "learning_rate": 4.9233792218069494e-05, "loss": 0.1775, "num_input_tokens_seen": 22673024, "step": 10490 }, { "epoch": 1.7120717781402937, "grad_norm": 0.07148187607526779, "learning_rate": 4.923204250963707e-05, "loss": 0.1668, "num_input_tokens_seen": 22684416, "step": 10495 }, { "epoch": 1.7128874388254487, "grad_norm": 0.15426300466060638, "learning_rate": 4.923029083683233e-05, "loss": 0.0727, "num_input_tokens_seen": 22695136, "step": 10500 }, { "epoch": 1.7137030995106035, "grad_norm": 0.618942379951477, "learning_rate": 4.9228537199797263e-05, "loss": 0.204, "num_input_tokens_seen": 22705408, "step": 10505 }, { "epoch": 1.7145187601957586, "grad_norm": 4.171415328979492, "learning_rate": 4.9226781598674047e-05, "loss": 0.1214, "num_input_tokens_seen": 22716000, "step": 10510 }, { "epoch": 1.7153344208809136, "grad_norm": 4.400952339172363, "learning_rate": 4.922502403360498e-05, "loss": 0.2735, "num_input_tokens_seen": 22727936, "step": 10515 }, { "epoch": 1.7161500815660684, "grad_norm": 6.595067024230957, "learning_rate": 4.922326450473255e-05, "loss": 0.3739, "num_input_tokens_seen": 22737632, "step": 10520 }, { "epoch": 1.7169657422512234, "grad_norm": 4.229296684265137, "learning_rate": 4.9221503012199386e-05, "loss": 0.1762, "num_input_tokens_seen": 22748288, "step": 10525 }, { "epoch": 1.7177814029363785, "grad_norm": 2.971266508102417, "learning_rate": 4.92197395561483e-05, "loss": 0.234, "num_input_tokens_seen": 22760288, "step": 10530 }, { "epoch": 1.7185970636215333, "grad_norm": 0.2944638133049011, "learning_rate": 4.9217974136722235e-05, "loss": 0.1311, "num_input_tokens_seen": 22772032, "step": 10535 }, { "epoch": 1.7194127243066886, "grad_norm": 1.3731074333190918, "learning_rate": 4.92162067540643e-05, "loss": 0.0977, "num_input_tokens_seen": 22783392, "step": 10540 }, { "epoch": 1.7202283849918434, "grad_norm": 0.07093972712755203, "learning_rate": 4.921443740831778e-05, "loss": 0.072, "num_input_tokens_seen": 22795808, "step": 10545 }, { "epoch": 1.7210440456769984, "grad_norm": 0.4293394982814789, "learning_rate": 4.9212666099626095e-05, "loss": 0.298, "num_input_tokens_seen": 22806400, "step": 10550 }, { "epoch": 1.7218597063621535, "grad_norm": 2.244494915008545, "learning_rate": 4.9210892828132835e-05, "loss": 0.1399, "num_input_tokens_seen": 22816256, "step": 10555 }, { "epoch": 1.7226753670473083, "grad_norm": 0.49821212887763977, "learning_rate": 4.920911759398177e-05, "loss": 0.0197, "num_input_tokens_seen": 22827200, "step": 10560 }, { "epoch": 1.7234910277324633, "grad_norm": 1.3760017156600952, "learning_rate": 4.920734039731679e-05, "loss": 0.1363, "num_input_tokens_seen": 22837248, "step": 10565 }, { "epoch": 1.7243066884176184, "grad_norm": 0.7770042419433594, "learning_rate": 4.9205561238281985e-05, "loss": 0.076, "num_input_tokens_seen": 22846048, "step": 10570 }, { "epoch": 1.7251223491027732, "grad_norm": 4.274723529815674, "learning_rate": 4.920378011702155e-05, "loss": 0.1818, "num_input_tokens_seen": 22856672, "step": 10575 }, { "epoch": 1.7259380097879282, "grad_norm": 2.931492328643799, "learning_rate": 4.92019970336799e-05, "loss": 0.1957, "num_input_tokens_seen": 22868224, "step": 10580 }, { "epoch": 1.7267536704730833, "grad_norm": 4.527055740356445, "learning_rate": 4.920021198840157e-05, "loss": 0.3171, "num_input_tokens_seen": 22879584, "step": 10585 }, { "epoch": 1.727569331158238, "grad_norm": 0.10872957110404968, "learning_rate": 4.919842498133126e-05, "loss": 0.0381, "num_input_tokens_seen": 22889696, "step": 10590 }, { "epoch": 1.7283849918433931, "grad_norm": 0.2734326124191284, "learning_rate": 4.919663601261384e-05, "loss": 0.0757, "num_input_tokens_seen": 22900224, "step": 10595 }, { "epoch": 1.7292006525285482, "grad_norm": 2.8099300861358643, "learning_rate": 4.919484508239434e-05, "loss": 0.2513, "num_input_tokens_seen": 22911136, "step": 10600 }, { "epoch": 1.730016313213703, "grad_norm": 0.2676504850387573, "learning_rate": 4.9193052190817926e-05, "loss": 0.2708, "num_input_tokens_seen": 22922400, "step": 10605 }, { "epoch": 1.7308319738988582, "grad_norm": 0.5753697752952576, "learning_rate": 4.919125733802995e-05, "loss": 0.1411, "num_input_tokens_seen": 22933664, "step": 10610 }, { "epoch": 1.731647634584013, "grad_norm": 2.312779426574707, "learning_rate": 4.9189460524175915e-05, "loss": 0.16, "num_input_tokens_seen": 22943104, "step": 10615 }, { "epoch": 1.7324632952691679, "grad_norm": 4.1412129402160645, "learning_rate": 4.918766174940146e-05, "loss": 0.1877, "num_input_tokens_seen": 22953344, "step": 10620 }, { "epoch": 1.7332789559543231, "grad_norm": 0.7794988751411438, "learning_rate": 4.918586101385243e-05, "loss": 0.1442, "num_input_tokens_seen": 22964928, "step": 10625 }, { "epoch": 1.734094616639478, "grad_norm": 0.28300607204437256, "learning_rate": 4.918405831767478e-05, "loss": 0.1773, "num_input_tokens_seen": 22977152, "step": 10630 }, { "epoch": 1.734910277324633, "grad_norm": 3.008607864379883, "learning_rate": 4.9182253661014656e-05, "loss": 0.1388, "num_input_tokens_seen": 22988448, "step": 10635 }, { "epoch": 1.735725938009788, "grad_norm": 0.1705198734998703, "learning_rate": 4.9180447044018354e-05, "loss": 0.0846, "num_input_tokens_seen": 22998592, "step": 10640 }, { "epoch": 1.7365415986949428, "grad_norm": 1.8257312774658203, "learning_rate": 4.917863846683232e-05, "loss": 0.0785, "num_input_tokens_seen": 23009344, "step": 10645 }, { "epoch": 1.7373572593800979, "grad_norm": 4.587921142578125, "learning_rate": 4.9176827929603176e-05, "loss": 0.3389, "num_input_tokens_seen": 23019584, "step": 10650 }, { "epoch": 1.738172920065253, "grad_norm": 3.5429532527923584, "learning_rate": 4.9175015432477686e-05, "loss": 0.1734, "num_input_tokens_seen": 23030336, "step": 10655 }, { "epoch": 1.7389885807504077, "grad_norm": 0.692238450050354, "learning_rate": 4.9173200975602776e-05, "loss": 0.1656, "num_input_tokens_seen": 23041760, "step": 10660 }, { "epoch": 1.7398042414355628, "grad_norm": 3.643800735473633, "learning_rate": 4.917138455912555e-05, "loss": 0.1636, "num_input_tokens_seen": 23051840, "step": 10665 }, { "epoch": 1.7406199021207178, "grad_norm": 4.496054649353027, "learning_rate": 4.916956618319324e-05, "loss": 0.0523, "num_input_tokens_seen": 23061632, "step": 10670 }, { "epoch": 1.7414355628058726, "grad_norm": 4.332545757293701, "learning_rate": 4.916774584795327e-05, "loss": 0.1602, "num_input_tokens_seen": 23071520, "step": 10675 }, { "epoch": 1.7422512234910277, "grad_norm": 0.1590498834848404, "learning_rate": 4.916592355355318e-05, "loss": 0.2678, "num_input_tokens_seen": 23082304, "step": 10680 }, { "epoch": 1.7430668841761827, "grad_norm": 4.96671724319458, "learning_rate": 4.916409930014073e-05, "loss": 0.2867, "num_input_tokens_seen": 23092416, "step": 10685 }, { "epoch": 1.7438825448613375, "grad_norm": 3.715205192565918, "learning_rate": 4.916227308786377e-05, "loss": 0.2121, "num_input_tokens_seen": 23102720, "step": 10690 }, { "epoch": 1.7446982055464928, "grad_norm": 0.7393945455551147, "learning_rate": 4.916044491687036e-05, "loss": 0.0452, "num_input_tokens_seen": 23114016, "step": 10695 }, { "epoch": 1.7455138662316476, "grad_norm": 0.9213941693305969, "learning_rate": 4.915861478730869e-05, "loss": 0.0591, "num_input_tokens_seen": 23125888, "step": 10700 }, { "epoch": 1.7463295269168027, "grad_norm": 3.8318026065826416, "learning_rate": 4.915678269932713e-05, "loss": 0.0616, "num_input_tokens_seen": 23137248, "step": 10705 }, { "epoch": 1.7471451876019577, "grad_norm": 0.2188013345003128, "learning_rate": 4.91549486530742e-05, "loss": 0.1431, "num_input_tokens_seen": 23148064, "step": 10710 }, { "epoch": 1.7479608482871125, "grad_norm": 0.37179723381996155, "learning_rate": 4.9153112648698565e-05, "loss": 0.0091, "num_input_tokens_seen": 23158208, "step": 10715 }, { "epoch": 1.7487765089722676, "grad_norm": 2.3651340007781982, "learning_rate": 4.915127468634906e-05, "loss": 0.129, "num_input_tokens_seen": 23170144, "step": 10720 }, { "epoch": 1.7495921696574226, "grad_norm": 0.7593448758125305, "learning_rate": 4.9149434766174695e-05, "loss": 0.1706, "num_input_tokens_seen": 23180768, "step": 10725 }, { "epoch": 1.7504078303425774, "grad_norm": 4.086053371429443, "learning_rate": 4.914759288832462e-05, "loss": 0.2413, "num_input_tokens_seen": 23191840, "step": 10730 }, { "epoch": 1.7512234910277324, "grad_norm": 0.35563644766807556, "learning_rate": 4.914574905294813e-05, "loss": 0.1839, "num_input_tokens_seen": 23201760, "step": 10735 }, { "epoch": 1.7520391517128875, "grad_norm": 0.1594395488500595, "learning_rate": 4.9143903260194715e-05, "loss": 0.0933, "num_input_tokens_seen": 23213024, "step": 10740 }, { "epoch": 1.7528548123980423, "grad_norm": 0.2561168968677521, "learning_rate": 4.914205551021399e-05, "loss": 0.1451, "num_input_tokens_seen": 23224512, "step": 10745 }, { "epoch": 1.7536704730831973, "grad_norm": 0.6068389415740967, "learning_rate": 4.914020580315576e-05, "loss": 0.0916, "num_input_tokens_seen": 23235040, "step": 10750 }, { "epoch": 1.7544861337683524, "grad_norm": 0.6194460391998291, "learning_rate": 4.913835413916996e-05, "loss": 0.1701, "num_input_tokens_seen": 23245056, "step": 10755 }, { "epoch": 1.7553017944535072, "grad_norm": 0.8164099454879761, "learning_rate": 4.9136500518406694e-05, "loss": 0.1467, "num_input_tokens_seen": 23257824, "step": 10760 }, { "epoch": 1.7561174551386625, "grad_norm": 1.7459872961044312, "learning_rate": 4.913464494101622e-05, "loss": 0.1432, "num_input_tokens_seen": 23269376, "step": 10765 }, { "epoch": 1.7569331158238173, "grad_norm": 3.7063353061676025, "learning_rate": 4.913278740714898e-05, "loss": 0.1261, "num_input_tokens_seen": 23282560, "step": 10770 }, { "epoch": 1.7577487765089723, "grad_norm": 0.7812284231185913, "learning_rate": 4.913092791695554e-05, "loss": 0.074, "num_input_tokens_seen": 23292896, "step": 10775 }, { "epoch": 1.7585644371941274, "grad_norm": 1.1562386751174927, "learning_rate": 4.912906647058664e-05, "loss": 0.0276, "num_input_tokens_seen": 23303168, "step": 10780 }, { "epoch": 1.7593800978792822, "grad_norm": 0.2978338301181793, "learning_rate": 4.912720306819319e-05, "loss": 0.1808, "num_input_tokens_seen": 23314784, "step": 10785 }, { "epoch": 1.7601957585644372, "grad_norm": 0.9384825825691223, "learning_rate": 4.9125337709926235e-05, "loss": 0.1236, "num_input_tokens_seen": 23325696, "step": 10790 }, { "epoch": 1.7610114192495923, "grad_norm": 2.842236042022705, "learning_rate": 4.9123470395937e-05, "loss": 0.2587, "num_input_tokens_seen": 23337120, "step": 10795 }, { "epoch": 1.761827079934747, "grad_norm": 1.5775529146194458, "learning_rate": 4.9121601126376845e-05, "loss": 0.1221, "num_input_tokens_seen": 23347584, "step": 10800 }, { "epoch": 1.7626427406199021, "grad_norm": 2.2849128246307373, "learning_rate": 4.9119729901397313e-05, "loss": 0.3481, "num_input_tokens_seen": 23357504, "step": 10805 }, { "epoch": 1.7634584013050572, "grad_norm": 0.4386286735534668, "learning_rate": 4.9117856721150095e-05, "loss": 0.1882, "num_input_tokens_seen": 23367456, "step": 10810 }, { "epoch": 1.764274061990212, "grad_norm": 0.11932945996522903, "learning_rate": 4.911598158578704e-05, "loss": 0.0874, "num_input_tokens_seen": 23377984, "step": 10815 }, { "epoch": 1.765089722675367, "grad_norm": 1.375817060470581, "learning_rate": 4.9114104495460154e-05, "loss": 0.2791, "num_input_tokens_seen": 23390272, "step": 10820 }, { "epoch": 1.765905383360522, "grad_norm": 0.24430467188358307, "learning_rate": 4.9112225450321606e-05, "loss": 0.0177, "num_input_tokens_seen": 23400416, "step": 10825 }, { "epoch": 1.7667210440456769, "grad_norm": 3.3636837005615234, "learning_rate": 4.911034445052371e-05, "loss": 0.1068, "num_input_tokens_seen": 23412704, "step": 10830 }, { "epoch": 1.7675367047308321, "grad_norm": 0.5361234545707703, "learning_rate": 4.910846149621896e-05, "loss": 0.148, "num_input_tokens_seen": 23421952, "step": 10835 }, { "epoch": 1.768352365415987, "grad_norm": 0.1068536564707756, "learning_rate": 4.9106576587560006e-05, "loss": 0.1008, "num_input_tokens_seen": 23431136, "step": 10840 }, { "epoch": 1.7691680261011418, "grad_norm": 0.2754809558391571, "learning_rate": 4.9104689724699625e-05, "loss": 0.1675, "num_input_tokens_seen": 23442880, "step": 10845 }, { "epoch": 1.769983686786297, "grad_norm": 4.434019565582275, "learning_rate": 4.91028009077908e-05, "loss": 0.17, "num_input_tokens_seen": 23453120, "step": 10850 }, { "epoch": 1.7707993474714518, "grad_norm": 0.20431602001190186, "learning_rate": 4.910091013698663e-05, "loss": 0.2072, "num_input_tokens_seen": 23464192, "step": 10855 }, { "epoch": 1.7716150081566069, "grad_norm": 0.8551867008209229, "learning_rate": 4.90990174124404e-05, "loss": 0.2168, "num_input_tokens_seen": 23475264, "step": 10860 }, { "epoch": 1.772430668841762, "grad_norm": 1.3833519220352173, "learning_rate": 4.909712273430554e-05, "loss": 0.073, "num_input_tokens_seen": 23485440, "step": 10865 }, { "epoch": 1.7732463295269167, "grad_norm": 0.8069531917572021, "learning_rate": 4.9095226102735645e-05, "loss": 0.1187, "num_input_tokens_seen": 23496768, "step": 10870 }, { "epoch": 1.7740619902120718, "grad_norm": 0.052002016454935074, "learning_rate": 4.909332751788447e-05, "loss": 0.1645, "num_input_tokens_seen": 23507328, "step": 10875 }, { "epoch": 1.7748776508972268, "grad_norm": 1.4189658164978027, "learning_rate": 4.909142697990591e-05, "loss": 0.0986, "num_input_tokens_seen": 23518976, "step": 10880 }, { "epoch": 1.7756933115823816, "grad_norm": 0.11173592507839203, "learning_rate": 4.908952448895404e-05, "loss": 0.1, "num_input_tokens_seen": 23530720, "step": 10885 }, { "epoch": 1.7765089722675367, "grad_norm": 1.56269109249115, "learning_rate": 4.908762004518309e-05, "loss": 0.0558, "num_input_tokens_seen": 23541312, "step": 10890 }, { "epoch": 1.7773246329526917, "grad_norm": 0.5744493007659912, "learning_rate": 4.908571364874743e-05, "loss": 0.103, "num_input_tokens_seen": 23552064, "step": 10895 }, { "epoch": 1.7781402936378465, "grad_norm": 0.17743384838104248, "learning_rate": 4.9083805299801626e-05, "loss": 0.1079, "num_input_tokens_seen": 23563104, "step": 10900 }, { "epoch": 1.7789559543230016, "grad_norm": 3.0502898693084717, "learning_rate": 4.908189499850036e-05, "loss": 0.136, "num_input_tokens_seen": 23572992, "step": 10905 }, { "epoch": 1.7797716150081566, "grad_norm": 3.5304319858551025, "learning_rate": 4.907998274499849e-05, "loss": 0.2232, "num_input_tokens_seen": 23583328, "step": 10910 }, { "epoch": 1.7805872756933114, "grad_norm": 0.8347854614257812, "learning_rate": 4.9078068539451045e-05, "loss": 0.0981, "num_input_tokens_seen": 23593888, "step": 10915 }, { "epoch": 1.7814029363784667, "grad_norm": 4.794600486755371, "learning_rate": 4.907615238201319e-05, "loss": 0.1206, "num_input_tokens_seen": 23604544, "step": 10920 }, { "epoch": 1.7822185970636215, "grad_norm": 0.2617413103580475, "learning_rate": 4.907423427284026e-05, "loss": 0.2593, "num_input_tokens_seen": 23615584, "step": 10925 }, { "epoch": 1.7830342577487766, "grad_norm": 0.05737384408712387, "learning_rate": 4.907231421208775e-05, "loss": 0.0256, "num_input_tokens_seen": 23625568, "step": 10930 }, { "epoch": 1.7838499184339316, "grad_norm": 0.07537528872489929, "learning_rate": 4.907039219991131e-05, "loss": 0.0192, "num_input_tokens_seen": 23636320, "step": 10935 }, { "epoch": 1.7846655791190864, "grad_norm": 2.5834670066833496, "learning_rate": 4.906846823646675e-05, "loss": 0.2319, "num_input_tokens_seen": 23646912, "step": 10940 }, { "epoch": 1.7854812398042414, "grad_norm": 0.09616564214229584, "learning_rate": 4.906654232191002e-05, "loss": 0.1385, "num_input_tokens_seen": 23657120, "step": 10945 }, { "epoch": 1.7862969004893965, "grad_norm": 3.8253488540649414, "learning_rate": 4.906461445639726e-05, "loss": 0.208, "num_input_tokens_seen": 23667712, "step": 10950 }, { "epoch": 1.7871125611745513, "grad_norm": 0.83757483959198, "learning_rate": 4.906268464008476e-05, "loss": 0.106, "num_input_tokens_seen": 23678720, "step": 10955 }, { "epoch": 1.7879282218597063, "grad_norm": 2.588719129562378, "learning_rate": 4.9060752873128946e-05, "loss": 0.0493, "num_input_tokens_seen": 23688160, "step": 10960 }, { "epoch": 1.7887438825448614, "grad_norm": 4.905638217926025, "learning_rate": 4.905881915568642e-05, "loss": 0.2047, "num_input_tokens_seen": 23699680, "step": 10965 }, { "epoch": 1.7895595432300162, "grad_norm": 2.158893585205078, "learning_rate": 4.905688348791394e-05, "loss": 0.0967, "num_input_tokens_seen": 23710880, "step": 10970 }, { "epoch": 1.7903752039151712, "grad_norm": 0.4053361713886261, "learning_rate": 4.905494586996842e-05, "loss": 0.0522, "num_input_tokens_seen": 23722304, "step": 10975 }, { "epoch": 1.7911908646003263, "grad_norm": 4.744560241699219, "learning_rate": 4.905300630200693e-05, "loss": 0.2627, "num_input_tokens_seen": 23734048, "step": 10980 }, { "epoch": 1.792006525285481, "grad_norm": 0.09347156435251236, "learning_rate": 4.9051064784186704e-05, "loss": 0.0866, "num_input_tokens_seen": 23745600, "step": 10985 }, { "epoch": 1.7928221859706364, "grad_norm": 0.11766652762889862, "learning_rate": 4.9049121316665146e-05, "loss": 0.0331, "num_input_tokens_seen": 23756800, "step": 10990 }, { "epoch": 1.7936378466557912, "grad_norm": 2.3665807247161865, "learning_rate": 4.904717589959978e-05, "loss": 0.282, "num_input_tokens_seen": 23766880, "step": 10995 }, { "epoch": 1.7944535073409462, "grad_norm": 0.056640349328517914, "learning_rate": 4.904522853314833e-05, "loss": 0.1131, "num_input_tokens_seen": 23777472, "step": 11000 }, { "epoch": 1.7952691680261013, "grad_norm": 2.2606041431427, "learning_rate": 4.904327921746864e-05, "loss": 0.1389, "num_input_tokens_seen": 23786336, "step": 11005 }, { "epoch": 1.796084828711256, "grad_norm": 3.8277926445007324, "learning_rate": 4.904132795271875e-05, "loss": 0.2275, "num_input_tokens_seen": 23797184, "step": 11010 }, { "epoch": 1.7969004893964111, "grad_norm": 0.5380455255508423, "learning_rate": 4.9039374739056825e-05, "loss": 0.0943, "num_input_tokens_seen": 23808288, "step": 11015 }, { "epoch": 1.7977161500815662, "grad_norm": 1.1800470352172852, "learning_rate": 4.903741957664121e-05, "loss": 0.2547, "num_input_tokens_seen": 23818176, "step": 11020 }, { "epoch": 1.798531810766721, "grad_norm": 0.8688233494758606, "learning_rate": 4.903546246563041e-05, "loss": 0.1267, "num_input_tokens_seen": 23829344, "step": 11025 }, { "epoch": 1.799347471451876, "grad_norm": 2.287400484085083, "learning_rate": 4.9033503406183055e-05, "loss": 0.1267, "num_input_tokens_seen": 23839936, "step": 11030 }, { "epoch": 1.800163132137031, "grad_norm": 3.783482074737549, "learning_rate": 4.9031542398457974e-05, "loss": 0.1061, "num_input_tokens_seen": 23852736, "step": 11035 }, { "epoch": 1.8009787928221859, "grad_norm": 0.21201197803020477, "learning_rate": 4.902957944261413e-05, "loss": 0.1609, "num_input_tokens_seen": 23864736, "step": 11040 }, { "epoch": 1.801794453507341, "grad_norm": 2.7451019287109375, "learning_rate": 4.902761453881065e-05, "loss": 0.1805, "num_input_tokens_seen": 23874432, "step": 11045 }, { "epoch": 1.802610114192496, "grad_norm": 1.6343696117401123, "learning_rate": 4.9025647687206824e-05, "loss": 0.0406, "num_input_tokens_seen": 23885056, "step": 11050 }, { "epoch": 1.8034257748776508, "grad_norm": 0.5249418020248413, "learning_rate": 4.90236788879621e-05, "loss": 0.1174, "num_input_tokens_seen": 23896096, "step": 11055 }, { "epoch": 1.804241435562806, "grad_norm": 0.22775468230247498, "learning_rate": 4.9021708141236056e-05, "loss": 0.0624, "num_input_tokens_seen": 23907200, "step": 11060 }, { "epoch": 1.8050570962479608, "grad_norm": 0.21686673164367676, "learning_rate": 4.901973544718847e-05, "loss": 0.1234, "num_input_tokens_seen": 23916576, "step": 11065 }, { "epoch": 1.8058727569331157, "grad_norm": 0.07676563411951065, "learning_rate": 4.901776080597926e-05, "loss": 0.0371, "num_input_tokens_seen": 23926784, "step": 11070 }, { "epoch": 1.806688417618271, "grad_norm": 1.3494975566864014, "learning_rate": 4.9015784217768487e-05, "loss": 0.2541, "num_input_tokens_seen": 23937632, "step": 11075 }, { "epoch": 1.8075040783034257, "grad_norm": 0.22478091716766357, "learning_rate": 4.901380568271639e-05, "loss": 0.0092, "num_input_tokens_seen": 23949024, "step": 11080 }, { "epoch": 1.8083197389885808, "grad_norm": 2.6358680725097656, "learning_rate": 4.901182520098336e-05, "loss": 0.144, "num_input_tokens_seen": 23960192, "step": 11085 }, { "epoch": 1.8091353996737358, "grad_norm": 0.964083194732666, "learning_rate": 4.9009842772729944e-05, "loss": 0.2404, "num_input_tokens_seen": 23970304, "step": 11090 }, { "epoch": 1.8099510603588906, "grad_norm": 0.11122696846723557, "learning_rate": 4.9007858398116856e-05, "loss": 0.1999, "num_input_tokens_seen": 23980544, "step": 11095 }, { "epoch": 1.8107667210440457, "grad_norm": 5.403952598571777, "learning_rate": 4.9005872077304944e-05, "loss": 0.0819, "num_input_tokens_seen": 23991680, "step": 11100 }, { "epoch": 1.8115823817292007, "grad_norm": 0.19622884690761566, "learning_rate": 4.900388381045524e-05, "loss": 0.282, "num_input_tokens_seen": 24000672, "step": 11105 }, { "epoch": 1.8123980424143555, "grad_norm": 0.861206591129303, "learning_rate": 4.9001893597728915e-05, "loss": 0.1903, "num_input_tokens_seen": 24010496, "step": 11110 }, { "epoch": 1.8132137030995106, "grad_norm": 0.15993455052375793, "learning_rate": 4.899990143928731e-05, "loss": 0.13, "num_input_tokens_seen": 24021056, "step": 11115 }, { "epoch": 1.8140293637846656, "grad_norm": 0.10411029309034348, "learning_rate": 4.899790733529193e-05, "loss": 0.0775, "num_input_tokens_seen": 24031136, "step": 11120 }, { "epoch": 1.8148450244698204, "grad_norm": 0.17643879354000092, "learning_rate": 4.8995911285904404e-05, "loss": 0.0145, "num_input_tokens_seen": 24040864, "step": 11125 }, { "epoch": 1.8156606851549757, "grad_norm": 0.20683497190475464, "learning_rate": 4.899391329128656e-05, "loss": 0.2032, "num_input_tokens_seen": 24051168, "step": 11130 }, { "epoch": 1.8164763458401305, "grad_norm": 2.6556005477905273, "learning_rate": 4.899191335160037e-05, "loss": 0.159, "num_input_tokens_seen": 24060448, "step": 11135 }, { "epoch": 1.8172920065252853, "grad_norm": 0.18561014533042908, "learning_rate": 4.898991146700794e-05, "loss": 0.4073, "num_input_tokens_seen": 24070208, "step": 11140 }, { "epoch": 1.8181076672104406, "grad_norm": 0.1081450805068016, "learning_rate": 4.898790763767157e-05, "loss": 0.0513, "num_input_tokens_seen": 24080000, "step": 11145 }, { "epoch": 1.8189233278955954, "grad_norm": 0.46138057112693787, "learning_rate": 4.8985901863753694e-05, "loss": 0.0705, "num_input_tokens_seen": 24089120, "step": 11150 }, { "epoch": 1.8197389885807504, "grad_norm": 0.14180487394332886, "learning_rate": 4.8983894145416896e-05, "loss": 0.0435, "num_input_tokens_seen": 24099968, "step": 11155 }, { "epoch": 1.8205546492659055, "grad_norm": 1.645262360572815, "learning_rate": 4.898188448282396e-05, "loss": 0.0534, "num_input_tokens_seen": 24110560, "step": 11160 }, { "epoch": 1.8213703099510603, "grad_norm": 0.09216941893100739, "learning_rate": 4.897987287613778e-05, "loss": 0.1165, "num_input_tokens_seen": 24121984, "step": 11165 }, { "epoch": 1.8221859706362153, "grad_norm": 3.4673421382904053, "learning_rate": 4.897785932552143e-05, "loss": 0.2744, "num_input_tokens_seen": 24132544, "step": 11170 }, { "epoch": 1.8230016313213704, "grad_norm": 3.1680285930633545, "learning_rate": 4.897584383113814e-05, "loss": 0.1616, "num_input_tokens_seen": 24143808, "step": 11175 }, { "epoch": 1.8238172920065252, "grad_norm": 2.7808051109313965, "learning_rate": 4.89738263931513e-05, "loss": 0.3092, "num_input_tokens_seen": 24154496, "step": 11180 }, { "epoch": 1.8246329526916802, "grad_norm": 2.7151877880096436, "learning_rate": 4.8971807011724444e-05, "loss": 0.1983, "num_input_tokens_seen": 24165472, "step": 11185 }, { "epoch": 1.8254486133768353, "grad_norm": 0.22334140539169312, "learning_rate": 4.8969785687021294e-05, "loss": 0.0279, "num_input_tokens_seen": 24177088, "step": 11190 }, { "epoch": 1.82626427406199, "grad_norm": 0.19176222383975983, "learning_rate": 4.8967762419205684e-05, "loss": 0.1226, "num_input_tokens_seen": 24188768, "step": 11195 }, { "epoch": 1.8270799347471451, "grad_norm": 0.1524587869644165, "learning_rate": 4.896573720844164e-05, "loss": 0.2339, "num_input_tokens_seen": 24199712, "step": 11200 }, { "epoch": 1.8278955954323002, "grad_norm": 1.981576919555664, "learning_rate": 4.896371005489334e-05, "loss": 0.164, "num_input_tokens_seen": 24210912, "step": 11205 }, { "epoch": 1.828711256117455, "grad_norm": 5.648102760314941, "learning_rate": 4.896168095872511e-05, "loss": 0.2033, "num_input_tokens_seen": 24221824, "step": 11210 }, { "epoch": 1.8295269168026103, "grad_norm": 0.17025180160999298, "learning_rate": 4.895964992010145e-05, "loss": 0.0773, "num_input_tokens_seen": 24231168, "step": 11215 }, { "epoch": 1.830342577487765, "grad_norm": 0.08324454724788666, "learning_rate": 4.895761693918699e-05, "loss": 0.0206, "num_input_tokens_seen": 24241408, "step": 11220 }, { "epoch": 1.8311582381729201, "grad_norm": 1.4679114818572998, "learning_rate": 4.895558201614654e-05, "loss": 0.1481, "num_input_tokens_seen": 24251520, "step": 11225 }, { "epoch": 1.8319738988580752, "grad_norm": 0.139203280210495, "learning_rate": 4.895354515114506e-05, "loss": 0.2452, "num_input_tokens_seen": 24262144, "step": 11230 }, { "epoch": 1.83278955954323, "grad_norm": 1.3829185962677002, "learning_rate": 4.895150634434769e-05, "loss": 0.1032, "num_input_tokens_seen": 24273440, "step": 11235 }, { "epoch": 1.833605220228385, "grad_norm": 2.407167673110962, "learning_rate": 4.894946559591966e-05, "loss": 0.0307, "num_input_tokens_seen": 24284832, "step": 11240 }, { "epoch": 1.83442088091354, "grad_norm": 0.31610408425331116, "learning_rate": 4.8947422906026446e-05, "loss": 0.1837, "num_input_tokens_seen": 24295488, "step": 11245 }, { "epoch": 1.8352365415986949, "grad_norm": 1.0342751741409302, "learning_rate": 4.894537827483362e-05, "loss": 0.0547, "num_input_tokens_seen": 24306624, "step": 11250 }, { "epoch": 1.83605220228385, "grad_norm": 0.16724175214767456, "learning_rate": 4.8943331702506935e-05, "loss": 0.0728, "num_input_tokens_seen": 24318176, "step": 11255 }, { "epoch": 1.836867862969005, "grad_norm": 1.1085926294326782, "learning_rate": 4.894128318921229e-05, "loss": 0.0768, "num_input_tokens_seen": 24329024, "step": 11260 }, { "epoch": 1.8376835236541598, "grad_norm": 0.05294278636574745, "learning_rate": 4.893923273511576e-05, "loss": 0.0307, "num_input_tokens_seen": 24339680, "step": 11265 }, { "epoch": 1.8384991843393148, "grad_norm": 0.09843534976243973, "learning_rate": 4.893718034038355e-05, "loss": 0.2451, "num_input_tokens_seen": 24350368, "step": 11270 }, { "epoch": 1.8393148450244698, "grad_norm": 0.2138974964618683, "learning_rate": 4.8935126005182056e-05, "loss": 0.1078, "num_input_tokens_seen": 24361536, "step": 11275 }, { "epoch": 1.8401305057096247, "grad_norm": 0.11413642764091492, "learning_rate": 4.8933069729677795e-05, "loss": 0.2499, "num_input_tokens_seen": 24372768, "step": 11280 }, { "epoch": 1.84094616639478, "grad_norm": 5.159852504730225, "learning_rate": 4.893101151403747e-05, "loss": 0.189, "num_input_tokens_seen": 24382592, "step": 11285 }, { "epoch": 1.8417618270799347, "grad_norm": 2.8542661666870117, "learning_rate": 4.892895135842792e-05, "loss": 0.084, "num_input_tokens_seen": 24393248, "step": 11290 }, { "epoch": 1.8425774877650896, "grad_norm": 0.7388946413993835, "learning_rate": 4.892688926301616e-05, "loss": 0.1437, "num_input_tokens_seen": 24403232, "step": 11295 }, { "epoch": 1.8433931484502448, "grad_norm": 1.2321661710739136, "learning_rate": 4.892482522796936e-05, "loss": 0.0948, "num_input_tokens_seen": 24413792, "step": 11300 }, { "epoch": 1.8442088091353996, "grad_norm": 0.13754427433013916, "learning_rate": 4.892275925345483e-05, "loss": 0.0725, "num_input_tokens_seen": 24423328, "step": 11305 }, { "epoch": 1.8450244698205547, "grad_norm": 1.5106384754180908, "learning_rate": 4.8920691339640055e-05, "loss": 0.3124, "num_input_tokens_seen": 24434464, "step": 11310 }, { "epoch": 1.8458401305057097, "grad_norm": 0.12568306922912598, "learning_rate": 4.8918621486692663e-05, "loss": 0.1007, "num_input_tokens_seen": 24444224, "step": 11315 }, { "epoch": 1.8466557911908645, "grad_norm": 0.09501215815544128, "learning_rate": 4.8916549694780455e-05, "loss": 0.1191, "num_input_tokens_seen": 24454912, "step": 11320 }, { "epoch": 1.8474714518760196, "grad_norm": 0.7249631881713867, "learning_rate": 4.891447596407137e-05, "loss": 0.0572, "num_input_tokens_seen": 24466304, "step": 11325 }, { "epoch": 1.8482871125611746, "grad_norm": 2.0175819396972656, "learning_rate": 4.8912400294733526e-05, "loss": 0.1571, "num_input_tokens_seen": 24476960, "step": 11330 }, { "epoch": 1.8491027732463294, "grad_norm": 1.7725948095321655, "learning_rate": 4.891032268693519e-05, "loss": 0.2005, "num_input_tokens_seen": 24486784, "step": 11335 }, { "epoch": 1.8499184339314845, "grad_norm": 0.12097588926553726, "learning_rate": 4.8908243140844765e-05, "loss": 0.0358, "num_input_tokens_seen": 24497120, "step": 11340 }, { "epoch": 1.8507340946166395, "grad_norm": 1.4836633205413818, "learning_rate": 4.890616165663085e-05, "loss": 0.1303, "num_input_tokens_seen": 24507200, "step": 11345 }, { "epoch": 1.8515497553017943, "grad_norm": 3.1321067810058594, "learning_rate": 4.890407823446218e-05, "loss": 0.2664, "num_input_tokens_seen": 24517920, "step": 11350 }, { "epoch": 1.8523654159869496, "grad_norm": 0.13268068432807922, "learning_rate": 4.890199287450763e-05, "loss": 0.0902, "num_input_tokens_seen": 24528640, "step": 11355 }, { "epoch": 1.8531810766721044, "grad_norm": 4.3858537673950195, "learning_rate": 4.889990557693626e-05, "loss": 0.2153, "num_input_tokens_seen": 24540352, "step": 11360 }, { "epoch": 1.8539967373572592, "grad_norm": 2.9111762046813965, "learning_rate": 4.889781634191728e-05, "loss": 0.0478, "num_input_tokens_seen": 24551584, "step": 11365 }, { "epoch": 1.8548123980424145, "grad_norm": 0.45477044582366943, "learning_rate": 4.889572516962006e-05, "loss": 0.0905, "num_input_tokens_seen": 24562400, "step": 11370 }, { "epoch": 1.8556280587275693, "grad_norm": 0.1414952278137207, "learning_rate": 4.889363206021409e-05, "loss": 0.1127, "num_input_tokens_seen": 24572736, "step": 11375 }, { "epoch": 1.8564437194127243, "grad_norm": 2.483764171600342, "learning_rate": 4.889153701386908e-05, "loss": 0.201, "num_input_tokens_seen": 24584416, "step": 11380 }, { "epoch": 1.8572593800978794, "grad_norm": 3.6788392066955566, "learning_rate": 4.888944003075486e-05, "loss": 0.2004, "num_input_tokens_seen": 24593472, "step": 11385 }, { "epoch": 1.8580750407830342, "grad_norm": 0.10282847285270691, "learning_rate": 4.888734111104142e-05, "loss": 0.4256, "num_input_tokens_seen": 24604544, "step": 11390 }, { "epoch": 1.8588907014681892, "grad_norm": 0.22144851088523865, "learning_rate": 4.88852402548989e-05, "loss": 0.1469, "num_input_tokens_seen": 24614784, "step": 11395 }, { "epoch": 1.8597063621533443, "grad_norm": 2.5628433227539062, "learning_rate": 4.8883137462497615e-05, "loss": 0.0804, "num_input_tokens_seen": 24625504, "step": 11400 }, { "epoch": 1.860522022838499, "grad_norm": 2.815721035003662, "learning_rate": 4.8881032734008024e-05, "loss": 0.0992, "num_input_tokens_seen": 24636192, "step": 11405 }, { "epoch": 1.8613376835236541, "grad_norm": 2.4981741905212402, "learning_rate": 4.887892606960075e-05, "loss": 0.1739, "num_input_tokens_seen": 24647200, "step": 11410 }, { "epoch": 1.8621533442088092, "grad_norm": 0.11964482814073563, "learning_rate": 4.887681746944657e-05, "loss": 0.0335, "num_input_tokens_seen": 24656640, "step": 11415 }, { "epoch": 1.862969004893964, "grad_norm": 1.2380677461624146, "learning_rate": 4.8874706933716406e-05, "loss": 0.0191, "num_input_tokens_seen": 24665664, "step": 11420 }, { "epoch": 1.863784665579119, "grad_norm": 0.6048164963722229, "learning_rate": 4.887259446258137e-05, "loss": 0.1681, "num_input_tokens_seen": 24676896, "step": 11425 }, { "epoch": 1.864600326264274, "grad_norm": 0.26376572251319885, "learning_rate": 4.887048005621269e-05, "loss": 0.0464, "num_input_tokens_seen": 24688960, "step": 11430 }, { "epoch": 1.865415986949429, "grad_norm": 2.958484172821045, "learning_rate": 4.886836371478178e-05, "loss": 0.0831, "num_input_tokens_seen": 24699200, "step": 11435 }, { "epoch": 1.8662316476345842, "grad_norm": 3.382356882095337, "learning_rate": 4.8866245438460215e-05, "loss": 0.0957, "num_input_tokens_seen": 24709600, "step": 11440 }, { "epoch": 1.867047308319739, "grad_norm": 6.393452167510986, "learning_rate": 4.886412522741968e-05, "loss": 0.2363, "num_input_tokens_seen": 24719968, "step": 11445 }, { "epoch": 1.867862969004894, "grad_norm": 0.09577061980962753, "learning_rate": 4.886200308183207e-05, "loss": 0.0147, "num_input_tokens_seen": 24731520, "step": 11450 }, { "epoch": 1.868678629690049, "grad_norm": 1.72732412815094, "learning_rate": 4.885987900186943e-05, "loss": 0.0535, "num_input_tokens_seen": 24742880, "step": 11455 }, { "epoch": 1.8694942903752039, "grad_norm": 0.040071967989206314, "learning_rate": 4.8857752987703924e-05, "loss": 0.153, "num_input_tokens_seen": 24753600, "step": 11460 }, { "epoch": 1.870309951060359, "grad_norm": 0.05324089527130127, "learning_rate": 4.8855625039507916e-05, "loss": 0.2349, "num_input_tokens_seen": 24764608, "step": 11465 }, { "epoch": 1.871125611745514, "grad_norm": 5.415706634521484, "learning_rate": 4.8853495157453886e-05, "loss": 0.2286, "num_input_tokens_seen": 24775872, "step": 11470 }, { "epoch": 1.8719412724306688, "grad_norm": 3.6094143390655518, "learning_rate": 4.885136334171452e-05, "loss": 0.1601, "num_input_tokens_seen": 24786208, "step": 11475 }, { "epoch": 1.8727569331158238, "grad_norm": 3.190077304840088, "learning_rate": 4.8849229592462615e-05, "loss": 0.3495, "num_input_tokens_seen": 24795296, "step": 11480 }, { "epoch": 1.8735725938009788, "grad_norm": 0.18625961244106293, "learning_rate": 4.884709390987115e-05, "loss": 0.0388, "num_input_tokens_seen": 24805632, "step": 11485 }, { "epoch": 1.8743882544861337, "grad_norm": 3.2550137042999268, "learning_rate": 4.8844956294113255e-05, "loss": 0.1069, "num_input_tokens_seen": 24815552, "step": 11490 }, { "epoch": 1.8752039151712887, "grad_norm": 0.11549729853868484, "learning_rate": 4.884281674536221e-05, "loss": 0.2082, "num_input_tokens_seen": 24827264, "step": 11495 }, { "epoch": 1.8760195758564437, "grad_norm": 0.13546086847782135, "learning_rate": 4.884067526379147e-05, "loss": 0.1353, "num_input_tokens_seen": 24839168, "step": 11500 }, { "epoch": 1.8768352365415986, "grad_norm": 3.3355772495269775, "learning_rate": 4.8838531849574624e-05, "loss": 0.1486, "num_input_tokens_seen": 24850336, "step": 11505 }, { "epoch": 1.8776508972267538, "grad_norm": 0.2108527272939682, "learning_rate": 4.8836386502885426e-05, "loss": 0.0319, "num_input_tokens_seen": 24861984, "step": 11510 }, { "epoch": 1.8784665579119086, "grad_norm": 1.1564456224441528, "learning_rate": 4.88342392238978e-05, "loss": 0.0966, "num_input_tokens_seen": 24872160, "step": 11515 }, { "epoch": 1.8792822185970635, "grad_norm": 2.4284708499908447, "learning_rate": 4.88320900127858e-05, "loss": 0.2325, "num_input_tokens_seen": 24882272, "step": 11520 }, { "epoch": 1.8800978792822187, "grad_norm": 3.3414292335510254, "learning_rate": 4.882993886972367e-05, "loss": 0.2882, "num_input_tokens_seen": 24892448, "step": 11525 }, { "epoch": 1.8809135399673735, "grad_norm": 0.647790789604187, "learning_rate": 4.882778579488578e-05, "loss": 0.0531, "num_input_tokens_seen": 24902016, "step": 11530 }, { "epoch": 1.8817292006525286, "grad_norm": 4.748507499694824, "learning_rate": 4.882563078844668e-05, "loss": 0.2505, "num_input_tokens_seen": 24913344, "step": 11535 }, { "epoch": 1.8825448613376836, "grad_norm": 0.508827269077301, "learning_rate": 4.882347385058105e-05, "loss": 0.2493, "num_input_tokens_seen": 24923840, "step": 11540 }, { "epoch": 1.8833605220228384, "grad_norm": 0.22358126938343048, "learning_rate": 4.882131498146375e-05, "loss": 0.1672, "num_input_tokens_seen": 24933568, "step": 11545 }, { "epoch": 1.8841761827079935, "grad_norm": 0.16932672262191772, "learning_rate": 4.881915418126979e-05, "loss": 0.0433, "num_input_tokens_seen": 24943520, "step": 11550 }, { "epoch": 1.8849918433931485, "grad_norm": 4.0258870124816895, "learning_rate": 4.8816991450174334e-05, "loss": 0.1707, "num_input_tokens_seen": 24954976, "step": 11555 }, { "epoch": 1.8858075040783033, "grad_norm": 2.1621270179748535, "learning_rate": 4.881482678835271e-05, "loss": 0.0654, "num_input_tokens_seen": 24964384, "step": 11560 }, { "epoch": 1.8866231647634584, "grad_norm": 0.20863114297389984, "learning_rate": 4.881266019598039e-05, "loss": 0.0321, "num_input_tokens_seen": 24975584, "step": 11565 }, { "epoch": 1.8874388254486134, "grad_norm": 5.4353790283203125, "learning_rate": 4.8810491673233006e-05, "loss": 0.1001, "num_input_tokens_seen": 24987520, "step": 11570 }, { "epoch": 1.8882544861337682, "grad_norm": 0.11817409843206406, "learning_rate": 4.880832122028635e-05, "loss": 0.1111, "num_input_tokens_seen": 24999424, "step": 11575 }, { "epoch": 1.8890701468189235, "grad_norm": 4.3622517585754395, "learning_rate": 4.880614883731638e-05, "loss": 0.3256, "num_input_tokens_seen": 25010624, "step": 11580 }, { "epoch": 1.8898858075040783, "grad_norm": 6.816831111907959, "learning_rate": 4.88039745244992e-05, "loss": 0.3157, "num_input_tokens_seen": 25021696, "step": 11585 }, { "epoch": 1.8907014681892331, "grad_norm": 1.683463215827942, "learning_rate": 4.880179828201106e-05, "loss": 0.1249, "num_input_tokens_seen": 25032640, "step": 11590 }, { "epoch": 1.8915171288743884, "grad_norm": 0.07192590832710266, "learning_rate": 4.8799620110028375e-05, "loss": 0.2552, "num_input_tokens_seen": 25043744, "step": 11595 }, { "epoch": 1.8923327895595432, "grad_norm": 3.1122612953186035, "learning_rate": 4.879744000872774e-05, "loss": 0.3787, "num_input_tokens_seen": 25055136, "step": 11600 }, { "epoch": 1.8931484502446982, "grad_norm": 0.08800233155488968, "learning_rate": 4.879525797828585e-05, "loss": 0.126, "num_input_tokens_seen": 25066528, "step": 11605 }, { "epoch": 1.8939641109298533, "grad_norm": 0.6428315043449402, "learning_rate": 4.879307401887963e-05, "loss": 0.0265, "num_input_tokens_seen": 25077280, "step": 11610 }, { "epoch": 1.894779771615008, "grad_norm": 3.7041077613830566, "learning_rate": 4.87908881306861e-05, "loss": 0.1289, "num_input_tokens_seen": 25088544, "step": 11615 }, { "epoch": 1.8955954323001631, "grad_norm": 0.13210909068584442, "learning_rate": 4.878870031388246e-05, "loss": 0.0186, "num_input_tokens_seen": 25097344, "step": 11620 }, { "epoch": 1.8964110929853182, "grad_norm": 3.7125773429870605, "learning_rate": 4.8786510568646074e-05, "loss": 0.2127, "num_input_tokens_seen": 25108736, "step": 11625 }, { "epoch": 1.897226753670473, "grad_norm": 4.024951934814453, "learning_rate": 4.878431889515445e-05, "loss": 0.1576, "num_input_tokens_seen": 25119648, "step": 11630 }, { "epoch": 1.898042414355628, "grad_norm": 0.28142425417900085, "learning_rate": 4.8782125293585255e-05, "loss": 0.0438, "num_input_tokens_seen": 25130048, "step": 11635 }, { "epoch": 1.898858075040783, "grad_norm": 2.7388367652893066, "learning_rate": 4.877992976411632e-05, "loss": 0.1148, "num_input_tokens_seen": 25140032, "step": 11640 }, { "epoch": 1.899673735725938, "grad_norm": 0.16469921171665192, "learning_rate": 4.8777732306925614e-05, "loss": 0.2104, "num_input_tokens_seen": 25151136, "step": 11645 }, { "epoch": 1.900489396411093, "grad_norm": 0.21354708075523376, "learning_rate": 4.877553292219128e-05, "loss": 0.1238, "num_input_tokens_seen": 25161408, "step": 11650 }, { "epoch": 1.901305057096248, "grad_norm": 2.321537494659424, "learning_rate": 4.877333161009161e-05, "loss": 0.1364, "num_input_tokens_seen": 25172224, "step": 11655 }, { "epoch": 1.9021207177814028, "grad_norm": 1.2828636169433594, "learning_rate": 4.8771128370805066e-05, "loss": 0.2614, "num_input_tokens_seen": 25183424, "step": 11660 }, { "epoch": 1.902936378466558, "grad_norm": 1.7964338064193726, "learning_rate": 4.876892320451023e-05, "loss": 0.1097, "num_input_tokens_seen": 25194944, "step": 11665 }, { "epoch": 1.9037520391517129, "grad_norm": 1.1999586820602417, "learning_rate": 4.876671611138588e-05, "loss": 0.1334, "num_input_tokens_seen": 25205728, "step": 11670 }, { "epoch": 1.904567699836868, "grad_norm": 0.12653008103370667, "learning_rate": 4.876450709161093e-05, "loss": 0.1627, "num_input_tokens_seen": 25216576, "step": 11675 }, { "epoch": 1.905383360522023, "grad_norm": 0.07031231373548508, "learning_rate": 4.876229614536446e-05, "loss": 0.0095, "num_input_tokens_seen": 25227616, "step": 11680 }, { "epoch": 1.9061990212071778, "grad_norm": 0.8870084285736084, "learning_rate": 4.8760083272825695e-05, "loss": 0.1272, "num_input_tokens_seen": 25237856, "step": 11685 }, { "epoch": 1.9070146818923328, "grad_norm": 7.386667251586914, "learning_rate": 4.875786847417402e-05, "loss": 0.2315, "num_input_tokens_seen": 25248256, "step": 11690 }, { "epoch": 1.9078303425774878, "grad_norm": 0.2217559665441513, "learning_rate": 4.875565174958898e-05, "loss": 0.0843, "num_input_tokens_seen": 25258944, "step": 11695 }, { "epoch": 1.9086460032626427, "grad_norm": 0.147831529378891, "learning_rate": 4.8753433099250276e-05, "loss": 0.1228, "num_input_tokens_seen": 25269120, "step": 11700 }, { "epoch": 1.9094616639477977, "grad_norm": 1.9237598180770874, "learning_rate": 4.875121252333776e-05, "loss": 0.1464, "num_input_tokens_seen": 25280160, "step": 11705 }, { "epoch": 1.9102773246329527, "grad_norm": 4.616643905639648, "learning_rate": 4.874899002203145e-05, "loss": 0.2244, "num_input_tokens_seen": 25290112, "step": 11710 }, { "epoch": 1.9110929853181076, "grad_norm": 0.8603242039680481, "learning_rate": 4.8746765595511504e-05, "loss": 0.0974, "num_input_tokens_seen": 25301664, "step": 11715 }, { "epoch": 1.9119086460032626, "grad_norm": 2.7926738262176514, "learning_rate": 4.874453924395824e-05, "loss": 0.2612, "num_input_tokens_seen": 25311264, "step": 11720 }, { "epoch": 1.9127243066884176, "grad_norm": 0.6210606694221497, "learning_rate": 4.874231096755216e-05, "loss": 0.0201, "num_input_tokens_seen": 25321984, "step": 11725 }, { "epoch": 1.9135399673735725, "grad_norm": 0.3121470510959625, "learning_rate": 4.8740080766473876e-05, "loss": 0.0963, "num_input_tokens_seen": 25333664, "step": 11730 }, { "epoch": 1.9143556280587277, "grad_norm": 0.15780290961265564, "learning_rate": 4.873784864090419e-05, "loss": 0.1531, "num_input_tokens_seen": 25342944, "step": 11735 }, { "epoch": 1.9151712887438825, "grad_norm": 2.160059928894043, "learning_rate": 4.873561459102406e-05, "loss": 0.1624, "num_input_tokens_seen": 25353568, "step": 11740 }, { "epoch": 1.9159869494290374, "grad_norm": 0.15781857073307037, "learning_rate": 4.873337861701456e-05, "loss": 0.1003, "num_input_tokens_seen": 25364736, "step": 11745 }, { "epoch": 1.9168026101141926, "grad_norm": 0.0535578653216362, "learning_rate": 4.8731140719056977e-05, "loss": 0.0627, "num_input_tokens_seen": 25374528, "step": 11750 }, { "epoch": 1.9176182707993474, "grad_norm": 0.15755146741867065, "learning_rate": 4.872890089733272e-05, "loss": 0.1495, "num_input_tokens_seen": 25385440, "step": 11755 }, { "epoch": 1.9184339314845025, "grad_norm": 9.134349822998047, "learning_rate": 4.8726659152023356e-05, "loss": 0.3055, "num_input_tokens_seen": 25397344, "step": 11760 }, { "epoch": 1.9192495921696575, "grad_norm": 0.09816361218690872, "learning_rate": 4.87244154833106e-05, "loss": 0.0622, "num_input_tokens_seen": 25408448, "step": 11765 }, { "epoch": 1.9200652528548123, "grad_norm": 1.739254117012024, "learning_rate": 4.872216989137637e-05, "loss": 0.2179, "num_input_tokens_seen": 25418336, "step": 11770 }, { "epoch": 1.9208809135399674, "grad_norm": 0.8327562808990479, "learning_rate": 4.871992237640267e-05, "loss": 0.1653, "num_input_tokens_seen": 25429824, "step": 11775 }, { "epoch": 1.9216965742251224, "grad_norm": 0.17584045231342316, "learning_rate": 4.871767293857171e-05, "loss": 0.2071, "num_input_tokens_seen": 25439584, "step": 11780 }, { "epoch": 1.9225122349102772, "grad_norm": 3.230736017227173, "learning_rate": 4.871542157806584e-05, "loss": 0.1719, "num_input_tokens_seen": 25451744, "step": 11785 }, { "epoch": 1.9233278955954323, "grad_norm": 1.1818323135375977, "learning_rate": 4.871316829506757e-05, "loss": 0.0281, "num_input_tokens_seen": 25461856, "step": 11790 }, { "epoch": 1.9241435562805873, "grad_norm": 5.720760345458984, "learning_rate": 4.871091308975955e-05, "loss": 0.3042, "num_input_tokens_seen": 25473120, "step": 11795 }, { "epoch": 1.9249592169657421, "grad_norm": 3.1483027935028076, "learning_rate": 4.8708655962324615e-05, "loss": 0.0345, "num_input_tokens_seen": 25485632, "step": 11800 }, { "epoch": 1.9257748776508974, "grad_norm": 0.8307350873947144, "learning_rate": 4.870639691294573e-05, "loss": 0.0248, "num_input_tokens_seen": 25496160, "step": 11805 }, { "epoch": 1.9265905383360522, "grad_norm": 2.910285711288452, "learning_rate": 4.8704135941806016e-05, "loss": 0.135, "num_input_tokens_seen": 25507712, "step": 11810 }, { "epoch": 1.927406199021207, "grad_norm": 7.951888561248779, "learning_rate": 4.870187304908878e-05, "loss": 0.3368, "num_input_tokens_seen": 25519328, "step": 11815 }, { "epoch": 1.9282218597063623, "grad_norm": 0.47642260789871216, "learning_rate": 4.869960823497745e-05, "loss": 0.019, "num_input_tokens_seen": 25528992, "step": 11820 }, { "epoch": 1.929037520391517, "grad_norm": 0.1696988046169281, "learning_rate": 4.8697341499655626e-05, "loss": 0.1483, "num_input_tokens_seen": 25540032, "step": 11825 }, { "epoch": 1.9298531810766721, "grad_norm": 9.308799743652344, "learning_rate": 4.8695072843307064e-05, "loss": 0.1182, "num_input_tokens_seen": 25551392, "step": 11830 }, { "epoch": 1.9306688417618272, "grad_norm": 5.08847713470459, "learning_rate": 4.869280226611567e-05, "loss": 0.1406, "num_input_tokens_seen": 25561856, "step": 11835 }, { "epoch": 1.931484502446982, "grad_norm": 4.5817718505859375, "learning_rate": 4.86905297682655e-05, "loss": 0.2429, "num_input_tokens_seen": 25571712, "step": 11840 }, { "epoch": 1.932300163132137, "grad_norm": 0.09621615707874298, "learning_rate": 4.868825534994078e-05, "loss": 0.0559, "num_input_tokens_seen": 25581088, "step": 11845 }, { "epoch": 1.933115823817292, "grad_norm": 0.06166497990489006, "learning_rate": 4.86859790113259e-05, "loss": 0.0089, "num_input_tokens_seen": 25593376, "step": 11850 }, { "epoch": 1.933931484502447, "grad_norm": 4.5079169273376465, "learning_rate": 4.868370075260538e-05, "loss": 0.2731, "num_input_tokens_seen": 25603488, "step": 11855 }, { "epoch": 1.934747145187602, "grad_norm": 3.5476584434509277, "learning_rate": 4.86814205739639e-05, "loss": 0.3042, "num_input_tokens_seen": 25613216, "step": 11860 }, { "epoch": 1.935562805872757, "grad_norm": 1.5228770971298218, "learning_rate": 4.86791384755863e-05, "loss": 0.0359, "num_input_tokens_seen": 25624096, "step": 11865 }, { "epoch": 1.9363784665579118, "grad_norm": 0.13201339542865753, "learning_rate": 4.86768544576576e-05, "loss": 0.1756, "num_input_tokens_seen": 25633760, "step": 11870 }, { "epoch": 1.9371941272430668, "grad_norm": 0.7958288788795471, "learning_rate": 4.867456852036295e-05, "loss": 0.1408, "num_input_tokens_seen": 25645216, "step": 11875 }, { "epoch": 1.9380097879282219, "grad_norm": 0.15591788291931152, "learning_rate": 4.867228066388765e-05, "loss": 0.2832, "num_input_tokens_seen": 25656288, "step": 11880 }, { "epoch": 1.9388254486133767, "grad_norm": 0.15843915939331055, "learning_rate": 4.866999088841716e-05, "loss": 0.2027, "num_input_tokens_seen": 25667488, "step": 11885 }, { "epoch": 1.939641109298532, "grad_norm": 3.267289400100708, "learning_rate": 4.866769919413711e-05, "loss": 0.1275, "num_input_tokens_seen": 25678848, "step": 11890 }, { "epoch": 1.9404567699836868, "grad_norm": 0.848806619644165, "learning_rate": 4.866540558123328e-05, "loss": 0.0825, "num_input_tokens_seen": 25689280, "step": 11895 }, { "epoch": 1.9412724306688418, "grad_norm": 3.1739892959594727, "learning_rate": 4.8663110049891595e-05, "loss": 0.1432, "num_input_tokens_seen": 25699552, "step": 11900 }, { "epoch": 1.9420880913539968, "grad_norm": 0.08247596770524979, "learning_rate": 4.866081260029813e-05, "loss": 0.1974, "num_input_tokens_seen": 25709920, "step": 11905 }, { "epoch": 1.9429037520391517, "grad_norm": 4.789215564727783, "learning_rate": 4.8658513232639155e-05, "loss": 0.2647, "num_input_tokens_seen": 25722592, "step": 11910 }, { "epoch": 1.9437194127243067, "grad_norm": 0.2024151086807251, "learning_rate": 4.8656211947101054e-05, "loss": 0.0144, "num_input_tokens_seen": 25733696, "step": 11915 }, { "epoch": 1.9445350734094617, "grad_norm": 2.598015546798706, "learning_rate": 4.865390874387038e-05, "loss": 0.1755, "num_input_tokens_seen": 25743712, "step": 11920 }, { "epoch": 1.9453507340946166, "grad_norm": 0.09131888300180435, "learning_rate": 4.865160362313384e-05, "loss": 0.2475, "num_input_tokens_seen": 25753952, "step": 11925 }, { "epoch": 1.9461663947797716, "grad_norm": 0.42168277502059937, "learning_rate": 4.8649296585078316e-05, "loss": 0.1209, "num_input_tokens_seen": 25764896, "step": 11930 }, { "epoch": 1.9469820554649266, "grad_norm": 1.9647661447525024, "learning_rate": 4.864698762989081e-05, "loss": 0.12, "num_input_tokens_seen": 25774944, "step": 11935 }, { "epoch": 1.9477977161500815, "grad_norm": 0.47306153178215027, "learning_rate": 4.86446767577585e-05, "loss": 0.0901, "num_input_tokens_seen": 25785472, "step": 11940 }, { "epoch": 1.9486133768352365, "grad_norm": 1.2947392463684082, "learning_rate": 4.864236396886872e-05, "loss": 0.1879, "num_input_tokens_seen": 25796448, "step": 11945 }, { "epoch": 1.9494290375203915, "grad_norm": 0.14454756677150726, "learning_rate": 4.864004926340896e-05, "loss": 0.061, "num_input_tokens_seen": 25807168, "step": 11950 }, { "epoch": 1.9502446982055464, "grad_norm": 7.566388130187988, "learning_rate": 4.8637732641566855e-05, "loss": 0.1213, "num_input_tokens_seen": 25817792, "step": 11955 }, { "epoch": 1.9510603588907016, "grad_norm": 1.5212756395339966, "learning_rate": 4.8635414103530205e-05, "loss": 0.228, "num_input_tokens_seen": 25827392, "step": 11960 }, { "epoch": 1.9518760195758564, "grad_norm": 0.12417714297771454, "learning_rate": 4.863309364948697e-05, "loss": 0.0169, "num_input_tokens_seen": 25838720, "step": 11965 }, { "epoch": 1.9526916802610113, "grad_norm": 2.240032911300659, "learning_rate": 4.863077127962524e-05, "loss": 0.0409, "num_input_tokens_seen": 25850496, "step": 11970 }, { "epoch": 1.9535073409461665, "grad_norm": 0.1251329779624939, "learning_rate": 4.8628446994133306e-05, "loss": 0.1351, "num_input_tokens_seen": 25862240, "step": 11975 }, { "epoch": 1.9543230016313213, "grad_norm": 1.1251899003982544, "learning_rate": 4.8626120793199545e-05, "loss": 0.2288, "num_input_tokens_seen": 25873280, "step": 11980 }, { "epoch": 1.9551386623164764, "grad_norm": 0.14849115908145905, "learning_rate": 4.862379267701257e-05, "loss": 0.0192, "num_input_tokens_seen": 25883872, "step": 11985 }, { "epoch": 1.9559543230016314, "grad_norm": 0.17326806485652924, "learning_rate": 4.86214626457611e-05, "loss": 0.0124, "num_input_tokens_seen": 25895424, "step": 11990 }, { "epoch": 1.9567699836867862, "grad_norm": 5.171052932739258, "learning_rate": 4.8619130699633994e-05, "loss": 0.3317, "num_input_tokens_seen": 25906944, "step": 11995 }, { "epoch": 1.9575856443719413, "grad_norm": 3.372026205062866, "learning_rate": 4.861679683882033e-05, "loss": 0.1096, "num_input_tokens_seen": 25918720, "step": 12000 }, { "epoch": 1.9584013050570963, "grad_norm": 3.6195554733276367, "learning_rate": 4.861446106350928e-05, "loss": 0.068, "num_input_tokens_seen": 25929248, "step": 12005 }, { "epoch": 1.9592169657422511, "grad_norm": 5.8719635009765625, "learning_rate": 4.861212337389019e-05, "loss": 0.259, "num_input_tokens_seen": 25939040, "step": 12010 }, { "epoch": 1.9600326264274062, "grad_norm": 0.049886275082826614, "learning_rate": 4.8609783770152575e-05, "loss": 0.2155, "num_input_tokens_seen": 25949024, "step": 12015 }, { "epoch": 1.9608482871125612, "grad_norm": 0.07218191027641296, "learning_rate": 4.8607442252486095e-05, "loss": 0.2368, "num_input_tokens_seen": 25959264, "step": 12020 }, { "epoch": 1.961663947797716, "grad_norm": 0.17256250977516174, "learning_rate": 4.8605098821080564e-05, "loss": 0.0527, "num_input_tokens_seen": 25970432, "step": 12025 }, { "epoch": 1.9624796084828713, "grad_norm": 12.043556213378906, "learning_rate": 4.8602753476125954e-05, "loss": 0.2184, "num_input_tokens_seen": 25981856, "step": 12030 }, { "epoch": 1.963295269168026, "grad_norm": 0.8876869678497314, "learning_rate": 4.860040621781238e-05, "loss": 0.2989, "num_input_tokens_seen": 25993696, "step": 12035 }, { "epoch": 1.964110929853181, "grad_norm": 3.2724945545196533, "learning_rate": 4.8598057046330135e-05, "loss": 0.1858, "num_input_tokens_seen": 26004736, "step": 12040 }, { "epoch": 1.9649265905383362, "grad_norm": 0.14167501032352448, "learning_rate": 4.8595705961869656e-05, "loss": 0.1371, "num_input_tokens_seen": 26016480, "step": 12045 }, { "epoch": 1.965742251223491, "grad_norm": 0.7142931818962097, "learning_rate": 4.859335296462152e-05, "loss": 0.0118, "num_input_tokens_seen": 26027072, "step": 12050 }, { "epoch": 1.966557911908646, "grad_norm": 1.256037712097168, "learning_rate": 4.859099805477648e-05, "loss": 0.1166, "num_input_tokens_seen": 26036096, "step": 12055 }, { "epoch": 1.967373572593801, "grad_norm": 0.32416561245918274, "learning_rate": 4.858864123252544e-05, "loss": 0.1108, "num_input_tokens_seen": 26047264, "step": 12060 }, { "epoch": 1.968189233278956, "grad_norm": 0.6069583892822266, "learning_rate": 4.8586282498059456e-05, "loss": 0.1216, "num_input_tokens_seen": 26058176, "step": 12065 }, { "epoch": 1.969004893964111, "grad_norm": 0.22854197025299072, "learning_rate": 4.8583921851569735e-05, "loss": 0.0635, "num_input_tokens_seen": 26069664, "step": 12070 }, { "epoch": 1.969820554649266, "grad_norm": 0.08224233984947205, "learning_rate": 4.8581559293247655e-05, "loss": 0.1001, "num_input_tokens_seen": 26080640, "step": 12075 }, { "epoch": 1.9706362153344208, "grad_norm": 0.8420477509498596, "learning_rate": 4.857919482328471e-05, "loss": 0.0428, "num_input_tokens_seen": 26091360, "step": 12080 }, { "epoch": 1.9714518760195758, "grad_norm": 0.20236536860466003, "learning_rate": 4.857682844187261e-05, "loss": 0.0353, "num_input_tokens_seen": 26102080, "step": 12085 }, { "epoch": 1.9722675367047309, "grad_norm": 8.648172378540039, "learning_rate": 4.857446014920316e-05, "loss": 0.3215, "num_input_tokens_seen": 26112192, "step": 12090 }, { "epoch": 1.9730831973898857, "grad_norm": 4.223199844360352, "learning_rate": 4.857208994546836e-05, "loss": 0.18, "num_input_tokens_seen": 26122496, "step": 12095 }, { "epoch": 1.9738988580750407, "grad_norm": 6.034249782562256, "learning_rate": 4.856971783086034e-05, "loss": 0.086, "num_input_tokens_seen": 26134048, "step": 12100 }, { "epoch": 1.9747145187601958, "grad_norm": 0.1975557953119278, "learning_rate": 4.85673438055714e-05, "loss": 0.2566, "num_input_tokens_seen": 26144192, "step": 12105 }, { "epoch": 1.9755301794453506, "grad_norm": 0.14109517633914948, "learning_rate": 4.856496786979399e-05, "loss": 0.1572, "num_input_tokens_seen": 26155328, "step": 12110 }, { "epoch": 1.9763458401305058, "grad_norm": 0.08842913806438446, "learning_rate": 4.8562590023720725e-05, "loss": 0.1905, "num_input_tokens_seen": 26166272, "step": 12115 }, { "epoch": 1.9771615008156607, "grad_norm": 2.3355515003204346, "learning_rate": 4.8560210267544345e-05, "loss": 0.2171, "num_input_tokens_seen": 26175520, "step": 12120 }, { "epoch": 1.9779771615008157, "grad_norm": 4.464560508728027, "learning_rate": 4.855782860145779e-05, "loss": 0.4179, "num_input_tokens_seen": 26187008, "step": 12125 }, { "epoch": 1.9787928221859707, "grad_norm": 3.1640465259552, "learning_rate": 4.8555445025654116e-05, "loss": 0.132, "num_input_tokens_seen": 26198016, "step": 12130 }, { "epoch": 1.9796084828711256, "grad_norm": 0.2288096845149994, "learning_rate": 4.855305954032655e-05, "loss": 0.199, "num_input_tokens_seen": 26209440, "step": 12135 }, { "epoch": 1.9804241435562806, "grad_norm": 0.7268104553222656, "learning_rate": 4.855067214566846e-05, "loss": 0.0869, "num_input_tokens_seen": 26220736, "step": 12140 }, { "epoch": 1.9812398042414356, "grad_norm": 2.697340250015259, "learning_rate": 4.85482828418734e-05, "loss": 0.0684, "num_input_tokens_seen": 26231456, "step": 12145 }, { "epoch": 1.9820554649265905, "grad_norm": 0.1778566837310791, "learning_rate": 4.854589162913505e-05, "loss": 0.0792, "num_input_tokens_seen": 26241568, "step": 12150 }, { "epoch": 1.9828711256117455, "grad_norm": 2.7179176807403564, "learning_rate": 4.854349850764725e-05, "loss": 0.2748, "num_input_tokens_seen": 26252480, "step": 12155 }, { "epoch": 1.9836867862969005, "grad_norm": 0.2689560651779175, "learning_rate": 4.8541103477604e-05, "loss": 0.0481, "num_input_tokens_seen": 26263264, "step": 12160 }, { "epoch": 1.9845024469820554, "grad_norm": 4.350931167602539, "learning_rate": 4.853870653919946e-05, "loss": 0.1086, "num_input_tokens_seen": 26274496, "step": 12165 }, { "epoch": 1.9853181076672104, "grad_norm": 4.802256107330322, "learning_rate": 4.853630769262794e-05, "loss": 0.1675, "num_input_tokens_seen": 26285152, "step": 12170 }, { "epoch": 1.9861337683523654, "grad_norm": 3.5757229328155518, "learning_rate": 4.853390693808388e-05, "loss": 0.2827, "num_input_tokens_seen": 26296064, "step": 12175 }, { "epoch": 1.9869494290375203, "grad_norm": 3.319054365158081, "learning_rate": 4.853150427576193e-05, "loss": 0.1783, "num_input_tokens_seen": 26305984, "step": 12180 }, { "epoch": 1.9877650897226755, "grad_norm": 0.572554886341095, "learning_rate": 4.852909970585684e-05, "loss": 0.1024, "num_input_tokens_seen": 26316896, "step": 12185 }, { "epoch": 1.9885807504078303, "grad_norm": 0.4888591468334198, "learning_rate": 4.852669322856354e-05, "loss": 0.1044, "num_input_tokens_seen": 26326976, "step": 12190 }, { "epoch": 1.9893964110929854, "grad_norm": 2.590435266494751, "learning_rate": 4.8524284844077116e-05, "loss": 0.2301, "num_input_tokens_seen": 26337504, "step": 12195 }, { "epoch": 1.9902120717781404, "grad_norm": 2.9432034492492676, "learning_rate": 4.8521874552592805e-05, "loss": 0.1644, "num_input_tokens_seen": 26348608, "step": 12200 }, { "epoch": 1.9910277324632952, "grad_norm": 6.0449066162109375, "learning_rate": 4.851946235430599e-05, "loss": 0.2743, "num_input_tokens_seen": 26358240, "step": 12205 }, { "epoch": 1.9918433931484503, "grad_norm": 0.1594952940940857, "learning_rate": 4.851704824941222e-05, "loss": 0.0722, "num_input_tokens_seen": 26369568, "step": 12210 }, { "epoch": 1.9926590538336053, "grad_norm": 2.65429949760437, "learning_rate": 4.8514632238107194e-05, "loss": 0.1383, "num_input_tokens_seen": 26379936, "step": 12215 }, { "epoch": 1.9934747145187601, "grad_norm": 1.7988312244415283, "learning_rate": 4.851221432058677e-05, "loss": 0.124, "num_input_tokens_seen": 26390624, "step": 12220 }, { "epoch": 1.9942903752039152, "grad_norm": 0.48065802454948425, "learning_rate": 4.850979449704695e-05, "loss": 0.1274, "num_input_tokens_seen": 26399616, "step": 12225 }, { "epoch": 1.9951060358890702, "grad_norm": 0.7311342358589172, "learning_rate": 4.85073727676839e-05, "loss": 0.0981, "num_input_tokens_seen": 26410208, "step": 12230 }, { "epoch": 1.995921696574225, "grad_norm": 0.2876925468444824, "learning_rate": 4.8504949132693936e-05, "loss": 0.0782, "num_input_tokens_seen": 26420160, "step": 12235 }, { "epoch": 1.99673735725938, "grad_norm": 0.24604609608650208, "learning_rate": 4.850252359227353e-05, "loss": 0.0747, "num_input_tokens_seen": 26430432, "step": 12240 }, { "epoch": 1.997553017944535, "grad_norm": 0.2904493510723114, "learning_rate": 4.8500096146619325e-05, "loss": 0.1003, "num_input_tokens_seen": 26440128, "step": 12245 }, { "epoch": 1.99836867862969, "grad_norm": 4.259380340576172, "learning_rate": 4.849766679592808e-05, "loss": 0.094, "num_input_tokens_seen": 26450496, "step": 12250 }, { "epoch": 1.9991843393148452, "grad_norm": 2.26529598236084, "learning_rate": 4.849523554039673e-05, "loss": 0.126, "num_input_tokens_seen": 26461824, "step": 12255 }, { "epoch": 2.0, "grad_norm": 1.0837936401367188, "learning_rate": 4.8492802380222393e-05, "loss": 0.1146, "num_input_tokens_seen": 26471216, "step": 12260 }, { "epoch": 2.0, "eval_loss": 0.14923527836799622, "eval_runtime": 132.9831, "eval_samples_per_second": 20.491, "eval_steps_per_second": 5.128, "num_input_tokens_seen": 26471216, "step": 12260 }, { "epoch": 2.000815660685155, "grad_norm": 3.045311689376831, "learning_rate": 4.849036731560228e-05, "loss": 0.1005, "num_input_tokens_seen": 26481904, "step": 12265 }, { "epoch": 2.00163132137031, "grad_norm": 3.7052078247070312, "learning_rate": 4.84879303467338e-05, "loss": 0.2073, "num_input_tokens_seen": 26491344, "step": 12270 }, { "epoch": 2.002446982055465, "grad_norm": 0.19312560558319092, "learning_rate": 4.8485491473814514e-05, "loss": 0.0304, "num_input_tokens_seen": 26502160, "step": 12275 }, { "epoch": 2.0032626427406197, "grad_norm": 2.924866199493408, "learning_rate": 4.8483050697042135e-05, "loss": 0.076, "num_input_tokens_seen": 26512816, "step": 12280 }, { "epoch": 2.004078303425775, "grad_norm": 0.06528328359127045, "learning_rate": 4.8480608016614504e-05, "loss": 0.0106, "num_input_tokens_seen": 26522640, "step": 12285 }, { "epoch": 2.00489396411093, "grad_norm": 0.07696110010147095, "learning_rate": 4.847816343272965e-05, "loss": 0.2894, "num_input_tokens_seen": 26533072, "step": 12290 }, { "epoch": 2.0057096247960846, "grad_norm": 0.037772562354803085, "learning_rate": 4.847571694558574e-05, "loss": 0.1201, "num_input_tokens_seen": 26543408, "step": 12295 }, { "epoch": 2.00652528548124, "grad_norm": 0.08915374428033829, "learning_rate": 4.84732685553811e-05, "loss": 0.2462, "num_input_tokens_seen": 26554320, "step": 12300 }, { "epoch": 2.0073409461663947, "grad_norm": 0.136813685297966, "learning_rate": 4.847081826231421e-05, "loss": 0.1828, "num_input_tokens_seen": 26565008, "step": 12305 }, { "epoch": 2.00815660685155, "grad_norm": 0.06767254322767258, "learning_rate": 4.846836606658371e-05, "loss": 0.0326, "num_input_tokens_seen": 26576912, "step": 12310 }, { "epoch": 2.0089722675367048, "grad_norm": 3.284564971923828, "learning_rate": 4.8465911968388364e-05, "loss": 0.4687, "num_input_tokens_seen": 26586800, "step": 12315 }, { "epoch": 2.0097879282218596, "grad_norm": 1.2693392038345337, "learning_rate": 4.846345596792713e-05, "loss": 0.0776, "num_input_tokens_seen": 26598160, "step": 12320 }, { "epoch": 2.010603588907015, "grad_norm": 2.8209829330444336, "learning_rate": 4.846099806539911e-05, "loss": 0.1851, "num_input_tokens_seen": 26609136, "step": 12325 }, { "epoch": 2.0114192495921697, "grad_norm": 0.29326483607292175, "learning_rate": 4.845853826100355e-05, "loss": 0.0905, "num_input_tokens_seen": 26620752, "step": 12330 }, { "epoch": 2.0122349102773245, "grad_norm": 5.118554592132568, "learning_rate": 4.845607655493984e-05, "loss": 0.2262, "num_input_tokens_seen": 26630256, "step": 12335 }, { "epoch": 2.0130505709624797, "grad_norm": 3.600283145904541, "learning_rate": 4.8453612947407564e-05, "loss": 0.3009, "num_input_tokens_seen": 26641136, "step": 12340 }, { "epoch": 2.0138662316476346, "grad_norm": 0.3470727205276489, "learning_rate": 4.8451147438606416e-05, "loss": 0.0599, "num_input_tokens_seen": 26651824, "step": 12345 }, { "epoch": 2.0146818923327894, "grad_norm": 6.615456581115723, "learning_rate": 4.844868002873626e-05, "loss": 0.2983, "num_input_tokens_seen": 26662064, "step": 12350 }, { "epoch": 2.0154975530179446, "grad_norm": 0.5639845132827759, "learning_rate": 4.844621071799712e-05, "loss": 0.1157, "num_input_tokens_seen": 26673104, "step": 12355 }, { "epoch": 2.0163132137030995, "grad_norm": 2.5894696712493896, "learning_rate": 4.844373950658918e-05, "loss": 0.2547, "num_input_tokens_seen": 26682736, "step": 12360 }, { "epoch": 2.0171288743882543, "grad_norm": 0.23206761479377747, "learning_rate": 4.844126639471277e-05, "loss": 0.1492, "num_input_tokens_seen": 26694224, "step": 12365 }, { "epoch": 2.0179445350734095, "grad_norm": 0.5956500172615051, "learning_rate": 4.843879138256836e-05, "loss": 0.0564, "num_input_tokens_seen": 26704880, "step": 12370 }, { "epoch": 2.0187601957585644, "grad_norm": 1.3979803323745728, "learning_rate": 4.843631447035659e-05, "loss": 0.1814, "num_input_tokens_seen": 26713648, "step": 12375 }, { "epoch": 2.0195758564437196, "grad_norm": 0.1042853444814682, "learning_rate": 4.843383565827826e-05, "loss": 0.0313, "num_input_tokens_seen": 26724560, "step": 12380 }, { "epoch": 2.0203915171288744, "grad_norm": 4.647457599639893, "learning_rate": 4.843135494653431e-05, "loss": 0.1446, "num_input_tokens_seen": 26735344, "step": 12385 }, { "epoch": 2.0212071778140293, "grad_norm": 2.719658851623535, "learning_rate": 4.842887233532584e-05, "loss": 0.1123, "num_input_tokens_seen": 26746288, "step": 12390 }, { "epoch": 2.0220228384991845, "grad_norm": 2.9892961978912354, "learning_rate": 4.842638782485409e-05, "loss": 0.1384, "num_input_tokens_seen": 26757936, "step": 12395 }, { "epoch": 2.0228384991843393, "grad_norm": 2.41135835647583, "learning_rate": 4.8423901415320486e-05, "loss": 0.1465, "num_input_tokens_seen": 26770256, "step": 12400 }, { "epoch": 2.023654159869494, "grad_norm": 3.05769681930542, "learning_rate": 4.8421413106926586e-05, "loss": 0.0795, "num_input_tokens_seen": 26780240, "step": 12405 }, { "epoch": 2.0244698205546494, "grad_norm": 0.4137382209300995, "learning_rate": 4.841892289987409e-05, "loss": 0.1214, "num_input_tokens_seen": 26791824, "step": 12410 }, { "epoch": 2.0252854812398042, "grad_norm": 0.10253965109586716, "learning_rate": 4.841643079436489e-05, "loss": 0.1238, "num_input_tokens_seen": 26802576, "step": 12415 }, { "epoch": 2.026101141924959, "grad_norm": 0.252849817276001, "learning_rate": 4.841393679060099e-05, "loss": 0.1483, "num_input_tokens_seen": 26814256, "step": 12420 }, { "epoch": 2.0269168026101143, "grad_norm": 6.598630428314209, "learning_rate": 4.841144088878457e-05, "loss": 0.3621, "num_input_tokens_seen": 26824528, "step": 12425 }, { "epoch": 2.027732463295269, "grad_norm": 3.748215913772583, "learning_rate": 4.8408943089117964e-05, "loss": 0.0528, "num_input_tokens_seen": 26836816, "step": 12430 }, { "epoch": 2.028548123980424, "grad_norm": 1.9309725761413574, "learning_rate": 4.840644339180366e-05, "loss": 0.1367, "num_input_tokens_seen": 26846800, "step": 12435 }, { "epoch": 2.029363784665579, "grad_norm": 1.057390570640564, "learning_rate": 4.8403941797044286e-05, "loss": 0.1011, "num_input_tokens_seen": 26858288, "step": 12440 }, { "epoch": 2.030179445350734, "grad_norm": 0.6322423219680786, "learning_rate": 4.840143830504264e-05, "loss": 0.0339, "num_input_tokens_seen": 26868624, "step": 12445 }, { "epoch": 2.0309951060358893, "grad_norm": 0.3212203085422516, "learning_rate": 4.839893291600167e-05, "loss": 0.1978, "num_input_tokens_seen": 26879536, "step": 12450 }, { "epoch": 2.031810766721044, "grad_norm": 1.5423448085784912, "learning_rate": 4.839642563012447e-05, "loss": 0.4648, "num_input_tokens_seen": 26890320, "step": 12455 }, { "epoch": 2.032626427406199, "grad_norm": 0.2827596366405487, "learning_rate": 4.83939164476143e-05, "loss": 0.1585, "num_input_tokens_seen": 26900784, "step": 12460 }, { "epoch": 2.033442088091354, "grad_norm": 0.22221170365810394, "learning_rate": 4.839140536867456e-05, "loss": 0.0696, "num_input_tokens_seen": 26912944, "step": 12465 }, { "epoch": 2.034257748776509, "grad_norm": 0.13560988008975983, "learning_rate": 4.838889239350881e-05, "loss": 0.1641, "num_input_tokens_seen": 26923792, "step": 12470 }, { "epoch": 2.035073409461664, "grad_norm": 0.320088267326355, "learning_rate": 4.838637752232078e-05, "loss": 0.0278, "num_input_tokens_seen": 26934736, "step": 12475 }, { "epoch": 2.035889070146819, "grad_norm": 0.11676841229200363, "learning_rate": 4.838386075531432e-05, "loss": 0.1107, "num_input_tokens_seen": 26945840, "step": 12480 }, { "epoch": 2.036704730831974, "grad_norm": 0.10097681730985641, "learning_rate": 4.8381342092693464e-05, "loss": 0.0379, "num_input_tokens_seen": 26956304, "step": 12485 }, { "epoch": 2.0375203915171287, "grad_norm": 1.9858624935150146, "learning_rate": 4.837882153466237e-05, "loss": 0.0379, "num_input_tokens_seen": 26967632, "step": 12490 }, { "epoch": 2.038336052202284, "grad_norm": 0.4636116623878479, "learning_rate": 4.837629908142539e-05, "loss": 0.0771, "num_input_tokens_seen": 26978160, "step": 12495 }, { "epoch": 2.039151712887439, "grad_norm": 3.140413284301758, "learning_rate": 4.837377473318699e-05, "loss": 0.2424, "num_input_tokens_seen": 26987568, "step": 12500 }, { "epoch": 2.0399673735725936, "grad_norm": 0.1420295536518097, "learning_rate": 4.837124849015182e-05, "loss": 0.0991, "num_input_tokens_seen": 26999760, "step": 12505 }, { "epoch": 2.040783034257749, "grad_norm": 3.2932164669036865, "learning_rate": 4.8368720352524655e-05, "loss": 0.0692, "num_input_tokens_seen": 27011696, "step": 12510 }, { "epoch": 2.0415986949429037, "grad_norm": 1.278024673461914, "learning_rate": 4.8366190320510454e-05, "loss": 0.054, "num_input_tokens_seen": 27022608, "step": 12515 }, { "epoch": 2.0424143556280585, "grad_norm": 2.2411954402923584, "learning_rate": 4.83636583943143e-05, "loss": 0.1548, "num_input_tokens_seen": 27033680, "step": 12520 }, { "epoch": 2.0432300163132138, "grad_norm": 3.933164119720459, "learning_rate": 4.8361124574141455e-05, "loss": 0.0331, "num_input_tokens_seen": 27045136, "step": 12525 }, { "epoch": 2.0440456769983686, "grad_norm": 0.5014989972114563, "learning_rate": 4.835858886019732e-05, "loss": 0.1856, "num_input_tokens_seen": 27055760, "step": 12530 }, { "epoch": 2.044861337683524, "grad_norm": 0.11863043904304504, "learning_rate": 4.835605125268745e-05, "loss": 0.0668, "num_input_tokens_seen": 27066352, "step": 12535 }, { "epoch": 2.0456769983686787, "grad_norm": 0.6212408542633057, "learning_rate": 4.835351175181755e-05, "loss": 0.1345, "num_input_tokens_seen": 27077872, "step": 12540 }, { "epoch": 2.0464926590538335, "grad_norm": 5.539122104644775, "learning_rate": 4.83509703577935e-05, "loss": 0.1086, "num_input_tokens_seen": 27088912, "step": 12545 }, { "epoch": 2.0473083197389887, "grad_norm": 0.050495702773332596, "learning_rate": 4.834842707082131e-05, "loss": 0.0102, "num_input_tokens_seen": 27099216, "step": 12550 }, { "epoch": 2.0481239804241436, "grad_norm": 0.1347498893737793, "learning_rate": 4.834588189110716e-05, "loss": 0.2236, "num_input_tokens_seen": 27109712, "step": 12555 }, { "epoch": 2.0489396411092984, "grad_norm": 0.47515809535980225, "learning_rate": 4.834333481885735e-05, "loss": 0.0167, "num_input_tokens_seen": 27120304, "step": 12560 }, { "epoch": 2.0497553017944536, "grad_norm": 0.0680442750453949, "learning_rate": 4.8340785854278395e-05, "loss": 0.1594, "num_input_tokens_seen": 27130992, "step": 12565 }, { "epoch": 2.0505709624796085, "grad_norm": 2.6305956840515137, "learning_rate": 4.83382349975769e-05, "loss": 0.1084, "num_input_tokens_seen": 27141136, "step": 12570 }, { "epoch": 2.0513866231647633, "grad_norm": 0.8694716095924377, "learning_rate": 4.833568224895967e-05, "loss": 0.1113, "num_input_tokens_seen": 27152208, "step": 12575 }, { "epoch": 2.0522022838499185, "grad_norm": 0.0830124020576477, "learning_rate": 4.833312760863362e-05, "loss": 0.1575, "num_input_tokens_seen": 27164016, "step": 12580 }, { "epoch": 2.0530179445350734, "grad_norm": 0.39163777232170105, "learning_rate": 4.833057107680586e-05, "loss": 0.1352, "num_input_tokens_seen": 27174800, "step": 12585 }, { "epoch": 2.053833605220228, "grad_norm": 0.5324012041091919, "learning_rate": 4.832801265368363e-05, "loss": 0.0634, "num_input_tokens_seen": 27184784, "step": 12590 }, { "epoch": 2.0546492659053834, "grad_norm": 0.484078973531723, "learning_rate": 4.832545233947433e-05, "loss": 0.0428, "num_input_tokens_seen": 27195600, "step": 12595 }, { "epoch": 2.0554649265905383, "grad_norm": 0.09587056934833527, "learning_rate": 4.832289013438551e-05, "loss": 0.0715, "num_input_tokens_seen": 27206416, "step": 12600 }, { "epoch": 2.0562805872756935, "grad_norm": 1.2562344074249268, "learning_rate": 4.8320326038624875e-05, "loss": 0.1726, "num_input_tokens_seen": 27217360, "step": 12605 }, { "epoch": 2.0570962479608483, "grad_norm": 3.0299909114837646, "learning_rate": 4.831776005240029e-05, "loss": 0.2203, "num_input_tokens_seen": 27228304, "step": 12610 }, { "epoch": 2.057911908646003, "grad_norm": 0.25369992852211, "learning_rate": 4.831519217591976e-05, "loss": 0.157, "num_input_tokens_seen": 27240208, "step": 12615 }, { "epoch": 2.0587275693311584, "grad_norm": 0.15049006044864655, "learning_rate": 4.831262240939144e-05, "loss": 0.044, "num_input_tokens_seen": 27250256, "step": 12620 }, { "epoch": 2.0595432300163132, "grad_norm": 6.327296733856201, "learning_rate": 4.8310050753023674e-05, "loss": 0.2034, "num_input_tokens_seen": 27260080, "step": 12625 }, { "epoch": 2.060358890701468, "grad_norm": 2.6136813163757324, "learning_rate": 4.8307477207024923e-05, "loss": 0.0326, "num_input_tokens_seen": 27272112, "step": 12630 }, { "epoch": 2.0611745513866233, "grad_norm": 0.1514524519443512, "learning_rate": 4.83049017716038e-05, "loss": 0.1483, "num_input_tokens_seen": 27282672, "step": 12635 }, { "epoch": 2.061990212071778, "grad_norm": 2.3813018798828125, "learning_rate": 4.8302324446969094e-05, "loss": 0.0663, "num_input_tokens_seen": 27294704, "step": 12640 }, { "epoch": 2.062805872756933, "grad_norm": 0.9157320857048035, "learning_rate": 4.829974523332973e-05, "loss": 0.1115, "num_input_tokens_seen": 27306320, "step": 12645 }, { "epoch": 2.063621533442088, "grad_norm": 0.3651241064071655, "learning_rate": 4.8297164130894804e-05, "loss": 0.1239, "num_input_tokens_seen": 27316720, "step": 12650 }, { "epoch": 2.064437194127243, "grad_norm": 0.14332373440265656, "learning_rate": 4.8294581139873544e-05, "loss": 0.0085, "num_input_tokens_seen": 27327408, "step": 12655 }, { "epoch": 2.065252854812398, "grad_norm": 2.2300896644592285, "learning_rate": 4.829199626047534e-05, "loss": 0.1645, "num_input_tokens_seen": 27338672, "step": 12660 }, { "epoch": 2.066068515497553, "grad_norm": 3.436211109161377, "learning_rate": 4.8289409492909726e-05, "loss": 0.2541, "num_input_tokens_seen": 27350352, "step": 12665 }, { "epoch": 2.066884176182708, "grad_norm": 0.06419938057661057, "learning_rate": 4.8286820837386416e-05, "loss": 0.0668, "num_input_tokens_seen": 27360432, "step": 12670 }, { "epoch": 2.067699836867863, "grad_norm": 0.6670409440994263, "learning_rate": 4.828423029411526e-05, "loss": 0.1714, "num_input_tokens_seen": 27369840, "step": 12675 }, { "epoch": 2.068515497553018, "grad_norm": 0.11572045087814331, "learning_rate": 4.828163786330624e-05, "loss": 0.0127, "num_input_tokens_seen": 27380016, "step": 12680 }, { "epoch": 2.069331158238173, "grad_norm": 3.221712827682495, "learning_rate": 4.8279043545169535e-05, "loss": 0.1313, "num_input_tokens_seen": 27391504, "step": 12685 }, { "epoch": 2.070146818923328, "grad_norm": 0.22601261734962463, "learning_rate": 4.8276447339915446e-05, "loss": 0.0173, "num_input_tokens_seen": 27402064, "step": 12690 }, { "epoch": 2.070962479608483, "grad_norm": 0.08553045243024826, "learning_rate": 4.827384924775442e-05, "loss": 0.0924, "num_input_tokens_seen": 27413808, "step": 12695 }, { "epoch": 2.0717781402936377, "grad_norm": 0.33960750699043274, "learning_rate": 4.8271249268897094e-05, "loss": 0.2201, "num_input_tokens_seen": 27425520, "step": 12700 }, { "epoch": 2.072593800978793, "grad_norm": 3.99654221534729, "learning_rate": 4.826864740355422e-05, "loss": 0.2413, "num_input_tokens_seen": 27436624, "step": 12705 }, { "epoch": 2.073409461663948, "grad_norm": 0.1754871904850006, "learning_rate": 4.826604365193673e-05, "loss": 0.15, "num_input_tokens_seen": 27448176, "step": 12710 }, { "epoch": 2.0742251223491026, "grad_norm": 0.36933010816574097, "learning_rate": 4.8263438014255687e-05, "loss": 0.0558, "num_input_tokens_seen": 27459568, "step": 12715 }, { "epoch": 2.075040783034258, "grad_norm": 6.893112659454346, "learning_rate": 4.8260830490722317e-05, "loss": 0.1323, "num_input_tokens_seen": 27470800, "step": 12720 }, { "epoch": 2.0758564437194127, "grad_norm": 0.9920274019241333, "learning_rate": 4.8258221081548004e-05, "loss": 0.1025, "num_input_tokens_seen": 27481168, "step": 12725 }, { "epoch": 2.0766721044045675, "grad_norm": 0.5149859189987183, "learning_rate": 4.825560978694429e-05, "loss": 0.0987, "num_input_tokens_seen": 27492528, "step": 12730 }, { "epoch": 2.0774877650897228, "grad_norm": 0.38302597403526306, "learning_rate": 4.8252996607122835e-05, "loss": 0.1309, "num_input_tokens_seen": 27503152, "step": 12735 }, { "epoch": 2.0783034257748776, "grad_norm": 0.5170636773109436, "learning_rate": 4.82503815422955e-05, "loss": 0.0333, "num_input_tokens_seen": 27513904, "step": 12740 }, { "epoch": 2.0791190864600324, "grad_norm": 2.662625312805176, "learning_rate": 4.824776459267426e-05, "loss": 0.2391, "num_input_tokens_seen": 27525648, "step": 12745 }, { "epoch": 2.0799347471451877, "grad_norm": 4.804661750793457, "learning_rate": 4.824514575847127e-05, "loss": 0.1396, "num_input_tokens_seen": 27537264, "step": 12750 }, { "epoch": 2.0807504078303425, "grad_norm": 1.9787460565567017, "learning_rate": 4.824252503989881e-05, "loss": 0.0634, "num_input_tokens_seen": 27548144, "step": 12755 }, { "epoch": 2.0815660685154977, "grad_norm": 0.3444620966911316, "learning_rate": 4.823990243716935e-05, "loss": 0.0351, "num_input_tokens_seen": 27559568, "step": 12760 }, { "epoch": 2.0823817292006526, "grad_norm": 4.222772598266602, "learning_rate": 4.823727795049548e-05, "loss": 0.2073, "num_input_tokens_seen": 27570544, "step": 12765 }, { "epoch": 2.0831973898858074, "grad_norm": 0.6607770919799805, "learning_rate": 4.8234651580089945e-05, "loss": 0.1915, "num_input_tokens_seen": 27581616, "step": 12770 }, { "epoch": 2.0840130505709626, "grad_norm": 0.049387168139219284, "learning_rate": 4.823202332616567e-05, "loss": 0.2267, "num_input_tokens_seen": 27592112, "step": 12775 }, { "epoch": 2.0848287112561175, "grad_norm": 2.145779609680176, "learning_rate": 4.8229393188935703e-05, "loss": 0.2072, "num_input_tokens_seen": 27603056, "step": 12780 }, { "epoch": 2.0856443719412723, "grad_norm": 1.4277530908584595, "learning_rate": 4.8226761168613255e-05, "loss": 0.1471, "num_input_tokens_seen": 27613616, "step": 12785 }, { "epoch": 2.0864600326264275, "grad_norm": 1.1328977346420288, "learning_rate": 4.82241272654117e-05, "loss": 0.1536, "num_input_tokens_seen": 27624784, "step": 12790 }, { "epoch": 2.0872756933115824, "grad_norm": 0.17303496599197388, "learning_rate": 4.822149147954455e-05, "loss": 0.0318, "num_input_tokens_seen": 27636048, "step": 12795 }, { "epoch": 2.088091353996737, "grad_norm": 1.8548846244812012, "learning_rate": 4.8218853811225475e-05, "loss": 0.1574, "num_input_tokens_seen": 27645328, "step": 12800 }, { "epoch": 2.0889070146818924, "grad_norm": 0.09046615660190582, "learning_rate": 4.8216214260668304e-05, "loss": 0.1622, "num_input_tokens_seen": 27655024, "step": 12805 }, { "epoch": 2.0897226753670473, "grad_norm": 4.659290313720703, "learning_rate": 4.8213572828087e-05, "loss": 0.1731, "num_input_tokens_seen": 27666224, "step": 12810 }, { "epoch": 2.090538336052202, "grad_norm": 0.06531661748886108, "learning_rate": 4.82109295136957e-05, "loss": 0.0062, "num_input_tokens_seen": 27677904, "step": 12815 }, { "epoch": 2.0913539967373573, "grad_norm": 3.016369104385376, "learning_rate": 4.820828431770868e-05, "loss": 0.1218, "num_input_tokens_seen": 27687088, "step": 12820 }, { "epoch": 2.092169657422512, "grad_norm": 3.686668872833252, "learning_rate": 4.820563724034039e-05, "loss": 0.2888, "num_input_tokens_seen": 27697008, "step": 12825 }, { "epoch": 2.0929853181076674, "grad_norm": 4.606642246246338, "learning_rate": 4.820298828180538e-05, "loss": 0.0425, "num_input_tokens_seen": 27708208, "step": 12830 }, { "epoch": 2.0938009787928222, "grad_norm": 0.16520646214485168, "learning_rate": 4.8200337442318424e-05, "loss": 0.316, "num_input_tokens_seen": 27718704, "step": 12835 }, { "epoch": 2.094616639477977, "grad_norm": 0.08105045557022095, "learning_rate": 4.819768472209439e-05, "loss": 0.0807, "num_input_tokens_seen": 27728816, "step": 12840 }, { "epoch": 2.0954323001631323, "grad_norm": 3.2872564792633057, "learning_rate": 4.8195030121348336e-05, "loss": 0.0702, "num_input_tokens_seen": 27738992, "step": 12845 }, { "epoch": 2.096247960848287, "grad_norm": 0.22190487384796143, "learning_rate": 4.819237364029544e-05, "loss": 0.1307, "num_input_tokens_seen": 27750384, "step": 12850 }, { "epoch": 2.097063621533442, "grad_norm": 0.9946272969245911, "learning_rate": 4.818971527915107e-05, "loss": 0.0873, "num_input_tokens_seen": 27760784, "step": 12855 }, { "epoch": 2.097879282218597, "grad_norm": 0.13623514771461487, "learning_rate": 4.818705503813071e-05, "loss": 0.0473, "num_input_tokens_seen": 27771952, "step": 12860 }, { "epoch": 2.098694942903752, "grad_norm": 0.6727923154830933, "learning_rate": 4.818439291745002e-05, "loss": 0.0806, "num_input_tokens_seen": 27781744, "step": 12865 }, { "epoch": 2.099510603588907, "grad_norm": 0.07763024419546127, "learning_rate": 4.81817289173248e-05, "loss": 0.0227, "num_input_tokens_seen": 27792944, "step": 12870 }, { "epoch": 2.100326264274062, "grad_norm": 0.552696704864502, "learning_rate": 4.8179063037971016e-05, "loss": 0.0718, "num_input_tokens_seen": 27804048, "step": 12875 }, { "epoch": 2.101141924959217, "grad_norm": 2.295488119125366, "learning_rate": 4.817639527960477e-05, "loss": 0.1117, "num_input_tokens_seen": 27814960, "step": 12880 }, { "epoch": 2.1019575856443717, "grad_norm": 0.11163581907749176, "learning_rate": 4.817372564244233e-05, "loss": 0.0343, "num_input_tokens_seen": 27827280, "step": 12885 }, { "epoch": 2.102773246329527, "grad_norm": 1.5921348333358765, "learning_rate": 4.817105412670011e-05, "loss": 0.0979, "num_input_tokens_seen": 27837520, "step": 12890 }, { "epoch": 2.103588907014682, "grad_norm": 5.123642444610596, "learning_rate": 4.8168380732594666e-05, "loss": 0.0743, "num_input_tokens_seen": 27848912, "step": 12895 }, { "epoch": 2.104404567699837, "grad_norm": 0.34312704205513, "learning_rate": 4.816570546034273e-05, "loss": 0.0867, "num_input_tokens_seen": 27859504, "step": 12900 }, { "epoch": 2.105220228384992, "grad_norm": 0.45883709192276, "learning_rate": 4.816302831016116e-05, "loss": 0.0312, "num_input_tokens_seen": 27870736, "step": 12905 }, { "epoch": 2.1060358890701467, "grad_norm": 4.113556385040283, "learning_rate": 4.8160349282266995e-05, "loss": 0.1945, "num_input_tokens_seen": 27880720, "step": 12910 }, { "epoch": 2.106851549755302, "grad_norm": 4.943325519561768, "learning_rate": 4.81576683768774e-05, "loss": 0.3171, "num_input_tokens_seen": 27892048, "step": 12915 }, { "epoch": 2.107667210440457, "grad_norm": 0.42248857021331787, "learning_rate": 4.81549855942097e-05, "loss": 0.0241, "num_input_tokens_seen": 27903760, "step": 12920 }, { "epoch": 2.1084828711256116, "grad_norm": 0.08590734750032425, "learning_rate": 4.8152300934481384e-05, "loss": 0.1001, "num_input_tokens_seen": 27914192, "step": 12925 }, { "epoch": 2.109298531810767, "grad_norm": 0.6834118366241455, "learning_rate": 4.8149614397910094e-05, "loss": 0.0145, "num_input_tokens_seen": 27925936, "step": 12930 }, { "epoch": 2.1101141924959217, "grad_norm": 1.6590182781219482, "learning_rate": 4.8146925984713585e-05, "loss": 0.29, "num_input_tokens_seen": 27937392, "step": 12935 }, { "epoch": 2.1109298531810765, "grad_norm": 0.3167704939842224, "learning_rate": 4.814423569510981e-05, "loss": 0.0985, "num_input_tokens_seen": 27948304, "step": 12940 }, { "epoch": 2.1117455138662318, "grad_norm": 3.9980692863464355, "learning_rate": 4.814154352931687e-05, "loss": 0.0544, "num_input_tokens_seen": 27959344, "step": 12945 }, { "epoch": 2.1125611745513866, "grad_norm": 3.7122998237609863, "learning_rate": 4.813884948755298e-05, "loss": 0.3088, "num_input_tokens_seen": 27970032, "step": 12950 }, { "epoch": 2.1133768352365414, "grad_norm": 0.3176769018173218, "learning_rate": 4.8136153570036544e-05, "loss": 0.0613, "num_input_tokens_seen": 27979984, "step": 12955 }, { "epoch": 2.1141924959216967, "grad_norm": 0.08330029249191284, "learning_rate": 4.8133455776986114e-05, "loss": 0.1169, "num_input_tokens_seen": 27989200, "step": 12960 }, { "epoch": 2.1150081566068515, "grad_norm": 0.30361437797546387, "learning_rate": 4.813075610862038e-05, "loss": 0.1103, "num_input_tokens_seen": 28000432, "step": 12965 }, { "epoch": 2.1158238172920063, "grad_norm": 0.9845252633094788, "learning_rate": 4.8128054565158196e-05, "loss": 0.0299, "num_input_tokens_seen": 28010064, "step": 12970 }, { "epoch": 2.1166394779771616, "grad_norm": 2.610048532485962, "learning_rate": 4.8125351146818556e-05, "loss": 0.1835, "num_input_tokens_seen": 28021776, "step": 12975 }, { "epoch": 2.1174551386623164, "grad_norm": 0.10616115480661392, "learning_rate": 4.8122645853820604e-05, "loss": 0.2588, "num_input_tokens_seen": 28032656, "step": 12980 }, { "epoch": 2.1182707993474716, "grad_norm": 0.028080087155103683, "learning_rate": 4.811993868638367e-05, "loss": 0.0717, "num_input_tokens_seen": 28043728, "step": 12985 }, { "epoch": 2.1190864600326265, "grad_norm": 0.18192705512046814, "learning_rate": 4.811722964472719e-05, "loss": 0.2153, "num_input_tokens_seen": 28055120, "step": 12990 }, { "epoch": 2.1199021207177813, "grad_norm": 6.684364318847656, "learning_rate": 4.811451872907078e-05, "loss": 0.1343, "num_input_tokens_seen": 28065936, "step": 12995 }, { "epoch": 2.1207177814029365, "grad_norm": 5.173242092132568, "learning_rate": 4.8111805939634204e-05, "loss": 0.1583, "num_input_tokens_seen": 28077040, "step": 13000 }, { "epoch": 2.1215334420880914, "grad_norm": 1.8242324590682983, "learning_rate": 4.810909127663736e-05, "loss": 0.0703, "num_input_tokens_seen": 28088624, "step": 13005 }, { "epoch": 2.122349102773246, "grad_norm": 3.3235926628112793, "learning_rate": 4.810637474030033e-05, "loss": 0.215, "num_input_tokens_seen": 28097520, "step": 13010 }, { "epoch": 2.1231647634584014, "grad_norm": 0.15635597705841064, "learning_rate": 4.810365633084333e-05, "loss": 0.3075, "num_input_tokens_seen": 28107504, "step": 13015 }, { "epoch": 2.1239804241435563, "grad_norm": 4.994592189788818, "learning_rate": 4.810093604848671e-05, "loss": 0.2109, "num_input_tokens_seen": 28118352, "step": 13020 }, { "epoch": 2.124796084828711, "grad_norm": 1.1246777772903442, "learning_rate": 4.8098213893451005e-05, "loss": 0.1026, "num_input_tokens_seen": 28129936, "step": 13025 }, { "epoch": 2.1256117455138663, "grad_norm": 2.986356019973755, "learning_rate": 4.809548986595688e-05, "loss": 0.1595, "num_input_tokens_seen": 28140496, "step": 13030 }, { "epoch": 2.126427406199021, "grad_norm": 4.179571628570557, "learning_rate": 4.809276396622516e-05, "loss": 0.1303, "num_input_tokens_seen": 28151440, "step": 13035 }, { "epoch": 2.1272430668841764, "grad_norm": 1.1820942163467407, "learning_rate": 4.809003619447683e-05, "loss": 0.1196, "num_input_tokens_seen": 28161200, "step": 13040 }, { "epoch": 2.1280587275693312, "grad_norm": 4.96096134185791, "learning_rate": 4.8087306550932996e-05, "loss": 0.058, "num_input_tokens_seen": 28171568, "step": 13045 }, { "epoch": 2.128874388254486, "grad_norm": 3.9760048389434814, "learning_rate": 4.808457503581496e-05, "loss": 0.1363, "num_input_tokens_seen": 28183344, "step": 13050 }, { "epoch": 2.1296900489396413, "grad_norm": 0.23450660705566406, "learning_rate": 4.808184164934414e-05, "loss": 0.0445, "num_input_tokens_seen": 28193552, "step": 13055 }, { "epoch": 2.130505709624796, "grad_norm": 0.19968116283416748, "learning_rate": 4.8079106391742115e-05, "loss": 0.1831, "num_input_tokens_seen": 28203600, "step": 13060 }, { "epoch": 2.131321370309951, "grad_norm": 1.3951623439788818, "learning_rate": 4.807636926323063e-05, "loss": 0.4211, "num_input_tokens_seen": 28213584, "step": 13065 }, { "epoch": 2.132137030995106, "grad_norm": 0.18396364152431488, "learning_rate": 4.8073630264031556e-05, "loss": 0.0669, "num_input_tokens_seen": 28224016, "step": 13070 }, { "epoch": 2.132952691680261, "grad_norm": 0.12926486134529114, "learning_rate": 4.807088939436695e-05, "loss": 0.1624, "num_input_tokens_seen": 28235760, "step": 13075 }, { "epoch": 2.133768352365416, "grad_norm": 0.3938828110694885, "learning_rate": 4.806814665445898e-05, "loss": 0.162, "num_input_tokens_seen": 28244656, "step": 13080 }, { "epoch": 2.134584013050571, "grad_norm": 1.4899837970733643, "learning_rate": 4.8065402044529994e-05, "loss": 0.022, "num_input_tokens_seen": 28254096, "step": 13085 }, { "epoch": 2.135399673735726, "grad_norm": 2.1409873962402344, "learning_rate": 4.806265556480249e-05, "loss": 0.0767, "num_input_tokens_seen": 28264880, "step": 13090 }, { "epoch": 2.1362153344208807, "grad_norm": 3.346529722213745, "learning_rate": 4.80599072154991e-05, "loss": 0.1405, "num_input_tokens_seen": 28274448, "step": 13095 }, { "epoch": 2.137030995106036, "grad_norm": 0.546467661857605, "learning_rate": 4.805715699684264e-05, "loss": 0.0342, "num_input_tokens_seen": 28285776, "step": 13100 }, { "epoch": 2.137846655791191, "grad_norm": 5.533677577972412, "learning_rate": 4.8054404909056036e-05, "loss": 0.0847, "num_input_tokens_seen": 28296016, "step": 13105 }, { "epoch": 2.1386623164763456, "grad_norm": 3.5788450241088867, "learning_rate": 4.805165095236239e-05, "loss": 0.2059, "num_input_tokens_seen": 28307952, "step": 13110 }, { "epoch": 2.139477977161501, "grad_norm": 1.0270748138427734, "learning_rate": 4.804889512698496e-05, "loss": 0.1529, "num_input_tokens_seen": 28318960, "step": 13115 }, { "epoch": 2.1402936378466557, "grad_norm": 0.12226854264736176, "learning_rate": 4.804613743314714e-05, "loss": 0.0078, "num_input_tokens_seen": 28329136, "step": 13120 }, { "epoch": 2.141109298531811, "grad_norm": 0.07593917101621628, "learning_rate": 4.804337787107248e-05, "loss": 0.0218, "num_input_tokens_seen": 28339696, "step": 13125 }, { "epoch": 2.141924959216966, "grad_norm": 0.5602729916572571, "learning_rate": 4.80406164409847e-05, "loss": 0.1575, "num_input_tokens_seen": 28351440, "step": 13130 }, { "epoch": 2.1427406199021206, "grad_norm": 0.35029932856559753, "learning_rate": 4.8037853143107634e-05, "loss": 0.1944, "num_input_tokens_seen": 28361232, "step": 13135 }, { "epoch": 2.143556280587276, "grad_norm": 0.10239975154399872, "learning_rate": 4.8035087977665304e-05, "loss": 0.3515, "num_input_tokens_seen": 28370864, "step": 13140 }, { "epoch": 2.1443719412724307, "grad_norm": 0.37655508518218994, "learning_rate": 4.803232094488186e-05, "loss": 0.0357, "num_input_tokens_seen": 28381456, "step": 13145 }, { "epoch": 2.1451876019575855, "grad_norm": 0.0940922424197197, "learning_rate": 4.802955204498162e-05, "loss": 0.0229, "num_input_tokens_seen": 28391600, "step": 13150 }, { "epoch": 2.1460032626427408, "grad_norm": 1.0168960094451904, "learning_rate": 4.802678127818904e-05, "loss": 0.1323, "num_input_tokens_seen": 28402640, "step": 13155 }, { "epoch": 2.1468189233278956, "grad_norm": 3.733787775039673, "learning_rate": 4.802400864472873e-05, "loss": 0.3108, "num_input_tokens_seen": 28412272, "step": 13160 }, { "epoch": 2.1476345840130504, "grad_norm": 0.9021793007850647, "learning_rate": 4.8021234144825456e-05, "loss": 0.1442, "num_input_tokens_seen": 28423312, "step": 13165 }, { "epoch": 2.1484502446982057, "grad_norm": 1.0865229368209839, "learning_rate": 4.801845777870414e-05, "loss": 0.2975, "num_input_tokens_seen": 28433424, "step": 13170 }, { "epoch": 2.1492659053833605, "grad_norm": 3.830258846282959, "learning_rate": 4.801567954658984e-05, "loss": 0.2252, "num_input_tokens_seen": 28443984, "step": 13175 }, { "epoch": 2.1500815660685153, "grad_norm": 0.3991985619068146, "learning_rate": 4.801289944870777e-05, "loss": 0.1898, "num_input_tokens_seen": 28454832, "step": 13180 }, { "epoch": 2.1508972267536706, "grad_norm": 0.0991239994764328, "learning_rate": 4.8010117485283305e-05, "loss": 0.1274, "num_input_tokens_seen": 28467280, "step": 13185 }, { "epoch": 2.1517128874388254, "grad_norm": 1.4642499685287476, "learning_rate": 4.800733365654197e-05, "loss": 0.217, "num_input_tokens_seen": 28478128, "step": 13190 }, { "epoch": 2.15252854812398, "grad_norm": 2.121992588043213, "learning_rate": 4.8004547962709424e-05, "loss": 0.1124, "num_input_tokens_seen": 28489200, "step": 13195 }, { "epoch": 2.1533442088091355, "grad_norm": 0.3353828191757202, "learning_rate": 4.80017604040115e-05, "loss": 0.0315, "num_input_tokens_seen": 28500240, "step": 13200 }, { "epoch": 2.1541598694942903, "grad_norm": 2.9096145629882812, "learning_rate": 4.799897098067417e-05, "loss": 0.0922, "num_input_tokens_seen": 28511824, "step": 13205 }, { "epoch": 2.1549755301794455, "grad_norm": 0.10797697305679321, "learning_rate": 4.799617969292355e-05, "loss": 0.095, "num_input_tokens_seen": 28521456, "step": 13210 }, { "epoch": 2.1557911908646004, "grad_norm": 3.181422710418701, "learning_rate": 4.799338654098593e-05, "loss": 0.1553, "num_input_tokens_seen": 28531888, "step": 13215 }, { "epoch": 2.156606851549755, "grad_norm": 0.06177400425076485, "learning_rate": 4.799059152508773e-05, "loss": 0.053, "num_input_tokens_seen": 28541616, "step": 13220 }, { "epoch": 2.1574225122349104, "grad_norm": 3.903703451156616, "learning_rate": 4.798779464545552e-05, "loss": 0.1561, "num_input_tokens_seen": 28553776, "step": 13225 }, { "epoch": 2.1582381729200653, "grad_norm": 3.8479654788970947, "learning_rate": 4.7984995902316045e-05, "loss": 0.1142, "num_input_tokens_seen": 28565680, "step": 13230 }, { "epoch": 2.15905383360522, "grad_norm": 5.2140278816223145, "learning_rate": 4.798219529589618e-05, "loss": 0.1837, "num_input_tokens_seen": 28576560, "step": 13235 }, { "epoch": 2.1598694942903753, "grad_norm": 0.19050484895706177, "learning_rate": 4.797939282642294e-05, "loss": 0.2851, "num_input_tokens_seen": 28586352, "step": 13240 }, { "epoch": 2.16068515497553, "grad_norm": 3.2972590923309326, "learning_rate": 4.797658849412353e-05, "loss": 0.3033, "num_input_tokens_seen": 28598416, "step": 13245 }, { "epoch": 2.161500815660685, "grad_norm": 0.08232977241277695, "learning_rate": 4.797378229922528e-05, "loss": 0.1517, "num_input_tokens_seen": 28609424, "step": 13250 }, { "epoch": 2.1623164763458402, "grad_norm": 1.3809925317764282, "learning_rate": 4.797097424195566e-05, "loss": 0.0568, "num_input_tokens_seen": 28619184, "step": 13255 }, { "epoch": 2.163132137030995, "grad_norm": 1.9804331064224243, "learning_rate": 4.796816432254232e-05, "loss": 0.1551, "num_input_tokens_seen": 28629232, "step": 13260 }, { "epoch": 2.1639477977161503, "grad_norm": 2.687870979309082, "learning_rate": 4.796535254121304e-05, "loss": 0.0535, "num_input_tokens_seen": 28639408, "step": 13265 }, { "epoch": 2.164763458401305, "grad_norm": 0.5376800894737244, "learning_rate": 4.7962538898195754e-05, "loss": 0.0974, "num_input_tokens_seen": 28650736, "step": 13270 }, { "epoch": 2.16557911908646, "grad_norm": 4.385888576507568, "learning_rate": 4.7959723393718556e-05, "loss": 0.0901, "num_input_tokens_seen": 28660784, "step": 13275 }, { "epoch": 2.166394779771615, "grad_norm": 3.2208805084228516, "learning_rate": 4.7956906028009683e-05, "loss": 0.1131, "num_input_tokens_seen": 28671248, "step": 13280 }, { "epoch": 2.16721044045677, "grad_norm": 3.118640422821045, "learning_rate": 4.795408680129753e-05, "loss": 0.1467, "num_input_tokens_seen": 28682000, "step": 13285 }, { "epoch": 2.168026101141925, "grad_norm": 2.172536611557007, "learning_rate": 4.795126571381062e-05, "loss": 0.2531, "num_input_tokens_seen": 28692912, "step": 13290 }, { "epoch": 2.16884176182708, "grad_norm": 5.100015640258789, "learning_rate": 4.794844276577767e-05, "loss": 0.2147, "num_input_tokens_seen": 28703504, "step": 13295 }, { "epoch": 2.169657422512235, "grad_norm": 0.16371570527553558, "learning_rate": 4.794561795742751e-05, "loss": 0.0312, "num_input_tokens_seen": 28714416, "step": 13300 }, { "epoch": 2.1704730831973897, "grad_norm": 3.3605289459228516, "learning_rate": 4.794279128898913e-05, "loss": 0.1119, "num_input_tokens_seen": 28725200, "step": 13305 }, { "epoch": 2.171288743882545, "grad_norm": 0.49802085757255554, "learning_rate": 4.7939962760691675e-05, "loss": 0.0897, "num_input_tokens_seen": 28736176, "step": 13310 }, { "epoch": 2.1721044045677, "grad_norm": 0.26157346367836, "learning_rate": 4.793713237276445e-05, "loss": 0.0249, "num_input_tokens_seen": 28746928, "step": 13315 }, { "epoch": 2.1729200652528546, "grad_norm": 3.901275873184204, "learning_rate": 4.7934300125436885e-05, "loss": 0.144, "num_input_tokens_seen": 28757584, "step": 13320 }, { "epoch": 2.17373572593801, "grad_norm": 2.934119939804077, "learning_rate": 4.7931466018938586e-05, "loss": 0.0525, "num_input_tokens_seen": 28766224, "step": 13325 }, { "epoch": 2.1745513866231647, "grad_norm": 0.10597917437553406, "learning_rate": 4.79286300534993e-05, "loss": 0.0311, "num_input_tokens_seen": 28775984, "step": 13330 }, { "epoch": 2.1753670473083195, "grad_norm": 0.04173220321536064, "learning_rate": 4.792579222934892e-05, "loss": 0.1027, "num_input_tokens_seen": 28787056, "step": 13335 }, { "epoch": 2.176182707993475, "grad_norm": 1.1457356214523315, "learning_rate": 4.79229525467175e-05, "loss": 0.0717, "num_input_tokens_seen": 28797648, "step": 13340 }, { "epoch": 2.1769983686786296, "grad_norm": 0.10137543827295303, "learning_rate": 4.792011100583524e-05, "loss": 0.0495, "num_input_tokens_seen": 28808656, "step": 13345 }, { "epoch": 2.177814029363785, "grad_norm": 4.8282999992370605, "learning_rate": 4.791726760693248e-05, "loss": 0.0331, "num_input_tokens_seen": 28818448, "step": 13350 }, { "epoch": 2.1786296900489397, "grad_norm": 6.721035480499268, "learning_rate": 4.791442235023974e-05, "loss": 0.1084, "num_input_tokens_seen": 28828752, "step": 13355 }, { "epoch": 2.1794453507340945, "grad_norm": 3.9628283977508545, "learning_rate": 4.7911575235987644e-05, "loss": 0.2347, "num_input_tokens_seen": 28838896, "step": 13360 }, { "epoch": 2.1802610114192498, "grad_norm": 0.14923158288002014, "learning_rate": 4.790872626440701e-05, "loss": 0.1262, "num_input_tokens_seen": 28848880, "step": 13365 }, { "epoch": 2.1810766721044046, "grad_norm": 5.65922737121582, "learning_rate": 4.790587543572879e-05, "loss": 0.4029, "num_input_tokens_seen": 28860592, "step": 13370 }, { "epoch": 2.1818923327895594, "grad_norm": 0.25351399183273315, "learning_rate": 4.790302275018408e-05, "loss": 0.1504, "num_input_tokens_seen": 28872432, "step": 13375 }, { "epoch": 2.1827079934747147, "grad_norm": 0.8721989393234253, "learning_rate": 4.790016820800414e-05, "loss": 0.0133, "num_input_tokens_seen": 28883856, "step": 13380 }, { "epoch": 2.1835236541598695, "grad_norm": 0.45588570833206177, "learning_rate": 4.789731180942037e-05, "loss": 0.3111, "num_input_tokens_seen": 28894384, "step": 13385 }, { "epoch": 2.1843393148450243, "grad_norm": 5.443912506103516, "learning_rate": 4.7894453554664325e-05, "loss": 0.1183, "num_input_tokens_seen": 28902288, "step": 13390 }, { "epoch": 2.1851549755301796, "grad_norm": 3.1598997116088867, "learning_rate": 4.7891593443967706e-05, "loss": 0.1033, "num_input_tokens_seen": 28913296, "step": 13395 }, { "epoch": 2.1859706362153344, "grad_norm": 0.06592902541160583, "learning_rate": 4.788873147756238e-05, "loss": 0.1292, "num_input_tokens_seen": 28923888, "step": 13400 }, { "epoch": 2.186786296900489, "grad_norm": 2.075955390930176, "learning_rate": 4.788586765568034e-05, "loss": 0.3123, "num_input_tokens_seen": 28934640, "step": 13405 }, { "epoch": 2.1876019575856445, "grad_norm": 0.06410079449415207, "learning_rate": 4.788300197855374e-05, "loss": 0.1714, "num_input_tokens_seen": 28945648, "step": 13410 }, { "epoch": 2.1884176182707993, "grad_norm": 5.264660835266113, "learning_rate": 4.788013444641491e-05, "loss": 0.0807, "num_input_tokens_seen": 28957392, "step": 13415 }, { "epoch": 2.189233278955954, "grad_norm": 0.2360692024230957, "learning_rate": 4.7877265059496266e-05, "loss": 0.0486, "num_input_tokens_seen": 28967056, "step": 13420 }, { "epoch": 2.1900489396411094, "grad_norm": 0.11466488987207413, "learning_rate": 4.7874393818030456e-05, "loss": 0.2175, "num_input_tokens_seen": 28977808, "step": 13425 }, { "epoch": 2.190864600326264, "grad_norm": 0.20892131328582764, "learning_rate": 4.7871520722250214e-05, "loss": 0.1056, "num_input_tokens_seen": 28987440, "step": 13430 }, { "epoch": 2.1916802610114194, "grad_norm": 7.450143814086914, "learning_rate": 4.786864577238845e-05, "loss": 0.0841, "num_input_tokens_seen": 28997392, "step": 13435 }, { "epoch": 2.1924959216965743, "grad_norm": 1.5859612226486206, "learning_rate": 4.7865768968678226e-05, "loss": 0.2124, "num_input_tokens_seen": 29007824, "step": 13440 }, { "epoch": 2.193311582381729, "grad_norm": 0.43595489859580994, "learning_rate": 4.786289031135275e-05, "loss": 0.13, "num_input_tokens_seen": 29019024, "step": 13445 }, { "epoch": 2.1941272430668843, "grad_norm": 5.910229682922363, "learning_rate": 4.786000980064538e-05, "loss": 0.1705, "num_input_tokens_seen": 29031600, "step": 13450 }, { "epoch": 2.194942903752039, "grad_norm": 5.994784355163574, "learning_rate": 4.785712743678963e-05, "loss": 0.1151, "num_input_tokens_seen": 29043056, "step": 13455 }, { "epoch": 2.195758564437194, "grad_norm": 2.1848740577697754, "learning_rate": 4.785424322001915e-05, "loss": 0.1138, "num_input_tokens_seen": 29053648, "step": 13460 }, { "epoch": 2.1965742251223492, "grad_norm": 2.133012533187866, "learning_rate": 4.785135715056775e-05, "loss": 0.043, "num_input_tokens_seen": 29064112, "step": 13465 }, { "epoch": 2.197389885807504, "grad_norm": 0.09406670928001404, "learning_rate": 4.78484692286694e-05, "loss": 0.0183, "num_input_tokens_seen": 29075216, "step": 13470 }, { "epoch": 2.198205546492659, "grad_norm": 4.004861354827881, "learning_rate": 4.7845579454558196e-05, "loss": 0.173, "num_input_tokens_seen": 29084368, "step": 13475 }, { "epoch": 2.199021207177814, "grad_norm": 0.1620786041021347, "learning_rate": 4.784268782846841e-05, "loss": 0.0272, "num_input_tokens_seen": 29096016, "step": 13480 }, { "epoch": 2.199836867862969, "grad_norm": 1.8775203227996826, "learning_rate": 4.783979435063445e-05, "loss": 0.1249, "num_input_tokens_seen": 29106768, "step": 13485 }, { "epoch": 2.200652528548124, "grad_norm": 5.59013557434082, "learning_rate": 4.783689902129086e-05, "loss": 0.2208, "num_input_tokens_seen": 29116560, "step": 13490 }, { "epoch": 2.201468189233279, "grad_norm": 1.0871704816818237, "learning_rate": 4.783400184067237e-05, "loss": 0.1266, "num_input_tokens_seen": 29127376, "step": 13495 }, { "epoch": 2.202283849918434, "grad_norm": 0.44361236691474915, "learning_rate": 4.783110280901383e-05, "loss": 0.1347, "num_input_tokens_seen": 29139088, "step": 13500 }, { "epoch": 2.203099510603589, "grad_norm": 0.11101436614990234, "learning_rate": 4.7828201926550245e-05, "loss": 0.0708, "num_input_tokens_seen": 29149488, "step": 13505 }, { "epoch": 2.203915171288744, "grad_norm": 0.06615414470434189, "learning_rate": 4.7825299193516794e-05, "loss": 0.0471, "num_input_tokens_seen": 29159728, "step": 13510 }, { "epoch": 2.2047308319738987, "grad_norm": 1.3377586603164673, "learning_rate": 4.782239461014877e-05, "loss": 0.1196, "num_input_tokens_seen": 29170928, "step": 13515 }, { "epoch": 2.205546492659054, "grad_norm": 4.223592281341553, "learning_rate": 4.781948817668164e-05, "loss": 0.2729, "num_input_tokens_seen": 29180912, "step": 13520 }, { "epoch": 2.206362153344209, "grad_norm": 0.12885291874408722, "learning_rate": 4.7816579893351014e-05, "loss": 0.1262, "num_input_tokens_seen": 29191216, "step": 13525 }, { "epoch": 2.2071778140293636, "grad_norm": 0.08625456690788269, "learning_rate": 4.781366976039265e-05, "loss": 0.0098, "num_input_tokens_seen": 29201424, "step": 13530 }, { "epoch": 2.207993474714519, "grad_norm": 0.11553726345300674, "learning_rate": 4.781075777804246e-05, "loss": 0.1708, "num_input_tokens_seen": 29211152, "step": 13535 }, { "epoch": 2.2088091353996737, "grad_norm": 3.9450199604034424, "learning_rate": 4.7807843946536514e-05, "loss": 0.2024, "num_input_tokens_seen": 29221744, "step": 13540 }, { "epoch": 2.2096247960848285, "grad_norm": 0.16982929408550262, "learning_rate": 4.7804928266110996e-05, "loss": 0.3198, "num_input_tokens_seen": 29231568, "step": 13545 }, { "epoch": 2.210440456769984, "grad_norm": 7.157938003540039, "learning_rate": 4.780201073700229e-05, "loss": 0.5554, "num_input_tokens_seen": 29242128, "step": 13550 }, { "epoch": 2.2112561174551386, "grad_norm": 0.3418239653110504, "learning_rate": 4.7799091359446905e-05, "loss": 0.1646, "num_input_tokens_seen": 29250992, "step": 13555 }, { "epoch": 2.2120717781402934, "grad_norm": 1.7065843343734741, "learning_rate": 4.779617013368148e-05, "loss": 0.0285, "num_input_tokens_seen": 29261072, "step": 13560 }, { "epoch": 2.2128874388254487, "grad_norm": 0.3221491873264313, "learning_rate": 4.7793247059942845e-05, "loss": 0.1075, "num_input_tokens_seen": 29271760, "step": 13565 }, { "epoch": 2.2137030995106035, "grad_norm": 0.3167019486427307, "learning_rate": 4.779032213846795e-05, "loss": 0.1779, "num_input_tokens_seen": 29282480, "step": 13570 }, { "epoch": 2.2145187601957588, "grad_norm": 1.490031361579895, "learning_rate": 4.77873953694939e-05, "loss": 0.0974, "num_input_tokens_seen": 29292912, "step": 13575 }, { "epoch": 2.2153344208809136, "grad_norm": 0.22250661253929138, "learning_rate": 4.778446675325796e-05, "loss": 0.1102, "num_input_tokens_seen": 29302800, "step": 13580 }, { "epoch": 2.2161500815660684, "grad_norm": 0.31745657324790955, "learning_rate": 4.778153628999754e-05, "loss": 0.055, "num_input_tokens_seen": 29313424, "step": 13585 }, { "epoch": 2.2169657422512237, "grad_norm": 2.743222236633301, "learning_rate": 4.7778603979950196e-05, "loss": 0.2179, "num_input_tokens_seen": 29323856, "step": 13590 }, { "epoch": 2.2177814029363785, "grad_norm": 0.0542568676173687, "learning_rate": 4.777566982335364e-05, "loss": 0.0383, "num_input_tokens_seen": 29334832, "step": 13595 }, { "epoch": 2.2185970636215333, "grad_norm": 0.03890717774629593, "learning_rate": 4.777273382044572e-05, "loss": 0.0706, "num_input_tokens_seen": 29345936, "step": 13600 }, { "epoch": 2.2194127243066886, "grad_norm": 3.6464173793792725, "learning_rate": 4.7769795971464456e-05, "loss": 0.1469, "num_input_tokens_seen": 29356560, "step": 13605 }, { "epoch": 2.2202283849918434, "grad_norm": 0.06402415037155151, "learning_rate": 4.7766856276647986e-05, "loss": 0.045, "num_input_tokens_seen": 29367024, "step": 13610 }, { "epoch": 2.221044045676998, "grad_norm": 2.0355734825134277, "learning_rate": 4.776391473623464e-05, "loss": 0.0548, "num_input_tokens_seen": 29376944, "step": 13615 }, { "epoch": 2.2218597063621535, "grad_norm": 4.244828224182129, "learning_rate": 4.7760971350462856e-05, "loss": 0.2537, "num_input_tokens_seen": 29389552, "step": 13620 }, { "epoch": 2.2226753670473083, "grad_norm": 0.0875852108001709, "learning_rate": 4.775802611957125e-05, "loss": 0.014, "num_input_tokens_seen": 29399600, "step": 13625 }, { "epoch": 2.223491027732463, "grad_norm": 3.1022586822509766, "learning_rate": 4.7755079043798565e-05, "loss": 0.0613, "num_input_tokens_seen": 29410896, "step": 13630 }, { "epoch": 2.2243066884176184, "grad_norm": 2.86226224899292, "learning_rate": 4.775213012338373e-05, "loss": 0.1702, "num_input_tokens_seen": 29421136, "step": 13635 }, { "epoch": 2.225122349102773, "grad_norm": 2.082360029220581, "learning_rate": 4.774917935856577e-05, "loss": 0.1033, "num_input_tokens_seen": 29431536, "step": 13640 }, { "epoch": 2.225938009787928, "grad_norm": 0.041097868233919144, "learning_rate": 4.774622674958391e-05, "loss": 0.1653, "num_input_tokens_seen": 29442416, "step": 13645 }, { "epoch": 2.2267536704730833, "grad_norm": 3.4875710010528564, "learning_rate": 4.7743272296677495e-05, "loss": 0.0571, "num_input_tokens_seen": 29453232, "step": 13650 }, { "epoch": 2.227569331158238, "grad_norm": 2.4293694496154785, "learning_rate": 4.774031600008603e-05, "loss": 0.1168, "num_input_tokens_seen": 29464976, "step": 13655 }, { "epoch": 2.2283849918433933, "grad_norm": 3.0849180221557617, "learning_rate": 4.7737357860049164e-05, "loss": 0.1924, "num_input_tokens_seen": 29475536, "step": 13660 }, { "epoch": 2.229200652528548, "grad_norm": 0.1027827262878418, "learning_rate": 4.7734397876806704e-05, "loss": 0.009, "num_input_tokens_seen": 29486256, "step": 13665 }, { "epoch": 2.230016313213703, "grad_norm": 7.251781463623047, "learning_rate": 4.77314360505986e-05, "loss": 0.2379, "num_input_tokens_seen": 29498512, "step": 13670 }, { "epoch": 2.2308319738988582, "grad_norm": 3.810894727706909, "learning_rate": 4.772847238166495e-05, "loss": 0.1145, "num_input_tokens_seen": 29509360, "step": 13675 }, { "epoch": 2.231647634584013, "grad_norm": 0.1406639963388443, "learning_rate": 4.7725506870246006e-05, "loss": 0.0067, "num_input_tokens_seen": 29520400, "step": 13680 }, { "epoch": 2.232463295269168, "grad_norm": 0.7439082264900208, "learning_rate": 4.772253951658217e-05, "loss": 0.1638, "num_input_tokens_seen": 29532016, "step": 13685 }, { "epoch": 2.233278955954323, "grad_norm": 0.1474129557609558, "learning_rate": 4.771957032091398e-05, "loss": 0.0163, "num_input_tokens_seen": 29543728, "step": 13690 }, { "epoch": 2.234094616639478, "grad_norm": 2.2892446517944336, "learning_rate": 4.771659928348214e-05, "loss": 0.5025, "num_input_tokens_seen": 29554352, "step": 13695 }, { "epoch": 2.2349102773246328, "grad_norm": 3.372269630432129, "learning_rate": 4.7713626404527514e-05, "loss": 0.1512, "num_input_tokens_seen": 29565168, "step": 13700 }, { "epoch": 2.235725938009788, "grad_norm": 3.662602186203003, "learning_rate": 4.7710651684291074e-05, "loss": 0.2718, "num_input_tokens_seen": 29575792, "step": 13705 }, { "epoch": 2.236541598694943, "grad_norm": 0.12891605496406555, "learning_rate": 4.770767512301398e-05, "loss": 0.0985, "num_input_tokens_seen": 29586384, "step": 13710 }, { "epoch": 2.237357259380098, "grad_norm": 0.0943826287984848, "learning_rate": 4.770469672093752e-05, "loss": 0.0976, "num_input_tokens_seen": 29597232, "step": 13715 }, { "epoch": 2.238172920065253, "grad_norm": 2.561107635498047, "learning_rate": 4.7701716478303135e-05, "loss": 0.0606, "num_input_tokens_seen": 29606896, "step": 13720 }, { "epoch": 2.2389885807504077, "grad_norm": 0.13817407190799713, "learning_rate": 4.769873439535244e-05, "loss": 0.0217, "num_input_tokens_seen": 29617648, "step": 13725 }, { "epoch": 2.239804241435563, "grad_norm": 2.578217029571533, "learning_rate": 4.769575047232715e-05, "loss": 0.3705, "num_input_tokens_seen": 29627664, "step": 13730 }, { "epoch": 2.240619902120718, "grad_norm": 0.2209499031305313, "learning_rate": 4.769276470946917e-05, "loss": 0.0574, "num_input_tokens_seen": 29639088, "step": 13735 }, { "epoch": 2.2414355628058726, "grad_norm": 3.821312427520752, "learning_rate": 4.768977710702055e-05, "loss": 0.1059, "num_input_tokens_seen": 29649648, "step": 13740 }, { "epoch": 2.242251223491028, "grad_norm": 1.264604091644287, "learning_rate": 4.768678766522347e-05, "loss": 0.1255, "num_input_tokens_seen": 29661360, "step": 13745 }, { "epoch": 2.2430668841761827, "grad_norm": 0.49606001377105713, "learning_rate": 4.768379638432026e-05, "loss": 0.0849, "num_input_tokens_seen": 29672336, "step": 13750 }, { "epoch": 2.2438825448613375, "grad_norm": 5.561700820922852, "learning_rate": 4.768080326455343e-05, "loss": 0.2971, "num_input_tokens_seen": 29683472, "step": 13755 }, { "epoch": 2.244698205546493, "grad_norm": 0.08782978355884552, "learning_rate": 4.7677808306165596e-05, "loss": 0.0086, "num_input_tokens_seen": 29695472, "step": 13760 }, { "epoch": 2.2455138662316476, "grad_norm": 2.264838457107544, "learning_rate": 4.767481150939956e-05, "loss": 0.2447, "num_input_tokens_seen": 29705520, "step": 13765 }, { "epoch": 2.2463295269168024, "grad_norm": 0.361931174993515, "learning_rate": 4.767181287449825e-05, "loss": 0.0115, "num_input_tokens_seen": 29715824, "step": 13770 }, { "epoch": 2.2471451876019577, "grad_norm": 0.4308825135231018, "learning_rate": 4.766881240170475e-05, "loss": 0.0165, "num_input_tokens_seen": 29726864, "step": 13775 }, { "epoch": 2.2479608482871125, "grad_norm": 0.065828338265419, "learning_rate": 4.7665810091262305e-05, "loss": 0.0096, "num_input_tokens_seen": 29738128, "step": 13780 }, { "epoch": 2.2487765089722673, "grad_norm": 0.14022226631641388, "learning_rate": 4.766280594341428e-05, "loss": 0.2112, "num_input_tokens_seen": 29747152, "step": 13785 }, { "epoch": 2.2495921696574226, "grad_norm": 3.939537286758423, "learning_rate": 4.7659799958404225e-05, "loss": 0.1179, "num_input_tokens_seen": 29756912, "step": 13790 }, { "epoch": 2.2504078303425774, "grad_norm": 2.971608877182007, "learning_rate": 4.7656792136475804e-05, "loss": 0.2464, "num_input_tokens_seen": 29766768, "step": 13795 }, { "epoch": 2.2512234910277327, "grad_norm": 5.468246936798096, "learning_rate": 4.765378247787285e-05, "loss": 0.1061, "num_input_tokens_seen": 29778032, "step": 13800 }, { "epoch": 2.2520391517128875, "grad_norm": 4.380643844604492, "learning_rate": 4.765077098283935e-05, "loss": 0.3154, "num_input_tokens_seen": 29787408, "step": 13805 }, { "epoch": 2.2528548123980423, "grad_norm": 0.9004188179969788, "learning_rate": 4.764775765161943e-05, "loss": 0.0235, "num_input_tokens_seen": 29797232, "step": 13810 }, { "epoch": 2.2536704730831976, "grad_norm": 5.672367095947266, "learning_rate": 4.764474248445735e-05, "loss": 0.2793, "num_input_tokens_seen": 29808016, "step": 13815 }, { "epoch": 2.2544861337683524, "grad_norm": 0.7587845325469971, "learning_rate": 4.764172548159755e-05, "loss": 0.0612, "num_input_tokens_seen": 29818640, "step": 13820 }, { "epoch": 2.255301794453507, "grad_norm": 0.09891112148761749, "learning_rate": 4.7638706643284605e-05, "loss": 0.1374, "num_input_tokens_seen": 29829776, "step": 13825 }, { "epoch": 2.2561174551386625, "grad_norm": 0.09488624334335327, "learning_rate": 4.7635685969763225e-05, "loss": 0.0576, "num_input_tokens_seen": 29840304, "step": 13830 }, { "epoch": 2.2569331158238173, "grad_norm": 0.2625058889389038, "learning_rate": 4.763266346127829e-05, "loss": 0.3236, "num_input_tokens_seen": 29852048, "step": 13835 }, { "epoch": 2.257748776508972, "grad_norm": 4.30433464050293, "learning_rate": 4.7629639118074816e-05, "loss": 0.1341, "num_input_tokens_seen": 29861264, "step": 13840 }, { "epoch": 2.2585644371941274, "grad_norm": 0.10462737083435059, "learning_rate": 4.7626612940397976e-05, "loss": 0.2599, "num_input_tokens_seen": 29871984, "step": 13845 }, { "epoch": 2.259380097879282, "grad_norm": 0.9041357040405273, "learning_rate": 4.762358492849308e-05, "loss": 0.2253, "num_input_tokens_seen": 29882256, "step": 13850 }, { "epoch": 2.2601957585644374, "grad_norm": 3.5074238777160645, "learning_rate": 4.762055508260561e-05, "loss": 0.1098, "num_input_tokens_seen": 29891696, "step": 13855 }, { "epoch": 2.2610114192495923, "grad_norm": 1.1502227783203125, "learning_rate": 4.7617523402981155e-05, "loss": 0.0542, "num_input_tokens_seen": 29903856, "step": 13860 }, { "epoch": 2.261827079934747, "grad_norm": 0.7846003770828247, "learning_rate": 4.7614489889865506e-05, "loss": 0.0743, "num_input_tokens_seen": 29913520, "step": 13865 }, { "epoch": 2.262642740619902, "grad_norm": 3.0706188678741455, "learning_rate": 4.761145454350455e-05, "loss": 0.0275, "num_input_tokens_seen": 29924016, "step": 13870 }, { "epoch": 2.263458401305057, "grad_norm": 3.6503069400787354, "learning_rate": 4.760841736414437e-05, "loss": 0.2495, "num_input_tokens_seen": 29935120, "step": 13875 }, { "epoch": 2.264274061990212, "grad_norm": 4.099191188812256, "learning_rate": 4.760537835203116e-05, "loss": 0.1509, "num_input_tokens_seen": 29946288, "step": 13880 }, { "epoch": 2.2650897226753672, "grad_norm": 7.713541507720947, "learning_rate": 4.760233750741128e-05, "loss": 0.1661, "num_input_tokens_seen": 29957008, "step": 13885 }, { "epoch": 2.265905383360522, "grad_norm": 0.08547984808683395, "learning_rate": 4.7599294830531235e-05, "loss": 0.0592, "num_input_tokens_seen": 29966896, "step": 13890 }, { "epoch": 2.266721044045677, "grad_norm": 5.586185932159424, "learning_rate": 4.759625032163769e-05, "loss": 0.2013, "num_input_tokens_seen": 29976528, "step": 13895 }, { "epoch": 2.267536704730832, "grad_norm": 0.24295584857463837, "learning_rate": 4.7593203980977444e-05, "loss": 0.1003, "num_input_tokens_seen": 29987120, "step": 13900 }, { "epoch": 2.268352365415987, "grad_norm": 1.3970733880996704, "learning_rate": 4.759015580879744e-05, "loss": 0.0388, "num_input_tokens_seen": 29997680, "step": 13905 }, { "epoch": 2.2691680261011418, "grad_norm": 0.16488811373710632, "learning_rate": 4.758710580534479e-05, "loss": 0.2564, "num_input_tokens_seen": 30008784, "step": 13910 }, { "epoch": 2.269983686786297, "grad_norm": 0.07985883206129074, "learning_rate": 4.758405397086674e-05, "loss": 0.3888, "num_input_tokens_seen": 30018416, "step": 13915 }, { "epoch": 2.270799347471452, "grad_norm": 2.3947901725769043, "learning_rate": 4.758100030561068e-05, "loss": 0.111, "num_input_tokens_seen": 30030352, "step": 13920 }, { "epoch": 2.2716150081566067, "grad_norm": 0.1784278154373169, "learning_rate": 4.757794480982416e-05, "loss": 0.0863, "num_input_tokens_seen": 30041392, "step": 13925 }, { "epoch": 2.272430668841762, "grad_norm": 0.3512839674949646, "learning_rate": 4.757488748375487e-05, "loss": 0.0106, "num_input_tokens_seen": 30052240, "step": 13930 }, { "epoch": 2.2732463295269167, "grad_norm": 0.2251429557800293, "learning_rate": 4.757182832765067e-05, "loss": 0.2198, "num_input_tokens_seen": 30062192, "step": 13935 }, { "epoch": 2.274061990212072, "grad_norm": 1.2161368131637573, "learning_rate": 4.7568767341759526e-05, "loss": 0.0991, "num_input_tokens_seen": 30073808, "step": 13940 }, { "epoch": 2.274877650897227, "grad_norm": 0.9560844898223877, "learning_rate": 4.756570452632959e-05, "loss": 0.1643, "num_input_tokens_seen": 30084016, "step": 13945 }, { "epoch": 2.2756933115823816, "grad_norm": 3.6224071979522705, "learning_rate": 4.756263988160915e-05, "loss": 0.1163, "num_input_tokens_seen": 30095600, "step": 13950 }, { "epoch": 2.2765089722675365, "grad_norm": 2.437493085861206, "learning_rate": 4.755957340784664e-05, "loss": 0.0388, "num_input_tokens_seen": 30106800, "step": 13955 }, { "epoch": 2.2773246329526917, "grad_norm": 0.09992051124572754, "learning_rate": 4.755650510529064e-05, "loss": 0.0214, "num_input_tokens_seen": 30118288, "step": 13960 }, { "epoch": 2.2781402936378465, "grad_norm": 0.2428797483444214, "learning_rate": 4.755343497418989e-05, "loss": 0.0804, "num_input_tokens_seen": 30129552, "step": 13965 }, { "epoch": 2.278955954323002, "grad_norm": 0.4452281594276428, "learning_rate": 4.7550363014793264e-05, "loss": 0.1458, "num_input_tokens_seen": 30140720, "step": 13970 }, { "epoch": 2.2797716150081566, "grad_norm": 1.0804243087768555, "learning_rate": 4.754728922734979e-05, "loss": 0.0942, "num_input_tokens_seen": 30150640, "step": 13975 }, { "epoch": 2.2805872756933114, "grad_norm": 2.807777166366577, "learning_rate": 4.754421361210865e-05, "loss": 0.0585, "num_input_tokens_seen": 30160976, "step": 13980 }, { "epoch": 2.2814029363784667, "grad_norm": 6.057126045227051, "learning_rate": 4.7541136169319165e-05, "loss": 0.318, "num_input_tokens_seen": 30170576, "step": 13985 }, { "epoch": 2.2822185970636215, "grad_norm": 4.7831807136535645, "learning_rate": 4.7538056899230815e-05, "loss": 0.1246, "num_input_tokens_seen": 30181904, "step": 13990 }, { "epoch": 2.2830342577487763, "grad_norm": 0.5198666453361511, "learning_rate": 4.753497580209321e-05, "loss": 0.0424, "num_input_tokens_seen": 30193392, "step": 13995 }, { "epoch": 2.2838499184339316, "grad_norm": 0.29861927032470703, "learning_rate": 4.7531892878156125e-05, "loss": 0.0781, "num_input_tokens_seen": 30204528, "step": 14000 }, { "epoch": 2.2846655791190864, "grad_norm": 2.2439777851104736, "learning_rate": 4.752880812766948e-05, "loss": 0.162, "num_input_tokens_seen": 30215760, "step": 14005 }, { "epoch": 2.2854812398042412, "grad_norm": 0.21506105363368988, "learning_rate": 4.752572155088334e-05, "loss": 0.0174, "num_input_tokens_seen": 30226096, "step": 14010 }, { "epoch": 2.2862969004893965, "grad_norm": 0.14795491099357605, "learning_rate": 4.752263314804791e-05, "loss": 0.0904, "num_input_tokens_seen": 30236784, "step": 14015 }, { "epoch": 2.2871125611745513, "grad_norm": 0.24999599158763885, "learning_rate": 4.7519542919413566e-05, "loss": 0.1317, "num_input_tokens_seen": 30248432, "step": 14020 }, { "epoch": 2.2879282218597066, "grad_norm": 0.5208101868629456, "learning_rate": 4.751645086523081e-05, "loss": 0.1923, "num_input_tokens_seen": 30258448, "step": 14025 }, { "epoch": 2.2887438825448614, "grad_norm": 0.5559170842170715, "learning_rate": 4.75133569857503e-05, "loss": 0.1787, "num_input_tokens_seen": 30269424, "step": 14030 }, { "epoch": 2.289559543230016, "grad_norm": 0.18382087349891663, "learning_rate": 4.751026128122283e-05, "loss": 0.0508, "num_input_tokens_seen": 30280144, "step": 14035 }, { "epoch": 2.2903752039151715, "grad_norm": 0.3146985173225403, "learning_rate": 4.7507163751899374e-05, "loss": 0.0464, "num_input_tokens_seen": 30290064, "step": 14040 }, { "epoch": 2.2911908646003263, "grad_norm": 0.35729309916496277, "learning_rate": 4.750406439803102e-05, "loss": 0.2322, "num_input_tokens_seen": 30299984, "step": 14045 }, { "epoch": 2.292006525285481, "grad_norm": 0.6355782151222229, "learning_rate": 4.750096321986902e-05, "loss": 0.0146, "num_input_tokens_seen": 30311248, "step": 14050 }, { "epoch": 2.2928221859706364, "grad_norm": 0.1297059953212738, "learning_rate": 4.749786021766478e-05, "loss": 0.1262, "num_input_tokens_seen": 30322160, "step": 14055 }, { "epoch": 2.293637846655791, "grad_norm": 5.0604939460754395, "learning_rate": 4.749475539166983e-05, "loss": 0.1377, "num_input_tokens_seen": 30333616, "step": 14060 }, { "epoch": 2.294453507340946, "grad_norm": 7.509324073791504, "learning_rate": 4.749164874213588e-05, "loss": 0.0514, "num_input_tokens_seen": 30344112, "step": 14065 }, { "epoch": 2.2952691680261013, "grad_norm": 2.4683310985565186, "learning_rate": 4.7488540269314756e-05, "loss": 0.0391, "num_input_tokens_seen": 30354192, "step": 14070 }, { "epoch": 2.296084828711256, "grad_norm": 7.585381507873535, "learning_rate": 4.748542997345845e-05, "loss": 0.0866, "num_input_tokens_seen": 30365168, "step": 14075 }, { "epoch": 2.2969004893964113, "grad_norm": 2.6453585624694824, "learning_rate": 4.74823178548191e-05, "loss": 0.111, "num_input_tokens_seen": 30376464, "step": 14080 }, { "epoch": 2.297716150081566, "grad_norm": 0.7056415677070618, "learning_rate": 4.7479203913649e-05, "loss": 0.0497, "num_input_tokens_seen": 30387472, "step": 14085 }, { "epoch": 2.298531810766721, "grad_norm": 9.016286849975586, "learning_rate": 4.747608815020056e-05, "loss": 0.3355, "num_input_tokens_seen": 30398224, "step": 14090 }, { "epoch": 2.299347471451876, "grad_norm": 5.035734176635742, "learning_rate": 4.747297056472638e-05, "loss": 0.2694, "num_input_tokens_seen": 30409872, "step": 14095 }, { "epoch": 2.300163132137031, "grad_norm": 7.550227642059326, "learning_rate": 4.7469851157479177e-05, "loss": 0.1785, "num_input_tokens_seen": 30420144, "step": 14100 }, { "epoch": 2.300978792822186, "grad_norm": 1.597720742225647, "learning_rate": 4.746672992871183e-05, "loss": 0.0707, "num_input_tokens_seen": 30430704, "step": 14105 }, { "epoch": 2.301794453507341, "grad_norm": 0.23507060110569, "learning_rate": 4.746360687867736e-05, "loss": 0.1525, "num_input_tokens_seen": 30441328, "step": 14110 }, { "epoch": 2.302610114192496, "grad_norm": 0.27064698934555054, "learning_rate": 4.746048200762893e-05, "loss": 0.1133, "num_input_tokens_seen": 30452208, "step": 14115 }, { "epoch": 2.3034257748776508, "grad_norm": 0.8912692666053772, "learning_rate": 4.7457355315819874e-05, "loss": 0.1297, "num_input_tokens_seen": 30462960, "step": 14120 }, { "epoch": 2.304241435562806, "grad_norm": 2.2029247283935547, "learning_rate": 4.745422680350364e-05, "loss": 0.2764, "num_input_tokens_seen": 30473168, "step": 14125 }, { "epoch": 2.305057096247961, "grad_norm": 3.689709186553955, "learning_rate": 4.745109647093385e-05, "loss": 0.1368, "num_input_tokens_seen": 30483824, "step": 14130 }, { "epoch": 2.3058727569331157, "grad_norm": 1.900779366493225, "learning_rate": 4.744796431836428e-05, "loss": 0.1906, "num_input_tokens_seen": 30495632, "step": 14135 }, { "epoch": 2.306688417618271, "grad_norm": 2.435326337814331, "learning_rate": 4.7444830346048804e-05, "loss": 0.1039, "num_input_tokens_seen": 30505808, "step": 14140 }, { "epoch": 2.3075040783034257, "grad_norm": 4.353400230407715, "learning_rate": 4.744169455424151e-05, "loss": 0.0595, "num_input_tokens_seen": 30516368, "step": 14145 }, { "epoch": 2.3083197389885806, "grad_norm": 0.09357459843158722, "learning_rate": 4.7438556943196574e-05, "loss": 0.0658, "num_input_tokens_seen": 30527856, "step": 14150 }, { "epoch": 2.309135399673736, "grad_norm": 6.035260200500488, "learning_rate": 4.743541751316837e-05, "loss": 0.3764, "num_input_tokens_seen": 30537936, "step": 14155 }, { "epoch": 2.3099510603588906, "grad_norm": 3.8033721446990967, "learning_rate": 4.743227626441139e-05, "loss": 0.0865, "num_input_tokens_seen": 30548304, "step": 14160 }, { "epoch": 2.310766721044046, "grad_norm": 1.0218576192855835, "learning_rate": 4.7429133197180264e-05, "loss": 0.0135, "num_input_tokens_seen": 30559728, "step": 14165 }, { "epoch": 2.3115823817292007, "grad_norm": 0.36538276076316833, "learning_rate": 4.7425988311729805e-05, "loss": 0.0562, "num_input_tokens_seen": 30569840, "step": 14170 }, { "epoch": 2.3123980424143555, "grad_norm": 0.20794284343719482, "learning_rate": 4.742284160831494e-05, "loss": 0.1339, "num_input_tokens_seen": 30580400, "step": 14175 }, { "epoch": 2.3132137030995104, "grad_norm": 4.326881408691406, "learning_rate": 4.741969308719076e-05, "loss": 0.2695, "num_input_tokens_seen": 30594544, "step": 14180 }, { "epoch": 2.3140293637846656, "grad_norm": 0.6262915134429932, "learning_rate": 4.741654274861251e-05, "loss": 0.0165, "num_input_tokens_seen": 30604912, "step": 14185 }, { "epoch": 2.3148450244698204, "grad_norm": 0.602825403213501, "learning_rate": 4.741339059283556e-05, "loss": 0.02, "num_input_tokens_seen": 30616400, "step": 14190 }, { "epoch": 2.3156606851549757, "grad_norm": 0.15345075726509094, "learning_rate": 4.7410236620115444e-05, "loss": 0.0954, "num_input_tokens_seen": 30627824, "step": 14195 }, { "epoch": 2.3164763458401305, "grad_norm": 0.13649988174438477, "learning_rate": 4.740708083070784e-05, "loss": 0.0336, "num_input_tokens_seen": 30638320, "step": 14200 }, { "epoch": 2.3172920065252853, "grad_norm": 0.13931918144226074, "learning_rate": 4.740392322486858e-05, "loss": 0.2585, "num_input_tokens_seen": 30649744, "step": 14205 }, { "epoch": 2.3181076672104406, "grad_norm": 4.44880485534668, "learning_rate": 4.740076380285361e-05, "loss": 0.3618, "num_input_tokens_seen": 30661136, "step": 14210 }, { "epoch": 2.3189233278955954, "grad_norm": 0.4982512891292572, "learning_rate": 4.739760256491908e-05, "loss": 0.074, "num_input_tokens_seen": 30672368, "step": 14215 }, { "epoch": 2.3197389885807502, "grad_norm": 6.6637983322143555, "learning_rate": 4.7394439511321225e-05, "loss": 0.1824, "num_input_tokens_seen": 30684784, "step": 14220 }, { "epoch": 2.3205546492659055, "grad_norm": 0.8021751046180725, "learning_rate": 4.7391274642316485e-05, "loss": 0.0226, "num_input_tokens_seen": 30696432, "step": 14225 }, { "epoch": 2.3213703099510603, "grad_norm": 2.5145702362060547, "learning_rate": 4.7388107958161414e-05, "loss": 0.2304, "num_input_tokens_seen": 30706256, "step": 14230 }, { "epoch": 2.322185970636215, "grad_norm": 7.98170280456543, "learning_rate": 4.738493945911271e-05, "loss": 0.0551, "num_input_tokens_seen": 30716976, "step": 14235 }, { "epoch": 2.3230016313213704, "grad_norm": 3.667553663253784, "learning_rate": 4.738176914542723e-05, "loss": 0.2516, "num_input_tokens_seen": 30726768, "step": 14240 }, { "epoch": 2.323817292006525, "grad_norm": 7.572120666503906, "learning_rate": 4.737859701736199e-05, "loss": 0.1267, "num_input_tokens_seen": 30737072, "step": 14245 }, { "epoch": 2.3246329526916805, "grad_norm": 2.4298205375671387, "learning_rate": 4.737542307517413e-05, "loss": 0.1191, "num_input_tokens_seen": 30748400, "step": 14250 }, { "epoch": 2.3254486133768353, "grad_norm": 6.543609619140625, "learning_rate": 4.737224731912093e-05, "loss": 0.1612, "num_input_tokens_seen": 30760240, "step": 14255 }, { "epoch": 2.32626427406199, "grad_norm": 0.08930646628141403, "learning_rate": 4.736906974945986e-05, "loss": 0.1548, "num_input_tokens_seen": 30770928, "step": 14260 }, { "epoch": 2.3270799347471454, "grad_norm": 3.269556999206543, "learning_rate": 4.736589036644848e-05, "loss": 0.0759, "num_input_tokens_seen": 30780240, "step": 14265 }, { "epoch": 2.3278955954323, "grad_norm": 0.06753698736429214, "learning_rate": 4.736270917034456e-05, "loss": 0.0633, "num_input_tokens_seen": 30792144, "step": 14270 }, { "epoch": 2.328711256117455, "grad_norm": 3.205472946166992, "learning_rate": 4.735952616140597e-05, "loss": 0.1823, "num_input_tokens_seen": 30802416, "step": 14275 }, { "epoch": 2.3295269168026103, "grad_norm": 4.39502477645874, "learning_rate": 4.735634133989072e-05, "loss": 0.1622, "num_input_tokens_seen": 30813872, "step": 14280 }, { "epoch": 2.330342577487765, "grad_norm": 1.3082256317138672, "learning_rate": 4.735315470605702e-05, "loss": 0.0281, "num_input_tokens_seen": 30825584, "step": 14285 }, { "epoch": 2.33115823817292, "grad_norm": 2.325822591781616, "learning_rate": 4.734996626016317e-05, "loss": 0.0367, "num_input_tokens_seen": 30836688, "step": 14290 }, { "epoch": 2.331973898858075, "grad_norm": 0.07380348443984985, "learning_rate": 4.7346776002467664e-05, "loss": 0.1007, "num_input_tokens_seen": 30848816, "step": 14295 }, { "epoch": 2.33278955954323, "grad_norm": 3.9989590644836426, "learning_rate": 4.73435839332291e-05, "loss": 0.1435, "num_input_tokens_seen": 30858608, "step": 14300 }, { "epoch": 2.3336052202283852, "grad_norm": 0.07307717204093933, "learning_rate": 4.734039005270625e-05, "loss": 0.0699, "num_input_tokens_seen": 30868560, "step": 14305 }, { "epoch": 2.33442088091354, "grad_norm": 0.0705452486872673, "learning_rate": 4.733719436115804e-05, "loss": 0.0139, "num_input_tokens_seen": 30878192, "step": 14310 }, { "epoch": 2.335236541598695, "grad_norm": 0.11348162591457367, "learning_rate": 4.733399685884351e-05, "loss": 0.0747, "num_input_tokens_seen": 30888432, "step": 14315 }, { "epoch": 2.3360522022838497, "grad_norm": 0.11116781830787659, "learning_rate": 4.7330797546021876e-05, "loss": 0.0686, "num_input_tokens_seen": 30898864, "step": 14320 }, { "epoch": 2.336867862969005, "grad_norm": 3.1997568607330322, "learning_rate": 4.732759642295248e-05, "loss": 0.2044, "num_input_tokens_seen": 30909424, "step": 14325 }, { "epoch": 2.3376835236541598, "grad_norm": 3.342360258102417, "learning_rate": 4.732439348989484e-05, "loss": 0.1344, "num_input_tokens_seen": 30919600, "step": 14330 }, { "epoch": 2.338499184339315, "grad_norm": 0.05780024081468582, "learning_rate": 4.732118874710858e-05, "loss": 0.0248, "num_input_tokens_seen": 30930416, "step": 14335 }, { "epoch": 2.33931484502447, "grad_norm": 0.0849500522017479, "learning_rate": 4.731798219485351e-05, "loss": 0.0346, "num_input_tokens_seen": 30940560, "step": 14340 }, { "epoch": 2.3401305057096247, "grad_norm": 2.2444562911987305, "learning_rate": 4.7314773833389567e-05, "loss": 0.2389, "num_input_tokens_seen": 30951440, "step": 14345 }, { "epoch": 2.34094616639478, "grad_norm": 0.053432710468769073, "learning_rate": 4.731156366297682e-05, "loss": 0.2715, "num_input_tokens_seen": 30961936, "step": 14350 }, { "epoch": 2.3417618270799347, "grad_norm": 3.597527503967285, "learning_rate": 4.730835168387553e-05, "loss": 0.183, "num_input_tokens_seen": 30972112, "step": 14355 }, { "epoch": 2.3425774877650896, "grad_norm": 4.307960033416748, "learning_rate": 4.730513789634605e-05, "loss": 0.1867, "num_input_tokens_seen": 30984272, "step": 14360 }, { "epoch": 2.343393148450245, "grad_norm": 0.33503457903862, "learning_rate": 4.7301922300648926e-05, "loss": 0.2667, "num_input_tokens_seen": 30996368, "step": 14365 }, { "epoch": 2.3442088091353996, "grad_norm": 1.6205902099609375, "learning_rate": 4.729870489704481e-05, "loss": 0.2137, "num_input_tokens_seen": 31007216, "step": 14370 }, { "epoch": 2.3450244698205545, "grad_norm": 0.1872188299894333, "learning_rate": 4.729548568579454e-05, "loss": 0.0181, "num_input_tokens_seen": 31018416, "step": 14375 }, { "epoch": 2.3458401305057097, "grad_norm": 0.05943664163351059, "learning_rate": 4.729226466715907e-05, "loss": 0.0146, "num_input_tokens_seen": 31030416, "step": 14380 }, { "epoch": 2.3466557911908645, "grad_norm": 1.1938236951828003, "learning_rate": 4.728904184139952e-05, "loss": 0.041, "num_input_tokens_seen": 31040752, "step": 14385 }, { "epoch": 2.34747145187602, "grad_norm": 0.23707178235054016, "learning_rate": 4.728581720877715e-05, "loss": 0.0882, "num_input_tokens_seen": 31052048, "step": 14390 }, { "epoch": 2.3482871125611746, "grad_norm": 10.16082763671875, "learning_rate": 4.7282590769553346e-05, "loss": 0.1714, "num_input_tokens_seen": 31062320, "step": 14395 }, { "epoch": 2.3491027732463294, "grad_norm": 9.198515892028809, "learning_rate": 4.727936252398969e-05, "loss": 0.113, "num_input_tokens_seen": 31071856, "step": 14400 }, { "epoch": 2.3499184339314847, "grad_norm": 0.029299002140760422, "learning_rate": 4.727613247234785e-05, "loss": 0.0055, "num_input_tokens_seen": 31082096, "step": 14405 }, { "epoch": 2.3507340946166395, "grad_norm": 5.021510124206543, "learning_rate": 4.727290061488969e-05, "loss": 0.2366, "num_input_tokens_seen": 31092400, "step": 14410 }, { "epoch": 2.3515497553017943, "grad_norm": 0.1946427822113037, "learning_rate": 4.726966695187719e-05, "loss": 0.1824, "num_input_tokens_seen": 31102768, "step": 14415 }, { "epoch": 2.3523654159869496, "grad_norm": 5.19529914855957, "learning_rate": 4.7266431483572495e-05, "loss": 0.0701, "num_input_tokens_seen": 31113936, "step": 14420 }, { "epoch": 2.3531810766721044, "grad_norm": 0.08390124142169952, "learning_rate": 4.726319421023789e-05, "loss": 0.0937, "num_input_tokens_seen": 31123920, "step": 14425 }, { "epoch": 2.3539967373572592, "grad_norm": 4.709795951843262, "learning_rate": 4.725995513213579e-05, "loss": 0.3095, "num_input_tokens_seen": 31135472, "step": 14430 }, { "epoch": 2.3548123980424145, "grad_norm": 0.028088008984923363, "learning_rate": 4.725671424952879e-05, "loss": 0.1675, "num_input_tokens_seen": 31145552, "step": 14435 }, { "epoch": 2.3556280587275693, "grad_norm": 5.511946201324463, "learning_rate": 4.7253471562679594e-05, "loss": 0.1055, "num_input_tokens_seen": 31157680, "step": 14440 }, { "epoch": 2.356443719412724, "grad_norm": 0.23965363204479218, "learning_rate": 4.725022707185109e-05, "loss": 0.0938, "num_input_tokens_seen": 31168720, "step": 14445 }, { "epoch": 2.3572593800978794, "grad_norm": 5.716320514678955, "learning_rate": 4.724698077730628e-05, "loss": 0.4099, "num_input_tokens_seen": 31178832, "step": 14450 }, { "epoch": 2.358075040783034, "grad_norm": 5.578467845916748, "learning_rate": 4.7243732679308325e-05, "loss": 0.0624, "num_input_tokens_seen": 31190288, "step": 14455 }, { "epoch": 2.358890701468189, "grad_norm": 0.20651887357234955, "learning_rate": 4.724048277812054e-05, "loss": 0.1101, "num_input_tokens_seen": 31202128, "step": 14460 }, { "epoch": 2.3597063621533443, "grad_norm": 0.09927832335233688, "learning_rate": 4.7237231074006374e-05, "loss": 0.0596, "num_input_tokens_seen": 31213232, "step": 14465 }, { "epoch": 2.360522022838499, "grad_norm": 5.533904552459717, "learning_rate": 4.723397756722942e-05, "loss": 0.1979, "num_input_tokens_seen": 31224272, "step": 14470 }, { "epoch": 2.3613376835236544, "grad_norm": 0.3006957471370697, "learning_rate": 4.7230722258053434e-05, "loss": 0.0408, "num_input_tokens_seen": 31234064, "step": 14475 }, { "epoch": 2.362153344208809, "grad_norm": 0.05534852668642998, "learning_rate": 4.7227465146742304e-05, "loss": 0.4224, "num_input_tokens_seen": 31245776, "step": 14480 }, { "epoch": 2.362969004893964, "grad_norm": 0.07525574415922165, "learning_rate": 4.722420623356007e-05, "loss": 0.1208, "num_input_tokens_seen": 31257392, "step": 14485 }, { "epoch": 2.3637846655791193, "grad_norm": 3.0173566341400146, "learning_rate": 4.722094551877091e-05, "loss": 0.064, "num_input_tokens_seen": 31267408, "step": 14490 }, { "epoch": 2.364600326264274, "grad_norm": 0.47848212718963623, "learning_rate": 4.7217683002639165e-05, "loss": 0.1229, "num_input_tokens_seen": 31278960, "step": 14495 }, { "epoch": 2.365415986949429, "grad_norm": 2.3555610179901123, "learning_rate": 4.7214418685429295e-05, "loss": 0.323, "num_input_tokens_seen": 31289264, "step": 14500 }, { "epoch": 2.366231647634584, "grad_norm": 7.209322452545166, "learning_rate": 4.721115256740594e-05, "loss": 0.0825, "num_input_tokens_seen": 31299152, "step": 14505 }, { "epoch": 2.367047308319739, "grad_norm": 0.688676655292511, "learning_rate": 4.720788464883385e-05, "loss": 0.1016, "num_input_tokens_seen": 31308944, "step": 14510 }, { "epoch": 2.367862969004894, "grad_norm": 0.032087117433547974, "learning_rate": 4.720461492997796e-05, "loss": 0.0918, "num_input_tokens_seen": 31319824, "step": 14515 }, { "epoch": 2.368678629690049, "grad_norm": 0.8098523020744324, "learning_rate": 4.720134341110332e-05, "loss": 0.1364, "num_input_tokens_seen": 31332016, "step": 14520 }, { "epoch": 2.369494290375204, "grad_norm": 2.5464463233947754, "learning_rate": 4.719807009247513e-05, "loss": 0.2921, "num_input_tokens_seen": 31342512, "step": 14525 }, { "epoch": 2.370309951060359, "grad_norm": 0.1941344439983368, "learning_rate": 4.7194794974358744e-05, "loss": 0.0212, "num_input_tokens_seen": 31352528, "step": 14530 }, { "epoch": 2.371125611745514, "grad_norm": 0.8727250695228577, "learning_rate": 4.719151805701966e-05, "loss": 0.0416, "num_input_tokens_seen": 31361520, "step": 14535 }, { "epoch": 2.3719412724306688, "grad_norm": 0.5036548376083374, "learning_rate": 4.7188239340723526e-05, "loss": 0.1059, "num_input_tokens_seen": 31372304, "step": 14540 }, { "epoch": 2.3727569331158236, "grad_norm": 1.291709065437317, "learning_rate": 4.7184958825736135e-05, "loss": 0.1785, "num_input_tokens_seen": 31382960, "step": 14545 }, { "epoch": 2.373572593800979, "grad_norm": 5.894839763641357, "learning_rate": 4.718167651232341e-05, "loss": 0.2981, "num_input_tokens_seen": 31394128, "step": 14550 }, { "epoch": 2.3743882544861337, "grad_norm": 2.329805612564087, "learning_rate": 4.7178392400751433e-05, "loss": 0.0469, "num_input_tokens_seen": 31405072, "step": 14555 }, { "epoch": 2.375203915171289, "grad_norm": 0.2343021035194397, "learning_rate": 4.7175106491286446e-05, "loss": 0.0138, "num_input_tokens_seen": 31415280, "step": 14560 }, { "epoch": 2.3760195758564437, "grad_norm": 0.8916614651679993, "learning_rate": 4.717181878419481e-05, "loss": 0.0206, "num_input_tokens_seen": 31426384, "step": 14565 }, { "epoch": 2.3768352365415986, "grad_norm": 1.5694172382354736, "learning_rate": 4.7168529279743046e-05, "loss": 0.0934, "num_input_tokens_seen": 31436304, "step": 14570 }, { "epoch": 2.377650897226754, "grad_norm": 0.12804140150547028, "learning_rate": 4.716523797819781e-05, "loss": 0.1344, "num_input_tokens_seen": 31448240, "step": 14575 }, { "epoch": 2.3784665579119086, "grad_norm": 0.0611204132437706, "learning_rate": 4.716194487982592e-05, "loss": 0.2789, "num_input_tokens_seen": 31458928, "step": 14580 }, { "epoch": 2.3792822185970635, "grad_norm": 1.1616871356964111, "learning_rate": 4.715864998489433e-05, "loss": 0.116, "num_input_tokens_seen": 31470480, "step": 14585 }, { "epoch": 2.3800978792822187, "grad_norm": 0.12439309060573578, "learning_rate": 4.715535329367014e-05, "loss": 0.0303, "num_input_tokens_seen": 31481968, "step": 14590 }, { "epoch": 2.3809135399673735, "grad_norm": 4.315296173095703, "learning_rate": 4.71520548064206e-05, "loss": 0.1764, "num_input_tokens_seen": 31493360, "step": 14595 }, { "epoch": 2.3817292006525284, "grad_norm": 4.915417194366455, "learning_rate": 4.71487545234131e-05, "loss": 0.1525, "num_input_tokens_seen": 31505552, "step": 14600 }, { "epoch": 2.3825448613376836, "grad_norm": 0.05003701522946358, "learning_rate": 4.7145452444915175e-05, "loss": 0.0541, "num_input_tokens_seen": 31516336, "step": 14605 }, { "epoch": 2.3833605220228384, "grad_norm": 0.053478702902793884, "learning_rate": 4.71421485711945e-05, "loss": 0.0606, "num_input_tokens_seen": 31527152, "step": 14610 }, { "epoch": 2.3841761827079937, "grad_norm": 0.049502789974212646, "learning_rate": 4.713884290251892e-05, "loss": 0.0105, "num_input_tokens_seen": 31536400, "step": 14615 }, { "epoch": 2.3849918433931485, "grad_norm": 5.173823356628418, "learning_rate": 4.713553543915641e-05, "loss": 0.0732, "num_input_tokens_seen": 31547056, "step": 14620 }, { "epoch": 2.3858075040783033, "grad_norm": 0.07904334366321564, "learning_rate": 4.713222618137508e-05, "loss": 0.1324, "num_input_tokens_seen": 31557840, "step": 14625 }, { "epoch": 2.3866231647634586, "grad_norm": 0.037357017397880554, "learning_rate": 4.71289151294432e-05, "loss": 0.0076, "num_input_tokens_seen": 31569424, "step": 14630 }, { "epoch": 2.3874388254486134, "grad_norm": 0.529985249042511, "learning_rate": 4.7125602283629166e-05, "loss": 0.2955, "num_input_tokens_seen": 31579504, "step": 14635 }, { "epoch": 2.3882544861337682, "grad_norm": 1.5207878351211548, "learning_rate": 4.7122287644201556e-05, "loss": 0.2426, "num_input_tokens_seen": 31591120, "step": 14640 }, { "epoch": 2.3890701468189235, "grad_norm": 1.2961686849594116, "learning_rate": 4.711897121142906e-05, "loss": 0.031, "num_input_tokens_seen": 31601968, "step": 14645 }, { "epoch": 2.3898858075040783, "grad_norm": 0.14282628893852234, "learning_rate": 4.711565298558053e-05, "loss": 0.1925, "num_input_tokens_seen": 31612944, "step": 14650 }, { "epoch": 2.390701468189233, "grad_norm": 5.399940013885498, "learning_rate": 4.711233296692495e-05, "loss": 0.0322, "num_input_tokens_seen": 31623472, "step": 14655 }, { "epoch": 2.3915171288743884, "grad_norm": 1.1830055713653564, "learning_rate": 4.7109011155731475e-05, "loss": 0.1171, "num_input_tokens_seen": 31635248, "step": 14660 }, { "epoch": 2.392332789559543, "grad_norm": 1.982262134552002, "learning_rate": 4.710568755226936e-05, "loss": 0.1877, "num_input_tokens_seen": 31646320, "step": 14665 }, { "epoch": 2.393148450244698, "grad_norm": 0.13083043694496155, "learning_rate": 4.710236215680806e-05, "loss": 0.0632, "num_input_tokens_seen": 31658608, "step": 14670 }, { "epoch": 2.3939641109298533, "grad_norm": 5.85105037689209, "learning_rate": 4.709903496961713e-05, "loss": 0.1228, "num_input_tokens_seen": 31669680, "step": 14675 }, { "epoch": 2.394779771615008, "grad_norm": 2.7455968856811523, "learning_rate": 4.7095705990966306e-05, "loss": 0.2834, "num_input_tokens_seen": 31679696, "step": 14680 }, { "epoch": 2.395595432300163, "grad_norm": 9.03864860534668, "learning_rate": 4.709237522112543e-05, "loss": 0.1886, "num_input_tokens_seen": 31691472, "step": 14685 }, { "epoch": 2.396411092985318, "grad_norm": 2.2991671562194824, "learning_rate": 4.708904266036453e-05, "loss": 0.1675, "num_input_tokens_seen": 31701968, "step": 14690 }, { "epoch": 2.397226753670473, "grad_norm": 0.08642101287841797, "learning_rate": 4.7085708308953754e-05, "loss": 0.2329, "num_input_tokens_seen": 31713104, "step": 14695 }, { "epoch": 2.3980424143556283, "grad_norm": 0.17132538557052612, "learning_rate": 4.7082372167163394e-05, "loss": 0.1597, "num_input_tokens_seen": 31724560, "step": 14700 }, { "epoch": 2.398858075040783, "grad_norm": 0.12105961889028549, "learning_rate": 4.707903423526391e-05, "loss": 0.0614, "num_input_tokens_seen": 31733776, "step": 14705 }, { "epoch": 2.399673735725938, "grad_norm": 0.23157398402690887, "learning_rate": 4.707569451352588e-05, "loss": 0.0887, "num_input_tokens_seen": 31745680, "step": 14710 }, { "epoch": 2.400489396411093, "grad_norm": 3.026545763015747, "learning_rate": 4.707235300222004e-05, "loss": 0.1202, "num_input_tokens_seen": 31756240, "step": 14715 }, { "epoch": 2.401305057096248, "grad_norm": 0.14984041452407837, "learning_rate": 4.706900970161727e-05, "loss": 0.0343, "num_input_tokens_seen": 31767056, "step": 14720 }, { "epoch": 2.402120717781403, "grad_norm": 0.2534407377243042, "learning_rate": 4.7065664611988596e-05, "loss": 0.0474, "num_input_tokens_seen": 31777968, "step": 14725 }, { "epoch": 2.402936378466558, "grad_norm": 0.39624321460723877, "learning_rate": 4.7062317733605185e-05, "loss": 0.1078, "num_input_tokens_seen": 31788976, "step": 14730 }, { "epoch": 2.403752039151713, "grad_norm": 0.4739513099193573, "learning_rate": 4.705896906673837e-05, "loss": 0.0786, "num_input_tokens_seen": 31799280, "step": 14735 }, { "epoch": 2.4045676998368677, "grad_norm": 0.2549099922180176, "learning_rate": 4.705561861165959e-05, "loss": 0.1446, "num_input_tokens_seen": 31810288, "step": 14740 }, { "epoch": 2.405383360522023, "grad_norm": 0.5927517414093018, "learning_rate": 4.705226636864045e-05, "loss": 0.0974, "num_input_tokens_seen": 31820880, "step": 14745 }, { "epoch": 2.4061990212071778, "grad_norm": 0.13888344168663025, "learning_rate": 4.704891233795271e-05, "loss": 0.0182, "num_input_tokens_seen": 31830704, "step": 14750 }, { "epoch": 2.407014681892333, "grad_norm": 0.20728279650211334, "learning_rate": 4.704555651986826e-05, "loss": 0.0819, "num_input_tokens_seen": 31842608, "step": 14755 }, { "epoch": 2.407830342577488, "grad_norm": 0.04445790499448776, "learning_rate": 4.704219891465914e-05, "loss": 0.1251, "num_input_tokens_seen": 31853456, "step": 14760 }, { "epoch": 2.4086460032626427, "grad_norm": 0.18355217576026917, "learning_rate": 4.703883952259754e-05, "loss": 0.2662, "num_input_tokens_seen": 31864240, "step": 14765 }, { "epoch": 2.4094616639477975, "grad_norm": 0.319480299949646, "learning_rate": 4.7035478343955774e-05, "loss": 0.1982, "num_input_tokens_seen": 31874896, "step": 14770 }, { "epoch": 2.4102773246329527, "grad_norm": 3.1677727699279785, "learning_rate": 4.7032115379006337e-05, "loss": 0.0879, "num_input_tokens_seen": 31887120, "step": 14775 }, { "epoch": 2.4110929853181076, "grad_norm": 7.563157081604004, "learning_rate": 4.7028750628021834e-05, "loss": 0.1887, "num_input_tokens_seen": 31899216, "step": 14780 }, { "epoch": 2.411908646003263, "grad_norm": 5.556710720062256, "learning_rate": 4.702538409127503e-05, "loss": 0.086, "num_input_tokens_seen": 31911248, "step": 14785 }, { "epoch": 2.4127243066884176, "grad_norm": 0.4326038658618927, "learning_rate": 4.7022015769038844e-05, "loss": 0.0516, "num_input_tokens_seen": 31921264, "step": 14790 }, { "epoch": 2.4135399673735725, "grad_norm": 1.8095623254776, "learning_rate": 4.701864566158631e-05, "loss": 0.0194, "num_input_tokens_seen": 31932208, "step": 14795 }, { "epoch": 2.4143556280587277, "grad_norm": 4.630144119262695, "learning_rate": 4.701527376919064e-05, "loss": 0.3303, "num_input_tokens_seen": 31943792, "step": 14800 }, { "epoch": 2.4151712887438825, "grad_norm": 0.05155915766954422, "learning_rate": 4.701190009212518e-05, "loss": 0.2663, "num_input_tokens_seen": 31954128, "step": 14805 }, { "epoch": 2.4159869494290374, "grad_norm": 3.377645969390869, "learning_rate": 4.700852463066341e-05, "loss": 0.0917, "num_input_tokens_seen": 31964176, "step": 14810 }, { "epoch": 2.4168026101141926, "grad_norm": 0.655836284160614, "learning_rate": 4.7005147385078956e-05, "loss": 0.0666, "num_input_tokens_seen": 31975312, "step": 14815 }, { "epoch": 2.4176182707993474, "grad_norm": 0.5649428963661194, "learning_rate": 4.700176835564561e-05, "loss": 0.0829, "num_input_tokens_seen": 31985520, "step": 14820 }, { "epoch": 2.4184339314845023, "grad_norm": 0.06093466654419899, "learning_rate": 4.699838754263728e-05, "loss": 0.0071, "num_input_tokens_seen": 31996656, "step": 14825 }, { "epoch": 2.4192495921696575, "grad_norm": 1.8105956315994263, "learning_rate": 4.6995004946328035e-05, "loss": 0.0355, "num_input_tokens_seen": 32005680, "step": 14830 }, { "epoch": 2.4200652528548123, "grad_norm": 0.07185716181993484, "learning_rate": 4.699162056699209e-05, "loss": 0.1257, "num_input_tokens_seen": 32017232, "step": 14835 }, { "epoch": 2.4208809135399676, "grad_norm": 0.10656130313873291, "learning_rate": 4.698823440490381e-05, "loss": 0.182, "num_input_tokens_seen": 32028112, "step": 14840 }, { "epoch": 2.4216965742251224, "grad_norm": 2.879488945007324, "learning_rate": 4.6984846460337664e-05, "loss": 0.0894, "num_input_tokens_seen": 32038608, "step": 14845 }, { "epoch": 2.4225122349102772, "grad_norm": 0.4780980944633484, "learning_rate": 4.698145673356832e-05, "loss": 0.0985, "num_input_tokens_seen": 32048880, "step": 14850 }, { "epoch": 2.4233278955954325, "grad_norm": 3.4437508583068848, "learning_rate": 4.697806522487056e-05, "loss": 0.2392, "num_input_tokens_seen": 32058864, "step": 14855 }, { "epoch": 2.4241435562805873, "grad_norm": 0.07802551984786987, "learning_rate": 4.697467193451932e-05, "loss": 0.0387, "num_input_tokens_seen": 32068816, "step": 14860 }, { "epoch": 2.424959216965742, "grad_norm": 1.7365397214889526, "learning_rate": 4.6971276862789674e-05, "loss": 0.1649, "num_input_tokens_seen": 32080848, "step": 14865 }, { "epoch": 2.4257748776508974, "grad_norm": 5.480813980102539, "learning_rate": 4.6967880009956845e-05, "loss": 0.043, "num_input_tokens_seen": 32091824, "step": 14870 }, { "epoch": 2.426590538336052, "grad_norm": 2.316845178604126, "learning_rate": 4.69644813762962e-05, "loss": 0.0888, "num_input_tokens_seen": 32102448, "step": 14875 }, { "epoch": 2.427406199021207, "grad_norm": 5.749186038970947, "learning_rate": 4.696108096208324e-05, "loss": 0.0514, "num_input_tokens_seen": 32114000, "step": 14880 }, { "epoch": 2.4282218597063623, "grad_norm": 0.1210162416100502, "learning_rate": 4.695767876759363e-05, "loss": 0.08, "num_input_tokens_seen": 32126192, "step": 14885 }, { "epoch": 2.429037520391517, "grad_norm": 1.1517932415008545, "learning_rate": 4.695427479310317e-05, "loss": 0.0067, "num_input_tokens_seen": 32136880, "step": 14890 }, { "epoch": 2.429853181076672, "grad_norm": 3.0411510467529297, "learning_rate": 4.6950869038887804e-05, "loss": 0.2664, "num_input_tokens_seen": 32147408, "step": 14895 }, { "epoch": 2.430668841761827, "grad_norm": 0.1486297845840454, "learning_rate": 4.6947461505223614e-05, "loss": 0.2751, "num_input_tokens_seen": 32158416, "step": 14900 }, { "epoch": 2.431484502446982, "grad_norm": 3.4809763431549072, "learning_rate": 4.6944052192386836e-05, "loss": 0.2939, "num_input_tokens_seen": 32169264, "step": 14905 }, { "epoch": 2.432300163132137, "grad_norm": 0.273034930229187, "learning_rate": 4.6940641100653834e-05, "loss": 0.2019, "num_input_tokens_seen": 32181872, "step": 14910 }, { "epoch": 2.433115823817292, "grad_norm": 0.11415091156959534, "learning_rate": 4.693722823030114e-05, "loss": 0.348, "num_input_tokens_seen": 32192816, "step": 14915 }, { "epoch": 2.433931484502447, "grad_norm": 1.0659358501434326, "learning_rate": 4.693381358160543e-05, "loss": 0.0275, "num_input_tokens_seen": 32203952, "step": 14920 }, { "epoch": 2.434747145187602, "grad_norm": 0.3799728453159332, "learning_rate": 4.693039715484349e-05, "loss": 0.1396, "num_input_tokens_seen": 32215088, "step": 14925 }, { "epoch": 2.435562805872757, "grad_norm": 0.10501531511545181, "learning_rate": 4.692697895029229e-05, "loss": 0.028, "num_input_tokens_seen": 32226256, "step": 14930 }, { "epoch": 2.436378466557912, "grad_norm": 2.2539470195770264, "learning_rate": 4.6923558968228906e-05, "loss": 0.1439, "num_input_tokens_seen": 32238096, "step": 14935 }, { "epoch": 2.437194127243067, "grad_norm": 0.1651032269001007, "learning_rate": 4.692013720893061e-05, "loss": 0.0736, "num_input_tokens_seen": 32247440, "step": 14940 }, { "epoch": 2.438009787928222, "grad_norm": 0.1005806028842926, "learning_rate": 4.691671367267476e-05, "loss": 0.0151, "num_input_tokens_seen": 32257488, "step": 14945 }, { "epoch": 2.4388254486133767, "grad_norm": 0.12197540700435638, "learning_rate": 4.6913288359738895e-05, "loss": 0.0385, "num_input_tokens_seen": 32268816, "step": 14950 }, { "epoch": 2.439641109298532, "grad_norm": 1.650969386100769, "learning_rate": 4.690986127040069e-05, "loss": 0.1656, "num_input_tokens_seen": 32280048, "step": 14955 }, { "epoch": 2.4404567699836868, "grad_norm": 3.752460479736328, "learning_rate": 4.690643240493797e-05, "loss": 0.0589, "num_input_tokens_seen": 32289680, "step": 14960 }, { "epoch": 2.4412724306688416, "grad_norm": 0.1766907274723053, "learning_rate": 4.690300176362867e-05, "loss": 0.023, "num_input_tokens_seen": 32300656, "step": 14965 }, { "epoch": 2.442088091353997, "grad_norm": 11.739470481872559, "learning_rate": 4.6899569346750924e-05, "loss": 0.1824, "num_input_tokens_seen": 32311888, "step": 14970 }, { "epoch": 2.4429037520391517, "grad_norm": 1.436651349067688, "learning_rate": 4.689613515458297e-05, "loss": 0.2738, "num_input_tokens_seen": 32323248, "step": 14975 }, { "epoch": 2.443719412724307, "grad_norm": 0.15432757139205933, "learning_rate": 4.68926991874032e-05, "loss": 0.0313, "num_input_tokens_seen": 32333808, "step": 14980 }, { "epoch": 2.4445350734094617, "grad_norm": 5.070051670074463, "learning_rate": 4.688926144549015e-05, "loss": 0.2821, "num_input_tokens_seen": 32345648, "step": 14985 }, { "epoch": 2.4453507340946166, "grad_norm": 0.10778035968542099, "learning_rate": 4.6885821929122497e-05, "loss": 0.0694, "num_input_tokens_seen": 32356272, "step": 14990 }, { "epoch": 2.4461663947797714, "grad_norm": 0.10623596608638763, "learning_rate": 4.688238063857908e-05, "loss": 0.2396, "num_input_tokens_seen": 32366800, "step": 14995 }, { "epoch": 2.4469820554649266, "grad_norm": 0.05677817016839981, "learning_rate": 4.687893757413885e-05, "loss": 0.2477, "num_input_tokens_seen": 32377072, "step": 15000 }, { "epoch": 2.4477977161500815, "grad_norm": 2.137606382369995, "learning_rate": 4.6875492736080935e-05, "loss": 0.1913, "num_input_tokens_seen": 32386640, "step": 15005 }, { "epoch": 2.4486133768352367, "grad_norm": 5.929174900054932, "learning_rate": 4.687204612468458e-05, "loss": 0.0861, "num_input_tokens_seen": 32396496, "step": 15010 }, { "epoch": 2.4494290375203915, "grad_norm": 2.540224075317383, "learning_rate": 4.6868597740229186e-05, "loss": 0.1164, "num_input_tokens_seen": 32407824, "step": 15015 }, { "epoch": 2.4502446982055464, "grad_norm": 0.1759481132030487, "learning_rate": 4.68651475829943e-05, "loss": 0.1133, "num_input_tokens_seen": 32417968, "step": 15020 }, { "epoch": 2.4510603588907016, "grad_norm": 0.07518824189901352, "learning_rate": 4.686169565325961e-05, "loss": 0.2448, "num_input_tokens_seen": 32428240, "step": 15025 }, { "epoch": 2.4518760195758564, "grad_norm": 1.3536690473556519, "learning_rate": 4.685824195130495e-05, "loss": 0.0332, "num_input_tokens_seen": 32438704, "step": 15030 }, { "epoch": 2.4526916802610113, "grad_norm": 3.277383804321289, "learning_rate": 4.6854786477410286e-05, "loss": 0.1731, "num_input_tokens_seen": 32449776, "step": 15035 }, { "epoch": 2.4535073409461665, "grad_norm": 0.16884741187095642, "learning_rate": 4.6851329231855736e-05, "loss": 0.0281, "num_input_tokens_seen": 32460240, "step": 15040 }, { "epoch": 2.4543230016313213, "grad_norm": 0.06038467586040497, "learning_rate": 4.6847870214921566e-05, "loss": 0.0622, "num_input_tokens_seen": 32469808, "step": 15045 }, { "epoch": 2.455138662316476, "grad_norm": 4.148122310638428, "learning_rate": 4.6844409426888186e-05, "loss": 0.0775, "num_input_tokens_seen": 32481232, "step": 15050 }, { "epoch": 2.4559543230016314, "grad_norm": 4.650274753570557, "learning_rate": 4.684094686803614e-05, "loss": 0.2861, "num_input_tokens_seen": 32492272, "step": 15055 }, { "epoch": 2.4567699836867862, "grad_norm": 0.22571337223052979, "learning_rate": 4.683748253864612e-05, "loss": 0.1318, "num_input_tokens_seen": 32503312, "step": 15060 }, { "epoch": 2.4575856443719415, "grad_norm": 2.9444875717163086, "learning_rate": 4.6834016438998965e-05, "loss": 0.1337, "num_input_tokens_seen": 32513424, "step": 15065 }, { "epoch": 2.4584013050570963, "grad_norm": 3.9076199531555176, "learning_rate": 4.6830548569375645e-05, "loss": 0.1281, "num_input_tokens_seen": 32525264, "step": 15070 }, { "epoch": 2.459216965742251, "grad_norm": 7.194041728973389, "learning_rate": 4.68270789300573e-05, "loss": 0.196, "num_input_tokens_seen": 32536592, "step": 15075 }, { "epoch": 2.4600326264274064, "grad_norm": 0.21722622215747833, "learning_rate": 4.682360752132518e-05, "loss": 0.0695, "num_input_tokens_seen": 32547344, "step": 15080 }, { "epoch": 2.460848287112561, "grad_norm": 0.585285484790802, "learning_rate": 4.682013434346071e-05, "loss": 0.0129, "num_input_tokens_seen": 32558800, "step": 15085 }, { "epoch": 2.461663947797716, "grad_norm": 1.7635830640792847, "learning_rate": 4.6816659396745424e-05, "loss": 0.0484, "num_input_tokens_seen": 32569392, "step": 15090 }, { "epoch": 2.4624796084828713, "grad_norm": 0.07490061968564987, "learning_rate": 4.6813182681461044e-05, "loss": 0.054, "num_input_tokens_seen": 32579632, "step": 15095 }, { "epoch": 2.463295269168026, "grad_norm": 0.08279639482498169, "learning_rate": 4.680970419788939e-05, "loss": 0.066, "num_input_tokens_seen": 32590128, "step": 15100 }, { "epoch": 2.464110929853181, "grad_norm": 0.08523435890674591, "learning_rate": 4.6806223946312455e-05, "loss": 0.0923, "num_input_tokens_seen": 32600656, "step": 15105 }, { "epoch": 2.464926590538336, "grad_norm": 0.3098160922527313, "learning_rate": 4.6802741927012363e-05, "loss": 0.0975, "num_input_tokens_seen": 32610928, "step": 15110 }, { "epoch": 2.465742251223491, "grad_norm": 0.06355145573616028, "learning_rate": 4.679925814027138e-05, "loss": 0.101, "num_input_tokens_seen": 32621488, "step": 15115 }, { "epoch": 2.466557911908646, "grad_norm": 0.5660656690597534, "learning_rate": 4.6795772586371934e-05, "loss": 0.1325, "num_input_tokens_seen": 32633712, "step": 15120 }, { "epoch": 2.467373572593801, "grad_norm": 0.025278862565755844, "learning_rate": 4.679228526559656e-05, "loss": 0.1558, "num_input_tokens_seen": 32644560, "step": 15125 }, { "epoch": 2.468189233278956, "grad_norm": 0.09633134305477142, "learning_rate": 4.678879617822798e-05, "loss": 0.078, "num_input_tokens_seen": 32655888, "step": 15130 }, { "epoch": 2.4690048939641107, "grad_norm": 0.09229657053947449, "learning_rate": 4.6785305324549016e-05, "loss": 0.1135, "num_input_tokens_seen": 32666832, "step": 15135 }, { "epoch": 2.469820554649266, "grad_norm": 1.2514058351516724, "learning_rate": 4.678181270484267e-05, "loss": 0.0573, "num_input_tokens_seen": 32677616, "step": 15140 }, { "epoch": 2.470636215334421, "grad_norm": 8.427809715270996, "learning_rate": 4.677831831939207e-05, "loss": 0.1057, "num_input_tokens_seen": 32687696, "step": 15145 }, { "epoch": 2.471451876019576, "grad_norm": 3.933966636657715, "learning_rate": 4.6774822168480476e-05, "loss": 0.2268, "num_input_tokens_seen": 32698096, "step": 15150 }, { "epoch": 2.472267536704731, "grad_norm": 0.10171698033809662, "learning_rate": 4.677132425239132e-05, "loss": 0.089, "num_input_tokens_seen": 32707920, "step": 15155 }, { "epoch": 2.4730831973898857, "grad_norm": 3.6427295207977295, "learning_rate": 4.676782457140815e-05, "loss": 0.1902, "num_input_tokens_seen": 32719472, "step": 15160 }, { "epoch": 2.473898858075041, "grad_norm": 4.919198989868164, "learning_rate": 4.676432312581467e-05, "loss": 0.1909, "num_input_tokens_seen": 32730064, "step": 15165 }, { "epoch": 2.4747145187601958, "grad_norm": 0.2900311350822449, "learning_rate": 4.676081991589473e-05, "loss": 0.1988, "num_input_tokens_seen": 32740944, "step": 15170 }, { "epoch": 2.4755301794453506, "grad_norm": 0.059633657336235046, "learning_rate": 4.6757314941932315e-05, "loss": 0.0829, "num_input_tokens_seen": 32751856, "step": 15175 }, { "epoch": 2.476345840130506, "grad_norm": 2.0496883392333984, "learning_rate": 4.6753808204211554e-05, "loss": 0.2245, "num_input_tokens_seen": 32762928, "step": 15180 }, { "epoch": 2.4771615008156607, "grad_norm": 2.0595855712890625, "learning_rate": 4.675029970301672e-05, "loss": 0.0569, "num_input_tokens_seen": 32773840, "step": 15185 }, { "epoch": 2.4779771615008155, "grad_norm": 2.525069236755371, "learning_rate": 4.674678943863223e-05, "loss": 0.2212, "num_input_tokens_seen": 32786064, "step": 15190 }, { "epoch": 2.4787928221859707, "grad_norm": 3.346924304962158, "learning_rate": 4.674327741134266e-05, "loss": 0.1739, "num_input_tokens_seen": 32796816, "step": 15195 }, { "epoch": 2.4796084828711256, "grad_norm": 1.2025363445281982, "learning_rate": 4.673976362143269e-05, "loss": 0.2415, "num_input_tokens_seen": 32807088, "step": 15200 }, { "epoch": 2.480424143556281, "grad_norm": 6.3952412605285645, "learning_rate": 4.673624806918717e-05, "loss": 0.2665, "num_input_tokens_seen": 32817424, "step": 15205 }, { "epoch": 2.4812398042414356, "grad_norm": 1.9188134670257568, "learning_rate": 4.673273075489109e-05, "loss": 0.182, "num_input_tokens_seen": 32827984, "step": 15210 }, { "epoch": 2.4820554649265905, "grad_norm": 1.138280987739563, "learning_rate": 4.6729211678829595e-05, "loss": 0.1766, "num_input_tokens_seen": 32839664, "step": 15215 }, { "epoch": 2.4828711256117453, "grad_norm": 3.514547109603882, "learning_rate": 4.672569084128794e-05, "loss": 0.2367, "num_input_tokens_seen": 32851280, "step": 15220 }, { "epoch": 2.4836867862969005, "grad_norm": 0.11544863879680634, "learning_rate": 4.6722168242551554e-05, "loss": 0.1579, "num_input_tokens_seen": 32862224, "step": 15225 }, { "epoch": 2.4845024469820554, "grad_norm": 0.05716465413570404, "learning_rate": 4.671864388290599e-05, "loss": 0.0094, "num_input_tokens_seen": 32872816, "step": 15230 }, { "epoch": 2.4853181076672106, "grad_norm": 1.00725519657135, "learning_rate": 4.671511776263696e-05, "loss": 0.1438, "num_input_tokens_seen": 32882896, "step": 15235 }, { "epoch": 2.4861337683523654, "grad_norm": 3.159532308578491, "learning_rate": 4.67115898820303e-05, "loss": 0.077, "num_input_tokens_seen": 32894256, "step": 15240 }, { "epoch": 2.4869494290375203, "grad_norm": 0.07962616533041, "learning_rate": 4.6708060241372e-05, "loss": 0.0425, "num_input_tokens_seen": 32905392, "step": 15245 }, { "epoch": 2.4877650897226755, "grad_norm": 0.1137455627322197, "learning_rate": 4.670452884094819e-05, "loss": 0.1247, "num_input_tokens_seen": 32916592, "step": 15250 }, { "epoch": 2.4885807504078303, "grad_norm": 0.1146974116563797, "learning_rate": 4.6700995681045144e-05, "loss": 0.0115, "num_input_tokens_seen": 32927216, "step": 15255 }, { "epoch": 2.489396411092985, "grad_norm": 2.8338823318481445, "learning_rate": 4.669746076194928e-05, "loss": 0.046, "num_input_tokens_seen": 32937648, "step": 15260 }, { "epoch": 2.4902120717781404, "grad_norm": 0.07691849768161774, "learning_rate": 4.669392408394716e-05, "loss": 0.1563, "num_input_tokens_seen": 32948496, "step": 15265 }, { "epoch": 2.4910277324632952, "grad_norm": 2.104459524154663, "learning_rate": 4.669038564732548e-05, "loss": 0.1469, "num_input_tokens_seen": 32958832, "step": 15270 }, { "epoch": 2.49184339314845, "grad_norm": 6.215854167938232, "learning_rate": 4.668684545237107e-05, "loss": 0.1873, "num_input_tokens_seen": 32969456, "step": 15275 }, { "epoch": 2.4926590538336053, "grad_norm": 0.09748450666666031, "learning_rate": 4.668330349937093e-05, "loss": 0.0697, "num_input_tokens_seen": 32980496, "step": 15280 }, { "epoch": 2.49347471451876, "grad_norm": 0.21889671683311462, "learning_rate": 4.6679759788612205e-05, "loss": 0.1825, "num_input_tokens_seen": 32991888, "step": 15285 }, { "epoch": 2.4942903752039154, "grad_norm": 3.8613903522491455, "learning_rate": 4.667621432038214e-05, "loss": 0.0933, "num_input_tokens_seen": 33002672, "step": 15290 }, { "epoch": 2.49510603588907, "grad_norm": 1.3740220069885254, "learning_rate": 4.6672667094968156e-05, "loss": 0.0575, "num_input_tokens_seen": 33012720, "step": 15295 }, { "epoch": 2.495921696574225, "grad_norm": 0.1635286808013916, "learning_rate": 4.6669118112657814e-05, "loss": 0.1084, "num_input_tokens_seen": 33023760, "step": 15300 }, { "epoch": 2.4967373572593803, "grad_norm": 1.9827061891555786, "learning_rate": 4.666556737373881e-05, "loss": 0.1727, "num_input_tokens_seen": 33034192, "step": 15305 }, { "epoch": 2.497553017944535, "grad_norm": 7.943353176116943, "learning_rate": 4.666201487849898e-05, "loss": 0.2828, "num_input_tokens_seen": 33044080, "step": 15310 }, { "epoch": 2.49836867862969, "grad_norm": 0.5402096509933472, "learning_rate": 4.665846062722632e-05, "loss": 0.0179, "num_input_tokens_seen": 33055184, "step": 15315 }, { "epoch": 2.499184339314845, "grad_norm": 1.8655792474746704, "learning_rate": 4.665490462020895e-05, "loss": 0.1486, "num_input_tokens_seen": 33065712, "step": 15320 }, { "epoch": 2.5, "grad_norm": 2.123403787612915, "learning_rate": 4.665134685773513e-05, "loss": 0.1268, "num_input_tokens_seen": 33075856, "step": 15325 }, { "epoch": 2.5, "eval_loss": 0.15446700155735016, "eval_runtime": 132.912, "eval_samples_per_second": 20.502, "eval_steps_per_second": 5.131, "num_input_tokens_seen": 33075856, "step": 15325 }, { "epoch": 2.500815660685155, "grad_norm": 0.08432100713253021, "learning_rate": 4.664778734009327e-05, "loss": 0.0126, "num_input_tokens_seen": 33087312, "step": 15330 }, { "epoch": 2.50163132137031, "grad_norm": 0.07163143157958984, "learning_rate": 4.664422606757194e-05, "loss": 0.2113, "num_input_tokens_seen": 33098960, "step": 15335 }, { "epoch": 2.502446982055465, "grad_norm": 0.1372404843568802, "learning_rate": 4.664066304045982e-05, "loss": 0.1979, "num_input_tokens_seen": 33109712, "step": 15340 }, { "epoch": 2.50326264274062, "grad_norm": 2.33196759223938, "learning_rate": 4.6637098259045744e-05, "loss": 0.1085, "num_input_tokens_seen": 33121008, "step": 15345 }, { "epoch": 2.504078303425775, "grad_norm": 0.09625104069709778, "learning_rate": 4.66335317236187e-05, "loss": 0.1927, "num_input_tokens_seen": 33133360, "step": 15350 }, { "epoch": 2.50489396411093, "grad_norm": 0.8944844007492065, "learning_rate": 4.662996343446781e-05, "loss": 0.0355, "num_input_tokens_seen": 33144688, "step": 15355 }, { "epoch": 2.5057096247960846, "grad_norm": 0.08113139867782593, "learning_rate": 4.6626393391882326e-05, "loss": 0.1126, "num_input_tokens_seen": 33155024, "step": 15360 }, { "epoch": 2.50652528548124, "grad_norm": 0.12328080832958221, "learning_rate": 4.6622821596151676e-05, "loss": 0.3855, "num_input_tokens_seen": 33166576, "step": 15365 }, { "epoch": 2.5073409461663947, "grad_norm": 0.2539548873901367, "learning_rate": 4.6619248047565386e-05, "loss": 0.1121, "num_input_tokens_seen": 33176720, "step": 15370 }, { "epoch": 2.50815660685155, "grad_norm": 3.820850133895874, "learning_rate": 4.6615672746413156e-05, "loss": 0.1299, "num_input_tokens_seen": 33187824, "step": 15375 }, { "epoch": 2.5089722675367048, "grad_norm": 0.8509734869003296, "learning_rate": 4.661209569298482e-05, "loss": 0.0881, "num_input_tokens_seen": 33198800, "step": 15380 }, { "epoch": 2.5097879282218596, "grad_norm": 0.9104613661766052, "learning_rate": 4.660851688757034e-05, "loss": 0.1186, "num_input_tokens_seen": 33209232, "step": 15385 }, { "epoch": 2.5106035889070144, "grad_norm": 2.4266676902770996, "learning_rate": 4.6604936330459845e-05, "loss": 0.3417, "num_input_tokens_seen": 33219760, "step": 15390 }, { "epoch": 2.5114192495921697, "grad_norm": 0.26394614577293396, "learning_rate": 4.660135402194359e-05, "loss": 0.0792, "num_input_tokens_seen": 33229040, "step": 15395 }, { "epoch": 2.5122349102773245, "grad_norm": 0.27636078000068665, "learning_rate": 4.6597769962311975e-05, "loss": 0.0794, "num_input_tokens_seen": 33238960, "step": 15400 }, { "epoch": 2.5130505709624797, "grad_norm": 2.8311269283294678, "learning_rate": 4.6594184151855536e-05, "loss": 0.2758, "num_input_tokens_seen": 33247920, "step": 15405 }, { "epoch": 2.5138662316476346, "grad_norm": 0.316254198551178, "learning_rate": 4.6590596590864966e-05, "loss": 0.1256, "num_input_tokens_seen": 33259504, "step": 15410 }, { "epoch": 2.5146818923327894, "grad_norm": 0.11673296988010406, "learning_rate": 4.658700727963109e-05, "loss": 0.0366, "num_input_tokens_seen": 33269872, "step": 15415 }, { "epoch": 2.5154975530179446, "grad_norm": 0.36511024832725525, "learning_rate": 4.6583416218444866e-05, "loss": 0.0189, "num_input_tokens_seen": 33282192, "step": 15420 }, { "epoch": 2.5163132137030995, "grad_norm": 2.4341371059417725, "learning_rate": 4.657982340759741e-05, "loss": 0.0702, "num_input_tokens_seen": 33293648, "step": 15425 }, { "epoch": 2.5171288743882547, "grad_norm": 1.751753330230713, "learning_rate": 4.657622884737998e-05, "loss": 0.2745, "num_input_tokens_seen": 33304752, "step": 15430 }, { "epoch": 2.5179445350734095, "grad_norm": 2.4139862060546875, "learning_rate": 4.657263253808396e-05, "loss": 0.0779, "num_input_tokens_seen": 33315216, "step": 15435 }, { "epoch": 2.5187601957585644, "grad_norm": 6.911896228790283, "learning_rate": 4.6569034480000887e-05, "loss": 0.174, "num_input_tokens_seen": 33325392, "step": 15440 }, { "epoch": 2.519575856443719, "grad_norm": 0.10046017915010452, "learning_rate": 4.656543467342244e-05, "loss": 0.1558, "num_input_tokens_seen": 33336656, "step": 15445 }, { "epoch": 2.5203915171288744, "grad_norm": 0.28276556730270386, "learning_rate": 4.656183311864043e-05, "loss": 0.0927, "num_input_tokens_seen": 33347856, "step": 15450 }, { "epoch": 2.5212071778140293, "grad_norm": 1.1627421379089355, "learning_rate": 4.655822981594683e-05, "loss": 0.1814, "num_input_tokens_seen": 33358640, "step": 15455 }, { "epoch": 2.5220228384991845, "grad_norm": 0.24366597831249237, "learning_rate": 4.6554624765633734e-05, "loss": 0.1068, "num_input_tokens_seen": 33369200, "step": 15460 }, { "epoch": 2.5228384991843393, "grad_norm": 0.275344580411911, "learning_rate": 4.655101796799338e-05, "loss": 0.1087, "num_input_tokens_seen": 33378320, "step": 15465 }, { "epoch": 2.523654159869494, "grad_norm": 0.14054085314273834, "learning_rate": 4.654740942331818e-05, "loss": 0.1326, "num_input_tokens_seen": 33388976, "step": 15470 }, { "epoch": 2.5244698205546494, "grad_norm": 0.28822585940361023, "learning_rate": 4.6543799131900625e-05, "loss": 0.1237, "num_input_tokens_seen": 33399664, "step": 15475 }, { "epoch": 2.5252854812398042, "grad_norm": 4.053973197937012, "learning_rate": 4.6540187094033407e-05, "loss": 0.1205, "num_input_tokens_seen": 33409616, "step": 15480 }, { "epoch": 2.5261011419249595, "grad_norm": 0.2551969885826111, "learning_rate": 4.6536573310009326e-05, "loss": 0.014, "num_input_tokens_seen": 33420624, "step": 15485 }, { "epoch": 2.5269168026101143, "grad_norm": 2.315868616104126, "learning_rate": 4.653295778012134e-05, "loss": 0.111, "num_input_tokens_seen": 33430960, "step": 15490 }, { "epoch": 2.527732463295269, "grad_norm": 0.19548150897026062, "learning_rate": 4.652934050466254e-05, "loss": 0.0263, "num_input_tokens_seen": 33443344, "step": 15495 }, { "epoch": 2.528548123980424, "grad_norm": 0.32879120111465454, "learning_rate": 4.652572148392616e-05, "loss": 0.0985, "num_input_tokens_seen": 33454032, "step": 15500 }, { "epoch": 2.529363784665579, "grad_norm": 1.3231101036071777, "learning_rate": 4.652210071820557e-05, "loss": 0.09, "num_input_tokens_seen": 33465360, "step": 15505 }, { "epoch": 2.530179445350734, "grad_norm": 0.09188534319400787, "learning_rate": 4.6518478207794304e-05, "loss": 0.0437, "num_input_tokens_seen": 33476336, "step": 15510 }, { "epoch": 2.5309951060358893, "grad_norm": 0.04122030735015869, "learning_rate": 4.6514853952986e-05, "loss": 0.0115, "num_input_tokens_seen": 33486736, "step": 15515 }, { "epoch": 2.531810766721044, "grad_norm": 0.2291618436574936, "learning_rate": 4.6511227954074476e-05, "loss": 0.0937, "num_input_tokens_seen": 33497680, "step": 15520 }, { "epoch": 2.532626427406199, "grad_norm": 4.7056379318237305, "learning_rate": 4.650760021135366e-05, "loss": 0.1712, "num_input_tokens_seen": 33509456, "step": 15525 }, { "epoch": 2.5334420880913537, "grad_norm": 8.325273513793945, "learning_rate": 4.650397072511765e-05, "loss": 0.1599, "num_input_tokens_seen": 33521040, "step": 15530 }, { "epoch": 2.534257748776509, "grad_norm": 4.120744228363037, "learning_rate": 4.650033949566066e-05, "loss": 0.0692, "num_input_tokens_seen": 33531472, "step": 15535 }, { "epoch": 2.535073409461664, "grad_norm": 0.34146207571029663, "learning_rate": 4.6496706523277054e-05, "loss": 0.043, "num_input_tokens_seen": 33541552, "step": 15540 }, { "epoch": 2.535889070146819, "grad_norm": 0.21056298911571503, "learning_rate": 4.649307180826136e-05, "loss": 0.039, "num_input_tokens_seen": 33551920, "step": 15545 }, { "epoch": 2.536704730831974, "grad_norm": 1.2509381771087646, "learning_rate": 4.64894353509082e-05, "loss": 0.4235, "num_input_tokens_seen": 33562960, "step": 15550 }, { "epoch": 2.5375203915171287, "grad_norm": 0.04825669527053833, "learning_rate": 4.648579715151237e-05, "loss": 0.1027, "num_input_tokens_seen": 33572848, "step": 15555 }, { "epoch": 2.538336052202284, "grad_norm": 4.791048049926758, "learning_rate": 4.648215721036881e-05, "loss": 0.2568, "num_input_tokens_seen": 33582224, "step": 15560 }, { "epoch": 2.539151712887439, "grad_norm": 0.1789199411869049, "learning_rate": 4.647851552777258e-05, "loss": 0.0064, "num_input_tokens_seen": 33593680, "step": 15565 }, { "epoch": 2.539967373572594, "grad_norm": 0.9188748598098755, "learning_rate": 4.6474872104018907e-05, "loss": 0.1188, "num_input_tokens_seen": 33605616, "step": 15570 }, { "epoch": 2.540783034257749, "grad_norm": 0.8341172933578491, "learning_rate": 4.6471226939403145e-05, "loss": 0.2392, "num_input_tokens_seen": 33616752, "step": 15575 }, { "epoch": 2.5415986949429037, "grad_norm": 0.07152128964662552, "learning_rate": 4.646758003422077e-05, "loss": 0.1538, "num_input_tokens_seen": 33628048, "step": 15580 }, { "epoch": 2.5424143556280585, "grad_norm": 3.8207027912139893, "learning_rate": 4.646393138876745e-05, "loss": 0.0258, "num_input_tokens_seen": 33639280, "step": 15585 }, { "epoch": 2.5432300163132138, "grad_norm": 0.29015499353408813, "learning_rate": 4.6460281003338924e-05, "loss": 0.0836, "num_input_tokens_seen": 33649808, "step": 15590 }, { "epoch": 2.5440456769983686, "grad_norm": 0.10190565884113312, "learning_rate": 4.6456628878231144e-05, "loss": 0.298, "num_input_tokens_seen": 33658992, "step": 15595 }, { "epoch": 2.544861337683524, "grad_norm": 0.10488893836736679, "learning_rate": 4.645297501374015e-05, "loss": 0.0102, "num_input_tokens_seen": 33669904, "step": 15600 }, { "epoch": 2.5456769983686787, "grad_norm": 0.06030065566301346, "learning_rate": 4.644931941016216e-05, "loss": 0.1156, "num_input_tokens_seen": 33680080, "step": 15605 }, { "epoch": 2.5464926590538335, "grad_norm": 0.11721018701791763, "learning_rate": 4.644566206779349e-05, "loss": 0.1102, "num_input_tokens_seen": 33692208, "step": 15610 }, { "epoch": 2.5473083197389887, "grad_norm": 0.3074197471141815, "learning_rate": 4.6442002986930656e-05, "loss": 0.2263, "num_input_tokens_seen": 33703088, "step": 15615 }, { "epoch": 2.5481239804241436, "grad_norm": 0.06651334464550018, "learning_rate": 4.6438342167870255e-05, "loss": 0.1658, "num_input_tokens_seen": 33712944, "step": 15620 }, { "epoch": 2.5489396411092984, "grad_norm": 0.023425359278917313, "learning_rate": 4.643467961090906e-05, "loss": 0.0949, "num_input_tokens_seen": 33723408, "step": 15625 }, { "epoch": 2.5497553017944536, "grad_norm": 2.7293004989624023, "learning_rate": 4.643101531634399e-05, "loss": 0.1028, "num_input_tokens_seen": 33733488, "step": 15630 }, { "epoch": 2.5505709624796085, "grad_norm": 0.12186019122600555, "learning_rate": 4.642734928447207e-05, "loss": 0.1996, "num_input_tokens_seen": 33742640, "step": 15635 }, { "epoch": 2.5513866231647633, "grad_norm": 0.07401527464389801, "learning_rate": 4.642368151559049e-05, "loss": 0.2536, "num_input_tokens_seen": 33753584, "step": 15640 }, { "epoch": 2.5522022838499185, "grad_norm": 0.14932815730571747, "learning_rate": 4.642001200999659e-05, "loss": 0.0142, "num_input_tokens_seen": 33764656, "step": 15645 }, { "epoch": 2.5530179445350734, "grad_norm": 0.10367850959300995, "learning_rate": 4.6416340767987833e-05, "loss": 0.1579, "num_input_tokens_seen": 33775632, "step": 15650 }, { "epoch": 2.5538336052202286, "grad_norm": 0.04860873147845268, "learning_rate": 4.641266778986182e-05, "loss": 0.0717, "num_input_tokens_seen": 33787536, "step": 15655 }, { "epoch": 2.5546492659053834, "grad_norm": 2.193845272064209, "learning_rate": 4.640899307591632e-05, "loss": 0.1361, "num_input_tokens_seen": 33798736, "step": 15660 }, { "epoch": 2.5554649265905383, "grad_norm": 0.1617467850446701, "learning_rate": 4.64053166264492e-05, "loss": 0.0706, "num_input_tokens_seen": 33810256, "step": 15665 }, { "epoch": 2.556280587275693, "grad_norm": 0.3594755232334137, "learning_rate": 4.640163844175852e-05, "loss": 0.0497, "num_input_tokens_seen": 33820272, "step": 15670 }, { "epoch": 2.5570962479608483, "grad_norm": 0.4882911443710327, "learning_rate": 4.6397958522142426e-05, "loss": 0.1827, "num_input_tokens_seen": 33830448, "step": 15675 }, { "epoch": 2.557911908646003, "grad_norm": 3.7488725185394287, "learning_rate": 4.639427686789924e-05, "loss": 0.316, "num_input_tokens_seen": 33841808, "step": 15680 }, { "epoch": 2.5587275693311584, "grad_norm": 0.12173541635274887, "learning_rate": 4.6390593479327424e-05, "loss": 0.1763, "num_input_tokens_seen": 33851088, "step": 15685 }, { "epoch": 2.5595432300163132, "grad_norm": 1.2456576824188232, "learning_rate": 4.6386908356725564e-05, "loss": 0.0291, "num_input_tokens_seen": 33863600, "step": 15690 }, { "epoch": 2.560358890701468, "grad_norm": 0.0961676612496376, "learning_rate": 4.63832215003924e-05, "loss": 0.1181, "num_input_tokens_seen": 33873712, "step": 15695 }, { "epoch": 2.5611745513866233, "grad_norm": 1.2785987854003906, "learning_rate": 4.63795329106268e-05, "loss": 0.1444, "num_input_tokens_seen": 33884496, "step": 15700 }, { "epoch": 2.561990212071778, "grad_norm": 5.591950416564941, "learning_rate": 4.637584258772779e-05, "loss": 0.2803, "num_input_tokens_seen": 33896176, "step": 15705 }, { "epoch": 2.5628058727569334, "grad_norm": 0.16706594824790955, "learning_rate": 4.637215053199451e-05, "loss": 0.0251, "num_input_tokens_seen": 33907760, "step": 15710 }, { "epoch": 2.563621533442088, "grad_norm": 0.13217823207378387, "learning_rate": 4.6368456743726276e-05, "loss": 0.1172, "num_input_tokens_seen": 33918864, "step": 15715 }, { "epoch": 2.564437194127243, "grad_norm": 1.666069746017456, "learning_rate": 4.636476122322251e-05, "loss": 0.0872, "num_input_tokens_seen": 33929072, "step": 15720 }, { "epoch": 2.565252854812398, "grad_norm": 0.08260957896709442, "learning_rate": 4.636106397078279e-05, "loss": 0.1391, "num_input_tokens_seen": 33939440, "step": 15725 }, { "epoch": 2.566068515497553, "grad_norm": 4.464944839477539, "learning_rate": 4.635736498670685e-05, "loss": 0.0534, "num_input_tokens_seen": 33949168, "step": 15730 }, { "epoch": 2.566884176182708, "grad_norm": 0.10898555815219879, "learning_rate": 4.635366427129454e-05, "loss": 0.0321, "num_input_tokens_seen": 33958800, "step": 15735 }, { "epoch": 2.567699836867863, "grad_norm": 0.1415897011756897, "learning_rate": 4.634996182484584e-05, "loss": 0.1536, "num_input_tokens_seen": 33968144, "step": 15740 }, { "epoch": 2.568515497553018, "grad_norm": 0.38334110379219055, "learning_rate": 4.634625764766093e-05, "loss": 0.0839, "num_input_tokens_seen": 33979344, "step": 15745 }, { "epoch": 2.569331158238173, "grad_norm": 4.0980448722839355, "learning_rate": 4.6342551740040053e-05, "loss": 0.115, "num_input_tokens_seen": 33990160, "step": 15750 }, { "epoch": 2.5701468189233276, "grad_norm": 3.0779635906219482, "learning_rate": 4.633884410228364e-05, "loss": 0.1243, "num_input_tokens_seen": 34000336, "step": 15755 }, { "epoch": 2.570962479608483, "grad_norm": 0.6685082316398621, "learning_rate": 4.633513473469225e-05, "loss": 0.2121, "num_input_tokens_seen": 34011696, "step": 15760 }, { "epoch": 2.5717781402936377, "grad_norm": 0.2305363416671753, "learning_rate": 4.633142363756658e-05, "loss": 0.199, "num_input_tokens_seen": 34022256, "step": 15765 }, { "epoch": 2.572593800978793, "grad_norm": 3.24003529548645, "learning_rate": 4.6327710811207486e-05, "loss": 0.2613, "num_input_tokens_seen": 34032848, "step": 15770 }, { "epoch": 2.573409461663948, "grad_norm": 4.1800994873046875, "learning_rate": 4.6323996255915936e-05, "loss": 0.2036, "num_input_tokens_seen": 34043888, "step": 15775 }, { "epoch": 2.5742251223491026, "grad_norm": 2.6655170917510986, "learning_rate": 4.6320279971993055e-05, "loss": 0.2776, "num_input_tokens_seen": 34055760, "step": 15780 }, { "epoch": 2.575040783034258, "grad_norm": 0.7486165165901184, "learning_rate": 4.631656195974009e-05, "loss": 0.1037, "num_input_tokens_seen": 34067536, "step": 15785 }, { "epoch": 2.5758564437194127, "grad_norm": 0.48042935132980347, "learning_rate": 4.631284221945846e-05, "loss": 0.0574, "num_input_tokens_seen": 34078896, "step": 15790 }, { "epoch": 2.576672104404568, "grad_norm": 0.08358737826347351, "learning_rate": 4.6309120751449706e-05, "loss": 0.0085, "num_input_tokens_seen": 34089648, "step": 15795 }, { "epoch": 2.5774877650897228, "grad_norm": 0.076420359313488, "learning_rate": 4.63053975560155e-05, "loss": 0.0215, "num_input_tokens_seen": 34101392, "step": 15800 }, { "epoch": 2.5783034257748776, "grad_norm": 0.2541749179363251, "learning_rate": 4.630167263345766e-05, "loss": 0.0602, "num_input_tokens_seen": 34111184, "step": 15805 }, { "epoch": 2.5791190864600324, "grad_norm": 2.386906147003174, "learning_rate": 4.629794598407815e-05, "loss": 0.2211, "num_input_tokens_seen": 34121872, "step": 15810 }, { "epoch": 2.5799347471451877, "grad_norm": 0.09215260297060013, "learning_rate": 4.629421760817908e-05, "loss": 0.104, "num_input_tokens_seen": 34132912, "step": 15815 }, { "epoch": 2.5807504078303425, "grad_norm": 0.09753682464361191, "learning_rate": 4.6290487506062685e-05, "loss": 0.0566, "num_input_tokens_seen": 34143632, "step": 15820 }, { "epoch": 2.5815660685154977, "grad_norm": 0.12670865654945374, "learning_rate": 4.6286755678031344e-05, "loss": 0.0094, "num_input_tokens_seen": 34154928, "step": 15825 }, { "epoch": 2.5823817292006526, "grad_norm": 0.23903071880340576, "learning_rate": 4.628302212438758e-05, "loss": 0.0715, "num_input_tokens_seen": 34165168, "step": 15830 }, { "epoch": 2.5831973898858074, "grad_norm": 2.1843554973602295, "learning_rate": 4.627928684543406e-05, "loss": 0.135, "num_input_tokens_seen": 34176304, "step": 15835 }, { "epoch": 2.5840130505709626, "grad_norm": 0.27093032002449036, "learning_rate": 4.627554984147357e-05, "loss": 0.0337, "num_input_tokens_seen": 34186928, "step": 15840 }, { "epoch": 2.5848287112561175, "grad_norm": 5.873973369598389, "learning_rate": 4.627181111280906e-05, "loss": 0.1361, "num_input_tokens_seen": 34197712, "step": 15845 }, { "epoch": 2.5856443719412723, "grad_norm": 0.762215793132782, "learning_rate": 4.6268070659743605e-05, "loss": 0.0176, "num_input_tokens_seen": 34208144, "step": 15850 }, { "epoch": 2.5864600326264275, "grad_norm": 0.3678542375564575, "learning_rate": 4.626432848258044e-05, "loss": 0.1944, "num_input_tokens_seen": 34219504, "step": 15855 }, { "epoch": 2.5872756933115824, "grad_norm": 0.046675343066453934, "learning_rate": 4.62605845816229e-05, "loss": 0.2332, "num_input_tokens_seen": 34230032, "step": 15860 }, { "epoch": 2.588091353996737, "grad_norm": 2.1609106063842773, "learning_rate": 4.625683895717451e-05, "loss": 0.0379, "num_input_tokens_seen": 34240400, "step": 15865 }, { "epoch": 2.5889070146818924, "grad_norm": 1.1828417778015137, "learning_rate": 4.62530916095389e-05, "loss": 0.1004, "num_input_tokens_seen": 34252048, "step": 15870 }, { "epoch": 2.5897226753670473, "grad_norm": 0.25823017954826355, "learning_rate": 4.6249342539019844e-05, "loss": 0.0129, "num_input_tokens_seen": 34262832, "step": 15875 }, { "epoch": 2.5905383360522025, "grad_norm": 0.10200931131839752, "learning_rate": 4.6245591745921254e-05, "loss": 0.02, "num_input_tokens_seen": 34274512, "step": 15880 }, { "epoch": 2.5913539967373573, "grad_norm": 3.583845853805542, "learning_rate": 4.624183923054721e-05, "loss": 0.1516, "num_input_tokens_seen": 34284240, "step": 15885 }, { "epoch": 2.592169657422512, "grad_norm": 0.035426244139671326, "learning_rate": 4.623808499320189e-05, "loss": 0.1173, "num_input_tokens_seen": 34293744, "step": 15890 }, { "epoch": 2.592985318107667, "grad_norm": 2.031107187271118, "learning_rate": 4.623432903418965e-05, "loss": 0.0326, "num_input_tokens_seen": 34304144, "step": 15895 }, { "epoch": 2.5938009787928222, "grad_norm": 0.019137678667902946, "learning_rate": 4.6230571353814944e-05, "loss": 0.228, "num_input_tokens_seen": 34314992, "step": 15900 }, { "epoch": 2.594616639477977, "grad_norm": 10.572669982910156, "learning_rate": 4.622681195238241e-05, "loss": 0.202, "num_input_tokens_seen": 34326768, "step": 15905 }, { "epoch": 2.5954323001631323, "grad_norm": 0.7545855641365051, "learning_rate": 4.622305083019679e-05, "loss": 0.0891, "num_input_tokens_seen": 34336080, "step": 15910 }, { "epoch": 2.596247960848287, "grad_norm": 0.16171157360076904, "learning_rate": 4.621928798756299e-05, "loss": 0.0642, "num_input_tokens_seen": 34347568, "step": 15915 }, { "epoch": 2.597063621533442, "grad_norm": 5.608657360076904, "learning_rate": 4.621552342478604e-05, "loss": 0.3341, "num_input_tokens_seen": 34358864, "step": 15920 }, { "epoch": 2.597879282218597, "grad_norm": 0.5222229361534119, "learning_rate": 4.6211757142171105e-05, "loss": 0.0754, "num_input_tokens_seen": 34371024, "step": 15925 }, { "epoch": 2.598694942903752, "grad_norm": 0.0646638423204422, "learning_rate": 4.620798914002352e-05, "loss": 0.2143, "num_input_tokens_seen": 34381520, "step": 15930 }, { "epoch": 2.5995106035889073, "grad_norm": 0.09868259727954865, "learning_rate": 4.6204219418648724e-05, "loss": 0.0045, "num_input_tokens_seen": 34392880, "step": 15935 }, { "epoch": 2.600326264274062, "grad_norm": 6.142043590545654, "learning_rate": 4.6200447978352315e-05, "loss": 0.1834, "num_input_tokens_seen": 34404816, "step": 15940 }, { "epoch": 2.601141924959217, "grad_norm": 0.4567389190196991, "learning_rate": 4.6196674819440015e-05, "loss": 0.2938, "num_input_tokens_seen": 34414896, "step": 15945 }, { "epoch": 2.6019575856443717, "grad_norm": 6.175649166107178, "learning_rate": 4.619289994221771e-05, "loss": 0.2159, "num_input_tokens_seen": 34426480, "step": 15950 }, { "epoch": 2.602773246329527, "grad_norm": 0.21967223286628723, "learning_rate": 4.61891233469914e-05, "loss": 0.0323, "num_input_tokens_seen": 34437328, "step": 15955 }, { "epoch": 2.603588907014682, "grad_norm": 0.34079158306121826, "learning_rate": 4.618534503406724e-05, "loss": 0.1226, "num_input_tokens_seen": 34449584, "step": 15960 }, { "epoch": 2.604404567699837, "grad_norm": 1.7389894723892212, "learning_rate": 4.6181565003751525e-05, "loss": 0.1572, "num_input_tokens_seen": 34460592, "step": 15965 }, { "epoch": 2.605220228384992, "grad_norm": 0.2830871045589447, "learning_rate": 4.617778325635067e-05, "loss": 0.0254, "num_input_tokens_seen": 34472944, "step": 15970 }, { "epoch": 2.6060358890701467, "grad_norm": 0.07348176091909409, "learning_rate": 4.617399979217125e-05, "loss": 0.0096, "num_input_tokens_seen": 34483536, "step": 15975 }, { "epoch": 2.6068515497553015, "grad_norm": 0.17420241236686707, "learning_rate": 4.617021461151997e-05, "loss": 0.1893, "num_input_tokens_seen": 34495248, "step": 15980 }, { "epoch": 2.607667210440457, "grad_norm": 3.890345335006714, "learning_rate": 4.616642771470367e-05, "loss": 0.1241, "num_input_tokens_seen": 34506832, "step": 15985 }, { "epoch": 2.6084828711256116, "grad_norm": 0.051773801445961, "learning_rate": 4.616263910202936e-05, "loss": 0.0594, "num_input_tokens_seen": 34517168, "step": 15990 }, { "epoch": 2.609298531810767, "grad_norm": 0.1775064319372177, "learning_rate": 4.615884877380413e-05, "loss": 0.148, "num_input_tokens_seen": 34528272, "step": 15995 }, { "epoch": 2.6101141924959217, "grad_norm": 0.052289098501205444, "learning_rate": 4.6155056730335274e-05, "loss": 0.0061, "num_input_tokens_seen": 34537904, "step": 16000 }, { "epoch": 2.6109298531810765, "grad_norm": 0.22928167879581451, "learning_rate": 4.615126297193017e-05, "loss": 0.2099, "num_input_tokens_seen": 34548368, "step": 16005 }, { "epoch": 2.6117455138662318, "grad_norm": 0.6492549180984497, "learning_rate": 4.614746749889637e-05, "loss": 0.0688, "num_input_tokens_seen": 34559024, "step": 16010 }, { "epoch": 2.6125611745513866, "grad_norm": 0.8116114139556885, "learning_rate": 4.614367031154155e-05, "loss": 0.2135, "num_input_tokens_seen": 34570224, "step": 16015 }, { "epoch": 2.613376835236542, "grad_norm": 3.537290573120117, "learning_rate": 4.613987141017354e-05, "loss": 0.1358, "num_input_tokens_seen": 34581968, "step": 16020 }, { "epoch": 2.6141924959216967, "grad_norm": 0.11925935000181198, "learning_rate": 4.6136070795100285e-05, "loss": 0.1599, "num_input_tokens_seen": 34593392, "step": 16025 }, { "epoch": 2.6150081566068515, "grad_norm": 0.2562631070613861, "learning_rate": 4.613226846662989e-05, "loss": 0.0064, "num_input_tokens_seen": 34604176, "step": 16030 }, { "epoch": 2.6158238172920063, "grad_norm": 1.875343680381775, "learning_rate": 4.6128464425070595e-05, "loss": 0.0286, "num_input_tokens_seen": 34615024, "step": 16035 }, { "epoch": 2.6166394779771616, "grad_norm": 4.020516395568848, "learning_rate": 4.612465867073076e-05, "loss": 0.1692, "num_input_tokens_seen": 34626160, "step": 16040 }, { "epoch": 2.6174551386623164, "grad_norm": 0.5955748558044434, "learning_rate": 4.612085120391891e-05, "loss": 0.0345, "num_input_tokens_seen": 34635632, "step": 16045 }, { "epoch": 2.6182707993474716, "grad_norm": 3.323315143585205, "learning_rate": 4.61170420249437e-05, "loss": 0.0284, "num_input_tokens_seen": 34646288, "step": 16050 }, { "epoch": 2.6190864600326265, "grad_norm": 0.021180521696805954, "learning_rate": 4.611323113411391e-05, "loss": 0.3613, "num_input_tokens_seen": 34657232, "step": 16055 }, { "epoch": 2.6199021207177813, "grad_norm": 5.6233367919921875, "learning_rate": 4.610941853173848e-05, "loss": 0.2993, "num_input_tokens_seen": 34667408, "step": 16060 }, { "epoch": 2.6207177814029365, "grad_norm": 4.445132255554199, "learning_rate": 4.610560421812647e-05, "loss": 0.1756, "num_input_tokens_seen": 34677616, "step": 16065 }, { "epoch": 2.6215334420880914, "grad_norm": 0.05530526489019394, "learning_rate": 4.6101788193587103e-05, "loss": 0.0497, "num_input_tokens_seen": 34688688, "step": 16070 }, { "epoch": 2.622349102773246, "grad_norm": 0.7006406188011169, "learning_rate": 4.609797045842972e-05, "loss": 0.061, "num_input_tokens_seen": 34700272, "step": 16075 }, { "epoch": 2.6231647634584014, "grad_norm": 0.1208910420536995, "learning_rate": 4.609415101296379e-05, "loss": 0.3196, "num_input_tokens_seen": 34710832, "step": 16080 }, { "epoch": 2.6239804241435563, "grad_norm": 6.794556617736816, "learning_rate": 4.609032985749895e-05, "loss": 0.2261, "num_input_tokens_seen": 34722224, "step": 16085 }, { "epoch": 2.624796084828711, "grad_norm": 0.06990183889865875, "learning_rate": 4.6086506992344956e-05, "loss": 0.0761, "num_input_tokens_seen": 34733136, "step": 16090 }, { "epoch": 2.6256117455138663, "grad_norm": 0.11986415088176727, "learning_rate": 4.608268241781172e-05, "loss": 0.0538, "num_input_tokens_seen": 34743408, "step": 16095 }, { "epoch": 2.626427406199021, "grad_norm": 6.667908191680908, "learning_rate": 4.6078856134209284e-05, "loss": 0.3723, "num_input_tokens_seen": 34755184, "step": 16100 }, { "epoch": 2.6272430668841764, "grad_norm": 4.083271026611328, "learning_rate": 4.6075028141847795e-05, "loss": 0.0794, "num_input_tokens_seen": 34765040, "step": 16105 }, { "epoch": 2.6280587275693312, "grad_norm": 0.24231897294521332, "learning_rate": 4.607119844103761e-05, "loss": 0.0985, "num_input_tokens_seen": 34775312, "step": 16110 }, { "epoch": 2.628874388254486, "grad_norm": 1.1019506454467773, "learning_rate": 4.606736703208916e-05, "loss": 0.1174, "num_input_tokens_seen": 34784400, "step": 16115 }, { "epoch": 2.629690048939641, "grad_norm": 0.6285136342048645, "learning_rate": 4.606353391531304e-05, "loss": 0.0736, "num_input_tokens_seen": 34796080, "step": 16120 }, { "epoch": 2.630505709624796, "grad_norm": 1.0275976657867432, "learning_rate": 4.605969909101998e-05, "loss": 0.01, "num_input_tokens_seen": 34807280, "step": 16125 }, { "epoch": 2.631321370309951, "grad_norm": 0.47369199991226196, "learning_rate": 4.605586255952087e-05, "loss": 0.0611, "num_input_tokens_seen": 34818896, "step": 16130 }, { "epoch": 2.632137030995106, "grad_norm": 0.15059323608875275, "learning_rate": 4.6052024321126695e-05, "loss": 0.0192, "num_input_tokens_seen": 34830288, "step": 16135 }, { "epoch": 2.632952691680261, "grad_norm": 0.13787412643432617, "learning_rate": 4.6048184376148616e-05, "loss": 0.0845, "num_input_tokens_seen": 34840688, "step": 16140 }, { "epoch": 2.633768352365416, "grad_norm": 4.663808345794678, "learning_rate": 4.6044342724897915e-05, "loss": 0.0936, "num_input_tokens_seen": 34852848, "step": 16145 }, { "epoch": 2.634584013050571, "grad_norm": 3.8075475692749023, "learning_rate": 4.604049936768601e-05, "loss": 0.2947, "num_input_tokens_seen": 34863728, "step": 16150 }, { "epoch": 2.635399673735726, "grad_norm": 0.08273309469223022, "learning_rate": 4.6036654304824464e-05, "loss": 0.115, "num_input_tokens_seen": 34874672, "step": 16155 }, { "epoch": 2.636215334420881, "grad_norm": 1.2531658411026, "learning_rate": 4.603280753662499e-05, "loss": 0.0505, "num_input_tokens_seen": 34885904, "step": 16160 }, { "epoch": 2.637030995106036, "grad_norm": 2.630958080291748, "learning_rate": 4.602895906339941e-05, "loss": 0.1805, "num_input_tokens_seen": 34896144, "step": 16165 }, { "epoch": 2.637846655791191, "grad_norm": 3.709801197052002, "learning_rate": 4.6025108885459725e-05, "loss": 0.2926, "num_input_tokens_seen": 34906928, "step": 16170 }, { "epoch": 2.6386623164763456, "grad_norm": 0.6037567853927612, "learning_rate": 4.602125700311801e-05, "loss": 0.0163, "num_input_tokens_seen": 34917392, "step": 16175 }, { "epoch": 2.639477977161501, "grad_norm": 0.12990660965442657, "learning_rate": 4.6017403416686555e-05, "loss": 0.015, "num_input_tokens_seen": 34928240, "step": 16180 }, { "epoch": 2.6402936378466557, "grad_norm": 1.0022168159484863, "learning_rate": 4.601354812647774e-05, "loss": 0.0079, "num_input_tokens_seen": 34939824, "step": 16185 }, { "epoch": 2.641109298531811, "grad_norm": 0.272097110748291, "learning_rate": 4.600969113280409e-05, "loss": 0.0604, "num_input_tokens_seen": 34950704, "step": 16190 }, { "epoch": 2.641924959216966, "grad_norm": 0.08372607827186584, "learning_rate": 4.6005832435978266e-05, "loss": 0.1061, "num_input_tokens_seen": 34962224, "step": 16195 }, { "epoch": 2.6427406199021206, "grad_norm": 0.10326100140810013, "learning_rate": 4.600197203631309e-05, "loss": 0.0601, "num_input_tokens_seen": 34973936, "step": 16200 }, { "epoch": 2.6435562805872754, "grad_norm": 4.949693202972412, "learning_rate": 4.5998109934121494e-05, "loss": 0.0226, "num_input_tokens_seen": 34985296, "step": 16205 }, { "epoch": 2.6443719412724307, "grad_norm": 4.129422664642334, "learning_rate": 4.599424612971657e-05, "loss": 0.2191, "num_input_tokens_seen": 34996528, "step": 16210 }, { "epoch": 2.6451876019575855, "grad_norm": 3.720494508743286, "learning_rate": 4.599038062341153e-05, "loss": 0.1687, "num_input_tokens_seen": 35007056, "step": 16215 }, { "epoch": 2.6460032626427408, "grad_norm": 3.5929319858551025, "learning_rate": 4.598651341551973e-05, "loss": 0.066, "num_input_tokens_seen": 35017744, "step": 16220 }, { "epoch": 2.6468189233278956, "grad_norm": 3.253682851791382, "learning_rate": 4.5982644506354666e-05, "loss": 0.2127, "num_input_tokens_seen": 35029104, "step": 16225 }, { "epoch": 2.6476345840130504, "grad_norm": 0.07084818929433823, "learning_rate": 4.5978773896229977e-05, "loss": 0.1509, "num_input_tokens_seen": 35039632, "step": 16230 }, { "epoch": 2.6484502446982057, "grad_norm": 2.815587043762207, "learning_rate": 4.597490158545943e-05, "loss": 0.1168, "num_input_tokens_seen": 35050704, "step": 16235 }, { "epoch": 2.6492659053833605, "grad_norm": 0.06077788025140762, "learning_rate": 4.5971027574356926e-05, "loss": 0.0822, "num_input_tokens_seen": 35060688, "step": 16240 }, { "epoch": 2.6500815660685157, "grad_norm": 3.9549713134765625, "learning_rate": 4.5967151863236534e-05, "loss": 0.1066, "num_input_tokens_seen": 35071920, "step": 16245 }, { "epoch": 2.6508972267536706, "grad_norm": 0.061554085463285446, "learning_rate": 4.5963274452412416e-05, "loss": 0.0163, "num_input_tokens_seen": 35083056, "step": 16250 }, { "epoch": 2.6517128874388254, "grad_norm": 0.12308736890554428, "learning_rate": 4.595939534219891e-05, "loss": 0.0939, "num_input_tokens_seen": 35094320, "step": 16255 }, { "epoch": 2.65252854812398, "grad_norm": 0.061252955347299576, "learning_rate": 4.595551453291047e-05, "loss": 0.2009, "num_input_tokens_seen": 35104560, "step": 16260 }, { "epoch": 2.6533442088091355, "grad_norm": 5.05578088760376, "learning_rate": 4.5951632024861694e-05, "loss": 0.2439, "num_input_tokens_seen": 35115376, "step": 16265 }, { "epoch": 2.6541598694942903, "grad_norm": 0.19694742560386658, "learning_rate": 4.594774781836732e-05, "loss": 0.0649, "num_input_tokens_seen": 35126192, "step": 16270 }, { "epoch": 2.6549755301794455, "grad_norm": 0.9478245377540588, "learning_rate": 4.594386191374221e-05, "loss": 0.2022, "num_input_tokens_seen": 35137936, "step": 16275 }, { "epoch": 2.6557911908646004, "grad_norm": 0.3759821653366089, "learning_rate": 4.5939974311301406e-05, "loss": 0.1266, "num_input_tokens_seen": 35148560, "step": 16280 }, { "epoch": 2.656606851549755, "grad_norm": 0.5978142619132996, "learning_rate": 4.593608501136002e-05, "loss": 0.0913, "num_input_tokens_seen": 35159632, "step": 16285 }, { "epoch": 2.6574225122349104, "grad_norm": 0.06790251284837723, "learning_rate": 4.5932194014233356e-05, "loss": 0.1793, "num_input_tokens_seen": 35170000, "step": 16290 }, { "epoch": 2.6582381729200653, "grad_norm": 0.08304151892662048, "learning_rate": 4.592830132023684e-05, "loss": 0.0288, "num_input_tokens_seen": 35181360, "step": 16295 }, { "epoch": 2.65905383360522, "grad_norm": 0.13840016722679138, "learning_rate": 4.592440692968602e-05, "loss": 0.1375, "num_input_tokens_seen": 35191600, "step": 16300 }, { "epoch": 2.6598694942903753, "grad_norm": 0.1742582619190216, "learning_rate": 4.5920510842896624e-05, "loss": 0.015, "num_input_tokens_seen": 35201136, "step": 16305 }, { "epoch": 2.66068515497553, "grad_norm": 2.0974180698394775, "learning_rate": 4.591661306018446e-05, "loss": 0.0422, "num_input_tokens_seen": 35211600, "step": 16310 }, { "epoch": 2.661500815660685, "grad_norm": 0.11414952576160431, "learning_rate": 4.591271358186551e-05, "loss": 0.1467, "num_input_tokens_seen": 35222416, "step": 16315 }, { "epoch": 2.6623164763458402, "grad_norm": 0.6652671098709106, "learning_rate": 4.5908812408255884e-05, "loss": 0.1594, "num_input_tokens_seen": 35233904, "step": 16320 }, { "epoch": 2.663132137030995, "grad_norm": 0.1648857444524765, "learning_rate": 4.5904909539671836e-05, "loss": 0.1124, "num_input_tokens_seen": 35243984, "step": 16325 }, { "epoch": 2.6639477977161503, "grad_norm": 1.170156717300415, "learning_rate": 4.590100497642975e-05, "loss": 0.0634, "num_input_tokens_seen": 35255216, "step": 16330 }, { "epoch": 2.664763458401305, "grad_norm": 5.243762969970703, "learning_rate": 4.589709871884615e-05, "loss": 0.1903, "num_input_tokens_seen": 35265136, "step": 16335 }, { "epoch": 2.66557911908646, "grad_norm": 0.07611371576786041, "learning_rate": 4.58931907672377e-05, "loss": 0.1323, "num_input_tokens_seen": 35275600, "step": 16340 }, { "epoch": 2.6663947797716148, "grad_norm": 2.1288270950317383, "learning_rate": 4.588928112192119e-05, "loss": 0.1555, "num_input_tokens_seen": 35286928, "step": 16345 }, { "epoch": 2.66721044045677, "grad_norm": 1.1248494386672974, "learning_rate": 4.588536978321357e-05, "loss": 0.1184, "num_input_tokens_seen": 35299024, "step": 16350 }, { "epoch": 2.668026101141925, "grad_norm": 0.08379536122083664, "learning_rate": 4.588145675143189e-05, "loss": 0.0401, "num_input_tokens_seen": 35310320, "step": 16355 }, { "epoch": 2.66884176182708, "grad_norm": 0.08577514439821243, "learning_rate": 4.5877542026893395e-05, "loss": 0.0539, "num_input_tokens_seen": 35321104, "step": 16360 }, { "epoch": 2.669657422512235, "grad_norm": 5.488109588623047, "learning_rate": 4.5873625609915393e-05, "loss": 0.2237, "num_input_tokens_seen": 35332688, "step": 16365 }, { "epoch": 2.6704730831973897, "grad_norm": 0.062464434653520584, "learning_rate": 4.58697075008154e-05, "loss": 0.2095, "num_input_tokens_seen": 35342416, "step": 16370 }, { "epoch": 2.671288743882545, "grad_norm": 0.29279690980911255, "learning_rate": 4.586578769991102e-05, "loss": 0.1046, "num_input_tokens_seen": 35353616, "step": 16375 }, { "epoch": 2.6721044045677, "grad_norm": 0.9107227325439453, "learning_rate": 4.586186620752001e-05, "loss": 0.1074, "num_input_tokens_seen": 35364304, "step": 16380 }, { "epoch": 2.672920065252855, "grad_norm": 0.12089785933494568, "learning_rate": 4.585794302396028e-05, "loss": 0.1412, "num_input_tokens_seen": 35374992, "step": 16385 }, { "epoch": 2.67373572593801, "grad_norm": 3.918166160583496, "learning_rate": 4.585401814954986e-05, "loss": 0.2009, "num_input_tokens_seen": 35386544, "step": 16390 }, { "epoch": 2.6745513866231647, "grad_norm": 0.08335122466087341, "learning_rate": 4.5850091584606906e-05, "loss": 0.2132, "num_input_tokens_seen": 35397616, "step": 16395 }, { "epoch": 2.6753670473083195, "grad_norm": 0.2627863883972168, "learning_rate": 4.5846163329449745e-05, "loss": 0.2543, "num_input_tokens_seen": 35407408, "step": 16400 }, { "epoch": 2.676182707993475, "grad_norm": 1.354507565498352, "learning_rate": 4.584223338439681e-05, "loss": 0.0171, "num_input_tokens_seen": 35418512, "step": 16405 }, { "epoch": 2.6769983686786296, "grad_norm": 4.321389675140381, "learning_rate": 4.583830174976669e-05, "loss": 0.2351, "num_input_tokens_seen": 35429712, "step": 16410 }, { "epoch": 2.677814029363785, "grad_norm": 0.3772534728050232, "learning_rate": 4.5834368425878085e-05, "loss": 0.2127, "num_input_tokens_seen": 35440560, "step": 16415 }, { "epoch": 2.6786296900489397, "grad_norm": 0.24870356917381287, "learning_rate": 4.583043341304987e-05, "loss": 0.2236, "num_input_tokens_seen": 35451504, "step": 16420 }, { "epoch": 2.6794453507340945, "grad_norm": 1.4166717529296875, "learning_rate": 4.582649671160103e-05, "loss": 0.1816, "num_input_tokens_seen": 35462640, "step": 16425 }, { "epoch": 2.6802610114192493, "grad_norm": 0.1415262520313263, "learning_rate": 4.5822558321850696e-05, "loss": 0.1237, "num_input_tokens_seen": 35474160, "step": 16430 }, { "epoch": 2.6810766721044046, "grad_norm": 1.583168625831604, "learning_rate": 4.5818618244118126e-05, "loss": 0.1406, "num_input_tokens_seen": 35485200, "step": 16435 }, { "epoch": 2.6818923327895594, "grad_norm": 1.8406635522842407, "learning_rate": 4.581467647872273e-05, "loss": 0.4291, "num_input_tokens_seen": 35496176, "step": 16440 }, { "epoch": 2.6827079934747147, "grad_norm": 0.17736110091209412, "learning_rate": 4.5810733025984045e-05, "loss": 0.1222, "num_input_tokens_seen": 35506960, "step": 16445 }, { "epoch": 2.6835236541598695, "grad_norm": 2.8422389030456543, "learning_rate": 4.580678788622176e-05, "loss": 0.2197, "num_input_tokens_seen": 35517744, "step": 16450 }, { "epoch": 2.6843393148450243, "grad_norm": 3.310188055038452, "learning_rate": 4.580284105975566e-05, "loss": 0.2853, "num_input_tokens_seen": 35529680, "step": 16455 }, { "epoch": 2.6851549755301796, "grad_norm": 0.23108744621276855, "learning_rate": 4.5798892546905726e-05, "loss": 0.0904, "num_input_tokens_seen": 35540496, "step": 16460 }, { "epoch": 2.6859706362153344, "grad_norm": 4.71056604385376, "learning_rate": 4.579494234799202e-05, "loss": 0.1644, "num_input_tokens_seen": 35552176, "step": 16465 }, { "epoch": 2.6867862969004896, "grad_norm": 0.12348005175590515, "learning_rate": 4.579099046333477e-05, "loss": 0.0199, "num_input_tokens_seen": 35563440, "step": 16470 }, { "epoch": 2.6876019575856445, "grad_norm": 0.7517337799072266, "learning_rate": 4.5787036893254355e-05, "loss": 0.0169, "num_input_tokens_seen": 35574256, "step": 16475 }, { "epoch": 2.6884176182707993, "grad_norm": 1.4278310537338257, "learning_rate": 4.578308163807125e-05, "loss": 0.0544, "num_input_tokens_seen": 35584368, "step": 16480 }, { "epoch": 2.689233278955954, "grad_norm": 0.46861472725868225, "learning_rate": 4.577912469810609e-05, "loss": 0.0886, "num_input_tokens_seen": 35594896, "step": 16485 }, { "epoch": 2.6900489396411094, "grad_norm": 3.45877742767334, "learning_rate": 4.577516607367965e-05, "loss": 0.093, "num_input_tokens_seen": 35605488, "step": 16490 }, { "epoch": 2.690864600326264, "grad_norm": 3.337538242340088, "learning_rate": 4.577120576511285e-05, "loss": 0.087, "num_input_tokens_seen": 35617616, "step": 16495 }, { "epoch": 2.6916802610114194, "grad_norm": 0.3401094675064087, "learning_rate": 4.5767243772726706e-05, "loss": 0.2254, "num_input_tokens_seen": 35628304, "step": 16500 }, { "epoch": 2.6924959216965743, "grad_norm": 0.359669953584671, "learning_rate": 4.576328009684241e-05, "loss": 0.1823, "num_input_tokens_seen": 35640016, "step": 16505 }, { "epoch": 2.693311582381729, "grad_norm": 1.9160491228103638, "learning_rate": 4.5759314737781275e-05, "loss": 0.2197, "num_input_tokens_seen": 35651632, "step": 16510 }, { "epoch": 2.6941272430668843, "grad_norm": 0.5485318303108215, "learning_rate": 4.575534769586477e-05, "loss": 0.0122, "num_input_tokens_seen": 35661200, "step": 16515 }, { "epoch": 2.694942903752039, "grad_norm": 3.687964677810669, "learning_rate": 4.575137897141446e-05, "loss": 0.1268, "num_input_tokens_seen": 35671472, "step": 16520 }, { "epoch": 2.695758564437194, "grad_norm": 2.542691707611084, "learning_rate": 4.574740856475207e-05, "loss": 0.1762, "num_input_tokens_seen": 35682480, "step": 16525 }, { "epoch": 2.6965742251223492, "grad_norm": 5.13115930557251, "learning_rate": 4.574343647619949e-05, "loss": 0.3092, "num_input_tokens_seen": 35693168, "step": 16530 }, { "epoch": 2.697389885807504, "grad_norm": 0.35084402561187744, "learning_rate": 4.573946270607868e-05, "loss": 0.0771, "num_input_tokens_seen": 35704624, "step": 16535 }, { "epoch": 2.698205546492659, "grad_norm": 0.4560092091560364, "learning_rate": 4.573548725471181e-05, "loss": 0.0506, "num_input_tokens_seen": 35714768, "step": 16540 }, { "epoch": 2.699021207177814, "grad_norm": 0.17051959037780762, "learning_rate": 4.573151012242112e-05, "loss": 0.0698, "num_input_tokens_seen": 35724240, "step": 16545 }, { "epoch": 2.699836867862969, "grad_norm": 1.0198067426681519, "learning_rate": 4.5727531309529024e-05, "loss": 0.0676, "num_input_tokens_seen": 35735472, "step": 16550 }, { "epoch": 2.700652528548124, "grad_norm": 0.08668447285890579, "learning_rate": 4.5723550816358076e-05, "loss": 0.019, "num_input_tokens_seen": 35747376, "step": 16555 }, { "epoch": 2.701468189233279, "grad_norm": 0.316034197807312, "learning_rate": 4.571956864323095e-05, "loss": 0.0096, "num_input_tokens_seen": 35758960, "step": 16560 }, { "epoch": 2.702283849918434, "grad_norm": 3.647171974182129, "learning_rate": 4.571558479047046e-05, "loss": 0.1996, "num_input_tokens_seen": 35770512, "step": 16565 }, { "epoch": 2.7030995106035887, "grad_norm": 0.10150393843650818, "learning_rate": 4.571159925839956e-05, "loss": 0.1544, "num_input_tokens_seen": 35781424, "step": 16570 }, { "epoch": 2.703915171288744, "grad_norm": 4.908153533935547, "learning_rate": 4.570761204734133e-05, "loss": 0.1752, "num_input_tokens_seen": 35793200, "step": 16575 }, { "epoch": 2.7047308319738987, "grad_norm": 2.5196597576141357, "learning_rate": 4.5703623157619e-05, "loss": 0.0226, "num_input_tokens_seen": 35804880, "step": 16580 }, { "epoch": 2.705546492659054, "grad_norm": 0.11236696690320969, "learning_rate": 4.5699632589555924e-05, "loss": 0.2593, "num_input_tokens_seen": 35815376, "step": 16585 }, { "epoch": 2.706362153344209, "grad_norm": 2.4441187381744385, "learning_rate": 4.569564034347561e-05, "loss": 0.2319, "num_input_tokens_seen": 35825008, "step": 16590 }, { "epoch": 2.7071778140293636, "grad_norm": 2.784428596496582, "learning_rate": 4.5691646419701675e-05, "loss": 0.063, "num_input_tokens_seen": 35834384, "step": 16595 }, { "epoch": 2.707993474714519, "grad_norm": 4.784909725189209, "learning_rate": 4.5687650818557884e-05, "loss": 0.1562, "num_input_tokens_seen": 35844752, "step": 16600 }, { "epoch": 2.7088091353996737, "grad_norm": 0.10333660989999771, "learning_rate": 4.568365354036816e-05, "loss": 0.1813, "num_input_tokens_seen": 35855664, "step": 16605 }, { "epoch": 2.709624796084829, "grad_norm": 0.3954441249370575, "learning_rate": 4.567965458545653e-05, "loss": 0.0188, "num_input_tokens_seen": 35867472, "step": 16610 }, { "epoch": 2.710440456769984, "grad_norm": 0.10883083194494247, "learning_rate": 4.5675653954147174e-05, "loss": 0.1063, "num_input_tokens_seen": 35878544, "step": 16615 }, { "epoch": 2.7112561174551386, "grad_norm": 3.0300588607788086, "learning_rate": 4.56716516467644e-05, "loss": 0.1465, "num_input_tokens_seen": 35890704, "step": 16620 }, { "epoch": 2.7120717781402934, "grad_norm": 0.05359930172562599, "learning_rate": 4.5667647663632653e-05, "loss": 0.066, "num_input_tokens_seen": 35901424, "step": 16625 }, { "epoch": 2.7128874388254487, "grad_norm": 5.807322978973389, "learning_rate": 4.566364200507652e-05, "loss": 0.3546, "num_input_tokens_seen": 35912688, "step": 16630 }, { "epoch": 2.7137030995106035, "grad_norm": 2.9554502964019775, "learning_rate": 4.565963467142073e-05, "loss": 0.2036, "num_input_tokens_seen": 35924272, "step": 16635 }, { "epoch": 2.7145187601957588, "grad_norm": 0.1474665254354477, "learning_rate": 4.565562566299012e-05, "loss": 0.1732, "num_input_tokens_seen": 35935984, "step": 16640 }, { "epoch": 2.7153344208809136, "grad_norm": 1.2817227840423584, "learning_rate": 4.5651614980109684e-05, "loss": 0.2505, "num_input_tokens_seen": 35945840, "step": 16645 }, { "epoch": 2.7161500815660684, "grad_norm": 0.5527687072753906, "learning_rate": 4.564760262310456e-05, "loss": 0.116, "num_input_tokens_seen": 35956336, "step": 16650 }, { "epoch": 2.7169657422512232, "grad_norm": 0.08650974184274673, "learning_rate": 4.5643588592300004e-05, "loss": 0.0655, "num_input_tokens_seen": 35967664, "step": 16655 }, { "epoch": 2.7177814029363785, "grad_norm": 3.350778818130493, "learning_rate": 4.56395728880214e-05, "loss": 0.3088, "num_input_tokens_seen": 35978480, "step": 16660 }, { "epoch": 2.7185970636215333, "grad_norm": 0.14510881900787354, "learning_rate": 4.5635555510594304e-05, "loss": 0.1518, "num_input_tokens_seen": 35989040, "step": 16665 }, { "epoch": 2.7194127243066886, "grad_norm": 1.3831045627593994, "learning_rate": 4.563153646034437e-05, "loss": 0.1915, "num_input_tokens_seen": 35999792, "step": 16670 }, { "epoch": 2.7202283849918434, "grad_norm": 0.1926625370979309, "learning_rate": 4.5627515737597406e-05, "loss": 0.0127, "num_input_tokens_seen": 36009392, "step": 16675 }, { "epoch": 2.721044045676998, "grad_norm": 0.21117781102657318, "learning_rate": 4.562349334267936e-05, "loss": 0.1086, "num_input_tokens_seen": 36019952, "step": 16680 }, { "epoch": 2.7218597063621535, "grad_norm": 0.0967724546790123, "learning_rate": 4.5619469275916294e-05, "loss": 0.0757, "num_input_tokens_seen": 36031152, "step": 16685 }, { "epoch": 2.7226753670473083, "grad_norm": 1.1944202184677124, "learning_rate": 4.5615443537634425e-05, "loss": 0.1179, "num_input_tokens_seen": 36041808, "step": 16690 }, { "epoch": 2.7234910277324635, "grad_norm": 2.5175201892852783, "learning_rate": 4.561141612816011e-05, "loss": 0.1078, "num_input_tokens_seen": 36051248, "step": 16695 }, { "epoch": 2.7243066884176184, "grad_norm": 0.1099877655506134, "learning_rate": 4.560738704781982e-05, "loss": 0.0885, "num_input_tokens_seen": 36063024, "step": 16700 }, { "epoch": 2.725122349102773, "grad_norm": 0.34157779812812805, "learning_rate": 4.560335629694018e-05, "loss": 0.1709, "num_input_tokens_seen": 36074800, "step": 16705 }, { "epoch": 2.725938009787928, "grad_norm": 0.4502664804458618, "learning_rate": 4.559932387584792e-05, "loss": 0.0853, "num_input_tokens_seen": 36084784, "step": 16710 }, { "epoch": 2.7267536704730833, "grad_norm": 6.078094482421875, "learning_rate": 4.559528978486997e-05, "loss": 0.1225, "num_input_tokens_seen": 36095248, "step": 16715 }, { "epoch": 2.727569331158238, "grad_norm": 0.8107971549034119, "learning_rate": 4.5591254024333304e-05, "loss": 0.2715, "num_input_tokens_seen": 36106288, "step": 16720 }, { "epoch": 2.7283849918433933, "grad_norm": 0.6794746518135071, "learning_rate": 4.558721659456513e-05, "loss": 0.0457, "num_input_tokens_seen": 36116976, "step": 16725 }, { "epoch": 2.729200652528548, "grad_norm": 0.1730351448059082, "learning_rate": 4.558317749589271e-05, "loss": 0.1307, "num_input_tokens_seen": 36127504, "step": 16730 }, { "epoch": 2.730016313213703, "grad_norm": 0.06251662224531174, "learning_rate": 4.557913672864349e-05, "loss": 0.0141, "num_input_tokens_seen": 36137744, "step": 16735 }, { "epoch": 2.7308319738988582, "grad_norm": 1.2340346574783325, "learning_rate": 4.5575094293145025e-05, "loss": 0.1185, "num_input_tokens_seen": 36149328, "step": 16740 }, { "epoch": 2.731647634584013, "grad_norm": 0.1639896184206009, "learning_rate": 4.557105018972502e-05, "loss": 0.078, "num_input_tokens_seen": 36160688, "step": 16745 }, { "epoch": 2.732463295269168, "grad_norm": 0.26298972964286804, "learning_rate": 4.5567004418711314e-05, "loss": 0.0634, "num_input_tokens_seen": 36171056, "step": 16750 }, { "epoch": 2.733278955954323, "grad_norm": 0.09849293529987335, "learning_rate": 4.556295698043187e-05, "loss": 0.1476, "num_input_tokens_seen": 36181552, "step": 16755 }, { "epoch": 2.734094616639478, "grad_norm": 1.2345936298370361, "learning_rate": 4.55589078752148e-05, "loss": 0.0764, "num_input_tokens_seen": 36191472, "step": 16760 }, { "epoch": 2.7349102773246328, "grad_norm": 0.5437313914299011, "learning_rate": 4.5554857103388336e-05, "loss": 0.2333, "num_input_tokens_seen": 36204272, "step": 16765 }, { "epoch": 2.735725938009788, "grad_norm": 0.9578171372413635, "learning_rate": 4.555080466528087e-05, "loss": 0.19, "num_input_tokens_seen": 36216144, "step": 16770 }, { "epoch": 2.736541598694943, "grad_norm": 4.469488143920898, "learning_rate": 4.5546750561220896e-05, "loss": 0.0396, "num_input_tokens_seen": 36226672, "step": 16775 }, { "epoch": 2.737357259380098, "grad_norm": 6.144575119018555, "learning_rate": 4.554269479153708e-05, "loss": 0.2819, "num_input_tokens_seen": 36238448, "step": 16780 }, { "epoch": 2.738172920065253, "grad_norm": 0.42157989740371704, "learning_rate": 4.553863735655818e-05, "loss": 0.275, "num_input_tokens_seen": 36248400, "step": 16785 }, { "epoch": 2.7389885807504077, "grad_norm": 0.41651174426078796, "learning_rate": 4.553457825661313e-05, "loss": 0.0088, "num_input_tokens_seen": 36259824, "step": 16790 }, { "epoch": 2.7398042414355626, "grad_norm": 4.7950029373168945, "learning_rate": 4.553051749203097e-05, "loss": 0.0473, "num_input_tokens_seen": 36269168, "step": 16795 }, { "epoch": 2.740619902120718, "grad_norm": 1.291053056716919, "learning_rate": 4.5526455063140894e-05, "loss": 0.0389, "num_input_tokens_seen": 36278288, "step": 16800 }, { "epoch": 2.7414355628058726, "grad_norm": 0.9477439522743225, "learning_rate": 4.552239097027222e-05, "loss": 0.0774, "num_input_tokens_seen": 36289200, "step": 16805 }, { "epoch": 2.742251223491028, "grad_norm": 3.5565085411071777, "learning_rate": 4.551832521375441e-05, "loss": 0.1947, "num_input_tokens_seen": 36300112, "step": 16810 }, { "epoch": 2.7430668841761827, "grad_norm": 4.274855136871338, "learning_rate": 4.551425779391705e-05, "loss": 0.3973, "num_input_tokens_seen": 36309872, "step": 16815 }, { "epoch": 2.7438825448613375, "grad_norm": 0.06560499966144562, "learning_rate": 4.551018871108985e-05, "loss": 0.1101, "num_input_tokens_seen": 36320080, "step": 16820 }, { "epoch": 2.744698205546493, "grad_norm": 0.14905408024787903, "learning_rate": 4.55061179656027e-05, "loss": 0.2605, "num_input_tokens_seen": 36330448, "step": 16825 }, { "epoch": 2.7455138662316476, "grad_norm": 0.5720303654670715, "learning_rate": 4.550204555778558e-05, "loss": 0.2433, "num_input_tokens_seen": 36341008, "step": 16830 }, { "epoch": 2.746329526916803, "grad_norm": 0.10031141340732574, "learning_rate": 4.549797148796861e-05, "loss": 0.0955, "num_input_tokens_seen": 36351728, "step": 16835 }, { "epoch": 2.7471451876019577, "grad_norm": 0.16158132255077362, "learning_rate": 4.549389575648208e-05, "loss": 0.1842, "num_input_tokens_seen": 36362832, "step": 16840 }, { "epoch": 2.7479608482871125, "grad_norm": 6.217977523803711, "learning_rate": 4.548981836365636e-05, "loss": 0.1195, "num_input_tokens_seen": 36373520, "step": 16845 }, { "epoch": 2.7487765089722673, "grad_norm": 0.23625050485134125, "learning_rate": 4.5485739309822e-05, "loss": 0.0167, "num_input_tokens_seen": 36384912, "step": 16850 }, { "epoch": 2.7495921696574226, "grad_norm": 0.09042830765247345, "learning_rate": 4.548165859530968e-05, "loss": 0.0098, "num_input_tokens_seen": 36395760, "step": 16855 }, { "epoch": 2.7504078303425774, "grad_norm": 1.595046877861023, "learning_rate": 4.547757622045018e-05, "loss": 0.0277, "num_input_tokens_seen": 36406768, "step": 16860 }, { "epoch": 2.7512234910277327, "grad_norm": 0.9949770569801331, "learning_rate": 4.5473492185574465e-05, "loss": 0.0316, "num_input_tokens_seen": 36415728, "step": 16865 }, { "epoch": 2.7520391517128875, "grad_norm": 0.09952064603567123, "learning_rate": 4.546940649101358e-05, "loss": 0.3187, "num_input_tokens_seen": 36427728, "step": 16870 }, { "epoch": 2.7528548123980423, "grad_norm": 0.07224081456661224, "learning_rate": 4.546531913709874e-05, "loss": 0.2966, "num_input_tokens_seen": 36438640, "step": 16875 }, { "epoch": 2.753670473083197, "grad_norm": 10.022820472717285, "learning_rate": 4.5461230124161294e-05, "loss": 0.1469, "num_input_tokens_seen": 36448976, "step": 16880 }, { "epoch": 2.7544861337683524, "grad_norm": 0.37871983647346497, "learning_rate": 4.545713945253272e-05, "loss": 0.1956, "num_input_tokens_seen": 36458736, "step": 16885 }, { "epoch": 2.755301794453507, "grad_norm": 7.582609176635742, "learning_rate": 4.545304712254462e-05, "loss": 0.2638, "num_input_tokens_seen": 36468144, "step": 16890 }, { "epoch": 2.7561174551386625, "grad_norm": 0.13654311001300812, "learning_rate": 4.544895313452875e-05, "loss": 0.1795, "num_input_tokens_seen": 36478512, "step": 16895 }, { "epoch": 2.7569331158238173, "grad_norm": 3.9354288578033447, "learning_rate": 4.544485748881697e-05, "loss": 0.1404, "num_input_tokens_seen": 36490384, "step": 16900 }, { "epoch": 2.757748776508972, "grad_norm": 7.661076545715332, "learning_rate": 4.544076018574131e-05, "loss": 0.0703, "num_input_tokens_seen": 36500944, "step": 16905 }, { "epoch": 2.7585644371941274, "grad_norm": 1.766951322555542, "learning_rate": 4.5436661225633915e-05, "loss": 0.0863, "num_input_tokens_seen": 36511056, "step": 16910 }, { "epoch": 2.759380097879282, "grad_norm": 2.157850503921509, "learning_rate": 4.543256060882707e-05, "loss": 0.1478, "num_input_tokens_seen": 36522448, "step": 16915 }, { "epoch": 2.7601957585644374, "grad_norm": 0.15223313868045807, "learning_rate": 4.542845833565318e-05, "loss": 0.1927, "num_input_tokens_seen": 36533456, "step": 16920 }, { "epoch": 2.7610114192495923, "grad_norm": 0.4642239809036255, "learning_rate": 4.5424354406444815e-05, "loss": 0.1126, "num_input_tokens_seen": 36544656, "step": 16925 }, { "epoch": 2.761827079934747, "grad_norm": 3.68845272064209, "learning_rate": 4.542024882153464e-05, "loss": 0.1251, "num_input_tokens_seen": 36556144, "step": 16930 }, { "epoch": 2.762642740619902, "grad_norm": 0.17834614217281342, "learning_rate": 4.541614158125549e-05, "loss": 0.1239, "num_input_tokens_seen": 36566064, "step": 16935 }, { "epoch": 2.763458401305057, "grad_norm": 4.574592590332031, "learning_rate": 4.541203268594031e-05, "loss": 0.0395, "num_input_tokens_seen": 36576528, "step": 16940 }, { "epoch": 2.764274061990212, "grad_norm": 5.893138408660889, "learning_rate": 4.5407922135922194e-05, "loss": 0.1199, "num_input_tokens_seen": 36587856, "step": 16945 }, { "epoch": 2.7650897226753672, "grad_norm": 5.327587604522705, "learning_rate": 4.5403809931534355e-05, "loss": 0.2579, "num_input_tokens_seen": 36597936, "step": 16950 }, { "epoch": 2.765905383360522, "grad_norm": 0.3149024248123169, "learning_rate": 4.5399696073110166e-05, "loss": 0.0489, "num_input_tokens_seen": 36608688, "step": 16955 }, { "epoch": 2.766721044045677, "grad_norm": 1.3448281288146973, "learning_rate": 4.53955805609831e-05, "loss": 0.158, "num_input_tokens_seen": 36619824, "step": 16960 }, { "epoch": 2.767536704730832, "grad_norm": 3.0247461795806885, "learning_rate": 4.5391463395486784e-05, "loss": 0.1252, "num_input_tokens_seen": 36630960, "step": 16965 }, { "epoch": 2.768352365415987, "grad_norm": 0.11827442795038223, "learning_rate": 4.538734457695498e-05, "loss": 0.1118, "num_input_tokens_seen": 36640976, "step": 16970 }, { "epoch": 2.7691680261011418, "grad_norm": 0.21846012771129608, "learning_rate": 4.5383224105721586e-05, "loss": 0.2705, "num_input_tokens_seen": 36651792, "step": 16975 }, { "epoch": 2.769983686786297, "grad_norm": 3.764328718185425, "learning_rate": 4.537910198212061e-05, "loss": 0.0186, "num_input_tokens_seen": 36662896, "step": 16980 }, { "epoch": 2.770799347471452, "grad_norm": 4.463024616241455, "learning_rate": 4.537497820648624e-05, "loss": 0.1958, "num_input_tokens_seen": 36673040, "step": 16985 }, { "epoch": 2.7716150081566067, "grad_norm": 2.405142307281494, "learning_rate": 4.537085277915275e-05, "loss": 0.1598, "num_input_tokens_seen": 36684464, "step": 16990 }, { "epoch": 2.772430668841762, "grad_norm": 9.530285835266113, "learning_rate": 4.536672570045457e-05, "loss": 0.1109, "num_input_tokens_seen": 36696368, "step": 16995 }, { "epoch": 2.7732463295269167, "grad_norm": 0.11592987179756165, "learning_rate": 4.536259697072627e-05, "loss": 0.1465, "num_input_tokens_seen": 36707600, "step": 17000 }, { "epoch": 2.774061990212072, "grad_norm": 7.631938457489014, "learning_rate": 4.535846659030254e-05, "loss": 0.2427, "num_input_tokens_seen": 36718992, "step": 17005 }, { "epoch": 2.774877650897227, "grad_norm": 0.2692801356315613, "learning_rate": 4.5354334559518205e-05, "loss": 0.1457, "num_input_tokens_seen": 36729840, "step": 17010 }, { "epoch": 2.7756933115823816, "grad_norm": 0.17738287150859833, "learning_rate": 4.535020087870824e-05, "loss": 0.1265, "num_input_tokens_seen": 36738672, "step": 17015 }, { "epoch": 2.7765089722675365, "grad_norm": 2.427211284637451, "learning_rate": 4.5346065548207727e-05, "loss": 0.0315, "num_input_tokens_seen": 36749104, "step": 17020 }, { "epoch": 2.7773246329526917, "grad_norm": 0.10846050828695297, "learning_rate": 4.5341928568351915e-05, "loss": 0.112, "num_input_tokens_seen": 36760560, "step": 17025 }, { "epoch": 2.7781402936378465, "grad_norm": 0.5021817088127136, "learning_rate": 4.533778993947615e-05, "loss": 0.0255, "num_input_tokens_seen": 36771728, "step": 17030 }, { "epoch": 2.778955954323002, "grad_norm": 3.761016845703125, "learning_rate": 4.533364966191595e-05, "loss": 0.2266, "num_input_tokens_seen": 36783472, "step": 17035 }, { "epoch": 2.7797716150081566, "grad_norm": 0.07614640891551971, "learning_rate": 4.532950773600694e-05, "loss": 0.2203, "num_input_tokens_seen": 36794672, "step": 17040 }, { "epoch": 2.7805872756933114, "grad_norm": 0.1337439864873886, "learning_rate": 4.532536416208487e-05, "loss": 0.1636, "num_input_tokens_seen": 36805840, "step": 17045 }, { "epoch": 2.7814029363784667, "grad_norm": 0.40264835953712463, "learning_rate": 4.532121894048566e-05, "loss": 0.0561, "num_input_tokens_seen": 36816368, "step": 17050 }, { "epoch": 2.7822185970636215, "grad_norm": 3.4463798999786377, "learning_rate": 4.531707207154532e-05, "loss": 0.1587, "num_input_tokens_seen": 36827920, "step": 17055 }, { "epoch": 2.7830342577487768, "grad_norm": 4.410633087158203, "learning_rate": 4.531292355560004e-05, "loss": 0.3407, "num_input_tokens_seen": 36839312, "step": 17060 }, { "epoch": 2.7838499184339316, "grad_norm": 6.410935401916504, "learning_rate": 4.5308773392986115e-05, "loss": 0.1334, "num_input_tokens_seen": 36849264, "step": 17065 }, { "epoch": 2.7846655791190864, "grad_norm": 2.1469972133636475, "learning_rate": 4.530462158403996e-05, "loss": 0.2583, "num_input_tokens_seen": 36859664, "step": 17070 }, { "epoch": 2.7854812398042412, "grad_norm": 0.2848243713378906, "learning_rate": 4.5300468129098165e-05, "loss": 0.1154, "num_input_tokens_seen": 36869776, "step": 17075 }, { "epoch": 2.7862969004893965, "grad_norm": 1.9185163974761963, "learning_rate": 4.529631302849742e-05, "loss": 0.0949, "num_input_tokens_seen": 36880912, "step": 17080 }, { "epoch": 2.7871125611745513, "grad_norm": 3.354107141494751, "learning_rate": 4.529215628257455e-05, "loss": 0.2706, "num_input_tokens_seen": 36891056, "step": 17085 }, { "epoch": 2.7879282218597066, "grad_norm": 0.2763150632381439, "learning_rate": 4.528799789166654e-05, "loss": 0.1209, "num_input_tokens_seen": 36901456, "step": 17090 }, { "epoch": 2.7887438825448614, "grad_norm": 0.2002396583557129, "learning_rate": 4.5283837856110474e-05, "loss": 0.1359, "num_input_tokens_seen": 36912336, "step": 17095 }, { "epoch": 2.789559543230016, "grad_norm": 0.1000060886144638, "learning_rate": 4.5279676176243596e-05, "loss": 0.0315, "num_input_tokens_seen": 36921456, "step": 17100 }, { "epoch": 2.790375203915171, "grad_norm": 0.4621014893054962, "learning_rate": 4.527551285240327e-05, "loss": 0.0672, "num_input_tokens_seen": 36932272, "step": 17105 }, { "epoch": 2.7911908646003263, "grad_norm": 0.21004720032215118, "learning_rate": 4.527134788492698e-05, "loss": 0.0702, "num_input_tokens_seen": 36942640, "step": 17110 }, { "epoch": 2.792006525285481, "grad_norm": 2.599816083908081, "learning_rate": 4.526718127415239e-05, "loss": 0.3906, "num_input_tokens_seen": 36954192, "step": 17115 }, { "epoch": 2.7928221859706364, "grad_norm": 3.8914761543273926, "learning_rate": 4.5263013020417254e-05, "loss": 0.2187, "num_input_tokens_seen": 36964880, "step": 17120 }, { "epoch": 2.793637846655791, "grad_norm": 1.0694578886032104, "learning_rate": 4.5258843124059466e-05, "loss": 0.1012, "num_input_tokens_seen": 36976112, "step": 17125 }, { "epoch": 2.794453507340946, "grad_norm": 0.17898370325565338, "learning_rate": 4.5254671585417056e-05, "loss": 0.0571, "num_input_tokens_seen": 36985232, "step": 17130 }, { "epoch": 2.7952691680261013, "grad_norm": 0.751848578453064, "learning_rate": 4.52504984048282e-05, "loss": 0.0606, "num_input_tokens_seen": 36995376, "step": 17135 }, { "epoch": 2.796084828711256, "grad_norm": 1.4432446956634521, "learning_rate": 4.5246323582631196e-05, "loss": 0.0701, "num_input_tokens_seen": 37006576, "step": 17140 }, { "epoch": 2.7969004893964113, "grad_norm": 2.072270631790161, "learning_rate": 4.524214711916447e-05, "loss": 0.0836, "num_input_tokens_seen": 37017264, "step": 17145 }, { "epoch": 2.797716150081566, "grad_norm": 0.17519228160381317, "learning_rate": 4.523796901476659e-05, "loss": 0.0112, "num_input_tokens_seen": 37027472, "step": 17150 }, { "epoch": 2.798531810766721, "grad_norm": 0.1969202756881714, "learning_rate": 4.5233789269776264e-05, "loss": 0.0564, "num_input_tokens_seen": 37038768, "step": 17155 }, { "epoch": 2.799347471451876, "grad_norm": 0.6406100988388062, "learning_rate": 4.5229607884532306e-05, "loss": 0.0759, "num_input_tokens_seen": 37049424, "step": 17160 }, { "epoch": 2.800163132137031, "grad_norm": 7.187447547912598, "learning_rate": 4.522542485937369e-05, "loss": 0.3022, "num_input_tokens_seen": 37060912, "step": 17165 }, { "epoch": 2.800978792822186, "grad_norm": 2.4074254035949707, "learning_rate": 4.5221240194639514e-05, "loss": 0.1944, "num_input_tokens_seen": 37070032, "step": 17170 }, { "epoch": 2.801794453507341, "grad_norm": 9.948955535888672, "learning_rate": 4.5217053890669004e-05, "loss": 0.1661, "num_input_tokens_seen": 37079792, "step": 17175 }, { "epoch": 2.802610114192496, "grad_norm": 0.4232616722583771, "learning_rate": 4.521286594780152e-05, "loss": 0.0898, "num_input_tokens_seen": 37091344, "step": 17180 }, { "epoch": 2.8034257748776508, "grad_norm": 1.625488519668579, "learning_rate": 4.5208676366376574e-05, "loss": 0.0793, "num_input_tokens_seen": 37102800, "step": 17185 }, { "epoch": 2.804241435562806, "grad_norm": 0.9074686765670776, "learning_rate": 4.520448514673378e-05, "loss": 0.1862, "num_input_tokens_seen": 37114512, "step": 17190 }, { "epoch": 2.805057096247961, "grad_norm": 2.776407480239868, "learning_rate": 4.52002922892129e-05, "loss": 0.2865, "num_input_tokens_seen": 37125232, "step": 17195 }, { "epoch": 2.8058727569331157, "grad_norm": 2.4943649768829346, "learning_rate": 4.519609779415384e-05, "loss": 0.0979, "num_input_tokens_seen": 37136656, "step": 17200 }, { "epoch": 2.806688417618271, "grad_norm": 1.6606615781784058, "learning_rate": 4.519190166189661e-05, "loss": 0.1429, "num_input_tokens_seen": 37148112, "step": 17205 }, { "epoch": 2.8075040783034257, "grad_norm": 0.2653537392616272, "learning_rate": 4.518770389278138e-05, "loss": 0.0566, "num_input_tokens_seen": 37160208, "step": 17210 }, { "epoch": 2.8083197389885806, "grad_norm": 0.15048940479755402, "learning_rate": 4.5183504487148444e-05, "loss": 0.0767, "num_input_tokens_seen": 37170288, "step": 17215 }, { "epoch": 2.809135399673736, "grad_norm": 0.3598777651786804, "learning_rate": 4.517930344533822e-05, "loss": 0.0386, "num_input_tokens_seen": 37179408, "step": 17220 }, { "epoch": 2.8099510603588906, "grad_norm": 0.19925951957702637, "learning_rate": 4.517510076769128e-05, "loss": 0.1304, "num_input_tokens_seen": 37191472, "step": 17225 }, { "epoch": 2.810766721044046, "grad_norm": 0.2026115208864212, "learning_rate": 4.517089645454829e-05, "loss": 0.0549, "num_input_tokens_seen": 37200528, "step": 17230 }, { "epoch": 2.8115823817292007, "grad_norm": 0.1826828569173813, "learning_rate": 4.51666905062501e-05, "loss": 0.1201, "num_input_tokens_seen": 37210960, "step": 17235 }, { "epoch": 2.8123980424143555, "grad_norm": 0.0931023508310318, "learning_rate": 4.516248292313765e-05, "loss": 0.235, "num_input_tokens_seen": 37222192, "step": 17240 }, { "epoch": 2.8132137030995104, "grad_norm": 14.185015678405762, "learning_rate": 4.515827370555202e-05, "loss": 0.1758, "num_input_tokens_seen": 37234064, "step": 17245 }, { "epoch": 2.8140293637846656, "grad_norm": 0.6374616622924805, "learning_rate": 4.515406285383446e-05, "loss": 0.2708, "num_input_tokens_seen": 37245968, "step": 17250 }, { "epoch": 2.8148450244698204, "grad_norm": 3.457275867462158, "learning_rate": 4.514985036832628e-05, "loss": 0.2472, "num_input_tokens_seen": 37256432, "step": 17255 }, { "epoch": 2.8156606851549757, "grad_norm": 1.529738426208496, "learning_rate": 4.514563624936901e-05, "loss": 0.2225, "num_input_tokens_seen": 37267312, "step": 17260 }, { "epoch": 2.8164763458401305, "grad_norm": 0.34878918528556824, "learning_rate": 4.514142049730424e-05, "loss": 0.16, "num_input_tokens_seen": 37278256, "step": 17265 }, { "epoch": 2.8172920065252853, "grad_norm": 0.16346636414527893, "learning_rate": 4.513720311247374e-05, "loss": 0.095, "num_input_tokens_seen": 37289968, "step": 17270 }, { "epoch": 2.8181076672104406, "grad_norm": 0.13625754415988922, "learning_rate": 4.5132984095219364e-05, "loss": 0.071, "num_input_tokens_seen": 37301072, "step": 17275 }, { "epoch": 2.8189233278955954, "grad_norm": 4.504883289337158, "learning_rate": 4.512876344588315e-05, "loss": 0.2223, "num_input_tokens_seen": 37311664, "step": 17280 }, { "epoch": 2.8197389885807507, "grad_norm": 1.4539239406585693, "learning_rate": 4.512454116480724e-05, "loss": 0.0761, "num_input_tokens_seen": 37323120, "step": 17285 }, { "epoch": 2.8205546492659055, "grad_norm": 0.3599880635738373, "learning_rate": 4.512031725233391e-05, "loss": 0.0756, "num_input_tokens_seen": 37334672, "step": 17290 }, { "epoch": 2.8213703099510603, "grad_norm": 0.08150095492601395, "learning_rate": 4.5116091708805575e-05, "loss": 0.0938, "num_input_tokens_seen": 37346384, "step": 17295 }, { "epoch": 2.822185970636215, "grad_norm": 0.6873994469642639, "learning_rate": 4.5111864534564776e-05, "loss": 0.0132, "num_input_tokens_seen": 37358256, "step": 17300 }, { "epoch": 2.8230016313213704, "grad_norm": 5.348658084869385, "learning_rate": 4.510763572995419e-05, "loss": 0.1198, "num_input_tokens_seen": 37369968, "step": 17305 }, { "epoch": 2.823817292006525, "grad_norm": 0.5481806993484497, "learning_rate": 4.5103405295316634e-05, "loss": 0.2089, "num_input_tokens_seen": 37380432, "step": 17310 }, { "epoch": 2.8246329526916805, "grad_norm": 4.319050312042236, "learning_rate": 4.509917323099504e-05, "loss": 0.3818, "num_input_tokens_seen": 37390480, "step": 17315 }, { "epoch": 2.8254486133768353, "grad_norm": 0.35110506415367126, "learning_rate": 4.5094939537332475e-05, "loss": 0.1083, "num_input_tokens_seen": 37402608, "step": 17320 }, { "epoch": 2.82626427406199, "grad_norm": 0.6002646088600159, "learning_rate": 4.5090704214672155e-05, "loss": 0.0823, "num_input_tokens_seen": 37413936, "step": 17325 }, { "epoch": 2.827079934747145, "grad_norm": 0.06697563827037811, "learning_rate": 4.508646726335741e-05, "loss": 0.0499, "num_input_tokens_seen": 37426480, "step": 17330 }, { "epoch": 2.8278955954323, "grad_norm": 5.018412113189697, "learning_rate": 4.508222868373171e-05, "loss": 0.2247, "num_input_tokens_seen": 37438224, "step": 17335 }, { "epoch": 2.828711256117455, "grad_norm": 0.06362497806549072, "learning_rate": 4.507798847613866e-05, "loss": 0.0617, "num_input_tokens_seen": 37448176, "step": 17340 }, { "epoch": 2.8295269168026103, "grad_norm": 0.2531951367855072, "learning_rate": 4.507374664092199e-05, "loss": 0.1934, "num_input_tokens_seen": 37458672, "step": 17345 }, { "epoch": 2.830342577487765, "grad_norm": 0.12508904933929443, "learning_rate": 4.506950317842556e-05, "loss": 0.2338, "num_input_tokens_seen": 37469840, "step": 17350 }, { "epoch": 2.83115823817292, "grad_norm": 3.515825033187866, "learning_rate": 4.506525808899337e-05, "loss": 0.2209, "num_input_tokens_seen": 37479376, "step": 17355 }, { "epoch": 2.831973898858075, "grad_norm": 3.7540087699890137, "learning_rate": 4.506101137296955e-05, "loss": 0.1844, "num_input_tokens_seen": 37489392, "step": 17360 }, { "epoch": 2.83278955954323, "grad_norm": 1.872329831123352, "learning_rate": 4.505676303069837e-05, "loss": 0.1234, "num_input_tokens_seen": 37499504, "step": 17365 }, { "epoch": 2.8336052202283852, "grad_norm": 0.8481312990188599, "learning_rate": 4.5052513062524196e-05, "loss": 0.1183, "num_input_tokens_seen": 37510512, "step": 17370 }, { "epoch": 2.83442088091354, "grad_norm": 2.0243680477142334, "learning_rate": 4.504826146879158e-05, "loss": 0.1912, "num_input_tokens_seen": 37521424, "step": 17375 }, { "epoch": 2.835236541598695, "grad_norm": 2.5130465030670166, "learning_rate": 4.504400824984516e-05, "loss": 0.267, "num_input_tokens_seen": 37532976, "step": 17380 }, { "epoch": 2.8360522022838497, "grad_norm": 1.1350537538528442, "learning_rate": 4.503975340602973e-05, "loss": 0.2528, "num_input_tokens_seen": 37543504, "step": 17385 }, { "epoch": 2.836867862969005, "grad_norm": 0.19420993328094482, "learning_rate": 4.50354969376902e-05, "loss": 0.143, "num_input_tokens_seen": 37554000, "step": 17390 }, { "epoch": 2.8376835236541598, "grad_norm": 3.076580762863159, "learning_rate": 4.5031238845171644e-05, "loss": 0.1128, "num_input_tokens_seen": 37565360, "step": 17395 }, { "epoch": 2.838499184339315, "grad_norm": 0.17554321885108948, "learning_rate": 4.502697912881923e-05, "loss": 0.2298, "num_input_tokens_seen": 37576368, "step": 17400 }, { "epoch": 2.83931484502447, "grad_norm": 0.1670863777399063, "learning_rate": 4.502271778897825e-05, "loss": 0.1485, "num_input_tokens_seen": 37587664, "step": 17405 }, { "epoch": 2.8401305057096247, "grad_norm": 0.5998808741569519, "learning_rate": 4.50184548259942e-05, "loss": 0.2266, "num_input_tokens_seen": 37598256, "step": 17410 }, { "epoch": 2.84094616639478, "grad_norm": 0.2259002923965454, "learning_rate": 4.501419024021261e-05, "loss": 0.1715, "num_input_tokens_seen": 37608976, "step": 17415 }, { "epoch": 2.8417618270799347, "grad_norm": 1.2721748352050781, "learning_rate": 4.500992403197921e-05, "loss": 0.1692, "num_input_tokens_seen": 37618352, "step": 17420 }, { "epoch": 2.8425774877650896, "grad_norm": 0.22586800158023834, "learning_rate": 4.500565620163985e-05, "loss": 0.1697, "num_input_tokens_seen": 37628816, "step": 17425 }, { "epoch": 2.843393148450245, "grad_norm": 0.13375671207904816, "learning_rate": 4.500138674954047e-05, "loss": 0.0252, "num_input_tokens_seen": 37639824, "step": 17430 }, { "epoch": 2.8442088091353996, "grad_norm": 3.517141342163086, "learning_rate": 4.499711567602721e-05, "loss": 0.1266, "num_input_tokens_seen": 37651504, "step": 17435 }, { "epoch": 2.8450244698205545, "grad_norm": 0.2667512595653534, "learning_rate": 4.499284298144629e-05, "loss": 0.0342, "num_input_tokens_seen": 37662992, "step": 17440 }, { "epoch": 2.8458401305057097, "grad_norm": 0.11251013725996017, "learning_rate": 4.498856866614407e-05, "loss": 0.0498, "num_input_tokens_seen": 37673904, "step": 17445 }, { "epoch": 2.8466557911908645, "grad_norm": 3.6070191860198975, "learning_rate": 4.498429273046705e-05, "loss": 0.0733, "num_input_tokens_seen": 37684368, "step": 17450 }, { "epoch": 2.84747145187602, "grad_norm": 0.16577482223510742, "learning_rate": 4.498001517476187e-05, "loss": 0.0665, "num_input_tokens_seen": 37694480, "step": 17455 }, { "epoch": 2.8482871125611746, "grad_norm": 0.09861022979021072, "learning_rate": 4.497573599937528e-05, "loss": 0.0084, "num_input_tokens_seen": 37705040, "step": 17460 }, { "epoch": 2.8491027732463294, "grad_norm": 3.8634822368621826, "learning_rate": 4.497145520465417e-05, "loss": 0.2386, "num_input_tokens_seen": 37716304, "step": 17465 }, { "epoch": 2.8499184339314843, "grad_norm": 0.17338253557682037, "learning_rate": 4.4967172790945565e-05, "loss": 0.085, "num_input_tokens_seen": 37726352, "step": 17470 }, { "epoch": 2.8507340946166395, "grad_norm": 1.4482851028442383, "learning_rate": 4.496288875859663e-05, "loss": 0.0207, "num_input_tokens_seen": 37736048, "step": 17475 }, { "epoch": 2.8515497553017943, "grad_norm": 3.201935052871704, "learning_rate": 4.4958603107954635e-05, "loss": 0.1531, "num_input_tokens_seen": 37746704, "step": 17480 }, { "epoch": 2.8523654159869496, "grad_norm": 0.1465047299861908, "learning_rate": 4.4954315839367006e-05, "loss": 0.0595, "num_input_tokens_seen": 37758000, "step": 17485 }, { "epoch": 2.8531810766721044, "grad_norm": 0.07675353437662125, "learning_rate": 4.495002695318129e-05, "loss": 0.3736, "num_input_tokens_seen": 37768240, "step": 17490 }, { "epoch": 2.8539967373572592, "grad_norm": 0.2032974660396576, "learning_rate": 4.494573644974516e-05, "loss": 0.2442, "num_input_tokens_seen": 37778576, "step": 17495 }, { "epoch": 2.8548123980424145, "grad_norm": 0.1352575570344925, "learning_rate": 4.494144432940643e-05, "loss": 0.0453, "num_input_tokens_seen": 37788880, "step": 17500 }, { "epoch": 2.8556280587275693, "grad_norm": 0.09475905448198318, "learning_rate": 4.493715059251304e-05, "loss": 0.04, "num_input_tokens_seen": 37799856, "step": 17505 }, { "epoch": 2.8564437194127246, "grad_norm": 0.9383955597877502, "learning_rate": 4.4932855239413065e-05, "loss": 0.2561, "num_input_tokens_seen": 37809808, "step": 17510 }, { "epoch": 2.8572593800978794, "grad_norm": 4.063689708709717, "learning_rate": 4.49285582704547e-05, "loss": 0.2876, "num_input_tokens_seen": 37819856, "step": 17515 }, { "epoch": 2.858075040783034, "grad_norm": 2.331108331680298, "learning_rate": 4.492425968598629e-05, "loss": 0.1281, "num_input_tokens_seen": 37831792, "step": 17520 }, { "epoch": 2.858890701468189, "grad_norm": 2.068864583969116, "learning_rate": 4.49199594863563e-05, "loss": 0.0916, "num_input_tokens_seen": 37843280, "step": 17525 }, { "epoch": 2.8597063621533443, "grad_norm": 1.9548044204711914, "learning_rate": 4.4915657671913314e-05, "loss": 0.1012, "num_input_tokens_seen": 37854928, "step": 17530 }, { "epoch": 2.860522022838499, "grad_norm": 0.30348631739616394, "learning_rate": 4.491135424300607e-05, "loss": 0.1112, "num_input_tokens_seen": 37865424, "step": 17535 }, { "epoch": 2.8613376835236544, "grad_norm": 2.8615989685058594, "learning_rate": 4.490704919998342e-05, "loss": 0.0659, "num_input_tokens_seen": 37876624, "step": 17540 }, { "epoch": 2.862153344208809, "grad_norm": 2.7815873622894287, "learning_rate": 4.4902742543194356e-05, "loss": 0.1882, "num_input_tokens_seen": 37886896, "step": 17545 }, { "epoch": 2.862969004893964, "grad_norm": 2.147285223007202, "learning_rate": 4.4898434272988e-05, "loss": 0.3117, "num_input_tokens_seen": 37897424, "step": 17550 }, { "epoch": 2.863784665579119, "grad_norm": 0.15618032217025757, "learning_rate": 4.489412438971359e-05, "loss": 0.0881, "num_input_tokens_seen": 37908080, "step": 17555 }, { "epoch": 2.864600326264274, "grad_norm": 0.3206488788127899, "learning_rate": 4.488981289372052e-05, "loss": 0.0313, "num_input_tokens_seen": 37918768, "step": 17560 }, { "epoch": 2.865415986949429, "grad_norm": 2.207361936569214, "learning_rate": 4.488549978535829e-05, "loss": 0.2666, "num_input_tokens_seen": 37929488, "step": 17565 }, { "epoch": 2.866231647634584, "grad_norm": 0.21528835594654083, "learning_rate": 4.4881185064976553e-05, "loss": 0.1354, "num_input_tokens_seen": 37940560, "step": 17570 }, { "epoch": 2.867047308319739, "grad_norm": 0.8961257934570312, "learning_rate": 4.487686873292508e-05, "loss": 0.2215, "num_input_tokens_seen": 37951248, "step": 17575 }, { "epoch": 2.867862969004894, "grad_norm": 0.05624648556113243, "learning_rate": 4.487255078955378e-05, "loss": 0.0352, "num_input_tokens_seen": 37961936, "step": 17580 }, { "epoch": 2.868678629690049, "grad_norm": 4.384384632110596, "learning_rate": 4.486823123521267e-05, "loss": 0.2172, "num_input_tokens_seen": 37972560, "step": 17585 }, { "epoch": 2.869494290375204, "grad_norm": 0.31094902753829956, "learning_rate": 4.4863910070251927e-05, "loss": 0.0444, "num_input_tokens_seen": 37984016, "step": 17590 }, { "epoch": 2.870309951060359, "grad_norm": 0.17169949412345886, "learning_rate": 4.485958729502185e-05, "loss": 0.0879, "num_input_tokens_seen": 37994896, "step": 17595 }, { "epoch": 2.871125611745514, "grad_norm": 0.37429696321487427, "learning_rate": 4.485526290987286e-05, "loss": 0.0973, "num_input_tokens_seen": 38006832, "step": 17600 }, { "epoch": 2.8719412724306688, "grad_norm": 3.762223720550537, "learning_rate": 4.485093691515551e-05, "loss": 0.1567, "num_input_tokens_seen": 38018832, "step": 17605 }, { "epoch": 2.8727569331158236, "grad_norm": 0.08808736503124237, "learning_rate": 4.4846609311220494e-05, "loss": 0.2442, "num_input_tokens_seen": 38029520, "step": 17610 }, { "epoch": 2.873572593800979, "grad_norm": 0.18813839554786682, "learning_rate": 4.484228009841863e-05, "loss": 0.0607, "num_input_tokens_seen": 38039568, "step": 17615 }, { "epoch": 2.8743882544861337, "grad_norm": 0.12047629803419113, "learning_rate": 4.483794927710085e-05, "loss": 0.1261, "num_input_tokens_seen": 38051344, "step": 17620 }, { "epoch": 2.875203915171289, "grad_norm": 3.394768714904785, "learning_rate": 4.483361684761826e-05, "loss": 0.1065, "num_input_tokens_seen": 38062448, "step": 17625 }, { "epoch": 2.8760195758564437, "grad_norm": 1.3142900466918945, "learning_rate": 4.482928281032205e-05, "loss": 0.0506, "num_input_tokens_seen": 38073520, "step": 17630 }, { "epoch": 2.8768352365415986, "grad_norm": 4.224359035491943, "learning_rate": 4.482494716556356e-05, "loss": 0.1867, "num_input_tokens_seen": 38084400, "step": 17635 }, { "epoch": 2.877650897226754, "grad_norm": 0.025160379707813263, "learning_rate": 4.482060991369426e-05, "loss": 0.2161, "num_input_tokens_seen": 38094288, "step": 17640 }, { "epoch": 2.8784665579119086, "grad_norm": 0.8182905316352844, "learning_rate": 4.481627105506575e-05, "loss": 0.0301, "num_input_tokens_seen": 38104752, "step": 17645 }, { "epoch": 2.8792822185970635, "grad_norm": 0.20233500003814697, "learning_rate": 4.481193059002976e-05, "loss": 0.0753, "num_input_tokens_seen": 38113456, "step": 17650 }, { "epoch": 2.8800978792822187, "grad_norm": 0.43862026929855347, "learning_rate": 4.480758851893816e-05, "loss": 0.0842, "num_input_tokens_seen": 38123888, "step": 17655 }, { "epoch": 2.8809135399673735, "grad_norm": 2.028978109359741, "learning_rate": 4.480324484214293e-05, "loss": 0.0703, "num_input_tokens_seen": 38134448, "step": 17660 }, { "epoch": 2.8817292006525284, "grad_norm": 6.340385913848877, "learning_rate": 4.479889955999619e-05, "loss": 0.3515, "num_input_tokens_seen": 38145104, "step": 17665 }, { "epoch": 2.8825448613376836, "grad_norm": 2.516404390335083, "learning_rate": 4.4794552672850185e-05, "loss": 0.259, "num_input_tokens_seen": 38155984, "step": 17670 }, { "epoch": 2.8833605220228384, "grad_norm": 4.506988525390625, "learning_rate": 4.479020418105732e-05, "loss": 0.2115, "num_input_tokens_seen": 38167408, "step": 17675 }, { "epoch": 2.8841761827079937, "grad_norm": 6.194934844970703, "learning_rate": 4.478585408497008e-05, "loss": 0.123, "num_input_tokens_seen": 38178096, "step": 17680 }, { "epoch": 2.8849918433931485, "grad_norm": 0.3223642408847809, "learning_rate": 4.478150238494112e-05, "loss": 0.0133, "num_input_tokens_seen": 38188304, "step": 17685 }, { "epoch": 2.8858075040783033, "grad_norm": 4.470766544342041, "learning_rate": 4.47771490813232e-05, "loss": 0.1234, "num_input_tokens_seen": 38198512, "step": 17690 }, { "epoch": 2.886623164763458, "grad_norm": 0.050803132355213165, "learning_rate": 4.4772794174469234e-05, "loss": 0.1378, "num_input_tokens_seen": 38209840, "step": 17695 }, { "epoch": 2.8874388254486134, "grad_norm": 0.3100869357585907, "learning_rate": 4.4768437664732244e-05, "loss": 0.1365, "num_input_tokens_seen": 38219760, "step": 17700 }, { "epoch": 2.8882544861337682, "grad_norm": 0.9346982836723328, "learning_rate": 4.4764079552465385e-05, "loss": 0.0195, "num_input_tokens_seen": 38231376, "step": 17705 }, { "epoch": 2.8890701468189235, "grad_norm": 3.9480233192443848, "learning_rate": 4.475971983802196e-05, "loss": 0.1957, "num_input_tokens_seen": 38242096, "step": 17710 }, { "epoch": 2.8898858075040783, "grad_norm": 0.06706345081329346, "learning_rate": 4.475535852175539e-05, "loss": 0.0324, "num_input_tokens_seen": 38252624, "step": 17715 }, { "epoch": 2.890701468189233, "grad_norm": 2.782609701156616, "learning_rate": 4.475099560401922e-05, "loss": 0.0684, "num_input_tokens_seen": 38262864, "step": 17720 }, { "epoch": 2.8915171288743884, "grad_norm": 0.2172604203224182, "learning_rate": 4.474663108516713e-05, "loss": 0.0215, "num_input_tokens_seen": 38274640, "step": 17725 }, { "epoch": 2.892332789559543, "grad_norm": 0.06671618670225143, "learning_rate": 4.474226496555293e-05, "loss": 0.0152, "num_input_tokens_seen": 38286512, "step": 17730 }, { "epoch": 2.8931484502446985, "grad_norm": 0.036211103200912476, "learning_rate": 4.473789724553056e-05, "loss": 0.0822, "num_input_tokens_seen": 38297456, "step": 17735 }, { "epoch": 2.8939641109298533, "grad_norm": 0.7471785545349121, "learning_rate": 4.473352792545409e-05, "loss": 0.1271, "num_input_tokens_seen": 38308304, "step": 17740 }, { "epoch": 2.894779771615008, "grad_norm": 0.4336090683937073, "learning_rate": 4.4729157005677724e-05, "loss": 0.2564, "num_input_tokens_seen": 38318640, "step": 17745 }, { "epoch": 2.895595432300163, "grad_norm": 0.1255933791399002, "learning_rate": 4.472478448655578e-05, "loss": 0.0443, "num_input_tokens_seen": 38330320, "step": 17750 }, { "epoch": 2.896411092985318, "grad_norm": 0.5787214636802673, "learning_rate": 4.4720410368442724e-05, "loss": 0.0185, "num_input_tokens_seen": 38340432, "step": 17755 }, { "epoch": 2.897226753670473, "grad_norm": 5.940229892730713, "learning_rate": 4.471603465169314e-05, "loss": 0.1063, "num_input_tokens_seen": 38351760, "step": 17760 }, { "epoch": 2.8980424143556283, "grad_norm": 2.6881027221679688, "learning_rate": 4.471165733666176e-05, "loss": 0.0283, "num_input_tokens_seen": 38361136, "step": 17765 }, { "epoch": 2.898858075040783, "grad_norm": 0.05304047837853432, "learning_rate": 4.4707278423703415e-05, "loss": 0.0084, "num_input_tokens_seen": 38371408, "step": 17770 }, { "epoch": 2.899673735725938, "grad_norm": 4.1059064865112305, "learning_rate": 4.470289791317308e-05, "loss": 0.1215, "num_input_tokens_seen": 38383056, "step": 17775 }, { "epoch": 2.9004893964110927, "grad_norm": 0.48627591133117676, "learning_rate": 4.4698515805425876e-05, "loss": 0.0077, "num_input_tokens_seen": 38393840, "step": 17780 }, { "epoch": 2.901305057096248, "grad_norm": 0.21503886580467224, "learning_rate": 4.469413210081703e-05, "loss": 0.0651, "num_input_tokens_seen": 38406096, "step": 17785 }, { "epoch": 2.902120717781403, "grad_norm": 0.11739999055862427, "learning_rate": 4.468974679970191e-05, "loss": 0.1323, "num_input_tokens_seen": 38416912, "step": 17790 }, { "epoch": 2.902936378466558, "grad_norm": 6.5325541496276855, "learning_rate": 4.468535990243601e-05, "loss": 0.175, "num_input_tokens_seen": 38428016, "step": 17795 }, { "epoch": 2.903752039151713, "grad_norm": 5.3871307373046875, "learning_rate": 4.468097140937495e-05, "loss": 0.1288, "num_input_tokens_seen": 38438480, "step": 17800 }, { "epoch": 2.9045676998368677, "grad_norm": 3.131307601928711, "learning_rate": 4.467658132087449e-05, "loss": 0.0539, "num_input_tokens_seen": 38448112, "step": 17805 }, { "epoch": 2.905383360522023, "grad_norm": 4.033162593841553, "learning_rate": 4.4672189637290505e-05, "loss": 0.2172, "num_input_tokens_seen": 38457872, "step": 17810 }, { "epoch": 2.9061990212071778, "grad_norm": 0.10378050059080124, "learning_rate": 4.466779635897902e-05, "loss": 0.2298, "num_input_tokens_seen": 38469040, "step": 17815 }, { "epoch": 2.907014681892333, "grad_norm": 8.18344783782959, "learning_rate": 4.466340148629617e-05, "loss": 0.2725, "num_input_tokens_seen": 38479344, "step": 17820 }, { "epoch": 2.907830342577488, "grad_norm": 3.1844842433929443, "learning_rate": 4.465900501959822e-05, "loss": 0.1133, "num_input_tokens_seen": 38489552, "step": 17825 }, { "epoch": 2.9086460032626427, "grad_norm": 2.472390651702881, "learning_rate": 4.465460695924157e-05, "loss": 0.1878, "num_input_tokens_seen": 38500752, "step": 17830 }, { "epoch": 2.9094616639477975, "grad_norm": 3.5917232036590576, "learning_rate": 4.4650207305582756e-05, "loss": 0.1565, "num_input_tokens_seen": 38512432, "step": 17835 }, { "epoch": 2.9102773246329527, "grad_norm": 0.044487133622169495, "learning_rate": 4.464580605897844e-05, "loss": 0.115, "num_input_tokens_seen": 38522256, "step": 17840 }, { "epoch": 2.9110929853181076, "grad_norm": 0.19997821748256683, "learning_rate": 4.4641403219785396e-05, "loss": 0.2138, "num_input_tokens_seen": 38532784, "step": 17845 }, { "epoch": 2.911908646003263, "grad_norm": 4.2811408042907715, "learning_rate": 4.463699878836055e-05, "loss": 0.2576, "num_input_tokens_seen": 38542640, "step": 17850 }, { "epoch": 2.9127243066884176, "grad_norm": 3.429095506668091, "learning_rate": 4.463259276506095e-05, "loss": 0.1889, "num_input_tokens_seen": 38553008, "step": 17855 }, { "epoch": 2.9135399673735725, "grad_norm": 3.9942924976348877, "learning_rate": 4.462818515024376e-05, "loss": 0.2476, "num_input_tokens_seen": 38565200, "step": 17860 }, { "epoch": 2.9143556280587277, "grad_norm": 0.1372908353805542, "learning_rate": 4.462377594426629e-05, "loss": 0.1281, "num_input_tokens_seen": 38574736, "step": 17865 }, { "epoch": 2.9151712887438825, "grad_norm": 0.20757682621479034, "learning_rate": 4.461936514748597e-05, "loss": 0.0215, "num_input_tokens_seen": 38584496, "step": 17870 }, { "epoch": 2.9159869494290374, "grad_norm": 0.09978707879781723, "learning_rate": 4.4614952760260366e-05, "loss": 0.147, "num_input_tokens_seen": 38594448, "step": 17875 }, { "epoch": 2.9168026101141926, "grad_norm": 0.2199597954750061, "learning_rate": 4.4610538782947166e-05, "loss": 0.0174, "num_input_tokens_seen": 38604976, "step": 17880 }, { "epoch": 2.9176182707993474, "grad_norm": 0.1524445116519928, "learning_rate": 4.460612321590419e-05, "loss": 0.2321, "num_input_tokens_seen": 38615792, "step": 17885 }, { "epoch": 2.9184339314845023, "grad_norm": 2.3187215328216553, "learning_rate": 4.460170605948939e-05, "loss": 0.2173, "num_input_tokens_seen": 38626320, "step": 17890 }, { "epoch": 2.9192495921696575, "grad_norm": 3.7369632720947266, "learning_rate": 4.459728731406083e-05, "loss": 0.1052, "num_input_tokens_seen": 38637008, "step": 17895 }, { "epoch": 2.9200652528548123, "grad_norm": 1.549872636795044, "learning_rate": 4.4592866979976725e-05, "loss": 0.0231, "num_input_tokens_seen": 38647056, "step": 17900 }, { "epoch": 2.9208809135399676, "grad_norm": 9.33074951171875, "learning_rate": 4.458844505759542e-05, "loss": 0.4205, "num_input_tokens_seen": 38658512, "step": 17905 }, { "epoch": 2.9216965742251224, "grad_norm": 0.3138631582260132, "learning_rate": 4.4584021547275356e-05, "loss": 0.0993, "num_input_tokens_seen": 38669328, "step": 17910 }, { "epoch": 2.9225122349102772, "grad_norm": 0.30171555280685425, "learning_rate": 4.457959644937514e-05, "loss": 0.1021, "num_input_tokens_seen": 38680048, "step": 17915 }, { "epoch": 2.923327895595432, "grad_norm": 0.22459585964679718, "learning_rate": 4.457516976425349e-05, "loss": 0.2373, "num_input_tokens_seen": 38691952, "step": 17920 }, { "epoch": 2.9241435562805873, "grad_norm": 0.32993584871292114, "learning_rate": 4.457074149226926e-05, "loss": 0.0452, "num_input_tokens_seen": 38701872, "step": 17925 }, { "epoch": 2.924959216965742, "grad_norm": 0.06956195831298828, "learning_rate": 4.456631163378142e-05, "loss": 0.0127, "num_input_tokens_seen": 38713104, "step": 17930 }, { "epoch": 2.9257748776508974, "grad_norm": 0.13496209681034088, "learning_rate": 4.456188018914908e-05, "loss": 0.244, "num_input_tokens_seen": 38723952, "step": 17935 }, { "epoch": 2.926590538336052, "grad_norm": 0.15551023185253143, "learning_rate": 4.455744715873148e-05, "loss": 0.0215, "num_input_tokens_seen": 38735216, "step": 17940 }, { "epoch": 2.927406199021207, "grad_norm": 0.15168346464633942, "learning_rate": 4.455301254288797e-05, "loss": 0.1295, "num_input_tokens_seen": 38746032, "step": 17945 }, { "epoch": 2.9282218597063623, "grad_norm": 3.0744872093200684, "learning_rate": 4.454857634197806e-05, "loss": 0.1594, "num_input_tokens_seen": 38756464, "step": 17950 }, { "epoch": 2.929037520391517, "grad_norm": 1.041496992111206, "learning_rate": 4.4544138556361364e-05, "loss": 0.1836, "num_input_tokens_seen": 38767952, "step": 17955 }, { "epoch": 2.9298531810766724, "grad_norm": 0.09206711500883102, "learning_rate": 4.453969918639763e-05, "loss": 0.044, "num_input_tokens_seen": 38779312, "step": 17960 }, { "epoch": 2.930668841761827, "grad_norm": 0.10270342230796814, "learning_rate": 4.453525823244673e-05, "loss": 0.0388, "num_input_tokens_seen": 38790576, "step": 17965 }, { "epoch": 2.931484502446982, "grad_norm": 1.683138132095337, "learning_rate": 4.453081569486869e-05, "loss": 0.1008, "num_input_tokens_seen": 38802032, "step": 17970 }, { "epoch": 2.932300163132137, "grad_norm": 1.897834062576294, "learning_rate": 4.452637157402362e-05, "loss": 0.2771, "num_input_tokens_seen": 38812080, "step": 17975 }, { "epoch": 2.933115823817292, "grad_norm": 4.9028425216674805, "learning_rate": 4.45219258702718e-05, "loss": 0.2769, "num_input_tokens_seen": 38823664, "step": 17980 }, { "epoch": 2.933931484502447, "grad_norm": 3.6018197536468506, "learning_rate": 4.451747858397361e-05, "loss": 0.1651, "num_input_tokens_seen": 38833840, "step": 17985 }, { "epoch": 2.934747145187602, "grad_norm": 2.3101999759674072, "learning_rate": 4.451302971548958e-05, "loss": 0.1877, "num_input_tokens_seen": 38844400, "step": 17990 }, { "epoch": 2.935562805872757, "grad_norm": 0.25400373339653015, "learning_rate": 4.450857926518035e-05, "loss": 0.0593, "num_input_tokens_seen": 38855248, "step": 17995 }, { "epoch": 2.936378466557912, "grad_norm": 0.1637653261423111, "learning_rate": 4.45041272334067e-05, "loss": 0.099, "num_input_tokens_seen": 38867472, "step": 18000 }, { "epoch": 2.9371941272430666, "grad_norm": 0.24362194538116455, "learning_rate": 4.449967362052954e-05, "loss": 0.0279, "num_input_tokens_seen": 38878864, "step": 18005 }, { "epoch": 2.938009787928222, "grad_norm": 3.1802594661712646, "learning_rate": 4.449521842690989e-05, "loss": 0.1694, "num_input_tokens_seen": 38890032, "step": 18010 }, { "epoch": 2.9388254486133767, "grad_norm": 0.14970026910305023, "learning_rate": 4.449076165290892e-05, "loss": 0.0931, "num_input_tokens_seen": 38900624, "step": 18015 }, { "epoch": 2.939641109298532, "grad_norm": 0.10972364991903305, "learning_rate": 4.448630329888791e-05, "loss": 0.0187, "num_input_tokens_seen": 38911984, "step": 18020 }, { "epoch": 2.9404567699836868, "grad_norm": 1.4522041082382202, "learning_rate": 4.448184336520829e-05, "loss": 0.0246, "num_input_tokens_seen": 38922896, "step": 18025 }, { "epoch": 2.9412724306688416, "grad_norm": 4.203207492828369, "learning_rate": 4.447738185223158e-05, "loss": 0.2772, "num_input_tokens_seen": 38933872, "step": 18030 }, { "epoch": 2.942088091353997, "grad_norm": 8.245977401733398, "learning_rate": 4.447291876031949e-05, "loss": 0.0493, "num_input_tokens_seen": 38943088, "step": 18035 }, { "epoch": 2.9429037520391517, "grad_norm": 4.124063968658447, "learning_rate": 4.446845408983379e-05, "loss": 0.2852, "num_input_tokens_seen": 38954064, "step": 18040 }, { "epoch": 2.943719412724307, "grad_norm": 0.08290500938892365, "learning_rate": 4.446398784113642e-05, "loss": 0.0899, "num_input_tokens_seen": 38965904, "step": 18045 }, { "epoch": 2.9445350734094617, "grad_norm": 0.26444438099861145, "learning_rate": 4.445952001458944e-05, "loss": 0.1542, "num_input_tokens_seen": 38976080, "step": 18050 }, { "epoch": 2.9453507340946166, "grad_norm": 0.2069074660539627, "learning_rate": 4.445505061055503e-05, "loss": 0.0629, "num_input_tokens_seen": 38987344, "step": 18055 }, { "epoch": 2.9461663947797714, "grad_norm": 14.2250394821167, "learning_rate": 4.44505796293955e-05, "loss": 0.0465, "num_input_tokens_seen": 38999472, "step": 18060 }, { "epoch": 2.9469820554649266, "grad_norm": 0.08218816667795181, "learning_rate": 4.44461070714733e-05, "loss": 0.1194, "num_input_tokens_seen": 39010352, "step": 18065 }, { "epoch": 2.9477977161500815, "grad_norm": 0.08249372243881226, "learning_rate": 4.4441632937150984e-05, "loss": 0.1605, "num_input_tokens_seen": 39020816, "step": 18070 }, { "epoch": 2.9486133768352367, "grad_norm": 3.6566240787506104, "learning_rate": 4.443715722679126e-05, "loss": 0.157, "num_input_tokens_seen": 39032112, "step": 18075 }, { "epoch": 2.9494290375203915, "grad_norm": 0.048471830785274506, "learning_rate": 4.443267994075695e-05, "loss": 0.0101, "num_input_tokens_seen": 39043536, "step": 18080 }, { "epoch": 2.9502446982055464, "grad_norm": 3.1837503910064697, "learning_rate": 4.4428201079411004e-05, "loss": 0.122, "num_input_tokens_seen": 39053232, "step": 18085 }, { "epoch": 2.9510603588907016, "grad_norm": 2.7203781604766846, "learning_rate": 4.4423720643116495e-05, "loss": 0.0211, "num_input_tokens_seen": 39064368, "step": 18090 }, { "epoch": 2.9518760195758564, "grad_norm": 0.42375457286834717, "learning_rate": 4.441923863223664e-05, "loss": 0.3618, "num_input_tokens_seen": 39072560, "step": 18095 }, { "epoch": 2.9526916802610113, "grad_norm": 0.13614337146282196, "learning_rate": 4.441475504713477e-05, "loss": 0.0832, "num_input_tokens_seen": 39084080, "step": 18100 }, { "epoch": 2.9535073409461665, "grad_norm": 3.6559746265411377, "learning_rate": 4.4410269888174346e-05, "loss": 0.1442, "num_input_tokens_seen": 39095056, "step": 18105 }, { "epoch": 2.9543230016313213, "grad_norm": 0.13492968678474426, "learning_rate": 4.440578315571896e-05, "loss": 0.1121, "num_input_tokens_seen": 39106000, "step": 18110 }, { "epoch": 2.955138662316476, "grad_norm": 4.666341781616211, "learning_rate": 4.4401294850132324e-05, "loss": 0.235, "num_input_tokens_seen": 39115760, "step": 18115 }, { "epoch": 2.9559543230016314, "grad_norm": 6.095045566558838, "learning_rate": 4.439680497177829e-05, "loss": 0.2459, "num_input_tokens_seen": 39126160, "step": 18120 }, { "epoch": 2.9567699836867862, "grad_norm": 1.2581567764282227, "learning_rate": 4.439231352102082e-05, "loss": 0.0505, "num_input_tokens_seen": 39137264, "step": 18125 }, { "epoch": 2.9575856443719415, "grad_norm": 4.644974231719971, "learning_rate": 4.438782049822403e-05, "loss": 0.0847, "num_input_tokens_seen": 39147824, "step": 18130 }, { "epoch": 2.9584013050570963, "grad_norm": 0.1869252324104309, "learning_rate": 4.4383325903752124e-05, "loss": 0.1519, "num_input_tokens_seen": 39158768, "step": 18135 }, { "epoch": 2.959216965742251, "grad_norm": 2.9593968391418457, "learning_rate": 4.437882973796948e-05, "loss": 0.0903, "num_input_tokens_seen": 39168752, "step": 18140 }, { "epoch": 2.960032626427406, "grad_norm": 0.4411729872226715, "learning_rate": 4.437433200124057e-05, "loss": 0.0114, "num_input_tokens_seen": 39180528, "step": 18145 }, { "epoch": 2.960848287112561, "grad_norm": 4.563373565673828, "learning_rate": 4.4369832693930005e-05, "loss": 0.2417, "num_input_tokens_seen": 39192336, "step": 18150 }, { "epoch": 2.961663947797716, "grad_norm": 1.0392076969146729, "learning_rate": 4.436533181640252e-05, "loss": 0.1119, "num_input_tokens_seen": 39203056, "step": 18155 }, { "epoch": 2.9624796084828713, "grad_norm": 0.15708614885807037, "learning_rate": 4.436082936902297e-05, "loss": 0.1149, "num_input_tokens_seen": 39213104, "step": 18160 }, { "epoch": 2.963295269168026, "grad_norm": 2.713299036026001, "learning_rate": 4.435632535215637e-05, "loss": 0.2427, "num_input_tokens_seen": 39223792, "step": 18165 }, { "epoch": 2.964110929853181, "grad_norm": 4.85723352432251, "learning_rate": 4.435181976616781e-05, "loss": 0.1196, "num_input_tokens_seen": 39235248, "step": 18170 }, { "epoch": 2.964926590538336, "grad_norm": 0.08172395080327988, "learning_rate": 4.4347312611422555e-05, "loss": 0.1422, "num_input_tokens_seen": 39245424, "step": 18175 }, { "epoch": 2.965742251223491, "grad_norm": 0.10064270347356796, "learning_rate": 4.434280388828598e-05, "loss": 0.2157, "num_input_tokens_seen": 39255344, "step": 18180 }, { "epoch": 2.9665579119086463, "grad_norm": 5.2386555671691895, "learning_rate": 4.433829359712356e-05, "loss": 0.0972, "num_input_tokens_seen": 39266192, "step": 18185 }, { "epoch": 2.967373572593801, "grad_norm": 2.407827615737915, "learning_rate": 4.4333781738300954e-05, "loss": 0.2413, "num_input_tokens_seen": 39277040, "step": 18190 }, { "epoch": 2.968189233278956, "grad_norm": 3.0858025550842285, "learning_rate": 4.43292683121839e-05, "loss": 0.1982, "num_input_tokens_seen": 39286160, "step": 18195 }, { "epoch": 2.9690048939641107, "grad_norm": 0.13510720431804657, "learning_rate": 4.432475331913828e-05, "loss": 0.0432, "num_input_tokens_seen": 39296496, "step": 18200 }, { "epoch": 2.969820554649266, "grad_norm": 3.1260018348693848, "learning_rate": 4.4320236759530095e-05, "loss": 0.1484, "num_input_tokens_seen": 39306672, "step": 18205 }, { "epoch": 2.970636215334421, "grad_norm": 0.10233663767576218, "learning_rate": 4.43157186337255e-05, "loss": 0.0118, "num_input_tokens_seen": 39316816, "step": 18210 }, { "epoch": 2.971451876019576, "grad_norm": 0.25618240237236023, "learning_rate": 4.431119894209074e-05, "loss": 0.1324, "num_input_tokens_seen": 39327984, "step": 18215 }, { "epoch": 2.972267536704731, "grad_norm": 0.2764376699924469, "learning_rate": 4.430667768499221e-05, "loss": 0.1514, "num_input_tokens_seen": 39338832, "step": 18220 }, { "epoch": 2.9730831973898857, "grad_norm": 0.2718583047389984, "learning_rate": 4.4302154862796425e-05, "loss": 0.1349, "num_input_tokens_seen": 39349104, "step": 18225 }, { "epoch": 2.9738988580750405, "grad_norm": 3.6538009643554688, "learning_rate": 4.4297630475870025e-05, "loss": 0.3551, "num_input_tokens_seen": 39358640, "step": 18230 }, { "epoch": 2.9747145187601958, "grad_norm": 0.119206503033638, "learning_rate": 4.429310452457979e-05, "loss": 0.0875, "num_input_tokens_seen": 39368688, "step": 18235 }, { "epoch": 2.9755301794453506, "grad_norm": 0.44838809967041016, "learning_rate": 4.428857700929261e-05, "loss": 0.0963, "num_input_tokens_seen": 39378224, "step": 18240 }, { "epoch": 2.976345840130506, "grad_norm": 0.9642930030822754, "learning_rate": 4.428404793037551e-05, "loss": 0.0235, "num_input_tokens_seen": 39388368, "step": 18245 }, { "epoch": 2.9771615008156607, "grad_norm": 3.36867618560791, "learning_rate": 4.427951728819564e-05, "loss": 0.163, "num_input_tokens_seen": 39399120, "step": 18250 }, { "epoch": 2.9779771615008155, "grad_norm": 0.419706791639328, "learning_rate": 4.427498508312026e-05, "loss": 0.0774, "num_input_tokens_seen": 39408560, "step": 18255 }, { "epoch": 2.9787928221859707, "grad_norm": 1.9322048425674438, "learning_rate": 4.4270451315516807e-05, "loss": 0.0891, "num_input_tokens_seen": 39419664, "step": 18260 }, { "epoch": 2.9796084828711256, "grad_norm": 2.4042837619781494, "learning_rate": 4.426591598575278e-05, "loss": 0.0169, "num_input_tokens_seen": 39430480, "step": 18265 }, { "epoch": 2.980424143556281, "grad_norm": 0.16649606823921204, "learning_rate": 4.4261379094195856e-05, "loss": 0.0231, "num_input_tokens_seen": 39441008, "step": 18270 }, { "epoch": 2.9812398042414356, "grad_norm": 2.1626293659210205, "learning_rate": 4.425684064121381e-05, "loss": 0.0349, "num_input_tokens_seen": 39451984, "step": 18275 }, { "epoch": 2.9820554649265905, "grad_norm": 7.148900032043457, "learning_rate": 4.425230062717455e-05, "loss": 0.2486, "num_input_tokens_seen": 39461904, "step": 18280 }, { "epoch": 2.9828711256117453, "grad_norm": 2.3899693489074707, "learning_rate": 4.424775905244612e-05, "loss": 0.0257, "num_input_tokens_seen": 39472656, "step": 18285 }, { "epoch": 2.9836867862969005, "grad_norm": 0.0685814619064331, "learning_rate": 4.424321591739668e-05, "loss": 0.2119, "num_input_tokens_seen": 39483376, "step": 18290 }, { "epoch": 2.9845024469820554, "grad_norm": 0.15265725553035736, "learning_rate": 4.423867122239451e-05, "loss": 0.057, "num_input_tokens_seen": 39496144, "step": 18295 }, { "epoch": 2.9853181076672106, "grad_norm": 1.2704753875732422, "learning_rate": 4.4234124967808044e-05, "loss": 0.167, "num_input_tokens_seen": 39506128, "step": 18300 }, { "epoch": 2.9861337683523654, "grad_norm": 3.1465489864349365, "learning_rate": 4.4229577154005814e-05, "loss": 0.489, "num_input_tokens_seen": 39515440, "step": 18305 }, { "epoch": 2.9869494290375203, "grad_norm": 0.0706682801246643, "learning_rate": 4.4225027781356484e-05, "loss": 0.0114, "num_input_tokens_seen": 39526512, "step": 18310 }, { "epoch": 2.9877650897226755, "grad_norm": 0.04816197231411934, "learning_rate": 4.4220476850228866e-05, "loss": 0.0067, "num_input_tokens_seen": 39537136, "step": 18315 }, { "epoch": 2.9885807504078303, "grad_norm": 1.9526958465576172, "learning_rate": 4.421592436099186e-05, "loss": 0.0855, "num_input_tokens_seen": 39545808, "step": 18320 }, { "epoch": 2.9893964110929856, "grad_norm": 1.2589412927627563, "learning_rate": 4.4211370314014534e-05, "loss": 0.0655, "num_input_tokens_seen": 39558000, "step": 18325 }, { "epoch": 2.9902120717781404, "grad_norm": 0.10761820524930954, "learning_rate": 4.4206814709666046e-05, "loss": 0.2526, "num_input_tokens_seen": 39570000, "step": 18330 }, { "epoch": 2.9910277324632952, "grad_norm": 0.17023952305316925, "learning_rate": 4.4202257548315704e-05, "loss": 0.1917, "num_input_tokens_seen": 39580560, "step": 18335 }, { "epoch": 2.99184339314845, "grad_norm": 0.2935185730457306, "learning_rate": 4.4197698830332934e-05, "loss": 0.0068, "num_input_tokens_seen": 39591408, "step": 18340 }, { "epoch": 2.9926590538336053, "grad_norm": 2.8027877807617188, "learning_rate": 4.419313855608729e-05, "loss": 0.0838, "num_input_tokens_seen": 39601872, "step": 18345 }, { "epoch": 2.99347471451876, "grad_norm": 3.0456812381744385, "learning_rate": 4.418857672594845e-05, "loss": 0.0514, "num_input_tokens_seen": 39613008, "step": 18350 }, { "epoch": 2.9942903752039154, "grad_norm": 3.173734188079834, "learning_rate": 4.4184013340286215e-05, "loss": 0.092, "num_input_tokens_seen": 39624272, "step": 18355 }, { "epoch": 2.99510603588907, "grad_norm": 0.14348101615905762, "learning_rate": 4.417944839947053e-05, "loss": 0.0765, "num_input_tokens_seen": 39635216, "step": 18360 }, { "epoch": 2.995921696574225, "grad_norm": 0.1864471733570099, "learning_rate": 4.417488190387144e-05, "loss": 0.0615, "num_input_tokens_seen": 39645808, "step": 18365 }, { "epoch": 2.99673735725938, "grad_norm": 0.3066174387931824, "learning_rate": 4.417031385385911e-05, "loss": 0.1555, "num_input_tokens_seen": 39656560, "step": 18370 }, { "epoch": 2.997553017944535, "grad_norm": 2.119971752166748, "learning_rate": 4.416574424980389e-05, "loss": 0.0398, "num_input_tokens_seen": 39666704, "step": 18375 }, { "epoch": 2.99836867862969, "grad_norm": 1.2401740550994873, "learning_rate": 4.4161173092076194e-05, "loss": 0.0196, "num_input_tokens_seen": 39677008, "step": 18380 }, { "epoch": 2.999184339314845, "grad_norm": 0.6576679944992065, "learning_rate": 4.415660038104658e-05, "loss": 0.0093, "num_input_tokens_seen": 39686416, "step": 18385 }, { "epoch": 3.0, "grad_norm": 0.03525367006659508, "learning_rate": 4.4152026117085735e-05, "loss": 0.1007, "num_input_tokens_seen": 39694112, "step": 18390 }, { "epoch": 3.0, "eval_loss": 0.16290850937366486, "eval_runtime": 133.0304, "eval_samples_per_second": 20.484, "eval_steps_per_second": 5.127, "num_input_tokens_seen": 39694112, "step": 18390 }, { "epoch": 3.000815660685155, "grad_norm": 0.047092389315366745, "learning_rate": 4.4147450300564485e-05, "loss": 0.0063, "num_input_tokens_seen": 39703936, "step": 18395 }, { "epoch": 3.00163132137031, "grad_norm": 0.05611247941851616, "learning_rate": 4.414287293185376e-05, "loss": 0.0276, "num_input_tokens_seen": 39715968, "step": 18400 }, { "epoch": 3.002446982055465, "grad_norm": 0.40346306562423706, "learning_rate": 4.413829401132462e-05, "loss": 0.0844, "num_input_tokens_seen": 39726912, "step": 18405 }, { "epoch": 3.0032626427406197, "grad_norm": 3.6461217403411865, "learning_rate": 4.4133713539348266e-05, "loss": 0.2274, "num_input_tokens_seen": 39737504, "step": 18410 }, { "epoch": 3.004078303425775, "grad_norm": 3.9422435760498047, "learning_rate": 4.4129131516296006e-05, "loss": 0.4117, "num_input_tokens_seen": 39749792, "step": 18415 }, { "epoch": 3.00489396411093, "grad_norm": 6.620187759399414, "learning_rate": 4.412454794253928e-05, "loss": 0.0258, "num_input_tokens_seen": 39760928, "step": 18420 }, { "epoch": 3.0057096247960846, "grad_norm": 1.4317384958267212, "learning_rate": 4.4119962818449665e-05, "loss": 0.0109, "num_input_tokens_seen": 39772096, "step": 18425 }, { "epoch": 3.00652528548124, "grad_norm": 0.21592706441879272, "learning_rate": 4.411537614439886e-05, "loss": 0.0328, "num_input_tokens_seen": 39782528, "step": 18430 }, { "epoch": 3.0073409461663947, "grad_norm": 0.06887643784284592, "learning_rate": 4.4110787920758663e-05, "loss": 0.0025, "num_input_tokens_seen": 39794336, "step": 18435 }, { "epoch": 3.00815660685155, "grad_norm": 2.4659671783447266, "learning_rate": 4.4106198147901035e-05, "loss": 0.2849, "num_input_tokens_seen": 39805376, "step": 18440 }, { "epoch": 3.0089722675367048, "grad_norm": 3.8042080402374268, "learning_rate": 4.410160682619803e-05, "loss": 0.1076, "num_input_tokens_seen": 39816768, "step": 18445 }, { "epoch": 3.0097879282218596, "grad_norm": 0.10827802866697311, "learning_rate": 4.409701395602187e-05, "loss": 0.1606, "num_input_tokens_seen": 39827520, "step": 18450 }, { "epoch": 3.010603588907015, "grad_norm": 0.26161307096481323, "learning_rate": 4.4092419537744854e-05, "loss": 0.1583, "num_input_tokens_seen": 39839072, "step": 18455 }, { "epoch": 3.0114192495921697, "grad_norm": 0.17363031208515167, "learning_rate": 4.408782357173944e-05, "loss": 0.0936, "num_input_tokens_seen": 39850432, "step": 18460 }, { "epoch": 3.0122349102773245, "grad_norm": 0.05275749787688255, "learning_rate": 4.408322605837819e-05, "loss": 0.0348, "num_input_tokens_seen": 39861600, "step": 18465 }, { "epoch": 3.0130505709624797, "grad_norm": 5.259739875793457, "learning_rate": 4.407862699803381e-05, "loss": 0.0829, "num_input_tokens_seen": 39872288, "step": 18470 }, { "epoch": 3.0138662316476346, "grad_norm": 0.046283259987831116, "learning_rate": 4.4074026391079126e-05, "loss": 0.3053, "num_input_tokens_seen": 39881984, "step": 18475 }, { "epoch": 3.0146818923327894, "grad_norm": 0.01923038437962532, "learning_rate": 4.406942423788708e-05, "loss": 0.0363, "num_input_tokens_seen": 39893184, "step": 18480 }, { "epoch": 3.0154975530179446, "grad_norm": 0.07903698086738586, "learning_rate": 4.406482053883075e-05, "loss": 0.0186, "num_input_tokens_seen": 39903200, "step": 18485 }, { "epoch": 3.0163132137030995, "grad_norm": 0.0979006364941597, "learning_rate": 4.406021529428334e-05, "loss": 0.1102, "num_input_tokens_seen": 39913728, "step": 18490 }, { "epoch": 3.0171288743882543, "grad_norm": 0.5228192806243896, "learning_rate": 4.405560850461815e-05, "loss": 0.0121, "num_input_tokens_seen": 39924576, "step": 18495 }, { "epoch": 3.0179445350734095, "grad_norm": 3.690387725830078, "learning_rate": 4.405100017020866e-05, "loss": 0.1169, "num_input_tokens_seen": 39935808, "step": 18500 }, { "epoch": 3.0187601957585644, "grad_norm": 0.2534869909286499, "learning_rate": 4.4046390291428426e-05, "loss": 0.1099, "num_input_tokens_seen": 39946592, "step": 18505 }, { "epoch": 3.0195758564437196, "grad_norm": 0.17684774100780487, "learning_rate": 4.4041778868651155e-05, "loss": 0.0376, "num_input_tokens_seen": 39958592, "step": 18510 }, { "epoch": 3.0203915171288744, "grad_norm": 3.616046190261841, "learning_rate": 4.4037165902250676e-05, "loss": 0.1999, "num_input_tokens_seen": 39969760, "step": 18515 }, { "epoch": 3.0212071778140293, "grad_norm": 0.5237236618995667, "learning_rate": 4.403255139260093e-05, "loss": 0.113, "num_input_tokens_seen": 39980064, "step": 18520 }, { "epoch": 3.0220228384991845, "grad_norm": 3.0811939239501953, "learning_rate": 4.4027935340076005e-05, "loss": 0.0931, "num_input_tokens_seen": 39991840, "step": 18525 }, { "epoch": 3.0228384991843393, "grad_norm": 0.05356327071785927, "learning_rate": 4.402331774505009e-05, "loss": 0.0746, "num_input_tokens_seen": 40002912, "step": 18530 }, { "epoch": 3.023654159869494, "grad_norm": 0.1326315999031067, "learning_rate": 4.4018698607897515e-05, "loss": 0.2377, "num_input_tokens_seen": 40014624, "step": 18535 }, { "epoch": 3.0244698205546494, "grad_norm": 0.7073240280151367, "learning_rate": 4.4014077928992736e-05, "loss": 0.3098, "num_input_tokens_seen": 40025408, "step": 18540 }, { "epoch": 3.0252854812398042, "grad_norm": 0.3344561755657196, "learning_rate": 4.4009455708710315e-05, "loss": 0.1189, "num_input_tokens_seen": 40036352, "step": 18545 }, { "epoch": 3.026101141924959, "grad_norm": 0.2792600989341736, "learning_rate": 4.4004831947424967e-05, "loss": 0.0541, "num_input_tokens_seen": 40045888, "step": 18550 }, { "epoch": 3.0269168026101143, "grad_norm": 0.08360278606414795, "learning_rate": 4.400020664551151e-05, "loss": 0.1498, "num_input_tokens_seen": 40056928, "step": 18555 }, { "epoch": 3.027732463295269, "grad_norm": 0.09548623114824295, "learning_rate": 4.39955798033449e-05, "loss": 0.069, "num_input_tokens_seen": 40067456, "step": 18560 }, { "epoch": 3.028548123980424, "grad_norm": 0.08647414296865463, "learning_rate": 4.399095142130021e-05, "loss": 0.1047, "num_input_tokens_seen": 40077760, "step": 18565 }, { "epoch": 3.029363784665579, "grad_norm": 0.8288561701774597, "learning_rate": 4.398632149975263e-05, "loss": 0.0825, "num_input_tokens_seen": 40089376, "step": 18570 }, { "epoch": 3.030179445350734, "grad_norm": 2.954484224319458, "learning_rate": 4.39816900390775e-05, "loss": 0.1254, "num_input_tokens_seen": 40100544, "step": 18575 }, { "epoch": 3.0309951060358893, "grad_norm": 0.09243351966142654, "learning_rate": 4.397705703965026e-05, "loss": 0.019, "num_input_tokens_seen": 40111616, "step": 18580 }, { "epoch": 3.031810766721044, "grad_norm": 0.9688690900802612, "learning_rate": 4.397242250184649e-05, "loss": 0.168, "num_input_tokens_seen": 40123712, "step": 18585 }, { "epoch": 3.032626427406199, "grad_norm": 0.630936861038208, "learning_rate": 4.396778642604188e-05, "loss": 0.0185, "num_input_tokens_seen": 40134976, "step": 18590 }, { "epoch": 3.033442088091354, "grad_norm": 5.368227958679199, "learning_rate": 4.396314881261227e-05, "loss": 0.1821, "num_input_tokens_seen": 40144992, "step": 18595 }, { "epoch": 3.034257748776509, "grad_norm": 0.03500809147953987, "learning_rate": 4.39585096619336e-05, "loss": 0.0139, "num_input_tokens_seen": 40156928, "step": 18600 }, { "epoch": 3.035073409461664, "grad_norm": 2.2299718856811523, "learning_rate": 4.395386897438194e-05, "loss": 0.07, "num_input_tokens_seen": 40167264, "step": 18605 }, { "epoch": 3.035889070146819, "grad_norm": 3.590679883956909, "learning_rate": 4.3949226750333484e-05, "loss": 0.2646, "num_input_tokens_seen": 40178944, "step": 18610 }, { "epoch": 3.036704730831974, "grad_norm": 2.9252543449401855, "learning_rate": 4.3944582990164565e-05, "loss": 0.1848, "num_input_tokens_seen": 40190240, "step": 18615 }, { "epoch": 3.0375203915171287, "grad_norm": 4.0557146072387695, "learning_rate": 4.393993769425162e-05, "loss": 0.102, "num_input_tokens_seen": 40201696, "step": 18620 }, { "epoch": 3.038336052202284, "grad_norm": 0.2512882649898529, "learning_rate": 4.393529086297123e-05, "loss": 0.1558, "num_input_tokens_seen": 40211424, "step": 18625 }, { "epoch": 3.039151712887439, "grad_norm": 4.620949745178223, "learning_rate": 4.3930642496700084e-05, "loss": 0.0893, "num_input_tokens_seen": 40221888, "step": 18630 }, { "epoch": 3.0399673735725936, "grad_norm": 0.0515848733484745, "learning_rate": 4.392599259581501e-05, "loss": 0.0301, "num_input_tokens_seen": 40233120, "step": 18635 }, { "epoch": 3.040783034257749, "grad_norm": 4.30033540725708, "learning_rate": 4.392134116069294e-05, "loss": 0.1255, "num_input_tokens_seen": 40243104, "step": 18640 }, { "epoch": 3.0415986949429037, "grad_norm": 0.07945439964532852, "learning_rate": 4.391668819171095e-05, "loss": 0.1488, "num_input_tokens_seen": 40253344, "step": 18645 }, { "epoch": 3.0424143556280585, "grad_norm": 0.4420635402202606, "learning_rate": 4.391203368924623e-05, "loss": 0.2902, "num_input_tokens_seen": 40265280, "step": 18650 }, { "epoch": 3.0432300163132138, "grad_norm": 0.8691559433937073, "learning_rate": 4.3907377653676104e-05, "loss": 0.1047, "num_input_tokens_seen": 40276224, "step": 18655 }, { "epoch": 3.0440456769983686, "grad_norm": 0.17548291385173798, "learning_rate": 4.390272008537802e-05, "loss": 0.2017, "num_input_tokens_seen": 40287168, "step": 18660 }, { "epoch": 3.044861337683524, "grad_norm": 1.90533447265625, "learning_rate": 4.3898060984729526e-05, "loss": 0.0163, "num_input_tokens_seen": 40297184, "step": 18665 }, { "epoch": 3.0456769983686787, "grad_norm": 6.821253299713135, "learning_rate": 4.389340035210832e-05, "loss": 0.053, "num_input_tokens_seen": 40308128, "step": 18670 }, { "epoch": 3.0464926590538335, "grad_norm": 3.088578939437866, "learning_rate": 4.388873818789222e-05, "loss": 0.0741, "num_input_tokens_seen": 40320352, "step": 18675 }, { "epoch": 3.0473083197389887, "grad_norm": 0.9741634130477905, "learning_rate": 4.388407449245916e-05, "loss": 0.1692, "num_input_tokens_seen": 40331424, "step": 18680 }, { "epoch": 3.0481239804241436, "grad_norm": 1.4948443174362183, "learning_rate": 4.38794092661872e-05, "loss": 0.1048, "num_input_tokens_seen": 40343264, "step": 18685 }, { "epoch": 3.0489396411092984, "grad_norm": 5.092470169067383, "learning_rate": 4.3874742509454536e-05, "loss": 0.0909, "num_input_tokens_seen": 40356608, "step": 18690 }, { "epoch": 3.0497553017944536, "grad_norm": 1.0479038953781128, "learning_rate": 4.387007422263948e-05, "loss": 0.1159, "num_input_tokens_seen": 40367456, "step": 18695 }, { "epoch": 3.0505709624796085, "grad_norm": 5.6116719245910645, "learning_rate": 4.3865404406120456e-05, "loss": 0.0724, "num_input_tokens_seen": 40377728, "step": 18700 }, { "epoch": 3.0513866231647633, "grad_norm": 0.2902747690677643, "learning_rate": 4.3860733060276025e-05, "loss": 0.0591, "num_input_tokens_seen": 40389184, "step": 18705 }, { "epoch": 3.0522022838499185, "grad_norm": 3.059255838394165, "learning_rate": 4.385606018548488e-05, "loss": 0.0407, "num_input_tokens_seen": 40400512, "step": 18710 }, { "epoch": 3.0530179445350734, "grad_norm": 0.31856194138526917, "learning_rate": 4.385138578212582e-05, "loss": 0.0224, "num_input_tokens_seen": 40411008, "step": 18715 }, { "epoch": 3.053833605220228, "grad_norm": 5.904905319213867, "learning_rate": 4.384670985057778e-05, "loss": 0.1246, "num_input_tokens_seen": 40422528, "step": 18720 }, { "epoch": 3.0546492659053834, "grad_norm": 0.024655459448695183, "learning_rate": 4.3842032391219804e-05, "loss": 0.1097, "num_input_tokens_seen": 40432384, "step": 18725 }, { "epoch": 3.0554649265905383, "grad_norm": 0.06969005614519119, "learning_rate": 4.3837353404431086e-05, "loss": 0.2286, "num_input_tokens_seen": 40443424, "step": 18730 }, { "epoch": 3.0562805872756935, "grad_norm": 0.04249146208167076, "learning_rate": 4.383267289059092e-05, "loss": 0.0787, "num_input_tokens_seen": 40453760, "step": 18735 }, { "epoch": 3.0570962479608483, "grad_norm": 4.027807712554932, "learning_rate": 4.382799085007873e-05, "loss": 0.1105, "num_input_tokens_seen": 40464832, "step": 18740 }, { "epoch": 3.057911908646003, "grad_norm": 14.182214736938477, "learning_rate": 4.382330728327407e-05, "loss": 0.3135, "num_input_tokens_seen": 40475136, "step": 18745 }, { "epoch": 3.0587275693311584, "grad_norm": 2.4968860149383545, "learning_rate": 4.3818622190556624e-05, "loss": 0.2739, "num_input_tokens_seen": 40485856, "step": 18750 }, { "epoch": 3.0595432300163132, "grad_norm": 0.09472081065177917, "learning_rate": 4.381393557230617e-05, "loss": 0.153, "num_input_tokens_seen": 40497120, "step": 18755 }, { "epoch": 3.060358890701468, "grad_norm": 0.643191933631897, "learning_rate": 4.380924742890264e-05, "loss": 0.0556, "num_input_tokens_seen": 40508928, "step": 18760 }, { "epoch": 3.0611745513866233, "grad_norm": 0.0940224900841713, "learning_rate": 4.380455776072607e-05, "loss": 0.1985, "num_input_tokens_seen": 40519904, "step": 18765 }, { "epoch": 3.061990212071778, "grad_norm": 3.93025803565979, "learning_rate": 4.3799866568156634e-05, "loss": 0.248, "num_input_tokens_seen": 40529568, "step": 18770 }, { "epoch": 3.062805872756933, "grad_norm": 4.1050639152526855, "learning_rate": 4.379517385157463e-05, "loss": 0.119, "num_input_tokens_seen": 40540512, "step": 18775 }, { "epoch": 3.063621533442088, "grad_norm": 0.17521882057189941, "learning_rate": 4.3790479611360466e-05, "loss": 0.1883, "num_input_tokens_seen": 40550592, "step": 18780 }, { "epoch": 3.064437194127243, "grad_norm": 1.3301870822906494, "learning_rate": 4.378578384789469e-05, "loss": 0.0607, "num_input_tokens_seen": 40561824, "step": 18785 }, { "epoch": 3.065252854812398, "grad_norm": 0.2315429151058197, "learning_rate": 4.378108656155795e-05, "loss": 0.0135, "num_input_tokens_seen": 40572416, "step": 18790 }, { "epoch": 3.066068515497553, "grad_norm": 2.1123085021972656, "learning_rate": 4.377638775273104e-05, "loss": 0.2255, "num_input_tokens_seen": 40583712, "step": 18795 }, { "epoch": 3.066884176182708, "grad_norm": 5.286981105804443, "learning_rate": 4.3771687421794866e-05, "loss": 0.1539, "num_input_tokens_seen": 40594560, "step": 18800 }, { "epoch": 3.067699836867863, "grad_norm": 0.7339895963668823, "learning_rate": 4.3766985569130465e-05, "loss": 0.0547, "num_input_tokens_seen": 40605536, "step": 18805 }, { "epoch": 3.068515497553018, "grad_norm": 0.6658563613891602, "learning_rate": 4.376228219511899e-05, "loss": 0.0417, "num_input_tokens_seen": 40615328, "step": 18810 }, { "epoch": 3.069331158238173, "grad_norm": 4.639886379241943, "learning_rate": 4.375757730014172e-05, "loss": 0.0324, "num_input_tokens_seen": 40625504, "step": 18815 }, { "epoch": 3.070146818923328, "grad_norm": 2.805645704269409, "learning_rate": 4.3752870884580065e-05, "loss": 0.1696, "num_input_tokens_seen": 40636896, "step": 18820 }, { "epoch": 3.070962479608483, "grad_norm": 5.8893256187438965, "learning_rate": 4.374816294881554e-05, "loss": 0.1972, "num_input_tokens_seen": 40647456, "step": 18825 }, { "epoch": 3.0717781402936377, "grad_norm": 3.609346628189087, "learning_rate": 4.37434534932298e-05, "loss": 0.1405, "num_input_tokens_seen": 40656832, "step": 18830 }, { "epoch": 3.072593800978793, "grad_norm": 6.025660037994385, "learning_rate": 4.373874251820462e-05, "loss": 0.0531, "num_input_tokens_seen": 40666752, "step": 18835 }, { "epoch": 3.073409461663948, "grad_norm": 4.759237289428711, "learning_rate": 4.3734030024121886e-05, "loss": 0.0388, "num_input_tokens_seen": 40678368, "step": 18840 }, { "epoch": 3.0742251223491026, "grad_norm": 4.019664764404297, "learning_rate": 4.372931601136363e-05, "loss": 0.268, "num_input_tokens_seen": 40689632, "step": 18845 }, { "epoch": 3.075040783034258, "grad_norm": 0.5742751359939575, "learning_rate": 4.372460048031198e-05, "loss": 0.023, "num_input_tokens_seen": 40700576, "step": 18850 }, { "epoch": 3.0758564437194127, "grad_norm": 2.622826099395752, "learning_rate": 4.3719883431349206e-05, "loss": 0.1285, "num_input_tokens_seen": 40711488, "step": 18855 }, { "epoch": 3.0766721044045675, "grad_norm": 0.3004291355609894, "learning_rate": 4.3715164864857705e-05, "loss": 0.1862, "num_input_tokens_seen": 40720832, "step": 18860 }, { "epoch": 3.0774877650897228, "grad_norm": 5.004072666168213, "learning_rate": 4.371044478121998e-05, "loss": 0.1355, "num_input_tokens_seen": 40731200, "step": 18865 }, { "epoch": 3.0783034257748776, "grad_norm": 2.9772660732269287, "learning_rate": 4.370572318081866e-05, "loss": 0.0598, "num_input_tokens_seen": 40742432, "step": 18870 }, { "epoch": 3.0791190864600324, "grad_norm": 0.14002737402915955, "learning_rate": 4.3701000064036504e-05, "loss": 0.2778, "num_input_tokens_seen": 40753856, "step": 18875 }, { "epoch": 3.0799347471451877, "grad_norm": 1.761139988899231, "learning_rate": 4.3696275431256405e-05, "loss": 0.0989, "num_input_tokens_seen": 40762976, "step": 18880 }, { "epoch": 3.0807504078303425, "grad_norm": 3.3624911308288574, "learning_rate": 4.369154928286134e-05, "loss": 0.1211, "num_input_tokens_seen": 40773504, "step": 18885 }, { "epoch": 3.0815660685154977, "grad_norm": 0.22525611519813538, "learning_rate": 4.368682161923447e-05, "loss": 0.0058, "num_input_tokens_seen": 40784576, "step": 18890 }, { "epoch": 3.0823817292006526, "grad_norm": 0.24241799116134644, "learning_rate": 4.368209244075901e-05, "loss": 0.0236, "num_input_tokens_seen": 40794240, "step": 18895 }, { "epoch": 3.0831973898858074, "grad_norm": 0.05112004280090332, "learning_rate": 4.3677361747818355e-05, "loss": 0.0441, "num_input_tokens_seen": 40805600, "step": 18900 }, { "epoch": 3.0840130505709626, "grad_norm": 0.057318106293678284, "learning_rate": 4.3672629540795976e-05, "loss": 0.0868, "num_input_tokens_seen": 40816352, "step": 18905 }, { "epoch": 3.0848287112561175, "grad_norm": 0.14517413079738617, "learning_rate": 4.366789582007551e-05, "loss": 0.0669, "num_input_tokens_seen": 40826720, "step": 18910 }, { "epoch": 3.0856443719412723, "grad_norm": 3.7802202701568604, "learning_rate": 4.366316058604069e-05, "loss": 0.2246, "num_input_tokens_seen": 40837216, "step": 18915 }, { "epoch": 3.0864600326264275, "grad_norm": 0.45749226212501526, "learning_rate": 4.3658423839075376e-05, "loss": 0.0127, "num_input_tokens_seen": 40847904, "step": 18920 }, { "epoch": 3.0872756933115824, "grad_norm": 0.07833077013492584, "learning_rate": 4.3653685579563555e-05, "loss": 0.1618, "num_input_tokens_seen": 40858304, "step": 18925 }, { "epoch": 3.088091353996737, "grad_norm": 0.14516383409500122, "learning_rate": 4.364894580788932e-05, "loss": 0.0151, "num_input_tokens_seen": 40869088, "step": 18930 }, { "epoch": 3.0889070146818924, "grad_norm": 0.24459004402160645, "learning_rate": 4.364420452443693e-05, "loss": 0.0282, "num_input_tokens_seen": 40878976, "step": 18935 }, { "epoch": 3.0897226753670473, "grad_norm": 6.220860958099365, "learning_rate": 4.363946172959071e-05, "loss": 0.1223, "num_input_tokens_seen": 40890144, "step": 18940 }, { "epoch": 3.090538336052202, "grad_norm": 9.879328727722168, "learning_rate": 4.363471742373516e-05, "loss": 0.132, "num_input_tokens_seen": 40900448, "step": 18945 }, { "epoch": 3.0913539967373573, "grad_norm": 0.04250815510749817, "learning_rate": 4.3629971607254855e-05, "loss": 0.0092, "num_input_tokens_seen": 40910880, "step": 18950 }, { "epoch": 3.092169657422512, "grad_norm": 0.06580474227666855, "learning_rate": 4.362522428053453e-05, "loss": 0.0124, "num_input_tokens_seen": 40922752, "step": 18955 }, { "epoch": 3.0929853181076674, "grad_norm": 5.778861045837402, "learning_rate": 4.3620475443959016e-05, "loss": 0.1461, "num_input_tokens_seen": 40932640, "step": 18960 }, { "epoch": 3.0938009787928222, "grad_norm": 1.215570330619812, "learning_rate": 4.361572509791328e-05, "loss": 0.1187, "num_input_tokens_seen": 40943264, "step": 18965 }, { "epoch": 3.094616639477977, "grad_norm": 0.7755405306816101, "learning_rate": 4.361097324278242e-05, "loss": 0.0735, "num_input_tokens_seen": 40953664, "step": 18970 }, { "epoch": 3.0954323001631323, "grad_norm": 6.451382637023926, "learning_rate": 4.3606219878951623e-05, "loss": 0.1979, "num_input_tokens_seen": 40964544, "step": 18975 }, { "epoch": 3.096247960848287, "grad_norm": 0.0326329804956913, "learning_rate": 4.360146500680625e-05, "loss": 0.0084, "num_input_tokens_seen": 40975520, "step": 18980 }, { "epoch": 3.097063621533442, "grad_norm": 4.4509735107421875, "learning_rate": 4.3596708626731744e-05, "loss": 0.0525, "num_input_tokens_seen": 40985728, "step": 18985 }, { "epoch": 3.097879282218597, "grad_norm": 0.12521952390670776, "learning_rate": 4.359195073911367e-05, "loss": 0.1533, "num_input_tokens_seen": 40996064, "step": 18990 }, { "epoch": 3.098694942903752, "grad_norm": 0.04597676545381546, "learning_rate": 4.3587191344337735e-05, "loss": 0.0982, "num_input_tokens_seen": 41007456, "step": 18995 }, { "epoch": 3.099510603588907, "grad_norm": 4.002562522888184, "learning_rate": 4.358243044278976e-05, "loss": 0.2087, "num_input_tokens_seen": 41017792, "step": 19000 }, { "epoch": 3.100326264274062, "grad_norm": 0.048084646463394165, "learning_rate": 4.3577668034855684e-05, "loss": 0.0743, "num_input_tokens_seen": 41028512, "step": 19005 }, { "epoch": 3.101141924959217, "grad_norm": 0.241954043507576, "learning_rate": 4.357290412092158e-05, "loss": 0.0317, "num_input_tokens_seen": 41039520, "step": 19010 }, { "epoch": 3.1019575856443717, "grad_norm": 0.12843763828277588, "learning_rate": 4.356813870137363e-05, "loss": 0.2005, "num_input_tokens_seen": 41050336, "step": 19015 }, { "epoch": 3.102773246329527, "grad_norm": 0.0742432102560997, "learning_rate": 4.356337177659814e-05, "loss": 0.0028, "num_input_tokens_seen": 41061088, "step": 19020 }, { "epoch": 3.103588907014682, "grad_norm": 0.04520115256309509, "learning_rate": 4.355860334698154e-05, "loss": 0.2471, "num_input_tokens_seen": 41071680, "step": 19025 }, { "epoch": 3.104404567699837, "grad_norm": 0.03389989584684372, "learning_rate": 4.3553833412910395e-05, "loss": 0.1464, "num_input_tokens_seen": 41083744, "step": 19030 }, { "epoch": 3.105220228384992, "grad_norm": 3.7635552883148193, "learning_rate": 4.354906197477137e-05, "loss": 0.1775, "num_input_tokens_seen": 41094688, "step": 19035 }, { "epoch": 3.1060358890701467, "grad_norm": 0.04354064539074898, "learning_rate": 4.354428903295126e-05, "loss": 0.2955, "num_input_tokens_seen": 41106656, "step": 19040 }, { "epoch": 3.106851549755302, "grad_norm": 0.1443745642900467, "learning_rate": 4.353951458783699e-05, "loss": 0.1355, "num_input_tokens_seen": 41117280, "step": 19045 }, { "epoch": 3.107667210440457, "grad_norm": 0.5298668146133423, "learning_rate": 4.3534738639815606e-05, "loss": 0.1538, "num_input_tokens_seen": 41128064, "step": 19050 }, { "epoch": 3.1084828711256116, "grad_norm": 3.103580951690674, "learning_rate": 4.352996118927426e-05, "loss": 0.1054, "num_input_tokens_seen": 41137440, "step": 19055 }, { "epoch": 3.109298531810767, "grad_norm": 1.0356005430221558, "learning_rate": 4.3525182236600235e-05, "loss": 0.0878, "num_input_tokens_seen": 41147648, "step": 19060 }, { "epoch": 3.1101141924959217, "grad_norm": 2.1156842708587646, "learning_rate": 4.3520401782180954e-05, "loss": 0.0645, "num_input_tokens_seen": 41158688, "step": 19065 }, { "epoch": 3.1109298531810765, "grad_norm": 0.21462346613407135, "learning_rate": 4.351561982640392e-05, "loss": 0.0059, "num_input_tokens_seen": 41168160, "step": 19070 }, { "epoch": 3.1117455138662318, "grad_norm": 6.509387493133545, "learning_rate": 4.35108363696568e-05, "loss": 0.0507, "num_input_tokens_seen": 41179936, "step": 19075 }, { "epoch": 3.1125611745513866, "grad_norm": 0.0804077610373497, "learning_rate": 4.3506051412327364e-05, "loss": 0.1488, "num_input_tokens_seen": 41190528, "step": 19080 }, { "epoch": 3.1133768352365414, "grad_norm": 0.2413562536239624, "learning_rate": 4.3501264954803495e-05, "loss": 0.0673, "num_input_tokens_seen": 41200128, "step": 19085 }, { "epoch": 3.1141924959216967, "grad_norm": 3.4854648113250732, "learning_rate": 4.3496476997473216e-05, "loss": 0.1053, "num_input_tokens_seen": 41211040, "step": 19090 }, { "epoch": 3.1150081566068515, "grad_norm": 1.6438874006271362, "learning_rate": 4.349168754072467e-05, "loss": 0.3506, "num_input_tokens_seen": 41223392, "step": 19095 }, { "epoch": 3.1158238172920063, "grad_norm": 3.393418312072754, "learning_rate": 4.3486896584946094e-05, "loss": 0.0614, "num_input_tokens_seen": 41233344, "step": 19100 }, { "epoch": 3.1166394779771616, "grad_norm": 3.376964807510376, "learning_rate": 4.348210413052589e-05, "loss": 0.3814, "num_input_tokens_seen": 41244512, "step": 19105 }, { "epoch": 3.1174551386623164, "grad_norm": 2.494851589202881, "learning_rate": 4.3477310177852537e-05, "loss": 0.168, "num_input_tokens_seen": 41255360, "step": 19110 }, { "epoch": 3.1182707993474716, "grad_norm": 4.940349102020264, "learning_rate": 4.347251472731467e-05, "loss": 0.0301, "num_input_tokens_seen": 41265856, "step": 19115 }, { "epoch": 3.1190864600326265, "grad_norm": 0.06120862811803818, "learning_rate": 4.3467717779301046e-05, "loss": 0.1862, "num_input_tokens_seen": 41277216, "step": 19120 }, { "epoch": 3.1199021207177813, "grad_norm": 9.058176040649414, "learning_rate": 4.3462919334200494e-05, "loss": 0.4465, "num_input_tokens_seen": 41288256, "step": 19125 }, { "epoch": 3.1207177814029365, "grad_norm": 0.17716369032859802, "learning_rate": 4.345811939240203e-05, "loss": 0.0919, "num_input_tokens_seen": 41298784, "step": 19130 }, { "epoch": 3.1215334420880914, "grad_norm": 0.2572365403175354, "learning_rate": 4.3453317954294755e-05, "loss": 0.1556, "num_input_tokens_seen": 41309792, "step": 19135 }, { "epoch": 3.122349102773246, "grad_norm": 2.621544122695923, "learning_rate": 4.3448515020267896e-05, "loss": 0.1619, "num_input_tokens_seen": 41320320, "step": 19140 }, { "epoch": 3.1231647634584014, "grad_norm": 2.794127941131592, "learning_rate": 4.3443710590710795e-05, "loss": 0.0703, "num_input_tokens_seen": 41331008, "step": 19145 }, { "epoch": 3.1239804241435563, "grad_norm": 0.18797270953655243, "learning_rate": 4.343890466601294e-05, "loss": 0.2203, "num_input_tokens_seen": 41341888, "step": 19150 }, { "epoch": 3.124796084828711, "grad_norm": 0.22668831050395966, "learning_rate": 4.343409724656391e-05, "loss": 0.1158, "num_input_tokens_seen": 41351840, "step": 19155 }, { "epoch": 3.1256117455138663, "grad_norm": 0.4405692219734192, "learning_rate": 4.342928833275341e-05, "loss": 0.0139, "num_input_tokens_seen": 41362592, "step": 19160 }, { "epoch": 3.126427406199021, "grad_norm": 0.5118138194084167, "learning_rate": 4.342447792497131e-05, "loss": 0.1122, "num_input_tokens_seen": 41372704, "step": 19165 }, { "epoch": 3.1272430668841764, "grad_norm": 0.46270260214805603, "learning_rate": 4.341966602360754e-05, "loss": 0.134, "num_input_tokens_seen": 41383584, "step": 19170 }, { "epoch": 3.1280587275693312, "grad_norm": 3.503925323486328, "learning_rate": 4.3414852629052175e-05, "loss": 0.351, "num_input_tokens_seen": 41394976, "step": 19175 }, { "epoch": 3.128874388254486, "grad_norm": 6.204897880554199, "learning_rate": 4.341003774169542e-05, "loss": 0.1614, "num_input_tokens_seen": 41406016, "step": 19180 }, { "epoch": 3.1296900489396413, "grad_norm": 0.08034936338663101, "learning_rate": 4.34052213619276e-05, "loss": 0.0253, "num_input_tokens_seen": 41417376, "step": 19185 }, { "epoch": 3.130505709624796, "grad_norm": 1.7257933616638184, "learning_rate": 4.340040349013915e-05, "loss": 0.2314, "num_input_tokens_seen": 41428416, "step": 19190 }, { "epoch": 3.131321370309951, "grad_norm": 6.101888656616211, "learning_rate": 4.3395584126720626e-05, "loss": 0.1961, "num_input_tokens_seen": 41439552, "step": 19195 }, { "epoch": 3.132137030995106, "grad_norm": 0.16559572517871857, "learning_rate": 4.339076327206272e-05, "loss": 0.0191, "num_input_tokens_seen": 41448928, "step": 19200 }, { "epoch": 3.132952691680261, "grad_norm": 0.6203173398971558, "learning_rate": 4.338594092655622e-05, "loss": 0.0534, "num_input_tokens_seen": 41460096, "step": 19205 }, { "epoch": 3.133768352365416, "grad_norm": 5.801191329956055, "learning_rate": 4.338111709059206e-05, "loss": 0.1536, "num_input_tokens_seen": 41469696, "step": 19210 }, { "epoch": 3.134584013050571, "grad_norm": 0.12250760942697525, "learning_rate": 4.337629176456129e-05, "loss": 0.0289, "num_input_tokens_seen": 41480224, "step": 19215 }, { "epoch": 3.135399673735726, "grad_norm": 0.2107073813676834, "learning_rate": 4.337146494885507e-05, "loss": 0.1377, "num_input_tokens_seen": 41490176, "step": 19220 }, { "epoch": 3.1362153344208807, "grad_norm": 3.8480005264282227, "learning_rate": 4.336663664386468e-05, "loss": 0.1146, "num_input_tokens_seen": 41500928, "step": 19225 }, { "epoch": 3.137030995106036, "grad_norm": 0.11215127259492874, "learning_rate": 4.3361806849981535e-05, "loss": 0.0617, "num_input_tokens_seen": 41512608, "step": 19230 }, { "epoch": 3.137846655791191, "grad_norm": 0.7500024437904358, "learning_rate": 4.335697556759716e-05, "loss": 0.0204, "num_input_tokens_seen": 41522880, "step": 19235 }, { "epoch": 3.1386623164763456, "grad_norm": 0.3657032549381256, "learning_rate": 4.3352142797103204e-05, "loss": 0.1502, "num_input_tokens_seen": 41533632, "step": 19240 }, { "epoch": 3.139477977161501, "grad_norm": 3.9524943828582764, "learning_rate": 4.334730853889143e-05, "loss": 0.0479, "num_input_tokens_seen": 41546112, "step": 19245 }, { "epoch": 3.1402936378466557, "grad_norm": 4.626284599304199, "learning_rate": 4.3342472793353736e-05, "loss": 0.0369, "num_input_tokens_seen": 41557376, "step": 19250 }, { "epoch": 3.141109298531811, "grad_norm": 1.2742774486541748, "learning_rate": 4.333763556088213e-05, "loss": 0.2307, "num_input_tokens_seen": 41568160, "step": 19255 }, { "epoch": 3.141924959216966, "grad_norm": 0.5053208470344543, "learning_rate": 4.333279684186874e-05, "loss": 0.162, "num_input_tokens_seen": 41577696, "step": 19260 }, { "epoch": 3.1427406199021206, "grad_norm": 0.1496593952178955, "learning_rate": 4.332795663670581e-05, "loss": 0.0919, "num_input_tokens_seen": 41589088, "step": 19265 }, { "epoch": 3.143556280587276, "grad_norm": 2.1042985916137695, "learning_rate": 4.332311494578573e-05, "loss": 0.096, "num_input_tokens_seen": 41599904, "step": 19270 }, { "epoch": 3.1443719412724307, "grad_norm": 0.07737638801336288, "learning_rate": 4.331827176950098e-05, "loss": 0.0157, "num_input_tokens_seen": 41610784, "step": 19275 }, { "epoch": 3.1451876019575855, "grad_norm": 0.11145178228616714, "learning_rate": 4.3313427108244175e-05, "loss": 0.1237, "num_input_tokens_seen": 41620768, "step": 19280 }, { "epoch": 3.1460032626427408, "grad_norm": 0.1373949497938156, "learning_rate": 4.330858096240804e-05, "loss": 0.0953, "num_input_tokens_seen": 41632384, "step": 19285 }, { "epoch": 3.1468189233278956, "grad_norm": 3.2667558193206787, "learning_rate": 4.3303733332385446e-05, "loss": 0.136, "num_input_tokens_seen": 41643680, "step": 19290 }, { "epoch": 3.1476345840130504, "grad_norm": 2.046661615371704, "learning_rate": 4.329888421856936e-05, "loss": 0.1643, "num_input_tokens_seen": 41655424, "step": 19295 }, { "epoch": 3.1484502446982057, "grad_norm": 0.5549253225326538, "learning_rate": 4.3294033621352856e-05, "loss": 0.0374, "num_input_tokens_seen": 41666144, "step": 19300 }, { "epoch": 3.1492659053833605, "grad_norm": 5.20822286605835, "learning_rate": 4.3289181541129174e-05, "loss": 0.0326, "num_input_tokens_seen": 41677312, "step": 19305 }, { "epoch": 3.1500815660685153, "grad_norm": 4.700889587402344, "learning_rate": 4.328432797829164e-05, "loss": 0.1031, "num_input_tokens_seen": 41689344, "step": 19310 }, { "epoch": 3.1508972267536706, "grad_norm": 0.08882571756839752, "learning_rate": 4.3279472933233696e-05, "loss": 0.0107, "num_input_tokens_seen": 41700032, "step": 19315 }, { "epoch": 3.1517128874388254, "grad_norm": 0.04364703595638275, "learning_rate": 4.327461640634893e-05, "loss": 0.0488, "num_input_tokens_seen": 41711040, "step": 19320 }, { "epoch": 3.15252854812398, "grad_norm": 0.1867649257183075, "learning_rate": 4.3269758398031037e-05, "loss": 0.1591, "num_input_tokens_seen": 41721184, "step": 19325 }, { "epoch": 3.1533442088091355, "grad_norm": 2.5056307315826416, "learning_rate": 4.3264898908673826e-05, "loss": 0.2196, "num_input_tokens_seen": 41731616, "step": 19330 }, { "epoch": 3.1541598694942903, "grad_norm": 0.2886751890182495, "learning_rate": 4.3260037938671237e-05, "loss": 0.1046, "num_input_tokens_seen": 41741568, "step": 19335 }, { "epoch": 3.1549755301794455, "grad_norm": 4.4228081703186035, "learning_rate": 4.325517548841732e-05, "loss": 0.0868, "num_input_tokens_seen": 41752512, "step": 19340 }, { "epoch": 3.1557911908646004, "grad_norm": 4.792253017425537, "learning_rate": 4.3250311558306255e-05, "loss": 0.1609, "num_input_tokens_seen": 41763040, "step": 19345 }, { "epoch": 3.156606851549755, "grad_norm": 0.13030284643173218, "learning_rate": 4.324544614873233e-05, "loss": 0.3224, "num_input_tokens_seen": 41773472, "step": 19350 }, { "epoch": 3.1574225122349104, "grad_norm": 4.019415855407715, "learning_rate": 4.324057926008997e-05, "loss": 0.0577, "num_input_tokens_seen": 41784096, "step": 19355 }, { "epoch": 3.1582381729200653, "grad_norm": 0.10057320445775986, "learning_rate": 4.323571089277369e-05, "loss": 0.0088, "num_input_tokens_seen": 41795616, "step": 19360 }, { "epoch": 3.15905383360522, "grad_norm": 0.12926925718784332, "learning_rate": 4.3230841047178175e-05, "loss": 0.005, "num_input_tokens_seen": 41806784, "step": 19365 }, { "epoch": 3.1598694942903753, "grad_norm": 6.900053024291992, "learning_rate": 4.3225969723698165e-05, "loss": 0.1477, "num_input_tokens_seen": 41817920, "step": 19370 }, { "epoch": 3.16068515497553, "grad_norm": 2.3402750492095947, "learning_rate": 4.322109692272858e-05, "loss": 0.227, "num_input_tokens_seen": 41827520, "step": 19375 }, { "epoch": 3.161500815660685, "grad_norm": 0.36514484882354736, "learning_rate": 4.321622264466443e-05, "loss": 0.1202, "num_input_tokens_seen": 41838496, "step": 19380 }, { "epoch": 3.1623164763458402, "grad_norm": 2.110487699508667, "learning_rate": 4.321134688990084e-05, "loss": 0.1345, "num_input_tokens_seen": 41848800, "step": 19385 }, { "epoch": 3.163132137030995, "grad_norm": 0.2392733246088028, "learning_rate": 4.320646965883307e-05, "loss": 0.2938, "num_input_tokens_seen": 41859616, "step": 19390 }, { "epoch": 3.1639477977161503, "grad_norm": 0.6725291013717651, "learning_rate": 4.320159095185648e-05, "loss": 0.1144, "num_input_tokens_seen": 41870432, "step": 19395 }, { "epoch": 3.164763458401305, "grad_norm": 1.0118898153305054, "learning_rate": 4.3196710769366585e-05, "loss": 0.1588, "num_input_tokens_seen": 41880960, "step": 19400 }, { "epoch": 3.16557911908646, "grad_norm": 3.306779384613037, "learning_rate": 4.3191829111758985e-05, "loss": 0.3511, "num_input_tokens_seen": 41891456, "step": 19405 }, { "epoch": 3.166394779771615, "grad_norm": 2.330289602279663, "learning_rate": 4.318694597942941e-05, "loss": 0.0267, "num_input_tokens_seen": 41903040, "step": 19410 }, { "epoch": 3.16721044045677, "grad_norm": 3.969630718231201, "learning_rate": 4.318206137277372e-05, "loss": 0.148, "num_input_tokens_seen": 41914272, "step": 19415 }, { "epoch": 3.168026101141925, "grad_norm": 2.5971837043762207, "learning_rate": 4.317717529218788e-05, "loss": 0.2292, "num_input_tokens_seen": 41925184, "step": 19420 }, { "epoch": 3.16884176182708, "grad_norm": 3.305995225906372, "learning_rate": 4.317228773806797e-05, "loss": 0.2062, "num_input_tokens_seen": 41936288, "step": 19425 }, { "epoch": 3.169657422512235, "grad_norm": 2.1929593086242676, "learning_rate": 4.316739871081021e-05, "loss": 0.0966, "num_input_tokens_seen": 41945984, "step": 19430 }, { "epoch": 3.1704730831973897, "grad_norm": 1.9077894687652588, "learning_rate": 4.3162508210810936e-05, "loss": 0.3034, "num_input_tokens_seen": 41956256, "step": 19435 }, { "epoch": 3.171288743882545, "grad_norm": 1.9926167726516724, "learning_rate": 4.31576162384666e-05, "loss": 0.0752, "num_input_tokens_seen": 41967040, "step": 19440 }, { "epoch": 3.1721044045677, "grad_norm": 0.08837661892175674, "learning_rate": 4.315272279417375e-05, "loss": 0.0368, "num_input_tokens_seen": 41977664, "step": 19445 }, { "epoch": 3.1729200652528546, "grad_norm": 12.569263458251953, "learning_rate": 4.314782787832908e-05, "loss": 0.1588, "num_input_tokens_seen": 41988384, "step": 19450 }, { "epoch": 3.17373572593801, "grad_norm": 6.0814690589904785, "learning_rate": 4.314293149132941e-05, "loss": 0.3066, "num_input_tokens_seen": 42000576, "step": 19455 }, { "epoch": 3.1745513866231647, "grad_norm": 0.45871320366859436, "learning_rate": 4.313803363357166e-05, "loss": 0.0765, "num_input_tokens_seen": 42010176, "step": 19460 }, { "epoch": 3.1753670473083195, "grad_norm": 0.300508052110672, "learning_rate": 4.313313430545286e-05, "loss": 0.081, "num_input_tokens_seen": 42021248, "step": 19465 }, { "epoch": 3.176182707993475, "grad_norm": 0.1853676736354828, "learning_rate": 4.3128233507370196e-05, "loss": 0.0504, "num_input_tokens_seen": 42031712, "step": 19470 }, { "epoch": 3.1769983686786296, "grad_norm": 0.3095267117023468, "learning_rate": 4.312333123972094e-05, "loss": 0.1787, "num_input_tokens_seen": 42042848, "step": 19475 }, { "epoch": 3.177814029363785, "grad_norm": 2.2687737941741943, "learning_rate": 4.31184275029025e-05, "loss": 0.1077, "num_input_tokens_seen": 42053856, "step": 19480 }, { "epoch": 3.1786296900489397, "grad_norm": 2.702927589416504, "learning_rate": 4.311352229731239e-05, "loss": 0.1128, "num_input_tokens_seen": 42064864, "step": 19485 }, { "epoch": 3.1794453507340945, "grad_norm": 2.2455344200134277, "learning_rate": 4.310861562334826e-05, "loss": 0.1222, "num_input_tokens_seen": 42076864, "step": 19490 }, { "epoch": 3.1802610114192498, "grad_norm": 0.5983144044876099, "learning_rate": 4.310370748140786e-05, "loss": 0.0858, "num_input_tokens_seen": 42086592, "step": 19495 }, { "epoch": 3.1810766721044046, "grad_norm": 0.12773123383522034, "learning_rate": 4.3098797871889075e-05, "loss": 0.1902, "num_input_tokens_seen": 42096736, "step": 19500 }, { "epoch": 3.1818923327895594, "grad_norm": 0.14167964458465576, "learning_rate": 4.30938867951899e-05, "loss": 0.1249, "num_input_tokens_seen": 42106592, "step": 19505 }, { "epoch": 3.1827079934747147, "grad_norm": 0.3585329055786133, "learning_rate": 4.308897425170846e-05, "loss": 0.0828, "num_input_tokens_seen": 42117024, "step": 19510 }, { "epoch": 3.1835236541598695, "grad_norm": 3.088606834411621, "learning_rate": 4.3084060241842984e-05, "loss": 0.0701, "num_input_tokens_seen": 42127776, "step": 19515 }, { "epoch": 3.1843393148450243, "grad_norm": 0.22292488813400269, "learning_rate": 4.307914476599182e-05, "loss": 0.0281, "num_input_tokens_seen": 42138976, "step": 19520 }, { "epoch": 3.1851549755301796, "grad_norm": 2.7585763931274414, "learning_rate": 4.307422782455346e-05, "loss": 0.0254, "num_input_tokens_seen": 42150752, "step": 19525 }, { "epoch": 3.1859706362153344, "grad_norm": 0.4058837592601776, "learning_rate": 4.306930941792648e-05, "loss": 0.0201, "num_input_tokens_seen": 42161696, "step": 19530 }, { "epoch": 3.186786296900489, "grad_norm": 3.5705759525299072, "learning_rate": 4.3064389546509585e-05, "loss": 0.1497, "num_input_tokens_seen": 42173312, "step": 19535 }, { "epoch": 3.1876019575856445, "grad_norm": 0.3486447334289551, "learning_rate": 4.305946821070163e-05, "loss": 0.2562, "num_input_tokens_seen": 42185344, "step": 19540 }, { "epoch": 3.1884176182707993, "grad_norm": 0.2385052740573883, "learning_rate": 4.3054545410901547e-05, "loss": 0.0955, "num_input_tokens_seen": 42196480, "step": 19545 }, { "epoch": 3.189233278955954, "grad_norm": 0.35330891609191895, "learning_rate": 4.30496211475084e-05, "loss": 0.0127, "num_input_tokens_seen": 42206624, "step": 19550 }, { "epoch": 3.1900489396411094, "grad_norm": 3.7042102813720703, "learning_rate": 4.3044695420921386e-05, "loss": 0.1845, "num_input_tokens_seen": 42218688, "step": 19555 }, { "epoch": 3.190864600326264, "grad_norm": 0.06464269757270813, "learning_rate": 4.30397682315398e-05, "loss": 0.0134, "num_input_tokens_seen": 42228896, "step": 19560 }, { "epoch": 3.1916802610114194, "grad_norm": 0.25477996468544006, "learning_rate": 4.303483957976306e-05, "loss": 0.2881, "num_input_tokens_seen": 42239424, "step": 19565 }, { "epoch": 3.1924959216965743, "grad_norm": 0.243189737200737, "learning_rate": 4.302990946599073e-05, "loss": 0.2144, "num_input_tokens_seen": 42250272, "step": 19570 }, { "epoch": 3.193311582381729, "grad_norm": 3.361558437347412, "learning_rate": 4.302497789062245e-05, "loss": 0.0704, "num_input_tokens_seen": 42261024, "step": 19575 }, { "epoch": 3.1941272430668843, "grad_norm": 0.07526399940252304, "learning_rate": 4.3020044854058e-05, "loss": 0.0393, "num_input_tokens_seen": 42271680, "step": 19580 }, { "epoch": 3.194942903752039, "grad_norm": 0.07639128714799881, "learning_rate": 4.3015110356697285e-05, "loss": 0.0525, "num_input_tokens_seen": 42281792, "step": 19585 }, { "epoch": 3.195758564437194, "grad_norm": 0.09000908583402634, "learning_rate": 4.301017439894032e-05, "loss": 0.0096, "num_input_tokens_seen": 42292128, "step": 19590 }, { "epoch": 3.1965742251223492, "grad_norm": 0.6561002135276794, "learning_rate": 4.300523698118722e-05, "loss": 0.0995, "num_input_tokens_seen": 42303232, "step": 19595 }, { "epoch": 3.197389885807504, "grad_norm": 0.0983612909913063, "learning_rate": 4.3000298103838274e-05, "loss": 0.0148, "num_input_tokens_seen": 42313056, "step": 19600 }, { "epoch": 3.198205546492659, "grad_norm": 0.11739430576562881, "learning_rate": 4.299535776729382e-05, "loss": 0.0558, "num_input_tokens_seen": 42324320, "step": 19605 }, { "epoch": 3.199021207177814, "grad_norm": 0.13181328773498535, "learning_rate": 4.2990415971954364e-05, "loss": 0.0366, "num_input_tokens_seen": 42332800, "step": 19610 }, { "epoch": 3.199836867862969, "grad_norm": 3.944260358810425, "learning_rate": 4.29854727182205e-05, "loss": 0.379, "num_input_tokens_seen": 42344448, "step": 19615 }, { "epoch": 3.200652528548124, "grad_norm": 0.08940883725881577, "learning_rate": 4.298052800649296e-05, "loss": 0.0591, "num_input_tokens_seen": 42355520, "step": 19620 }, { "epoch": 3.201468189233279, "grad_norm": 1.6371586322784424, "learning_rate": 4.297558183717259e-05, "loss": 0.0875, "num_input_tokens_seen": 42366656, "step": 19625 }, { "epoch": 3.202283849918434, "grad_norm": 0.17973874509334564, "learning_rate": 4.297063421066035e-05, "loss": 0.0371, "num_input_tokens_seen": 42377824, "step": 19630 }, { "epoch": 3.203099510603589, "grad_norm": 7.309290409088135, "learning_rate": 4.296568512735732e-05, "loss": 0.0664, "num_input_tokens_seen": 42388064, "step": 19635 }, { "epoch": 3.203915171288744, "grad_norm": 0.04178430140018463, "learning_rate": 4.29607345876647e-05, "loss": 0.1628, "num_input_tokens_seen": 42399648, "step": 19640 }, { "epoch": 3.2047308319738987, "grad_norm": 0.022029142826795578, "learning_rate": 4.2955782591983795e-05, "loss": 0.0573, "num_input_tokens_seen": 42409024, "step": 19645 }, { "epoch": 3.205546492659054, "grad_norm": 0.0897393524646759, "learning_rate": 4.295082914071604e-05, "loss": 0.0048, "num_input_tokens_seen": 42419552, "step": 19650 }, { "epoch": 3.206362153344209, "grad_norm": 4.748601913452148, "learning_rate": 4.294587423426301e-05, "loss": 0.2774, "num_input_tokens_seen": 42428992, "step": 19655 }, { "epoch": 3.2071778140293636, "grad_norm": 0.10691147297620773, "learning_rate": 4.294091787302634e-05, "loss": 0.0772, "num_input_tokens_seen": 42440736, "step": 19660 }, { "epoch": 3.207993474714519, "grad_norm": 0.3394257724285126, "learning_rate": 4.2935960057407855e-05, "loss": 0.0965, "num_input_tokens_seen": 42451168, "step": 19665 }, { "epoch": 3.2088091353996737, "grad_norm": 5.757011890411377, "learning_rate": 4.2931000787809426e-05, "loss": 0.0841, "num_input_tokens_seen": 42462528, "step": 19670 }, { "epoch": 3.2096247960848285, "grad_norm": 0.08598591387271881, "learning_rate": 4.29260400646331e-05, "loss": 0.0393, "num_input_tokens_seen": 42471648, "step": 19675 }, { "epoch": 3.210440456769984, "grad_norm": 1.1945375204086304, "learning_rate": 4.2921077888281014e-05, "loss": 0.0381, "num_input_tokens_seen": 42482944, "step": 19680 }, { "epoch": 3.2112561174551386, "grad_norm": 2.91871976852417, "learning_rate": 4.2916114259155414e-05, "loss": 0.1889, "num_input_tokens_seen": 42493152, "step": 19685 }, { "epoch": 3.2120717781402934, "grad_norm": 0.3258408010005951, "learning_rate": 4.291114917765869e-05, "loss": 0.0159, "num_input_tokens_seen": 42504544, "step": 19690 }, { "epoch": 3.2128874388254487, "grad_norm": 0.4206432104110718, "learning_rate": 4.290618264419334e-05, "loss": 0.1514, "num_input_tokens_seen": 42514496, "step": 19695 }, { "epoch": 3.2137030995106035, "grad_norm": 0.558193027973175, "learning_rate": 4.290121465916196e-05, "loss": 0.0871, "num_input_tokens_seen": 42524704, "step": 19700 }, { "epoch": 3.2145187601957588, "grad_norm": 6.601524829864502, "learning_rate": 4.2896245222967296e-05, "loss": 0.2024, "num_input_tokens_seen": 42534688, "step": 19705 }, { "epoch": 3.2153344208809136, "grad_norm": 2.0279533863067627, "learning_rate": 4.2891274336012186e-05, "loss": 0.2494, "num_input_tokens_seen": 42546016, "step": 19710 }, { "epoch": 3.2161500815660684, "grad_norm": 6.865815162658691, "learning_rate": 4.288630199869961e-05, "loss": 0.1239, "num_input_tokens_seen": 42556672, "step": 19715 }, { "epoch": 3.2169657422512237, "grad_norm": 0.076471708714962, "learning_rate": 4.2881328211432626e-05, "loss": 0.0609, "num_input_tokens_seen": 42567168, "step": 19720 }, { "epoch": 3.2177814029363785, "grad_norm": 0.09073363244533539, "learning_rate": 4.2876352974614456e-05, "loss": 0.0065, "num_input_tokens_seen": 42577760, "step": 19725 }, { "epoch": 3.2185970636215333, "grad_norm": 5.533669948577881, "learning_rate": 4.28713762886484e-05, "loss": 0.1846, "num_input_tokens_seen": 42588672, "step": 19730 }, { "epoch": 3.2194127243066886, "grad_norm": 1.4149954319000244, "learning_rate": 4.286639815393791e-05, "loss": 0.0151, "num_input_tokens_seen": 42600480, "step": 19735 }, { "epoch": 3.2202283849918434, "grad_norm": 0.03279214724898338, "learning_rate": 4.286141857088654e-05, "loss": 0.1365, "num_input_tokens_seen": 42612992, "step": 19740 }, { "epoch": 3.221044045676998, "grad_norm": 0.4979197680950165, "learning_rate": 4.285643753989794e-05, "loss": 0.0752, "num_input_tokens_seen": 42624576, "step": 19745 }, { "epoch": 3.2218597063621535, "grad_norm": 0.1591596007347107, "learning_rate": 4.2851455061375924e-05, "loss": 0.1735, "num_input_tokens_seen": 42636128, "step": 19750 }, { "epoch": 3.2226753670473083, "grad_norm": 0.08087214827537537, "learning_rate": 4.2846471135724376e-05, "loss": 0.1514, "num_input_tokens_seen": 42646304, "step": 19755 }, { "epoch": 3.223491027732463, "grad_norm": 1.0540143251419067, "learning_rate": 4.2841485763347324e-05, "loss": 0.2253, "num_input_tokens_seen": 42656576, "step": 19760 }, { "epoch": 3.2243066884176184, "grad_norm": 0.29019859433174133, "learning_rate": 4.2836498944648904e-05, "loss": 0.0052, "num_input_tokens_seen": 42666624, "step": 19765 }, { "epoch": 3.225122349102773, "grad_norm": 3.46596360206604, "learning_rate": 4.2831510680033394e-05, "loss": 0.1699, "num_input_tokens_seen": 42676096, "step": 19770 }, { "epoch": 3.225938009787928, "grad_norm": 0.8415746092796326, "learning_rate": 4.2826520969905134e-05, "loss": 0.1026, "num_input_tokens_seen": 42686752, "step": 19775 }, { "epoch": 3.2267536704730833, "grad_norm": 2.7905514240264893, "learning_rate": 4.282152981466865e-05, "loss": 0.1408, "num_input_tokens_seen": 42697792, "step": 19780 }, { "epoch": 3.227569331158238, "grad_norm": 0.28766247630119324, "learning_rate": 4.2816537214728524e-05, "loss": 0.141, "num_input_tokens_seen": 42708384, "step": 19785 }, { "epoch": 3.2283849918433933, "grad_norm": 0.3859138488769531, "learning_rate": 4.281154317048949e-05, "loss": 0.116, "num_input_tokens_seen": 42719136, "step": 19790 }, { "epoch": 3.229200652528548, "grad_norm": 0.06281393766403198, "learning_rate": 4.28065476823564e-05, "loss": 0.075, "num_input_tokens_seen": 42728896, "step": 19795 }, { "epoch": 3.230016313213703, "grad_norm": 2.613255739212036, "learning_rate": 4.2801550750734195e-05, "loss": 0.189, "num_input_tokens_seen": 42740512, "step": 19800 }, { "epoch": 3.2308319738988582, "grad_norm": 0.07633854448795319, "learning_rate": 4.279655237602796e-05, "loss": 0.0067, "num_input_tokens_seen": 42750240, "step": 19805 }, { "epoch": 3.231647634584013, "grad_norm": 0.10590975731611252, "learning_rate": 4.279155255864291e-05, "loss": 0.0057, "num_input_tokens_seen": 42760672, "step": 19810 }, { "epoch": 3.232463295269168, "grad_norm": 3.3129844665527344, "learning_rate": 4.2786551298984315e-05, "loss": 0.1996, "num_input_tokens_seen": 42771680, "step": 19815 }, { "epoch": 3.233278955954323, "grad_norm": 0.37032970786094666, "learning_rate": 4.278154859745763e-05, "loss": 0.1322, "num_input_tokens_seen": 42782048, "step": 19820 }, { "epoch": 3.234094616639478, "grad_norm": 3.0718038082122803, "learning_rate": 4.27765444544684e-05, "loss": 0.0622, "num_input_tokens_seen": 42792864, "step": 19825 }, { "epoch": 3.2349102773246328, "grad_norm": 0.06151893734931946, "learning_rate": 4.277153887042227e-05, "loss": 0.167, "num_input_tokens_seen": 42803904, "step": 19830 }, { "epoch": 3.235725938009788, "grad_norm": 1.7453343868255615, "learning_rate": 4.2766531845725036e-05, "loss": 0.0721, "num_input_tokens_seen": 42814720, "step": 19835 }, { "epoch": 3.236541598694943, "grad_norm": 0.5372622013092041, "learning_rate": 4.276152338078258e-05, "loss": 0.0171, "num_input_tokens_seen": 42825856, "step": 19840 }, { "epoch": 3.237357259380098, "grad_norm": 2.056267261505127, "learning_rate": 4.275651347600092e-05, "loss": 0.2859, "num_input_tokens_seen": 42837408, "step": 19845 }, { "epoch": 3.238172920065253, "grad_norm": 3.7286434173583984, "learning_rate": 4.275150213178618e-05, "loss": 0.0628, "num_input_tokens_seen": 42848960, "step": 19850 }, { "epoch": 3.2389885807504077, "grad_norm": 0.07230158150196075, "learning_rate": 4.27464893485446e-05, "loss": 0.1969, "num_input_tokens_seen": 42859168, "step": 19855 }, { "epoch": 3.239804241435563, "grad_norm": 0.23849955201148987, "learning_rate": 4.274147512668256e-05, "loss": 0.052, "num_input_tokens_seen": 42867872, "step": 19860 }, { "epoch": 3.240619902120718, "grad_norm": 3.9088425636291504, "learning_rate": 4.273645946660652e-05, "loss": 0.2549, "num_input_tokens_seen": 42879264, "step": 19865 }, { "epoch": 3.2414355628058726, "grad_norm": 0.10310837626457214, "learning_rate": 4.273144236872308e-05, "loss": 0.0131, "num_input_tokens_seen": 42890816, "step": 19870 }, { "epoch": 3.242251223491028, "grad_norm": 4.3532609939575195, "learning_rate": 4.2726423833438964e-05, "loss": 0.2167, "num_input_tokens_seen": 42902400, "step": 19875 }, { "epoch": 3.2430668841761827, "grad_norm": 4.053173065185547, "learning_rate": 4.272140386116098e-05, "loss": 0.4124, "num_input_tokens_seen": 42913152, "step": 19880 }, { "epoch": 3.2438825448613375, "grad_norm": 9.261698722839355, "learning_rate": 4.2716382452296086e-05, "loss": 0.1838, "num_input_tokens_seen": 42924672, "step": 19885 }, { "epoch": 3.244698205546493, "grad_norm": 0.16540081799030304, "learning_rate": 4.271135960725133e-05, "loss": 0.2583, "num_input_tokens_seen": 42934880, "step": 19890 }, { "epoch": 3.2455138662316476, "grad_norm": 0.07863563299179077, "learning_rate": 4.270633532643391e-05, "loss": 0.0355, "num_input_tokens_seen": 42946176, "step": 19895 }, { "epoch": 3.2463295269168024, "grad_norm": 0.2188352346420288, "learning_rate": 4.27013096102511e-05, "loss": 0.1408, "num_input_tokens_seen": 42958016, "step": 19900 }, { "epoch": 3.2471451876019577, "grad_norm": 0.30662140250205994, "learning_rate": 4.269628245911031e-05, "loss": 0.0544, "num_input_tokens_seen": 42968320, "step": 19905 }, { "epoch": 3.2479608482871125, "grad_norm": 3.4740748405456543, "learning_rate": 4.269125387341909e-05, "loss": 0.1067, "num_input_tokens_seen": 42980096, "step": 19910 }, { "epoch": 3.2487765089722673, "grad_norm": 2.7133607864379883, "learning_rate": 4.268622385358506e-05, "loss": 0.2278, "num_input_tokens_seen": 42990304, "step": 19915 }, { "epoch": 3.2495921696574226, "grad_norm": 0.09134632349014282, "learning_rate": 4.268119240001598e-05, "loss": 0.282, "num_input_tokens_seen": 43001280, "step": 19920 }, { "epoch": 3.2504078303425774, "grad_norm": 0.4034450352191925, "learning_rate": 4.267615951311974e-05, "loss": 0.0167, "num_input_tokens_seen": 43011584, "step": 19925 }, { "epoch": 3.2512234910277327, "grad_norm": 0.06232699751853943, "learning_rate": 4.267112519330432e-05, "loss": 0.1074, "num_input_tokens_seen": 43022432, "step": 19930 }, { "epoch": 3.2520391517128875, "grad_norm": 0.5810019373893738, "learning_rate": 4.266608944097782e-05, "loss": 0.1979, "num_input_tokens_seen": 43033792, "step": 19935 }, { "epoch": 3.2528548123980423, "grad_norm": 0.8372753858566284, "learning_rate": 4.266105225654848e-05, "loss": 0.2448, "num_input_tokens_seen": 43045504, "step": 19940 }, { "epoch": 3.2536704730831976, "grad_norm": 0.0908043310046196, "learning_rate": 4.265601364042463e-05, "loss": 0.1678, "num_input_tokens_seen": 43056544, "step": 19945 }, { "epoch": 3.2544861337683524, "grad_norm": 7.50385856628418, "learning_rate": 4.2650973593014734e-05, "loss": 0.1924, "num_input_tokens_seen": 43065184, "step": 19950 }, { "epoch": 3.255301794453507, "grad_norm": 0.24904431402683258, "learning_rate": 4.264593211472735e-05, "loss": 0.0456, "num_input_tokens_seen": 43074784, "step": 19955 }, { "epoch": 3.2561174551386625, "grad_norm": 3.333866596221924, "learning_rate": 4.264088920597118e-05, "loss": 0.0339, "num_input_tokens_seen": 43084864, "step": 19960 }, { "epoch": 3.2569331158238173, "grad_norm": 0.13684949278831482, "learning_rate": 4.263584486715503e-05, "loss": 0.2277, "num_input_tokens_seen": 43096576, "step": 19965 }, { "epoch": 3.257748776508972, "grad_norm": 4.079904556274414, "learning_rate": 4.2630799098687804e-05, "loss": 0.1449, "num_input_tokens_seen": 43106816, "step": 19970 }, { "epoch": 3.2585644371941274, "grad_norm": 5.152735233306885, "learning_rate": 4.262575190097854e-05, "loss": 0.0791, "num_input_tokens_seen": 43118304, "step": 19975 }, { "epoch": 3.259380097879282, "grad_norm": 0.04147205501794815, "learning_rate": 4.262070327443639e-05, "loss": 0.1603, "num_input_tokens_seen": 43128384, "step": 19980 }, { "epoch": 3.2601957585644374, "grad_norm": 4.397024631500244, "learning_rate": 4.261565321947064e-05, "loss": 0.234, "num_input_tokens_seen": 43139136, "step": 19985 }, { "epoch": 3.2610114192495923, "grad_norm": 0.466985821723938, "learning_rate": 4.261060173649065e-05, "loss": 0.0627, "num_input_tokens_seen": 43150048, "step": 19990 }, { "epoch": 3.261827079934747, "grad_norm": 4.005570888519287, "learning_rate": 4.260554882590594e-05, "loss": 0.1105, "num_input_tokens_seen": 43159456, "step": 19995 }, { "epoch": 3.262642740619902, "grad_norm": 6.014162063598633, "learning_rate": 4.2600494488126104e-05, "loss": 0.2537, "num_input_tokens_seen": 43170816, "step": 20000 }, { "epoch": 3.263458401305057, "grad_norm": 3.622558832168579, "learning_rate": 4.259543872356088e-05, "loss": 0.2441, "num_input_tokens_seen": 43182336, "step": 20005 }, { "epoch": 3.264274061990212, "grad_norm": 2.2375385761260986, "learning_rate": 4.259038153262012e-05, "loss": 0.2732, "num_input_tokens_seen": 43192384, "step": 20010 }, { "epoch": 3.2650897226753672, "grad_norm": 0.07627036422491074, "learning_rate": 4.2585322915713774e-05, "loss": 0.1285, "num_input_tokens_seen": 43203744, "step": 20015 }, { "epoch": 3.265905383360522, "grad_norm": 2.762985944747925, "learning_rate": 4.258026287325192e-05, "loss": 0.0786, "num_input_tokens_seen": 43213376, "step": 20020 }, { "epoch": 3.266721044045677, "grad_norm": 2.847794771194458, "learning_rate": 4.2575201405644764e-05, "loss": 0.2222, "num_input_tokens_seen": 43223744, "step": 20025 }, { "epoch": 3.267536704730832, "grad_norm": 1.276902675628662, "learning_rate": 4.257013851330261e-05, "loss": 0.0735, "num_input_tokens_seen": 43234752, "step": 20030 }, { "epoch": 3.268352365415987, "grad_norm": 2.2544126510620117, "learning_rate": 4.256507419663587e-05, "loss": 0.22, "num_input_tokens_seen": 43245280, "step": 20035 }, { "epoch": 3.2691680261011418, "grad_norm": 4.820090293884277, "learning_rate": 4.25600084560551e-05, "loss": 0.0622, "num_input_tokens_seen": 43256672, "step": 20040 }, { "epoch": 3.269983686786297, "grad_norm": 2.6052327156066895, "learning_rate": 4.255494129197094e-05, "loss": 0.2593, "num_input_tokens_seen": 43268288, "step": 20045 }, { "epoch": 3.270799347471452, "grad_norm": 0.15062707662582397, "learning_rate": 4.254987270479417e-05, "loss": 0.0118, "num_input_tokens_seen": 43278080, "step": 20050 }, { "epoch": 3.2716150081566067, "grad_norm": 0.06929773837327957, "learning_rate": 4.254480269493567e-05, "loss": 0.016, "num_input_tokens_seen": 43287392, "step": 20055 }, { "epoch": 3.272430668841762, "grad_norm": 0.9511746168136597, "learning_rate": 4.253973126280644e-05, "loss": 0.0457, "num_input_tokens_seen": 43297824, "step": 20060 }, { "epoch": 3.2732463295269167, "grad_norm": 3.381207227706909, "learning_rate": 4.2534658408817595e-05, "loss": 0.0696, "num_input_tokens_seen": 43308864, "step": 20065 }, { "epoch": 3.274061990212072, "grad_norm": 3.5614192485809326, "learning_rate": 4.252958413338038e-05, "loss": 0.1761, "num_input_tokens_seen": 43318912, "step": 20070 }, { "epoch": 3.274877650897227, "grad_norm": 0.08948476612567902, "learning_rate": 4.2524508436906124e-05, "loss": 0.0985, "num_input_tokens_seen": 43328896, "step": 20075 }, { "epoch": 3.2756933115823816, "grad_norm": 11.680952072143555, "learning_rate": 4.251943131980629e-05, "loss": 0.0872, "num_input_tokens_seen": 43340512, "step": 20080 }, { "epoch": 3.2765089722675365, "grad_norm": 2.4284684658050537, "learning_rate": 4.2514352782492475e-05, "loss": 0.122, "num_input_tokens_seen": 43351840, "step": 20085 }, { "epoch": 3.2773246329526917, "grad_norm": 4.431033611297607, "learning_rate": 4.250927282537635e-05, "loss": 0.3525, "num_input_tokens_seen": 43363328, "step": 20090 }, { "epoch": 3.2781402936378465, "grad_norm": 0.826481819152832, "learning_rate": 4.2504191448869716e-05, "loss": 0.1639, "num_input_tokens_seen": 43374688, "step": 20095 }, { "epoch": 3.278955954323002, "grad_norm": 0.4012637734413147, "learning_rate": 4.249910865338452e-05, "loss": 0.1135, "num_input_tokens_seen": 43384832, "step": 20100 }, { "epoch": 3.2797716150081566, "grad_norm": 0.3164308965206146, "learning_rate": 4.249402443933279e-05, "loss": 0.252, "num_input_tokens_seen": 43396352, "step": 20105 }, { "epoch": 3.2805872756933114, "grad_norm": 6.287999629974365, "learning_rate": 4.248893880712667e-05, "loss": 0.0945, "num_input_tokens_seen": 43407104, "step": 20110 }, { "epoch": 3.2814029363784667, "grad_norm": 0.17996253073215485, "learning_rate": 4.248385175717843e-05, "loss": 0.0275, "num_input_tokens_seen": 43418816, "step": 20115 }, { "epoch": 3.2822185970636215, "grad_norm": 0.10354874283075333, "learning_rate": 4.247876328990046e-05, "loss": 0.1144, "num_input_tokens_seen": 43429088, "step": 20120 }, { "epoch": 3.2830342577487763, "grad_norm": 3.017449140548706, "learning_rate": 4.247367340570525e-05, "loss": 0.0328, "num_input_tokens_seen": 43439744, "step": 20125 }, { "epoch": 3.2838499184339316, "grad_norm": 1.2335954904556274, "learning_rate": 4.2468582105005413e-05, "loss": 0.0128, "num_input_tokens_seen": 43451264, "step": 20130 }, { "epoch": 3.2846655791190864, "grad_norm": 0.0889868438243866, "learning_rate": 4.246348938821367e-05, "loss": 0.1561, "num_input_tokens_seen": 43461696, "step": 20135 }, { "epoch": 3.2854812398042412, "grad_norm": 0.8793284296989441, "learning_rate": 4.2458395255742875e-05, "loss": 0.0136, "num_input_tokens_seen": 43471904, "step": 20140 }, { "epoch": 3.2862969004893965, "grad_norm": 0.21098622679710388, "learning_rate": 4.245329970800597e-05, "loss": 0.2366, "num_input_tokens_seen": 43482400, "step": 20145 }, { "epoch": 3.2871125611745513, "grad_norm": 0.20438610017299652, "learning_rate": 4.244820274541604e-05, "loss": 0.184, "num_input_tokens_seen": 43492160, "step": 20150 }, { "epoch": 3.2879282218597066, "grad_norm": 0.6875913143157959, "learning_rate": 4.244310436838627e-05, "loss": 0.1147, "num_input_tokens_seen": 43502688, "step": 20155 }, { "epoch": 3.2887438825448614, "grad_norm": 2.2987444400787354, "learning_rate": 4.2438004577329946e-05, "loss": 0.2091, "num_input_tokens_seen": 43513184, "step": 20160 }, { "epoch": 3.289559543230016, "grad_norm": 3.6858136653900146, "learning_rate": 4.243290337266049e-05, "loss": 0.1854, "num_input_tokens_seen": 43522592, "step": 20165 }, { "epoch": 3.2903752039151715, "grad_norm": 3.0720269680023193, "learning_rate": 4.242780075479143e-05, "loss": 0.1067, "num_input_tokens_seen": 43533312, "step": 20170 }, { "epoch": 3.2911908646003263, "grad_norm": 3.2114734649658203, "learning_rate": 4.242269672413643e-05, "loss": 0.2091, "num_input_tokens_seen": 43543808, "step": 20175 }, { "epoch": 3.292006525285481, "grad_norm": 2.9163646697998047, "learning_rate": 4.241759128110922e-05, "loss": 0.3067, "num_input_tokens_seen": 43554528, "step": 20180 }, { "epoch": 3.2928221859706364, "grad_norm": 0.2551548480987549, "learning_rate": 4.241248442612368e-05, "loss": 0.0563, "num_input_tokens_seen": 43564960, "step": 20185 }, { "epoch": 3.293637846655791, "grad_norm": 0.07951763272285461, "learning_rate": 4.240737615959381e-05, "loss": 0.0559, "num_input_tokens_seen": 43576576, "step": 20190 }, { "epoch": 3.294453507340946, "grad_norm": 5.029916763305664, "learning_rate": 4.2402266481933706e-05, "loss": 0.1453, "num_input_tokens_seen": 43588576, "step": 20195 }, { "epoch": 3.2952691680261013, "grad_norm": 0.6786830425262451, "learning_rate": 4.2397155393557574e-05, "loss": 0.3138, "num_input_tokens_seen": 43598912, "step": 20200 }, { "epoch": 3.296084828711256, "grad_norm": 1.1113815307617188, "learning_rate": 4.239204289487976e-05, "loss": 0.0831, "num_input_tokens_seen": 43609568, "step": 20205 }, { "epoch": 3.2969004893964113, "grad_norm": 5.254055023193359, "learning_rate": 4.23869289863147e-05, "loss": 0.0951, "num_input_tokens_seen": 43619904, "step": 20210 }, { "epoch": 3.297716150081566, "grad_norm": 0.3169059455394745, "learning_rate": 4.238181366827696e-05, "loss": 0.13, "num_input_tokens_seen": 43630976, "step": 20215 }, { "epoch": 3.298531810766721, "grad_norm": 0.26758649945259094, "learning_rate": 4.237669694118121e-05, "loss": 0.0168, "num_input_tokens_seen": 43640384, "step": 20220 }, { "epoch": 3.299347471451876, "grad_norm": 1.9709473848342896, "learning_rate": 4.237157880544223e-05, "loss": 0.1413, "num_input_tokens_seen": 43650912, "step": 20225 }, { "epoch": 3.300163132137031, "grad_norm": 0.19510877132415771, "learning_rate": 4.2366459261474933e-05, "loss": 0.1766, "num_input_tokens_seen": 43662368, "step": 20230 }, { "epoch": 3.300978792822186, "grad_norm": 5.569932460784912, "learning_rate": 4.2361338309694335e-05, "loss": 0.1336, "num_input_tokens_seen": 43674048, "step": 20235 }, { "epoch": 3.301794453507341, "grad_norm": 0.298435240983963, "learning_rate": 4.235621595051556e-05, "loss": 0.1006, "num_input_tokens_seen": 43685120, "step": 20240 }, { "epoch": 3.302610114192496, "grad_norm": 0.10472102463245392, "learning_rate": 4.2351092184353855e-05, "loss": 0.0847, "num_input_tokens_seen": 43696288, "step": 20245 }, { "epoch": 3.3034257748776508, "grad_norm": 0.14596039056777954, "learning_rate": 4.234596701162458e-05, "loss": 0.129, "num_input_tokens_seen": 43708000, "step": 20250 }, { "epoch": 3.304241435562806, "grad_norm": 0.0793973058462143, "learning_rate": 4.2340840432743206e-05, "loss": 0.0357, "num_input_tokens_seen": 43720416, "step": 20255 }, { "epoch": 3.305057096247961, "grad_norm": 0.12220805883407593, "learning_rate": 4.2335712448125316e-05, "loss": 0.1799, "num_input_tokens_seen": 43730080, "step": 20260 }, { "epoch": 3.3058727569331157, "grad_norm": 3.793534994125366, "learning_rate": 4.233058305818662e-05, "loss": 0.0631, "num_input_tokens_seen": 43740896, "step": 20265 }, { "epoch": 3.306688417618271, "grad_norm": 7.794310569763184, "learning_rate": 4.232545226334293e-05, "loss": 0.1448, "num_input_tokens_seen": 43752128, "step": 20270 }, { "epoch": 3.3075040783034257, "grad_norm": 0.04614422470331192, "learning_rate": 4.232032006401017e-05, "loss": 0.0284, "num_input_tokens_seen": 43763008, "step": 20275 }, { "epoch": 3.3083197389885806, "grad_norm": 3.9614317417144775, "learning_rate": 4.231518646060438e-05, "loss": 0.1683, "num_input_tokens_seen": 43774144, "step": 20280 }, { "epoch": 3.309135399673736, "grad_norm": 3.1272754669189453, "learning_rate": 4.231005145354172e-05, "loss": 0.0643, "num_input_tokens_seen": 43785280, "step": 20285 }, { "epoch": 3.3099510603588906, "grad_norm": 0.057867251336574554, "learning_rate": 4.230491504323846e-05, "loss": 0.0143, "num_input_tokens_seen": 43796288, "step": 20290 }, { "epoch": 3.310766721044046, "grad_norm": 0.23088066279888153, "learning_rate": 4.229977723011097e-05, "loss": 0.0154, "num_input_tokens_seen": 43807392, "step": 20295 }, { "epoch": 3.3115823817292007, "grad_norm": 4.954625606536865, "learning_rate": 4.2294638014575774e-05, "loss": 0.2845, "num_input_tokens_seen": 43819328, "step": 20300 }, { "epoch": 3.3123980424143555, "grad_norm": 0.16805298626422882, "learning_rate": 4.228949739704946e-05, "loss": 0.0137, "num_input_tokens_seen": 43829632, "step": 20305 }, { "epoch": 3.3132137030995104, "grad_norm": 1.6958798170089722, "learning_rate": 4.228435537794877e-05, "loss": 0.0947, "num_input_tokens_seen": 43839520, "step": 20310 }, { "epoch": 3.3140293637846656, "grad_norm": 6.2627387046813965, "learning_rate": 4.227921195769053e-05, "loss": 0.041, "num_input_tokens_seen": 43850048, "step": 20315 }, { "epoch": 3.3148450244698204, "grad_norm": 0.14634250104427338, "learning_rate": 4.227406713669169e-05, "loss": 0.0083, "num_input_tokens_seen": 43860480, "step": 20320 }, { "epoch": 3.3156606851549757, "grad_norm": 0.07750158756971359, "learning_rate": 4.226892091536933e-05, "loss": 0.044, "num_input_tokens_seen": 43871456, "step": 20325 }, { "epoch": 3.3164763458401305, "grad_norm": 1.7951619625091553, "learning_rate": 4.226377329414061e-05, "loss": 0.1785, "num_input_tokens_seen": 43882208, "step": 20330 }, { "epoch": 3.3172920065252853, "grad_norm": 0.02641744166612625, "learning_rate": 4.225862427342283e-05, "loss": 0.0144, "num_input_tokens_seen": 43893568, "step": 20335 }, { "epoch": 3.3181076672104406, "grad_norm": 3.071866273880005, "learning_rate": 4.2253473853633405e-05, "loss": 0.2477, "num_input_tokens_seen": 43904736, "step": 20340 }, { "epoch": 3.3189233278955954, "grad_norm": 1.6703068017959595, "learning_rate": 4.2248322035189835e-05, "loss": 0.0634, "num_input_tokens_seen": 43915232, "step": 20345 }, { "epoch": 3.3197389885807502, "grad_norm": 0.08380986750125885, "learning_rate": 4.224316881850977e-05, "loss": 0.3356, "num_input_tokens_seen": 43925664, "step": 20350 }, { "epoch": 3.3205546492659055, "grad_norm": 2.1386399269104004, "learning_rate": 4.223801420401095e-05, "loss": 0.0725, "num_input_tokens_seen": 43936736, "step": 20355 }, { "epoch": 3.3213703099510603, "grad_norm": 3.1797494888305664, "learning_rate": 4.223285819211124e-05, "loss": 0.2569, "num_input_tokens_seen": 43948320, "step": 20360 }, { "epoch": 3.322185970636215, "grad_norm": 0.11439325660467148, "learning_rate": 4.2227700783228594e-05, "loss": 0.0064, "num_input_tokens_seen": 43958304, "step": 20365 }, { "epoch": 3.3230016313213704, "grad_norm": 0.0685453861951828, "learning_rate": 4.222254197778112e-05, "loss": 0.1345, "num_input_tokens_seen": 43969120, "step": 20370 }, { "epoch": 3.323817292006525, "grad_norm": 0.10162924230098724, "learning_rate": 4.2217381776187005e-05, "loss": 0.127, "num_input_tokens_seen": 43980768, "step": 20375 }, { "epoch": 3.3246329526916805, "grad_norm": 4.4270124435424805, "learning_rate": 4.2212220178864556e-05, "loss": 0.0686, "num_input_tokens_seen": 43991360, "step": 20380 }, { "epoch": 3.3254486133768353, "grad_norm": 0.6833563446998596, "learning_rate": 4.2207057186232215e-05, "loss": 0.0277, "num_input_tokens_seen": 44002464, "step": 20385 }, { "epoch": 3.32626427406199, "grad_norm": 0.2188410758972168, "learning_rate": 4.220189279870851e-05, "loss": 0.0136, "num_input_tokens_seen": 44011264, "step": 20390 }, { "epoch": 3.3270799347471454, "grad_norm": 0.09716194868087769, "learning_rate": 4.219672701671209e-05, "loss": 0.0373, "num_input_tokens_seen": 44021632, "step": 20395 }, { "epoch": 3.3278955954323, "grad_norm": 0.04455023258924484, "learning_rate": 4.219155984066171e-05, "loss": 0.1428, "num_input_tokens_seen": 44031872, "step": 20400 }, { "epoch": 3.328711256117455, "grad_norm": 0.22936658561229706, "learning_rate": 4.218639127097628e-05, "loss": 0.185, "num_input_tokens_seen": 44044064, "step": 20405 }, { "epoch": 3.3295269168026103, "grad_norm": 12.915611267089844, "learning_rate": 4.218122130807476e-05, "loss": 0.3073, "num_input_tokens_seen": 44055488, "step": 20410 }, { "epoch": 3.330342577487765, "grad_norm": 0.139049232006073, "learning_rate": 4.2176049952376265e-05, "loss": 0.0067, "num_input_tokens_seen": 44066496, "step": 20415 }, { "epoch": 3.33115823817292, "grad_norm": 0.07639367878437042, "learning_rate": 4.217087720430002e-05, "loss": 0.141, "num_input_tokens_seen": 44077664, "step": 20420 }, { "epoch": 3.331973898858075, "grad_norm": 0.4085349440574646, "learning_rate": 4.2165703064265335e-05, "loss": 0.1546, "num_input_tokens_seen": 44089344, "step": 20425 }, { "epoch": 3.33278955954323, "grad_norm": 0.16980627179145813, "learning_rate": 4.216052753269166e-05, "loss": 0.0047, "num_input_tokens_seen": 44100288, "step": 20430 }, { "epoch": 3.3336052202283852, "grad_norm": 0.954876184463501, "learning_rate": 4.215535060999856e-05, "loss": 0.0113, "num_input_tokens_seen": 44111040, "step": 20435 }, { "epoch": 3.33442088091354, "grad_norm": 0.2842998504638672, "learning_rate": 4.215017229660569e-05, "loss": 0.0206, "num_input_tokens_seen": 44122144, "step": 20440 }, { "epoch": 3.335236541598695, "grad_norm": 1.7308735847473145, "learning_rate": 4.214499259293283e-05, "loss": 0.1076, "num_input_tokens_seen": 44133344, "step": 20445 }, { "epoch": 3.3360522022838497, "grad_norm": 5.43765926361084, "learning_rate": 4.213981149939988e-05, "loss": 0.043, "num_input_tokens_seen": 44143520, "step": 20450 }, { "epoch": 3.336867862969005, "grad_norm": 0.058233749121427536, "learning_rate": 4.213462901642685e-05, "loss": 0.0145, "num_input_tokens_seen": 44154496, "step": 20455 }, { "epoch": 3.3376835236541598, "grad_norm": 0.04777275770902634, "learning_rate": 4.212944514443384e-05, "loss": 0.006, "num_input_tokens_seen": 44165504, "step": 20460 }, { "epoch": 3.338499184339315, "grad_norm": 1.7751857042312622, "learning_rate": 4.21242598838411e-05, "loss": 0.1912, "num_input_tokens_seen": 44177472, "step": 20465 }, { "epoch": 3.33931484502447, "grad_norm": 0.137112095952034, "learning_rate": 4.211907323506897e-05, "loss": 0.1756, "num_input_tokens_seen": 44187712, "step": 20470 }, { "epoch": 3.3401305057096247, "grad_norm": 0.9803479909896851, "learning_rate": 4.21138851985379e-05, "loss": 0.2745, "num_input_tokens_seen": 44198816, "step": 20475 }, { "epoch": 3.34094616639478, "grad_norm": 5.399055480957031, "learning_rate": 4.210869577466846e-05, "loss": 0.2211, "num_input_tokens_seen": 44209600, "step": 20480 }, { "epoch": 3.3417618270799347, "grad_norm": 5.411726474761963, "learning_rate": 4.210350496388133e-05, "loss": 0.0525, "num_input_tokens_seen": 44220768, "step": 20485 }, { "epoch": 3.3425774877650896, "grad_norm": 0.09991085529327393, "learning_rate": 4.2098312766597305e-05, "loss": 0.1057, "num_input_tokens_seen": 44231936, "step": 20490 }, { "epoch": 3.343393148450245, "grad_norm": 0.08061181008815765, "learning_rate": 4.209311918323729e-05, "loss": 0.1447, "num_input_tokens_seen": 44242912, "step": 20495 }, { "epoch": 3.3442088091353996, "grad_norm": 0.15167714655399323, "learning_rate": 4.208792421422231e-05, "loss": 0.0979, "num_input_tokens_seen": 44254144, "step": 20500 }, { "epoch": 3.3450244698205545, "grad_norm": 0.5698845982551575, "learning_rate": 4.208272785997348e-05, "loss": 0.0788, "num_input_tokens_seen": 44264832, "step": 20505 }, { "epoch": 3.3458401305057097, "grad_norm": 2.7397055625915527, "learning_rate": 4.207753012091207e-05, "loss": 0.1534, "num_input_tokens_seen": 44276800, "step": 20510 }, { "epoch": 3.3466557911908645, "grad_norm": 7.79945707321167, "learning_rate": 4.20723309974594e-05, "loss": 0.0505, "num_input_tokens_seen": 44287584, "step": 20515 }, { "epoch": 3.34747145187602, "grad_norm": 9.29866886138916, "learning_rate": 4.2067130490036964e-05, "loss": 0.0592, "num_input_tokens_seen": 44298688, "step": 20520 }, { "epoch": 3.3482871125611746, "grad_norm": 0.09406154602766037, "learning_rate": 4.206192859906633e-05, "loss": 0.0137, "num_input_tokens_seen": 44309184, "step": 20525 }, { "epoch": 3.3491027732463294, "grad_norm": 0.11522577702999115, "learning_rate": 4.205672532496919e-05, "loss": 0.2563, "num_input_tokens_seen": 44319872, "step": 20530 }, { "epoch": 3.3499184339314847, "grad_norm": 4.166998386383057, "learning_rate": 4.205152066816736e-05, "loss": 0.1554, "num_input_tokens_seen": 44330432, "step": 20535 }, { "epoch": 3.3507340946166395, "grad_norm": 0.469267874956131, "learning_rate": 4.204631462908274e-05, "loss": 0.227, "num_input_tokens_seen": 44341920, "step": 20540 }, { "epoch": 3.3515497553017943, "grad_norm": 0.7317011952400208, "learning_rate": 4.2041107208137366e-05, "loss": 0.0236, "num_input_tokens_seen": 44352096, "step": 20545 }, { "epoch": 3.3523654159869496, "grad_norm": 1.8006529808044434, "learning_rate": 4.203589840575337e-05, "loss": 0.0204, "num_input_tokens_seen": 44363456, "step": 20550 }, { "epoch": 3.3531810766721044, "grad_norm": 0.26886630058288574, "learning_rate": 4.203068822235302e-05, "loss": 0.0149, "num_input_tokens_seen": 44375168, "step": 20555 }, { "epoch": 3.3539967373572592, "grad_norm": 0.051335785537958145, "learning_rate": 4.2025476658358656e-05, "loss": 0.0156, "num_input_tokens_seen": 44385824, "step": 20560 }, { "epoch": 3.3548123980424145, "grad_norm": 2.6601674556732178, "learning_rate": 4.202026371419278e-05, "loss": 0.2979, "num_input_tokens_seen": 44396736, "step": 20565 }, { "epoch": 3.3556280587275693, "grad_norm": 0.06045924127101898, "learning_rate": 4.201504939027796e-05, "loss": 0.1814, "num_input_tokens_seen": 44407648, "step": 20570 }, { "epoch": 3.356443719412724, "grad_norm": 0.08100897073745728, "learning_rate": 4.20098336870369e-05, "loss": 0.1006, "num_input_tokens_seen": 44418336, "step": 20575 }, { "epoch": 3.3572593800978794, "grad_norm": 7.067513942718506, "learning_rate": 4.200461660489242e-05, "loss": 0.1193, "num_input_tokens_seen": 44428960, "step": 20580 }, { "epoch": 3.358075040783034, "grad_norm": 0.15315377712249756, "learning_rate": 4.199939814426744e-05, "loss": 0.0467, "num_input_tokens_seen": 44440608, "step": 20585 }, { "epoch": 3.358890701468189, "grad_norm": 0.11060906201601028, "learning_rate": 4.1994178305584996e-05, "loss": 0.1923, "num_input_tokens_seen": 44451040, "step": 20590 }, { "epoch": 3.3597063621533443, "grad_norm": 0.46185383200645447, "learning_rate": 4.198895708926822e-05, "loss": 0.0389, "num_input_tokens_seen": 44461792, "step": 20595 }, { "epoch": 3.360522022838499, "grad_norm": 0.028823737055063248, "learning_rate": 4.198373449574039e-05, "loss": 0.0052, "num_input_tokens_seen": 44472000, "step": 20600 }, { "epoch": 3.3613376835236544, "grad_norm": 6.627003192901611, "learning_rate": 4.197851052542486e-05, "loss": 0.0401, "num_input_tokens_seen": 44482752, "step": 20605 }, { "epoch": 3.362153344208809, "grad_norm": 0.37836965918540955, "learning_rate": 4.197328517874513e-05, "loss": 0.1861, "num_input_tokens_seen": 44494624, "step": 20610 }, { "epoch": 3.362969004893964, "grad_norm": 3.33638858795166, "learning_rate": 4.1968058456124756e-05, "loss": 0.2023, "num_input_tokens_seen": 44504256, "step": 20615 }, { "epoch": 3.3637846655791193, "grad_norm": 0.037092022597789764, "learning_rate": 4.196283035798749e-05, "loss": 0.005, "num_input_tokens_seen": 44515680, "step": 20620 }, { "epoch": 3.364600326264274, "grad_norm": 0.5618161559104919, "learning_rate": 4.1957600884757124e-05, "loss": 0.0323, "num_input_tokens_seen": 44527360, "step": 20625 }, { "epoch": 3.365415986949429, "grad_norm": 0.06743231415748596, "learning_rate": 4.195237003685759e-05, "loss": 0.1649, "num_input_tokens_seen": 44537600, "step": 20630 }, { "epoch": 3.366231647634584, "grad_norm": 6.578611373901367, "learning_rate": 4.194713781471292e-05, "loss": 0.014, "num_input_tokens_seen": 44548704, "step": 20635 }, { "epoch": 3.367047308319739, "grad_norm": 0.08273762464523315, "learning_rate": 4.194190421874727e-05, "loss": 0.168, "num_input_tokens_seen": 44559584, "step": 20640 }, { "epoch": 3.367862969004894, "grad_norm": 0.4571586847305298, "learning_rate": 4.193666924938491e-05, "loss": 0.0049, "num_input_tokens_seen": 44568576, "step": 20645 }, { "epoch": 3.368678629690049, "grad_norm": 4.7706217765808105, "learning_rate": 4.1931432907050196e-05, "loss": 0.2462, "num_input_tokens_seen": 44579968, "step": 20650 }, { "epoch": 3.369494290375204, "grad_norm": 0.12259075790643692, "learning_rate": 4.192619519216763e-05, "loss": 0.2038, "num_input_tokens_seen": 44591552, "step": 20655 }, { "epoch": 3.370309951060359, "grad_norm": 1.7152997255325317, "learning_rate": 4.192095610516179e-05, "loss": 0.1175, "num_input_tokens_seen": 44602016, "step": 20660 }, { "epoch": 3.371125611745514, "grad_norm": 0.030470367521047592, "learning_rate": 4.1915715646457385e-05, "loss": 0.0843, "num_input_tokens_seen": 44611936, "step": 20665 }, { "epoch": 3.3719412724306688, "grad_norm": 0.10524343699216843, "learning_rate": 4.191047381647925e-05, "loss": 0.1735, "num_input_tokens_seen": 44623232, "step": 20670 }, { "epoch": 3.3727569331158236, "grad_norm": 0.05628953501582146, "learning_rate": 4.190523061565231e-05, "loss": 0.0332, "num_input_tokens_seen": 44634592, "step": 20675 }, { "epoch": 3.373572593800979, "grad_norm": 0.01846391148865223, "learning_rate": 4.189998604440159e-05, "loss": 0.0234, "num_input_tokens_seen": 44645568, "step": 20680 }, { "epoch": 3.3743882544861337, "grad_norm": 13.302069664001465, "learning_rate": 4.189474010315226e-05, "loss": 0.2192, "num_input_tokens_seen": 44656992, "step": 20685 }, { "epoch": 3.375203915171289, "grad_norm": 0.1590079665184021, "learning_rate": 4.188949279232958e-05, "loss": 0.069, "num_input_tokens_seen": 44669248, "step": 20690 }, { "epoch": 3.3760195758564437, "grad_norm": 0.13195693492889404, "learning_rate": 4.188424411235891e-05, "loss": 0.1071, "num_input_tokens_seen": 44680864, "step": 20695 }, { "epoch": 3.3768352365415986, "grad_norm": 0.0731479600071907, "learning_rate": 4.1878994063665734e-05, "loss": 0.1902, "num_input_tokens_seen": 44691456, "step": 20700 }, { "epoch": 3.377650897226754, "grad_norm": 0.04988702014088631, "learning_rate": 4.187374264667566e-05, "loss": 0.0075, "num_input_tokens_seen": 44702336, "step": 20705 }, { "epoch": 3.3784665579119086, "grad_norm": 3.4724490642547607, "learning_rate": 4.1868489861814394e-05, "loss": 0.4671, "num_input_tokens_seen": 44713728, "step": 20710 }, { "epoch": 3.3792822185970635, "grad_norm": 5.628291606903076, "learning_rate": 4.1863235709507755e-05, "loss": 0.1601, "num_input_tokens_seen": 44723968, "step": 20715 }, { "epoch": 3.3800978792822187, "grad_norm": 0.056657902896404266, "learning_rate": 4.1857980190181655e-05, "loss": 0.0375, "num_input_tokens_seen": 44734304, "step": 20720 }, { "epoch": 3.3809135399673735, "grad_norm": 4.896595001220703, "learning_rate": 4.1852723304262145e-05, "loss": 0.1615, "num_input_tokens_seen": 44745568, "step": 20725 }, { "epoch": 3.3817292006525284, "grad_norm": 4.006829738616943, "learning_rate": 4.1847465052175386e-05, "loss": 0.0521, "num_input_tokens_seen": 44757216, "step": 20730 }, { "epoch": 3.3825448613376836, "grad_norm": 0.04925549402832985, "learning_rate": 4.184220543434762e-05, "loss": 0.0489, "num_input_tokens_seen": 44767808, "step": 20735 }, { "epoch": 3.3833605220228384, "grad_norm": 5.684967994689941, "learning_rate": 4.1836944451205215e-05, "loss": 0.1919, "num_input_tokens_seen": 44778208, "step": 20740 }, { "epoch": 3.3841761827079937, "grad_norm": 0.06142764911055565, "learning_rate": 4.1831682103174676e-05, "loss": 0.0585, "num_input_tokens_seen": 44788032, "step": 20745 }, { "epoch": 3.3849918433931485, "grad_norm": 2.765772819519043, "learning_rate": 4.182641839068259e-05, "loss": 0.1909, "num_input_tokens_seen": 44797440, "step": 20750 }, { "epoch": 3.3858075040783033, "grad_norm": 7.4340009689331055, "learning_rate": 4.182115331415564e-05, "loss": 0.1822, "num_input_tokens_seen": 44809248, "step": 20755 }, { "epoch": 3.3866231647634586, "grad_norm": 7.273159980773926, "learning_rate": 4.1815886874020646e-05, "loss": 0.1483, "num_input_tokens_seen": 44820768, "step": 20760 }, { "epoch": 3.3874388254486134, "grad_norm": 0.15974025428295135, "learning_rate": 4.181061907070455e-05, "loss": 0.0072, "num_input_tokens_seen": 44832128, "step": 20765 }, { "epoch": 3.3882544861337682, "grad_norm": 2.6168227195739746, "learning_rate": 4.180534990463437e-05, "loss": 0.1866, "num_input_tokens_seen": 44840992, "step": 20770 }, { "epoch": 3.3890701468189235, "grad_norm": 0.29398149251937866, "learning_rate": 4.1800079376237265e-05, "loss": 0.0418, "num_input_tokens_seen": 44851232, "step": 20775 }, { "epoch": 3.3898858075040783, "grad_norm": 0.07715565711259842, "learning_rate": 4.179480748594048e-05, "loss": 0.0193, "num_input_tokens_seen": 44860672, "step": 20780 }, { "epoch": 3.390701468189233, "grad_norm": 0.3808293342590332, "learning_rate": 4.178953423417138e-05, "loss": 0.0299, "num_input_tokens_seen": 44870688, "step": 20785 }, { "epoch": 3.3915171288743884, "grad_norm": 3.0713024139404297, "learning_rate": 4.1784259621357444e-05, "loss": 0.0206, "num_input_tokens_seen": 44881760, "step": 20790 }, { "epoch": 3.392332789559543, "grad_norm": 0.355150431394577, "learning_rate": 4.1778983647926274e-05, "loss": 0.0104, "num_input_tokens_seen": 44892896, "step": 20795 }, { "epoch": 3.393148450244698, "grad_norm": 0.4394971430301666, "learning_rate": 4.177370631430554e-05, "loss": 0.0883, "num_input_tokens_seen": 44903296, "step": 20800 }, { "epoch": 3.3939641109298533, "grad_norm": 0.1047433614730835, "learning_rate": 4.176842762092307e-05, "loss": 0.1069, "num_input_tokens_seen": 44913664, "step": 20805 }, { "epoch": 3.394779771615008, "grad_norm": 0.9792728424072266, "learning_rate": 4.176314756820677e-05, "loss": 0.1567, "num_input_tokens_seen": 44924864, "step": 20810 }, { "epoch": 3.395595432300163, "grad_norm": 0.0860372856259346, "learning_rate": 4.175786615658468e-05, "loss": 0.0993, "num_input_tokens_seen": 44935008, "step": 20815 }, { "epoch": 3.396411092985318, "grad_norm": 0.03137561306357384, "learning_rate": 4.175258338648493e-05, "loss": 0.0732, "num_input_tokens_seen": 44945280, "step": 20820 }, { "epoch": 3.397226753670473, "grad_norm": 0.35096603631973267, "learning_rate": 4.174729925833576e-05, "loss": 0.0592, "num_input_tokens_seen": 44956064, "step": 20825 }, { "epoch": 3.3980424143556283, "grad_norm": 0.047019828110933304, "learning_rate": 4.174201377256555e-05, "loss": 0.0811, "num_input_tokens_seen": 44967424, "step": 20830 }, { "epoch": 3.398858075040783, "grad_norm": 0.12674105167388916, "learning_rate": 4.173672692960274e-05, "loss": 0.0743, "num_input_tokens_seen": 44977952, "step": 20835 }, { "epoch": 3.399673735725938, "grad_norm": 0.051974739879369736, "learning_rate": 4.173143872987594e-05, "loss": 0.1049, "num_input_tokens_seen": 44988384, "step": 20840 }, { "epoch": 3.400489396411093, "grad_norm": 0.02065962553024292, "learning_rate": 4.172614917381381e-05, "loss": 0.0057, "num_input_tokens_seen": 44998816, "step": 20845 }, { "epoch": 3.401305057096248, "grad_norm": 5.289228439331055, "learning_rate": 4.1720858261845166e-05, "loss": 0.1662, "num_input_tokens_seen": 45010112, "step": 20850 }, { "epoch": 3.402120717781403, "grad_norm": 9.325918197631836, "learning_rate": 4.171556599439891e-05, "loss": 0.2236, "num_input_tokens_seen": 45021184, "step": 20855 }, { "epoch": 3.402936378466558, "grad_norm": 0.10597573220729828, "learning_rate": 4.1710272371904055e-05, "loss": 0.2338, "num_input_tokens_seen": 45030976, "step": 20860 }, { "epoch": 3.403752039151713, "grad_norm": 1.7547309398651123, "learning_rate": 4.170497739478974e-05, "loss": 0.264, "num_input_tokens_seen": 45042400, "step": 20865 }, { "epoch": 3.4045676998368677, "grad_norm": 0.03804885968565941, "learning_rate": 4.16996810634852e-05, "loss": 0.067, "num_input_tokens_seen": 45052352, "step": 20870 }, { "epoch": 3.405383360522023, "grad_norm": 0.1667662113904953, "learning_rate": 4.1694383378419774e-05, "loss": 0.007, "num_input_tokens_seen": 45063520, "step": 20875 }, { "epoch": 3.4061990212071778, "grad_norm": 10.003411293029785, "learning_rate": 4.168908434002292e-05, "loss": 0.0792, "num_input_tokens_seen": 45074144, "step": 20880 }, { "epoch": 3.407014681892333, "grad_norm": 0.23482690751552582, "learning_rate": 4.168378394872422e-05, "loss": 0.0184, "num_input_tokens_seen": 45084704, "step": 20885 }, { "epoch": 3.407830342577488, "grad_norm": 2.349637031555176, "learning_rate": 4.167848220495334e-05, "loss": 0.1031, "num_input_tokens_seen": 45095744, "step": 20890 }, { "epoch": 3.4086460032626427, "grad_norm": 0.12306543439626694, "learning_rate": 4.167317910914006e-05, "loss": 0.089, "num_input_tokens_seen": 45105952, "step": 20895 }, { "epoch": 3.4094616639477975, "grad_norm": 0.12144485861063004, "learning_rate": 4.166787466171429e-05, "loss": 0.0066, "num_input_tokens_seen": 45118080, "step": 20900 }, { "epoch": 3.4102773246329527, "grad_norm": 0.024729499593377113, "learning_rate": 4.166256886310602e-05, "loss": 0.0096, "num_input_tokens_seen": 45129024, "step": 20905 }, { "epoch": 3.4110929853181076, "grad_norm": 2.3089208602905273, "learning_rate": 4.165726171374538e-05, "loss": 0.0283, "num_input_tokens_seen": 45139200, "step": 20910 }, { "epoch": 3.411908646003263, "grad_norm": 0.04093559458851814, "learning_rate": 4.165195321406259e-05, "loss": 0.0837, "num_input_tokens_seen": 45150752, "step": 20915 }, { "epoch": 3.4127243066884176, "grad_norm": 0.11183979362249374, "learning_rate": 4.164664336448797e-05, "loss": 0.1029, "num_input_tokens_seen": 45162016, "step": 20920 }, { "epoch": 3.4135399673735725, "grad_norm": 3.6680314540863037, "learning_rate": 4.164133216545199e-05, "loss": 0.1584, "num_input_tokens_seen": 45171904, "step": 20925 }, { "epoch": 3.4143556280587277, "grad_norm": 7.581708908081055, "learning_rate": 4.163601961738517e-05, "loss": 0.2686, "num_input_tokens_seen": 45183712, "step": 20930 }, { "epoch": 3.4151712887438825, "grad_norm": 1.1461793184280396, "learning_rate": 4.16307057207182e-05, "loss": 0.0608, "num_input_tokens_seen": 45194656, "step": 20935 }, { "epoch": 3.4159869494290374, "grad_norm": 0.03561193495988846, "learning_rate": 4.162539047588183e-05, "loss": 0.2059, "num_input_tokens_seen": 45204800, "step": 20940 }, { "epoch": 3.4168026101141926, "grad_norm": 10.955655097961426, "learning_rate": 4.162007388330696e-05, "loss": 0.1946, "num_input_tokens_seen": 45215776, "step": 20945 }, { "epoch": 3.4176182707993474, "grad_norm": 0.05724068358540535, "learning_rate": 4.1614755943424575e-05, "loss": 0.0306, "num_input_tokens_seen": 45225952, "step": 20950 }, { "epoch": 3.4184339314845023, "grad_norm": 1.750388503074646, "learning_rate": 4.160943665666577e-05, "loss": 0.1012, "num_input_tokens_seen": 45237312, "step": 20955 }, { "epoch": 3.4192495921696575, "grad_norm": 0.2838281989097595, "learning_rate": 4.160411602346175e-05, "loss": 0.0107, "num_input_tokens_seen": 45246912, "step": 20960 }, { "epoch": 3.4200652528548123, "grad_norm": 0.1384785771369934, "learning_rate": 4.159879404424384e-05, "loss": 0.1127, "num_input_tokens_seen": 45257440, "step": 20965 }, { "epoch": 3.4208809135399676, "grad_norm": 0.13913112878799438, "learning_rate": 4.159347071944346e-05, "loss": 0.1909, "num_input_tokens_seen": 45268416, "step": 20970 }, { "epoch": 3.4216965742251224, "grad_norm": 0.32134515047073364, "learning_rate": 4.158814604949215e-05, "loss": 0.1516, "num_input_tokens_seen": 45278848, "step": 20975 }, { "epoch": 3.4225122349102772, "grad_norm": 0.4873961806297302, "learning_rate": 4.158282003482156e-05, "loss": 0.0175, "num_input_tokens_seen": 45291072, "step": 20980 }, { "epoch": 3.4233278955954325, "grad_norm": 0.3885081708431244, "learning_rate": 4.157749267586343e-05, "loss": 0.1169, "num_input_tokens_seen": 45302336, "step": 20985 }, { "epoch": 3.4241435562805873, "grad_norm": 0.17299550771713257, "learning_rate": 4.1572163973049624e-05, "loss": 0.0423, "num_input_tokens_seen": 45312864, "step": 20990 }, { "epoch": 3.424959216965742, "grad_norm": 0.3936578631401062, "learning_rate": 4.1566833926812135e-05, "loss": 0.1395, "num_input_tokens_seen": 45323744, "step": 20995 }, { "epoch": 3.4257748776508974, "grad_norm": 0.4577231705188751, "learning_rate": 4.1561502537583016e-05, "loss": 0.1863, "num_input_tokens_seen": 45333760, "step": 21000 }, { "epoch": 3.426590538336052, "grad_norm": 1.2642056941986084, "learning_rate": 4.155616980579447e-05, "loss": 0.2441, "num_input_tokens_seen": 45344896, "step": 21005 }, { "epoch": 3.427406199021207, "grad_norm": 9.072660446166992, "learning_rate": 4.155083573187881e-05, "loss": 0.1113, "num_input_tokens_seen": 45354336, "step": 21010 }, { "epoch": 3.4282218597063623, "grad_norm": 0.16346946358680725, "learning_rate": 4.154550031626842e-05, "loss": 0.3153, "num_input_tokens_seen": 45365376, "step": 21015 }, { "epoch": 3.429037520391517, "grad_norm": 3.710902452468872, "learning_rate": 4.1540163559395816e-05, "loss": 0.0215, "num_input_tokens_seen": 45376416, "step": 21020 }, { "epoch": 3.429853181076672, "grad_norm": 0.0782494992017746, "learning_rate": 4.153482546169364e-05, "loss": 0.0936, "num_input_tokens_seen": 45386272, "step": 21025 }, { "epoch": 3.430668841761827, "grad_norm": 0.04548351466655731, "learning_rate": 4.15294860235946e-05, "loss": 0.0538, "num_input_tokens_seen": 45395936, "step": 21030 }, { "epoch": 3.431484502446982, "grad_norm": 0.07497764378786087, "learning_rate": 4.152414524553156e-05, "loss": 0.0151, "num_input_tokens_seen": 45407680, "step": 21035 }, { "epoch": 3.432300163132137, "grad_norm": 2.2037370204925537, "learning_rate": 4.1518803127937464e-05, "loss": 0.0161, "num_input_tokens_seen": 45417088, "step": 21040 }, { "epoch": 3.433115823817292, "grad_norm": 0.6846911311149597, "learning_rate": 4.1513459671245384e-05, "loss": 0.0445, "num_input_tokens_seen": 45429536, "step": 21045 }, { "epoch": 3.433931484502447, "grad_norm": 0.07101361453533173, "learning_rate": 4.150811487588846e-05, "loss": 0.0205, "num_input_tokens_seen": 45440320, "step": 21050 }, { "epoch": 3.434747145187602, "grad_norm": 0.4925655722618103, "learning_rate": 4.150276874229999e-05, "loss": 0.2046, "num_input_tokens_seen": 45451008, "step": 21055 }, { "epoch": 3.435562805872757, "grad_norm": 1.419433355331421, "learning_rate": 4.149742127091335e-05, "loss": 0.0333, "num_input_tokens_seen": 45462432, "step": 21060 }, { "epoch": 3.436378466557912, "grad_norm": 8.319808006286621, "learning_rate": 4.149207246216203e-05, "loss": 0.1205, "num_input_tokens_seen": 45472544, "step": 21065 }, { "epoch": 3.437194127243067, "grad_norm": 0.07023496180772781, "learning_rate": 4.1486722316479635e-05, "loss": 0.0681, "num_input_tokens_seen": 45483168, "step": 21070 }, { "epoch": 3.438009787928222, "grad_norm": 0.05403876677155495, "learning_rate": 4.1481370834299884e-05, "loss": 0.0056, "num_input_tokens_seen": 45492832, "step": 21075 }, { "epoch": 3.4388254486133767, "grad_norm": 1.4059116840362549, "learning_rate": 4.1476018016056583e-05, "loss": 0.2701, "num_input_tokens_seen": 45504064, "step": 21080 }, { "epoch": 3.439641109298532, "grad_norm": 4.7639923095703125, "learning_rate": 4.1470663862183664e-05, "loss": 0.1012, "num_input_tokens_seen": 45514336, "step": 21085 }, { "epoch": 3.4404567699836868, "grad_norm": 0.06506458669900894, "learning_rate": 4.146530837311516e-05, "loss": 0.0895, "num_input_tokens_seen": 45524480, "step": 21090 }, { "epoch": 3.4412724306688416, "grad_norm": 11.314379692077637, "learning_rate": 4.145995154928521e-05, "loss": 0.3164, "num_input_tokens_seen": 45535264, "step": 21095 }, { "epoch": 3.442088091353997, "grad_norm": 7.392039775848389, "learning_rate": 4.1454593391128084e-05, "loss": 0.0397, "num_input_tokens_seen": 45546560, "step": 21100 }, { "epoch": 3.4429037520391517, "grad_norm": 0.08344903588294983, "learning_rate": 4.144923389907812e-05, "loss": 0.0747, "num_input_tokens_seen": 45558016, "step": 21105 }, { "epoch": 3.443719412724307, "grad_norm": 0.10498899966478348, "learning_rate": 4.1443873073569796e-05, "loss": 0.0587, "num_input_tokens_seen": 45568512, "step": 21110 }, { "epoch": 3.4445350734094617, "grad_norm": 0.8218194842338562, "learning_rate": 4.143851091503768e-05, "loss": 0.0851, "num_input_tokens_seen": 45579968, "step": 21115 }, { "epoch": 3.4453507340946166, "grad_norm": 0.4953779876232147, "learning_rate": 4.1433147423916466e-05, "loss": 0.0262, "num_input_tokens_seen": 45592064, "step": 21120 }, { "epoch": 3.4461663947797714, "grad_norm": 0.06428025662899017, "learning_rate": 4.1427782600640943e-05, "loss": 0.1159, "num_input_tokens_seen": 45603040, "step": 21125 }, { "epoch": 3.4469820554649266, "grad_norm": 0.13550199568271637, "learning_rate": 4.1422416445646e-05, "loss": 0.0592, "num_input_tokens_seen": 45613536, "step": 21130 }, { "epoch": 3.4477977161500815, "grad_norm": 0.29464489221572876, "learning_rate": 4.141704895936666e-05, "loss": 0.0622, "num_input_tokens_seen": 45625120, "step": 21135 }, { "epoch": 3.4486133768352367, "grad_norm": 6.6131815910339355, "learning_rate": 4.141168014223803e-05, "loss": 0.1254, "num_input_tokens_seen": 45635744, "step": 21140 }, { "epoch": 3.4494290375203915, "grad_norm": 0.18010494112968445, "learning_rate": 4.1406309994695335e-05, "loss": 0.1449, "num_input_tokens_seen": 45646432, "step": 21145 }, { "epoch": 3.4502446982055464, "grad_norm": 2.384899377822876, "learning_rate": 4.1400938517173905e-05, "loss": 0.0981, "num_input_tokens_seen": 45656576, "step": 21150 }, { "epoch": 3.4510603588907016, "grad_norm": 2.9363596439361572, "learning_rate": 4.139556571010919e-05, "loss": 0.2242, "num_input_tokens_seen": 45667904, "step": 21155 }, { "epoch": 3.4518760195758564, "grad_norm": 0.039616163820028305, "learning_rate": 4.139019157393672e-05, "loss": 0.011, "num_input_tokens_seen": 45678528, "step": 21160 }, { "epoch": 3.4526916802610113, "grad_norm": 3.5711658000946045, "learning_rate": 4.138481610909216e-05, "loss": 0.1952, "num_input_tokens_seen": 45689312, "step": 21165 }, { "epoch": 3.4535073409461665, "grad_norm": 1.4512710571289062, "learning_rate": 4.137943931601127e-05, "loss": 0.0097, "num_input_tokens_seen": 45700160, "step": 21170 }, { "epoch": 3.4543230016313213, "grad_norm": 8.78475570678711, "learning_rate": 4.1374061195129924e-05, "loss": 0.2915, "num_input_tokens_seen": 45710432, "step": 21175 }, { "epoch": 3.455138662316476, "grad_norm": 0.7189760804176331, "learning_rate": 4.13686817468841e-05, "loss": 0.0062, "num_input_tokens_seen": 45721248, "step": 21180 }, { "epoch": 3.4559543230016314, "grad_norm": 3.4032669067382812, "learning_rate": 4.1363300971709866e-05, "loss": 0.0931, "num_input_tokens_seen": 45732768, "step": 21185 }, { "epoch": 3.4567699836867862, "grad_norm": 0.06290104985237122, "learning_rate": 4.135791887004344e-05, "loss": 0.0042, "num_input_tokens_seen": 45743200, "step": 21190 }, { "epoch": 3.4575856443719415, "grad_norm": 0.16576777398586273, "learning_rate": 4.13525354423211e-05, "loss": 0.2485, "num_input_tokens_seen": 45753376, "step": 21195 }, { "epoch": 3.4584013050570963, "grad_norm": 3.3895223140716553, "learning_rate": 4.1347150688979275e-05, "loss": 0.0537, "num_input_tokens_seen": 45764064, "step": 21200 }, { "epoch": 3.459216965742251, "grad_norm": 6.020229339599609, "learning_rate": 4.134176461045447e-05, "loss": 0.2202, "num_input_tokens_seen": 45774944, "step": 21205 }, { "epoch": 3.4600326264274064, "grad_norm": 0.6488676071166992, "learning_rate": 4.133637720718331e-05, "loss": 0.0603, "num_input_tokens_seen": 45785856, "step": 21210 }, { "epoch": 3.460848287112561, "grad_norm": 5.132880687713623, "learning_rate": 4.133098847960252e-05, "loss": 0.3604, "num_input_tokens_seen": 45796352, "step": 21215 }, { "epoch": 3.461663947797716, "grad_norm": 5.736630916595459, "learning_rate": 4.1325598428148935e-05, "loss": 0.0395, "num_input_tokens_seen": 45808160, "step": 21220 }, { "epoch": 3.4624796084828713, "grad_norm": 0.06908325850963593, "learning_rate": 4.132020705325952e-05, "loss": 0.0071, "num_input_tokens_seen": 45819744, "step": 21225 }, { "epoch": 3.463295269168026, "grad_norm": 0.03077886253595352, "learning_rate": 4.13148143553713e-05, "loss": 0.2597, "num_input_tokens_seen": 45830048, "step": 21230 }, { "epoch": 3.464110929853181, "grad_norm": 2.3064441680908203, "learning_rate": 4.130942033492146e-05, "loss": 0.0756, "num_input_tokens_seen": 45841216, "step": 21235 }, { "epoch": 3.464926590538336, "grad_norm": 0.12238824367523193, "learning_rate": 4.1304024992347245e-05, "loss": 0.0749, "num_input_tokens_seen": 45850720, "step": 21240 }, { "epoch": 3.465742251223491, "grad_norm": 6.307570934295654, "learning_rate": 4.129862832808604e-05, "loss": 0.272, "num_input_tokens_seen": 45862272, "step": 21245 }, { "epoch": 3.466557911908646, "grad_norm": 0.08646532893180847, "learning_rate": 4.129323034257533e-05, "loss": 0.1786, "num_input_tokens_seen": 45872576, "step": 21250 }, { "epoch": 3.467373572593801, "grad_norm": 0.06172173470258713, "learning_rate": 4.128783103625269e-05, "loss": 0.2169, "num_input_tokens_seen": 45882816, "step": 21255 }, { "epoch": 3.468189233278956, "grad_norm": 0.13198105990886688, "learning_rate": 4.128243040955583e-05, "loss": 0.16, "num_input_tokens_seen": 45893856, "step": 21260 }, { "epoch": 3.4690048939641107, "grad_norm": 3.57318377494812, "learning_rate": 4.1277028462922535e-05, "loss": 0.1352, "num_input_tokens_seen": 45904128, "step": 21265 }, { "epoch": 3.469820554649266, "grad_norm": 6.988917827606201, "learning_rate": 4.127162519679073e-05, "loss": 0.1319, "num_input_tokens_seen": 45914624, "step": 21270 }, { "epoch": 3.470636215334421, "grad_norm": 1.898236870765686, "learning_rate": 4.126622061159843e-05, "loss": 0.1336, "num_input_tokens_seen": 45926432, "step": 21275 }, { "epoch": 3.471451876019576, "grad_norm": 0.3959386348724365, "learning_rate": 4.126081470778375e-05, "loss": 0.0191, "num_input_tokens_seen": 45936608, "step": 21280 }, { "epoch": 3.472267536704731, "grad_norm": 0.11599043756723404, "learning_rate": 4.125540748578491e-05, "loss": 0.1255, "num_input_tokens_seen": 45948768, "step": 21285 }, { "epoch": 3.4730831973898857, "grad_norm": 0.05390927568078041, "learning_rate": 4.124999894604028e-05, "loss": 0.0068, "num_input_tokens_seen": 45958080, "step": 21290 }, { "epoch": 3.473898858075041, "grad_norm": 4.834368705749512, "learning_rate": 4.124458908898827e-05, "loss": 0.1203, "num_input_tokens_seen": 45968768, "step": 21295 }, { "epoch": 3.4747145187601958, "grad_norm": 3.5230612754821777, "learning_rate": 4.1239177915067454e-05, "loss": 0.1929, "num_input_tokens_seen": 45979360, "step": 21300 }, { "epoch": 3.4755301794453506, "grad_norm": 1.043532133102417, "learning_rate": 4.123376542471648e-05, "loss": 0.0244, "num_input_tokens_seen": 45990880, "step": 21305 }, { "epoch": 3.476345840130506, "grad_norm": 0.046062592417001724, "learning_rate": 4.122835161837409e-05, "loss": 0.1462, "num_input_tokens_seen": 46001344, "step": 21310 }, { "epoch": 3.4771615008156607, "grad_norm": 1.381137728691101, "learning_rate": 4.12229364964792e-05, "loss": 0.0699, "num_input_tokens_seen": 46013152, "step": 21315 }, { "epoch": 3.4779771615008155, "grad_norm": 0.1660504937171936, "learning_rate": 4.121752005947076e-05, "loss": 0.1241, "num_input_tokens_seen": 46023424, "step": 21320 }, { "epoch": 3.4787928221859707, "grad_norm": 7.055490493774414, "learning_rate": 4.121210230778785e-05, "loss": 0.0877, "num_input_tokens_seen": 46033568, "step": 21325 }, { "epoch": 3.4796084828711256, "grad_norm": 3.550671100616455, "learning_rate": 4.120668324186967e-05, "loss": 0.0416, "num_input_tokens_seen": 46045088, "step": 21330 }, { "epoch": 3.480424143556281, "grad_norm": 0.07857126742601395, "learning_rate": 4.120126286215552e-05, "loss": 0.063, "num_input_tokens_seen": 46056640, "step": 21335 }, { "epoch": 3.4812398042414356, "grad_norm": 0.11759999394416809, "learning_rate": 4.119584116908478e-05, "loss": 0.0602, "num_input_tokens_seen": 46067008, "step": 21340 }, { "epoch": 3.4820554649265905, "grad_norm": 0.061887551099061966, "learning_rate": 4.1190418163097e-05, "loss": 0.0646, "num_input_tokens_seen": 46078048, "step": 21345 }, { "epoch": 3.4828711256117453, "grad_norm": 0.040443260222673416, "learning_rate": 4.118499384463176e-05, "loss": 0.0951, "num_input_tokens_seen": 46089568, "step": 21350 }, { "epoch": 3.4836867862969005, "grad_norm": 4.102940559387207, "learning_rate": 4.1179568214128805e-05, "loss": 0.3468, "num_input_tokens_seen": 46101024, "step": 21355 }, { "epoch": 3.4845024469820554, "grad_norm": 6.000858306884766, "learning_rate": 4.117414127202795e-05, "loss": 0.1659, "num_input_tokens_seen": 46112064, "step": 21360 }, { "epoch": 3.4853181076672106, "grad_norm": 12.418498039245605, "learning_rate": 4.116871301876914e-05, "loss": 0.1087, "num_input_tokens_seen": 46122080, "step": 21365 }, { "epoch": 3.4861337683523654, "grad_norm": 4.769489288330078, "learning_rate": 4.116328345479241e-05, "loss": 0.1946, "num_input_tokens_seen": 46132992, "step": 21370 }, { "epoch": 3.4869494290375203, "grad_norm": 4.555278778076172, "learning_rate": 4.115785258053792e-05, "loss": 0.2054, "num_input_tokens_seen": 46144576, "step": 21375 }, { "epoch": 3.4877650897226755, "grad_norm": 0.12587052583694458, "learning_rate": 4.1152420396445915e-05, "loss": 0.3145, "num_input_tokens_seen": 46155296, "step": 21380 }, { "epoch": 3.4885807504078303, "grad_norm": 4.871669769287109, "learning_rate": 4.1146986902956745e-05, "loss": 0.0611, "num_input_tokens_seen": 46166528, "step": 21385 }, { "epoch": 3.489396411092985, "grad_norm": 0.40100112557411194, "learning_rate": 4.1141552100510896e-05, "loss": 0.1811, "num_input_tokens_seen": 46176672, "step": 21390 }, { "epoch": 3.4902120717781404, "grad_norm": 0.15952937304973602, "learning_rate": 4.1136115989548926e-05, "loss": 0.0188, "num_input_tokens_seen": 46186880, "step": 21395 }, { "epoch": 3.4910277324632952, "grad_norm": 8.589343070983887, "learning_rate": 4.113067857051153e-05, "loss": 0.085, "num_input_tokens_seen": 46196320, "step": 21400 }, { "epoch": 3.49184339314845, "grad_norm": 0.05669253692030907, "learning_rate": 4.112523984383948e-05, "loss": 0.0679, "num_input_tokens_seen": 46207040, "step": 21405 }, { "epoch": 3.4926590538336053, "grad_norm": 1.8569155931472778, "learning_rate": 4.111979980997366e-05, "loss": 0.0304, "num_input_tokens_seen": 46217248, "step": 21410 }, { "epoch": 3.49347471451876, "grad_norm": 0.23351731896400452, "learning_rate": 4.1114358469355084e-05, "loss": 0.1226, "num_input_tokens_seen": 46226688, "step": 21415 }, { "epoch": 3.4942903752039154, "grad_norm": 0.1810547411441803, "learning_rate": 4.110891582242485e-05, "loss": 0.1612, "num_input_tokens_seen": 46236768, "step": 21420 }, { "epoch": 3.49510603588907, "grad_norm": 1.3555699586868286, "learning_rate": 4.1103471869624154e-05, "loss": 0.0848, "num_input_tokens_seen": 46247872, "step": 21425 }, { "epoch": 3.495921696574225, "grad_norm": 1.4279813766479492, "learning_rate": 4.109802661139433e-05, "loss": 0.1751, "num_input_tokens_seen": 46258688, "step": 21430 }, { "epoch": 3.4967373572593803, "grad_norm": 1.8895460367202759, "learning_rate": 4.109258004817679e-05, "loss": 0.0242, "num_input_tokens_seen": 46269504, "step": 21435 }, { "epoch": 3.497553017944535, "grad_norm": 4.239985942840576, "learning_rate": 4.1087132180413047e-05, "loss": 0.1051, "num_input_tokens_seen": 46280928, "step": 21440 }, { "epoch": 3.49836867862969, "grad_norm": 0.11252937465906143, "learning_rate": 4.108168300854475e-05, "loss": 0.1634, "num_input_tokens_seen": 46291648, "step": 21445 }, { "epoch": 3.499184339314845, "grad_norm": 0.35491886734962463, "learning_rate": 4.1076232533013635e-05, "loss": 0.0715, "num_input_tokens_seen": 46302592, "step": 21450 }, { "epoch": 3.5, "grad_norm": 0.26530101895332336, "learning_rate": 4.1070780754261533e-05, "loss": 0.1521, "num_input_tokens_seen": 46313216, "step": 21455 }, { "epoch": 3.5, "eval_loss": 0.16028907895088196, "eval_runtime": 132.9607, "eval_samples_per_second": 20.495, "eval_steps_per_second": 5.129, "num_input_tokens_seen": 46313216, "step": 21455 }, { "epoch": 3.500815660685155, "grad_norm": 0.11428025364875793, "learning_rate": 4.10653276727304e-05, "loss": 0.1485, "num_input_tokens_seen": 46322656, "step": 21460 }, { "epoch": 3.50163132137031, "grad_norm": 0.42429232597351074, "learning_rate": 4.105987328886229e-05, "loss": 0.0923, "num_input_tokens_seen": 46332576, "step": 21465 }, { "epoch": 3.502446982055465, "grad_norm": 0.028669022023677826, "learning_rate": 4.1054417603099376e-05, "loss": 0.218, "num_input_tokens_seen": 46343584, "step": 21470 }, { "epoch": 3.50326264274062, "grad_norm": 0.09211162477731705, "learning_rate": 4.104896061588391e-05, "loss": 0.0127, "num_input_tokens_seen": 46354240, "step": 21475 }, { "epoch": 3.504078303425775, "grad_norm": 8.533767700195312, "learning_rate": 4.1043502327658256e-05, "loss": 0.2029, "num_input_tokens_seen": 46364736, "step": 21480 }, { "epoch": 3.50489396411093, "grad_norm": 0.18505850434303284, "learning_rate": 4.1038042738864906e-05, "loss": 0.183, "num_input_tokens_seen": 46374976, "step": 21485 }, { "epoch": 3.5057096247960846, "grad_norm": 2.84120774269104, "learning_rate": 4.103258184994644e-05, "loss": 0.1389, "num_input_tokens_seen": 46386080, "step": 21490 }, { "epoch": 3.50652528548124, "grad_norm": 0.7979764342308044, "learning_rate": 4.102711966134553e-05, "loss": 0.0688, "num_input_tokens_seen": 46397152, "step": 21495 }, { "epoch": 3.5073409461663947, "grad_norm": 0.1829701066017151, "learning_rate": 4.102165617350498e-05, "loss": 0.1268, "num_input_tokens_seen": 46408480, "step": 21500 }, { "epoch": 3.50815660685155, "grad_norm": 0.11440054327249527, "learning_rate": 4.101619138686769e-05, "loss": 0.1559, "num_input_tokens_seen": 46419712, "step": 21505 }, { "epoch": 3.5089722675367048, "grad_norm": 7.477168560028076, "learning_rate": 4.101072530187666e-05, "loss": 0.2169, "num_input_tokens_seen": 46431072, "step": 21510 }, { "epoch": 3.5097879282218596, "grad_norm": 0.15226133167743683, "learning_rate": 4.100525791897501e-05, "loss": 0.0295, "num_input_tokens_seen": 46441280, "step": 21515 }, { "epoch": 3.5106035889070144, "grad_norm": 0.23722317814826965, "learning_rate": 4.0999789238605925e-05, "loss": 0.2065, "num_input_tokens_seen": 46451808, "step": 21520 }, { "epoch": 3.5114192495921697, "grad_norm": 3.630800485610962, "learning_rate": 4.099431926121276e-05, "loss": 0.1932, "num_input_tokens_seen": 46463136, "step": 21525 }, { "epoch": 3.5122349102773245, "grad_norm": 4.703624725341797, "learning_rate": 4.098884798723891e-05, "loss": 0.1012, "num_input_tokens_seen": 46473280, "step": 21530 }, { "epoch": 3.5130505709624797, "grad_norm": 3.5786826610565186, "learning_rate": 4.098337541712791e-05, "loss": 0.0781, "num_input_tokens_seen": 46484032, "step": 21535 }, { "epoch": 3.5138662316476346, "grad_norm": 0.13141930103302002, "learning_rate": 4.0977901551323414e-05, "loss": 0.0406, "num_input_tokens_seen": 46495488, "step": 21540 }, { "epoch": 3.5146818923327894, "grad_norm": 3.897994041442871, "learning_rate": 4.097242639026914e-05, "loss": 0.2088, "num_input_tokens_seen": 46506944, "step": 21545 }, { "epoch": 3.5154975530179446, "grad_norm": 2.7126681804656982, "learning_rate": 4.0966949934408946e-05, "loss": 0.1499, "num_input_tokens_seen": 46517216, "step": 21550 }, { "epoch": 3.5163132137030995, "grad_norm": 4.191082954406738, "learning_rate": 4.0961472184186766e-05, "loss": 0.1604, "num_input_tokens_seen": 46528864, "step": 21555 }, { "epoch": 3.5171288743882547, "grad_norm": 0.14017659425735474, "learning_rate": 4.0955993140046665e-05, "loss": 0.1651, "num_input_tokens_seen": 46539072, "step": 21560 }, { "epoch": 3.5179445350734095, "grad_norm": 0.11831638962030411, "learning_rate": 4.095051280243281e-05, "loss": 0.0834, "num_input_tokens_seen": 46550016, "step": 21565 }, { "epoch": 3.5187601957585644, "grad_norm": 1.5843397378921509, "learning_rate": 4.0945031171789435e-05, "loss": 0.0641, "num_input_tokens_seen": 46559968, "step": 21570 }, { "epoch": 3.519575856443719, "grad_norm": 0.5666118860244751, "learning_rate": 4.0939548248560946e-05, "loss": 0.0396, "num_input_tokens_seen": 46571456, "step": 21575 }, { "epoch": 3.5203915171288744, "grad_norm": 0.08470553159713745, "learning_rate": 4.093406403319179e-05, "loss": 0.0682, "num_input_tokens_seen": 46583104, "step": 21580 }, { "epoch": 3.5212071778140293, "grad_norm": 0.33066871762275696, "learning_rate": 4.0928578526126566e-05, "loss": 0.1707, "num_input_tokens_seen": 46592384, "step": 21585 }, { "epoch": 3.5220228384991845, "grad_norm": 0.3490210175514221, "learning_rate": 4.092309172780994e-05, "loss": 0.0124, "num_input_tokens_seen": 46603488, "step": 21590 }, { "epoch": 3.5228384991843393, "grad_norm": 0.061410918831825256, "learning_rate": 4.09176036386867e-05, "loss": 0.2111, "num_input_tokens_seen": 46614528, "step": 21595 }, { "epoch": 3.523654159869494, "grad_norm": 3.608489513397217, "learning_rate": 4.091211425920175e-05, "loss": 0.0739, "num_input_tokens_seen": 46625920, "step": 21600 }, { "epoch": 3.5244698205546494, "grad_norm": 0.05101187154650688, "learning_rate": 4.090662358980009e-05, "loss": 0.0256, "num_input_tokens_seen": 46636288, "step": 21605 }, { "epoch": 3.5252854812398042, "grad_norm": 5.769841194152832, "learning_rate": 4.0901131630926794e-05, "loss": 0.2354, "num_input_tokens_seen": 46645536, "step": 21610 }, { "epoch": 3.5261011419249595, "grad_norm": 0.13154403865337372, "learning_rate": 4.089563838302709e-05, "loss": 0.0662, "num_input_tokens_seen": 46656224, "step": 21615 }, { "epoch": 3.5269168026101143, "grad_norm": 0.40239590406417847, "learning_rate": 4.089014384654629e-05, "loss": 0.0607, "num_input_tokens_seen": 46665920, "step": 21620 }, { "epoch": 3.527732463295269, "grad_norm": 1.9793695211410522, "learning_rate": 4.088464802192981e-05, "loss": 0.0313, "num_input_tokens_seen": 46676864, "step": 21625 }, { "epoch": 3.528548123980424, "grad_norm": 0.11798906326293945, "learning_rate": 4.0879150909623156e-05, "loss": 0.1958, "num_input_tokens_seen": 46688192, "step": 21630 }, { "epoch": 3.529363784665579, "grad_norm": 0.08413796126842499, "learning_rate": 4.0873652510071955e-05, "loss": 0.1456, "num_input_tokens_seen": 46698848, "step": 21635 }, { "epoch": 3.530179445350734, "grad_norm": 7.618236064910889, "learning_rate": 4.086815282372195e-05, "loss": 0.0819, "num_input_tokens_seen": 46709920, "step": 21640 }, { "epoch": 3.5309951060358893, "grad_norm": 4.7247114181518555, "learning_rate": 4.086265185101895e-05, "loss": 0.1243, "num_input_tokens_seen": 46721152, "step": 21645 }, { "epoch": 3.531810766721044, "grad_norm": 0.1500084549188614, "learning_rate": 4.0857149592408914e-05, "loss": 0.096, "num_input_tokens_seen": 46732320, "step": 21650 }, { "epoch": 3.532626427406199, "grad_norm": 0.060236118733882904, "learning_rate": 4.085164604833788e-05, "loss": 0.1269, "num_input_tokens_seen": 46744032, "step": 21655 }, { "epoch": 3.5334420880913537, "grad_norm": 0.4299047589302063, "learning_rate": 4.084614121925198e-05, "loss": 0.127, "num_input_tokens_seen": 46755584, "step": 21660 }, { "epoch": 3.534257748776509, "grad_norm": 3.1743924617767334, "learning_rate": 4.084063510559746e-05, "loss": 0.1116, "num_input_tokens_seen": 46766368, "step": 21665 }, { "epoch": 3.535073409461664, "grad_norm": 0.2604175806045532, "learning_rate": 4.0835127707820696e-05, "loss": 0.1737, "num_input_tokens_seen": 46777920, "step": 21670 }, { "epoch": 3.535889070146819, "grad_norm": 0.0569436214864254, "learning_rate": 4.0829619026368134e-05, "loss": 0.1881, "num_input_tokens_seen": 46789312, "step": 21675 }, { "epoch": 3.536704730831974, "grad_norm": 0.10592181980609894, "learning_rate": 4.0824109061686325e-05, "loss": 0.0114, "num_input_tokens_seen": 46800736, "step": 21680 }, { "epoch": 3.5375203915171287, "grad_norm": 7.927008152008057, "learning_rate": 4.081859781422195e-05, "loss": 0.2195, "num_input_tokens_seen": 46812096, "step": 21685 }, { "epoch": 3.538336052202284, "grad_norm": 0.1780281662940979, "learning_rate": 4.0813085284421774e-05, "loss": 0.1556, "num_input_tokens_seen": 46822624, "step": 21690 }, { "epoch": 3.539151712887439, "grad_norm": 10.250486373901367, "learning_rate": 4.080757147273267e-05, "loss": 0.1929, "num_input_tokens_seen": 46833760, "step": 21695 }, { "epoch": 3.539967373572594, "grad_norm": 3.7362842559814453, "learning_rate": 4.080205637960162e-05, "loss": 0.2167, "num_input_tokens_seen": 46845312, "step": 21700 }, { "epoch": 3.540783034257749, "grad_norm": 1.0604051351547241, "learning_rate": 4.07965400054757e-05, "loss": 0.0727, "num_input_tokens_seen": 46855904, "step": 21705 }, { "epoch": 3.5415986949429037, "grad_norm": 0.11310368776321411, "learning_rate": 4.0791022350802086e-05, "loss": 0.1851, "num_input_tokens_seen": 46865760, "step": 21710 }, { "epoch": 3.5424143556280585, "grad_norm": 0.5822985172271729, "learning_rate": 4.078550341602809e-05, "loss": 0.0106, "num_input_tokens_seen": 46877888, "step": 21715 }, { "epoch": 3.5432300163132138, "grad_norm": 0.23588088154792786, "learning_rate": 4.077998320160109e-05, "loss": 0.1103, "num_input_tokens_seen": 46889312, "step": 21720 }, { "epoch": 3.5440456769983686, "grad_norm": 2.986375093460083, "learning_rate": 4.077446170796858e-05, "loss": 0.1407, "num_input_tokens_seen": 46899392, "step": 21725 }, { "epoch": 3.544861337683524, "grad_norm": 2.9055681228637695, "learning_rate": 4.076893893557816e-05, "loss": 0.074, "num_input_tokens_seen": 46910784, "step": 21730 }, { "epoch": 3.5456769983686787, "grad_norm": 2.1018295288085938, "learning_rate": 4.076341488487755e-05, "loss": 0.2735, "num_input_tokens_seen": 46921920, "step": 21735 }, { "epoch": 3.5464926590538335, "grad_norm": 0.5600054860115051, "learning_rate": 4.0757889556314545e-05, "loss": 0.0757, "num_input_tokens_seen": 46931776, "step": 21740 }, { "epoch": 3.5473083197389887, "grad_norm": 1.423844575881958, "learning_rate": 4.0752362950337054e-05, "loss": 0.2423, "num_input_tokens_seen": 46942848, "step": 21745 }, { "epoch": 3.5481239804241436, "grad_norm": 0.08828958868980408, "learning_rate": 4.0746835067393096e-05, "loss": 0.0082, "num_input_tokens_seen": 46953408, "step": 21750 }, { "epoch": 3.5489396411092984, "grad_norm": 1.2527973651885986, "learning_rate": 4.074130590793079e-05, "loss": 0.0254, "num_input_tokens_seen": 46964416, "step": 21755 }, { "epoch": 3.5497553017944536, "grad_norm": 0.1187516376376152, "learning_rate": 4.073577547239836e-05, "loss": 0.022, "num_input_tokens_seen": 46974304, "step": 21760 }, { "epoch": 3.5505709624796085, "grad_norm": 0.17528195679187775, "learning_rate": 4.073024376124412e-05, "loss": 0.042, "num_input_tokens_seen": 46984256, "step": 21765 }, { "epoch": 3.5513866231647633, "grad_norm": 0.8355662822723389, "learning_rate": 4.072471077491651e-05, "loss": 0.2306, "num_input_tokens_seen": 46995424, "step": 21770 }, { "epoch": 3.5522022838499185, "grad_norm": 0.0951867401599884, "learning_rate": 4.071917651386406e-05, "loss": 0.3113, "num_input_tokens_seen": 47006880, "step": 21775 }, { "epoch": 3.5530179445350734, "grad_norm": 2.912970542907715, "learning_rate": 4.071364097853541e-05, "loss": 0.1603, "num_input_tokens_seen": 47016864, "step": 21780 }, { "epoch": 3.5538336052202286, "grad_norm": 3.3453893661499023, "learning_rate": 4.070810416937927e-05, "loss": 0.2121, "num_input_tokens_seen": 47029248, "step": 21785 }, { "epoch": 3.5546492659053834, "grad_norm": 0.22751952707767487, "learning_rate": 4.070256608684452e-05, "loss": 0.025, "num_input_tokens_seen": 47040480, "step": 21790 }, { "epoch": 3.5554649265905383, "grad_norm": 0.13204766809940338, "learning_rate": 4.069702673138009e-05, "loss": 0.0208, "num_input_tokens_seen": 47051872, "step": 21795 }, { "epoch": 3.556280587275693, "grad_norm": 0.09759233891963959, "learning_rate": 4.0691486103435025e-05, "loss": 0.0098, "num_input_tokens_seen": 47063008, "step": 21800 }, { "epoch": 3.5570962479608483, "grad_norm": 0.07521530240774155, "learning_rate": 4.0685944203458476e-05, "loss": 0.0319, "num_input_tokens_seen": 47073792, "step": 21805 }, { "epoch": 3.557911908646003, "grad_norm": 0.11585956811904907, "learning_rate": 4.06804010318997e-05, "loss": 0.0384, "num_input_tokens_seen": 47084832, "step": 21810 }, { "epoch": 3.5587275693311584, "grad_norm": 0.2817971408367157, "learning_rate": 4.0674856589208063e-05, "loss": 0.0096, "num_input_tokens_seen": 47096032, "step": 21815 }, { "epoch": 3.5595432300163132, "grad_norm": 0.0972599983215332, "learning_rate": 4.066931087583301e-05, "loss": 0.0799, "num_input_tokens_seen": 47106720, "step": 21820 }, { "epoch": 3.560358890701468, "grad_norm": 0.07077526301145554, "learning_rate": 4.0663763892224114e-05, "loss": 0.1032, "num_input_tokens_seen": 47118592, "step": 21825 }, { "epoch": 3.5611745513866233, "grad_norm": 4.707823753356934, "learning_rate": 4.065821563883104e-05, "loss": 0.1321, "num_input_tokens_seen": 47129792, "step": 21830 }, { "epoch": 3.561990212071778, "grad_norm": 0.07917211204767227, "learning_rate": 4.0652666116103556e-05, "loss": 0.0063, "num_input_tokens_seen": 47140224, "step": 21835 }, { "epoch": 3.5628058727569334, "grad_norm": 1.6662336587905884, "learning_rate": 4.064711532449153e-05, "loss": 0.0629, "num_input_tokens_seen": 47151904, "step": 21840 }, { "epoch": 3.563621533442088, "grad_norm": 2.8767783641815186, "learning_rate": 4.0641563264444946e-05, "loss": 0.0193, "num_input_tokens_seen": 47161920, "step": 21845 }, { "epoch": 3.564437194127243, "grad_norm": 0.08682331442832947, "learning_rate": 4.063600993641389e-05, "loss": 0.166, "num_input_tokens_seen": 47172128, "step": 21850 }, { "epoch": 3.565252854812398, "grad_norm": 9.139212608337402, "learning_rate": 4.0630455340848525e-05, "loss": 0.1559, "num_input_tokens_seen": 47182784, "step": 21855 }, { "epoch": 3.566068515497553, "grad_norm": 0.028107237070798874, "learning_rate": 4.062489947819914e-05, "loss": 0.0055, "num_input_tokens_seen": 47194336, "step": 21860 }, { "epoch": 3.566884176182708, "grad_norm": 3.5822339057922363, "learning_rate": 4.061934234891612e-05, "loss": 0.1516, "num_input_tokens_seen": 47204832, "step": 21865 }, { "epoch": 3.567699836867863, "grad_norm": 0.12957142293453217, "learning_rate": 4.0613783953449966e-05, "loss": 0.1401, "num_input_tokens_seen": 47215648, "step": 21870 }, { "epoch": 3.568515497553018, "grad_norm": 0.035445135086774826, "learning_rate": 4.0608224292251264e-05, "loss": 0.1497, "num_input_tokens_seen": 47226080, "step": 21875 }, { "epoch": 3.569331158238173, "grad_norm": 2.962271213531494, "learning_rate": 4.0602663365770696e-05, "loss": 0.1262, "num_input_tokens_seen": 47238112, "step": 21880 }, { "epoch": 3.5701468189233276, "grad_norm": 3.3195712566375732, "learning_rate": 4.0597101174459074e-05, "loss": 0.1438, "num_input_tokens_seen": 47249216, "step": 21885 }, { "epoch": 3.570962479608483, "grad_norm": 0.04456210136413574, "learning_rate": 4.0591537718767284e-05, "loss": 0.0894, "num_input_tokens_seen": 47259296, "step": 21890 }, { "epoch": 3.5717781402936377, "grad_norm": 0.8475660681724548, "learning_rate": 4.058597299914634e-05, "loss": 0.1626, "num_input_tokens_seen": 47270336, "step": 21895 }, { "epoch": 3.572593800978793, "grad_norm": 4.9658894538879395, "learning_rate": 4.0580407016047345e-05, "loss": 0.0797, "num_input_tokens_seen": 47280736, "step": 21900 }, { "epoch": 3.573409461663948, "grad_norm": 2.843942880630493, "learning_rate": 4.0574839769921504e-05, "loss": 0.1598, "num_input_tokens_seen": 47291328, "step": 21905 }, { "epoch": 3.5742251223491026, "grad_norm": 0.07666657119989395, "learning_rate": 4.056927126122012e-05, "loss": 0.115, "num_input_tokens_seen": 47302368, "step": 21910 }, { "epoch": 3.575040783034258, "grad_norm": 4.6631622314453125, "learning_rate": 4.056370149039461e-05, "loss": 0.032, "num_input_tokens_seen": 47312768, "step": 21915 }, { "epoch": 3.5758564437194127, "grad_norm": 0.055636707693338394, "learning_rate": 4.05581304578965e-05, "loss": 0.0083, "num_input_tokens_seen": 47323520, "step": 21920 }, { "epoch": 3.576672104404568, "grad_norm": 10.303662300109863, "learning_rate": 4.055255816417738e-05, "loss": 0.2338, "num_input_tokens_seen": 47334688, "step": 21925 }, { "epoch": 3.5774877650897228, "grad_norm": 0.2885773777961731, "learning_rate": 4.054698460968899e-05, "loss": 0.1213, "num_input_tokens_seen": 47345184, "step": 21930 }, { "epoch": 3.5783034257748776, "grad_norm": 2.2620084285736084, "learning_rate": 4.054140979488314e-05, "loss": 0.0632, "num_input_tokens_seen": 47356608, "step": 21935 }, { "epoch": 3.5791190864600324, "grad_norm": 4.410715579986572, "learning_rate": 4.0535833720211755e-05, "loss": 0.4028, "num_input_tokens_seen": 47367520, "step": 21940 }, { "epoch": 3.5799347471451877, "grad_norm": 5.771556377410889, "learning_rate": 4.053025638612686e-05, "loss": 0.1804, "num_input_tokens_seen": 47376288, "step": 21945 }, { "epoch": 3.5807504078303425, "grad_norm": 0.24861368536949158, "learning_rate": 4.052467779308058e-05, "loss": 0.0099, "num_input_tokens_seen": 47386592, "step": 21950 }, { "epoch": 3.5815660685154977, "grad_norm": 2.6123125553131104, "learning_rate": 4.051909794152515e-05, "loss": 0.2404, "num_input_tokens_seen": 47397792, "step": 21955 }, { "epoch": 3.5823817292006526, "grad_norm": 0.07052227109670639, "learning_rate": 4.05135168319129e-05, "loss": 0.0078, "num_input_tokens_seen": 47409280, "step": 21960 }, { "epoch": 3.5831973898858074, "grad_norm": 0.0400143526494503, "learning_rate": 4.050793446469626e-05, "loss": 0.0118, "num_input_tokens_seen": 47419680, "step": 21965 }, { "epoch": 3.5840130505709626, "grad_norm": 6.331526756286621, "learning_rate": 4.0502350840327764e-05, "loss": 0.0409, "num_input_tokens_seen": 47431296, "step": 21970 }, { "epoch": 3.5848287112561175, "grad_norm": 5.153724193572998, "learning_rate": 4.0496765959260055e-05, "loss": 0.0997, "num_input_tokens_seen": 47441824, "step": 21975 }, { "epoch": 3.5856443719412723, "grad_norm": 4.120921611785889, "learning_rate": 4.049117982194586e-05, "loss": 0.0082, "num_input_tokens_seen": 47452192, "step": 21980 }, { "epoch": 3.5864600326264275, "grad_norm": 4.24963903427124, "learning_rate": 4.048559242883804e-05, "loss": 0.1077, "num_input_tokens_seen": 47461696, "step": 21985 }, { "epoch": 3.5872756933115824, "grad_norm": 0.09849881380796432, "learning_rate": 4.0480003780389507e-05, "loss": 0.1782, "num_input_tokens_seen": 47472960, "step": 21990 }, { "epoch": 3.588091353996737, "grad_norm": 0.10503087937831879, "learning_rate": 4.0474413877053335e-05, "loss": 0.2302, "num_input_tokens_seen": 47483712, "step": 21995 }, { "epoch": 3.5889070146818924, "grad_norm": 0.1379232257604599, "learning_rate": 4.0468822719282654e-05, "loss": 0.1462, "num_input_tokens_seen": 47493728, "step": 22000 }, { "epoch": 3.5897226753670473, "grad_norm": 0.04909015819430351, "learning_rate": 4.046323030753071e-05, "loss": 0.0048, "num_input_tokens_seen": 47504928, "step": 22005 }, { "epoch": 3.5905383360522025, "grad_norm": 0.12813489139080048, "learning_rate": 4.045763664225087e-05, "loss": 0.1032, "num_input_tokens_seen": 47514880, "step": 22010 }, { "epoch": 3.5913539967373573, "grad_norm": 0.569116473197937, "learning_rate": 4.045204172389656e-05, "loss": 0.0439, "num_input_tokens_seen": 47526688, "step": 22015 }, { "epoch": 3.592169657422512, "grad_norm": 0.07062584906816483, "learning_rate": 4.044644555292135e-05, "loss": 0.0286, "num_input_tokens_seen": 47536768, "step": 22020 }, { "epoch": 3.592985318107667, "grad_norm": 0.05885003134608269, "learning_rate": 4.04408481297789e-05, "loss": 0.1038, "num_input_tokens_seen": 47548768, "step": 22025 }, { "epoch": 3.5938009787928222, "grad_norm": 0.07328484207391739, "learning_rate": 4.043524945492294e-05, "loss": 0.1124, "num_input_tokens_seen": 47559968, "step": 22030 }, { "epoch": 3.594616639477977, "grad_norm": 3.6764700412750244, "learning_rate": 4.042964952880734e-05, "loss": 0.1308, "num_input_tokens_seen": 47571040, "step": 22035 }, { "epoch": 3.5954323001631323, "grad_norm": 2.515673875808716, "learning_rate": 4.042404835188607e-05, "loss": 0.0795, "num_input_tokens_seen": 47582816, "step": 22040 }, { "epoch": 3.596247960848287, "grad_norm": 0.10273520648479462, "learning_rate": 4.041844592461318e-05, "loss": 0.0245, "num_input_tokens_seen": 47594112, "step": 22045 }, { "epoch": 3.597063621533442, "grad_norm": 0.09178955107927322, "learning_rate": 4.0412842247442815e-05, "loss": 0.1805, "num_input_tokens_seen": 47604320, "step": 22050 }, { "epoch": 3.597879282218597, "grad_norm": 0.048253919929265976, "learning_rate": 4.040723732082927e-05, "loss": 0.1487, "num_input_tokens_seen": 47613472, "step": 22055 }, { "epoch": 3.598694942903752, "grad_norm": 12.505542755126953, "learning_rate": 4.040163114522689e-05, "loss": 0.265, "num_input_tokens_seen": 47624096, "step": 22060 }, { "epoch": 3.5995106035889073, "grad_norm": 7.198615550994873, "learning_rate": 4.039602372109014e-05, "loss": 0.1578, "num_input_tokens_seen": 47635456, "step": 22065 }, { "epoch": 3.600326264274062, "grad_norm": 4.6033124923706055, "learning_rate": 4.0390415048873584e-05, "loss": 0.1346, "num_input_tokens_seen": 47644448, "step": 22070 }, { "epoch": 3.601141924959217, "grad_norm": 4.066956043243408, "learning_rate": 4.03848051290319e-05, "loss": 0.1952, "num_input_tokens_seen": 47654144, "step": 22075 }, { "epoch": 3.6019575856443717, "grad_norm": 0.04538601636886597, "learning_rate": 4.037919396201985e-05, "loss": 0.0186, "num_input_tokens_seen": 47664672, "step": 22080 }, { "epoch": 3.602773246329527, "grad_norm": 4.017793655395508, "learning_rate": 4.0373581548292305e-05, "loss": 0.1193, "num_input_tokens_seen": 47676512, "step": 22085 }, { "epoch": 3.603588907014682, "grad_norm": 4.2030463218688965, "learning_rate": 4.036796788830423e-05, "loss": 0.1719, "num_input_tokens_seen": 47687168, "step": 22090 }, { "epoch": 3.604404567699837, "grad_norm": 7.6258745193481445, "learning_rate": 4.036235298251071e-05, "loss": 0.2498, "num_input_tokens_seen": 47698208, "step": 22095 }, { "epoch": 3.605220228384992, "grad_norm": 1.149688720703125, "learning_rate": 4.03567368313669e-05, "loss": 0.0456, "num_input_tokens_seen": 47708448, "step": 22100 }, { "epoch": 3.6060358890701467, "grad_norm": 0.1677428036928177, "learning_rate": 4.035111943532808e-05, "loss": 0.0714, "num_input_tokens_seen": 47718624, "step": 22105 }, { "epoch": 3.6068515497553015, "grad_norm": 3.900057792663574, "learning_rate": 4.034550079484964e-05, "loss": 0.33, "num_input_tokens_seen": 47729184, "step": 22110 }, { "epoch": 3.607667210440457, "grad_norm": 0.27360770106315613, "learning_rate": 4.033988091038704e-05, "loss": 0.0163, "num_input_tokens_seen": 47740992, "step": 22115 }, { "epoch": 3.6084828711256116, "grad_norm": 0.5041369795799255, "learning_rate": 4.0334259782395855e-05, "loss": 0.0946, "num_input_tokens_seen": 47751520, "step": 22120 }, { "epoch": 3.609298531810767, "grad_norm": 0.6643962860107422, "learning_rate": 4.032863741133177e-05, "loss": 0.1931, "num_input_tokens_seen": 47761920, "step": 22125 }, { "epoch": 3.6101141924959217, "grad_norm": 0.24680005013942719, "learning_rate": 4.0323013797650556e-05, "loss": 0.2495, "num_input_tokens_seen": 47773376, "step": 22130 }, { "epoch": 3.6109298531810765, "grad_norm": 0.7548781633377075, "learning_rate": 4.0317388941808096e-05, "loss": 0.0122, "num_input_tokens_seen": 47784064, "step": 22135 }, { "epoch": 3.6117455138662318, "grad_norm": 3.6045496463775635, "learning_rate": 4.0311762844260377e-05, "loss": 0.0533, "num_input_tokens_seen": 47794592, "step": 22140 }, { "epoch": 3.6125611745513866, "grad_norm": 2.84478497505188, "learning_rate": 4.030613550546347e-05, "loss": 0.0654, "num_input_tokens_seen": 47805504, "step": 22145 }, { "epoch": 3.613376835236542, "grad_norm": 0.22830621898174286, "learning_rate": 4.030050692587355e-05, "loss": 0.0894, "num_input_tokens_seen": 47816128, "step": 22150 }, { "epoch": 3.6141924959216967, "grad_norm": 2.5198605060577393, "learning_rate": 4.02948771059469e-05, "loss": 0.1818, "num_input_tokens_seen": 47827264, "step": 22155 }, { "epoch": 3.6150081566068515, "grad_norm": 0.3472936451435089, "learning_rate": 4.028924604613991e-05, "loss": 0.0564, "num_input_tokens_seen": 47838208, "step": 22160 }, { "epoch": 3.6158238172920063, "grad_norm": 6.473494052886963, "learning_rate": 4.028361374690906e-05, "loss": 0.3111, "num_input_tokens_seen": 47848192, "step": 22165 }, { "epoch": 3.6166394779771616, "grad_norm": 0.1116599515080452, "learning_rate": 4.027798020871093e-05, "loss": 0.0812, "num_input_tokens_seen": 47858688, "step": 22170 }, { "epoch": 3.6174551386623164, "grad_norm": 0.28514230251312256, "learning_rate": 4.027234543200221e-05, "loss": 0.0069, "num_input_tokens_seen": 47870048, "step": 22175 }, { "epoch": 3.6182707993474716, "grad_norm": 3.8633968830108643, "learning_rate": 4.026670941723968e-05, "loss": 0.2806, "num_input_tokens_seen": 47881984, "step": 22180 }, { "epoch": 3.6190864600326265, "grad_norm": 1.2139700651168823, "learning_rate": 4.026107216488022e-05, "loss": 0.0192, "num_input_tokens_seen": 47893376, "step": 22185 }, { "epoch": 3.6199021207177813, "grad_norm": 0.20846295356750488, "learning_rate": 4.0255433675380803e-05, "loss": 0.1225, "num_input_tokens_seen": 47903680, "step": 22190 }, { "epoch": 3.6207177814029365, "grad_norm": 4.988443374633789, "learning_rate": 4.024979394919855e-05, "loss": 0.2028, "num_input_tokens_seen": 47914144, "step": 22195 }, { "epoch": 3.6215334420880914, "grad_norm": 0.16319090127944946, "learning_rate": 4.0244152986790604e-05, "loss": 0.0056, "num_input_tokens_seen": 47924224, "step": 22200 }, { "epoch": 3.622349102773246, "grad_norm": 0.17418339848518372, "learning_rate": 4.0238510788614276e-05, "loss": 0.0155, "num_input_tokens_seen": 47935264, "step": 22205 }, { "epoch": 3.6231647634584014, "grad_norm": 9.328039169311523, "learning_rate": 4.0232867355126934e-05, "loss": 0.1032, "num_input_tokens_seen": 47946432, "step": 22210 }, { "epoch": 3.6239804241435563, "grad_norm": 0.0981740653514862, "learning_rate": 4.0227222686786084e-05, "loss": 0.1034, "num_input_tokens_seen": 47956608, "step": 22215 }, { "epoch": 3.624796084828711, "grad_norm": 0.2725767195224762, "learning_rate": 4.02215767840493e-05, "loss": 0.2099, "num_input_tokens_seen": 47967552, "step": 22220 }, { "epoch": 3.6256117455138663, "grad_norm": 2.8347389698028564, "learning_rate": 4.021592964737427e-05, "loss": 0.1766, "num_input_tokens_seen": 47977888, "step": 22225 }, { "epoch": 3.626427406199021, "grad_norm": 0.5867570638656616, "learning_rate": 4.021028127721878e-05, "loss": 0.0112, "num_input_tokens_seen": 47988576, "step": 22230 }, { "epoch": 3.6272430668841764, "grad_norm": 6.528834819793701, "learning_rate": 4.020463167404071e-05, "loss": 0.3195, "num_input_tokens_seen": 47999776, "step": 22235 }, { "epoch": 3.6280587275693312, "grad_norm": 0.07322845607995987, "learning_rate": 4.019898083829804e-05, "loss": 0.16, "num_input_tokens_seen": 48010688, "step": 22240 }, { "epoch": 3.628874388254486, "grad_norm": 0.10080315917730331, "learning_rate": 4.019332877044888e-05, "loss": 0.0074, "num_input_tokens_seen": 48021152, "step": 22245 }, { "epoch": 3.629690048939641, "grad_norm": 3.8889048099517822, "learning_rate": 4.018767547095139e-05, "loss": 0.1934, "num_input_tokens_seen": 48030176, "step": 22250 }, { "epoch": 3.630505709624796, "grad_norm": 4.339661121368408, "learning_rate": 4.018202094026386e-05, "loss": 0.161, "num_input_tokens_seen": 48039232, "step": 22255 }, { "epoch": 3.631321370309951, "grad_norm": 0.1206539049744606, "learning_rate": 4.01763651788447e-05, "loss": 0.1115, "num_input_tokens_seen": 48049984, "step": 22260 }, { "epoch": 3.632137030995106, "grad_norm": 0.10961958020925522, "learning_rate": 4.017070818715235e-05, "loss": 0.0891, "num_input_tokens_seen": 48061088, "step": 22265 }, { "epoch": 3.632952691680261, "grad_norm": 2.189587354660034, "learning_rate": 4.016504996564544e-05, "loss": 0.116, "num_input_tokens_seen": 48072704, "step": 22270 }, { "epoch": 3.633768352365416, "grad_norm": 0.13103020191192627, "learning_rate": 4.015939051478262e-05, "loss": 0.0934, "num_input_tokens_seen": 48082848, "step": 22275 }, { "epoch": 3.634584013050571, "grad_norm": 1.8878916501998901, "learning_rate": 4.0153729835022685e-05, "loss": 0.1781, "num_input_tokens_seen": 48094464, "step": 22280 }, { "epoch": 3.635399673735726, "grad_norm": 2.901374101638794, "learning_rate": 4.014806792682453e-05, "loss": 0.1184, "num_input_tokens_seen": 48105088, "step": 22285 }, { "epoch": 3.636215334420881, "grad_norm": 3.0098648071289062, "learning_rate": 4.0142404790647124e-05, "loss": 0.1215, "num_input_tokens_seen": 48114432, "step": 22290 }, { "epoch": 3.637030995106036, "grad_norm": 6.524054527282715, "learning_rate": 4.0136740426949546e-05, "loss": 0.0509, "num_input_tokens_seen": 48124288, "step": 22295 }, { "epoch": 3.637846655791191, "grad_norm": 4.471999645233154, "learning_rate": 4.0131074836191e-05, "loss": 0.2926, "num_input_tokens_seen": 48134144, "step": 22300 }, { "epoch": 3.6386623164763456, "grad_norm": 0.30536070466041565, "learning_rate": 4.0125408018830744e-05, "loss": 0.0636, "num_input_tokens_seen": 48144416, "step": 22305 }, { "epoch": 3.639477977161501, "grad_norm": 0.401883602142334, "learning_rate": 4.011973997532818e-05, "loss": 0.0148, "num_input_tokens_seen": 48155584, "step": 22310 }, { "epoch": 3.6402936378466557, "grad_norm": 7.902167320251465, "learning_rate": 4.011407070614276e-05, "loss": 0.2724, "num_input_tokens_seen": 48167232, "step": 22315 }, { "epoch": 3.641109298531811, "grad_norm": 2.3360376358032227, "learning_rate": 4.010840021173409e-05, "loss": 0.148, "num_input_tokens_seen": 48178208, "step": 22320 }, { "epoch": 3.641924959216966, "grad_norm": 0.12659363448619843, "learning_rate": 4.010272849256184e-05, "loss": 0.0125, "num_input_tokens_seen": 48188896, "step": 22325 }, { "epoch": 3.6427406199021206, "grad_norm": 3.3409955501556396, "learning_rate": 4.0097055549085784e-05, "loss": 0.3578, "num_input_tokens_seen": 48200000, "step": 22330 }, { "epoch": 3.6435562805872754, "grad_norm": 2.3330581188201904, "learning_rate": 4.009138138176581e-05, "loss": 0.1743, "num_input_tokens_seen": 48211200, "step": 22335 }, { "epoch": 3.6443719412724307, "grad_norm": 4.646770477294922, "learning_rate": 4.008570599106188e-05, "loss": 0.2123, "num_input_tokens_seen": 48221504, "step": 22340 }, { "epoch": 3.6451876019575855, "grad_norm": 1.5298365354537964, "learning_rate": 4.008002937743409e-05, "loss": 0.0963, "num_input_tokens_seen": 48230976, "step": 22345 }, { "epoch": 3.6460032626427408, "grad_norm": 5.603265762329102, "learning_rate": 4.0074351541342595e-05, "loss": 0.0707, "num_input_tokens_seen": 48242368, "step": 22350 }, { "epoch": 3.6468189233278956, "grad_norm": 0.32391926646232605, "learning_rate": 4.006867248324767e-05, "loss": 0.0429, "num_input_tokens_seen": 48253696, "step": 22355 }, { "epoch": 3.6476345840130504, "grad_norm": 0.1386079043149948, "learning_rate": 4.006299220360971e-05, "loss": 0.1191, "num_input_tokens_seen": 48264768, "step": 22360 }, { "epoch": 3.6484502446982057, "grad_norm": 0.21881791949272156, "learning_rate": 4.0057310702889164e-05, "loss": 0.1884, "num_input_tokens_seen": 48275520, "step": 22365 }, { "epoch": 3.6492659053833605, "grad_norm": 0.826500654220581, "learning_rate": 4.005162798154661e-05, "loss": 0.0849, "num_input_tokens_seen": 48284960, "step": 22370 }, { "epoch": 3.6500815660685157, "grad_norm": 0.33012261986732483, "learning_rate": 4.004594404004273e-05, "loss": 0.007, "num_input_tokens_seen": 48295840, "step": 22375 }, { "epoch": 3.6508972267536706, "grad_norm": 2.405694007873535, "learning_rate": 4.0040258878838284e-05, "loss": 0.0482, "num_input_tokens_seen": 48306880, "step": 22380 }, { "epoch": 3.6517128874388254, "grad_norm": 9.564769744873047, "learning_rate": 4.003457249839413e-05, "loss": 0.1081, "num_input_tokens_seen": 48318336, "step": 22385 }, { "epoch": 3.65252854812398, "grad_norm": 6.086526393890381, "learning_rate": 4.002888489917126e-05, "loss": 0.2148, "num_input_tokens_seen": 48329152, "step": 22390 }, { "epoch": 3.6533442088091355, "grad_norm": 1.8801467418670654, "learning_rate": 4.002319608163071e-05, "loss": 0.0184, "num_input_tokens_seen": 48339136, "step": 22395 }, { "epoch": 3.6541598694942903, "grad_norm": 0.10271084308624268, "learning_rate": 4.0017506046233664e-05, "loss": 0.0898, "num_input_tokens_seen": 48350208, "step": 22400 }, { "epoch": 3.6549755301794455, "grad_norm": 0.14470317959785461, "learning_rate": 4.001181479344138e-05, "loss": 0.3134, "num_input_tokens_seen": 48362016, "step": 22405 }, { "epoch": 3.6557911908646004, "grad_norm": 0.4115452766418457, "learning_rate": 4.000612232371522e-05, "loss": 0.1287, "num_input_tokens_seen": 48371424, "step": 22410 }, { "epoch": 3.656606851549755, "grad_norm": 0.11699778586626053, "learning_rate": 4.000042863751664e-05, "loss": 0.2665, "num_input_tokens_seen": 48383328, "step": 22415 }, { "epoch": 3.6574225122349104, "grad_norm": 0.1325097382068634, "learning_rate": 3.999473373530721e-05, "loss": 0.0129, "num_input_tokens_seen": 48394304, "step": 22420 }, { "epoch": 3.6582381729200653, "grad_norm": 0.4000729024410248, "learning_rate": 3.9989037617548575e-05, "loss": 0.0617, "num_input_tokens_seen": 48405376, "step": 22425 }, { "epoch": 3.65905383360522, "grad_norm": 2.915386915206909, "learning_rate": 3.9983340284702495e-05, "loss": 0.0663, "num_input_tokens_seen": 48416352, "step": 22430 }, { "epoch": 3.6598694942903753, "grad_norm": 1.3424534797668457, "learning_rate": 3.9977641737230833e-05, "loss": 0.2314, "num_input_tokens_seen": 48427488, "step": 22435 }, { "epoch": 3.66068515497553, "grad_norm": 0.15219749510288239, "learning_rate": 3.9971941975595535e-05, "loss": 0.0193, "num_input_tokens_seen": 48439520, "step": 22440 }, { "epoch": 3.661500815660685, "grad_norm": 0.43363040685653687, "learning_rate": 3.996624100025865e-05, "loss": 0.0262, "num_input_tokens_seen": 48449984, "step": 22445 }, { "epoch": 3.6623164763458402, "grad_norm": 4.819627285003662, "learning_rate": 3.9960538811682334e-05, "loss": 0.246, "num_input_tokens_seen": 48460672, "step": 22450 }, { "epoch": 3.663132137030995, "grad_norm": 5.342092514038086, "learning_rate": 3.9954835410328836e-05, "loss": 0.0607, "num_input_tokens_seen": 48471328, "step": 22455 }, { "epoch": 3.6639477977161503, "grad_norm": 0.058290671557188034, "learning_rate": 3.9949130796660496e-05, "loss": 0.0115, "num_input_tokens_seen": 48481664, "step": 22460 }, { "epoch": 3.664763458401305, "grad_norm": 0.13387128710746765, "learning_rate": 3.994342497113977e-05, "loss": 0.0682, "num_input_tokens_seen": 48492896, "step": 22465 }, { "epoch": 3.66557911908646, "grad_norm": 16.06231689453125, "learning_rate": 3.993771793422918e-05, "loss": 0.0848, "num_input_tokens_seen": 48503648, "step": 22470 }, { "epoch": 3.6663947797716148, "grad_norm": 1.8369101285934448, "learning_rate": 3.993200968639139e-05, "loss": 0.0649, "num_input_tokens_seen": 48514368, "step": 22475 }, { "epoch": 3.66721044045677, "grad_norm": 0.08171354234218597, "learning_rate": 3.9926300228089124e-05, "loss": 0.21, "num_input_tokens_seen": 48525504, "step": 22480 }, { "epoch": 3.668026101141925, "grad_norm": 0.33790531754493713, "learning_rate": 3.992058955978523e-05, "loss": 0.0831, "num_input_tokens_seen": 48534880, "step": 22485 }, { "epoch": 3.66884176182708, "grad_norm": 3.9071567058563232, "learning_rate": 3.9914877681942645e-05, "loss": 0.1641, "num_input_tokens_seen": 48546656, "step": 22490 }, { "epoch": 3.669657422512235, "grad_norm": 0.05480445548892021, "learning_rate": 3.99091645950244e-05, "loss": 0.0346, "num_input_tokens_seen": 48557952, "step": 22495 }, { "epoch": 3.6704730831973897, "grad_norm": 1.3236916065216064, "learning_rate": 3.990345029949361e-05, "loss": 0.0472, "num_input_tokens_seen": 48568832, "step": 22500 }, { "epoch": 3.671288743882545, "grad_norm": 8.661673545837402, "learning_rate": 3.9897734795813524e-05, "loss": 0.1859, "num_input_tokens_seen": 48578784, "step": 22505 }, { "epoch": 3.6721044045677, "grad_norm": 0.064410001039505, "learning_rate": 3.989201808444747e-05, "loss": 0.0965, "num_input_tokens_seen": 48590016, "step": 22510 }, { "epoch": 3.672920065252855, "grad_norm": 3.6234500408172607, "learning_rate": 3.988630016585887e-05, "loss": 0.0224, "num_input_tokens_seen": 48600608, "step": 22515 }, { "epoch": 3.67373572593801, "grad_norm": 0.05369186773896217, "learning_rate": 3.988058104051124e-05, "loss": 0.0091, "num_input_tokens_seen": 48610912, "step": 22520 }, { "epoch": 3.6745513866231647, "grad_norm": 0.2313581258058548, "learning_rate": 3.987486070886821e-05, "loss": 0.0782, "num_input_tokens_seen": 48622272, "step": 22525 }, { "epoch": 3.6753670473083195, "grad_norm": 0.08435726910829544, "learning_rate": 3.98691391713935e-05, "loss": 0.0125, "num_input_tokens_seen": 48633376, "step": 22530 }, { "epoch": 3.676182707993475, "grad_norm": 0.24193263053894043, "learning_rate": 3.986341642855092e-05, "loss": 0.1817, "num_input_tokens_seen": 48644608, "step": 22535 }, { "epoch": 3.6769983686786296, "grad_norm": 0.3538898825645447, "learning_rate": 3.985769248080439e-05, "loss": 0.0667, "num_input_tokens_seen": 48656448, "step": 22540 }, { "epoch": 3.677814029363785, "grad_norm": 2.382725954055786, "learning_rate": 3.9851967328617925e-05, "loss": 0.2129, "num_input_tokens_seen": 48667936, "step": 22545 }, { "epoch": 3.6786296900489397, "grad_norm": 0.2911075949668884, "learning_rate": 3.984624097245562e-05, "loss": 0.252, "num_input_tokens_seen": 48678240, "step": 22550 }, { "epoch": 3.6794453507340945, "grad_norm": 4.779908657073975, "learning_rate": 3.98405134127817e-05, "loss": 0.1643, "num_input_tokens_seen": 48687296, "step": 22555 }, { "epoch": 3.6802610114192493, "grad_norm": 0.07430237531661987, "learning_rate": 3.983478465006045e-05, "loss": 0.0067, "num_input_tokens_seen": 48698720, "step": 22560 }, { "epoch": 3.6810766721044046, "grad_norm": 3.6542553901672363, "learning_rate": 3.9829054684756304e-05, "loss": 0.1002, "num_input_tokens_seen": 48709696, "step": 22565 }, { "epoch": 3.6818923327895594, "grad_norm": 5.811183452606201, "learning_rate": 3.982332351733373e-05, "loss": 0.0659, "num_input_tokens_seen": 48720032, "step": 22570 }, { "epoch": 3.6827079934747147, "grad_norm": 3.3798558712005615, "learning_rate": 3.981759114825735e-05, "loss": 0.1565, "num_input_tokens_seen": 48729664, "step": 22575 }, { "epoch": 3.6835236541598695, "grad_norm": 0.1618494689464569, "learning_rate": 3.981185757799184e-05, "loss": 0.0121, "num_input_tokens_seen": 48742336, "step": 22580 }, { "epoch": 3.6843393148450243, "grad_norm": 0.2879089415073395, "learning_rate": 3.9806122807002e-05, "loss": 0.0097, "num_input_tokens_seen": 48754048, "step": 22585 }, { "epoch": 3.6851549755301796, "grad_norm": 0.32481124997138977, "learning_rate": 3.9800386835752726e-05, "loss": 0.0145, "num_input_tokens_seen": 48765344, "step": 22590 }, { "epoch": 3.6859706362153344, "grad_norm": 0.20661230385303497, "learning_rate": 3.979464966470899e-05, "loss": 0.131, "num_input_tokens_seen": 48777152, "step": 22595 }, { "epoch": 3.6867862969004896, "grad_norm": 0.6438668966293335, "learning_rate": 3.978891129433588e-05, "loss": 0.166, "num_input_tokens_seen": 48786880, "step": 22600 }, { "epoch": 3.6876019575856445, "grad_norm": 0.03673599287867546, "learning_rate": 3.97831717250986e-05, "loss": 0.0039, "num_input_tokens_seen": 48796256, "step": 22605 }, { "epoch": 3.6884176182707993, "grad_norm": 0.1118549332022667, "learning_rate": 3.97774309574624e-05, "loss": 0.1462, "num_input_tokens_seen": 48808096, "step": 22610 }, { "epoch": 3.689233278955954, "grad_norm": 2.930147647857666, "learning_rate": 3.977168899189267e-05, "loss": 0.1644, "num_input_tokens_seen": 48818848, "step": 22615 }, { "epoch": 3.6900489396411094, "grad_norm": 0.1776353120803833, "learning_rate": 3.9765945828854876e-05, "loss": 0.0123, "num_input_tokens_seen": 48831296, "step": 22620 }, { "epoch": 3.690864600326264, "grad_norm": 0.09796354919672012, "learning_rate": 3.97602014688146e-05, "loss": 0.1369, "num_input_tokens_seen": 48841536, "step": 22625 }, { "epoch": 3.6916802610114194, "grad_norm": 8.46008014678955, "learning_rate": 3.9754455912237486e-05, "loss": 0.329, "num_input_tokens_seen": 48853152, "step": 22630 }, { "epoch": 3.6924959216965743, "grad_norm": 0.1644945740699768, "learning_rate": 3.974870915958932e-05, "loss": 0.1864, "num_input_tokens_seen": 48862688, "step": 22635 }, { "epoch": 3.693311582381729, "grad_norm": 0.29013460874557495, "learning_rate": 3.974296121133596e-05, "loss": 0.0922, "num_input_tokens_seen": 48874688, "step": 22640 }, { "epoch": 3.6941272430668843, "grad_norm": 0.32632938027381897, "learning_rate": 3.9737212067943354e-05, "loss": 0.089, "num_input_tokens_seen": 48885856, "step": 22645 }, { "epoch": 3.694942903752039, "grad_norm": 0.1759522408246994, "learning_rate": 3.973146172987756e-05, "loss": 0.2164, "num_input_tokens_seen": 48896096, "step": 22650 }, { "epoch": 3.695758564437194, "grad_norm": 3.1296448707580566, "learning_rate": 3.9725710197604735e-05, "loss": 0.2253, "num_input_tokens_seen": 48907328, "step": 22655 }, { "epoch": 3.6965742251223492, "grad_norm": 6.081822872161865, "learning_rate": 3.971995747159113e-05, "loss": 0.052, "num_input_tokens_seen": 48918304, "step": 22660 }, { "epoch": 3.697389885807504, "grad_norm": 0.17746877670288086, "learning_rate": 3.971420355230308e-05, "loss": 0.0185, "num_input_tokens_seen": 48929440, "step": 22665 }, { "epoch": 3.698205546492659, "grad_norm": 0.16611170768737793, "learning_rate": 3.9708448440207026e-05, "loss": 0.0139, "num_input_tokens_seen": 48940256, "step": 22670 }, { "epoch": 3.699021207177814, "grad_norm": 0.11874650418758392, "learning_rate": 3.970269213576951e-05, "loss": 0.0557, "num_input_tokens_seen": 48949856, "step": 22675 }, { "epoch": 3.699836867862969, "grad_norm": 0.20596261322498322, "learning_rate": 3.969693463945717e-05, "loss": 0.0107, "num_input_tokens_seen": 48960768, "step": 22680 }, { "epoch": 3.700652528548124, "grad_norm": 8.530813217163086, "learning_rate": 3.9691175951736745e-05, "loss": 0.1962, "num_input_tokens_seen": 48971264, "step": 22685 }, { "epoch": 3.701468189233279, "grad_norm": 0.068712517619133, "learning_rate": 3.9685416073075045e-05, "loss": 0.1436, "num_input_tokens_seen": 48981632, "step": 22690 }, { "epoch": 3.702283849918434, "grad_norm": 1.9284168481826782, "learning_rate": 3.967965500393901e-05, "loss": 0.2985, "num_input_tokens_seen": 48993024, "step": 22695 }, { "epoch": 3.7030995106035887, "grad_norm": 0.6498374938964844, "learning_rate": 3.9673892744795655e-05, "loss": 0.2043, "num_input_tokens_seen": 49004064, "step": 22700 }, { "epoch": 3.703915171288744, "grad_norm": 2.0610876083374023, "learning_rate": 3.96681292961121e-05, "loss": 0.1693, "num_input_tokens_seen": 49016032, "step": 22705 }, { "epoch": 3.7047308319738987, "grad_norm": 2.4175643920898438, "learning_rate": 3.9662364658355555e-05, "loss": 0.0362, "num_input_tokens_seen": 49026336, "step": 22710 }, { "epoch": 3.705546492659054, "grad_norm": 5.219014644622803, "learning_rate": 3.965659883199334e-05, "loss": 0.105, "num_input_tokens_seen": 49037312, "step": 22715 }, { "epoch": 3.706362153344209, "grad_norm": 0.7204117774963379, "learning_rate": 3.9650831817492864e-05, "loss": 0.11, "num_input_tokens_seen": 49048480, "step": 22720 }, { "epoch": 3.7071778140293636, "grad_norm": 2.4042067527770996, "learning_rate": 3.964506361532161e-05, "loss": 0.4708, "num_input_tokens_seen": 49059840, "step": 22725 }, { "epoch": 3.707993474714519, "grad_norm": 0.5451491475105286, "learning_rate": 3.96392942259472e-05, "loss": 0.1008, "num_input_tokens_seen": 49070720, "step": 22730 }, { "epoch": 3.7088091353996737, "grad_norm": 0.12057767063379288, "learning_rate": 3.963352364983731e-05, "loss": 0.0816, "num_input_tokens_seen": 49081408, "step": 22735 }, { "epoch": 3.709624796084829, "grad_norm": 3.3763442039489746, "learning_rate": 3.962775188745975e-05, "loss": 0.0747, "num_input_tokens_seen": 49092416, "step": 22740 }, { "epoch": 3.710440456769984, "grad_norm": 1.9849827289581299, "learning_rate": 3.9621978939282405e-05, "loss": 0.1388, "num_input_tokens_seen": 49102464, "step": 22745 }, { "epoch": 3.7112561174551386, "grad_norm": 0.1819087713956833, "learning_rate": 3.961620480577325e-05, "loss": 0.166, "num_input_tokens_seen": 49113920, "step": 22750 }, { "epoch": 3.7120717781402934, "grad_norm": 0.08712363243103027, "learning_rate": 3.961042948740038e-05, "loss": 0.1381, "num_input_tokens_seen": 49124096, "step": 22755 }, { "epoch": 3.7128874388254487, "grad_norm": 0.08532737195491791, "learning_rate": 3.960465298463195e-05, "loss": 0.082, "num_input_tokens_seen": 49134048, "step": 22760 }, { "epoch": 3.7137030995106035, "grad_norm": 0.8071739673614502, "learning_rate": 3.959887529793625e-05, "loss": 0.0217, "num_input_tokens_seen": 49144800, "step": 22765 }, { "epoch": 3.7145187601957588, "grad_norm": 6.525815486907959, "learning_rate": 3.9593096427781665e-05, "loss": 0.1802, "num_input_tokens_seen": 49156128, "step": 22770 }, { "epoch": 3.7153344208809136, "grad_norm": 0.13750576972961426, "learning_rate": 3.958731637463662e-05, "loss": 0.1035, "num_input_tokens_seen": 49167168, "step": 22775 }, { "epoch": 3.7161500815660684, "grad_norm": 0.1299505978822708, "learning_rate": 3.958153513896969e-05, "loss": 0.0479, "num_input_tokens_seen": 49177248, "step": 22780 }, { "epoch": 3.7169657422512232, "grad_norm": 0.6548534631729126, "learning_rate": 3.957575272124954e-05, "loss": 0.0128, "num_input_tokens_seen": 49189568, "step": 22785 }, { "epoch": 3.7177814029363785, "grad_norm": 6.015018939971924, "learning_rate": 3.9569969121944925e-05, "loss": 0.1047, "num_input_tokens_seen": 49198912, "step": 22790 }, { "epoch": 3.7185970636215333, "grad_norm": 0.1631668359041214, "learning_rate": 3.956418434152467e-05, "loss": 0.2272, "num_input_tokens_seen": 49210112, "step": 22795 }, { "epoch": 3.7194127243066886, "grad_norm": 2.453158378601074, "learning_rate": 3.955839838045775e-05, "loss": 0.2212, "num_input_tokens_seen": 49221344, "step": 22800 }, { "epoch": 3.7202283849918434, "grad_norm": 0.03234280273318291, "learning_rate": 3.9552611239213185e-05, "loss": 0.0937, "num_input_tokens_seen": 49232064, "step": 22805 }, { "epoch": 3.721044045676998, "grad_norm": 3.2036361694335938, "learning_rate": 3.954682291826011e-05, "loss": 0.2731, "num_input_tokens_seen": 49242144, "step": 22810 }, { "epoch": 3.7218597063621535, "grad_norm": 0.5641581416130066, "learning_rate": 3.9541033418067765e-05, "loss": 0.0319, "num_input_tokens_seen": 49253248, "step": 22815 }, { "epoch": 3.7226753670473083, "grad_norm": 0.41947269439697266, "learning_rate": 3.953524273910546e-05, "loss": 0.1217, "num_input_tokens_seen": 49264384, "step": 22820 }, { "epoch": 3.7234910277324635, "grad_norm": 0.08137278258800507, "learning_rate": 3.952945088184264e-05, "loss": 0.0355, "num_input_tokens_seen": 49276416, "step": 22825 }, { "epoch": 3.7243066884176184, "grad_norm": 0.2503032386302948, "learning_rate": 3.952365784674881e-05, "loss": 0.0399, "num_input_tokens_seen": 49288800, "step": 22830 }, { "epoch": 3.725122349102773, "grad_norm": 5.778484344482422, "learning_rate": 3.951786363429357e-05, "loss": 0.1051, "num_input_tokens_seen": 49299328, "step": 22835 }, { "epoch": 3.725938009787928, "grad_norm": 2.861217737197876, "learning_rate": 3.951206824494665e-05, "loss": 0.1333, "num_input_tokens_seen": 49311360, "step": 22840 }, { "epoch": 3.7267536704730833, "grad_norm": 0.1345771700143814, "learning_rate": 3.950627167917784e-05, "loss": 0.2041, "num_input_tokens_seen": 49321632, "step": 22845 }, { "epoch": 3.727569331158238, "grad_norm": 5.471856117248535, "learning_rate": 3.950047393745705e-05, "loss": 0.0848, "num_input_tokens_seen": 49332256, "step": 22850 }, { "epoch": 3.7283849918433933, "grad_norm": 1.5649278163909912, "learning_rate": 3.949467502025426e-05, "loss": 0.1056, "num_input_tokens_seen": 49343360, "step": 22855 }, { "epoch": 3.729200652528548, "grad_norm": 0.0935976654291153, "learning_rate": 3.948887492803957e-05, "loss": 0.1103, "num_input_tokens_seen": 49354112, "step": 22860 }, { "epoch": 3.730016313213703, "grad_norm": 0.08797755092382431, "learning_rate": 3.948307366128316e-05, "loss": 0.1159, "num_input_tokens_seen": 49365120, "step": 22865 }, { "epoch": 3.7308319738988582, "grad_norm": 3.224078416824341, "learning_rate": 3.9477271220455323e-05, "loss": 0.3085, "num_input_tokens_seen": 49377600, "step": 22870 }, { "epoch": 3.731647634584013, "grad_norm": 0.08593336492776871, "learning_rate": 3.947146760602642e-05, "loss": 0.1162, "num_input_tokens_seen": 49387648, "step": 22875 }, { "epoch": 3.732463295269168, "grad_norm": 6.358371734619141, "learning_rate": 3.946566281846692e-05, "loss": 0.2713, "num_input_tokens_seen": 49398368, "step": 22880 }, { "epoch": 3.733278955954323, "grad_norm": 0.22296041250228882, "learning_rate": 3.9459856858247404e-05, "loss": 0.1504, "num_input_tokens_seen": 49408416, "step": 22885 }, { "epoch": 3.734094616639478, "grad_norm": 0.2840973138809204, "learning_rate": 3.945404972583851e-05, "loss": 0.239, "num_input_tokens_seen": 49417888, "step": 22890 }, { "epoch": 3.7349102773246328, "grad_norm": 0.3990428149700165, "learning_rate": 3.9448241421711004e-05, "loss": 0.2198, "num_input_tokens_seen": 49427296, "step": 22895 }, { "epoch": 3.735725938009788, "grad_norm": 0.13959169387817383, "learning_rate": 3.9442431946335755e-05, "loss": 0.209, "num_input_tokens_seen": 49437984, "step": 22900 }, { "epoch": 3.736541598694943, "grad_norm": 2.983022451400757, "learning_rate": 3.943662130018368e-05, "loss": 0.1907, "num_input_tokens_seen": 49448288, "step": 22905 }, { "epoch": 3.737357259380098, "grad_norm": 0.11356587707996368, "learning_rate": 3.943080948372583e-05, "loss": 0.1462, "num_input_tokens_seen": 49458624, "step": 22910 }, { "epoch": 3.738172920065253, "grad_norm": 3.975989818572998, "learning_rate": 3.942499649743335e-05, "loss": 0.0523, "num_input_tokens_seen": 49468032, "step": 22915 }, { "epoch": 3.7389885807504077, "grad_norm": 1.244232416152954, "learning_rate": 3.941918234177746e-05, "loss": 0.0235, "num_input_tokens_seen": 49479744, "step": 22920 }, { "epoch": 3.7398042414355626, "grad_norm": 7.66337251663208, "learning_rate": 3.941336701722949e-05, "loss": 0.1321, "num_input_tokens_seen": 49490688, "step": 22925 }, { "epoch": 3.740619902120718, "grad_norm": 1.0893155336380005, "learning_rate": 3.940755052426085e-05, "loss": 0.0255, "num_input_tokens_seen": 49499936, "step": 22930 }, { "epoch": 3.7414355628058726, "grad_norm": 0.09969876706600189, "learning_rate": 3.940173286334307e-05, "loss": 0.03, "num_input_tokens_seen": 49510080, "step": 22935 }, { "epoch": 3.742251223491028, "grad_norm": 0.971980094909668, "learning_rate": 3.9395914034947744e-05, "loss": 0.0408, "num_input_tokens_seen": 49521056, "step": 22940 }, { "epoch": 3.7430668841761827, "grad_norm": 0.25531283020973206, "learning_rate": 3.939009403954659e-05, "loss": 0.0144, "num_input_tokens_seen": 49531712, "step": 22945 }, { "epoch": 3.7438825448613375, "grad_norm": 1.407782793045044, "learning_rate": 3.9384272877611384e-05, "loss": 0.0522, "num_input_tokens_seen": 49542912, "step": 22950 }, { "epoch": 3.744698205546493, "grad_norm": 0.13444636762142181, "learning_rate": 3.9378450549614044e-05, "loss": 0.0092, "num_input_tokens_seen": 49553888, "step": 22955 }, { "epoch": 3.7455138662316476, "grad_norm": 1.0330255031585693, "learning_rate": 3.9372627056026544e-05, "loss": 0.0525, "num_input_tokens_seen": 49565376, "step": 22960 }, { "epoch": 3.746329526916803, "grad_norm": 0.7507807612419128, "learning_rate": 3.9366802397320966e-05, "loss": 0.0254, "num_input_tokens_seen": 49577184, "step": 22965 }, { "epoch": 3.7471451876019577, "grad_norm": 1.9532883167266846, "learning_rate": 3.9360976573969494e-05, "loss": 0.0124, "num_input_tokens_seen": 49588608, "step": 22970 }, { "epoch": 3.7479608482871125, "grad_norm": 0.11609118431806564, "learning_rate": 3.935514958644439e-05, "loss": 0.005, "num_input_tokens_seen": 49599840, "step": 22975 }, { "epoch": 3.7487765089722673, "grad_norm": 0.15790703892707825, "learning_rate": 3.934932143521803e-05, "loss": 0.2568, "num_input_tokens_seen": 49611040, "step": 22980 }, { "epoch": 3.7495921696574226, "grad_norm": 0.09029453992843628, "learning_rate": 3.934349212076286e-05, "loss": 0.018, "num_input_tokens_seen": 49621472, "step": 22985 }, { "epoch": 3.7504078303425774, "grad_norm": 0.054871074855327606, "learning_rate": 3.933766164355145e-05, "loss": 0.1665, "num_input_tokens_seen": 49632832, "step": 22990 }, { "epoch": 3.7512234910277327, "grad_norm": 6.67854642868042, "learning_rate": 3.9331830004056424e-05, "loss": 0.3048, "num_input_tokens_seen": 49643136, "step": 22995 }, { "epoch": 3.7520391517128875, "grad_norm": 2.5512938499450684, "learning_rate": 3.932599720275055e-05, "loss": 0.1603, "num_input_tokens_seen": 49653920, "step": 23000 }, { "epoch": 3.7528548123980423, "grad_norm": 0.0809488296508789, "learning_rate": 3.9320163240106656e-05, "loss": 0.0827, "num_input_tokens_seen": 49664992, "step": 23005 }, { "epoch": 3.753670473083197, "grad_norm": 0.07908891886472702, "learning_rate": 3.931432811659766e-05, "loss": 0.0055, "num_input_tokens_seen": 49676000, "step": 23010 }, { "epoch": 3.7544861337683524, "grad_norm": 0.10362471640110016, "learning_rate": 3.9308491832696596e-05, "loss": 0.0187, "num_input_tokens_seen": 49687712, "step": 23015 }, { "epoch": 3.755301794453507, "grad_norm": 0.40513619780540466, "learning_rate": 3.930265438887659e-05, "loss": 0.0395, "num_input_tokens_seen": 49698944, "step": 23020 }, { "epoch": 3.7561174551386625, "grad_norm": 0.07383420318365097, "learning_rate": 3.929681578561084e-05, "loss": 0.0855, "num_input_tokens_seen": 49710048, "step": 23025 }, { "epoch": 3.7569331158238173, "grad_norm": 1.2087088823318481, "learning_rate": 3.929097602337267e-05, "loss": 0.2128, "num_input_tokens_seen": 49720672, "step": 23030 }, { "epoch": 3.757748776508972, "grad_norm": 0.9839165210723877, "learning_rate": 3.9285135102635474e-05, "loss": 0.0764, "num_input_tokens_seen": 49731776, "step": 23035 }, { "epoch": 3.7585644371941274, "grad_norm": 0.16747084259986877, "learning_rate": 3.9279293023872745e-05, "loss": 0.1409, "num_input_tokens_seen": 49742336, "step": 23040 }, { "epoch": 3.759380097879282, "grad_norm": 3.6237456798553467, "learning_rate": 3.927344978755806e-05, "loss": 0.3473, "num_input_tokens_seen": 49753408, "step": 23045 }, { "epoch": 3.7601957585644374, "grad_norm": 0.05490643531084061, "learning_rate": 3.926760539416512e-05, "loss": 0.05, "num_input_tokens_seen": 49764192, "step": 23050 }, { "epoch": 3.7610114192495923, "grad_norm": 1.0296424627304077, "learning_rate": 3.926175984416769e-05, "loss": 0.0124, "num_input_tokens_seen": 49775840, "step": 23055 }, { "epoch": 3.761827079934747, "grad_norm": 0.07540196925401688, "learning_rate": 3.9255913138039645e-05, "loss": 0.0601, "num_input_tokens_seen": 49785408, "step": 23060 }, { "epoch": 3.762642740619902, "grad_norm": 4.27272891998291, "learning_rate": 3.925006527625494e-05, "loss": 0.2503, "num_input_tokens_seen": 49796192, "step": 23065 }, { "epoch": 3.763458401305057, "grad_norm": 0.09080624580383301, "learning_rate": 3.924421625928765e-05, "loss": 0.0178, "num_input_tokens_seen": 49807296, "step": 23070 }, { "epoch": 3.764274061990212, "grad_norm": 10.499856948852539, "learning_rate": 3.923836608761192e-05, "loss": 0.1899, "num_input_tokens_seen": 49818368, "step": 23075 }, { "epoch": 3.7650897226753672, "grad_norm": 0.10141909867525101, "learning_rate": 3.923251476170198e-05, "loss": 0.1773, "num_input_tokens_seen": 49830368, "step": 23080 }, { "epoch": 3.765905383360522, "grad_norm": 6.361057281494141, "learning_rate": 3.922666228203218e-05, "loss": 0.1997, "num_input_tokens_seen": 49840960, "step": 23085 }, { "epoch": 3.766721044045677, "grad_norm": 0.16668111085891724, "learning_rate": 3.9220808649076954e-05, "loss": 0.3243, "num_input_tokens_seen": 49851776, "step": 23090 }, { "epoch": 3.767536704730832, "grad_norm": 0.5158731937408447, "learning_rate": 3.921495386331082e-05, "loss": 0.209, "num_input_tokens_seen": 49862336, "step": 23095 }, { "epoch": 3.768352365415987, "grad_norm": 8.285778045654297, "learning_rate": 3.9209097925208405e-05, "loss": 0.2586, "num_input_tokens_seen": 49874208, "step": 23100 }, { "epoch": 3.7691680261011418, "grad_norm": 0.31928977370262146, "learning_rate": 3.920324083524441e-05, "loss": 0.0141, "num_input_tokens_seen": 49885440, "step": 23105 }, { "epoch": 3.769983686786297, "grad_norm": 1.4121078252792358, "learning_rate": 3.919738259389365e-05, "loss": 0.0671, "num_input_tokens_seen": 49896832, "step": 23110 }, { "epoch": 3.770799347471452, "grad_norm": 0.08855217695236206, "learning_rate": 3.919152320163101e-05, "loss": 0.1848, "num_input_tokens_seen": 49907008, "step": 23115 }, { "epoch": 3.7716150081566067, "grad_norm": 0.10175371170043945, "learning_rate": 3.91856626589315e-05, "loss": 0.0132, "num_input_tokens_seen": 49917600, "step": 23120 }, { "epoch": 3.772430668841762, "grad_norm": 5.034108638763428, "learning_rate": 3.91798009662702e-05, "loss": 0.0776, "num_input_tokens_seen": 49929280, "step": 23125 }, { "epoch": 3.7732463295269167, "grad_norm": 1.5025721788406372, "learning_rate": 3.917393812412229e-05, "loss": 0.0291, "num_input_tokens_seen": 49940384, "step": 23130 }, { "epoch": 3.774061990212072, "grad_norm": 0.04287495091557503, "learning_rate": 3.916807413296303e-05, "loss": 0.2155, "num_input_tokens_seen": 49949696, "step": 23135 }, { "epoch": 3.774877650897227, "grad_norm": 8.513350486755371, "learning_rate": 3.916220899326779e-05, "loss": 0.1193, "num_input_tokens_seen": 49959680, "step": 23140 }, { "epoch": 3.7756933115823816, "grad_norm": 2.701164722442627, "learning_rate": 3.915634270551204e-05, "loss": 0.2091, "num_input_tokens_seen": 49970592, "step": 23145 }, { "epoch": 3.7765089722675365, "grad_norm": 3.0898141860961914, "learning_rate": 3.915047527017132e-05, "loss": 0.1448, "num_input_tokens_seen": 49981440, "step": 23150 }, { "epoch": 3.7773246329526917, "grad_norm": 0.4086742103099823, "learning_rate": 3.914460668772127e-05, "loss": 0.1051, "num_input_tokens_seen": 49993024, "step": 23155 }, { "epoch": 3.7781402936378465, "grad_norm": 7.631880283355713, "learning_rate": 3.913873695863763e-05, "loss": 0.1822, "num_input_tokens_seen": 50004704, "step": 23160 }, { "epoch": 3.778955954323002, "grad_norm": 0.4384767413139343, "learning_rate": 3.913286608339625e-05, "loss": 0.1034, "num_input_tokens_seen": 50015424, "step": 23165 }, { "epoch": 3.7797716150081566, "grad_norm": 4.022773742675781, "learning_rate": 3.9126994062473013e-05, "loss": 0.0819, "num_input_tokens_seen": 50026368, "step": 23170 }, { "epoch": 3.7805872756933114, "grad_norm": 1.4691871404647827, "learning_rate": 3.912112089634397e-05, "loss": 0.137, "num_input_tokens_seen": 50036512, "step": 23175 }, { "epoch": 3.7814029363784667, "grad_norm": 0.08211508393287659, "learning_rate": 3.911524658548522e-05, "loss": 0.0938, "num_input_tokens_seen": 50047776, "step": 23180 }, { "epoch": 3.7822185970636215, "grad_norm": 12.340832710266113, "learning_rate": 3.9109371130372956e-05, "loss": 0.2162, "num_input_tokens_seen": 50058048, "step": 23185 }, { "epoch": 3.7830342577487768, "grad_norm": 0.12297343462705612, "learning_rate": 3.910349453148348e-05, "loss": 0.0299, "num_input_tokens_seen": 50068992, "step": 23190 }, { "epoch": 3.7838499184339316, "grad_norm": 0.13493342697620392, "learning_rate": 3.909761678929318e-05, "loss": 0.0647, "num_input_tokens_seen": 50079776, "step": 23195 }, { "epoch": 3.7846655791190864, "grad_norm": 0.07279930263757706, "learning_rate": 3.909173790427852e-05, "loss": 0.0746, "num_input_tokens_seen": 50090592, "step": 23200 }, { "epoch": 3.7854812398042412, "grad_norm": 0.10993704199790955, "learning_rate": 3.90858578769161e-05, "loss": 0.1579, "num_input_tokens_seen": 50102240, "step": 23205 }, { "epoch": 3.7862969004893965, "grad_norm": 2.9637222290039062, "learning_rate": 3.907997670768256e-05, "loss": 0.1213, "num_input_tokens_seen": 50113472, "step": 23210 }, { "epoch": 3.7871125611745513, "grad_norm": 0.1488426774740219, "learning_rate": 3.907409439705467e-05, "loss": 0.0084, "num_input_tokens_seen": 50124224, "step": 23215 }, { "epoch": 3.7879282218597066, "grad_norm": 0.2778640389442444, "learning_rate": 3.9068210945509276e-05, "loss": 0.0135, "num_input_tokens_seen": 50135264, "step": 23220 }, { "epoch": 3.7887438825448614, "grad_norm": 3.5414040088653564, "learning_rate": 3.906232635352333e-05, "loss": 0.2533, "num_input_tokens_seen": 50145824, "step": 23225 }, { "epoch": 3.789559543230016, "grad_norm": 0.025893602520227432, "learning_rate": 3.9056440621573855e-05, "loss": 0.164, "num_input_tokens_seen": 50156864, "step": 23230 }, { "epoch": 3.790375203915171, "grad_norm": 0.4216342568397522, "learning_rate": 3.9050553750137975e-05, "loss": 0.1048, "num_input_tokens_seen": 50167872, "step": 23235 }, { "epoch": 3.7911908646003263, "grad_norm": 0.3165377974510193, "learning_rate": 3.904466573969292e-05, "loss": 0.0694, "num_input_tokens_seen": 50178880, "step": 23240 }, { "epoch": 3.792006525285481, "grad_norm": 2.1399779319763184, "learning_rate": 3.9038776590716e-05, "loss": 0.3976, "num_input_tokens_seen": 50189280, "step": 23245 }, { "epoch": 3.7928221859706364, "grad_norm": 0.06540483236312866, "learning_rate": 3.903288630368461e-05, "loss": 0.0216, "num_input_tokens_seen": 50200736, "step": 23250 }, { "epoch": 3.793637846655791, "grad_norm": 0.05924204736948013, "learning_rate": 3.902699487907626e-05, "loss": 0.0134, "num_input_tokens_seen": 50210208, "step": 23255 }, { "epoch": 3.794453507340946, "grad_norm": 0.39854496717453003, "learning_rate": 3.902110231736853e-05, "loss": 0.0584, "num_input_tokens_seen": 50220960, "step": 23260 }, { "epoch": 3.7952691680261013, "grad_norm": 0.11542457342147827, "learning_rate": 3.901520861903911e-05, "loss": 0.1137, "num_input_tokens_seen": 50232384, "step": 23265 }, { "epoch": 3.796084828711256, "grad_norm": 4.767996311187744, "learning_rate": 3.900931378456576e-05, "loss": 0.0676, "num_input_tokens_seen": 50243328, "step": 23270 }, { "epoch": 3.7969004893964113, "grad_norm": 0.15116938948631287, "learning_rate": 3.9003417814426346e-05, "loss": 0.1516, "num_input_tokens_seen": 50254080, "step": 23275 }, { "epoch": 3.797716150081566, "grad_norm": 0.06791775673627853, "learning_rate": 3.8997520709098845e-05, "loss": 0.0222, "num_input_tokens_seen": 50264384, "step": 23280 }, { "epoch": 3.798531810766721, "grad_norm": 7.044241428375244, "learning_rate": 3.899162246906129e-05, "loss": 0.1518, "num_input_tokens_seen": 50275232, "step": 23285 }, { "epoch": 3.799347471451876, "grad_norm": 3.8409976959228516, "learning_rate": 3.8985723094791814e-05, "loss": 0.1798, "num_input_tokens_seen": 50286048, "step": 23290 }, { "epoch": 3.800163132137031, "grad_norm": 0.14364583790302277, "learning_rate": 3.897982258676867e-05, "loss": 0.0591, "num_input_tokens_seen": 50296192, "step": 23295 }, { "epoch": 3.800978792822186, "grad_norm": 8.493257522583008, "learning_rate": 3.8973920945470174e-05, "loss": 0.204, "num_input_tokens_seen": 50305888, "step": 23300 }, { "epoch": 3.801794453507341, "grad_norm": 2.7930526733398438, "learning_rate": 3.896801817137474e-05, "loss": 0.0949, "num_input_tokens_seen": 50316640, "step": 23305 }, { "epoch": 3.802610114192496, "grad_norm": 0.30696114897727966, "learning_rate": 3.8962114264960894e-05, "loss": 0.1659, "num_input_tokens_seen": 50328832, "step": 23310 }, { "epoch": 3.8034257748776508, "grad_norm": 0.2347618192434311, "learning_rate": 3.8956209226707206e-05, "loss": 0.255, "num_input_tokens_seen": 50339136, "step": 23315 }, { "epoch": 3.804241435562806, "grad_norm": 0.03590773046016693, "learning_rate": 3.8950303057092386e-05, "loss": 0.1674, "num_input_tokens_seen": 50350336, "step": 23320 }, { "epoch": 3.805057096247961, "grad_norm": 3.0185699462890625, "learning_rate": 3.8944395756595225e-05, "loss": 0.1713, "num_input_tokens_seen": 50361376, "step": 23325 }, { "epoch": 3.8058727569331157, "grad_norm": 0.06552945077419281, "learning_rate": 3.893848732569458e-05, "loss": 0.063, "num_input_tokens_seen": 50372064, "step": 23330 }, { "epoch": 3.806688417618271, "grad_norm": 0.7695654630661011, "learning_rate": 3.893257776486944e-05, "loss": 0.0524, "num_input_tokens_seen": 50382304, "step": 23335 }, { "epoch": 3.8075040783034257, "grad_norm": 6.420783519744873, "learning_rate": 3.8926667074598846e-05, "loss": 0.1637, "num_input_tokens_seen": 50392416, "step": 23340 }, { "epoch": 3.8083197389885806, "grad_norm": 3.166912317276001, "learning_rate": 3.892075525536196e-05, "loss": 0.2588, "num_input_tokens_seen": 50402816, "step": 23345 }, { "epoch": 3.809135399673736, "grad_norm": 0.7989125847816467, "learning_rate": 3.891484230763802e-05, "loss": 0.1568, "num_input_tokens_seen": 50412224, "step": 23350 }, { "epoch": 3.8099510603588906, "grad_norm": 3.0957610607147217, "learning_rate": 3.890892823190636e-05, "loss": 0.3021, "num_input_tokens_seen": 50423680, "step": 23355 }, { "epoch": 3.810766721044046, "grad_norm": 0.22804246842861176, "learning_rate": 3.890301302864641e-05, "loss": 0.0085, "num_input_tokens_seen": 50435072, "step": 23360 }, { "epoch": 3.8115823817292007, "grad_norm": 6.268031120300293, "learning_rate": 3.889709669833767e-05, "loss": 0.2217, "num_input_tokens_seen": 50446336, "step": 23365 }, { "epoch": 3.8123980424143555, "grad_norm": 3.2093803882598877, "learning_rate": 3.8891179241459766e-05, "loss": 0.103, "num_input_tokens_seen": 50456928, "step": 23370 }, { "epoch": 3.8132137030995104, "grad_norm": 0.15560902655124664, "learning_rate": 3.888526065849238e-05, "loss": 0.142, "num_input_tokens_seen": 50467488, "step": 23375 }, { "epoch": 3.8140293637846656, "grad_norm": 0.392856627702713, "learning_rate": 3.887934094991531e-05, "loss": 0.0582, "num_input_tokens_seen": 50479168, "step": 23380 }, { "epoch": 3.8148450244698204, "grad_norm": 0.37020111083984375, "learning_rate": 3.887342011620845e-05, "loss": 0.0112, "num_input_tokens_seen": 50490048, "step": 23385 }, { "epoch": 3.8156606851549757, "grad_norm": 4.762162685394287, "learning_rate": 3.886749815785176e-05, "loss": 0.1138, "num_input_tokens_seen": 50500192, "step": 23390 }, { "epoch": 3.8164763458401305, "grad_norm": 0.07374925166368484, "learning_rate": 3.8861575075325304e-05, "loss": 0.0963, "num_input_tokens_seen": 50510944, "step": 23395 }, { "epoch": 3.8172920065252853, "grad_norm": 0.11507974565029144, "learning_rate": 3.8855650869109246e-05, "loss": 0.1208, "num_input_tokens_seen": 50521344, "step": 23400 }, { "epoch": 3.8181076672104406, "grad_norm": 0.07705114781856537, "learning_rate": 3.884972553968382e-05, "loss": 0.0464, "num_input_tokens_seen": 50532064, "step": 23405 }, { "epoch": 3.8189233278955954, "grad_norm": 0.29260510206222534, "learning_rate": 3.884379908752936e-05, "loss": 0.0628, "num_input_tokens_seen": 50542816, "step": 23410 }, { "epoch": 3.8197389885807507, "grad_norm": 0.06639407575130463, "learning_rate": 3.883787151312632e-05, "loss": 0.0111, "num_input_tokens_seen": 50553088, "step": 23415 }, { "epoch": 3.8205546492659055, "grad_norm": 0.12404583394527435, "learning_rate": 3.88319428169552e-05, "loss": 0.0105, "num_input_tokens_seen": 50561792, "step": 23420 }, { "epoch": 3.8213703099510603, "grad_norm": 1.8111509084701538, "learning_rate": 3.882601299949661e-05, "loss": 0.1381, "num_input_tokens_seen": 50573280, "step": 23425 }, { "epoch": 3.822185970636215, "grad_norm": 0.09226392954587936, "learning_rate": 3.882008206123125e-05, "loss": 0.0091, "num_input_tokens_seen": 50584384, "step": 23430 }, { "epoch": 3.8230016313213704, "grad_norm": 0.04823372885584831, "learning_rate": 3.881415000263991e-05, "loss": 0.0361, "num_input_tokens_seen": 50595904, "step": 23435 }, { "epoch": 3.823817292006525, "grad_norm": 0.5151596665382385, "learning_rate": 3.8808216824203494e-05, "loss": 0.0067, "num_input_tokens_seen": 50608224, "step": 23440 }, { "epoch": 3.8246329526916805, "grad_norm": 1.2669740915298462, "learning_rate": 3.880228252640295e-05, "loss": 0.3168, "num_input_tokens_seen": 50618912, "step": 23445 }, { "epoch": 3.8254486133768353, "grad_norm": 0.1803109049797058, "learning_rate": 3.879634710971935e-05, "loss": 0.0697, "num_input_tokens_seen": 50629728, "step": 23450 }, { "epoch": 3.82626427406199, "grad_norm": 0.08579455316066742, "learning_rate": 3.8790410574633854e-05, "loss": 0.1001, "num_input_tokens_seen": 50640096, "step": 23455 }, { "epoch": 3.827079934747145, "grad_norm": 0.0672377198934555, "learning_rate": 3.8784472921627715e-05, "loss": 0.1406, "num_input_tokens_seen": 50650176, "step": 23460 }, { "epoch": 3.8278955954323, "grad_norm": 0.051177676767110825, "learning_rate": 3.877853415118224e-05, "loss": 0.1531, "num_input_tokens_seen": 50661504, "step": 23465 }, { "epoch": 3.828711256117455, "grad_norm": 0.13691945374011993, "learning_rate": 3.877259426377889e-05, "loss": 0.0122, "num_input_tokens_seen": 50672384, "step": 23470 }, { "epoch": 3.8295269168026103, "grad_norm": 0.0938628613948822, "learning_rate": 3.8766653259899165e-05, "loss": 0.1482, "num_input_tokens_seen": 50683168, "step": 23475 }, { "epoch": 3.830342577487765, "grad_norm": 4.719187259674072, "learning_rate": 3.8760711140024677e-05, "loss": 0.2146, "num_input_tokens_seen": 50692512, "step": 23480 }, { "epoch": 3.83115823817292, "grad_norm": 0.07632511854171753, "learning_rate": 3.875476790463712e-05, "loss": 0.1284, "num_input_tokens_seen": 50703168, "step": 23485 }, { "epoch": 3.831973898858075, "grad_norm": 0.7380771636962891, "learning_rate": 3.8748823554218286e-05, "loss": 0.199, "num_input_tokens_seen": 50714112, "step": 23490 }, { "epoch": 3.83278955954323, "grad_norm": 3.486445426940918, "learning_rate": 3.8742878089250043e-05, "loss": 0.1281, "num_input_tokens_seen": 50724576, "step": 23495 }, { "epoch": 3.8336052202283852, "grad_norm": 0.6635816693305969, "learning_rate": 3.8736931510214385e-05, "loss": 0.1293, "num_input_tokens_seen": 50735392, "step": 23500 }, { "epoch": 3.83442088091354, "grad_norm": 0.1672944873571396, "learning_rate": 3.873098381759336e-05, "loss": 0.0404, "num_input_tokens_seen": 50747488, "step": 23505 }, { "epoch": 3.835236541598695, "grad_norm": 0.22422181069850922, "learning_rate": 3.872503501186911e-05, "loss": 0.0297, "num_input_tokens_seen": 50757120, "step": 23510 }, { "epoch": 3.8360522022838497, "grad_norm": 0.06843329221010208, "learning_rate": 3.871908509352388e-05, "loss": 0.0706, "num_input_tokens_seen": 50767136, "step": 23515 }, { "epoch": 3.836867862969005, "grad_norm": 0.8522431254386902, "learning_rate": 3.871313406304001e-05, "loss": 0.0101, "num_input_tokens_seen": 50777728, "step": 23520 }, { "epoch": 3.8376835236541598, "grad_norm": 2.191159725189209, "learning_rate": 3.87071819208999e-05, "loss": 0.2006, "num_input_tokens_seen": 50787488, "step": 23525 }, { "epoch": 3.838499184339315, "grad_norm": 3.2646467685699463, "learning_rate": 3.870122866758609e-05, "loss": 0.1007, "num_input_tokens_seen": 50797760, "step": 23530 }, { "epoch": 3.83931484502447, "grad_norm": 0.24704453349113464, "learning_rate": 3.869527430358116e-05, "loss": 0.0304, "num_input_tokens_seen": 50808832, "step": 23535 }, { "epoch": 3.8401305057096247, "grad_norm": 0.10138709843158722, "learning_rate": 3.8689318829367796e-05, "loss": 0.124, "num_input_tokens_seen": 50819296, "step": 23540 }, { "epoch": 3.84094616639478, "grad_norm": 0.11551807075738907, "learning_rate": 3.86833622454288e-05, "loss": 0.0586, "num_input_tokens_seen": 50831424, "step": 23545 }, { "epoch": 3.8417618270799347, "grad_norm": 0.2826504409313202, "learning_rate": 3.8677404552247024e-05, "loss": 0.0514, "num_input_tokens_seen": 50841152, "step": 23550 }, { "epoch": 3.8425774877650896, "grad_norm": 6.57863187789917, "learning_rate": 3.8671445750305444e-05, "loss": 0.3215, "num_input_tokens_seen": 50850752, "step": 23555 }, { "epoch": 3.843393148450245, "grad_norm": 1.516378402709961, "learning_rate": 3.8665485840087104e-05, "loss": 0.0714, "num_input_tokens_seen": 50860896, "step": 23560 }, { "epoch": 3.8442088091353996, "grad_norm": 3.2162604331970215, "learning_rate": 3.865952482207513e-05, "loss": 0.1335, "num_input_tokens_seen": 50871072, "step": 23565 }, { "epoch": 3.8450244698205545, "grad_norm": 2.1718997955322266, "learning_rate": 3.865356269675278e-05, "loss": 0.106, "num_input_tokens_seen": 50880960, "step": 23570 }, { "epoch": 3.8458401305057097, "grad_norm": 4.0175933837890625, "learning_rate": 3.8647599464603355e-05, "loss": 0.0306, "num_input_tokens_seen": 50890080, "step": 23575 }, { "epoch": 3.8466557911908645, "grad_norm": 0.11540604382753372, "learning_rate": 3.864163512611028e-05, "loss": 0.006, "num_input_tokens_seen": 50901216, "step": 23580 }, { "epoch": 3.84747145187602, "grad_norm": 10.32332992553711, "learning_rate": 3.863566968175703e-05, "loss": 0.0681, "num_input_tokens_seen": 50912416, "step": 23585 }, { "epoch": 3.8482871125611746, "grad_norm": 0.05607185512781143, "learning_rate": 3.862970313202722e-05, "loss": 0.0092, "num_input_tokens_seen": 50922752, "step": 23590 }, { "epoch": 3.8491027732463294, "grad_norm": 0.08821630477905273, "learning_rate": 3.86237354774045e-05, "loss": 0.004, "num_input_tokens_seen": 50935168, "step": 23595 }, { "epoch": 3.8499184339314843, "grad_norm": 4.020759105682373, "learning_rate": 3.861776671837267e-05, "loss": 0.282, "num_input_tokens_seen": 50946304, "step": 23600 }, { "epoch": 3.8507340946166395, "grad_norm": 0.6966497302055359, "learning_rate": 3.861179685541557e-05, "loss": 0.0111, "num_input_tokens_seen": 50955872, "step": 23605 }, { "epoch": 3.8515497553017943, "grad_norm": 3.10355806350708, "learning_rate": 3.8605825889017156e-05, "loss": 0.1657, "num_input_tokens_seen": 50967488, "step": 23610 }, { "epoch": 3.8523654159869496, "grad_norm": 0.09562846273183823, "learning_rate": 3.859985381966146e-05, "loss": 0.0234, "num_input_tokens_seen": 50977664, "step": 23615 }, { "epoch": 3.8531810766721044, "grad_norm": 0.3513879179954529, "learning_rate": 3.8593880647832606e-05, "loss": 0.0194, "num_input_tokens_seen": 50989920, "step": 23620 }, { "epoch": 3.8539967373572592, "grad_norm": 6.543063163757324, "learning_rate": 3.858790637401482e-05, "loss": 0.1168, "num_input_tokens_seen": 50999136, "step": 23625 }, { "epoch": 3.8548123980424145, "grad_norm": 0.021561764180660248, "learning_rate": 3.858193099869239e-05, "loss": 0.2842, "num_input_tokens_seen": 51010336, "step": 23630 }, { "epoch": 3.8556280587275693, "grad_norm": 2.674123764038086, "learning_rate": 3.857595452234971e-05, "loss": 0.218, "num_input_tokens_seen": 51021792, "step": 23635 }, { "epoch": 3.8564437194127246, "grad_norm": 0.04573666304349899, "learning_rate": 3.856997694547129e-05, "loss": 0.1464, "num_input_tokens_seen": 51032416, "step": 23640 }, { "epoch": 3.8572593800978794, "grad_norm": 7.868891716003418, "learning_rate": 3.856399826854168e-05, "loss": 0.1463, "num_input_tokens_seen": 51042304, "step": 23645 }, { "epoch": 3.858075040783034, "grad_norm": 1.4317147731781006, "learning_rate": 3.855801849204555e-05, "loss": 0.084, "num_input_tokens_seen": 51053664, "step": 23650 }, { "epoch": 3.858890701468189, "grad_norm": 0.1854161024093628, "learning_rate": 3.855203761646764e-05, "loss": 0.009, "num_input_tokens_seen": 51064608, "step": 23655 }, { "epoch": 3.8597063621533443, "grad_norm": 0.04124079644680023, "learning_rate": 3.85460556422928e-05, "loss": 0.1217, "num_input_tokens_seen": 51074208, "step": 23660 }, { "epoch": 3.860522022838499, "grad_norm": 2.4907431602478027, "learning_rate": 3.854007257000596e-05, "loss": 0.0648, "num_input_tokens_seen": 51085600, "step": 23665 }, { "epoch": 3.8613376835236544, "grad_norm": 0.08115028589963913, "learning_rate": 3.853408840009214e-05, "loss": 0.0973, "num_input_tokens_seen": 51097344, "step": 23670 }, { "epoch": 3.862153344208809, "grad_norm": 0.11729727685451508, "learning_rate": 3.8528103133036434e-05, "loss": 0.0077, "num_input_tokens_seen": 51109184, "step": 23675 }, { "epoch": 3.862969004893964, "grad_norm": 0.07816064357757568, "learning_rate": 3.8522116769324056e-05, "loss": 0.1096, "num_input_tokens_seen": 51120800, "step": 23680 }, { "epoch": 3.863784665579119, "grad_norm": 0.13324949145317078, "learning_rate": 3.851612930944027e-05, "loss": 0.0708, "num_input_tokens_seen": 51131008, "step": 23685 }, { "epoch": 3.864600326264274, "grad_norm": 0.4708535969257355, "learning_rate": 3.851014075387048e-05, "loss": 0.2543, "num_input_tokens_seen": 51141600, "step": 23690 }, { "epoch": 3.865415986949429, "grad_norm": 2.499551296234131, "learning_rate": 3.850415110310012e-05, "loss": 0.0298, "num_input_tokens_seen": 51152128, "step": 23695 }, { "epoch": 3.866231647634584, "grad_norm": 0.04783983528614044, "learning_rate": 3.8498160357614756e-05, "loss": 0.0438, "num_input_tokens_seen": 51163840, "step": 23700 }, { "epoch": 3.867047308319739, "grad_norm": 0.27930396795272827, "learning_rate": 3.8492168517900016e-05, "loss": 0.086, "num_input_tokens_seen": 51173984, "step": 23705 }, { "epoch": 3.867862969004894, "grad_norm": 4.76519250869751, "learning_rate": 3.8486175584441643e-05, "loss": 0.1064, "num_input_tokens_seen": 51185120, "step": 23710 }, { "epoch": 3.868678629690049, "grad_norm": 0.048098061233758926, "learning_rate": 3.8480181557725455e-05, "loss": 0.0231, "num_input_tokens_seen": 51195968, "step": 23715 }, { "epoch": 3.869494290375204, "grad_norm": 0.9078822135925293, "learning_rate": 3.847418643823735e-05, "loss": 0.0254, "num_input_tokens_seen": 51206368, "step": 23720 }, { "epoch": 3.870309951060359, "grad_norm": 0.14563687145709991, "learning_rate": 3.8468190226463316e-05, "loss": 0.1075, "num_input_tokens_seen": 51216928, "step": 23725 }, { "epoch": 3.871125611745514, "grad_norm": 3.4391939640045166, "learning_rate": 3.846219292288945e-05, "loss": 0.3805, "num_input_tokens_seen": 51227936, "step": 23730 }, { "epoch": 3.8719412724306688, "grad_norm": 0.7178727984428406, "learning_rate": 3.845619452800192e-05, "loss": 0.217, "num_input_tokens_seen": 51238880, "step": 23735 }, { "epoch": 3.8727569331158236, "grad_norm": 0.705920934677124, "learning_rate": 3.845019504228699e-05, "loss": 0.1067, "num_input_tokens_seen": 51250560, "step": 23740 }, { "epoch": 3.873572593800979, "grad_norm": 0.06280519813299179, "learning_rate": 3.8444194466230994e-05, "loss": 0.0058, "num_input_tokens_seen": 51261440, "step": 23745 }, { "epoch": 3.8743882544861337, "grad_norm": 0.23043088614940643, "learning_rate": 3.843819280032038e-05, "loss": 0.178, "num_input_tokens_seen": 51272640, "step": 23750 }, { "epoch": 3.875203915171289, "grad_norm": 2.443315267562866, "learning_rate": 3.843219004504168e-05, "loss": 0.3003, "num_input_tokens_seen": 51284064, "step": 23755 }, { "epoch": 3.8760195758564437, "grad_norm": 2.8800432682037354, "learning_rate": 3.84261862008815e-05, "loss": 0.1018, "num_input_tokens_seen": 51294368, "step": 23760 }, { "epoch": 3.8768352365415986, "grad_norm": 0.11355220526456833, "learning_rate": 3.8420181268326536e-05, "loss": 0.0793, "num_input_tokens_seen": 51304704, "step": 23765 }, { "epoch": 3.877650897226754, "grad_norm": 2.2404026985168457, "learning_rate": 3.841417524786359e-05, "loss": 0.0141, "num_input_tokens_seen": 51316128, "step": 23770 }, { "epoch": 3.8784665579119086, "grad_norm": 0.03165043145418167, "learning_rate": 3.840816813997954e-05, "loss": 0.0677, "num_input_tokens_seen": 51327712, "step": 23775 }, { "epoch": 3.8792822185970635, "grad_norm": 2.7214980125427246, "learning_rate": 3.8402159945161346e-05, "loss": 0.2191, "num_input_tokens_seen": 51338624, "step": 23780 }, { "epoch": 3.8800978792822187, "grad_norm": 0.05646583065390587, "learning_rate": 3.839615066389607e-05, "loss": 0.0186, "num_input_tokens_seen": 51348928, "step": 23785 }, { "epoch": 3.8809135399673735, "grad_norm": 6.865532875061035, "learning_rate": 3.839014029667084e-05, "loss": 0.1212, "num_input_tokens_seen": 51359744, "step": 23790 }, { "epoch": 3.8817292006525284, "grad_norm": 0.0790897086262703, "learning_rate": 3.83841288439729e-05, "loss": 0.252, "num_input_tokens_seen": 51371616, "step": 23795 }, { "epoch": 3.8825448613376836, "grad_norm": 0.24048492312431335, "learning_rate": 3.837811630628957e-05, "loss": 0.1602, "num_input_tokens_seen": 51382752, "step": 23800 }, { "epoch": 3.8833605220228384, "grad_norm": 6.1536431312561035, "learning_rate": 3.837210268410824e-05, "loss": 0.2929, "num_input_tokens_seen": 51393440, "step": 23805 }, { "epoch": 3.8841761827079937, "grad_norm": 1.6496332883834839, "learning_rate": 3.836608797791642e-05, "loss": 0.0169, "num_input_tokens_seen": 51401792, "step": 23810 }, { "epoch": 3.8849918433931485, "grad_norm": 1.3711801767349243, "learning_rate": 3.8360072188201704e-05, "loss": 0.2867, "num_input_tokens_seen": 51413216, "step": 23815 }, { "epoch": 3.8858075040783033, "grad_norm": 0.17884577810764313, "learning_rate": 3.835405531545173e-05, "loss": 0.1844, "num_input_tokens_seen": 51423840, "step": 23820 }, { "epoch": 3.886623164763458, "grad_norm": 0.10995449870824814, "learning_rate": 3.834803736015428e-05, "loss": 0.0403, "num_input_tokens_seen": 51434464, "step": 23825 }, { "epoch": 3.8874388254486134, "grad_norm": 0.39918452501296997, "learning_rate": 3.8342018322797205e-05, "loss": 0.2348, "num_input_tokens_seen": 51446112, "step": 23830 }, { "epoch": 3.8882544861337682, "grad_norm": 2.797009229660034, "learning_rate": 3.833599820386842e-05, "loss": 0.2407, "num_input_tokens_seen": 51456704, "step": 23835 }, { "epoch": 3.8890701468189235, "grad_norm": 3.1366398334503174, "learning_rate": 3.8329977003855956e-05, "loss": 0.1807, "num_input_tokens_seen": 51467424, "step": 23840 }, { "epoch": 3.8898858075040783, "grad_norm": 1.1464749574661255, "learning_rate": 3.832395472324791e-05, "loss": 0.0183, "num_input_tokens_seen": 51477792, "step": 23845 }, { "epoch": 3.890701468189233, "grad_norm": 2.3124589920043945, "learning_rate": 3.83179313625325e-05, "loss": 0.2955, "num_input_tokens_seen": 51486368, "step": 23850 }, { "epoch": 3.8915171288743884, "grad_norm": 5.189943790435791, "learning_rate": 3.8311906922198005e-05, "loss": 0.1469, "num_input_tokens_seen": 51496224, "step": 23855 }, { "epoch": 3.892332789559543, "grad_norm": 0.08746843785047531, "learning_rate": 3.830588140273278e-05, "loss": 0.1092, "num_input_tokens_seen": 51507872, "step": 23860 }, { "epoch": 3.8931484502446985, "grad_norm": 2.0948245525360107, "learning_rate": 3.829985480462529e-05, "loss": 0.0898, "num_input_tokens_seen": 51518112, "step": 23865 }, { "epoch": 3.8939641109298533, "grad_norm": 3.290347099304199, "learning_rate": 3.82938271283641e-05, "loss": 0.1162, "num_input_tokens_seen": 51529120, "step": 23870 }, { "epoch": 3.894779771615008, "grad_norm": 0.5662157535552979, "learning_rate": 3.828779837443783e-05, "loss": 0.12, "num_input_tokens_seen": 51540768, "step": 23875 }, { "epoch": 3.895595432300163, "grad_norm": 0.3609636127948761, "learning_rate": 3.8281768543335195e-05, "loss": 0.1168, "num_input_tokens_seen": 51550304, "step": 23880 }, { "epoch": 3.896411092985318, "grad_norm": 0.196279376745224, "learning_rate": 3.827573763554502e-05, "loss": 0.0881, "num_input_tokens_seen": 51561088, "step": 23885 }, { "epoch": 3.897226753670473, "grad_norm": 3.1661527156829834, "learning_rate": 3.826970565155618e-05, "loss": 0.1379, "num_input_tokens_seen": 51571264, "step": 23890 }, { "epoch": 3.8980424143556283, "grad_norm": 0.12482129037380219, "learning_rate": 3.8263672591857666e-05, "loss": 0.0682, "num_input_tokens_seen": 51582912, "step": 23895 }, { "epoch": 3.898858075040783, "grad_norm": 0.6632061004638672, "learning_rate": 3.825763845693857e-05, "loss": 0.0494, "num_input_tokens_seen": 51594144, "step": 23900 }, { "epoch": 3.899673735725938, "grad_norm": 0.1388709396123886, "learning_rate": 3.825160324728802e-05, "loss": 0.0762, "num_input_tokens_seen": 51604928, "step": 23905 }, { "epoch": 3.9004893964110927, "grad_norm": 4.40236234664917, "learning_rate": 3.824556696339528e-05, "loss": 0.264, "num_input_tokens_seen": 51614176, "step": 23910 }, { "epoch": 3.901305057096248, "grad_norm": 2.515265703201294, "learning_rate": 3.823952960574967e-05, "loss": 0.0735, "num_input_tokens_seen": 51624992, "step": 23915 }, { "epoch": 3.902120717781403, "grad_norm": 0.3718299865722656, "learning_rate": 3.823349117484062e-05, "loss": 0.0106, "num_input_tokens_seen": 51635744, "step": 23920 }, { "epoch": 3.902936378466558, "grad_norm": 0.35502177476882935, "learning_rate": 3.822745167115762e-05, "loss": 0.0169, "num_input_tokens_seen": 51645472, "step": 23925 }, { "epoch": 3.903752039151713, "grad_norm": 5.897215366363525, "learning_rate": 3.822141109519027e-05, "loss": 0.1144, "num_input_tokens_seen": 51656128, "step": 23930 }, { "epoch": 3.9045676998368677, "grad_norm": 3.5866503715515137, "learning_rate": 3.821536944742827e-05, "loss": 0.2927, "num_input_tokens_seen": 51666368, "step": 23935 }, { "epoch": 3.905383360522023, "grad_norm": 2.2468960285186768, "learning_rate": 3.820932672836135e-05, "loss": 0.0721, "num_input_tokens_seen": 51677792, "step": 23940 }, { "epoch": 3.9061990212071778, "grad_norm": 0.5014287233352661, "learning_rate": 3.820328293847939e-05, "loss": 0.2138, "num_input_tokens_seen": 51689280, "step": 23945 }, { "epoch": 3.907014681892333, "grad_norm": 0.36148637533187866, "learning_rate": 3.819723807827232e-05, "loss": 0.0615, "num_input_tokens_seen": 51699712, "step": 23950 }, { "epoch": 3.907830342577488, "grad_norm": 3.4324469566345215, "learning_rate": 3.8191192148230176e-05, "loss": 0.2222, "num_input_tokens_seen": 51710624, "step": 23955 }, { "epoch": 3.9086460032626427, "grad_norm": 4.112708568572998, "learning_rate": 3.818514514884306e-05, "loss": 0.0628, "num_input_tokens_seen": 51721952, "step": 23960 }, { "epoch": 3.9094616639477975, "grad_norm": 0.1018175259232521, "learning_rate": 3.8179097080601175e-05, "loss": 0.2149, "num_input_tokens_seen": 51731584, "step": 23965 }, { "epoch": 3.9102773246329527, "grad_norm": 0.1846858710050583, "learning_rate": 3.817304794399481e-05, "loss": 0.228, "num_input_tokens_seen": 51743232, "step": 23970 }, { "epoch": 3.9110929853181076, "grad_norm": 0.16165630519390106, "learning_rate": 3.816699773951434e-05, "loss": 0.0865, "num_input_tokens_seen": 51755712, "step": 23975 }, { "epoch": 3.911908646003263, "grad_norm": 0.1982184648513794, "learning_rate": 3.8160946467650226e-05, "loss": 0.0162, "num_input_tokens_seen": 51767776, "step": 23980 }, { "epoch": 3.9127243066884176, "grad_norm": 0.9768170118331909, "learning_rate": 3.815489412889302e-05, "loss": 0.109, "num_input_tokens_seen": 51778176, "step": 23985 }, { "epoch": 3.9135399673735725, "grad_norm": 0.1167348325252533, "learning_rate": 3.8148840723733335e-05, "loss": 0.1562, "num_input_tokens_seen": 51789024, "step": 23990 }, { "epoch": 3.9143556280587277, "grad_norm": 0.16741915047168732, "learning_rate": 3.814278625266191e-05, "loss": 0.0853, "num_input_tokens_seen": 51798752, "step": 23995 }, { "epoch": 3.9151712887438825, "grad_norm": 10.41820240020752, "learning_rate": 3.8136730716169554e-05, "loss": 0.161, "num_input_tokens_seen": 51809856, "step": 24000 }, { "epoch": 3.9159869494290374, "grad_norm": 0.1835808902978897, "learning_rate": 3.8130674114747146e-05, "loss": 0.0321, "num_input_tokens_seen": 51819584, "step": 24005 }, { "epoch": 3.9168026101141926, "grad_norm": 2.11114764213562, "learning_rate": 3.812461644888566e-05, "loss": 0.1169, "num_input_tokens_seen": 51831168, "step": 24010 }, { "epoch": 3.9176182707993474, "grad_norm": 0.175631582736969, "learning_rate": 3.8118557719076186e-05, "loss": 0.0438, "num_input_tokens_seen": 51839776, "step": 24015 }, { "epoch": 3.9184339314845023, "grad_norm": 0.08247873932123184, "learning_rate": 3.811249792580985e-05, "loss": 0.1423, "num_input_tokens_seen": 51850368, "step": 24020 }, { "epoch": 3.9192495921696575, "grad_norm": 3.1666617393493652, "learning_rate": 3.810643706957791e-05, "loss": 0.1349, "num_input_tokens_seen": 51861248, "step": 24025 }, { "epoch": 3.9200652528548123, "grad_norm": 9.254212379455566, "learning_rate": 3.810037515087167e-05, "loss": 0.3344, "num_input_tokens_seen": 51872928, "step": 24030 }, { "epoch": 3.9208809135399676, "grad_norm": 9.931520462036133, "learning_rate": 3.809431217018255e-05, "loss": 0.2647, "num_input_tokens_seen": 51883680, "step": 24035 }, { "epoch": 3.9216965742251224, "grad_norm": 2.439455270767212, "learning_rate": 3.8088248128002044e-05, "loss": 0.3082, "num_input_tokens_seen": 51895808, "step": 24040 }, { "epoch": 3.9225122349102772, "grad_norm": 0.23569348454475403, "learning_rate": 3.808218302482175e-05, "loss": 0.0596, "num_input_tokens_seen": 51905760, "step": 24045 }, { "epoch": 3.923327895595432, "grad_norm": 3.6026687622070312, "learning_rate": 3.8076116861133305e-05, "loss": 0.2412, "num_input_tokens_seen": 51916288, "step": 24050 }, { "epoch": 3.9241435562805873, "grad_norm": 0.42549461126327515, "learning_rate": 3.8070049637428485e-05, "loss": 0.0244, "num_input_tokens_seen": 51927040, "step": 24055 }, { "epoch": 3.924959216965742, "grad_norm": 0.1177360936999321, "learning_rate": 3.806398135419913e-05, "loss": 0.0116, "num_input_tokens_seen": 51937760, "step": 24060 }, { "epoch": 3.9257748776508974, "grad_norm": 0.8087987899780273, "learning_rate": 3.805791201193716e-05, "loss": 0.0162, "num_input_tokens_seen": 51948384, "step": 24065 }, { "epoch": 3.926590538336052, "grad_norm": 2.6735105514526367, "learning_rate": 3.8051841611134576e-05, "loss": 0.1596, "num_input_tokens_seen": 51958496, "step": 24070 }, { "epoch": 3.927406199021207, "grad_norm": 0.08874467760324478, "learning_rate": 3.804577015228349e-05, "loss": 0.0228, "num_input_tokens_seen": 51969216, "step": 24075 }, { "epoch": 3.9282218597063623, "grad_norm": 0.10837216675281525, "learning_rate": 3.803969763587609e-05, "loss": 0.0239, "num_input_tokens_seen": 51980768, "step": 24080 }, { "epoch": 3.929037520391517, "grad_norm": 0.42723798751831055, "learning_rate": 3.803362406240463e-05, "loss": 0.0496, "num_input_tokens_seen": 51990976, "step": 24085 }, { "epoch": 3.9298531810766724, "grad_norm": 8.741414070129395, "learning_rate": 3.802754943236148e-05, "loss": 0.1212, "num_input_tokens_seen": 52002240, "step": 24090 }, { "epoch": 3.930668841761827, "grad_norm": 4.983425140380859, "learning_rate": 3.8021473746239064e-05, "loss": 0.0816, "num_input_tokens_seen": 52013760, "step": 24095 }, { "epoch": 3.931484502446982, "grad_norm": 0.33898645639419556, "learning_rate": 3.801539700452992e-05, "loss": 0.016, "num_input_tokens_seen": 52024960, "step": 24100 }, { "epoch": 3.932300163132137, "grad_norm": 0.09568259865045547, "learning_rate": 3.800931920772666e-05, "loss": 0.0903, "num_input_tokens_seen": 52035872, "step": 24105 }, { "epoch": 3.933115823817292, "grad_norm": 0.11450260132551193, "learning_rate": 3.8003240356321965e-05, "loss": 0.2034, "num_input_tokens_seen": 52047136, "step": 24110 }, { "epoch": 3.933931484502447, "grad_norm": 0.05758999288082123, "learning_rate": 3.7997160450808634e-05, "loss": 0.2704, "num_input_tokens_seen": 52059040, "step": 24115 }, { "epoch": 3.934747145187602, "grad_norm": 0.12375502288341522, "learning_rate": 3.7991079491679524e-05, "loss": 0.1142, "num_input_tokens_seen": 52070176, "step": 24120 }, { "epoch": 3.935562805872757, "grad_norm": 0.07845445722341537, "learning_rate": 3.79849974794276e-05, "loss": 0.0391, "num_input_tokens_seen": 52080576, "step": 24125 }, { "epoch": 3.936378466557912, "grad_norm": 0.668676495552063, "learning_rate": 3.7978914414545895e-05, "loss": 0.073, "num_input_tokens_seen": 52091744, "step": 24130 }, { "epoch": 3.9371941272430666, "grad_norm": 0.23654919862747192, "learning_rate": 3.797283029752753e-05, "loss": 0.2787, "num_input_tokens_seen": 52102432, "step": 24135 }, { "epoch": 3.938009787928222, "grad_norm": 0.12085125595331192, "learning_rate": 3.796674512886573e-05, "loss": 0.0253, "num_input_tokens_seen": 52111968, "step": 24140 }, { "epoch": 3.9388254486133767, "grad_norm": 0.13203787803649902, "learning_rate": 3.7960658909053766e-05, "loss": 0.1109, "num_input_tokens_seen": 52123872, "step": 24145 }, { "epoch": 3.939641109298532, "grad_norm": 0.09453985840082169, "learning_rate": 3.7954571638585035e-05, "loss": 0.0118, "num_input_tokens_seen": 52134400, "step": 24150 }, { "epoch": 3.9404567699836868, "grad_norm": 0.08839213848114014, "learning_rate": 3.7948483317952985e-05, "loss": 0.0097, "num_input_tokens_seen": 52144448, "step": 24155 }, { "epoch": 3.9412724306688416, "grad_norm": 7.535945892333984, "learning_rate": 3.794239394765119e-05, "loss": 0.1754, "num_input_tokens_seen": 52155840, "step": 24160 }, { "epoch": 3.942088091353997, "grad_norm": 0.07784853130578995, "learning_rate": 3.793630352817327e-05, "loss": 0.0804, "num_input_tokens_seen": 52167968, "step": 24165 }, { "epoch": 3.9429037520391517, "grad_norm": 0.5558604001998901, "learning_rate": 3.7930212060012946e-05, "loss": 0.011, "num_input_tokens_seen": 52177792, "step": 24170 }, { "epoch": 3.943719412724307, "grad_norm": 4.437076091766357, "learning_rate": 3.792411954366402e-05, "loss": 0.0865, "num_input_tokens_seen": 52188992, "step": 24175 }, { "epoch": 3.9445350734094617, "grad_norm": 6.7131757736206055, "learning_rate": 3.791802597962039e-05, "loss": 0.1714, "num_input_tokens_seen": 52200192, "step": 24180 }, { "epoch": 3.9453507340946166, "grad_norm": 2.135201930999756, "learning_rate": 3.791193136837603e-05, "loss": 0.1607, "num_input_tokens_seen": 52210944, "step": 24185 }, { "epoch": 3.9461663947797714, "grad_norm": 0.322039932012558, "learning_rate": 3.7905835710425e-05, "loss": 0.1606, "num_input_tokens_seen": 52220960, "step": 24190 }, { "epoch": 3.9469820554649266, "grad_norm": 3.2532620429992676, "learning_rate": 3.789973900626145e-05, "loss": 0.236, "num_input_tokens_seen": 52231232, "step": 24195 }, { "epoch": 3.9477977161500815, "grad_norm": 0.06413562595844269, "learning_rate": 3.78936412563796e-05, "loss": 0.1216, "num_input_tokens_seen": 52240032, "step": 24200 }, { "epoch": 3.9486133768352367, "grad_norm": 0.05348264053463936, "learning_rate": 3.788754246127375e-05, "loss": 0.0141, "num_input_tokens_seen": 52251968, "step": 24205 }, { "epoch": 3.9494290375203915, "grad_norm": 0.026875484734773636, "learning_rate": 3.7881442621438333e-05, "loss": 0.0168, "num_input_tokens_seen": 52263072, "step": 24210 }, { "epoch": 3.9502446982055464, "grad_norm": 4.340539932250977, "learning_rate": 3.787534173736782e-05, "loss": 0.2256, "num_input_tokens_seen": 52274336, "step": 24215 }, { "epoch": 3.9510603588907016, "grad_norm": 8.347455024719238, "learning_rate": 3.786923980955678e-05, "loss": 0.0662, "num_input_tokens_seen": 52284992, "step": 24220 }, { "epoch": 3.9518760195758564, "grad_norm": 3.045938014984131, "learning_rate": 3.7863136838499855e-05, "loss": 0.0868, "num_input_tokens_seen": 52296032, "step": 24225 }, { "epoch": 3.9526916802610113, "grad_norm": 2.9014275074005127, "learning_rate": 3.785703282469179e-05, "loss": 0.2974, "num_input_tokens_seen": 52306432, "step": 24230 }, { "epoch": 3.9535073409461665, "grad_norm": 0.120819590985775, "learning_rate": 3.785092776862741e-05, "loss": 0.0123, "num_input_tokens_seen": 52316640, "step": 24235 }, { "epoch": 3.9543230016313213, "grad_norm": 5.4397149085998535, "learning_rate": 3.784482167080162e-05, "loss": 0.1867, "num_input_tokens_seen": 52327104, "step": 24240 }, { "epoch": 3.955138662316476, "grad_norm": 0.06622958928346634, "learning_rate": 3.783871453170941e-05, "loss": 0.1037, "num_input_tokens_seen": 52338080, "step": 24245 }, { "epoch": 3.9559543230016314, "grad_norm": 0.13515892624855042, "learning_rate": 3.783260635184586e-05, "loss": 0.4521, "num_input_tokens_seen": 52348576, "step": 24250 }, { "epoch": 3.9567699836867862, "grad_norm": 0.10123121738433838, "learning_rate": 3.782649713170613e-05, "loss": 0.1931, "num_input_tokens_seen": 52358848, "step": 24255 }, { "epoch": 3.9575856443719415, "grad_norm": 0.1138068288564682, "learning_rate": 3.7820386871785455e-05, "loss": 0.0617, "num_input_tokens_seen": 52369248, "step": 24260 }, { "epoch": 3.9584013050570963, "grad_norm": 4.640443801879883, "learning_rate": 3.7814275572579175e-05, "loss": 0.1401, "num_input_tokens_seen": 52380640, "step": 24265 }, { "epoch": 3.959216965742251, "grad_norm": 0.8769100904464722, "learning_rate": 3.780816323458269e-05, "loss": 0.1245, "num_input_tokens_seen": 52391136, "step": 24270 }, { "epoch": 3.960032626427406, "grad_norm": 0.24302352964878082, "learning_rate": 3.7802049858291515e-05, "loss": 0.07, "num_input_tokens_seen": 52401824, "step": 24275 }, { "epoch": 3.960848287112561, "grad_norm": 2.1642651557922363, "learning_rate": 3.779593544420122e-05, "loss": 0.4402, "num_input_tokens_seen": 52412576, "step": 24280 }, { "epoch": 3.961663947797716, "grad_norm": 7.140532493591309, "learning_rate": 3.7789819992807474e-05, "loss": 0.1253, "num_input_tokens_seen": 52422720, "step": 24285 }, { "epoch": 3.9624796084828713, "grad_norm": 5.465451240539551, "learning_rate": 3.778370350460601e-05, "loss": 0.1529, "num_input_tokens_seen": 52432800, "step": 24290 }, { "epoch": 3.963295269168026, "grad_norm": 0.20366117358207703, "learning_rate": 3.777758598009269e-05, "loss": 0.0684, "num_input_tokens_seen": 52443680, "step": 24295 }, { "epoch": 3.964110929853181, "grad_norm": 0.09867950528860092, "learning_rate": 3.777146741976342e-05, "loss": 0.1906, "num_input_tokens_seen": 52454976, "step": 24300 }, { "epoch": 3.964926590538336, "grad_norm": 4.806294918060303, "learning_rate": 3.776534782411419e-05, "loss": 0.2447, "num_input_tokens_seen": 52465376, "step": 24305 }, { "epoch": 3.965742251223491, "grad_norm": 0.7245295643806458, "learning_rate": 3.77592271936411e-05, "loss": 0.1734, "num_input_tokens_seen": 52477408, "step": 24310 }, { "epoch": 3.9665579119086463, "grad_norm": 0.575904130935669, "learning_rate": 3.775310552884031e-05, "loss": 0.0808, "num_input_tokens_seen": 52489472, "step": 24315 }, { "epoch": 3.967373572593801, "grad_norm": 0.27454447746276855, "learning_rate": 3.7746982830208075e-05, "loss": 0.0187, "num_input_tokens_seen": 52499648, "step": 24320 }, { "epoch": 3.968189233278956, "grad_norm": 0.15682311356067657, "learning_rate": 3.774085909824074e-05, "loss": 0.2283, "num_input_tokens_seen": 52508096, "step": 24325 }, { "epoch": 3.9690048939641107, "grad_norm": 0.3136940002441406, "learning_rate": 3.7734734333434726e-05, "loss": 0.0263, "num_input_tokens_seen": 52520224, "step": 24330 }, { "epoch": 3.969820554649266, "grad_norm": 0.11171292513608932, "learning_rate": 3.772860853628652e-05, "loss": 0.0889, "num_input_tokens_seen": 52531520, "step": 24335 }, { "epoch": 3.970636215334421, "grad_norm": 0.12412583827972412, "learning_rate": 3.772248170729272e-05, "loss": 0.2002, "num_input_tokens_seen": 52543040, "step": 24340 }, { "epoch": 3.971451876019576, "grad_norm": 2.202580451965332, "learning_rate": 3.771635384695001e-05, "loss": 0.4208, "num_input_tokens_seen": 52553760, "step": 24345 }, { "epoch": 3.972267536704731, "grad_norm": 0.40388861298561096, "learning_rate": 3.771022495575513e-05, "loss": 0.0678, "num_input_tokens_seen": 52564416, "step": 24350 }, { "epoch": 3.9730831973898857, "grad_norm": 3.802887201309204, "learning_rate": 3.770409503420492e-05, "loss": 0.1629, "num_input_tokens_seen": 52574880, "step": 24355 }, { "epoch": 3.9738988580750405, "grad_norm": 0.3135204017162323, "learning_rate": 3.769796408279631e-05, "loss": 0.2398, "num_input_tokens_seen": 52586464, "step": 24360 }, { "epoch": 3.9747145187601958, "grad_norm": 0.09923018515110016, "learning_rate": 3.76918321020263e-05, "loss": 0.0126, "num_input_tokens_seen": 52596608, "step": 24365 }, { "epoch": 3.9755301794453506, "grad_norm": 0.8163269758224487, "learning_rate": 3.768569909239199e-05, "loss": 0.093, "num_input_tokens_seen": 52607680, "step": 24370 }, { "epoch": 3.976345840130506, "grad_norm": 3.5307023525238037, "learning_rate": 3.767956505439054e-05, "loss": 0.2163, "num_input_tokens_seen": 52619648, "step": 24375 }, { "epoch": 3.9771615008156607, "grad_norm": 0.07754193991422653, "learning_rate": 3.767342998851921e-05, "loss": 0.0958, "num_input_tokens_seen": 52630528, "step": 24380 }, { "epoch": 3.9779771615008155, "grad_norm": 0.06371898204088211, "learning_rate": 3.766729389527535e-05, "loss": 0.0081, "num_input_tokens_seen": 52640608, "step": 24385 }, { "epoch": 3.9787928221859707, "grad_norm": 2.396165370941162, "learning_rate": 3.766115677515637e-05, "loss": 0.1193, "num_input_tokens_seen": 52651872, "step": 24390 }, { "epoch": 3.9796084828711256, "grad_norm": 0.4515656530857086, "learning_rate": 3.765501862865976e-05, "loss": 0.0623, "num_input_tokens_seen": 52663904, "step": 24395 }, { "epoch": 3.980424143556281, "grad_norm": 4.028966426849365, "learning_rate": 3.764887945628315e-05, "loss": 0.1423, "num_input_tokens_seen": 52674848, "step": 24400 }, { "epoch": 3.9812398042414356, "grad_norm": 3.8315951824188232, "learning_rate": 3.76427392585242e-05, "loss": 0.3372, "num_input_tokens_seen": 52686656, "step": 24405 }, { "epoch": 3.9820554649265905, "grad_norm": 0.09870853275060654, "learning_rate": 3.7636598035880633e-05, "loss": 0.0296, "num_input_tokens_seen": 52696544, "step": 24410 }, { "epoch": 3.9828711256117453, "grad_norm": 0.04006926342844963, "learning_rate": 3.763045578885033e-05, "loss": 0.0853, "num_input_tokens_seen": 52706912, "step": 24415 }, { "epoch": 3.9836867862969005, "grad_norm": 0.08128601312637329, "learning_rate": 3.762431251793118e-05, "loss": 0.1298, "num_input_tokens_seen": 52718016, "step": 24420 }, { "epoch": 3.9845024469820554, "grad_norm": 4.476246356964111, "learning_rate": 3.7618168223621215e-05, "loss": 0.4976, "num_input_tokens_seen": 52728288, "step": 24425 }, { "epoch": 3.9853181076672106, "grad_norm": 0.20408247411251068, "learning_rate": 3.761202290641851e-05, "loss": 0.0464, "num_input_tokens_seen": 52739104, "step": 24430 }, { "epoch": 3.9861337683523654, "grad_norm": 6.188340187072754, "learning_rate": 3.760587656682122e-05, "loss": 0.0654, "num_input_tokens_seen": 52749536, "step": 24435 }, { "epoch": 3.9869494290375203, "grad_norm": 3.187253952026367, "learning_rate": 3.759972920532762e-05, "loss": 0.2218, "num_input_tokens_seen": 52760416, "step": 24440 }, { "epoch": 3.9877650897226755, "grad_norm": 0.1321265697479248, "learning_rate": 3.759358082243604e-05, "loss": 0.1113, "num_input_tokens_seen": 52772160, "step": 24445 }, { "epoch": 3.9885807504078303, "grad_norm": 4.083325386047363, "learning_rate": 3.7587431418644906e-05, "loss": 0.0218, "num_input_tokens_seen": 52781984, "step": 24450 }, { "epoch": 3.9893964110929856, "grad_norm": 5.500246524810791, "learning_rate": 3.758128099445271e-05, "loss": 0.1482, "num_input_tokens_seen": 52791776, "step": 24455 }, { "epoch": 3.9902120717781404, "grad_norm": 0.1290796846151352, "learning_rate": 3.757512955035804e-05, "loss": 0.0392, "num_input_tokens_seen": 52802528, "step": 24460 }, { "epoch": 3.9910277324632952, "grad_norm": 1.1345018148422241, "learning_rate": 3.7568977086859566e-05, "loss": 0.0764, "num_input_tokens_seen": 52812640, "step": 24465 }, { "epoch": 3.99184339314845, "grad_norm": 0.26198068261146545, "learning_rate": 3.7562823604456035e-05, "loss": 0.1294, "num_input_tokens_seen": 52823744, "step": 24470 }, { "epoch": 3.9926590538336053, "grad_norm": 3.3135857582092285, "learning_rate": 3.7556669103646266e-05, "loss": 0.0971, "num_input_tokens_seen": 52834016, "step": 24475 }, { "epoch": 3.99347471451876, "grad_norm": 2.892432451248169, "learning_rate": 3.75505135849292e-05, "loss": 0.2205, "num_input_tokens_seen": 52845248, "step": 24480 }, { "epoch": 3.9942903752039154, "grad_norm": 0.3946531414985657, "learning_rate": 3.7544357048803824e-05, "loss": 0.0121, "num_input_tokens_seen": 52855104, "step": 24485 }, { "epoch": 3.99510603588907, "grad_norm": 0.11953412741422653, "learning_rate": 3.7538199495769214e-05, "loss": 0.0097, "num_input_tokens_seen": 52866048, "step": 24490 }, { "epoch": 3.995921696574225, "grad_norm": 0.09131952375173569, "learning_rate": 3.753204092632454e-05, "loss": 0.0429, "num_input_tokens_seen": 52876768, "step": 24495 }, { "epoch": 3.99673735725938, "grad_norm": 0.11290084570646286, "learning_rate": 3.752588134096903e-05, "loss": 0.1046, "num_input_tokens_seen": 52887872, "step": 24500 }, { "epoch": 3.997553017944535, "grad_norm": 7.183202266693115, "learning_rate": 3.751972074020202e-05, "loss": 0.1252, "num_input_tokens_seen": 52898624, "step": 24505 }, { "epoch": 3.99836867862969, "grad_norm": 0.767217755317688, "learning_rate": 3.751355912452294e-05, "loss": 0.1128, "num_input_tokens_seen": 52908768, "step": 24510 }, { "epoch": 3.999184339314845, "grad_norm": 0.08274319022893906, "learning_rate": 3.7507396494431246e-05, "loss": 0.038, "num_input_tokens_seen": 52920064, "step": 24515 }, { "epoch": 4.0, "grad_norm": 0.06304477155208588, "learning_rate": 3.750123285042654e-05, "loss": 0.006, "num_input_tokens_seen": 52929744, "step": 24520 }, { "epoch": 4.0, "eval_loss": 0.15006808936595917, "eval_runtime": 132.9283, "eval_samples_per_second": 20.5, "eval_steps_per_second": 5.131, "num_input_tokens_seen": 52929744, "step": 24520 }, { "epoch": 4.000815660685155, "grad_norm": 6.7583513259887695, "learning_rate": 3.749506819300846e-05, "loss": 0.1874, "num_input_tokens_seen": 52941840, "step": 24525 }, { "epoch": 4.00163132137031, "grad_norm": 0.07735206186771393, "learning_rate": 3.748890252267676e-05, "loss": 0.0937, "num_input_tokens_seen": 52952560, "step": 24530 }, { "epoch": 4.002446982055465, "grad_norm": 0.1584097295999527, "learning_rate": 3.748273583993126e-05, "loss": 0.1299, "num_input_tokens_seen": 52961968, "step": 24535 }, { "epoch": 4.00326264274062, "grad_norm": 8.218377113342285, "learning_rate": 3.747656814527185e-05, "loss": 0.1639, "num_input_tokens_seen": 52971408, "step": 24540 }, { "epoch": 4.004078303425775, "grad_norm": 0.07929620146751404, "learning_rate": 3.747039943919852e-05, "loss": 0.1938, "num_input_tokens_seen": 52982832, "step": 24545 }, { "epoch": 4.00489396411093, "grad_norm": 1.899898648262024, "learning_rate": 3.746422972221134e-05, "loss": 0.1135, "num_input_tokens_seen": 52993648, "step": 24550 }, { "epoch": 4.005709624796085, "grad_norm": 0.15819135308265686, "learning_rate": 3.745805899481045e-05, "loss": 0.1322, "num_input_tokens_seen": 53004272, "step": 24555 }, { "epoch": 4.006525285481239, "grad_norm": 4.0420684814453125, "learning_rate": 3.745188725749609e-05, "loss": 0.2079, "num_input_tokens_seen": 53013872, "step": 24560 }, { "epoch": 4.007340946166395, "grad_norm": 0.08327308297157288, "learning_rate": 3.744571451076856e-05, "loss": 0.0109, "num_input_tokens_seen": 53024912, "step": 24565 }, { "epoch": 4.00815660685155, "grad_norm": 6.343252658843994, "learning_rate": 3.7439540755128276e-05, "loss": 0.2233, "num_input_tokens_seen": 53037360, "step": 24570 }, { "epoch": 4.008972267536705, "grad_norm": 4.405014991760254, "learning_rate": 3.7433365991075695e-05, "loss": 0.1093, "num_input_tokens_seen": 53048688, "step": 24575 }, { "epoch": 4.00978792822186, "grad_norm": 0.39339569211006165, "learning_rate": 3.742719021911138e-05, "loss": 0.0156, "num_input_tokens_seen": 53059568, "step": 24580 }, { "epoch": 4.010603588907014, "grad_norm": 0.8301985859870911, "learning_rate": 3.742101343973598e-05, "loss": 0.0451, "num_input_tokens_seen": 53071312, "step": 24585 }, { "epoch": 4.011419249592169, "grad_norm": 3.7286832332611084, "learning_rate": 3.741483565345019e-05, "loss": 0.1269, "num_input_tokens_seen": 53082544, "step": 24590 }, { "epoch": 4.012234910277325, "grad_norm": 0.08277827501296997, "learning_rate": 3.740865686075484e-05, "loss": 0.0107, "num_input_tokens_seen": 53093168, "step": 24595 }, { "epoch": 4.01305057096248, "grad_norm": 0.09586529433727264, "learning_rate": 3.7402477062150795e-05, "loss": 0.1632, "num_input_tokens_seen": 53104688, "step": 24600 }, { "epoch": 4.013866231647635, "grad_norm": 0.16563250124454498, "learning_rate": 3.739629625813904e-05, "loss": 0.1802, "num_input_tokens_seen": 53116528, "step": 24605 }, { "epoch": 4.014681892332789, "grad_norm": 0.373230904340744, "learning_rate": 3.739011444922061e-05, "loss": 0.0248, "num_input_tokens_seen": 53128016, "step": 24610 }, { "epoch": 4.015497553017944, "grad_norm": 0.1066754013299942, "learning_rate": 3.7383931635896634e-05, "loss": 0.059, "num_input_tokens_seen": 53139120, "step": 24615 }, { "epoch": 4.0163132137031, "grad_norm": 0.12022466212511063, "learning_rate": 3.737774781866833e-05, "loss": 0.0085, "num_input_tokens_seen": 53149392, "step": 24620 }, { "epoch": 4.017128874388255, "grad_norm": 5.02327823638916, "learning_rate": 3.737156299803698e-05, "loss": 0.0226, "num_input_tokens_seen": 53160304, "step": 24625 }, { "epoch": 4.0179445350734095, "grad_norm": 2.427842140197754, "learning_rate": 3.7365377174503956e-05, "loss": 0.0662, "num_input_tokens_seen": 53171248, "step": 24630 }, { "epoch": 4.018760195758564, "grad_norm": 0.3013766407966614, "learning_rate": 3.7359190348570726e-05, "loss": 0.2066, "num_input_tokens_seen": 53183312, "step": 24635 }, { "epoch": 4.019575856443719, "grad_norm": 0.05398857966065407, "learning_rate": 3.735300252073881e-05, "loss": 0.1387, "num_input_tokens_seen": 53194192, "step": 24640 }, { "epoch": 4.020391517128874, "grad_norm": 0.1032828837633133, "learning_rate": 3.734681369150983e-05, "loss": 0.2984, "num_input_tokens_seen": 53205296, "step": 24645 }, { "epoch": 4.02120717781403, "grad_norm": 0.061879999935626984, "learning_rate": 3.7340623861385496e-05, "loss": 0.0551, "num_input_tokens_seen": 53215632, "step": 24650 }, { "epoch": 4.0220228384991845, "grad_norm": 0.23465439677238464, "learning_rate": 3.7334433030867564e-05, "loss": 0.1207, "num_input_tokens_seen": 53226704, "step": 24655 }, { "epoch": 4.022838499184339, "grad_norm": 0.050487224012613297, "learning_rate": 3.732824120045791e-05, "loss": 0.0071, "num_input_tokens_seen": 53237328, "step": 24660 }, { "epoch": 4.023654159869494, "grad_norm": 0.14931054413318634, "learning_rate": 3.732204837065847e-05, "loss": 0.0258, "num_input_tokens_seen": 53248912, "step": 24665 }, { "epoch": 4.024469820554649, "grad_norm": 0.04330739751458168, "learning_rate": 3.731585454197127e-05, "loss": 0.0847, "num_input_tokens_seen": 53260528, "step": 24670 }, { "epoch": 4.025285481239805, "grad_norm": 0.5237597823143005, "learning_rate": 3.7309659714898404e-05, "loss": 0.0995, "num_input_tokens_seen": 53272016, "step": 24675 }, { "epoch": 4.0261011419249595, "grad_norm": 0.34010252356529236, "learning_rate": 3.730346388994207e-05, "loss": 0.05, "num_input_tokens_seen": 53282640, "step": 24680 }, { "epoch": 4.026916802610114, "grad_norm": 0.019590429961681366, "learning_rate": 3.729726706760452e-05, "loss": 0.0682, "num_input_tokens_seen": 53293904, "step": 24685 }, { "epoch": 4.027732463295269, "grad_norm": 0.05539269000291824, "learning_rate": 3.729106924838812e-05, "loss": 0.0366, "num_input_tokens_seen": 53305904, "step": 24690 }, { "epoch": 4.028548123980424, "grad_norm": 0.0381401851773262, "learning_rate": 3.728487043279527e-05, "loss": 0.0064, "num_input_tokens_seen": 53316528, "step": 24695 }, { "epoch": 4.029363784665579, "grad_norm": 1.7421964406967163, "learning_rate": 3.727867062132849e-05, "loss": 0.1583, "num_input_tokens_seen": 53328464, "step": 24700 }, { "epoch": 4.0301794453507345, "grad_norm": 2.946322202682495, "learning_rate": 3.7272469814490376e-05, "loss": 0.1548, "num_input_tokens_seen": 53339440, "step": 24705 }, { "epoch": 4.030995106035889, "grad_norm": 0.048724982887506485, "learning_rate": 3.726626801278358e-05, "loss": 0.1217, "num_input_tokens_seen": 53350000, "step": 24710 }, { "epoch": 4.031810766721044, "grad_norm": 0.027808157727122307, "learning_rate": 3.726006521671086e-05, "loss": 0.0087, "num_input_tokens_seen": 53360528, "step": 24715 }, { "epoch": 4.032626427406199, "grad_norm": 0.04992597550153732, "learning_rate": 3.7253861426775056e-05, "loss": 0.071, "num_input_tokens_seen": 53371600, "step": 24720 }, { "epoch": 4.033442088091354, "grad_norm": 0.43019813299179077, "learning_rate": 3.7247656643479064e-05, "loss": 0.1201, "num_input_tokens_seen": 53382032, "step": 24725 }, { "epoch": 4.034257748776509, "grad_norm": 2.9925100803375244, "learning_rate": 3.724145086732588e-05, "loss": 0.1676, "num_input_tokens_seen": 53392816, "step": 24730 }, { "epoch": 4.035073409461664, "grad_norm": 0.16607451438903809, "learning_rate": 3.7235244098818576e-05, "loss": 0.0634, "num_input_tokens_seen": 53404624, "step": 24735 }, { "epoch": 4.035889070146819, "grad_norm": 4.58715295791626, "learning_rate": 3.722903633846031e-05, "loss": 0.1857, "num_input_tokens_seen": 53416048, "step": 24740 }, { "epoch": 4.036704730831974, "grad_norm": 11.995363235473633, "learning_rate": 3.72228275867543e-05, "loss": 0.0628, "num_input_tokens_seen": 53426704, "step": 24745 }, { "epoch": 4.037520391517129, "grad_norm": 4.142264366149902, "learning_rate": 3.721661784420387e-05, "loss": 0.1447, "num_input_tokens_seen": 53439248, "step": 24750 }, { "epoch": 4.0383360522022835, "grad_norm": 0.30402427911758423, "learning_rate": 3.721040711131242e-05, "loss": 0.2321, "num_input_tokens_seen": 53448400, "step": 24755 }, { "epoch": 4.039151712887439, "grad_norm": 8.091341018676758, "learning_rate": 3.72041953885834e-05, "loss": 0.2699, "num_input_tokens_seen": 53460336, "step": 24760 }, { "epoch": 4.039967373572594, "grad_norm": 3.046910047531128, "learning_rate": 3.719798267652038e-05, "loss": 0.1623, "num_input_tokens_seen": 53471568, "step": 24765 }, { "epoch": 4.040783034257749, "grad_norm": 0.10052645951509476, "learning_rate": 3.719176897562701e-05, "loss": 0.1048, "num_input_tokens_seen": 53483568, "step": 24770 }, { "epoch": 4.041598694942904, "grad_norm": 0.43055182695388794, "learning_rate": 3.718555428640697e-05, "loss": 0.0993, "num_input_tokens_seen": 53492848, "step": 24775 }, { "epoch": 4.0424143556280585, "grad_norm": 3.058279514312744, "learning_rate": 3.717933860936407e-05, "loss": 0.2066, "num_input_tokens_seen": 53502992, "step": 24780 }, { "epoch": 4.043230016313213, "grad_norm": 0.631894588470459, "learning_rate": 3.7173121945002197e-05, "loss": 0.0916, "num_input_tokens_seen": 53513264, "step": 24785 }, { "epoch": 4.044045676998369, "grad_norm": 0.27123937010765076, "learning_rate": 3.716690429382529e-05, "loss": 0.0888, "num_input_tokens_seen": 53523984, "step": 24790 }, { "epoch": 4.044861337683524, "grad_norm": 0.26726406812667847, "learning_rate": 3.716068565633738e-05, "loss": 0.0563, "num_input_tokens_seen": 53534672, "step": 24795 }, { "epoch": 4.045676998368679, "grad_norm": 0.06608258187770844, "learning_rate": 3.715446603304259e-05, "loss": 0.0395, "num_input_tokens_seen": 53546224, "step": 24800 }, { "epoch": 4.0464926590538335, "grad_norm": 0.05818143114447594, "learning_rate": 3.7148245424445114e-05, "loss": 0.0042, "num_input_tokens_seen": 53555952, "step": 24805 }, { "epoch": 4.047308319738988, "grad_norm": 0.06563541293144226, "learning_rate": 3.7142023831049226e-05, "loss": 0.1048, "num_input_tokens_seen": 53567024, "step": 24810 }, { "epoch": 4.048123980424143, "grad_norm": 0.14977294206619263, "learning_rate": 3.713580125335928e-05, "loss": 0.0826, "num_input_tokens_seen": 53578032, "step": 24815 }, { "epoch": 4.048939641109299, "grad_norm": 3.3613672256469727, "learning_rate": 3.7129577691879694e-05, "loss": 0.1246, "num_input_tokens_seen": 53589968, "step": 24820 }, { "epoch": 4.049755301794454, "grad_norm": 0.1577356606721878, "learning_rate": 3.712335314711501e-05, "loss": 0.0128, "num_input_tokens_seen": 53599472, "step": 24825 }, { "epoch": 4.0505709624796085, "grad_norm": 0.0451761893928051, "learning_rate": 3.7117127619569796e-05, "loss": 0.1854, "num_input_tokens_seen": 53611376, "step": 24830 }, { "epoch": 4.051386623164763, "grad_norm": 0.38689252734184265, "learning_rate": 3.7110901109748745e-05, "loss": 0.008, "num_input_tokens_seen": 53623760, "step": 24835 }, { "epoch": 4.052202283849918, "grad_norm": 6.373911380767822, "learning_rate": 3.710467361815659e-05, "loss": 0.3157, "num_input_tokens_seen": 53634416, "step": 24840 }, { "epoch": 4.053017944535074, "grad_norm": 0.07288794964551926, "learning_rate": 3.709844514529818e-05, "loss": 0.0646, "num_input_tokens_seen": 53644528, "step": 24845 }, { "epoch": 4.053833605220229, "grad_norm": 0.16024848818778992, "learning_rate": 3.709221569167842e-05, "loss": 0.0677, "num_input_tokens_seen": 53654480, "step": 24850 }, { "epoch": 4.054649265905383, "grad_norm": 0.06283432990312576, "learning_rate": 3.70859852578023e-05, "loss": 0.2092, "num_input_tokens_seen": 53665136, "step": 24855 }, { "epoch": 4.055464926590538, "grad_norm": 0.16095586121082306, "learning_rate": 3.70797538441749e-05, "loss": 0.2295, "num_input_tokens_seen": 53677104, "step": 24860 }, { "epoch": 4.056280587275693, "grad_norm": 0.05175670608878136, "learning_rate": 3.707352145130135e-05, "loss": 0.0215, "num_input_tokens_seen": 53688016, "step": 24865 }, { "epoch": 4.057096247960848, "grad_norm": 0.2593509256839752, "learning_rate": 3.706728807968689e-05, "loss": 0.1117, "num_input_tokens_seen": 53698544, "step": 24870 }, { "epoch": 4.057911908646004, "grad_norm": 6.267168045043945, "learning_rate": 3.706105372983683e-05, "loss": 0.0268, "num_input_tokens_seen": 53710000, "step": 24875 }, { "epoch": 4.058727569331158, "grad_norm": 0.06121160462498665, "learning_rate": 3.705481840225656e-05, "loss": 0.0126, "num_input_tokens_seen": 53720656, "step": 24880 }, { "epoch": 4.059543230016313, "grad_norm": 0.07203152030706406, "learning_rate": 3.704858209745155e-05, "loss": 0.1027, "num_input_tokens_seen": 53731280, "step": 24885 }, { "epoch": 4.060358890701468, "grad_norm": 2.044027090072632, "learning_rate": 3.704234481592733e-05, "loss": 0.0952, "num_input_tokens_seen": 53741712, "step": 24890 }, { "epoch": 4.061174551386623, "grad_norm": 0.1900075078010559, "learning_rate": 3.703610655818955e-05, "loss": 0.0872, "num_input_tokens_seen": 53753360, "step": 24895 }, { "epoch": 4.061990212071779, "grad_norm": 0.17512746155261993, "learning_rate": 3.702986732474389e-05, "loss": 0.1335, "num_input_tokens_seen": 53764112, "step": 24900 }, { "epoch": 4.062805872756933, "grad_norm": 0.19973358511924744, "learning_rate": 3.702362711609615e-05, "loss": 0.025, "num_input_tokens_seen": 53774320, "step": 24905 }, { "epoch": 4.063621533442088, "grad_norm": 0.05211591720581055, "learning_rate": 3.701738593275219e-05, "loss": 0.1012, "num_input_tokens_seen": 53784848, "step": 24910 }, { "epoch": 4.064437194127243, "grad_norm": 0.0706319734454155, "learning_rate": 3.701114377521795e-05, "loss": 0.0069, "num_input_tokens_seen": 53795632, "step": 24915 }, { "epoch": 4.065252854812398, "grad_norm": 0.8674832582473755, "learning_rate": 3.700490064399945e-05, "loss": 0.0962, "num_input_tokens_seen": 53806480, "step": 24920 }, { "epoch": 4.066068515497553, "grad_norm": 0.5811168551445007, "learning_rate": 3.6998656539602795e-05, "loss": 0.0128, "num_input_tokens_seen": 53816816, "step": 24925 }, { "epoch": 4.066884176182708, "grad_norm": 0.07982023805379868, "learning_rate": 3.699241146253416e-05, "loss": 0.0476, "num_input_tokens_seen": 53828720, "step": 24930 }, { "epoch": 4.067699836867863, "grad_norm": 0.31422337889671326, "learning_rate": 3.69861654132998e-05, "loss": 0.0843, "num_input_tokens_seen": 53840816, "step": 24935 }, { "epoch": 4.068515497553018, "grad_norm": 0.3347615897655487, "learning_rate": 3.6979918392406055e-05, "loss": 0.1054, "num_input_tokens_seen": 53851120, "step": 24940 }, { "epoch": 4.069331158238173, "grad_norm": 0.2270788550376892, "learning_rate": 3.697367040035934e-05, "loss": 0.1236, "num_input_tokens_seen": 53862288, "step": 24945 }, { "epoch": 4.070146818923328, "grad_norm": 5.550386905670166, "learning_rate": 3.696742143766615e-05, "loss": 0.3034, "num_input_tokens_seen": 53873584, "step": 24950 }, { "epoch": 4.0709624796084825, "grad_norm": 0.4365846812725067, "learning_rate": 3.696117150483306e-05, "loss": 0.3094, "num_input_tokens_seen": 53885328, "step": 24955 }, { "epoch": 4.071778140293638, "grad_norm": 0.3106023669242859, "learning_rate": 3.695492060236671e-05, "loss": 0.005, "num_input_tokens_seen": 53895376, "step": 24960 }, { "epoch": 4.072593800978793, "grad_norm": 0.06049007922410965, "learning_rate": 3.694866873077384e-05, "loss": 0.0536, "num_input_tokens_seen": 53906640, "step": 24965 }, { "epoch": 4.073409461663948, "grad_norm": 3.8268356323242188, "learning_rate": 3.6942415890561254e-05, "loss": 0.2055, "num_input_tokens_seen": 53917264, "step": 24970 }, { "epoch": 4.074225122349103, "grad_norm": 0.09823447465896606, "learning_rate": 3.6936162082235844e-05, "loss": 0.1302, "num_input_tokens_seen": 53927984, "step": 24975 }, { "epoch": 4.075040783034257, "grad_norm": 0.029776480048894882, "learning_rate": 3.692990730630457e-05, "loss": 0.0094, "num_input_tokens_seen": 53938672, "step": 24980 }, { "epoch": 4.075856443719413, "grad_norm": 3.100522994995117, "learning_rate": 3.692365156327448e-05, "loss": 0.0714, "num_input_tokens_seen": 53951120, "step": 24985 }, { "epoch": 4.076672104404568, "grad_norm": 0.08029317110776901, "learning_rate": 3.691739485365269e-05, "loss": 0.0815, "num_input_tokens_seen": 53962192, "step": 24990 }, { "epoch": 4.077487765089723, "grad_norm": 1.666698694229126, "learning_rate": 3.691113717794641e-05, "loss": 0.1829, "num_input_tokens_seen": 53973488, "step": 24995 }, { "epoch": 4.078303425774878, "grad_norm": 0.1005997583270073, "learning_rate": 3.6904878536662904e-05, "loss": 0.1201, "num_input_tokens_seen": 53984272, "step": 25000 }, { "epoch": 4.079119086460032, "grad_norm": 3.5812594890594482, "learning_rate": 3.6898618930309556e-05, "loss": 0.1481, "num_input_tokens_seen": 53994704, "step": 25005 }, { "epoch": 4.079934747145187, "grad_norm": 0.07321606576442719, "learning_rate": 3.6892358359393767e-05, "loss": 0.098, "num_input_tokens_seen": 54005552, "step": 25010 }, { "epoch": 4.080750407830343, "grad_norm": 0.19606611132621765, "learning_rate": 3.688609682442308e-05, "loss": 0.0063, "num_input_tokens_seen": 54016560, "step": 25015 }, { "epoch": 4.081566068515498, "grad_norm": 2.725309133529663, "learning_rate": 3.687983432590507e-05, "loss": 0.0964, "num_input_tokens_seen": 54026736, "step": 25020 }, { "epoch": 4.082381729200653, "grad_norm": 5.751374244689941, "learning_rate": 3.6873570864347415e-05, "loss": 0.2511, "num_input_tokens_seen": 54036592, "step": 25025 }, { "epoch": 4.083197389885807, "grad_norm": 0.20788143575191498, "learning_rate": 3.686730644025786e-05, "loss": 0.3024, "num_input_tokens_seen": 54046864, "step": 25030 }, { "epoch": 4.084013050570962, "grad_norm": 5.681027889251709, "learning_rate": 3.686104105414423e-05, "loss": 0.0208, "num_input_tokens_seen": 54058448, "step": 25035 }, { "epoch": 4.084828711256117, "grad_norm": 8.688946723937988, "learning_rate": 3.6854774706514424e-05, "loss": 0.1365, "num_input_tokens_seen": 54068848, "step": 25040 }, { "epoch": 4.085644371941273, "grad_norm": 0.15534977614879608, "learning_rate": 3.684850739787644e-05, "loss": 0.1952, "num_input_tokens_seen": 54079504, "step": 25045 }, { "epoch": 4.0864600326264275, "grad_norm": 0.14457550644874573, "learning_rate": 3.684223912873832e-05, "loss": 0.0333, "num_input_tokens_seen": 54089424, "step": 25050 }, { "epoch": 4.087275693311582, "grad_norm": 0.07060840725898743, "learning_rate": 3.683596989960821e-05, "loss": 0.0221, "num_input_tokens_seen": 54100848, "step": 25055 }, { "epoch": 4.088091353996737, "grad_norm": 0.23421601951122284, "learning_rate": 3.682969971099433e-05, "loss": 0.0752, "num_input_tokens_seen": 54111376, "step": 25060 }, { "epoch": 4.088907014681892, "grad_norm": 0.18303559720516205, "learning_rate": 3.682342856340496e-05, "loss": 0.0084, "num_input_tokens_seen": 54123408, "step": 25065 }, { "epoch": 4.089722675367048, "grad_norm": 0.104855477809906, "learning_rate": 3.681715645734848e-05, "loss": 0.0814, "num_input_tokens_seen": 54133296, "step": 25070 }, { "epoch": 4.0905383360522025, "grad_norm": 0.08674453943967819, "learning_rate": 3.681088339333334e-05, "loss": 0.0476, "num_input_tokens_seen": 54144240, "step": 25075 }, { "epoch": 4.091353996737357, "grad_norm": 0.05705615505576134, "learning_rate": 3.680460937186807e-05, "loss": 0.089, "num_input_tokens_seen": 54157008, "step": 25080 }, { "epoch": 4.092169657422512, "grad_norm": 0.06719832122325897, "learning_rate": 3.679833439346126e-05, "loss": 0.0936, "num_input_tokens_seen": 54167792, "step": 25085 }, { "epoch": 4.092985318107667, "grad_norm": 0.2682736814022064, "learning_rate": 3.6792058458621607e-05, "loss": 0.1036, "num_input_tokens_seen": 54178672, "step": 25090 }, { "epoch": 4.093800978792822, "grad_norm": 0.1056482195854187, "learning_rate": 3.678578156785786e-05, "loss": 0.015, "num_input_tokens_seen": 54190288, "step": 25095 }, { "epoch": 4.0946166394779775, "grad_norm": 7.483541488647461, "learning_rate": 3.677950372167885e-05, "loss": 0.1145, "num_input_tokens_seen": 54199728, "step": 25100 }, { "epoch": 4.095432300163132, "grad_norm": 0.26063913106918335, "learning_rate": 3.677322492059352e-05, "loss": 0.0126, "num_input_tokens_seen": 54209840, "step": 25105 }, { "epoch": 4.096247960848287, "grad_norm": 0.05677470564842224, "learning_rate": 3.676694516511083e-05, "loss": 0.1141, "num_input_tokens_seen": 54219856, "step": 25110 }, { "epoch": 4.097063621533442, "grad_norm": 0.07580921798944473, "learning_rate": 3.676066445573986e-05, "loss": 0.0271, "num_input_tokens_seen": 54230096, "step": 25115 }, { "epoch": 4.097879282218597, "grad_norm": 0.11550819873809814, "learning_rate": 3.675438279298975e-05, "loss": 0.0074, "num_input_tokens_seen": 54241840, "step": 25120 }, { "epoch": 4.0986949429037525, "grad_norm": 4.404228687286377, "learning_rate": 3.674810017736974e-05, "loss": 0.037, "num_input_tokens_seen": 54251920, "step": 25125 }, { "epoch": 4.099510603588907, "grad_norm": 0.21475058794021606, "learning_rate": 3.674181660938911e-05, "loss": 0.1168, "num_input_tokens_seen": 54263344, "step": 25130 }, { "epoch": 4.100326264274062, "grad_norm": 4.548152923583984, "learning_rate": 3.6735532089557256e-05, "loss": 0.108, "num_input_tokens_seen": 54274384, "step": 25135 }, { "epoch": 4.101141924959217, "grad_norm": 0.06051735207438469, "learning_rate": 3.672924661838362e-05, "loss": 0.0645, "num_input_tokens_seen": 54285296, "step": 25140 }, { "epoch": 4.101957585644372, "grad_norm": 0.24951577186584473, "learning_rate": 3.672296019637774e-05, "loss": 0.1373, "num_input_tokens_seen": 54295504, "step": 25145 }, { "epoch": 4.102773246329527, "grad_norm": 4.169466972351074, "learning_rate": 3.6716672824049234e-05, "loss": 0.0927, "num_input_tokens_seen": 54306704, "step": 25150 }, { "epoch": 4.103588907014682, "grad_norm": 0.026811687275767326, "learning_rate": 3.671038450190777e-05, "loss": 0.1799, "num_input_tokens_seen": 54316720, "step": 25155 }, { "epoch": 4.104404567699837, "grad_norm": 4.2193284034729, "learning_rate": 3.670409523046312e-05, "loss": 0.196, "num_input_tokens_seen": 54327408, "step": 25160 }, { "epoch": 4.105220228384992, "grad_norm": 1.2365455627441406, "learning_rate": 3.669780501022513e-05, "loss": 0.0864, "num_input_tokens_seen": 54337104, "step": 25165 }, { "epoch": 4.106035889070147, "grad_norm": 0.19575998187065125, "learning_rate": 3.669151384170371e-05, "loss": 0.1946, "num_input_tokens_seen": 54347408, "step": 25170 }, { "epoch": 4.1068515497553015, "grad_norm": 2.7266340255737305, "learning_rate": 3.668522172540886e-05, "loss": 0.1258, "num_input_tokens_seen": 54357264, "step": 25175 }, { "epoch": 4.107667210440456, "grad_norm": 3.082364559173584, "learning_rate": 3.667892866185064e-05, "loss": 0.1118, "num_input_tokens_seen": 54368816, "step": 25180 }, { "epoch": 4.108482871125612, "grad_norm": 0.3306456208229065, "learning_rate": 3.6672634651539205e-05, "loss": 0.0559, "num_input_tokens_seen": 54380848, "step": 25185 }, { "epoch": 4.109298531810767, "grad_norm": 0.9651033878326416, "learning_rate": 3.6666339694984785e-05, "loss": 0.0198, "num_input_tokens_seen": 54391760, "step": 25190 }, { "epoch": 4.110114192495922, "grad_norm": 0.12155020982027054, "learning_rate": 3.666004379269766e-05, "loss": 0.2691, "num_input_tokens_seen": 54402384, "step": 25195 }, { "epoch": 4.1109298531810765, "grad_norm": 0.10422401875257492, "learning_rate": 3.665374694518824e-05, "loss": 0.0097, "num_input_tokens_seen": 54413680, "step": 25200 }, { "epoch": 4.111745513866231, "grad_norm": 2.983114242553711, "learning_rate": 3.664744915296695e-05, "loss": 0.0988, "num_input_tokens_seen": 54422640, "step": 25205 }, { "epoch": 4.112561174551387, "grad_norm": 3.5289039611816406, "learning_rate": 3.664115041654434e-05, "loss": 0.2287, "num_input_tokens_seen": 54434576, "step": 25210 }, { "epoch": 4.113376835236542, "grad_norm": 0.06660588830709457, "learning_rate": 3.663485073643102e-05, "loss": 0.0055, "num_input_tokens_seen": 54444272, "step": 25215 }, { "epoch": 4.114192495921697, "grad_norm": 1.9301166534423828, "learning_rate": 3.6628550113137635e-05, "loss": 0.0347, "num_input_tokens_seen": 54456016, "step": 25220 }, { "epoch": 4.1150081566068515, "grad_norm": 0.05434871092438698, "learning_rate": 3.6622248547175e-05, "loss": 0.0046, "num_input_tokens_seen": 54466448, "step": 25225 }, { "epoch": 4.115823817292006, "grad_norm": 1.4604432582855225, "learning_rate": 3.661594603905392e-05, "loss": 0.1336, "num_input_tokens_seen": 54476592, "step": 25230 }, { "epoch": 4.116639477977161, "grad_norm": 0.024946851655840874, "learning_rate": 3.660964258928532e-05, "loss": 0.1005, "num_input_tokens_seen": 54488048, "step": 25235 }, { "epoch": 4.117455138662317, "grad_norm": 0.168961301445961, "learning_rate": 3.660333819838018e-05, "loss": 0.0691, "num_input_tokens_seen": 54499120, "step": 25240 }, { "epoch": 4.118270799347472, "grad_norm": 0.37382256984710693, "learning_rate": 3.659703286684957e-05, "loss": 0.1189, "num_input_tokens_seen": 54510576, "step": 25245 }, { "epoch": 4.1190864600326265, "grad_norm": 0.09253478050231934, "learning_rate": 3.659072659520463e-05, "loss": 0.1668, "num_input_tokens_seen": 54522160, "step": 25250 }, { "epoch": 4.119902120717781, "grad_norm": 0.05152549222111702, "learning_rate": 3.658441938395659e-05, "loss": 0.0054, "num_input_tokens_seen": 54533872, "step": 25255 }, { "epoch": 4.120717781402936, "grad_norm": 2.148808002471924, "learning_rate": 3.6578111233616726e-05, "loss": 0.1147, "num_input_tokens_seen": 54545200, "step": 25260 }, { "epoch": 4.121533442088092, "grad_norm": 0.0854240134358406, "learning_rate": 3.657180214469643e-05, "loss": 0.1078, "num_input_tokens_seen": 54554960, "step": 25265 }, { "epoch": 4.122349102773247, "grad_norm": 0.11657576262950897, "learning_rate": 3.656549211770713e-05, "loss": 0.1865, "num_input_tokens_seen": 54565456, "step": 25270 }, { "epoch": 4.123164763458401, "grad_norm": 0.14645171165466309, "learning_rate": 3.655918115316036e-05, "loss": 0.0936, "num_input_tokens_seen": 54575536, "step": 25275 }, { "epoch": 4.123980424143556, "grad_norm": 0.08738540858030319, "learning_rate": 3.655286925156772e-05, "loss": 0.0146, "num_input_tokens_seen": 54585296, "step": 25280 }, { "epoch": 4.124796084828711, "grad_norm": 0.14348307251930237, "learning_rate": 3.654655641344087e-05, "loss": 0.181, "num_input_tokens_seen": 54595600, "step": 25285 }, { "epoch": 4.125611745513866, "grad_norm": 0.10797704011201859, "learning_rate": 3.654024263929157e-05, "loss": 0.0894, "num_input_tokens_seen": 54606160, "step": 25290 }, { "epoch": 4.126427406199022, "grad_norm": 0.2273145169019699, "learning_rate": 3.653392792963165e-05, "loss": 0.0373, "num_input_tokens_seen": 54617232, "step": 25295 }, { "epoch": 4.127243066884176, "grad_norm": 4.387463569641113, "learning_rate": 3.652761228497301e-05, "loss": 0.1154, "num_input_tokens_seen": 54628688, "step": 25300 }, { "epoch": 4.128058727569331, "grad_norm": 0.1348361074924469, "learning_rate": 3.652129570582763e-05, "loss": 0.1604, "num_input_tokens_seen": 54640016, "step": 25305 }, { "epoch": 4.128874388254486, "grad_norm": 0.07359375804662704, "learning_rate": 3.651497819270756e-05, "loss": 0.0725, "num_input_tokens_seen": 54648848, "step": 25310 }, { "epoch": 4.129690048939641, "grad_norm": 6.487445831298828, "learning_rate": 3.650865974612493e-05, "loss": 0.1315, "num_input_tokens_seen": 54659376, "step": 25315 }, { "epoch": 4.130505709624796, "grad_norm": 0.057771340012550354, "learning_rate": 3.650234036659195e-05, "loss": 0.0084, "num_input_tokens_seen": 54670928, "step": 25320 }, { "epoch": 4.131321370309951, "grad_norm": 0.08444222062826157, "learning_rate": 3.649602005462089e-05, "loss": 0.1, "num_input_tokens_seen": 54681584, "step": 25325 }, { "epoch": 4.132137030995106, "grad_norm": 0.08515111356973648, "learning_rate": 3.648969881072412e-05, "loss": 0.1271, "num_input_tokens_seen": 54692432, "step": 25330 }, { "epoch": 4.132952691680261, "grad_norm": 0.14698028564453125, "learning_rate": 3.648337663541407e-05, "loss": 0.1074, "num_input_tokens_seen": 54703152, "step": 25335 }, { "epoch": 4.133768352365416, "grad_norm": 0.1383892446756363, "learning_rate": 3.647705352920324e-05, "loss": 0.0131, "num_input_tokens_seen": 54714032, "step": 25340 }, { "epoch": 4.134584013050571, "grad_norm": 0.27039095759391785, "learning_rate": 3.647072949260422e-05, "loss": 0.0114, "num_input_tokens_seen": 54724272, "step": 25345 }, { "epoch": 4.135399673735726, "grad_norm": 0.4326794743537903, "learning_rate": 3.646440452612965e-05, "loss": 0.0085, "num_input_tokens_seen": 54734864, "step": 25350 }, { "epoch": 4.136215334420881, "grad_norm": 0.07697264105081558, "learning_rate": 3.645807863029229e-05, "loss": 0.2007, "num_input_tokens_seen": 54744464, "step": 25355 }, { "epoch": 4.137030995106036, "grad_norm": 7.296753406524658, "learning_rate": 3.645175180560495e-05, "loss": 0.2002, "num_input_tokens_seen": 54754672, "step": 25360 }, { "epoch": 4.137846655791191, "grad_norm": 0.060346946120262146, "learning_rate": 3.644542405258049e-05, "loss": 0.1618, "num_input_tokens_seen": 54766256, "step": 25365 }, { "epoch": 4.138662316476346, "grad_norm": 0.06213405355811119, "learning_rate": 3.643909537173188e-05, "loss": 0.0759, "num_input_tokens_seen": 54777296, "step": 25370 }, { "epoch": 4.1394779771615005, "grad_norm": 2.9957454204559326, "learning_rate": 3.643276576357216e-05, "loss": 0.2049, "num_input_tokens_seen": 54787728, "step": 25375 }, { "epoch": 4.140293637846656, "grad_norm": 0.13104800879955292, "learning_rate": 3.642643522861444e-05, "loss": 0.1201, "num_input_tokens_seen": 54798864, "step": 25380 }, { "epoch": 4.141109298531811, "grad_norm": 0.057072713971138, "learning_rate": 3.642010376737191e-05, "loss": 0.0628, "num_input_tokens_seen": 54809872, "step": 25385 }, { "epoch": 4.141924959216966, "grad_norm": 0.07199932634830475, "learning_rate": 3.641377138035782e-05, "loss": 0.1763, "num_input_tokens_seen": 54819248, "step": 25390 }, { "epoch": 4.142740619902121, "grad_norm": 0.149081289768219, "learning_rate": 3.640743806808551e-05, "loss": 0.0838, "num_input_tokens_seen": 54830448, "step": 25395 }, { "epoch": 4.143556280587275, "grad_norm": 0.13100646436214447, "learning_rate": 3.640110383106838e-05, "loss": 0.0525, "num_input_tokens_seen": 54841776, "step": 25400 }, { "epoch": 4.14437194127243, "grad_norm": 3.682749032974243, "learning_rate": 3.639476866981993e-05, "loss": 0.1583, "num_input_tokens_seen": 54851888, "step": 25405 }, { "epoch": 4.145187601957586, "grad_norm": 4.476661682128906, "learning_rate": 3.638843258485372e-05, "loss": 0.2497, "num_input_tokens_seen": 54861968, "step": 25410 }, { "epoch": 4.146003262642741, "grad_norm": 0.06056198850274086, "learning_rate": 3.638209557668337e-05, "loss": 0.0421, "num_input_tokens_seen": 54873104, "step": 25415 }, { "epoch": 4.146818923327896, "grad_norm": 0.10154382139444351, "learning_rate": 3.637575764582261e-05, "loss": 0.0065, "num_input_tokens_seen": 54883600, "step": 25420 }, { "epoch": 4.14763458401305, "grad_norm": 0.06345032900571823, "learning_rate": 3.636941879278522e-05, "loss": 0.007, "num_input_tokens_seen": 54895184, "step": 25425 }, { "epoch": 4.148450244698205, "grad_norm": 0.2177605926990509, "learning_rate": 3.636307901808504e-05, "loss": 0.0904, "num_input_tokens_seen": 54906640, "step": 25430 }, { "epoch": 4.149265905383361, "grad_norm": 0.16488832235336304, "learning_rate": 3.635673832223603e-05, "loss": 0.1137, "num_input_tokens_seen": 54918192, "step": 25435 }, { "epoch": 4.150081566068516, "grad_norm": 0.16648943722248077, "learning_rate": 3.635039670575218e-05, "loss": 0.1023, "num_input_tokens_seen": 54927536, "step": 25440 }, { "epoch": 4.150897226753671, "grad_norm": 0.2220931351184845, "learning_rate": 3.6344054169147584e-05, "loss": 0.1194, "num_input_tokens_seen": 54938224, "step": 25445 }, { "epoch": 4.151712887438825, "grad_norm": 7.944691181182861, "learning_rate": 3.63377107129364e-05, "loss": 0.0566, "num_input_tokens_seen": 54949104, "step": 25450 }, { "epoch": 4.15252854812398, "grad_norm": 0.07716577500104904, "learning_rate": 3.633136633763286e-05, "loss": 0.007, "num_input_tokens_seen": 54960848, "step": 25455 }, { "epoch": 4.153344208809135, "grad_norm": 5.695812702178955, "learning_rate": 3.632502104375127e-05, "loss": 0.1043, "num_input_tokens_seen": 54971600, "step": 25460 }, { "epoch": 4.154159869494291, "grad_norm": 0.19582857191562653, "learning_rate": 3.6318674831806e-05, "loss": 0.0072, "num_input_tokens_seen": 54981936, "step": 25465 }, { "epoch": 4.1549755301794455, "grad_norm": 3.1364433765411377, "learning_rate": 3.6312327702311536e-05, "loss": 0.0413, "num_input_tokens_seen": 54993552, "step": 25470 }, { "epoch": 4.1557911908646, "grad_norm": 0.0790497288107872, "learning_rate": 3.630597965578238e-05, "loss": 0.024, "num_input_tokens_seen": 55005200, "step": 25475 }, { "epoch": 4.156606851549755, "grad_norm": 0.11226534843444824, "learning_rate": 3.629963069273315e-05, "loss": 0.005, "num_input_tokens_seen": 55016464, "step": 25480 }, { "epoch": 4.15742251223491, "grad_norm": 0.11973495781421661, "learning_rate": 3.6293280813678523e-05, "loss": 0.1225, "num_input_tokens_seen": 55026576, "step": 25485 }, { "epoch": 4.158238172920065, "grad_norm": 0.024105289950966835, "learning_rate": 3.628693001913325e-05, "loss": 0.0865, "num_input_tokens_seen": 55037904, "step": 25490 }, { "epoch": 4.1590538336052205, "grad_norm": 0.09786190837621689, "learning_rate": 3.6280578309612165e-05, "loss": 0.0863, "num_input_tokens_seen": 55047408, "step": 25495 }, { "epoch": 4.159869494290375, "grad_norm": 6.50067138671875, "learning_rate": 3.6274225685630156e-05, "loss": 0.2055, "num_input_tokens_seen": 55058064, "step": 25500 }, { "epoch": 4.16068515497553, "grad_norm": 0.45748159289360046, "learning_rate": 3.626787214770221e-05, "loss": 0.0278, "num_input_tokens_seen": 55069008, "step": 25505 }, { "epoch": 4.161500815660685, "grad_norm": 3.031940460205078, "learning_rate": 3.626151769634338e-05, "loss": 0.2087, "num_input_tokens_seen": 55079792, "step": 25510 }, { "epoch": 4.16231647634584, "grad_norm": 2.2870099544525146, "learning_rate": 3.6255162332068785e-05, "loss": 0.1916, "num_input_tokens_seen": 55090864, "step": 25515 }, { "epoch": 4.1631321370309955, "grad_norm": 0.695946455001831, "learning_rate": 3.624880605539362e-05, "loss": 0.1228, "num_input_tokens_seen": 55101904, "step": 25520 }, { "epoch": 4.16394779771615, "grad_norm": 5.246313571929932, "learning_rate": 3.6242448866833164e-05, "loss": 0.0776, "num_input_tokens_seen": 55112656, "step": 25525 }, { "epoch": 4.164763458401305, "grad_norm": 0.04840467497706413, "learning_rate": 3.623609076690275e-05, "loss": 0.2569, "num_input_tokens_seen": 55123024, "step": 25530 }, { "epoch": 4.16557911908646, "grad_norm": 0.11822468042373657, "learning_rate": 3.622973175611781e-05, "loss": 0.2145, "num_input_tokens_seen": 55133136, "step": 25535 }, { "epoch": 4.166394779771615, "grad_norm": 0.36142075061798096, "learning_rate": 3.622337183499384e-05, "loss": 0.062, "num_input_tokens_seen": 55142864, "step": 25540 }, { "epoch": 4.16721044045677, "grad_norm": 0.546535849571228, "learning_rate": 3.6217011004046404e-05, "loss": 0.06, "num_input_tokens_seen": 55152848, "step": 25545 }, { "epoch": 4.168026101141925, "grad_norm": 0.5461816191673279, "learning_rate": 3.621064926379114e-05, "loss": 0.1557, "num_input_tokens_seen": 55162032, "step": 25550 }, { "epoch": 4.16884176182708, "grad_norm": 0.1468852460384369, "learning_rate": 3.620428661474377e-05, "loss": 0.1167, "num_input_tokens_seen": 55173616, "step": 25555 }, { "epoch": 4.169657422512235, "grad_norm": 0.1524345725774765, "learning_rate": 3.619792305742006e-05, "loss": 0.098, "num_input_tokens_seen": 55185328, "step": 25560 }, { "epoch": 4.17047308319739, "grad_norm": 2.2733712196350098, "learning_rate": 3.619155859233589e-05, "loss": 0.1251, "num_input_tokens_seen": 55197072, "step": 25565 }, { "epoch": 4.171288743882545, "grad_norm": 0.15534906089305878, "learning_rate": 3.6185193220007214e-05, "loss": 0.0061, "num_input_tokens_seen": 55205936, "step": 25570 }, { "epoch": 4.1721044045677, "grad_norm": 1.2479537725448608, "learning_rate": 3.617882694095001e-05, "loss": 0.0121, "num_input_tokens_seen": 55216560, "step": 25575 }, { "epoch": 4.172920065252855, "grad_norm": 0.22745876014232635, "learning_rate": 3.617245975568038e-05, "loss": 0.0366, "num_input_tokens_seen": 55227504, "step": 25580 }, { "epoch": 4.17373572593801, "grad_norm": 0.9728760719299316, "learning_rate": 3.616609166471447e-05, "loss": 0.2422, "num_input_tokens_seen": 55238032, "step": 25585 }, { "epoch": 4.174551386623165, "grad_norm": 0.10870423913002014, "learning_rate": 3.615972266856851e-05, "loss": 0.0759, "num_input_tokens_seen": 55248080, "step": 25590 }, { "epoch": 4.1753670473083195, "grad_norm": 3.1872990131378174, "learning_rate": 3.6153352767758816e-05, "loss": 0.1896, "num_input_tokens_seen": 55259760, "step": 25595 }, { "epoch": 4.176182707993474, "grad_norm": 0.43499961495399475, "learning_rate": 3.6146981962801744e-05, "loss": 0.0128, "num_input_tokens_seen": 55270128, "step": 25600 }, { "epoch": 4.17699836867863, "grad_norm": 0.24309612810611725, "learning_rate": 3.6140610254213756e-05, "loss": 0.1526, "num_input_tokens_seen": 55280592, "step": 25605 }, { "epoch": 4.177814029363785, "grad_norm": 0.16260983049869537, "learning_rate": 3.613423764251138e-05, "loss": 0.0084, "num_input_tokens_seen": 55291344, "step": 25610 }, { "epoch": 4.17862969004894, "grad_norm": 2.815412998199463, "learning_rate": 3.61278641282112e-05, "loss": 0.112, "num_input_tokens_seen": 55301360, "step": 25615 }, { "epoch": 4.1794453507340945, "grad_norm": 11.62366008758545, "learning_rate": 3.612148971182989e-05, "loss": 0.0714, "num_input_tokens_seen": 55311696, "step": 25620 }, { "epoch": 4.180261011419249, "grad_norm": 0.08460749685764313, "learning_rate": 3.6115114393884206e-05, "loss": 0.1437, "num_input_tokens_seen": 55322704, "step": 25625 }, { "epoch": 4.181076672104404, "grad_norm": 3.9669132232666016, "learning_rate": 3.6108738174890944e-05, "loss": 0.0878, "num_input_tokens_seen": 55334704, "step": 25630 }, { "epoch": 4.18189233278956, "grad_norm": 0.20038148760795593, "learning_rate": 3.6102361055367e-05, "loss": 0.1143, "num_input_tokens_seen": 55344912, "step": 25635 }, { "epoch": 4.182707993474715, "grad_norm": 0.2454468011856079, "learning_rate": 3.609598303582934e-05, "loss": 0.1271, "num_input_tokens_seen": 55354832, "step": 25640 }, { "epoch": 4.1835236541598695, "grad_norm": 0.14531424641609192, "learning_rate": 3.608960411679499e-05, "loss": 0.0318, "num_input_tokens_seen": 55365776, "step": 25645 }, { "epoch": 4.184339314845024, "grad_norm": 0.06752365082502365, "learning_rate": 3.608322429878107e-05, "loss": 0.2215, "num_input_tokens_seen": 55376208, "step": 25650 }, { "epoch": 4.185154975530179, "grad_norm": 2.8704254627227783, "learning_rate": 3.6076843582304744e-05, "loss": 0.2243, "num_input_tokens_seen": 55386512, "step": 25655 }, { "epoch": 4.185970636215335, "grad_norm": 0.10375895351171494, "learning_rate": 3.607046196788328e-05, "loss": 0.0199, "num_input_tokens_seen": 55397168, "step": 25660 }, { "epoch": 4.18678629690049, "grad_norm": 0.07490267604589462, "learning_rate": 3.6064079456033996e-05, "loss": 0.2718, "num_input_tokens_seen": 55408912, "step": 25665 }, { "epoch": 4.1876019575856445, "grad_norm": 0.18197505176067352, "learning_rate": 3.6057696047274285e-05, "loss": 0.1544, "num_input_tokens_seen": 55418672, "step": 25670 }, { "epoch": 4.188417618270799, "grad_norm": 3.714529275894165, "learning_rate": 3.605131174212164e-05, "loss": 0.0537, "num_input_tokens_seen": 55430768, "step": 25675 }, { "epoch": 4.189233278955954, "grad_norm": 3.1004531383514404, "learning_rate": 3.604492654109357e-05, "loss": 0.09, "num_input_tokens_seen": 55441232, "step": 25680 }, { "epoch": 4.190048939641109, "grad_norm": 4.810756206512451, "learning_rate": 3.6038540444707734e-05, "loss": 0.1299, "num_input_tokens_seen": 55453040, "step": 25685 }, { "epoch": 4.190864600326265, "grad_norm": 0.07222369313240051, "learning_rate": 3.603215345348179e-05, "loss": 0.0728, "num_input_tokens_seen": 55462768, "step": 25690 }, { "epoch": 4.191680261011419, "grad_norm": 0.09615682065486908, "learning_rate": 3.602576556793352e-05, "loss": 0.03, "num_input_tokens_seen": 55474416, "step": 25695 }, { "epoch": 4.192495921696574, "grad_norm": 0.11471203714609146, "learning_rate": 3.601937678858074e-05, "loss": 0.0447, "num_input_tokens_seen": 55484272, "step": 25700 }, { "epoch": 4.193311582381729, "grad_norm": 0.2656436562538147, "learning_rate": 3.601298711594137e-05, "loss": 0.0209, "num_input_tokens_seen": 55496336, "step": 25705 }, { "epoch": 4.194127243066884, "grad_norm": 0.33770278096199036, "learning_rate": 3.6006596550533385e-05, "loss": 0.0088, "num_input_tokens_seen": 55508656, "step": 25710 }, { "epoch": 4.19494290375204, "grad_norm": 9.408333778381348, "learning_rate": 3.600020509287483e-05, "loss": 0.3765, "num_input_tokens_seen": 55519664, "step": 25715 }, { "epoch": 4.195758564437194, "grad_norm": 0.11777154356241226, "learning_rate": 3.599381274348385e-05, "loss": 0.1029, "num_input_tokens_seen": 55529744, "step": 25720 }, { "epoch": 4.196574225122349, "grad_norm": 0.18434153497219086, "learning_rate": 3.598741950287861e-05, "loss": 0.0072, "num_input_tokens_seen": 55540400, "step": 25725 }, { "epoch": 4.197389885807504, "grad_norm": 3.3441872596740723, "learning_rate": 3.5981025371577404e-05, "loss": 0.2807, "num_input_tokens_seen": 55551312, "step": 25730 }, { "epoch": 4.198205546492659, "grad_norm": 0.09962259978055954, "learning_rate": 3.5974630350098566e-05, "loss": 0.0252, "num_input_tokens_seen": 55562448, "step": 25735 }, { "epoch": 4.199021207177814, "grad_norm": 0.08699111640453339, "learning_rate": 3.5968234438960505e-05, "loss": 0.0126, "num_input_tokens_seen": 55572848, "step": 25740 }, { "epoch": 4.199836867862969, "grad_norm": 0.12356137484312057, "learning_rate": 3.5961837638681714e-05, "loss": 0.1189, "num_input_tokens_seen": 55582800, "step": 25745 }, { "epoch": 4.200652528548124, "grad_norm": 0.30556872487068176, "learning_rate": 3.595543994978073e-05, "loss": 0.1044, "num_input_tokens_seen": 55594096, "step": 25750 }, { "epoch": 4.201468189233279, "grad_norm": 0.13969768583774567, "learning_rate": 3.594904137277621e-05, "loss": 0.0817, "num_input_tokens_seen": 55605456, "step": 25755 }, { "epoch": 4.202283849918434, "grad_norm": 0.06743574142456055, "learning_rate": 3.594264190818683e-05, "loss": 0.005, "num_input_tokens_seen": 55617072, "step": 25760 }, { "epoch": 4.203099510603589, "grad_norm": 0.10467995703220367, "learning_rate": 3.593624155653138e-05, "loss": 0.256, "num_input_tokens_seen": 55628176, "step": 25765 }, { "epoch": 4.2039151712887435, "grad_norm": 0.0721723809838295, "learning_rate": 3.592984031832871e-05, "loss": 0.0999, "num_input_tokens_seen": 55637744, "step": 25770 }, { "epoch": 4.204730831973899, "grad_norm": 0.09203427284955978, "learning_rate": 3.5923438194097715e-05, "loss": 0.0045, "num_input_tokens_seen": 55649104, "step": 25775 }, { "epoch": 4.205546492659054, "grad_norm": 11.121380805969238, "learning_rate": 3.591703518435739e-05, "loss": 0.1183, "num_input_tokens_seen": 55660272, "step": 25780 }, { "epoch": 4.206362153344209, "grad_norm": 0.15184751152992249, "learning_rate": 3.591063128962681e-05, "loss": 0.1472, "num_input_tokens_seen": 55670864, "step": 25785 }, { "epoch": 4.207177814029364, "grad_norm": 2.250476360321045, "learning_rate": 3.5904226510425095e-05, "loss": 0.0418, "num_input_tokens_seen": 55681456, "step": 25790 }, { "epoch": 4.2079934747145185, "grad_norm": 0.5987846851348877, "learning_rate": 3.5897820847271446e-05, "loss": 0.0123, "num_input_tokens_seen": 55692464, "step": 25795 }, { "epoch": 4.208809135399674, "grad_norm": 0.22947758436203003, "learning_rate": 3.5891414300685155e-05, "loss": 0.1404, "num_input_tokens_seen": 55702704, "step": 25800 }, { "epoch": 4.209624796084829, "grad_norm": 0.05595271289348602, "learning_rate": 3.588500687118555e-05, "loss": 0.0833, "num_input_tokens_seen": 55713040, "step": 25805 }, { "epoch": 4.210440456769984, "grad_norm": 0.13147498667240143, "learning_rate": 3.587859855929207e-05, "loss": 0.0299, "num_input_tokens_seen": 55723920, "step": 25810 }, { "epoch": 4.211256117455139, "grad_norm": 19.015289306640625, "learning_rate": 3.5872189365524175e-05, "loss": 0.0729, "num_input_tokens_seen": 55735440, "step": 25815 }, { "epoch": 4.212071778140293, "grad_norm": 0.1084061786532402, "learning_rate": 3.586577929040146e-05, "loss": 0.1183, "num_input_tokens_seen": 55745680, "step": 25820 }, { "epoch": 4.212887438825448, "grad_norm": 2.7874019145965576, "learning_rate": 3.5859368334443536e-05, "loss": 0.1077, "num_input_tokens_seen": 55757680, "step": 25825 }, { "epoch": 4.213703099510604, "grad_norm": 3.409447193145752, "learning_rate": 3.585295649817011e-05, "loss": 0.13, "num_input_tokens_seen": 55767536, "step": 25830 }, { "epoch": 4.214518760195759, "grad_norm": 0.1951976865530014, "learning_rate": 3.5846543782100974e-05, "loss": 0.1067, "num_input_tokens_seen": 55778032, "step": 25835 }, { "epoch": 4.215334420880914, "grad_norm": 0.08985646069049835, "learning_rate": 3.584013018675596e-05, "loss": 0.0909, "num_input_tokens_seen": 55790000, "step": 25840 }, { "epoch": 4.216150081566068, "grad_norm": 0.09971218556165695, "learning_rate": 3.583371571265498e-05, "loss": 0.1387, "num_input_tokens_seen": 55800688, "step": 25845 }, { "epoch": 4.216965742251223, "grad_norm": 0.10093860328197479, "learning_rate": 3.582730036031805e-05, "loss": 0.0071, "num_input_tokens_seen": 55812208, "step": 25850 }, { "epoch": 4.217781402936378, "grad_norm": 3.3075568675994873, "learning_rate": 3.582088413026521e-05, "loss": 0.1078, "num_input_tokens_seen": 55823120, "step": 25855 }, { "epoch": 4.218597063621534, "grad_norm": 0.04948318377137184, "learning_rate": 3.581446702301659e-05, "loss": 0.0178, "num_input_tokens_seen": 55834416, "step": 25860 }, { "epoch": 4.219412724306689, "grad_norm": 0.09975404292345047, "learning_rate": 3.5808049039092414e-05, "loss": 0.1671, "num_input_tokens_seen": 55846480, "step": 25865 }, { "epoch": 4.220228384991843, "grad_norm": 2.3931007385253906, "learning_rate": 3.580163017901295e-05, "loss": 0.1215, "num_input_tokens_seen": 55857200, "step": 25870 }, { "epoch": 4.221044045676998, "grad_norm": 0.10142625123262405, "learning_rate": 3.579521044329852e-05, "loss": 0.1154, "num_input_tokens_seen": 55868976, "step": 25875 }, { "epoch": 4.221859706362153, "grad_norm": 13.955004692077637, "learning_rate": 3.578878983246956e-05, "loss": 0.1048, "num_input_tokens_seen": 55881360, "step": 25880 }, { "epoch": 4.222675367047309, "grad_norm": 0.5234656929969788, "learning_rate": 3.578236834704656e-05, "loss": 0.1197, "num_input_tokens_seen": 55892208, "step": 25885 }, { "epoch": 4.2234910277324635, "grad_norm": 0.026215214282274246, "learning_rate": 3.577594598755006e-05, "loss": 0.0956, "num_input_tokens_seen": 55901936, "step": 25890 }, { "epoch": 4.224306688417618, "grad_norm": 4.179839611053467, "learning_rate": 3.5769522754500714e-05, "loss": 0.0344, "num_input_tokens_seen": 55912784, "step": 25895 }, { "epoch": 4.225122349102773, "grad_norm": 0.2951279282569885, "learning_rate": 3.5763098648419216e-05, "loss": 0.2639, "num_input_tokens_seen": 55924432, "step": 25900 }, { "epoch": 4.225938009787928, "grad_norm": 0.055586107075214386, "learning_rate": 3.575667366982631e-05, "loss": 0.1581, "num_input_tokens_seen": 55935216, "step": 25905 }, { "epoch": 4.226753670473083, "grad_norm": 0.04721270874142647, "learning_rate": 3.575024781924288e-05, "loss": 0.0291, "num_input_tokens_seen": 55945680, "step": 25910 }, { "epoch": 4.2275693311582385, "grad_norm": 1.1868259906768799, "learning_rate": 3.574382109718979e-05, "loss": 0.1081, "num_input_tokens_seen": 55956496, "step": 25915 }, { "epoch": 4.228384991843393, "grad_norm": 0.051986824721097946, "learning_rate": 3.573739350418806e-05, "loss": 0.0312, "num_input_tokens_seen": 55967280, "step": 25920 }, { "epoch": 4.229200652528548, "grad_norm": 0.11333861202001572, "learning_rate": 3.573096504075874e-05, "loss": 0.0825, "num_input_tokens_seen": 55978160, "step": 25925 }, { "epoch": 4.230016313213703, "grad_norm": 0.6075248718261719, "learning_rate": 3.572453570742294e-05, "loss": 0.0778, "num_input_tokens_seen": 55989808, "step": 25930 }, { "epoch": 4.230831973898858, "grad_norm": 0.05753014609217644, "learning_rate": 3.571810550470186e-05, "loss": 0.1058, "num_input_tokens_seen": 56000944, "step": 25935 }, { "epoch": 4.231647634584013, "grad_norm": 0.0715099647641182, "learning_rate": 3.571167443311676e-05, "loss": 0.0819, "num_input_tokens_seen": 56011408, "step": 25940 }, { "epoch": 4.232463295269168, "grad_norm": 3.7070655822753906, "learning_rate": 3.5705242493188986e-05, "loss": 0.0146, "num_input_tokens_seen": 56020944, "step": 25945 }, { "epoch": 4.233278955954323, "grad_norm": 0.06861277669668198, "learning_rate": 3.569880968543994e-05, "loss": 0.005, "num_input_tokens_seen": 56031792, "step": 25950 }, { "epoch": 4.234094616639478, "grad_norm": 0.4699649512767792, "learning_rate": 3.569237601039109e-05, "loss": 0.0058, "num_input_tokens_seen": 56041936, "step": 25955 }, { "epoch": 4.234910277324633, "grad_norm": 0.14664636552333832, "learning_rate": 3.5685941468563985e-05, "loss": 0.0046, "num_input_tokens_seen": 56052272, "step": 25960 }, { "epoch": 4.235725938009788, "grad_norm": 0.028544507920742035, "learning_rate": 3.567950606048025e-05, "loss": 0.0023, "num_input_tokens_seen": 56061808, "step": 25965 }, { "epoch": 4.236541598694943, "grad_norm": 3.6018576622009277, "learning_rate": 3.5673069786661566e-05, "loss": 0.2213, "num_input_tokens_seen": 56071536, "step": 25970 }, { "epoch": 4.237357259380098, "grad_norm": 0.07787901163101196, "learning_rate": 3.566663264762969e-05, "loss": 0.1086, "num_input_tokens_seen": 56083568, "step": 25975 }, { "epoch": 4.238172920065253, "grad_norm": 0.11781366169452667, "learning_rate": 3.5660194643906455e-05, "loss": 0.0746, "num_input_tokens_seen": 56094000, "step": 25980 }, { "epoch": 4.238988580750408, "grad_norm": 2.923859119415283, "learning_rate": 3.5653755776013745e-05, "loss": 0.3323, "num_input_tokens_seen": 56103952, "step": 25985 }, { "epoch": 4.239804241435563, "grad_norm": 0.15193922817707062, "learning_rate": 3.5647316044473537e-05, "loss": 0.0133, "num_input_tokens_seen": 56114224, "step": 25990 }, { "epoch": 4.240619902120717, "grad_norm": 0.07131417095661163, "learning_rate": 3.564087544980786e-05, "loss": 0.007, "num_input_tokens_seen": 56124464, "step": 25995 }, { "epoch": 4.241435562805873, "grad_norm": 0.09576962888240814, "learning_rate": 3.563443399253883e-05, "loss": 0.0119, "num_input_tokens_seen": 56135280, "step": 26000 }, { "epoch": 4.242251223491028, "grad_norm": 0.1686936914920807, "learning_rate": 3.5627991673188624e-05, "loss": 0.1132, "num_input_tokens_seen": 56144976, "step": 26005 }, { "epoch": 4.243066884176183, "grad_norm": 0.8898986577987671, "learning_rate": 3.562154849227949e-05, "loss": 0.287, "num_input_tokens_seen": 56156688, "step": 26010 }, { "epoch": 4.2438825448613375, "grad_norm": 0.09941082447767258, "learning_rate": 3.561510445033375e-05, "loss": 0.0211, "num_input_tokens_seen": 56167856, "step": 26015 }, { "epoch": 4.244698205546492, "grad_norm": 3.588923692703247, "learning_rate": 3.560865954787377e-05, "loss": 0.0275, "num_input_tokens_seen": 56178928, "step": 26020 }, { "epoch": 4.245513866231648, "grad_norm": 0.07287617772817612, "learning_rate": 3.5602213785422025e-05, "loss": 0.1913, "num_input_tokens_seen": 56189584, "step": 26025 }, { "epoch": 4.246329526916803, "grad_norm": 0.20097781717777252, "learning_rate": 3.5595767163501034e-05, "loss": 0.1521, "num_input_tokens_seen": 56200080, "step": 26030 }, { "epoch": 4.247145187601958, "grad_norm": 0.06811130046844482, "learning_rate": 3.5589319682633393e-05, "loss": 0.2815, "num_input_tokens_seen": 56210288, "step": 26035 }, { "epoch": 4.2479608482871125, "grad_norm": 0.15266764163970947, "learning_rate": 3.558287134334177e-05, "loss": 0.0271, "num_input_tokens_seen": 56220976, "step": 26040 }, { "epoch": 4.248776508972267, "grad_norm": 6.053004264831543, "learning_rate": 3.55764221461489e-05, "loss": 0.1249, "num_input_tokens_seen": 56232176, "step": 26045 }, { "epoch": 4.249592169657422, "grad_norm": 1.8790841102600098, "learning_rate": 3.556997209157759e-05, "loss": 0.2825, "num_input_tokens_seen": 56244400, "step": 26050 }, { "epoch": 4.250407830342578, "grad_norm": 0.5246086716651917, "learning_rate": 3.5563521180150704e-05, "loss": 0.0094, "num_input_tokens_seen": 56255952, "step": 26055 }, { "epoch": 4.251223491027733, "grad_norm": 0.1442943662405014, "learning_rate": 3.55570694123912e-05, "loss": 0.0878, "num_input_tokens_seen": 56267760, "step": 26060 }, { "epoch": 4.2520391517128875, "grad_norm": 0.23565569519996643, "learning_rate": 3.5550616788822074e-05, "loss": 0.0115, "num_input_tokens_seen": 56278800, "step": 26065 }, { "epoch": 4.252854812398042, "grad_norm": 0.09448880702257156, "learning_rate": 3.5544163309966425e-05, "loss": 0.0604, "num_input_tokens_seen": 56289552, "step": 26070 }, { "epoch": 4.253670473083197, "grad_norm": 0.12274815142154694, "learning_rate": 3.5537708976347386e-05, "loss": 0.1721, "num_input_tokens_seen": 56301808, "step": 26075 }, { "epoch": 4.254486133768353, "grad_norm": 2.4918394088745117, "learning_rate": 3.55312537884882e-05, "loss": 0.1046, "num_input_tokens_seen": 56311536, "step": 26080 }, { "epoch": 4.255301794453508, "grad_norm": 0.09813707321882248, "learning_rate": 3.552479774691215e-05, "loss": 0.0328, "num_input_tokens_seen": 56323344, "step": 26085 }, { "epoch": 4.2561174551386625, "grad_norm": 1.4146684408187866, "learning_rate": 3.5518340852142587e-05, "loss": 0.0293, "num_input_tokens_seen": 56333904, "step": 26090 }, { "epoch": 4.256933115823817, "grad_norm": 1.7968381643295288, "learning_rate": 3.5511883104702943e-05, "loss": 0.0094, "num_input_tokens_seen": 56343312, "step": 26095 }, { "epoch": 4.257748776508972, "grad_norm": 0.16410556435585022, "learning_rate": 3.5505424505116714e-05, "loss": 0.0462, "num_input_tokens_seen": 56354960, "step": 26100 }, { "epoch": 4.258564437194127, "grad_norm": 0.13683778047561646, "learning_rate": 3.549896505390748e-05, "loss": 0.0776, "num_input_tokens_seen": 56366032, "step": 26105 }, { "epoch": 4.259380097879283, "grad_norm": 1.570995569229126, "learning_rate": 3.549250475159887e-05, "loss": 0.0721, "num_input_tokens_seen": 56376496, "step": 26110 }, { "epoch": 4.260195758564437, "grad_norm": 0.06962514668703079, "learning_rate": 3.5486043598714576e-05, "loss": 0.1436, "num_input_tokens_seen": 56387408, "step": 26115 }, { "epoch": 4.261011419249592, "grad_norm": 0.1006123498082161, "learning_rate": 3.547958159577839e-05, "loss": 0.1313, "num_input_tokens_seen": 56397328, "step": 26120 }, { "epoch": 4.261827079934747, "grad_norm": 0.11071991920471191, "learning_rate": 3.547311874331414e-05, "loss": 0.2202, "num_input_tokens_seen": 56406640, "step": 26125 }, { "epoch": 4.262642740619902, "grad_norm": 7.6907057762146, "learning_rate": 3.546665504184575e-05, "loss": 0.1149, "num_input_tokens_seen": 56416400, "step": 26130 }, { "epoch": 4.263458401305057, "grad_norm": 3.0809524059295654, "learning_rate": 3.5460190491897195e-05, "loss": 0.1143, "num_input_tokens_seen": 56426320, "step": 26135 }, { "epoch": 4.264274061990212, "grad_norm": 1.5005754232406616, "learning_rate": 3.5453725093992526e-05, "loss": 0.3749, "num_input_tokens_seen": 56436400, "step": 26140 }, { "epoch": 4.265089722675367, "grad_norm": 3.1108760833740234, "learning_rate": 3.544725884865585e-05, "loss": 0.25, "num_input_tokens_seen": 56445712, "step": 26145 }, { "epoch": 4.265905383360522, "grad_norm": 0.10910908132791519, "learning_rate": 3.544079175641137e-05, "loss": 0.1843, "num_input_tokens_seen": 56456720, "step": 26150 }, { "epoch": 4.266721044045677, "grad_norm": 0.19026920199394226, "learning_rate": 3.543432381778333e-05, "loss": 0.0665, "num_input_tokens_seen": 56468464, "step": 26155 }, { "epoch": 4.267536704730832, "grad_norm": 4.13557767868042, "learning_rate": 3.5427855033296056e-05, "loss": 0.2386, "num_input_tokens_seen": 56478096, "step": 26160 }, { "epoch": 4.268352365415987, "grad_norm": 10.79997730255127, "learning_rate": 3.542138540347395e-05, "loss": 0.4069, "num_input_tokens_seen": 56489776, "step": 26165 }, { "epoch": 4.269168026101142, "grad_norm": 0.19795987010002136, "learning_rate": 3.5414914928841467e-05, "loss": 0.0928, "num_input_tokens_seen": 56501232, "step": 26170 }, { "epoch": 4.269983686786297, "grad_norm": 0.12328500300645828, "learning_rate": 3.540844360992313e-05, "loss": 0.0106, "num_input_tokens_seen": 56511664, "step": 26175 }, { "epoch": 4.270799347471452, "grad_norm": 1.53931725025177, "learning_rate": 3.5401971447243545e-05, "loss": 0.0804, "num_input_tokens_seen": 56522032, "step": 26180 }, { "epoch": 4.271615008156607, "grad_norm": 0.07922205328941345, "learning_rate": 3.539549844132737e-05, "loss": 0.2395, "num_input_tokens_seen": 56533008, "step": 26185 }, { "epoch": 4.2724306688417615, "grad_norm": 0.5737140774726868, "learning_rate": 3.538902459269935e-05, "loss": 0.024, "num_input_tokens_seen": 56544176, "step": 26190 }, { "epoch": 4.273246329526917, "grad_norm": 0.715808093547821, "learning_rate": 3.538254990188429e-05, "loss": 0.1182, "num_input_tokens_seen": 56555376, "step": 26195 }, { "epoch": 4.274061990212072, "grad_norm": 0.2927914559841156, "learning_rate": 3.5376074369407044e-05, "loss": 0.0973, "num_input_tokens_seen": 56566256, "step": 26200 }, { "epoch": 4.274877650897227, "grad_norm": 2.412468194961548, "learning_rate": 3.536959799579256e-05, "loss": 0.1044, "num_input_tokens_seen": 56576976, "step": 26205 }, { "epoch": 4.275693311582382, "grad_norm": 0.12590470910072327, "learning_rate": 3.5363120781565854e-05, "loss": 0.0996, "num_input_tokens_seen": 56587504, "step": 26210 }, { "epoch": 4.2765089722675365, "grad_norm": 2.6490683555603027, "learning_rate": 3.535664272725199e-05, "loss": 0.1726, "num_input_tokens_seen": 56598800, "step": 26215 }, { "epoch": 4.277324632952691, "grad_norm": 1.593904733657837, "learning_rate": 3.5350163833376124e-05, "loss": 0.0986, "num_input_tokens_seen": 56609584, "step": 26220 }, { "epoch": 4.278140293637847, "grad_norm": 0.04505256563425064, "learning_rate": 3.534368410046346e-05, "loss": 0.0606, "num_input_tokens_seen": 56619984, "step": 26225 }, { "epoch": 4.278955954323002, "grad_norm": 0.25533008575439453, "learning_rate": 3.5337203529039275e-05, "loss": 0.1114, "num_input_tokens_seen": 56630128, "step": 26230 }, { "epoch": 4.279771615008157, "grad_norm": 7.802379131317139, "learning_rate": 3.533072211962892e-05, "loss": 0.1193, "num_input_tokens_seen": 56641264, "step": 26235 }, { "epoch": 4.280587275693311, "grad_norm": 0.22463050484657288, "learning_rate": 3.532423987275782e-05, "loss": 0.015, "num_input_tokens_seen": 56652688, "step": 26240 }, { "epoch": 4.281402936378466, "grad_norm": 2.565871477127075, "learning_rate": 3.531775678895145e-05, "loss": 0.1115, "num_input_tokens_seen": 56663600, "step": 26245 }, { "epoch": 4.282218597063622, "grad_norm": 0.06665221601724625, "learning_rate": 3.531127286873536e-05, "loss": 0.0191, "num_input_tokens_seen": 56674544, "step": 26250 }, { "epoch": 4.283034257748777, "grad_norm": 0.09177578240633011, "learning_rate": 3.530478811263518e-05, "loss": 0.0072, "num_input_tokens_seen": 56685680, "step": 26255 }, { "epoch": 4.283849918433932, "grad_norm": 3.6111679077148438, "learning_rate": 3.529830252117657e-05, "loss": 0.1132, "num_input_tokens_seen": 56696752, "step": 26260 }, { "epoch": 4.284665579119086, "grad_norm": 0.15600493550300598, "learning_rate": 3.529181609488532e-05, "loss": 0.1766, "num_input_tokens_seen": 56707600, "step": 26265 }, { "epoch": 4.285481239804241, "grad_norm": 0.22444719076156616, "learning_rate": 3.528532883428724e-05, "loss": 0.2635, "num_input_tokens_seen": 56718608, "step": 26270 }, { "epoch": 4.286296900489396, "grad_norm": 0.18730668723583221, "learning_rate": 3.527884073990822e-05, "loss": 0.2498, "num_input_tokens_seen": 56729840, "step": 26275 }, { "epoch": 4.287112561174552, "grad_norm": 0.2111804485321045, "learning_rate": 3.52723518122742e-05, "loss": 0.037, "num_input_tokens_seen": 56739984, "step": 26280 }, { "epoch": 4.287928221859707, "grad_norm": 0.25164511799812317, "learning_rate": 3.526586205191123e-05, "loss": 0.2955, "num_input_tokens_seen": 56751248, "step": 26285 }, { "epoch": 4.288743882544861, "grad_norm": 0.22389717400074005, "learning_rate": 3.525937145934539e-05, "loss": 0.1524, "num_input_tokens_seen": 56762480, "step": 26290 }, { "epoch": 4.289559543230016, "grad_norm": 2.940941572189331, "learning_rate": 3.525288003510285e-05, "loss": 0.1536, "num_input_tokens_seen": 56772016, "step": 26295 }, { "epoch": 4.290375203915171, "grad_norm": 2.946580171585083, "learning_rate": 3.524638777970982e-05, "loss": 0.058, "num_input_tokens_seen": 56781168, "step": 26300 }, { "epoch": 4.291190864600326, "grad_norm": 6.9011006355285645, "learning_rate": 3.523989469369262e-05, "loss": 0.0667, "num_input_tokens_seen": 56792720, "step": 26305 }, { "epoch": 4.2920065252854815, "grad_norm": 0.10244137793779373, "learning_rate": 3.523340077757759e-05, "loss": 0.0907, "num_input_tokens_seen": 56803856, "step": 26310 }, { "epoch": 4.292822185970636, "grad_norm": 0.16750401258468628, "learning_rate": 3.522690603189117e-05, "loss": 0.0724, "num_input_tokens_seen": 56814480, "step": 26315 }, { "epoch": 4.293637846655791, "grad_norm": 0.27848297357559204, "learning_rate": 3.522041045715986e-05, "loss": 0.1096, "num_input_tokens_seen": 56824816, "step": 26320 }, { "epoch": 4.294453507340946, "grad_norm": 0.3545133173465729, "learning_rate": 3.521391405391022e-05, "loss": 0.1749, "num_input_tokens_seen": 56834928, "step": 26325 }, { "epoch": 4.295269168026101, "grad_norm": 0.13783428072929382, "learning_rate": 3.520741682266888e-05, "loss": 0.0112, "num_input_tokens_seen": 56845680, "step": 26330 }, { "epoch": 4.2960848287112565, "grad_norm": 0.05638476088643074, "learning_rate": 3.520091876396255e-05, "loss": 0.0618, "num_input_tokens_seen": 56856464, "step": 26335 }, { "epoch": 4.296900489396411, "grad_norm": 0.12468436360359192, "learning_rate": 3.5194419878317975e-05, "loss": 0.1081, "num_input_tokens_seen": 56867920, "step": 26340 }, { "epoch": 4.297716150081566, "grad_norm": 0.14122150838375092, "learning_rate": 3.518792016626201e-05, "loss": 0.0986, "num_input_tokens_seen": 56878672, "step": 26345 }, { "epoch": 4.298531810766721, "grad_norm": 0.297615647315979, "learning_rate": 3.518141962832153e-05, "loss": 0.0148, "num_input_tokens_seen": 56889616, "step": 26350 }, { "epoch": 4.299347471451876, "grad_norm": 0.07948726415634155, "learning_rate": 3.517491826502352e-05, "loss": 0.0796, "num_input_tokens_seen": 56899184, "step": 26355 }, { "epoch": 4.300163132137031, "grad_norm": 0.1359148621559143, "learning_rate": 3.516841607689501e-05, "loss": 0.0834, "num_input_tokens_seen": 56910480, "step": 26360 }, { "epoch": 4.300978792822186, "grad_norm": 0.29599788784980774, "learning_rate": 3.516191306446309e-05, "loss": 0.2221, "num_input_tokens_seen": 56921040, "step": 26365 }, { "epoch": 4.301794453507341, "grad_norm": 4.5030741691589355, "learning_rate": 3.5155409228254946e-05, "loss": 0.1198, "num_input_tokens_seen": 56931920, "step": 26370 }, { "epoch": 4.302610114192496, "grad_norm": 4.534426212310791, "learning_rate": 3.5148904568797805e-05, "loss": 0.094, "num_input_tokens_seen": 56942960, "step": 26375 }, { "epoch": 4.303425774877651, "grad_norm": 0.2612292766571045, "learning_rate": 3.514239908661896e-05, "loss": 0.1264, "num_input_tokens_seen": 56955728, "step": 26380 }, { "epoch": 4.304241435562806, "grad_norm": 18.73579216003418, "learning_rate": 3.513589278224577e-05, "loss": 0.0536, "num_input_tokens_seen": 56966224, "step": 26385 }, { "epoch": 4.30505709624796, "grad_norm": 3.6883773803710938, "learning_rate": 3.5129385656205696e-05, "loss": 0.2356, "num_input_tokens_seen": 56975440, "step": 26390 }, { "epoch": 4.305872756933116, "grad_norm": 7.598383903503418, "learning_rate": 3.512287770902623e-05, "loss": 0.0494, "num_input_tokens_seen": 56986416, "step": 26395 }, { "epoch": 4.306688417618271, "grad_norm": 0.12900906801223755, "learning_rate": 3.5116368941234924e-05, "loss": 0.0968, "num_input_tokens_seen": 56997392, "step": 26400 }, { "epoch": 4.307504078303426, "grad_norm": 0.1268405169248581, "learning_rate": 3.510985935335943e-05, "loss": 0.0726, "num_input_tokens_seen": 57008144, "step": 26405 }, { "epoch": 4.308319738988581, "grad_norm": 0.26509571075439453, "learning_rate": 3.510334894592743e-05, "loss": 0.0582, "num_input_tokens_seen": 57019472, "step": 26410 }, { "epoch": 4.309135399673735, "grad_norm": 0.08845847100019455, "learning_rate": 3.509683771946671e-05, "loss": 0.1454, "num_input_tokens_seen": 57029136, "step": 26415 }, { "epoch": 4.309951060358891, "grad_norm": 3.4568231105804443, "learning_rate": 3.509032567450508e-05, "loss": 0.0526, "num_input_tokens_seen": 57039504, "step": 26420 }, { "epoch": 4.310766721044046, "grad_norm": 0.6754986643791199, "learning_rate": 3.508381281157046e-05, "loss": 0.1198, "num_input_tokens_seen": 57050064, "step": 26425 }, { "epoch": 4.311582381729201, "grad_norm": 0.21514974534511566, "learning_rate": 3.507729913119081e-05, "loss": 0.007, "num_input_tokens_seen": 57061232, "step": 26430 }, { "epoch": 4.3123980424143555, "grad_norm": 0.07706812769174576, "learning_rate": 3.507078463389417e-05, "loss": 0.1651, "num_input_tokens_seen": 57072016, "step": 26435 }, { "epoch": 4.31321370309951, "grad_norm": 2.491929292678833, "learning_rate": 3.506426932020861e-05, "loss": 0.0838, "num_input_tokens_seen": 57082576, "step": 26440 }, { "epoch": 4.314029363784665, "grad_norm": 0.3132483959197998, "learning_rate": 3.505775319066233e-05, "loss": 0.1013, "num_input_tokens_seen": 57093712, "step": 26445 }, { "epoch": 4.314845024469821, "grad_norm": 0.6698467135429382, "learning_rate": 3.5051236245783536e-05, "loss": 0.0183, "num_input_tokens_seen": 57104432, "step": 26450 }, { "epoch": 4.315660685154976, "grad_norm": 0.07937267422676086, "learning_rate": 3.5044718486100536e-05, "loss": 0.0059, "num_input_tokens_seen": 57114576, "step": 26455 }, { "epoch": 4.3164763458401305, "grad_norm": 4.7036943435668945, "learning_rate": 3.503819991214168e-05, "loss": 0.3437, "num_input_tokens_seen": 57125744, "step": 26460 }, { "epoch": 4.317292006525285, "grad_norm": 0.039076440036296844, "learning_rate": 3.503168052443542e-05, "loss": 0.2231, "num_input_tokens_seen": 57136784, "step": 26465 }, { "epoch": 4.31810766721044, "grad_norm": 0.3440026640892029, "learning_rate": 3.502516032351022e-05, "loss": 0.0521, "num_input_tokens_seen": 57147888, "step": 26470 }, { "epoch": 4.318923327895595, "grad_norm": 0.0627477690577507, "learning_rate": 3.501863930989467e-05, "loss": 0.065, "num_input_tokens_seen": 57158704, "step": 26475 }, { "epoch": 4.319738988580751, "grad_norm": 0.0356375128030777, "learning_rate": 3.501211748411738e-05, "loss": 0.0159, "num_input_tokens_seen": 57170096, "step": 26480 }, { "epoch": 4.3205546492659055, "grad_norm": 0.2010306566953659, "learning_rate": 3.500559484670705e-05, "loss": 0.0743, "num_input_tokens_seen": 57181008, "step": 26485 }, { "epoch": 4.32137030995106, "grad_norm": 3.861107587814331, "learning_rate": 3.499907139819242e-05, "loss": 0.0702, "num_input_tokens_seen": 57192752, "step": 26490 }, { "epoch": 4.322185970636215, "grad_norm": 3.188190221786499, "learning_rate": 3.499254713910234e-05, "loss": 0.1386, "num_input_tokens_seen": 57204976, "step": 26495 }, { "epoch": 4.32300163132137, "grad_norm": 7.758331298828125, "learning_rate": 3.498602206996569e-05, "loss": 0.184, "num_input_tokens_seen": 57216592, "step": 26500 }, { "epoch": 4.323817292006526, "grad_norm": 0.126559779047966, "learning_rate": 3.497949619131141e-05, "loss": 0.0089, "num_input_tokens_seen": 57227568, "step": 26505 }, { "epoch": 4.3246329526916805, "grad_norm": 3.4178578853607178, "learning_rate": 3.497296950366854e-05, "loss": 0.0823, "num_input_tokens_seen": 57238096, "step": 26510 }, { "epoch": 4.325448613376835, "grad_norm": 1.1395678520202637, "learning_rate": 3.4966442007566165e-05, "loss": 0.0142, "num_input_tokens_seen": 57250416, "step": 26515 }, { "epoch": 4.32626427406199, "grad_norm": 0.06961100548505783, "learning_rate": 3.495991370353342e-05, "loss": 0.1423, "num_input_tokens_seen": 57261008, "step": 26520 }, { "epoch": 4.327079934747145, "grad_norm": 6.854227542877197, "learning_rate": 3.4953384592099536e-05, "loss": 0.188, "num_input_tokens_seen": 57271344, "step": 26525 }, { "epoch": 4.327895595432301, "grad_norm": 8.178387641906738, "learning_rate": 3.494685467379381e-05, "loss": 0.0609, "num_input_tokens_seen": 57280752, "step": 26530 }, { "epoch": 4.328711256117455, "grad_norm": 0.1267123967409134, "learning_rate": 3.494032394914555e-05, "loss": 0.1506, "num_input_tokens_seen": 57290736, "step": 26535 }, { "epoch": 4.32952691680261, "grad_norm": 3.2645373344421387, "learning_rate": 3.493379241868421e-05, "loss": 0.147, "num_input_tokens_seen": 57301680, "step": 26540 }, { "epoch": 4.330342577487765, "grad_norm": 0.09174946695566177, "learning_rate": 3.492726008293925e-05, "loss": 0.1232, "num_input_tokens_seen": 57313200, "step": 26545 }, { "epoch": 4.33115823817292, "grad_norm": 4.232726097106934, "learning_rate": 3.4920726942440215e-05, "loss": 0.1457, "num_input_tokens_seen": 57323856, "step": 26550 }, { "epoch": 4.331973898858075, "grad_norm": 0.20001770555973053, "learning_rate": 3.4914192997716724e-05, "loss": 0.0065, "num_input_tokens_seen": 57334768, "step": 26555 }, { "epoch": 4.33278955954323, "grad_norm": 1.2704259157180786, "learning_rate": 3.4907658249298435e-05, "loss": 0.0722, "num_input_tokens_seen": 57344720, "step": 26560 }, { "epoch": 4.333605220228385, "grad_norm": 0.37678268551826477, "learning_rate": 3.4901122697715096e-05, "loss": 0.1788, "num_input_tokens_seen": 57356784, "step": 26565 }, { "epoch": 4.33442088091354, "grad_norm": 0.11859045177698135, "learning_rate": 3.4894586343496524e-05, "loss": 0.0226, "num_input_tokens_seen": 57367184, "step": 26570 }, { "epoch": 4.335236541598695, "grad_norm": 0.12833264470100403, "learning_rate": 3.4888049187172566e-05, "loss": 0.1555, "num_input_tokens_seen": 57377840, "step": 26575 }, { "epoch": 4.33605220228385, "grad_norm": 0.06211987882852554, "learning_rate": 3.4881511229273175e-05, "loss": 0.1053, "num_input_tokens_seen": 57389232, "step": 26580 }, { "epoch": 4.3368678629690045, "grad_norm": 0.6239936947822571, "learning_rate": 3.487497247032835e-05, "loss": 0.0939, "num_input_tokens_seen": 57400176, "step": 26585 }, { "epoch": 4.33768352365416, "grad_norm": 0.04757961258292198, "learning_rate": 3.4868432910868156e-05, "loss": 0.1114, "num_input_tokens_seen": 57410768, "step": 26590 }, { "epoch": 4.338499184339315, "grad_norm": 4.031280517578125, "learning_rate": 3.48618925514227e-05, "loss": 0.1682, "num_input_tokens_seen": 57421680, "step": 26595 }, { "epoch": 4.33931484502447, "grad_norm": 6.466372966766357, "learning_rate": 3.4855351392522214e-05, "loss": 0.216, "num_input_tokens_seen": 57431600, "step": 26600 }, { "epoch": 4.340130505709625, "grad_norm": 0.07889610528945923, "learning_rate": 3.4848809434696924e-05, "loss": 0.0065, "num_input_tokens_seen": 57442640, "step": 26605 }, { "epoch": 4.3409461663947795, "grad_norm": 2.756345510482788, "learning_rate": 3.484226667847718e-05, "loss": 0.0943, "num_input_tokens_seen": 57453072, "step": 26610 }, { "epoch": 4.341761827079935, "grad_norm": 1.4201195240020752, "learning_rate": 3.4835723124393347e-05, "loss": 0.0945, "num_input_tokens_seen": 57463792, "step": 26615 }, { "epoch": 4.34257748776509, "grad_norm": 0.17208625376224518, "learning_rate": 3.48291787729759e-05, "loss": 0.0832, "num_input_tokens_seen": 57474448, "step": 26620 }, { "epoch": 4.343393148450245, "grad_norm": 1.5106254816055298, "learning_rate": 3.482263362475535e-05, "loss": 0.0628, "num_input_tokens_seen": 57485904, "step": 26625 }, { "epoch": 4.3442088091354, "grad_norm": 0.17081543803215027, "learning_rate": 3.4816087680262275e-05, "loss": 0.1529, "num_input_tokens_seen": 57497008, "step": 26630 }, { "epoch": 4.3450244698205545, "grad_norm": 0.11218702793121338, "learning_rate": 3.480954094002733e-05, "loss": 0.0435, "num_input_tokens_seen": 57507920, "step": 26635 }, { "epoch": 4.345840130505709, "grad_norm": 0.3067167103290558, "learning_rate": 3.480299340458123e-05, "loss": 0.1803, "num_input_tokens_seen": 57518832, "step": 26640 }, { "epoch": 4.346655791190865, "grad_norm": 0.0654403418302536, "learning_rate": 3.479644507445473e-05, "loss": 0.0949, "num_input_tokens_seen": 57529808, "step": 26645 }, { "epoch": 4.34747145187602, "grad_norm": 3.237981081008911, "learning_rate": 3.4789895950178694e-05, "loss": 0.1251, "num_input_tokens_seen": 57540944, "step": 26650 }, { "epoch": 4.348287112561175, "grad_norm": 0.06239500269293785, "learning_rate": 3.478334603228401e-05, "loss": 0.1113, "num_input_tokens_seen": 57552176, "step": 26655 }, { "epoch": 4.349102773246329, "grad_norm": 0.1120411828160286, "learning_rate": 3.477679532130167e-05, "loss": 0.0927, "num_input_tokens_seen": 57563184, "step": 26660 }, { "epoch": 4.349918433931484, "grad_norm": 0.5621286034584045, "learning_rate": 3.4770243817762686e-05, "loss": 0.2926, "num_input_tokens_seen": 57574320, "step": 26665 }, { "epoch": 4.350734094616639, "grad_norm": 0.08487051725387573, "learning_rate": 3.476369152219817e-05, "loss": 0.1444, "num_input_tokens_seen": 57584944, "step": 26670 }, { "epoch": 4.351549755301795, "grad_norm": 5.538961410522461, "learning_rate": 3.4757138435139274e-05, "loss": 0.0384, "num_input_tokens_seen": 57595600, "step": 26675 }, { "epoch": 4.35236541598695, "grad_norm": 11.25765609741211, "learning_rate": 3.4750584557117234e-05, "loss": 0.3891, "num_input_tokens_seen": 57607472, "step": 26680 }, { "epoch": 4.353181076672104, "grad_norm": 1.2607146501541138, "learning_rate": 3.4744029888663326e-05, "loss": 0.3185, "num_input_tokens_seen": 57618800, "step": 26685 }, { "epoch": 4.353996737357259, "grad_norm": 0.4579668939113617, "learning_rate": 3.473747443030892e-05, "loss": 0.0086, "num_input_tokens_seen": 57629040, "step": 26690 }, { "epoch": 4.354812398042414, "grad_norm": 3.0267632007598877, "learning_rate": 3.473091818258543e-05, "loss": 0.1986, "num_input_tokens_seen": 57639728, "step": 26695 }, { "epoch": 4.35562805872757, "grad_norm": 1.9840439558029175, "learning_rate": 3.472436114602433e-05, "loss": 0.0924, "num_input_tokens_seen": 57650512, "step": 26700 }, { "epoch": 4.356443719412725, "grad_norm": 0.18485522270202637, "learning_rate": 3.471780332115719e-05, "loss": 0.1409, "num_input_tokens_seen": 57662096, "step": 26705 }, { "epoch": 4.357259380097879, "grad_norm": 0.09250368922948837, "learning_rate": 3.47112447085156e-05, "loss": 0.1419, "num_input_tokens_seen": 57672080, "step": 26710 }, { "epoch": 4.358075040783034, "grad_norm": 0.32140758633613586, "learning_rate": 3.470468530863123e-05, "loss": 0.2394, "num_input_tokens_seen": 57682704, "step": 26715 }, { "epoch": 4.358890701468189, "grad_norm": 3.253242254257202, "learning_rate": 3.469812512203584e-05, "loss": 0.2174, "num_input_tokens_seen": 57693520, "step": 26720 }, { "epoch": 4.359706362153344, "grad_norm": 0.4338360130786896, "learning_rate": 3.469156414926121e-05, "loss": 0.1294, "num_input_tokens_seen": 57703312, "step": 26725 }, { "epoch": 4.3605220228384995, "grad_norm": 3.105952024459839, "learning_rate": 3.4685002390839226e-05, "loss": 0.1444, "num_input_tokens_seen": 57714672, "step": 26730 }, { "epoch": 4.361337683523654, "grad_norm": 0.07433799654245377, "learning_rate": 3.467843984730179e-05, "loss": 0.009, "num_input_tokens_seen": 57724464, "step": 26735 }, { "epoch": 4.362153344208809, "grad_norm": 1.7410246133804321, "learning_rate": 3.467187651918093e-05, "loss": 0.017, "num_input_tokens_seen": 57735856, "step": 26740 }, { "epoch": 4.362969004893964, "grad_norm": 0.07141131907701492, "learning_rate": 3.466531240700868e-05, "loss": 0.0683, "num_input_tokens_seen": 57747440, "step": 26745 }, { "epoch": 4.363784665579119, "grad_norm": 0.12390885502099991, "learning_rate": 3.465874751131716e-05, "loss": 0.0592, "num_input_tokens_seen": 57757168, "step": 26750 }, { "epoch": 4.364600326264274, "grad_norm": 0.07554621249437332, "learning_rate": 3.4652181832638566e-05, "loss": 0.0097, "num_input_tokens_seen": 57768112, "step": 26755 }, { "epoch": 4.365415986949429, "grad_norm": 2.8957602977752686, "learning_rate": 3.464561537150513e-05, "loss": 0.1345, "num_input_tokens_seen": 57778480, "step": 26760 }, { "epoch": 4.366231647634584, "grad_norm": 0.10320143401622772, "learning_rate": 3.4639048128449175e-05, "loss": 0.0125, "num_input_tokens_seen": 57789328, "step": 26765 }, { "epoch": 4.367047308319739, "grad_norm": 0.08945643901824951, "learning_rate": 3.463248010400307e-05, "loss": 0.1218, "num_input_tokens_seen": 57800112, "step": 26770 }, { "epoch": 4.367862969004894, "grad_norm": 0.16159725189208984, "learning_rate": 3.462591129869925e-05, "loss": 0.1699, "num_input_tokens_seen": 57810064, "step": 26775 }, { "epoch": 4.368678629690049, "grad_norm": 0.7375674843788147, "learning_rate": 3.461934171307022e-05, "loss": 0.012, "num_input_tokens_seen": 57821680, "step": 26780 }, { "epoch": 4.369494290375204, "grad_norm": 0.18759872019290924, "learning_rate": 3.461277134764855e-05, "loss": 0.1464, "num_input_tokens_seen": 57832496, "step": 26785 }, { "epoch": 4.370309951060359, "grad_norm": 0.11277393251657486, "learning_rate": 3.460620020296684e-05, "loss": 0.2279, "num_input_tokens_seen": 57842768, "step": 26790 }, { "epoch": 4.371125611745514, "grad_norm": 3.195002794265747, "learning_rate": 3.459962827955782e-05, "loss": 0.1832, "num_input_tokens_seen": 57853008, "step": 26795 }, { "epoch": 4.371941272430669, "grad_norm": 6.2746357917785645, "learning_rate": 3.459305557795422e-05, "loss": 0.0506, "num_input_tokens_seen": 57863312, "step": 26800 }, { "epoch": 4.372756933115824, "grad_norm": 0.5368702411651611, "learning_rate": 3.458648209868886e-05, "loss": 0.0721, "num_input_tokens_seen": 57873552, "step": 26805 }, { "epoch": 4.373572593800978, "grad_norm": 13.515825271606445, "learning_rate": 3.4579907842294614e-05, "loss": 0.1683, "num_input_tokens_seen": 57884336, "step": 26810 }, { "epoch": 4.374388254486134, "grad_norm": 2.330129384994507, "learning_rate": 3.457333280930444e-05, "loss": 0.1345, "num_input_tokens_seen": 57894512, "step": 26815 }, { "epoch": 4.375203915171289, "grad_norm": 0.03426094353199005, "learning_rate": 3.456675700025132e-05, "loss": 0.0974, "num_input_tokens_seen": 57906576, "step": 26820 }, { "epoch": 4.376019575856444, "grad_norm": 0.16858018934726715, "learning_rate": 3.4560180415668354e-05, "loss": 0.071, "num_input_tokens_seen": 57916816, "step": 26825 }, { "epoch": 4.376835236541599, "grad_norm": 0.03578922525048256, "learning_rate": 3.455360305608865e-05, "loss": 0.0166, "num_input_tokens_seen": 57927088, "step": 26830 }, { "epoch": 4.377650897226753, "grad_norm": 0.17116525769233704, "learning_rate": 3.4547024922045405e-05, "loss": 0.0135, "num_input_tokens_seen": 57939120, "step": 26835 }, { "epoch": 4.378466557911908, "grad_norm": 0.2882421016693115, "learning_rate": 3.454044601407187e-05, "loss": 0.1348, "num_input_tokens_seen": 57949264, "step": 26840 }, { "epoch": 4.379282218597064, "grad_norm": 1.7336941957473755, "learning_rate": 3.453386633270138e-05, "loss": 0.224, "num_input_tokens_seen": 57961328, "step": 26845 }, { "epoch": 4.380097879282219, "grad_norm": 2.617790460586548, "learning_rate": 3.4527285878467305e-05, "loss": 0.2194, "num_input_tokens_seen": 57972816, "step": 26850 }, { "epoch": 4.3809135399673735, "grad_norm": 11.77711296081543, "learning_rate": 3.45207046519031e-05, "loss": 0.0778, "num_input_tokens_seen": 57983440, "step": 26855 }, { "epoch": 4.381729200652528, "grad_norm": 4.458096981048584, "learning_rate": 3.451412265354227e-05, "loss": 0.1339, "num_input_tokens_seen": 57994384, "step": 26860 }, { "epoch": 4.382544861337683, "grad_norm": 5.544656276702881, "learning_rate": 3.450753988391839e-05, "loss": 0.043, "num_input_tokens_seen": 58006192, "step": 26865 }, { "epoch": 4.383360522022839, "grad_norm": 0.11429605633020401, "learning_rate": 3.450095634356508e-05, "loss": 0.0063, "num_input_tokens_seen": 58016784, "step": 26870 }, { "epoch": 4.384176182707994, "grad_norm": 4.855502128601074, "learning_rate": 3.449437203301604e-05, "loss": 0.1091, "num_input_tokens_seen": 58026736, "step": 26875 }, { "epoch": 4.3849918433931485, "grad_norm": 0.0600491501390934, "learning_rate": 3.4487786952805035e-05, "loss": 0.0417, "num_input_tokens_seen": 58037200, "step": 26880 }, { "epoch": 4.385807504078303, "grad_norm": 0.2659836411476135, "learning_rate": 3.4481201103465875e-05, "loss": 0.0303, "num_input_tokens_seen": 58049200, "step": 26885 }, { "epoch": 4.386623164763458, "grad_norm": 0.12740908563137054, "learning_rate": 3.447461448553245e-05, "loss": 0.038, "num_input_tokens_seen": 58060144, "step": 26890 }, { "epoch": 4.387438825448613, "grad_norm": 0.18616020679473877, "learning_rate": 3.4468027099538694e-05, "loss": 0.2452, "num_input_tokens_seen": 58071600, "step": 26895 }, { "epoch": 4.388254486133769, "grad_norm": 1.6752793788909912, "learning_rate": 3.446143894601862e-05, "loss": 0.2502, "num_input_tokens_seen": 58081616, "step": 26900 }, { "epoch": 4.3890701468189235, "grad_norm": 0.16435900330543518, "learning_rate": 3.44548500255063e-05, "loss": 0.2265, "num_input_tokens_seen": 58092976, "step": 26905 }, { "epoch": 4.389885807504078, "grad_norm": 0.35967424511909485, "learning_rate": 3.444826033853587e-05, "loss": 0.0129, "num_input_tokens_seen": 58103536, "step": 26910 }, { "epoch": 4.390701468189233, "grad_norm": 0.11986248940229416, "learning_rate": 3.4441669885641517e-05, "loss": 0.0131, "num_input_tokens_seen": 58115504, "step": 26915 }, { "epoch": 4.391517128874388, "grad_norm": 0.09129441529512405, "learning_rate": 3.443507866735749e-05, "loss": 0.0819, "num_input_tokens_seen": 58125648, "step": 26920 }, { "epoch": 4.392332789559543, "grad_norm": 2.3310320377349854, "learning_rate": 3.4428486684218116e-05, "loss": 0.1177, "num_input_tokens_seen": 58136816, "step": 26925 }, { "epoch": 4.3931484502446985, "grad_norm": 3.101334810256958, "learning_rate": 3.442189393675777e-05, "loss": 0.155, "num_input_tokens_seen": 58146832, "step": 26930 }, { "epoch": 4.393964110929853, "grad_norm": 3.337653398513794, "learning_rate": 3.44153004255109e-05, "loss": 0.2613, "num_input_tokens_seen": 58156912, "step": 26935 }, { "epoch": 4.394779771615008, "grad_norm": 3.4108667373657227, "learning_rate": 3.4408706151012e-05, "loss": 0.0964, "num_input_tokens_seen": 58167856, "step": 26940 }, { "epoch": 4.395595432300163, "grad_norm": 1.8708680868148804, "learning_rate": 3.440211111379564e-05, "loss": 0.1649, "num_input_tokens_seen": 58178928, "step": 26945 }, { "epoch": 4.396411092985318, "grad_norm": 0.086298368871212, "learning_rate": 3.4395515314396445e-05, "loss": 0.0975, "num_input_tokens_seen": 58187888, "step": 26950 }, { "epoch": 4.397226753670473, "grad_norm": 0.3319890797138214, "learning_rate": 3.4388918753349106e-05, "loss": 0.0647, "num_input_tokens_seen": 58198160, "step": 26955 }, { "epoch": 4.398042414355628, "grad_norm": 0.09963146597146988, "learning_rate": 3.438232143118838e-05, "loss": 0.0069, "num_input_tokens_seen": 58209808, "step": 26960 }, { "epoch": 4.398858075040783, "grad_norm": 23.190380096435547, "learning_rate": 3.437572334844907e-05, "loss": 0.0889, "num_input_tokens_seen": 58221104, "step": 26965 }, { "epoch": 4.399673735725938, "grad_norm": 2.905056953430176, "learning_rate": 3.436912450566606e-05, "loss": 0.1671, "num_input_tokens_seen": 58230832, "step": 26970 }, { "epoch": 4.400489396411093, "grad_norm": 1.5367621183395386, "learning_rate": 3.436252490337428e-05, "loss": 0.1571, "num_input_tokens_seen": 58241328, "step": 26975 }, { "epoch": 4.401305057096248, "grad_norm": 0.13706767559051514, "learning_rate": 3.4355924542108716e-05, "loss": 0.0044, "num_input_tokens_seen": 58252720, "step": 26980 }, { "epoch": 4.402120717781403, "grad_norm": 1.714287281036377, "learning_rate": 3.4349323422404444e-05, "loss": 0.0284, "num_input_tokens_seen": 58263216, "step": 26985 }, { "epoch": 4.402936378466558, "grad_norm": 0.16424110531806946, "learning_rate": 3.434272154479657e-05, "loss": 0.0086, "num_input_tokens_seen": 58274320, "step": 26990 }, { "epoch": 4.403752039151713, "grad_norm": 0.08186185359954834, "learning_rate": 3.4336118909820295e-05, "loss": 0.2202, "num_input_tokens_seen": 58285392, "step": 26995 }, { "epoch": 4.404567699836868, "grad_norm": 3.257965326309204, "learning_rate": 3.432951551801084e-05, "loss": 0.2592, "num_input_tokens_seen": 58295728, "step": 27000 }, { "epoch": 4.4053833605220225, "grad_norm": 0.09292906522750854, "learning_rate": 3.432291136990352e-05, "loss": 0.1242, "num_input_tokens_seen": 58306704, "step": 27005 }, { "epoch": 4.406199021207178, "grad_norm": 0.21827851235866547, "learning_rate": 3.4316306466033704e-05, "loss": 0.1869, "num_input_tokens_seen": 58318288, "step": 27010 }, { "epoch": 4.407014681892333, "grad_norm": 1.5792090892791748, "learning_rate": 3.430970080693681e-05, "loss": 0.1153, "num_input_tokens_seen": 58330416, "step": 27015 }, { "epoch": 4.407830342577488, "grad_norm": 5.034025192260742, "learning_rate": 3.430309439314834e-05, "loss": 0.1188, "num_input_tokens_seen": 58341456, "step": 27020 }, { "epoch": 4.408646003262643, "grad_norm": 0.15225641429424286, "learning_rate": 3.4296487225203825e-05, "loss": 0.1919, "num_input_tokens_seen": 58351568, "step": 27025 }, { "epoch": 4.4094616639477975, "grad_norm": 2.4444918632507324, "learning_rate": 3.42898793036389e-05, "loss": 0.2354, "num_input_tokens_seen": 58361968, "step": 27030 }, { "epoch": 4.410277324632952, "grad_norm": 2.6613261699676514, "learning_rate": 3.428327062898921e-05, "loss": 0.1081, "num_input_tokens_seen": 58373200, "step": 27035 }, { "epoch": 4.411092985318108, "grad_norm": 0.1318122297525406, "learning_rate": 3.4276661201790506e-05, "loss": 0.0221, "num_input_tokens_seen": 58383632, "step": 27040 }, { "epoch": 4.411908646003263, "grad_norm": 2.835576295852661, "learning_rate": 3.427005102257857e-05, "loss": 0.1857, "num_input_tokens_seen": 58394768, "step": 27045 }, { "epoch": 4.412724306688418, "grad_norm": 7.886944770812988, "learning_rate": 3.426344009188927e-05, "loss": 0.0585, "num_input_tokens_seen": 58403856, "step": 27050 }, { "epoch": 4.4135399673735725, "grad_norm": 3.3899881839752197, "learning_rate": 3.425682841025851e-05, "loss": 0.2073, "num_input_tokens_seen": 58415152, "step": 27055 }, { "epoch": 4.414355628058727, "grad_norm": 4.721774578094482, "learning_rate": 3.4250215978222264e-05, "loss": 0.0383, "num_input_tokens_seen": 58425584, "step": 27060 }, { "epoch": 4.415171288743883, "grad_norm": 0.14894041419029236, "learning_rate": 3.424360279631659e-05, "loss": 0.1, "num_input_tokens_seen": 58435088, "step": 27065 }, { "epoch": 4.415986949429038, "grad_norm": 0.1305241733789444, "learning_rate": 3.423698886507756e-05, "loss": 0.0868, "num_input_tokens_seen": 58447088, "step": 27070 }, { "epoch": 4.416802610114193, "grad_norm": 7.27046537399292, "learning_rate": 3.4230374185041346e-05, "loss": 0.1465, "num_input_tokens_seen": 58458928, "step": 27075 }, { "epoch": 4.417618270799347, "grad_norm": 0.12002504616975784, "learning_rate": 3.4223758756744176e-05, "loss": 0.1507, "num_input_tokens_seen": 58470224, "step": 27080 }, { "epoch": 4.418433931484502, "grad_norm": 0.3602011203765869, "learning_rate": 3.421714258072231e-05, "loss": 0.0083, "num_input_tokens_seen": 58480816, "step": 27085 }, { "epoch": 4.419249592169657, "grad_norm": 0.3553735315799713, "learning_rate": 3.421052565751209e-05, "loss": 0.1393, "num_input_tokens_seen": 58492528, "step": 27090 }, { "epoch": 4.420065252854813, "grad_norm": 0.07079923152923584, "learning_rate": 3.420390798764995e-05, "loss": 0.0349, "num_input_tokens_seen": 58502640, "step": 27095 }, { "epoch": 4.420880913539968, "grad_norm": 14.102914810180664, "learning_rate": 3.4197289571672316e-05, "loss": 0.0637, "num_input_tokens_seen": 58514064, "step": 27100 }, { "epoch": 4.421696574225122, "grad_norm": 0.3079579174518585, "learning_rate": 3.4190670410115724e-05, "loss": 0.0062, "num_input_tokens_seen": 58526032, "step": 27105 }, { "epoch": 4.422512234910277, "grad_norm": 0.17233538627624512, "learning_rate": 3.418405050351674e-05, "loss": 0.0052, "num_input_tokens_seen": 58537904, "step": 27110 }, { "epoch": 4.423327895595432, "grad_norm": 3.1176302433013916, "learning_rate": 3.417742985241205e-05, "loss": 0.3075, "num_input_tokens_seen": 58548208, "step": 27115 }, { "epoch": 4.424143556280587, "grad_norm": 0.07439375668764114, "learning_rate": 3.417080845733831e-05, "loss": 0.2225, "num_input_tokens_seen": 58559568, "step": 27120 }, { "epoch": 4.424959216965743, "grad_norm": 1.2107059955596924, "learning_rate": 3.416418631883231e-05, "loss": 0.1219, "num_input_tokens_seen": 58570576, "step": 27125 }, { "epoch": 4.425774877650897, "grad_norm": 0.09766573458909988, "learning_rate": 3.415756343743088e-05, "loss": 0.0197, "num_input_tokens_seen": 58581680, "step": 27130 }, { "epoch": 4.426590538336052, "grad_norm": 0.06342066079378128, "learning_rate": 3.4150939813670886e-05, "loss": 0.0404, "num_input_tokens_seen": 58593040, "step": 27135 }, { "epoch": 4.427406199021207, "grad_norm": 0.046015411615371704, "learning_rate": 3.414431544808928e-05, "loss": 0.0074, "num_input_tokens_seen": 58603728, "step": 27140 }, { "epoch": 4.428221859706362, "grad_norm": 0.09383854269981384, "learning_rate": 3.413769034122306e-05, "loss": 0.0322, "num_input_tokens_seen": 58614640, "step": 27145 }, { "epoch": 4.4290375203915175, "grad_norm": 0.07216141372919083, "learning_rate": 3.41310644936093e-05, "loss": 0.0134, "num_input_tokens_seen": 58623536, "step": 27150 }, { "epoch": 4.429853181076672, "grad_norm": 5.684617519378662, "learning_rate": 3.412443790578512e-05, "loss": 0.0929, "num_input_tokens_seen": 58633648, "step": 27155 }, { "epoch": 4.430668841761827, "grad_norm": 0.039336834102869034, "learning_rate": 3.4117810578287704e-05, "loss": 0.1294, "num_input_tokens_seen": 58644624, "step": 27160 }, { "epoch": 4.431484502446982, "grad_norm": 0.17408223450183868, "learning_rate": 3.411118251165431e-05, "loss": 0.0368, "num_input_tokens_seen": 58655344, "step": 27165 }, { "epoch": 4.432300163132137, "grad_norm": 4.096160411834717, "learning_rate": 3.410455370642221e-05, "loss": 0.1473, "num_input_tokens_seen": 58666448, "step": 27170 }, { "epoch": 4.433115823817292, "grad_norm": 0.0826522707939148, "learning_rate": 3.409792416312881e-05, "loss": 0.0381, "num_input_tokens_seen": 58678672, "step": 27175 }, { "epoch": 4.433931484502447, "grad_norm": 1.2679916620254517, "learning_rate": 3.409129388231151e-05, "loss": 0.0126, "num_input_tokens_seen": 58690672, "step": 27180 }, { "epoch": 4.434747145187602, "grad_norm": 6.634708881378174, "learning_rate": 3.40846628645078e-05, "loss": 0.1661, "num_input_tokens_seen": 58702320, "step": 27185 }, { "epoch": 4.435562805872757, "grad_norm": 0.08634289354085922, "learning_rate": 3.407803111025522e-05, "loss": 0.0956, "num_input_tokens_seen": 58714192, "step": 27190 }, { "epoch": 4.436378466557912, "grad_norm": 0.0404016375541687, "learning_rate": 3.407139862009138e-05, "loss": 0.0887, "num_input_tokens_seen": 58725488, "step": 27195 }, { "epoch": 4.437194127243067, "grad_norm": 1.8363744020462036, "learning_rate": 3.406476539455394e-05, "loss": 0.1618, "num_input_tokens_seen": 58736208, "step": 27200 }, { "epoch": 4.438009787928221, "grad_norm": 0.05074001103639603, "learning_rate": 3.405813143418062e-05, "loss": 0.0049, "num_input_tokens_seen": 58746608, "step": 27205 }, { "epoch": 4.438825448613377, "grad_norm": 0.18867208063602448, "learning_rate": 3.4051496739509216e-05, "loss": 0.1844, "num_input_tokens_seen": 58756880, "step": 27210 }, { "epoch": 4.439641109298532, "grad_norm": 0.046085234731435776, "learning_rate": 3.404486131107754e-05, "loss": 0.0038, "num_input_tokens_seen": 58768080, "step": 27215 }, { "epoch": 4.440456769983687, "grad_norm": 2.8690378665924072, "learning_rate": 3.403822514942353e-05, "loss": 0.2246, "num_input_tokens_seen": 58778896, "step": 27220 }, { "epoch": 4.441272430668842, "grad_norm": 37.09941482543945, "learning_rate": 3.4031588255085126e-05, "loss": 0.3199, "num_input_tokens_seen": 58790032, "step": 27225 }, { "epoch": 4.442088091353996, "grad_norm": 0.10624881833791733, "learning_rate": 3.4024950628600345e-05, "loss": 0.0536, "num_input_tokens_seen": 58800880, "step": 27230 }, { "epoch": 4.442903752039152, "grad_norm": 0.03980366140604019, "learning_rate": 3.401831227050728e-05, "loss": 0.0121, "num_input_tokens_seen": 58811760, "step": 27235 }, { "epoch": 4.443719412724307, "grad_norm": 0.16718868911266327, "learning_rate": 3.401167318134406e-05, "loss": 0.1678, "num_input_tokens_seen": 58821808, "step": 27240 }, { "epoch": 4.444535073409462, "grad_norm": 0.4627486765384674, "learning_rate": 3.400503336164888e-05, "loss": 0.1981, "num_input_tokens_seen": 58832080, "step": 27245 }, { "epoch": 4.445350734094617, "grad_norm": 0.2281552255153656, "learning_rate": 3.3998392811960024e-05, "loss": 0.0237, "num_input_tokens_seen": 58842480, "step": 27250 }, { "epoch": 4.446166394779771, "grad_norm": 20.436752319335938, "learning_rate": 3.399175153281578e-05, "loss": 0.3219, "num_input_tokens_seen": 58852752, "step": 27255 }, { "epoch": 4.446982055464926, "grad_norm": 0.08264545351266861, "learning_rate": 3.3985109524754535e-05, "loss": 0.0609, "num_input_tokens_seen": 58863312, "step": 27260 }, { "epoch": 4.447797716150082, "grad_norm": 5.9884033203125, "learning_rate": 3.397846678831472e-05, "loss": 0.11, "num_input_tokens_seen": 58873104, "step": 27265 }, { "epoch": 4.448613376835237, "grad_norm": 2.786705255508423, "learning_rate": 3.397182332403482e-05, "loss": 0.0129, "num_input_tokens_seen": 58884624, "step": 27270 }, { "epoch": 4.4494290375203915, "grad_norm": 0.21680717170238495, "learning_rate": 3.3965179132453416e-05, "loss": 0.0276, "num_input_tokens_seen": 58894864, "step": 27275 }, { "epoch": 4.450244698205546, "grad_norm": 3.943552017211914, "learning_rate": 3.3958534214109095e-05, "loss": 0.1979, "num_input_tokens_seen": 58906512, "step": 27280 }, { "epoch": 4.451060358890701, "grad_norm": 0.06703973561525345, "learning_rate": 3.395188856954054e-05, "loss": 0.1125, "num_input_tokens_seen": 58917904, "step": 27285 }, { "epoch": 4.451876019575856, "grad_norm": 1.4853523969650269, "learning_rate": 3.394524219928647e-05, "loss": 0.081, "num_input_tokens_seen": 58928304, "step": 27290 }, { "epoch": 4.452691680261012, "grad_norm": 0.07906728982925415, "learning_rate": 3.3938595103885684e-05, "loss": 0.0084, "num_input_tokens_seen": 58939056, "step": 27295 }, { "epoch": 4.4535073409461665, "grad_norm": 0.12298695743083954, "learning_rate": 3.393194728387702e-05, "loss": 0.1072, "num_input_tokens_seen": 58950384, "step": 27300 }, { "epoch": 4.454323001631321, "grad_norm": 0.0752132460474968, "learning_rate": 3.39252987397994e-05, "loss": 0.0229, "num_input_tokens_seen": 58959952, "step": 27305 }, { "epoch": 4.455138662316476, "grad_norm": 7.1743950843811035, "learning_rate": 3.391864947219177e-05, "loss": 0.0405, "num_input_tokens_seen": 58970288, "step": 27310 }, { "epoch": 4.455954323001631, "grad_norm": 0.05831082910299301, "learning_rate": 3.391199948159315e-05, "loss": 0.1414, "num_input_tokens_seen": 58981936, "step": 27315 }, { "epoch": 4.456769983686787, "grad_norm": 0.03013616055250168, "learning_rate": 3.390534876854265e-05, "loss": 0.0076, "num_input_tokens_seen": 58992432, "step": 27320 }, { "epoch": 4.4575856443719415, "grad_norm": 0.12155994772911072, "learning_rate": 3.389869733357939e-05, "loss": 0.004, "num_input_tokens_seen": 59002832, "step": 27325 }, { "epoch": 4.458401305057096, "grad_norm": 6.801733016967773, "learning_rate": 3.389204517724256e-05, "loss": 0.0427, "num_input_tokens_seen": 59013936, "step": 27330 }, { "epoch": 4.459216965742251, "grad_norm": 0.1223423108458519, "learning_rate": 3.388539230007145e-05, "loss": 0.0067, "num_input_tokens_seen": 59023440, "step": 27335 }, { "epoch": 4.460032626427406, "grad_norm": 7.5102314949035645, "learning_rate": 3.387873870260534e-05, "loss": 0.0938, "num_input_tokens_seen": 59034224, "step": 27340 }, { "epoch": 4.460848287112561, "grad_norm": 0.18983300030231476, "learning_rate": 3.3872084385383626e-05, "loss": 0.0232, "num_input_tokens_seen": 59044016, "step": 27345 }, { "epoch": 4.4616639477977165, "grad_norm": 0.047964051365852356, "learning_rate": 3.3865429348945735e-05, "loss": 0.048, "num_input_tokens_seen": 59053648, "step": 27350 }, { "epoch": 4.462479608482871, "grad_norm": 0.09409400075674057, "learning_rate": 3.385877359383116e-05, "loss": 0.1408, "num_input_tokens_seen": 59064080, "step": 27355 }, { "epoch": 4.463295269168026, "grad_norm": 0.9640812873840332, "learning_rate": 3.385211712057945e-05, "loss": 0.2134, "num_input_tokens_seen": 59074672, "step": 27360 }, { "epoch": 4.464110929853181, "grad_norm": 2.773480176925659, "learning_rate": 3.384545992973021e-05, "loss": 0.2477, "num_input_tokens_seen": 59084784, "step": 27365 }, { "epoch": 4.464926590538336, "grad_norm": 0.2358299195766449, "learning_rate": 3.383880202182311e-05, "loss": 0.017, "num_input_tokens_seen": 59095504, "step": 27370 }, { "epoch": 4.465742251223491, "grad_norm": 0.06360097974538803, "learning_rate": 3.3832143397397855e-05, "loss": 0.006, "num_input_tokens_seen": 59105616, "step": 27375 }, { "epoch": 4.466557911908646, "grad_norm": 10.953752517700195, "learning_rate": 3.382548405699426e-05, "loss": 0.3359, "num_input_tokens_seen": 59114576, "step": 27380 }, { "epoch": 4.467373572593801, "grad_norm": 0.07115211337804794, "learning_rate": 3.3818824001152135e-05, "loss": 0.1698, "num_input_tokens_seen": 59124976, "step": 27385 }, { "epoch": 4.468189233278956, "grad_norm": 0.07822829484939575, "learning_rate": 3.38121632304114e-05, "loss": 0.0975, "num_input_tokens_seen": 59136336, "step": 27390 }, { "epoch": 4.469004893964111, "grad_norm": 0.29284754395484924, "learning_rate": 3.3805501745312e-05, "loss": 0.1261, "num_input_tokens_seen": 59146960, "step": 27395 }, { "epoch": 4.4698205546492655, "grad_norm": 1.5599113702774048, "learning_rate": 3.379883954639394e-05, "loss": 0.0087, "num_input_tokens_seen": 59158096, "step": 27400 }, { "epoch": 4.470636215334421, "grad_norm": 0.06289583444595337, "learning_rate": 3.379217663419731e-05, "loss": 0.0693, "num_input_tokens_seen": 59168912, "step": 27405 }, { "epoch": 4.471451876019576, "grad_norm": 0.05054587125778198, "learning_rate": 3.378551300926222e-05, "loss": 0.008, "num_input_tokens_seen": 59179728, "step": 27410 }, { "epoch": 4.472267536704731, "grad_norm": 4.775998592376709, "learning_rate": 3.3778848672128884e-05, "loss": 0.2334, "num_input_tokens_seen": 59191312, "step": 27415 }, { "epoch": 4.473083197389886, "grad_norm": 3.131594657897949, "learning_rate": 3.3772183623337524e-05, "loss": 0.013, "num_input_tokens_seen": 59202352, "step": 27420 }, { "epoch": 4.4738988580750405, "grad_norm": 0.06370081752538681, "learning_rate": 3.3765517863428456e-05, "loss": 0.0405, "num_input_tokens_seen": 59212560, "step": 27425 }, { "epoch": 4.474714518760196, "grad_norm": 0.916146993637085, "learning_rate": 3.375885139294202e-05, "loss": 0.1004, "num_input_tokens_seen": 59221968, "step": 27430 }, { "epoch": 4.475530179445351, "grad_norm": 5.828142166137695, "learning_rate": 3.375218421241866e-05, "loss": 0.1806, "num_input_tokens_seen": 59232848, "step": 27435 }, { "epoch": 4.476345840130506, "grad_norm": 0.044280849397182465, "learning_rate": 3.3745516322398834e-05, "loss": 0.1405, "num_input_tokens_seen": 59243984, "step": 27440 }, { "epoch": 4.477161500815661, "grad_norm": 0.1461818963289261, "learning_rate": 3.373884772342308e-05, "loss": 0.1658, "num_input_tokens_seen": 59253840, "step": 27445 }, { "epoch": 4.4779771615008155, "grad_norm": 0.08795010298490524, "learning_rate": 3.3732178416032e-05, "loss": 0.0094, "num_input_tokens_seen": 59264400, "step": 27450 }, { "epoch": 4.47879282218597, "grad_norm": 3.8043558597564697, "learning_rate": 3.372550840076622e-05, "loss": 0.271, "num_input_tokens_seen": 59275728, "step": 27455 }, { "epoch": 4.479608482871126, "grad_norm": 0.06280068308115005, "learning_rate": 3.371883767816646e-05, "loss": 0.1255, "num_input_tokens_seen": 59286896, "step": 27460 }, { "epoch": 4.480424143556281, "grad_norm": 0.3041984736919403, "learning_rate": 3.371216624877348e-05, "loss": 0.0791, "num_input_tokens_seen": 59297808, "step": 27465 }, { "epoch": 4.481239804241436, "grad_norm": 0.3431588113307953, "learning_rate": 3.370549411312809e-05, "loss": 0.085, "num_input_tokens_seen": 59308752, "step": 27470 }, { "epoch": 4.4820554649265905, "grad_norm": 8.118783950805664, "learning_rate": 3.3698821271771186e-05, "loss": 0.13, "num_input_tokens_seen": 59318320, "step": 27475 }, { "epoch": 4.482871125611745, "grad_norm": 4.661539554595947, "learning_rate": 3.369214772524369e-05, "loss": 0.1307, "num_input_tokens_seen": 59328464, "step": 27480 }, { "epoch": 4.4836867862969, "grad_norm": 0.4202403128147125, "learning_rate": 3.3685473474086584e-05, "loss": 0.0906, "num_input_tokens_seen": 59338000, "step": 27485 }, { "epoch": 4.484502446982056, "grad_norm": 0.6933928728103638, "learning_rate": 3.3678798518840946e-05, "loss": 0.1531, "num_input_tokens_seen": 59347792, "step": 27490 }, { "epoch": 4.485318107667211, "grad_norm": 0.10543301701545715, "learning_rate": 3.367212286004786e-05, "loss": 0.0093, "num_input_tokens_seen": 59357648, "step": 27495 }, { "epoch": 4.486133768352365, "grad_norm": 0.08430114388465881, "learning_rate": 3.366544649824849e-05, "loss": 0.0321, "num_input_tokens_seen": 59367760, "step": 27500 }, { "epoch": 4.48694942903752, "grad_norm": 0.11894170939922333, "learning_rate": 3.365876943398406e-05, "loss": 0.0485, "num_input_tokens_seen": 59378416, "step": 27505 }, { "epoch": 4.487765089722675, "grad_norm": 1.0292885303497314, "learning_rate": 3.365209166779585e-05, "loss": 0.039, "num_input_tokens_seen": 59388944, "step": 27510 }, { "epoch": 4.488580750407831, "grad_norm": 0.06280895322561264, "learning_rate": 3.3645413200225175e-05, "loss": 0.1303, "num_input_tokens_seen": 59397776, "step": 27515 }, { "epoch": 4.489396411092986, "grad_norm": 1.400780439376831, "learning_rate": 3.363873403181346e-05, "loss": 0.0044, "num_input_tokens_seen": 59408464, "step": 27520 }, { "epoch": 4.49021207177814, "grad_norm": 0.0447203665971756, "learning_rate": 3.363205416310212e-05, "loss": 0.2334, "num_input_tokens_seen": 59419888, "step": 27525 }, { "epoch": 4.491027732463295, "grad_norm": 0.022025197744369507, "learning_rate": 3.362537359463267e-05, "loss": 0.1503, "num_input_tokens_seen": 59429712, "step": 27530 }, { "epoch": 4.49184339314845, "grad_norm": 0.044437769800424576, "learning_rate": 3.361869232694666e-05, "loss": 0.0692, "num_input_tokens_seen": 59440688, "step": 27535 }, { "epoch": 4.492659053833605, "grad_norm": 0.05670817941427231, "learning_rate": 3.3612010360585744e-05, "loss": 0.0124, "num_input_tokens_seen": 59451120, "step": 27540 }, { "epoch": 4.493474714518761, "grad_norm": 0.0395425446331501, "learning_rate": 3.360532769609156e-05, "loss": 0.0974, "num_input_tokens_seen": 59461776, "step": 27545 }, { "epoch": 4.494290375203915, "grad_norm": 0.0732116624712944, "learning_rate": 3.359864433400585e-05, "loss": 0.2992, "num_input_tokens_seen": 59472464, "step": 27550 }, { "epoch": 4.49510603588907, "grad_norm": 0.42865869402885437, "learning_rate": 3.3591960274870394e-05, "loss": 0.3121, "num_input_tokens_seen": 59482704, "step": 27555 }, { "epoch": 4.495921696574225, "grad_norm": 5.648609161376953, "learning_rate": 3.3585275519227046e-05, "loss": 0.1836, "num_input_tokens_seen": 59492976, "step": 27560 }, { "epoch": 4.49673735725938, "grad_norm": 3.8055789470672607, "learning_rate": 3.357859006761771e-05, "loss": 0.0356, "num_input_tokens_seen": 59503344, "step": 27565 }, { "epoch": 4.497553017944535, "grad_norm": 0.21388722956180573, "learning_rate": 3.357190392058433e-05, "loss": 0.006, "num_input_tokens_seen": 59514896, "step": 27570 }, { "epoch": 4.49836867862969, "grad_norm": 0.39768272638320923, "learning_rate": 3.356521707866893e-05, "loss": 0.2554, "num_input_tokens_seen": 59527408, "step": 27575 }, { "epoch": 4.499184339314845, "grad_norm": 3.3803622722625732, "learning_rate": 3.3558529542413574e-05, "loss": 0.1233, "num_input_tokens_seen": 59537520, "step": 27580 }, { "epoch": 4.5, "grad_norm": 1.0554314851760864, "learning_rate": 3.3551841312360386e-05, "loss": 0.3003, "num_input_tokens_seen": 59549072, "step": 27585 }, { "epoch": 4.5, "eval_loss": 0.15887264907360077, "eval_runtime": 132.9408, "eval_samples_per_second": 20.498, "eval_steps_per_second": 5.13, "num_input_tokens_seen": 59549072, "step": 27585 }, { "epoch": 4.500815660685155, "grad_norm": 0.08097808808088303, "learning_rate": 3.354515238905155e-05, "loss": 0.0688, "num_input_tokens_seen": 59559824, "step": 27590 }, { "epoch": 4.50163132137031, "grad_norm": 0.2921116352081299, "learning_rate": 3.35384627730293e-05, "loss": 0.1252, "num_input_tokens_seen": 59569776, "step": 27595 }, { "epoch": 4.502446982055465, "grad_norm": 4.903281211853027, "learning_rate": 3.353177246483594e-05, "loss": 0.1634, "num_input_tokens_seen": 59579184, "step": 27600 }, { "epoch": 4.50326264274062, "grad_norm": 0.1140977144241333, "learning_rate": 3.352508146501381e-05, "loss": 0.0486, "num_input_tokens_seen": 59590416, "step": 27605 }, { "epoch": 4.504078303425775, "grad_norm": 0.10739248991012573, "learning_rate": 3.3518389774105326e-05, "loss": 0.0719, "num_input_tokens_seen": 59601136, "step": 27610 }, { "epoch": 4.50489396411093, "grad_norm": 2.9648189544677734, "learning_rate": 3.351169739265294e-05, "loss": 0.2624, "num_input_tokens_seen": 59612816, "step": 27615 }, { "epoch": 4.505709624796085, "grad_norm": 0.13738565146923065, "learning_rate": 3.350500432119917e-05, "loss": 0.0152, "num_input_tokens_seen": 59624080, "step": 27620 }, { "epoch": 4.506525285481239, "grad_norm": 3.4406023025512695, "learning_rate": 3.3498310560286604e-05, "loss": 0.0309, "num_input_tokens_seen": 59634608, "step": 27625 }, { "epoch": 4.507340946166395, "grad_norm": 0.14925315976142883, "learning_rate": 3.349161611045786e-05, "loss": 0.014, "num_input_tokens_seen": 59645200, "step": 27630 }, { "epoch": 4.50815660685155, "grad_norm": 2.3995683193206787, "learning_rate": 3.348492097225563e-05, "loss": 0.1583, "num_input_tokens_seen": 59654736, "step": 27635 }, { "epoch": 4.508972267536705, "grad_norm": 0.0835544764995575, "learning_rate": 3.347822514622265e-05, "loss": 0.1303, "num_input_tokens_seen": 59664624, "step": 27640 }, { "epoch": 4.50978792822186, "grad_norm": 0.06443583965301514, "learning_rate": 3.347152863290173e-05, "loss": 0.2365, "num_input_tokens_seen": 59674480, "step": 27645 }, { "epoch": 4.510603588907014, "grad_norm": 0.34996533393859863, "learning_rate": 3.346483143283571e-05, "loss": 0.0096, "num_input_tokens_seen": 59685392, "step": 27650 }, { "epoch": 4.511419249592169, "grad_norm": 1.139493703842163, "learning_rate": 3.3458133546567506e-05, "loss": 0.014, "num_input_tokens_seen": 59695280, "step": 27655 }, { "epoch": 4.512234910277325, "grad_norm": 0.2241903841495514, "learning_rate": 3.345143497464007e-05, "loss": 0.1069, "num_input_tokens_seen": 59705872, "step": 27660 }, { "epoch": 4.51305057096248, "grad_norm": 0.07191357016563416, "learning_rate": 3.344473571759645e-05, "loss": 0.0936, "num_input_tokens_seen": 59717936, "step": 27665 }, { "epoch": 4.513866231647635, "grad_norm": 6.5147199630737305, "learning_rate": 3.343803577597969e-05, "loss": 0.0942, "num_input_tokens_seen": 59728144, "step": 27670 }, { "epoch": 4.514681892332789, "grad_norm": 4.543582439422607, "learning_rate": 3.343133515033295e-05, "loss": 0.1052, "num_input_tokens_seen": 59739888, "step": 27675 }, { "epoch": 4.515497553017944, "grad_norm": 0.026386456564068794, "learning_rate": 3.342463384119939e-05, "loss": 0.0073, "num_input_tokens_seen": 59750320, "step": 27680 }, { "epoch": 4.5163132137031, "grad_norm": 0.4252329170703888, "learning_rate": 3.3417931849122275e-05, "loss": 0.0351, "num_input_tokens_seen": 59761520, "step": 27685 }, { "epoch": 4.517128874388255, "grad_norm": 0.2085205316543579, "learning_rate": 3.341122917464489e-05, "loss": 0.1756, "num_input_tokens_seen": 59771632, "step": 27690 }, { "epoch": 4.5179445350734095, "grad_norm": 2.8549644947052, "learning_rate": 3.340452581831057e-05, "loss": 0.0695, "num_input_tokens_seen": 59781744, "step": 27695 }, { "epoch": 4.518760195758564, "grad_norm": 4.925718307495117, "learning_rate": 3.3397821780662764e-05, "loss": 0.115, "num_input_tokens_seen": 59791760, "step": 27700 }, { "epoch": 4.519575856443719, "grad_norm": 0.08845800161361694, "learning_rate": 3.3391117062244913e-05, "loss": 0.1119, "num_input_tokens_seen": 59801872, "step": 27705 }, { "epoch": 4.520391517128875, "grad_norm": 0.1270817369222641, "learning_rate": 3.338441166360054e-05, "loss": 0.0802, "num_input_tokens_seen": 59811152, "step": 27710 }, { "epoch": 4.52120717781403, "grad_norm": 0.10203168541193008, "learning_rate": 3.33777055852732e-05, "loss": 0.0859, "num_input_tokens_seen": 59821264, "step": 27715 }, { "epoch": 4.5220228384991845, "grad_norm": 8.421981811523438, "learning_rate": 3.3370998827806543e-05, "loss": 0.1104, "num_input_tokens_seen": 59832336, "step": 27720 }, { "epoch": 4.522838499184339, "grad_norm": 0.15523754060268402, "learning_rate": 3.336429139174425e-05, "loss": 0.0492, "num_input_tokens_seen": 59842288, "step": 27725 }, { "epoch": 4.523654159869494, "grad_norm": 3.196800947189331, "learning_rate": 3.335758327763006e-05, "loss": 0.124, "num_input_tokens_seen": 59852912, "step": 27730 }, { "epoch": 4.524469820554649, "grad_norm": 0.06876061856746674, "learning_rate": 3.335087448600776e-05, "loss": 0.0119, "num_input_tokens_seen": 59863280, "step": 27735 }, { "epoch": 4.525285481239804, "grad_norm": 0.10988235473632812, "learning_rate": 3.33441650174212e-05, "loss": 0.0783, "num_input_tokens_seen": 59874288, "step": 27740 }, { "epoch": 4.5261011419249595, "grad_norm": 7.433391571044922, "learning_rate": 3.3337454872414294e-05, "loss": 0.0836, "num_input_tokens_seen": 59886000, "step": 27745 }, { "epoch": 4.526916802610114, "grad_norm": 4.332917213439941, "learning_rate": 3.333074405153098e-05, "loss": 0.0186, "num_input_tokens_seen": 59896432, "step": 27750 }, { "epoch": 4.527732463295269, "grad_norm": 7.137960433959961, "learning_rate": 3.332403255531529e-05, "loss": 0.2452, "num_input_tokens_seen": 59906704, "step": 27755 }, { "epoch": 4.528548123980424, "grad_norm": 3.40251088142395, "learning_rate": 3.331732038431129e-05, "loss": 0.0759, "num_input_tokens_seen": 59917392, "step": 27760 }, { "epoch": 4.529363784665579, "grad_norm": 0.0433967299759388, "learning_rate": 3.3310607539063096e-05, "loss": 0.2353, "num_input_tokens_seen": 59927984, "step": 27765 }, { "epoch": 4.5301794453507345, "grad_norm": 0.17781051993370056, "learning_rate": 3.3303894020114886e-05, "loss": 0.0743, "num_input_tokens_seen": 59938160, "step": 27770 }, { "epoch": 4.530995106035889, "grad_norm": 0.1469729095697403, "learning_rate": 3.329717982801089e-05, "loss": 0.0073, "num_input_tokens_seen": 59950000, "step": 27775 }, { "epoch": 4.531810766721044, "grad_norm": 0.36192265152931213, "learning_rate": 3.32904649632954e-05, "loss": 0.0058, "num_input_tokens_seen": 59960720, "step": 27780 }, { "epoch": 4.532626427406199, "grad_norm": 0.03430166840553284, "learning_rate": 3.328374942651275e-05, "loss": 0.004, "num_input_tokens_seen": 59970864, "step": 27785 }, { "epoch": 4.533442088091354, "grad_norm": 4.052310466766357, "learning_rate": 3.3277033218207346e-05, "loss": 0.1539, "num_input_tokens_seen": 59982256, "step": 27790 }, { "epoch": 4.5342577487765094, "grad_norm": 0.07250082492828369, "learning_rate": 3.327031633892363e-05, "loss": 0.1253, "num_input_tokens_seen": 59993584, "step": 27795 }, { "epoch": 4.535073409461664, "grad_norm": 0.08707235008478165, "learning_rate": 3.32635987892061e-05, "loss": 0.0591, "num_input_tokens_seen": 60004720, "step": 27800 }, { "epoch": 4.535889070146819, "grad_norm": 0.17127718031406403, "learning_rate": 3.3256880569599335e-05, "loss": 0.1543, "num_input_tokens_seen": 60015120, "step": 27805 }, { "epoch": 4.536704730831974, "grad_norm": 0.04245676100254059, "learning_rate": 3.325016168064794e-05, "loss": 0.0066, "num_input_tokens_seen": 60024752, "step": 27810 }, { "epoch": 4.537520391517129, "grad_norm": 0.213352769613266, "learning_rate": 3.324344212289657e-05, "loss": 0.0901, "num_input_tokens_seen": 60037072, "step": 27815 }, { "epoch": 4.5383360522022835, "grad_norm": 0.06449801474809647, "learning_rate": 3.3236721896889954e-05, "loss": 0.0429, "num_input_tokens_seen": 60048144, "step": 27820 }, { "epoch": 4.539151712887438, "grad_norm": 0.10689418762922287, "learning_rate": 3.323000100317287e-05, "loss": 0.1278, "num_input_tokens_seen": 60058544, "step": 27825 }, { "epoch": 4.539967373572594, "grad_norm": 5.112598419189453, "learning_rate": 3.3223279442290146e-05, "loss": 0.019, "num_input_tokens_seen": 60068688, "step": 27830 }, { "epoch": 4.540783034257749, "grad_norm": 0.2425864189863205, "learning_rate": 3.321655721478667e-05, "loss": 0.0157, "num_input_tokens_seen": 60079664, "step": 27835 }, { "epoch": 4.541598694942904, "grad_norm": 6.997628688812256, "learning_rate": 3.320983432120737e-05, "loss": 0.1535, "num_input_tokens_seen": 60089296, "step": 27840 }, { "epoch": 4.5424143556280585, "grad_norm": 0.0939311757683754, "learning_rate": 3.320311076209724e-05, "loss": 0.1832, "num_input_tokens_seen": 60099568, "step": 27845 }, { "epoch": 4.543230016313213, "grad_norm": 0.20239602029323578, "learning_rate": 3.3196386538001346e-05, "loss": 0.0991, "num_input_tokens_seen": 60110160, "step": 27850 }, { "epoch": 4.544045676998369, "grad_norm": 12.30156421661377, "learning_rate": 3.3189661649464754e-05, "loss": 0.0411, "num_input_tokens_seen": 60120880, "step": 27855 }, { "epoch": 4.544861337683524, "grad_norm": 0.08484846353530884, "learning_rate": 3.318293609703264e-05, "loss": 0.0251, "num_input_tokens_seen": 60132336, "step": 27860 }, { "epoch": 4.545676998368679, "grad_norm": 1.118166446685791, "learning_rate": 3.3176209881250206e-05, "loss": 0.0092, "num_input_tokens_seen": 60142608, "step": 27865 }, { "epoch": 4.5464926590538335, "grad_norm": 2.9867448806762695, "learning_rate": 3.3169483002662714e-05, "loss": 0.0851, "num_input_tokens_seen": 60153744, "step": 27870 }, { "epoch": 4.547308319738988, "grad_norm": 0.055605363100767136, "learning_rate": 3.316275546181548e-05, "loss": 0.0391, "num_input_tokens_seen": 60166288, "step": 27875 }, { "epoch": 4.548123980424144, "grad_norm": 3.6087186336517334, "learning_rate": 3.315602725925387e-05, "loss": 0.2699, "num_input_tokens_seen": 60176208, "step": 27880 }, { "epoch": 4.548939641109299, "grad_norm": 4.053160190582275, "learning_rate": 3.314929839552331e-05, "loss": 0.1586, "num_input_tokens_seen": 60187088, "step": 27885 }, { "epoch": 4.549755301794454, "grad_norm": 0.20458002388477325, "learning_rate": 3.314256887116927e-05, "loss": 0.2275, "num_input_tokens_seen": 60197488, "step": 27890 }, { "epoch": 4.5505709624796085, "grad_norm": 0.27609655261039734, "learning_rate": 3.313583868673728e-05, "loss": 0.0312, "num_input_tokens_seen": 60208816, "step": 27895 }, { "epoch": 4.551386623164763, "grad_norm": 0.4201211631298065, "learning_rate": 3.312910784277293e-05, "loss": 0.0431, "num_input_tokens_seen": 60219728, "step": 27900 }, { "epoch": 4.552202283849918, "grad_norm": 4.90434455871582, "learning_rate": 3.312237633982185e-05, "loss": 0.0687, "num_input_tokens_seen": 60230288, "step": 27905 }, { "epoch": 4.553017944535073, "grad_norm": 0.03794803097844124, "learning_rate": 3.3115644178429725e-05, "loss": 0.0302, "num_input_tokens_seen": 60242384, "step": 27910 }, { "epoch": 4.553833605220229, "grad_norm": 0.02601797692477703, "learning_rate": 3.310891135914231e-05, "loss": 0.0415, "num_input_tokens_seen": 60253360, "step": 27915 }, { "epoch": 4.554649265905383, "grad_norm": 0.05846745893359184, "learning_rate": 3.31021778825054e-05, "loss": 0.1983, "num_input_tokens_seen": 60263440, "step": 27920 }, { "epoch": 4.555464926590538, "grad_norm": 7.333343505859375, "learning_rate": 3.309544374906484e-05, "loss": 0.0208, "num_input_tokens_seen": 60273392, "step": 27925 }, { "epoch": 4.556280587275693, "grad_norm": 0.13697630167007446, "learning_rate": 3.308870895936652e-05, "loss": 0.19, "num_input_tokens_seen": 60285424, "step": 27930 }, { "epoch": 4.557096247960848, "grad_norm": 5.5662102699279785, "learning_rate": 3.308197351395643e-05, "loss": 0.078, "num_input_tokens_seen": 60295248, "step": 27935 }, { "epoch": 4.557911908646004, "grad_norm": 0.052226513624191284, "learning_rate": 3.3075237413380545e-05, "loss": 0.1936, "num_input_tokens_seen": 60306320, "step": 27940 }, { "epoch": 4.558727569331158, "grad_norm": 0.09828644245862961, "learning_rate": 3.306850065818494e-05, "loss": 0.0806, "num_input_tokens_seen": 60315600, "step": 27945 }, { "epoch": 4.559543230016313, "grad_norm": 0.05902625992894173, "learning_rate": 3.3061763248915744e-05, "loss": 0.2177, "num_input_tokens_seen": 60326608, "step": 27950 }, { "epoch": 4.560358890701468, "grad_norm": 0.0987459197640419, "learning_rate": 3.305502518611911e-05, "loss": 0.3177, "num_input_tokens_seen": 60336944, "step": 27955 }, { "epoch": 4.561174551386623, "grad_norm": 0.16019931435585022, "learning_rate": 3.304828647034126e-05, "loss": 0.1571, "num_input_tokens_seen": 60348656, "step": 27960 }, { "epoch": 4.561990212071779, "grad_norm": 5.781607627868652, "learning_rate": 3.304154710212847e-05, "loss": 0.0661, "num_input_tokens_seen": 60359568, "step": 27965 }, { "epoch": 4.562805872756933, "grad_norm": 10.70506763458252, "learning_rate": 3.303480708202708e-05, "loss": 0.1256, "num_input_tokens_seen": 60370192, "step": 27970 }, { "epoch": 4.563621533442088, "grad_norm": 0.08649314939975739, "learning_rate": 3.3028066410583456e-05, "loss": 0.0207, "num_input_tokens_seen": 60381136, "step": 27975 }, { "epoch": 4.564437194127243, "grad_norm": 0.43787726759910583, "learning_rate": 3.3021325088344036e-05, "loss": 0.1025, "num_input_tokens_seen": 60392304, "step": 27980 }, { "epoch": 4.565252854812398, "grad_norm": 0.07810968160629272, "learning_rate": 3.3014583115855304e-05, "loss": 0.2413, "num_input_tokens_seen": 60402544, "step": 27985 }, { "epoch": 4.566068515497553, "grad_norm": 0.13535502552986145, "learning_rate": 3.3007840493663794e-05, "loss": 0.021, "num_input_tokens_seen": 60413488, "step": 27990 }, { "epoch": 4.566884176182708, "grad_norm": 0.15852278470993042, "learning_rate": 3.30010972223161e-05, "loss": 0.0677, "num_input_tokens_seen": 60423696, "step": 27995 }, { "epoch": 4.567699836867863, "grad_norm": 0.3404339551925659, "learning_rate": 3.2994353302358875e-05, "loss": 0.0126, "num_input_tokens_seen": 60434576, "step": 28000 }, { "epoch": 4.568515497553018, "grad_norm": 17.157421112060547, "learning_rate": 3.298760873433881e-05, "loss": 0.1839, "num_input_tokens_seen": 60443856, "step": 28005 }, { "epoch": 4.569331158238173, "grad_norm": 3.211148262023926, "learning_rate": 3.298086351880265e-05, "loss": 0.1236, "num_input_tokens_seen": 60454864, "step": 28010 }, { "epoch": 4.570146818923328, "grad_norm": 5.151402473449707, "learning_rate": 3.2974117656297194e-05, "loss": 0.131, "num_input_tokens_seen": 60465456, "step": 28015 }, { "epoch": 4.5709624796084825, "grad_norm": 0.16944673657417297, "learning_rate": 3.2967371147369306e-05, "loss": 0.1117, "num_input_tokens_seen": 60476816, "step": 28020 }, { "epoch": 4.571778140293638, "grad_norm": 0.49499160051345825, "learning_rate": 3.296062399256587e-05, "loss": 0.103, "num_input_tokens_seen": 60488784, "step": 28025 }, { "epoch": 4.572593800978793, "grad_norm": 0.4088912308216095, "learning_rate": 3.295387619243389e-05, "loss": 0.0229, "num_input_tokens_seen": 60500656, "step": 28030 }, { "epoch": 4.573409461663948, "grad_norm": 0.9882709383964539, "learning_rate": 3.294712774752033e-05, "loss": 0.0098, "num_input_tokens_seen": 60511664, "step": 28035 }, { "epoch": 4.574225122349103, "grad_norm": 7.481235027313232, "learning_rate": 3.2940378658372276e-05, "loss": 0.1471, "num_input_tokens_seen": 60522000, "step": 28040 }, { "epoch": 4.575040783034257, "grad_norm": 0.19363392889499664, "learning_rate": 3.293362892553684e-05, "loss": 0.0344, "num_input_tokens_seen": 60532688, "step": 28045 }, { "epoch": 4.575856443719413, "grad_norm": 0.08438599854707718, "learning_rate": 3.292687854956119e-05, "loss": 0.0973, "num_input_tokens_seen": 60543152, "step": 28050 }, { "epoch": 4.576672104404568, "grad_norm": 0.08524743467569351, "learning_rate": 3.292012753099254e-05, "loss": 0.0623, "num_input_tokens_seen": 60552272, "step": 28055 }, { "epoch": 4.577487765089723, "grad_norm": 0.06825464218854904, "learning_rate": 3.2913375870378165e-05, "loss": 0.1963, "num_input_tokens_seen": 60563888, "step": 28060 }, { "epoch": 4.578303425774878, "grad_norm": 0.15410315990447998, "learning_rate": 3.2906623568265396e-05, "loss": 0.3716, "num_input_tokens_seen": 60574576, "step": 28065 }, { "epoch": 4.579119086460032, "grad_norm": 12.65908145904541, "learning_rate": 3.289987062520159e-05, "loss": 0.0591, "num_input_tokens_seen": 60584848, "step": 28070 }, { "epoch": 4.579934747145187, "grad_norm": 2.266758918762207, "learning_rate": 3.28931170417342e-05, "loss": 0.197, "num_input_tokens_seen": 60595312, "step": 28075 }, { "epoch": 4.580750407830343, "grad_norm": 0.5498905181884766, "learning_rate": 3.288636281841069e-05, "loss": 0.0149, "num_input_tokens_seen": 60604848, "step": 28080 }, { "epoch": 4.581566068515498, "grad_norm": 0.09909345954656601, "learning_rate": 3.287960795577859e-05, "loss": 0.0449, "num_input_tokens_seen": 60614640, "step": 28085 }, { "epoch": 4.582381729200653, "grad_norm": 0.5386938452720642, "learning_rate": 3.2872852454385495e-05, "loss": 0.0078, "num_input_tokens_seen": 60624912, "step": 28090 }, { "epoch": 4.583197389885807, "grad_norm": 0.22791962325572968, "learning_rate": 3.2866096314779035e-05, "loss": 0.0075, "num_input_tokens_seen": 60637968, "step": 28095 }, { "epoch": 4.584013050570962, "grad_norm": 0.03843766823410988, "learning_rate": 3.285933953750689e-05, "loss": 0.1, "num_input_tokens_seen": 60648816, "step": 28100 }, { "epoch": 4.584828711256117, "grad_norm": 2.0750088691711426, "learning_rate": 3.28525821231168e-05, "loss": 0.0982, "num_input_tokens_seen": 60659888, "step": 28105 }, { "epoch": 4.585644371941273, "grad_norm": 4.184640407562256, "learning_rate": 3.284582407215657e-05, "loss": 0.1045, "num_input_tokens_seen": 60671472, "step": 28110 }, { "epoch": 4.5864600326264275, "grad_norm": 7.943884372711182, "learning_rate": 3.283906538517403e-05, "loss": 0.119, "num_input_tokens_seen": 60683696, "step": 28115 }, { "epoch": 4.587275693311582, "grad_norm": 5.281688690185547, "learning_rate": 3.283230606271707e-05, "loss": 0.1073, "num_input_tokens_seen": 60694832, "step": 28120 }, { "epoch": 4.588091353996737, "grad_norm": 0.15140409767627716, "learning_rate": 3.2825546105333634e-05, "loss": 0.1108, "num_input_tokens_seen": 60706192, "step": 28125 }, { "epoch": 4.588907014681892, "grad_norm": 0.09975605458021164, "learning_rate": 3.281878551357174e-05, "loss": 0.0073, "num_input_tokens_seen": 60716880, "step": 28130 }, { "epoch": 4.589722675367048, "grad_norm": 4.524142265319824, "learning_rate": 3.281202428797941e-05, "loss": 0.2411, "num_input_tokens_seen": 60727792, "step": 28135 }, { "epoch": 4.5905383360522025, "grad_norm": 5.362515449523926, "learning_rate": 3.2805262429104755e-05, "loss": 0.2446, "num_input_tokens_seen": 60738800, "step": 28140 }, { "epoch": 4.591353996737357, "grad_norm": 17.26320457458496, "learning_rate": 3.279849993749593e-05, "loss": 0.1968, "num_input_tokens_seen": 60748336, "step": 28145 }, { "epoch": 4.592169657422512, "grad_norm": 0.10141994804143906, "learning_rate": 3.279173681370112e-05, "loss": 0.1395, "num_input_tokens_seen": 60759728, "step": 28150 }, { "epoch": 4.592985318107667, "grad_norm": 0.8254680037498474, "learning_rate": 3.27849730582686e-05, "loss": 0.0778, "num_input_tokens_seen": 60771184, "step": 28155 }, { "epoch": 4.593800978792823, "grad_norm": 0.06635220348834991, "learning_rate": 3.2778208671746654e-05, "loss": 0.014, "num_input_tokens_seen": 60782864, "step": 28160 }, { "epoch": 4.5946166394779775, "grad_norm": 13.53496265411377, "learning_rate": 3.277144365468365e-05, "loss": 0.2678, "num_input_tokens_seen": 60794800, "step": 28165 }, { "epoch": 4.595432300163132, "grad_norm": 0.04601912200450897, "learning_rate": 3.2764678007627994e-05, "loss": 0.0065, "num_input_tokens_seen": 60806224, "step": 28170 }, { "epoch": 4.596247960848287, "grad_norm": 0.028869764879345894, "learning_rate": 3.275791173112814e-05, "loss": 0.1233, "num_input_tokens_seen": 60817232, "step": 28175 }, { "epoch": 4.597063621533442, "grad_norm": 0.49962377548217773, "learning_rate": 3.2751144825732595e-05, "loss": 0.1578, "num_input_tokens_seen": 60828400, "step": 28180 }, { "epoch": 4.597879282218597, "grad_norm": 0.15380997955799103, "learning_rate": 3.274437729198992e-05, "loss": 0.2603, "num_input_tokens_seen": 60840080, "step": 28185 }, { "epoch": 4.598694942903752, "grad_norm": 0.30230018496513367, "learning_rate": 3.273760913044873e-05, "loss": 0.0198, "num_input_tokens_seen": 60851088, "step": 28190 }, { "epoch": 4.599510603588907, "grad_norm": 0.15211479365825653, "learning_rate": 3.273084034165769e-05, "loss": 0.12, "num_input_tokens_seen": 60862096, "step": 28195 }, { "epoch": 4.600326264274062, "grad_norm": 0.040675919502973557, "learning_rate": 3.2724070926165495e-05, "loss": 0.2047, "num_input_tokens_seen": 60872848, "step": 28200 }, { "epoch": 4.601141924959217, "grad_norm": 0.2294074147939682, "learning_rate": 3.271730088452093e-05, "loss": 0.0039, "num_input_tokens_seen": 60883760, "step": 28205 }, { "epoch": 4.601957585644372, "grad_norm": 3.3079845905303955, "learning_rate": 3.2710530217272794e-05, "loss": 0.1291, "num_input_tokens_seen": 60893456, "step": 28210 }, { "epoch": 4.602773246329527, "grad_norm": 0.5316904187202454, "learning_rate": 3.270375892496995e-05, "loss": 0.0295, "num_input_tokens_seen": 60905136, "step": 28215 }, { "epoch": 4.603588907014682, "grad_norm": 9.550395011901855, "learning_rate": 3.2696987008161325e-05, "loss": 0.1919, "num_input_tokens_seen": 60915088, "step": 28220 }, { "epoch": 4.604404567699837, "grad_norm": 0.0909288227558136, "learning_rate": 3.269021446739588e-05, "loss": 0.0407, "num_input_tokens_seen": 60926704, "step": 28225 }, { "epoch": 4.605220228384992, "grad_norm": 3.365037202835083, "learning_rate": 3.268344130322262e-05, "loss": 0.0203, "num_input_tokens_seen": 60937808, "step": 28230 }, { "epoch": 4.606035889070147, "grad_norm": 2.849531412124634, "learning_rate": 3.2676667516190634e-05, "loss": 0.0208, "num_input_tokens_seen": 60948112, "step": 28235 }, { "epoch": 4.6068515497553015, "grad_norm": 6.55972146987915, "learning_rate": 3.266989310684902e-05, "loss": 0.3289, "num_input_tokens_seen": 60959312, "step": 28240 }, { "epoch": 4.607667210440457, "grad_norm": 1.8439161777496338, "learning_rate": 3.266311807574697e-05, "loss": 0.0294, "num_input_tokens_seen": 60970256, "step": 28245 }, { "epoch": 4.608482871125612, "grad_norm": 0.03487294167280197, "learning_rate": 3.265634242343367e-05, "loss": 0.0056, "num_input_tokens_seen": 60980752, "step": 28250 }, { "epoch": 4.609298531810767, "grad_norm": 0.035358358174562454, "learning_rate": 3.264956615045841e-05, "loss": 0.0259, "num_input_tokens_seen": 60991344, "step": 28255 }, { "epoch": 4.610114192495922, "grad_norm": 0.0793282613158226, "learning_rate": 3.26427892573705e-05, "loss": 0.0044, "num_input_tokens_seen": 61001264, "step": 28260 }, { "epoch": 4.6109298531810765, "grad_norm": 0.04088451713323593, "learning_rate": 3.263601174471932e-05, "loss": 0.0042, "num_input_tokens_seen": 61011408, "step": 28265 }, { "epoch": 4.611745513866231, "grad_norm": 2.698221206665039, "learning_rate": 3.262923361305429e-05, "loss": 0.0359, "num_input_tokens_seen": 61020944, "step": 28270 }, { "epoch": 4.612561174551386, "grad_norm": 2.7350680828094482, "learning_rate": 3.262245486292486e-05, "loss": 0.075, "num_input_tokens_seen": 61031248, "step": 28275 }, { "epoch": 4.613376835236542, "grad_norm": 3.3778626918792725, "learning_rate": 3.261567549488056e-05, "loss": 0.166, "num_input_tokens_seen": 61042320, "step": 28280 }, { "epoch": 4.614192495921697, "grad_norm": 7.231352806091309, "learning_rate": 3.260889550947098e-05, "loss": 0.2126, "num_input_tokens_seen": 61053520, "step": 28285 }, { "epoch": 4.6150081566068515, "grad_norm": 0.32570576667785645, "learning_rate": 3.260211490724571e-05, "loss": 0.0126, "num_input_tokens_seen": 61062960, "step": 28290 }, { "epoch": 4.615823817292006, "grad_norm": 0.09390170872211456, "learning_rate": 3.259533368875444e-05, "loss": 0.0041, "num_input_tokens_seen": 61073392, "step": 28295 }, { "epoch": 4.616639477977161, "grad_norm": 0.01974472776055336, "learning_rate": 3.2588551854546876e-05, "loss": 0.0355, "num_input_tokens_seen": 61083984, "step": 28300 }, { "epoch": 4.617455138662317, "grad_norm": 8.095100402832031, "learning_rate": 3.2581769405172805e-05, "loss": 0.0251, "num_input_tokens_seen": 61095280, "step": 28305 }, { "epoch": 4.618270799347472, "grad_norm": 0.06068865582346916, "learning_rate": 3.2574986341182026e-05, "loss": 0.0151, "num_input_tokens_seen": 61107024, "step": 28310 }, { "epoch": 4.6190864600326265, "grad_norm": 0.045194510370492935, "learning_rate": 3.256820266312442e-05, "loss": 0.0635, "num_input_tokens_seen": 61117552, "step": 28315 }, { "epoch": 4.619902120717781, "grad_norm": 3.254822254180908, "learning_rate": 3.256141837154991e-05, "loss": 0.0813, "num_input_tokens_seen": 61129040, "step": 28320 }, { "epoch": 4.620717781402936, "grad_norm": 0.08180870115756989, "learning_rate": 3.255463346700846e-05, "loss": 0.0587, "num_input_tokens_seen": 61139664, "step": 28325 }, { "epoch": 4.621533442088092, "grad_norm": 8.497345924377441, "learning_rate": 3.254784795005008e-05, "loss": 0.3768, "num_input_tokens_seen": 61151728, "step": 28330 }, { "epoch": 4.622349102773247, "grad_norm": 17.47870445251465, "learning_rate": 3.254106182122486e-05, "loss": 0.1989, "num_input_tokens_seen": 61162448, "step": 28335 }, { "epoch": 4.623164763458401, "grad_norm": 3.7979838848114014, "learning_rate": 3.2534275081082896e-05, "loss": 0.1276, "num_input_tokens_seen": 61173136, "step": 28340 }, { "epoch": 4.623980424143556, "grad_norm": 8.182798385620117, "learning_rate": 3.252748773017437e-05, "loss": 0.2062, "num_input_tokens_seen": 61183120, "step": 28345 }, { "epoch": 4.624796084828711, "grad_norm": 0.25087881088256836, "learning_rate": 3.2520699769049496e-05, "loss": 0.0131, "num_input_tokens_seen": 61194000, "step": 28350 }, { "epoch": 4.625611745513866, "grad_norm": 8.378771781921387, "learning_rate": 3.251391119825854e-05, "loss": 0.0302, "num_input_tokens_seen": 61204688, "step": 28355 }, { "epoch": 4.626427406199021, "grad_norm": 3.5833957195281982, "learning_rate": 3.2507122018351815e-05, "loss": 0.191, "num_input_tokens_seen": 61215376, "step": 28360 }, { "epoch": 4.627243066884176, "grad_norm": 1.5312604904174805, "learning_rate": 3.250033222987969e-05, "loss": 0.0091, "num_input_tokens_seen": 61226256, "step": 28365 }, { "epoch": 4.628058727569331, "grad_norm": 0.10052099823951721, "learning_rate": 3.2493541833392575e-05, "loss": 0.0093, "num_input_tokens_seen": 61238192, "step": 28370 }, { "epoch": 4.628874388254486, "grad_norm": 0.09227970242500305, "learning_rate": 3.2486750829440946e-05, "loss": 0.1166, "num_input_tokens_seen": 61248944, "step": 28375 }, { "epoch": 4.629690048939641, "grad_norm": 0.04053603112697601, "learning_rate": 3.2479959218575295e-05, "loss": 0.0964, "num_input_tokens_seen": 61260752, "step": 28380 }, { "epoch": 4.630505709624796, "grad_norm": 2.4717376232147217, "learning_rate": 3.24731670013462e-05, "loss": 0.3137, "num_input_tokens_seen": 61271984, "step": 28385 }, { "epoch": 4.631321370309951, "grad_norm": 0.34996065497398376, "learning_rate": 3.246637417830427e-05, "loss": 0.0561, "num_input_tokens_seen": 61283248, "step": 28390 }, { "epoch": 4.632137030995106, "grad_norm": 0.12414489686489105, "learning_rate": 3.245958075000017e-05, "loss": 0.0059, "num_input_tokens_seen": 61294512, "step": 28395 }, { "epoch": 4.632952691680261, "grad_norm": 5.165646553039551, "learning_rate": 3.24527867169846e-05, "loss": 0.0387, "num_input_tokens_seen": 61304944, "step": 28400 }, { "epoch": 4.633768352365416, "grad_norm": 0.13805988430976868, "learning_rate": 3.244599207980833e-05, "loss": 0.0068, "num_input_tokens_seen": 61315408, "step": 28405 }, { "epoch": 4.634584013050571, "grad_norm": 0.4154154658317566, "learning_rate": 3.243919683902216e-05, "loss": 0.2016, "num_input_tokens_seen": 61326000, "step": 28410 }, { "epoch": 4.635399673735726, "grad_norm": 0.060973986983299255, "learning_rate": 3.2432400995176934e-05, "loss": 0.1803, "num_input_tokens_seen": 61335984, "step": 28415 }, { "epoch": 4.636215334420881, "grad_norm": 0.16784125566482544, "learning_rate": 3.242560454882359e-05, "loss": 0.0277, "num_input_tokens_seen": 61347344, "step": 28420 }, { "epoch": 4.637030995106036, "grad_norm": 3.4738075733184814, "learning_rate": 3.241880750051306e-05, "loss": 0.2198, "num_input_tokens_seen": 61358352, "step": 28425 }, { "epoch": 4.637846655791191, "grad_norm": 0.04191743582487106, "learning_rate": 3.241200985079634e-05, "loss": 0.0093, "num_input_tokens_seen": 61369168, "step": 28430 }, { "epoch": 4.638662316476346, "grad_norm": 3.6386828422546387, "learning_rate": 3.2405211600224503e-05, "loss": 0.2868, "num_input_tokens_seen": 61379984, "step": 28435 }, { "epoch": 4.6394779771615005, "grad_norm": 0.11262372881174088, "learning_rate": 3.239841274934863e-05, "loss": 0.0778, "num_input_tokens_seen": 61391760, "step": 28440 }, { "epoch": 4.640293637846656, "grad_norm": 0.34785395860671997, "learning_rate": 3.239161329871989e-05, "loss": 0.1569, "num_input_tokens_seen": 61401680, "step": 28445 }, { "epoch": 4.641109298531811, "grad_norm": 0.08060356229543686, "learning_rate": 3.2384813248889475e-05, "loss": 0.0088, "num_input_tokens_seen": 61411696, "step": 28450 }, { "epoch": 4.641924959216966, "grad_norm": 0.478834867477417, "learning_rate": 3.2378012600408625e-05, "loss": 0.0909, "num_input_tokens_seen": 61421968, "step": 28455 }, { "epoch": 4.642740619902121, "grad_norm": 5.574655055999756, "learning_rate": 3.2371211353828636e-05, "loss": 0.0306, "num_input_tokens_seen": 61430960, "step": 28460 }, { "epoch": 4.643556280587275, "grad_norm": 0.6605509519577026, "learning_rate": 3.236440950970085e-05, "loss": 0.3075, "num_input_tokens_seen": 61442032, "step": 28465 }, { "epoch": 4.64437194127243, "grad_norm": 3.7971913814544678, "learning_rate": 3.2357607068576664e-05, "loss": 0.2212, "num_input_tokens_seen": 61453136, "step": 28470 }, { "epoch": 4.645187601957586, "grad_norm": 6.3657941818237305, "learning_rate": 3.2350804031007524e-05, "loss": 0.1452, "num_input_tokens_seen": 61464048, "step": 28475 }, { "epoch": 4.646003262642741, "grad_norm": 0.04483633488416672, "learning_rate": 3.234400039754491e-05, "loss": 0.1186, "num_input_tokens_seen": 61475344, "step": 28480 }, { "epoch": 4.646818923327896, "grad_norm": 0.25604698061943054, "learning_rate": 3.2337196168740356e-05, "loss": 0.0275, "num_input_tokens_seen": 61485168, "step": 28485 }, { "epoch": 4.64763458401305, "grad_norm": 4.546419620513916, "learning_rate": 3.233039134514545e-05, "loss": 0.2129, "num_input_tokens_seen": 61496304, "step": 28490 }, { "epoch": 4.648450244698205, "grad_norm": 6.424332618713379, "learning_rate": 3.2323585927311825e-05, "loss": 0.0382, "num_input_tokens_seen": 61507088, "step": 28495 }, { "epoch": 4.649265905383361, "grad_norm": 2.768878936767578, "learning_rate": 3.231677991579118e-05, "loss": 0.1253, "num_input_tokens_seen": 61518032, "step": 28500 }, { "epoch": 4.650081566068516, "grad_norm": 0.47725772857666016, "learning_rate": 3.230997331113521e-05, "loss": 0.0204, "num_input_tokens_seen": 61529424, "step": 28505 }, { "epoch": 4.650897226753671, "grad_norm": 0.20057564973831177, "learning_rate": 3.230316611389573e-05, "loss": 0.0122, "num_input_tokens_seen": 61540976, "step": 28510 }, { "epoch": 4.651712887438825, "grad_norm": 0.1292143017053604, "learning_rate": 3.229635832462454e-05, "loss": 0.0137, "num_input_tokens_seen": 61552368, "step": 28515 }, { "epoch": 4.65252854812398, "grad_norm": 0.1415448933839798, "learning_rate": 3.228954994387352e-05, "loss": 0.0073, "num_input_tokens_seen": 61563184, "step": 28520 }, { "epoch": 4.653344208809135, "grad_norm": 4.261139392852783, "learning_rate": 3.2282740972194606e-05, "loss": 0.1982, "num_input_tokens_seen": 61572720, "step": 28525 }, { "epoch": 4.654159869494291, "grad_norm": 0.08284708112478256, "learning_rate": 3.2275931410139755e-05, "loss": 0.0975, "num_input_tokens_seen": 61584176, "step": 28530 }, { "epoch": 4.6549755301794455, "grad_norm": 0.12331119179725647, "learning_rate": 3.226912125826098e-05, "loss": 0.0922, "num_input_tokens_seen": 61595152, "step": 28535 }, { "epoch": 4.6557911908646, "grad_norm": 0.21844713389873505, "learning_rate": 3.226231051711035e-05, "loss": 0.083, "num_input_tokens_seen": 61606032, "step": 28540 }, { "epoch": 4.656606851549755, "grad_norm": 0.05081196874380112, "learning_rate": 3.225549918723999e-05, "loss": 0.0321, "num_input_tokens_seen": 61617680, "step": 28545 }, { "epoch": 4.65742251223491, "grad_norm": 0.1778022050857544, "learning_rate": 3.224868726920205e-05, "loss": 0.1595, "num_input_tokens_seen": 61628080, "step": 28550 }, { "epoch": 4.658238172920065, "grad_norm": 0.19769565761089325, "learning_rate": 3.224187476354873e-05, "loss": 0.2647, "num_input_tokens_seen": 61639440, "step": 28555 }, { "epoch": 4.6590538336052205, "grad_norm": 8.99872875213623, "learning_rate": 3.223506167083231e-05, "loss": 0.1861, "num_input_tokens_seen": 61650544, "step": 28560 }, { "epoch": 4.659869494290375, "grad_norm": 0.1628834456205368, "learning_rate": 3.222824799160508e-05, "loss": 0.2301, "num_input_tokens_seen": 61661232, "step": 28565 }, { "epoch": 4.66068515497553, "grad_norm": 1.367659330368042, "learning_rate": 3.222143372641938e-05, "loss": 0.0063, "num_input_tokens_seen": 61671984, "step": 28570 }, { "epoch": 4.661500815660685, "grad_norm": 0.21327966451644897, "learning_rate": 3.2214618875827626e-05, "loss": 0.04, "num_input_tokens_seen": 61682160, "step": 28575 }, { "epoch": 4.66231647634584, "grad_norm": 0.03936144337058067, "learning_rate": 3.220780344038227e-05, "loss": 0.1217, "num_input_tokens_seen": 61692112, "step": 28580 }, { "epoch": 4.6631321370309955, "grad_norm": 0.05543329194188118, "learning_rate": 3.220098742063578e-05, "loss": 0.1951, "num_input_tokens_seen": 61703312, "step": 28585 }, { "epoch": 4.66394779771615, "grad_norm": 2.2213213443756104, "learning_rate": 3.219417081714072e-05, "loss": 0.0089, "num_input_tokens_seen": 61713616, "step": 28590 }, { "epoch": 4.664763458401305, "grad_norm": 0.05598160997033119, "learning_rate": 3.218735363044967e-05, "loss": 0.2321, "num_input_tokens_seen": 61724080, "step": 28595 }, { "epoch": 4.66557911908646, "grad_norm": 4.151381015777588, "learning_rate": 3.218053586111526e-05, "loss": 0.1904, "num_input_tokens_seen": 61735216, "step": 28600 }, { "epoch": 4.666394779771615, "grad_norm": 0.36127719283103943, "learning_rate": 3.217371750969019e-05, "loss": 0.0418, "num_input_tokens_seen": 61746736, "step": 28605 }, { "epoch": 4.6672104404567705, "grad_norm": 4.928699493408203, "learning_rate": 3.2166898576727176e-05, "loss": 0.1783, "num_input_tokens_seen": 61758032, "step": 28610 }, { "epoch": 4.668026101141925, "grad_norm": 2.978797674179077, "learning_rate": 3.2160079062779005e-05, "loss": 0.2182, "num_input_tokens_seen": 61768624, "step": 28615 }, { "epoch": 4.66884176182708, "grad_norm": 1.138555645942688, "learning_rate": 3.215325896839848e-05, "loss": 0.0115, "num_input_tokens_seen": 61779408, "step": 28620 }, { "epoch": 4.669657422512235, "grad_norm": 1.297851800918579, "learning_rate": 3.2146438294138505e-05, "loss": 0.0284, "num_input_tokens_seen": 61790320, "step": 28625 }, { "epoch": 4.67047308319739, "grad_norm": 0.3052097260951996, "learning_rate": 3.2139617040551966e-05, "loss": 0.0303, "num_input_tokens_seen": 61801968, "step": 28630 }, { "epoch": 4.671288743882545, "grad_norm": 0.12733018398284912, "learning_rate": 3.2132795208191853e-05, "loss": 0.18, "num_input_tokens_seen": 61812944, "step": 28635 }, { "epoch": 4.672104404567699, "grad_norm": 6.405884265899658, "learning_rate": 3.212597279761116e-05, "loss": 0.0533, "num_input_tokens_seen": 61824912, "step": 28640 }, { "epoch": 4.672920065252855, "grad_norm": 0.052328865975141525, "learning_rate": 3.211914980936296e-05, "loss": 0.1802, "num_input_tokens_seen": 61835120, "step": 28645 }, { "epoch": 4.67373572593801, "grad_norm": 0.046687010675668716, "learning_rate": 3.2112326244000355e-05, "loss": 0.0724, "num_input_tokens_seen": 61845680, "step": 28650 }, { "epoch": 4.674551386623165, "grad_norm": 3.757732629776001, "learning_rate": 3.2105502102076494e-05, "loss": 0.1822, "num_input_tokens_seen": 61856368, "step": 28655 }, { "epoch": 4.6753670473083195, "grad_norm": 0.09236893057823181, "learning_rate": 3.209867738414459e-05, "loss": 0.0629, "num_input_tokens_seen": 61866448, "step": 28660 }, { "epoch": 4.676182707993474, "grad_norm": 0.2082487940788269, "learning_rate": 3.2091852090757865e-05, "loss": 0.1419, "num_input_tokens_seen": 61877040, "step": 28665 }, { "epoch": 4.67699836867863, "grad_norm": 3.1891257762908936, "learning_rate": 3.208502622246964e-05, "loss": 0.1606, "num_input_tokens_seen": 61887472, "step": 28670 }, { "epoch": 4.677814029363785, "grad_norm": 0.5192645788192749, "learning_rate": 3.207819977983323e-05, "loss": 0.0086, "num_input_tokens_seen": 61898864, "step": 28675 }, { "epoch": 4.67862969004894, "grad_norm": 0.9870857000350952, "learning_rate": 3.207137276340203e-05, "loss": 0.0099, "num_input_tokens_seen": 61910160, "step": 28680 }, { "epoch": 4.6794453507340945, "grad_norm": 3.560532569885254, "learning_rate": 3.206454517372949e-05, "loss": 0.3913, "num_input_tokens_seen": 61920176, "step": 28685 }, { "epoch": 4.680261011419249, "grad_norm": 3.4725193977355957, "learning_rate": 3.205771701136906e-05, "loss": 0.2192, "num_input_tokens_seen": 61930992, "step": 28690 }, { "epoch": 4.681076672104405, "grad_norm": 0.3666292428970337, "learning_rate": 3.205088827687428e-05, "loss": 0.1232, "num_input_tokens_seen": 61941744, "step": 28695 }, { "epoch": 4.68189233278956, "grad_norm": 0.22167155146598816, "learning_rate": 3.204405897079872e-05, "loss": 0.1114, "num_input_tokens_seen": 61951760, "step": 28700 }, { "epoch": 4.682707993474715, "grad_norm": 1.9301999807357788, "learning_rate": 3.2037229093696e-05, "loss": 0.3247, "num_input_tokens_seen": 61961040, "step": 28705 }, { "epoch": 4.6835236541598695, "grad_norm": 0.08697057515382767, "learning_rate": 3.203039864611978e-05, "loss": 0.0152, "num_input_tokens_seen": 61971856, "step": 28710 }, { "epoch": 4.684339314845024, "grad_norm": 0.22499242424964905, "learning_rate": 3.202356762862377e-05, "loss": 0.0114, "num_input_tokens_seen": 61983760, "step": 28715 }, { "epoch": 4.685154975530179, "grad_norm": 0.2075299471616745, "learning_rate": 3.201673604176174e-05, "loss": 0.0102, "num_input_tokens_seen": 61994480, "step": 28720 }, { "epoch": 4.685970636215334, "grad_norm": 3.208085060119629, "learning_rate": 3.2009903886087476e-05, "loss": 0.1886, "num_input_tokens_seen": 62005360, "step": 28725 }, { "epoch": 4.68678629690049, "grad_norm": 0.27816641330718994, "learning_rate": 3.200307116215485e-05, "loss": 0.1239, "num_input_tokens_seen": 62016976, "step": 28730 }, { "epoch": 4.6876019575856445, "grad_norm": 0.21742834150791168, "learning_rate": 3.1996237870517734e-05, "loss": 0.0602, "num_input_tokens_seen": 62028176, "step": 28735 }, { "epoch": 4.688417618270799, "grad_norm": 0.1057346984744072, "learning_rate": 3.198940401173007e-05, "loss": 0.1379, "num_input_tokens_seen": 62038672, "step": 28740 }, { "epoch": 4.689233278955954, "grad_norm": 0.11908625811338425, "learning_rate": 3.198256958634586e-05, "loss": 0.0066, "num_input_tokens_seen": 62049424, "step": 28745 }, { "epoch": 4.690048939641109, "grad_norm": 0.7854670286178589, "learning_rate": 3.197573459491913e-05, "loss": 0.0882, "num_input_tokens_seen": 62060112, "step": 28750 }, { "epoch": 4.690864600326265, "grad_norm": 0.11598500609397888, "learning_rate": 3.1968899038003965e-05, "loss": 0.1178, "num_input_tokens_seen": 62071728, "step": 28755 }, { "epoch": 4.691680261011419, "grad_norm": 0.1409139782190323, "learning_rate": 3.196206291615447e-05, "loss": 0.2613, "num_input_tokens_seen": 62081680, "step": 28760 }, { "epoch": 4.692495921696574, "grad_norm": 3.294092893600464, "learning_rate": 3.195522622992484e-05, "loss": 0.0389, "num_input_tokens_seen": 62091984, "step": 28765 }, { "epoch": 4.693311582381729, "grad_norm": 0.08398202806711197, "learning_rate": 3.1948388979869277e-05, "loss": 0.0969, "num_input_tokens_seen": 62102864, "step": 28770 }, { "epoch": 4.694127243066884, "grad_norm": 2.166158437728882, "learning_rate": 3.194155116654205e-05, "loss": 0.1787, "num_input_tokens_seen": 62114256, "step": 28775 }, { "epoch": 4.69494290375204, "grad_norm": 0.26545998454093933, "learning_rate": 3.193471279049746e-05, "loss": 0.011, "num_input_tokens_seen": 62124816, "step": 28780 }, { "epoch": 4.695758564437194, "grad_norm": 0.15841230750083923, "learning_rate": 3.192787385228987e-05, "loss": 0.0413, "num_input_tokens_seen": 62136368, "step": 28785 }, { "epoch": 4.696574225122349, "grad_norm": 0.11272495239973068, "learning_rate": 3.192103435247368e-05, "loss": 0.2126, "num_input_tokens_seen": 62147024, "step": 28790 }, { "epoch": 4.697389885807504, "grad_norm": 0.5528010129928589, "learning_rate": 3.1914194291603313e-05, "loss": 0.0297, "num_input_tokens_seen": 62156848, "step": 28795 }, { "epoch": 4.698205546492659, "grad_norm": 0.16787166893482208, "learning_rate": 3.190735367023328e-05, "loss": 0.0539, "num_input_tokens_seen": 62168496, "step": 28800 }, { "epoch": 4.699021207177814, "grad_norm": 1.6341058015823364, "learning_rate": 3.1900512488918114e-05, "loss": 0.1337, "num_input_tokens_seen": 62178800, "step": 28805 }, { "epoch": 4.699836867862969, "grad_norm": 0.21060863137245178, "learning_rate": 3.189367074821239e-05, "loss": 0.0353, "num_input_tokens_seen": 62190768, "step": 28810 }, { "epoch": 4.700652528548124, "grad_norm": 0.07222887128591537, "learning_rate": 3.1886828448670734e-05, "loss": 0.0087, "num_input_tokens_seen": 62201648, "step": 28815 }, { "epoch": 4.701468189233279, "grad_norm": 0.5213015675544739, "learning_rate": 3.1879985590847824e-05, "loss": 0.1385, "num_input_tokens_seen": 62212816, "step": 28820 }, { "epoch": 4.702283849918434, "grad_norm": 0.09626034647226334, "learning_rate": 3.187314217529838e-05, "loss": 0.0857, "num_input_tokens_seen": 62222992, "step": 28825 }, { "epoch": 4.703099510603589, "grad_norm": 8.10428237915039, "learning_rate": 3.1866298202577157e-05, "loss": 0.1228, "num_input_tokens_seen": 62234192, "step": 28830 }, { "epoch": 4.7039151712887435, "grad_norm": 0.03548339381814003, "learning_rate": 3.185945367323895e-05, "loss": 0.2025, "num_input_tokens_seen": 62244848, "step": 28835 }, { "epoch": 4.704730831973899, "grad_norm": 0.6064218878746033, "learning_rate": 3.185260858783864e-05, "loss": 0.1151, "num_input_tokens_seen": 62255376, "step": 28840 }, { "epoch": 4.705546492659054, "grad_norm": 4.100457668304443, "learning_rate": 3.1845762946931093e-05, "loss": 0.0526, "num_input_tokens_seen": 62266192, "step": 28845 }, { "epoch": 4.706362153344209, "grad_norm": 0.10805456340312958, "learning_rate": 3.183891675107128e-05, "loss": 0.0158, "num_input_tokens_seen": 62277232, "step": 28850 }, { "epoch": 4.707177814029364, "grad_norm": 0.08018821477890015, "learning_rate": 3.183207000081416e-05, "loss": 0.0051, "num_input_tokens_seen": 62288368, "step": 28855 }, { "epoch": 4.7079934747145185, "grad_norm": 1.5426335334777832, "learning_rate": 3.1825222696714796e-05, "loss": 0.1832, "num_input_tokens_seen": 62299024, "step": 28860 }, { "epoch": 4.708809135399674, "grad_norm": 4.21427583694458, "learning_rate": 3.1818374839328236e-05, "loss": 0.2846, "num_input_tokens_seen": 62309136, "step": 28865 }, { "epoch": 4.709624796084829, "grad_norm": 3.1382803916931152, "learning_rate": 3.181152642920962e-05, "loss": 0.1796, "num_input_tokens_seen": 62320656, "step": 28870 }, { "epoch": 4.710440456769984, "grad_norm": 0.0743112787604332, "learning_rate": 3.180467746691411e-05, "loss": 0.0119, "num_input_tokens_seen": 62330896, "step": 28875 }, { "epoch": 4.711256117455139, "grad_norm": 0.10520720481872559, "learning_rate": 3.1797827952996914e-05, "loss": 0.0609, "num_input_tokens_seen": 62342032, "step": 28880 }, { "epoch": 4.712071778140293, "grad_norm": 0.07115879654884338, "learning_rate": 3.1790977888013294e-05, "loss": 0.1152, "num_input_tokens_seen": 62351984, "step": 28885 }, { "epoch": 4.712887438825448, "grad_norm": 5.284401893615723, "learning_rate": 3.178412727251856e-05, "loss": 0.0579, "num_input_tokens_seen": 62362256, "step": 28890 }, { "epoch": 4.713703099510604, "grad_norm": 2.9111032485961914, "learning_rate": 3.177727610706804e-05, "loss": 0.1628, "num_input_tokens_seen": 62373040, "step": 28895 }, { "epoch": 4.714518760195759, "grad_norm": 0.05595320835709572, "learning_rate": 3.177042439221713e-05, "loss": 0.0899, "num_input_tokens_seen": 62384304, "step": 28900 }, { "epoch": 4.715334420880914, "grad_norm": 3.172632932662964, "learning_rate": 3.176357212852127e-05, "loss": 0.0814, "num_input_tokens_seen": 62394864, "step": 28905 }, { "epoch": 4.716150081566068, "grad_norm": 0.17778953909873962, "learning_rate": 3.175671931653593e-05, "loss": 0.0177, "num_input_tokens_seen": 62405040, "step": 28910 }, { "epoch": 4.716965742251223, "grad_norm": 0.05175602808594704, "learning_rate": 3.174986595681664e-05, "loss": 0.0738, "num_input_tokens_seen": 62417040, "step": 28915 }, { "epoch": 4.717781402936378, "grad_norm": 4.293681621551514, "learning_rate": 3.174301204991896e-05, "loss": 0.2695, "num_input_tokens_seen": 62427088, "step": 28920 }, { "epoch": 4.718597063621534, "grad_norm": 0.14565585553646088, "learning_rate": 3.173615759639852e-05, "loss": 0.1103, "num_input_tokens_seen": 62436816, "step": 28925 }, { "epoch": 4.719412724306689, "grad_norm": 0.08410273492336273, "learning_rate": 3.1729302596810965e-05, "loss": 0.0122, "num_input_tokens_seen": 62447248, "step": 28930 }, { "epoch": 4.720228384991843, "grad_norm": 0.11749638617038727, "learning_rate": 3.172244705171199e-05, "loss": 0.1997, "num_input_tokens_seen": 62456368, "step": 28935 }, { "epoch": 4.721044045676998, "grad_norm": 0.820249080657959, "learning_rate": 3.171559096165736e-05, "loss": 0.0587, "num_input_tokens_seen": 62467088, "step": 28940 }, { "epoch": 4.721859706362153, "grad_norm": 3.017606258392334, "learning_rate": 3.170873432720285e-05, "loss": 0.3186, "num_input_tokens_seen": 62476752, "step": 28945 }, { "epoch": 4.722675367047309, "grad_norm": 0.0420236699283123, "learning_rate": 3.170187714890429e-05, "loss": 0.0757, "num_input_tokens_seen": 62487152, "step": 28950 }, { "epoch": 4.7234910277324635, "grad_norm": 0.128378227353096, "learning_rate": 3.1695019427317564e-05, "loss": 0.0085, "num_input_tokens_seen": 62498544, "step": 28955 }, { "epoch": 4.724306688417618, "grad_norm": 0.039971981197595596, "learning_rate": 3.1688161162998595e-05, "loss": 0.0351, "num_input_tokens_seen": 62509488, "step": 28960 }, { "epoch": 4.725122349102773, "grad_norm": 0.21320219337940216, "learning_rate": 3.1681302356503337e-05, "loss": 0.074, "num_input_tokens_seen": 62520464, "step": 28965 }, { "epoch": 4.725938009787928, "grad_norm": 0.39001935720443726, "learning_rate": 3.167444300838782e-05, "loss": 0.3243, "num_input_tokens_seen": 62531536, "step": 28970 }, { "epoch": 4.726753670473083, "grad_norm": 2.2165637016296387, "learning_rate": 3.1667583119208085e-05, "loss": 0.0718, "num_input_tokens_seen": 62542320, "step": 28975 }, { "epoch": 4.7275693311582385, "grad_norm": 0.07460815459489822, "learning_rate": 3.1660722689520225e-05, "loss": 0.0782, "num_input_tokens_seen": 62551760, "step": 28980 }, { "epoch": 4.728384991843393, "grad_norm": 0.41532909870147705, "learning_rate": 3.16538617198804e-05, "loss": 0.014, "num_input_tokens_seen": 62562512, "step": 28985 }, { "epoch": 4.729200652528548, "grad_norm": 0.062238991260528564, "learning_rate": 3.164700021084478e-05, "loss": 0.15, "num_input_tokens_seen": 62573648, "step": 28990 }, { "epoch": 4.730016313213703, "grad_norm": 0.23913052678108215, "learning_rate": 3.164013816296959e-05, "loss": 0.179, "num_input_tokens_seen": 62584592, "step": 28995 }, { "epoch": 4.730831973898858, "grad_norm": 0.0670330822467804, "learning_rate": 3.163327557681111e-05, "loss": 0.0145, "num_input_tokens_seen": 62596496, "step": 29000 }, { "epoch": 4.731647634584013, "grad_norm": 0.11926860362291336, "learning_rate": 3.162641245292566e-05, "loss": 0.0289, "num_input_tokens_seen": 62607312, "step": 29005 }, { "epoch": 4.732463295269168, "grad_norm": 2.4738662242889404, "learning_rate": 3.161954879186959e-05, "loss": 0.0152, "num_input_tokens_seen": 62617680, "step": 29010 }, { "epoch": 4.733278955954323, "grad_norm": 5.717556953430176, "learning_rate": 3.161268459419931e-05, "loss": 0.0376, "num_input_tokens_seen": 62627664, "step": 29015 }, { "epoch": 4.734094616639478, "grad_norm": 5.469749927520752, "learning_rate": 3.160581986047127e-05, "loss": 0.2754, "num_input_tokens_seen": 62638352, "step": 29020 }, { "epoch": 4.734910277324633, "grad_norm": 0.16168759763240814, "learning_rate": 3.1598954591241934e-05, "loss": 0.1976, "num_input_tokens_seen": 62649520, "step": 29025 }, { "epoch": 4.735725938009788, "grad_norm": 0.15394924581050873, "learning_rate": 3.159208878706787e-05, "loss": 0.0558, "num_input_tokens_seen": 62660080, "step": 29030 }, { "epoch": 4.736541598694943, "grad_norm": 0.07229402661323547, "learning_rate": 3.1585222448505644e-05, "loss": 0.3034, "num_input_tokens_seen": 62669808, "step": 29035 }, { "epoch": 4.737357259380098, "grad_norm": 0.34821203351020813, "learning_rate": 3.1578355576111864e-05, "loss": 0.0129, "num_input_tokens_seen": 62681136, "step": 29040 }, { "epoch": 4.738172920065253, "grad_norm": 0.16153861582279205, "learning_rate": 3.157148817044321e-05, "loss": 0.2218, "num_input_tokens_seen": 62692144, "step": 29045 }, { "epoch": 4.738988580750408, "grad_norm": 0.10921628028154373, "learning_rate": 3.156462023205638e-05, "loss": 0.1657, "num_input_tokens_seen": 62701360, "step": 29050 }, { "epoch": 4.739804241435563, "grad_norm": 4.022568225860596, "learning_rate": 3.155775176150812e-05, "loss": 0.141, "num_input_tokens_seen": 62713488, "step": 29055 }, { "epoch": 4.740619902120718, "grad_norm": 0.04539196193218231, "learning_rate": 3.1550882759355246e-05, "loss": 0.1429, "num_input_tokens_seen": 62725264, "step": 29060 }, { "epoch": 4.741435562805873, "grad_norm": 0.3306664824485779, "learning_rate": 3.154401322615456e-05, "loss": 0.0185, "num_input_tokens_seen": 62735312, "step": 29065 }, { "epoch": 4.742251223491028, "grad_norm": 0.12463626265525818, "learning_rate": 3.153714316246297e-05, "loss": 0.1587, "num_input_tokens_seen": 62745744, "step": 29070 }, { "epoch": 4.743066884176183, "grad_norm": 5.152036190032959, "learning_rate": 3.153027256883737e-05, "loss": 0.1146, "num_input_tokens_seen": 62754960, "step": 29075 }, { "epoch": 4.7438825448613375, "grad_norm": 0.3472740948200226, "learning_rate": 3.152340144583475e-05, "loss": 0.1383, "num_input_tokens_seen": 62765712, "step": 29080 }, { "epoch": 4.744698205546492, "grad_norm": 0.1714463233947754, "learning_rate": 3.151652979401211e-05, "loss": 0.0352, "num_input_tokens_seen": 62776784, "step": 29085 }, { "epoch": 4.745513866231647, "grad_norm": 0.12423329800367355, "learning_rate": 3.15096576139265e-05, "loss": 0.2172, "num_input_tokens_seen": 62787984, "step": 29090 }, { "epoch": 4.746329526916803, "grad_norm": 0.10747270286083221, "learning_rate": 3.150278490613501e-05, "loss": 0.0812, "num_input_tokens_seen": 62797680, "step": 29095 }, { "epoch": 4.747145187601958, "grad_norm": 0.11853362619876862, "learning_rate": 3.149591167119479e-05, "loss": 0.0124, "num_input_tokens_seen": 62808112, "step": 29100 }, { "epoch": 4.7479608482871125, "grad_norm": 0.5263053178787231, "learning_rate": 3.148903790966301e-05, "loss": 0.2872, "num_input_tokens_seen": 62818928, "step": 29105 }, { "epoch": 4.748776508972267, "grad_norm": 0.15484373271465302, "learning_rate": 3.148216362209688e-05, "loss": 0.1805, "num_input_tokens_seen": 62829872, "step": 29110 }, { "epoch": 4.749592169657422, "grad_norm": 0.1782502830028534, "learning_rate": 3.1475288809053684e-05, "loss": 0.0874, "num_input_tokens_seen": 62840144, "step": 29115 }, { "epoch": 4.750407830342578, "grad_norm": 0.16154274344444275, "learning_rate": 3.146841347109072e-05, "loss": 0.1157, "num_input_tokens_seen": 62851856, "step": 29120 }, { "epoch": 4.751223491027733, "grad_norm": 0.45126184821128845, "learning_rate": 3.146153760876534e-05, "loss": 0.1545, "num_input_tokens_seen": 62863056, "step": 29125 }, { "epoch": 4.7520391517128875, "grad_norm": 7.471928119659424, "learning_rate": 3.145466122263494e-05, "loss": 0.149, "num_input_tokens_seen": 62872592, "step": 29130 }, { "epoch": 4.752854812398042, "grad_norm": 3.426064968109131, "learning_rate": 3.144778431325694e-05, "loss": 0.199, "num_input_tokens_seen": 62883600, "step": 29135 }, { "epoch": 4.753670473083197, "grad_norm": 11.00782585144043, "learning_rate": 3.1440906881188835e-05, "loss": 0.1378, "num_input_tokens_seen": 62894160, "step": 29140 }, { "epoch": 4.754486133768353, "grad_norm": 1.6691228151321411, "learning_rate": 3.143402892698814e-05, "loss": 0.0193, "num_input_tokens_seen": 62905648, "step": 29145 }, { "epoch": 4.755301794453508, "grad_norm": 0.09684643894433975, "learning_rate": 3.142715045121241e-05, "loss": 0.0216, "num_input_tokens_seen": 62918256, "step": 29150 }, { "epoch": 4.7561174551386625, "grad_norm": 6.829464912414551, "learning_rate": 3.142027145441926e-05, "loss": 0.2386, "num_input_tokens_seen": 62929200, "step": 29155 }, { "epoch": 4.756933115823817, "grad_norm": 0.9654311537742615, "learning_rate": 3.141339193716633e-05, "loss": 0.0162, "num_input_tokens_seen": 62940016, "step": 29160 }, { "epoch": 4.757748776508972, "grad_norm": 0.12815995514392853, "learning_rate": 3.1406511900011295e-05, "loss": 0.0096, "num_input_tokens_seen": 62949744, "step": 29165 }, { "epoch": 4.758564437194127, "grad_norm": 0.07554585486650467, "learning_rate": 3.139963134351191e-05, "loss": 0.0767, "num_input_tokens_seen": 62960848, "step": 29170 }, { "epoch": 4.759380097879282, "grad_norm": 0.09591441601514816, "learning_rate": 3.139275026822594e-05, "loss": 0.3065, "num_input_tokens_seen": 62971984, "step": 29175 }, { "epoch": 4.760195758564437, "grad_norm": 0.20106206834316254, "learning_rate": 3.138586867471118e-05, "loss": 0.0198, "num_input_tokens_seen": 62982224, "step": 29180 }, { "epoch": 4.761011419249592, "grad_norm": 3.4516098499298096, "learning_rate": 3.137898656352551e-05, "loss": 0.1415, "num_input_tokens_seen": 62992464, "step": 29185 }, { "epoch": 4.761827079934747, "grad_norm": 0.18744824826717377, "learning_rate": 3.137210393522683e-05, "loss": 0.0219, "num_input_tokens_seen": 63002960, "step": 29190 }, { "epoch": 4.762642740619902, "grad_norm": 0.335336297750473, "learning_rate": 3.136522079037307e-05, "loss": 0.1888, "num_input_tokens_seen": 63013520, "step": 29195 }, { "epoch": 4.763458401305057, "grad_norm": 2.757715940475464, "learning_rate": 3.135833712952222e-05, "loss": 0.1149, "num_input_tokens_seen": 63023760, "step": 29200 }, { "epoch": 4.764274061990212, "grad_norm": 0.05723390355706215, "learning_rate": 3.135145295323229e-05, "loss": 0.0981, "num_input_tokens_seen": 63035248, "step": 29205 }, { "epoch": 4.765089722675367, "grad_norm": 0.28717201948165894, "learning_rate": 3.1344568262061366e-05, "loss": 0.0862, "num_input_tokens_seen": 63045904, "step": 29210 }, { "epoch": 4.765905383360522, "grad_norm": 4.344303607940674, "learning_rate": 3.133768305656755e-05, "loss": 0.1569, "num_input_tokens_seen": 63056336, "step": 29215 }, { "epoch": 4.766721044045677, "grad_norm": 9.279243469238281, "learning_rate": 3.1330797337308984e-05, "loss": 0.2002, "num_input_tokens_seen": 63067376, "step": 29220 }, { "epoch": 4.767536704730832, "grad_norm": 0.12709756195545197, "learning_rate": 3.1323911104843865e-05, "loss": 0.1951, "num_input_tokens_seen": 63077456, "step": 29225 }, { "epoch": 4.768352365415987, "grad_norm": 0.16337549686431885, "learning_rate": 3.131702435973042e-05, "loss": 0.0064, "num_input_tokens_seen": 63087024, "step": 29230 }, { "epoch": 4.769168026101142, "grad_norm": 0.1980210691690445, "learning_rate": 3.1310137102526926e-05, "loss": 0.0942, "num_input_tokens_seen": 63098608, "step": 29235 }, { "epoch": 4.769983686786297, "grad_norm": 0.052752479910850525, "learning_rate": 3.13032493337917e-05, "loss": 0.0842, "num_input_tokens_seen": 63108144, "step": 29240 }, { "epoch": 4.770799347471452, "grad_norm": 2.5380256175994873, "learning_rate": 3.129636105408311e-05, "loss": 0.2796, "num_input_tokens_seen": 63118736, "step": 29245 }, { "epoch": 4.771615008156607, "grad_norm": 0.22159433364868164, "learning_rate": 3.128947226395954e-05, "loss": 0.0082, "num_input_tokens_seen": 63128656, "step": 29250 }, { "epoch": 4.7724306688417615, "grad_norm": 0.07380012422800064, "learning_rate": 3.1282582963979434e-05, "loss": 0.1333, "num_input_tokens_seen": 63139632, "step": 29255 }, { "epoch": 4.773246329526917, "grad_norm": 0.10260308533906937, "learning_rate": 3.127569315470128e-05, "loss": 0.0118, "num_input_tokens_seen": 63150128, "step": 29260 }, { "epoch": 4.774061990212072, "grad_norm": 0.5949441194534302, "learning_rate": 3.12688028366836e-05, "loss": 0.0929, "num_input_tokens_seen": 63160112, "step": 29265 }, { "epoch": 4.774877650897227, "grad_norm": 0.2540006637573242, "learning_rate": 3.126191201048494e-05, "loss": 0.119, "num_input_tokens_seen": 63172592, "step": 29270 }, { "epoch": 4.775693311582382, "grad_norm": 0.8490146994590759, "learning_rate": 3.125502067666393e-05, "loss": 0.1109, "num_input_tokens_seen": 63184304, "step": 29275 }, { "epoch": 4.7765089722675365, "grad_norm": 0.06298528611660004, "learning_rate": 3.1248128835779206e-05, "loss": 0.0083, "num_input_tokens_seen": 63195184, "step": 29280 }, { "epoch": 4.777324632952691, "grad_norm": 0.07878922671079636, "learning_rate": 3.124123648838946e-05, "loss": 0.0465, "num_input_tokens_seen": 63206064, "step": 29285 }, { "epoch": 4.778140293637847, "grad_norm": 0.09068889915943146, "learning_rate": 3.123434363505341e-05, "loss": 0.0074, "num_input_tokens_seen": 63215536, "step": 29290 }, { "epoch": 4.778955954323002, "grad_norm": 11.19113540649414, "learning_rate": 3.122745027632983e-05, "loss": 0.1293, "num_input_tokens_seen": 63226480, "step": 29295 }, { "epoch": 4.779771615008157, "grad_norm": 1.2498172521591187, "learning_rate": 3.1220556412777536e-05, "loss": 0.083, "num_input_tokens_seen": 63237200, "step": 29300 }, { "epoch": 4.780587275693311, "grad_norm": 0.10224765539169312, "learning_rate": 3.121366204495538e-05, "loss": 0.2689, "num_input_tokens_seen": 63247632, "step": 29305 }, { "epoch": 4.781402936378466, "grad_norm": 0.10505528002977371, "learning_rate": 3.120676717342225e-05, "loss": 0.1855, "num_input_tokens_seen": 63258864, "step": 29310 }, { "epoch": 4.782218597063622, "grad_norm": 1.2177871465682983, "learning_rate": 3.119987179873707e-05, "loss": 0.0732, "num_input_tokens_seen": 63270928, "step": 29315 }, { "epoch": 4.783034257748777, "grad_norm": 3.561885356903076, "learning_rate": 3.119297592145884e-05, "loss": 0.2115, "num_input_tokens_seen": 63281200, "step": 29320 }, { "epoch": 4.783849918433932, "grad_norm": 0.6261371970176697, "learning_rate": 3.1186079542146554e-05, "loss": 0.0488, "num_input_tokens_seen": 63291184, "step": 29325 }, { "epoch": 4.784665579119086, "grad_norm": 23.49911117553711, "learning_rate": 3.117918266135927e-05, "loss": 0.1411, "num_input_tokens_seen": 63302544, "step": 29330 }, { "epoch": 4.785481239804241, "grad_norm": 0.10632102936506271, "learning_rate": 3.1172285279656085e-05, "loss": 0.0995, "num_input_tokens_seen": 63312464, "step": 29335 }, { "epoch": 4.786296900489396, "grad_norm": 0.11808846890926361, "learning_rate": 3.1165387397596136e-05, "loss": 0.3068, "num_input_tokens_seen": 63323856, "step": 29340 }, { "epoch": 4.787112561174552, "grad_norm": 0.0858970656991005, "learning_rate": 3.1158489015738604e-05, "loss": 0.0119, "num_input_tokens_seen": 63335664, "step": 29345 }, { "epoch": 4.787928221859707, "grad_norm": 0.13045793771743774, "learning_rate": 3.1151590134642705e-05, "loss": 0.0056, "num_input_tokens_seen": 63347632, "step": 29350 }, { "epoch": 4.788743882544861, "grad_norm": 6.656271457672119, "learning_rate": 3.114469075486769e-05, "loss": 0.1221, "num_input_tokens_seen": 63359152, "step": 29355 }, { "epoch": 4.789559543230016, "grad_norm": 0.08970692753791809, "learning_rate": 3.113779087697287e-05, "loss": 0.013, "num_input_tokens_seen": 63370224, "step": 29360 }, { "epoch": 4.790375203915171, "grad_norm": 5.6270751953125, "learning_rate": 3.1130890501517586e-05, "loss": 0.0565, "num_input_tokens_seen": 63380784, "step": 29365 }, { "epoch": 4.791190864600326, "grad_norm": 0.20602378249168396, "learning_rate": 3.11239896290612e-05, "loss": 0.2154, "num_input_tokens_seen": 63392656, "step": 29370 }, { "epoch": 4.7920065252854815, "grad_norm": 0.1836508810520172, "learning_rate": 3.111708826016315e-05, "loss": 0.0466, "num_input_tokens_seen": 63403440, "step": 29375 }, { "epoch": 4.792822185970636, "grad_norm": 3.018244504928589, "learning_rate": 3.1110186395382885e-05, "loss": 0.1263, "num_input_tokens_seen": 63413968, "step": 29380 }, { "epoch": 4.793637846655791, "grad_norm": 0.5818954706192017, "learning_rate": 3.1103284035279905e-05, "loss": 0.1695, "num_input_tokens_seen": 63424752, "step": 29385 }, { "epoch": 4.794453507340946, "grad_norm": 0.061303652822971344, "learning_rate": 3.109638118041376e-05, "loss": 0.0091, "num_input_tokens_seen": 63435792, "step": 29390 }, { "epoch": 4.795269168026101, "grad_norm": 0.49282169342041016, "learning_rate": 3.108947783134402e-05, "loss": 0.0105, "num_input_tokens_seen": 63448016, "step": 29395 }, { "epoch": 4.7960848287112565, "grad_norm": 0.08261608332395554, "learning_rate": 3.10825739886303e-05, "loss": 0.0064, "num_input_tokens_seen": 63456880, "step": 29400 }, { "epoch": 4.796900489396411, "grad_norm": 3.3087964057922363, "learning_rate": 3.107566965283228e-05, "loss": 0.1434, "num_input_tokens_seen": 63467504, "step": 29405 }, { "epoch": 4.797716150081566, "grad_norm": 2.5278067588806152, "learning_rate": 3.106876482450964e-05, "loss": 0.1676, "num_input_tokens_seen": 63478288, "step": 29410 }, { "epoch": 4.798531810766721, "grad_norm": 0.04726960510015488, "learning_rate": 3.106185950422215e-05, "loss": 0.0193, "num_input_tokens_seen": 63487888, "step": 29415 }, { "epoch": 4.799347471451876, "grad_norm": 0.040033210068941116, "learning_rate": 3.105495369252956e-05, "loss": 0.0194, "num_input_tokens_seen": 63498576, "step": 29420 }, { "epoch": 4.800163132137031, "grad_norm": 0.04523498937487602, "learning_rate": 3.104804738999169e-05, "loss": 0.1039, "num_input_tokens_seen": 63509872, "step": 29425 }, { "epoch": 4.800978792822186, "grad_norm": 0.1345680207014084, "learning_rate": 3.1041140597168425e-05, "loss": 0.0041, "num_input_tokens_seen": 63521328, "step": 29430 }, { "epoch": 4.801794453507341, "grad_norm": 1.8691577911376953, "learning_rate": 3.1034233314619647e-05, "loss": 0.1054, "num_input_tokens_seen": 63531248, "step": 29435 }, { "epoch": 4.802610114192496, "grad_norm": 5.198378562927246, "learning_rate": 3.102732554290531e-05, "loss": 0.2767, "num_input_tokens_seen": 63541648, "step": 29440 }, { "epoch": 4.803425774877651, "grad_norm": 0.24566799402236938, "learning_rate": 3.102041728258537e-05, "loss": 0.0073, "num_input_tokens_seen": 63551408, "step": 29445 }, { "epoch": 4.804241435562806, "grad_norm": 0.1768108606338501, "learning_rate": 3.101350853421986e-05, "loss": 0.0063, "num_input_tokens_seen": 63562160, "step": 29450 }, { "epoch": 4.80505709624796, "grad_norm": 0.045698415488004684, "learning_rate": 3.1006599298368826e-05, "loss": 0.0792, "num_input_tokens_seen": 63573648, "step": 29455 }, { "epoch": 4.805872756933116, "grad_norm": 0.0276840440928936, "learning_rate": 3.099968957559239e-05, "loss": 0.1144, "num_input_tokens_seen": 63584048, "step": 29460 }, { "epoch": 4.806688417618271, "grad_norm": 0.159894198179245, "learning_rate": 3.0992779366450666e-05, "loss": 0.0532, "num_input_tokens_seen": 63595344, "step": 29465 }, { "epoch": 4.807504078303426, "grad_norm": 3.1524837017059326, "learning_rate": 3.098586867150385e-05, "loss": 0.1979, "num_input_tokens_seen": 63606800, "step": 29470 }, { "epoch": 4.808319738988581, "grad_norm": 4.080893039703369, "learning_rate": 3.097895749131214e-05, "loss": 0.1287, "num_input_tokens_seen": 63617456, "step": 29475 }, { "epoch": 4.809135399673735, "grad_norm": 0.25066328048706055, "learning_rate": 3.09720458264358e-05, "loss": 0.397, "num_input_tokens_seen": 63628336, "step": 29480 }, { "epoch": 4.809951060358891, "grad_norm": 0.0336231030523777, "learning_rate": 3.096513367743513e-05, "loss": 0.0741, "num_input_tokens_seen": 63640720, "step": 29485 }, { "epoch": 4.810766721044046, "grad_norm": 0.30254265666007996, "learning_rate": 3.095822104487045e-05, "loss": 0.0118, "num_input_tokens_seen": 63650960, "step": 29490 }, { "epoch": 4.811582381729201, "grad_norm": 3.7460927963256836, "learning_rate": 3.0951307929302136e-05, "loss": 0.1093, "num_input_tokens_seen": 63662192, "step": 29495 }, { "epoch": 4.8123980424143555, "grad_norm": 5.232872486114502, "learning_rate": 3.094439433129061e-05, "loss": 0.0899, "num_input_tokens_seen": 63673232, "step": 29500 }, { "epoch": 4.81321370309951, "grad_norm": 0.10796443372964859, "learning_rate": 3.093748025139632e-05, "loss": 0.0054, "num_input_tokens_seen": 63683728, "step": 29505 }, { "epoch": 4.814029363784666, "grad_norm": 4.561418533325195, "learning_rate": 3.093056569017975e-05, "loss": 0.0574, "num_input_tokens_seen": 63694768, "step": 29510 }, { "epoch": 4.814845024469821, "grad_norm": 0.11669047176837921, "learning_rate": 3.0923650648201436e-05, "loss": 0.2051, "num_input_tokens_seen": 63705488, "step": 29515 }, { "epoch": 4.815660685154976, "grad_norm": 0.6478515267372131, "learning_rate": 3.0916735126021945e-05, "loss": 0.2099, "num_input_tokens_seen": 63715600, "step": 29520 }, { "epoch": 4.8164763458401305, "grad_norm": 4.888887405395508, "learning_rate": 3.090981912420188e-05, "loss": 0.2501, "num_input_tokens_seen": 63726000, "step": 29525 }, { "epoch": 4.817292006525285, "grad_norm": 4.986412048339844, "learning_rate": 3.09029026433019e-05, "loss": 0.1143, "num_input_tokens_seen": 63736400, "step": 29530 }, { "epoch": 4.81810766721044, "grad_norm": 4.063265800476074, "learning_rate": 3.0895985683882675e-05, "loss": 0.4035, "num_input_tokens_seen": 63747600, "step": 29535 }, { "epoch": 4.818923327895595, "grad_norm": 0.17317841947078705, "learning_rate": 3.088906824650493e-05, "loss": 0.011, "num_input_tokens_seen": 63758416, "step": 29540 }, { "epoch": 4.819738988580751, "grad_norm": 0.1033623069524765, "learning_rate": 3.088215033172944e-05, "loss": 0.1442, "num_input_tokens_seen": 63768976, "step": 29545 }, { "epoch": 4.8205546492659055, "grad_norm": 0.15044410526752472, "learning_rate": 3.087523194011699e-05, "loss": 0.0491, "num_input_tokens_seen": 63778896, "step": 29550 }, { "epoch": 4.82137030995106, "grad_norm": 4.59242057800293, "learning_rate": 3.086831307222844e-05, "loss": 0.0512, "num_input_tokens_seen": 63790384, "step": 29555 }, { "epoch": 4.822185970636215, "grad_norm": 0.14458999037742615, "learning_rate": 3.086139372862464e-05, "loss": 0.0138, "num_input_tokens_seen": 63800688, "step": 29560 }, { "epoch": 4.82300163132137, "grad_norm": 0.08712341636419296, "learning_rate": 3.085447390986653e-05, "loss": 0.0484, "num_input_tokens_seen": 63811856, "step": 29565 }, { "epoch": 4.823817292006526, "grad_norm": 3.085439682006836, "learning_rate": 3.084755361651507e-05, "loss": 0.1413, "num_input_tokens_seen": 63822576, "step": 29570 }, { "epoch": 4.8246329526916805, "grad_norm": 0.8713060617446899, "learning_rate": 3.0840632849131236e-05, "loss": 0.0139, "num_input_tokens_seen": 63833168, "step": 29575 }, { "epoch": 4.825448613376835, "grad_norm": 0.1782616525888443, "learning_rate": 3.083371160827606e-05, "loss": 0.0084, "num_input_tokens_seen": 63843312, "step": 29580 }, { "epoch": 4.82626427406199, "grad_norm": 1.5676994323730469, "learning_rate": 3.082678989451063e-05, "loss": 0.3691, "num_input_tokens_seen": 63853904, "step": 29585 }, { "epoch": 4.827079934747145, "grad_norm": 4.571804046630859, "learning_rate": 3.081986770839605e-05, "loss": 0.4312, "num_input_tokens_seen": 63865072, "step": 29590 }, { "epoch": 4.827895595432301, "grad_norm": 0.29922136664390564, "learning_rate": 3.0812945050493464e-05, "loss": 0.0204, "num_input_tokens_seen": 63875760, "step": 29595 }, { "epoch": 4.828711256117455, "grad_norm": 0.1453588306903839, "learning_rate": 3.080602192136405e-05, "loss": 0.0312, "num_input_tokens_seen": 63887216, "step": 29600 }, { "epoch": 4.82952691680261, "grad_norm": 0.07486657053232193, "learning_rate": 3.079909832156905e-05, "loss": 0.0815, "num_input_tokens_seen": 63897104, "step": 29605 }, { "epoch": 4.830342577487765, "grad_norm": 0.0959646925330162, "learning_rate": 3.0792174251669706e-05, "loss": 0.1833, "num_input_tokens_seen": 63907984, "step": 29610 }, { "epoch": 4.83115823817292, "grad_norm": 0.22850702702999115, "learning_rate": 3.078524971222733e-05, "loss": 0.0527, "num_input_tokens_seen": 63918640, "step": 29615 }, { "epoch": 4.831973898858075, "grad_norm": 13.872844696044922, "learning_rate": 3.0778324703803256e-05, "loss": 0.203, "num_input_tokens_seen": 63929136, "step": 29620 }, { "epoch": 4.8327895595432295, "grad_norm": 0.11571940034627914, "learning_rate": 3.0771399226958865e-05, "loss": 0.2644, "num_input_tokens_seen": 63940464, "step": 29625 }, { "epoch": 4.833605220228385, "grad_norm": 0.6639408469200134, "learning_rate": 3.076447328225557e-05, "loss": 0.1808, "num_input_tokens_seen": 63950512, "step": 29630 }, { "epoch": 4.83442088091354, "grad_norm": 3.395463466644287, "learning_rate": 3.075754687025482e-05, "loss": 0.32, "num_input_tokens_seen": 63961840, "step": 29635 }, { "epoch": 4.835236541598695, "grad_norm": 0.3087729215621948, "learning_rate": 3.0750619991518115e-05, "loss": 0.032, "num_input_tokens_seen": 63971440, "step": 29640 }, { "epoch": 4.83605220228385, "grad_norm": 0.18898051977157593, "learning_rate": 3.074369264660697e-05, "loss": 0.1064, "num_input_tokens_seen": 63982608, "step": 29645 }, { "epoch": 4.8368678629690045, "grad_norm": 0.1280173659324646, "learning_rate": 3.0736764836082954e-05, "loss": 0.2161, "num_input_tokens_seen": 63994000, "step": 29650 }, { "epoch": 4.83768352365416, "grad_norm": 3.3714702129364014, "learning_rate": 3.072983656050767e-05, "loss": 0.1225, "num_input_tokens_seen": 64005008, "step": 29655 }, { "epoch": 4.838499184339315, "grad_norm": 0.44245752692222595, "learning_rate": 3.072290782044276e-05, "loss": 0.0651, "num_input_tokens_seen": 64014448, "step": 29660 }, { "epoch": 4.83931484502447, "grad_norm": 0.14941665530204773, "learning_rate": 3.0715978616449906e-05, "loss": 0.0101, "num_input_tokens_seen": 64026672, "step": 29665 }, { "epoch": 4.840130505709625, "grad_norm": 0.1486716866493225, "learning_rate": 3.070904894909083e-05, "loss": 0.1468, "num_input_tokens_seen": 64036752, "step": 29670 }, { "epoch": 4.8409461663947795, "grad_norm": 0.22281208634376526, "learning_rate": 3.070211881892727e-05, "loss": 0.2368, "num_input_tokens_seen": 64047760, "step": 29675 }, { "epoch": 4.841761827079935, "grad_norm": 0.10058537125587463, "learning_rate": 3.069518822652103e-05, "loss": 0.1777, "num_input_tokens_seen": 64059664, "step": 29680 }, { "epoch": 4.84257748776509, "grad_norm": 1.5558687448501587, "learning_rate": 3.0688257172433944e-05, "loss": 0.0738, "num_input_tokens_seen": 64071408, "step": 29685 }, { "epoch": 4.843393148450245, "grad_norm": 0.1454678624868393, "learning_rate": 3.068132565722786e-05, "loss": 0.098, "num_input_tokens_seen": 64082448, "step": 29690 }, { "epoch": 4.8442088091354, "grad_norm": 0.22452178597450256, "learning_rate": 3.06743936814647e-05, "loss": 0.0198, "num_input_tokens_seen": 64094320, "step": 29695 }, { "epoch": 4.8450244698205545, "grad_norm": 0.21330443024635315, "learning_rate": 3.0667461245706386e-05, "loss": 0.1539, "num_input_tokens_seen": 64105392, "step": 29700 }, { "epoch": 4.845840130505709, "grad_norm": 0.13264863193035126, "learning_rate": 3.066052835051491e-05, "loss": 0.1373, "num_input_tokens_seen": 64117008, "step": 29705 }, { "epoch": 4.846655791190865, "grad_norm": 0.4915766716003418, "learning_rate": 3.065359499645228e-05, "loss": 0.4959, "num_input_tokens_seen": 64128016, "step": 29710 }, { "epoch": 4.84747145187602, "grad_norm": 1.5233148336410522, "learning_rate": 3.064666118408057e-05, "loss": 0.0606, "num_input_tokens_seen": 64138448, "step": 29715 }, { "epoch": 4.848287112561175, "grad_norm": 0.1547406017780304, "learning_rate": 3.0639726913961833e-05, "loss": 0.0099, "num_input_tokens_seen": 64148848, "step": 29720 }, { "epoch": 4.849102773246329, "grad_norm": 1.3181960582733154, "learning_rate": 3.0632792186658225e-05, "loss": 0.1275, "num_input_tokens_seen": 64159280, "step": 29725 }, { "epoch": 4.849918433931484, "grad_norm": 0.19170302152633667, "learning_rate": 3.062585700273191e-05, "loss": 0.0658, "num_input_tokens_seen": 64170384, "step": 29730 }, { "epoch": 4.850734094616639, "grad_norm": 0.1350255161523819, "learning_rate": 3.0618921362745075e-05, "loss": 0.163, "num_input_tokens_seen": 64180112, "step": 29735 }, { "epoch": 4.851549755301795, "grad_norm": 0.6598547101020813, "learning_rate": 3.061198526725996e-05, "loss": 0.1019, "num_input_tokens_seen": 64190032, "step": 29740 }, { "epoch": 4.85236541598695, "grad_norm": 0.937228262424469, "learning_rate": 3.060504871683885e-05, "loss": 0.1113, "num_input_tokens_seen": 64201936, "step": 29745 }, { "epoch": 4.853181076672104, "grad_norm": 1.5486509799957275, "learning_rate": 3.059811171204404e-05, "loss": 0.06, "num_input_tokens_seen": 64212784, "step": 29750 }, { "epoch": 4.853996737357259, "grad_norm": 0.13088580965995789, "learning_rate": 3.0591174253437904e-05, "loss": 0.0781, "num_input_tokens_seen": 64224208, "step": 29755 }, { "epoch": 4.854812398042414, "grad_norm": 0.9528616070747375, "learning_rate": 3.05842363415828e-05, "loss": 0.0992, "num_input_tokens_seen": 64235792, "step": 29760 }, { "epoch": 4.85562805872757, "grad_norm": 0.058940738439559937, "learning_rate": 3.057729797704118e-05, "loss": 0.1113, "num_input_tokens_seen": 64246512, "step": 29765 }, { "epoch": 4.856443719412725, "grad_norm": 0.12711629271507263, "learning_rate": 3.057035916037548e-05, "loss": 0.0145, "num_input_tokens_seen": 64256624, "step": 29770 }, { "epoch": 4.857259380097879, "grad_norm": 0.2545081675052643, "learning_rate": 3.05634198921482e-05, "loss": 0.149, "num_input_tokens_seen": 64266544, "step": 29775 }, { "epoch": 4.858075040783034, "grad_norm": 3.3272788524627686, "learning_rate": 3.055648017292188e-05, "loss": 0.2232, "num_input_tokens_seen": 64277904, "step": 29780 }, { "epoch": 4.858890701468189, "grad_norm": 0.10898911207914352, "learning_rate": 3.0549540003259084e-05, "loss": 0.0329, "num_input_tokens_seen": 64288464, "step": 29785 }, { "epoch": 4.859706362153344, "grad_norm": 0.07357125729322433, "learning_rate": 3.054259938372242e-05, "loss": 0.229, "num_input_tokens_seen": 64299408, "step": 29790 }, { "epoch": 4.8605220228384995, "grad_norm": 0.28173163533210754, "learning_rate": 3.0535658314874515e-05, "loss": 0.1425, "num_input_tokens_seen": 64310224, "step": 29795 }, { "epoch": 4.861337683523654, "grad_norm": 4.659884452819824, "learning_rate": 3.0528716797278064e-05, "loss": 0.0321, "num_input_tokens_seen": 64321520, "step": 29800 }, { "epoch": 4.862153344208809, "grad_norm": 3.7471280097961426, "learning_rate": 3.052177483149578e-05, "loss": 0.2272, "num_input_tokens_seen": 64332880, "step": 29805 }, { "epoch": 4.862969004893964, "grad_norm": 0.1539532095193863, "learning_rate": 3.0514832418090406e-05, "loss": 0.0778, "num_input_tokens_seen": 64342576, "step": 29810 }, { "epoch": 4.863784665579119, "grad_norm": 0.2837701141834259, "learning_rate": 3.050788955762474e-05, "loss": 0.0697, "num_input_tokens_seen": 64353936, "step": 29815 }, { "epoch": 4.864600326264274, "grad_norm": 0.044804736971855164, "learning_rate": 3.05009462506616e-05, "loss": 0.069, "num_input_tokens_seen": 64364592, "step": 29820 }, { "epoch": 4.865415986949429, "grad_norm": 0.2907470762729645, "learning_rate": 3.049400249776384e-05, "loss": 0.0766, "num_input_tokens_seen": 64375696, "step": 29825 }, { "epoch": 4.866231647634584, "grad_norm": 0.12099996954202652, "learning_rate": 3.0487058299494363e-05, "loss": 0.0481, "num_input_tokens_seen": 64387440, "step": 29830 }, { "epoch": 4.867047308319739, "grad_norm": 0.10837902128696442, "learning_rate": 3.0480113656416103e-05, "loss": 0.029, "num_input_tokens_seen": 64399088, "step": 29835 }, { "epoch": 4.867862969004894, "grad_norm": 0.2713727355003357, "learning_rate": 3.047316856909202e-05, "loss": 0.3061, "num_input_tokens_seen": 64409712, "step": 29840 }, { "epoch": 4.868678629690049, "grad_norm": 0.18403325974941254, "learning_rate": 3.0466223038085128e-05, "loss": 0.227, "num_input_tokens_seen": 64421200, "step": 29845 }, { "epoch": 4.869494290375204, "grad_norm": 0.1828850656747818, "learning_rate": 3.0459277063958457e-05, "loss": 0.2055, "num_input_tokens_seen": 64431344, "step": 29850 }, { "epoch": 4.870309951060359, "grad_norm": 0.1920708417892456, "learning_rate": 3.0452330647275086e-05, "loss": 0.0306, "num_input_tokens_seen": 64442928, "step": 29855 }, { "epoch": 4.871125611745514, "grad_norm": 0.04284502938389778, "learning_rate": 3.044538378859813e-05, "loss": 0.0869, "num_input_tokens_seen": 64453968, "step": 29860 }, { "epoch": 4.871941272430669, "grad_norm": 0.06283725053071976, "learning_rate": 3.0438436488490736e-05, "loss": 0.0837, "num_input_tokens_seen": 64465008, "step": 29865 }, { "epoch": 4.872756933115824, "grad_norm": 0.14269883930683136, "learning_rate": 3.0431488747516085e-05, "loss": 0.0905, "num_input_tokens_seen": 64474832, "step": 29870 }, { "epoch": 4.873572593800979, "grad_norm": 3.301142930984497, "learning_rate": 3.0424540566237398e-05, "loss": 0.0312, "num_input_tokens_seen": 64485776, "step": 29875 }, { "epoch": 4.874388254486134, "grad_norm": 0.09250722080469131, "learning_rate": 3.041759194521792e-05, "loss": 0.1696, "num_input_tokens_seen": 64497392, "step": 29880 }, { "epoch": 4.875203915171289, "grad_norm": 0.20140580832958221, "learning_rate": 3.0410642885020957e-05, "loss": 0.0221, "num_input_tokens_seen": 64508176, "step": 29885 }, { "epoch": 4.876019575856444, "grad_norm": 2.741020679473877, "learning_rate": 3.040369338620983e-05, "loss": 0.2064, "num_input_tokens_seen": 64517808, "step": 29890 }, { "epoch": 4.876835236541599, "grad_norm": 9.838257789611816, "learning_rate": 3.0396743449347893e-05, "loss": 0.2057, "num_input_tokens_seen": 64527696, "step": 29895 }, { "epoch": 4.877650897226753, "grad_norm": 0.29459795355796814, "learning_rate": 3.0389793074998553e-05, "loss": 0.0092, "num_input_tokens_seen": 64538448, "step": 29900 }, { "epoch": 4.878466557911908, "grad_norm": 8.487483978271484, "learning_rate": 3.038284226372524e-05, "loss": 0.044, "num_input_tokens_seen": 64548784, "step": 29905 }, { "epoch": 4.879282218597064, "grad_norm": 0.05116702616214752, "learning_rate": 3.0375891016091424e-05, "loss": 0.0292, "num_input_tokens_seen": 64559152, "step": 29910 }, { "epoch": 4.880097879282219, "grad_norm": 0.11584285646677017, "learning_rate": 3.0368939332660603e-05, "loss": 0.0437, "num_input_tokens_seen": 64570096, "step": 29915 }, { "epoch": 4.8809135399673735, "grad_norm": 0.029633769765496254, "learning_rate": 3.036198721399631e-05, "loss": 0.0711, "num_input_tokens_seen": 64581744, "step": 29920 }, { "epoch": 4.881729200652528, "grad_norm": 4.062506198883057, "learning_rate": 3.035503466066214e-05, "loss": 0.0718, "num_input_tokens_seen": 64591856, "step": 29925 }, { "epoch": 4.882544861337683, "grad_norm": 0.05810140445828438, "learning_rate": 3.0348081673221678e-05, "loss": 0.0065, "num_input_tokens_seen": 64602192, "step": 29930 }, { "epoch": 4.883360522022839, "grad_norm": 0.07992871105670929, "learning_rate": 3.034112825223858e-05, "loss": 0.0151, "num_input_tokens_seen": 64612976, "step": 29935 }, { "epoch": 4.884176182707994, "grad_norm": 0.1341329663991928, "learning_rate": 3.0334174398276532e-05, "loss": 0.0064, "num_input_tokens_seen": 64623984, "step": 29940 }, { "epoch": 4.8849918433931485, "grad_norm": 4.25712776184082, "learning_rate": 3.032722011189924e-05, "loss": 0.0702, "num_input_tokens_seen": 64634992, "step": 29945 }, { "epoch": 4.885807504078303, "grad_norm": 0.12009090185165405, "learning_rate": 3.032026539367046e-05, "loss": 0.1729, "num_input_tokens_seen": 64645840, "step": 29950 }, { "epoch": 4.886623164763458, "grad_norm": 0.11088258028030396, "learning_rate": 3.0313310244153968e-05, "loss": 0.0295, "num_input_tokens_seen": 64656016, "step": 29955 }, { "epoch": 4.887438825448614, "grad_norm": 0.11191460490226746, "learning_rate": 3.0306354663913588e-05, "loss": 0.0801, "num_input_tokens_seen": 64667760, "step": 29960 }, { "epoch": 4.888254486133769, "grad_norm": 4.154870510101318, "learning_rate": 3.029939865351317e-05, "loss": 0.12, "num_input_tokens_seen": 64677872, "step": 29965 }, { "epoch": 4.8890701468189235, "grad_norm": 0.05185890570282936, "learning_rate": 3.0292442213516613e-05, "loss": 0.0034, "num_input_tokens_seen": 64688784, "step": 29970 }, { "epoch": 4.889885807504078, "grad_norm": 0.09086186438798904, "learning_rate": 3.0285485344487834e-05, "loss": 0.2251, "num_input_tokens_seen": 64698576, "step": 29975 }, { "epoch": 4.890701468189233, "grad_norm": 0.06559891998767853, "learning_rate": 3.02785280469908e-05, "loss": 0.0041, "num_input_tokens_seen": 64709520, "step": 29980 }, { "epoch": 4.891517128874388, "grad_norm": 0.1890016347169876, "learning_rate": 3.0271570321589494e-05, "loss": 0.0125, "num_input_tokens_seen": 64720592, "step": 29985 }, { "epoch": 4.892332789559543, "grad_norm": 2.3863577842712402, "learning_rate": 3.026461216884795e-05, "loss": 0.0922, "num_input_tokens_seen": 64731568, "step": 29990 }, { "epoch": 4.8931484502446985, "grad_norm": 5.497276782989502, "learning_rate": 3.025765358933024e-05, "loss": 0.2474, "num_input_tokens_seen": 64742256, "step": 29995 }, { "epoch": 4.893964110929853, "grad_norm": 0.04517051950097084, "learning_rate": 3.0250694583600448e-05, "loss": 0.0048, "num_input_tokens_seen": 64752208, "step": 30000 }, { "epoch": 4.894779771615008, "grad_norm": 0.0528087392449379, "learning_rate": 3.024373515222271e-05, "loss": 0.2048, "num_input_tokens_seen": 64761168, "step": 30005 }, { "epoch": 4.895595432300163, "grad_norm": 0.08989369869232178, "learning_rate": 3.0236775295761194e-05, "loss": 0.0963, "num_input_tokens_seen": 64773040, "step": 30010 }, { "epoch": 4.896411092985318, "grad_norm": 0.1310116946697235, "learning_rate": 3.02298150147801e-05, "loss": 0.2538, "num_input_tokens_seen": 64785584, "step": 30015 }, { "epoch": 4.897226753670473, "grad_norm": 0.09187537431716919, "learning_rate": 3.022285430984367e-05, "loss": 0.076, "num_input_tokens_seen": 64797648, "step": 30020 }, { "epoch": 4.898042414355628, "grad_norm": 2.4482696056365967, "learning_rate": 3.0215893181516163e-05, "loss": 0.0103, "num_input_tokens_seen": 64808912, "step": 30025 }, { "epoch": 4.898858075040783, "grad_norm": 0.2752697467803955, "learning_rate": 3.02089316303619e-05, "loss": 0.2575, "num_input_tokens_seen": 64820336, "step": 30030 }, { "epoch": 4.899673735725938, "grad_norm": 0.044006578624248505, "learning_rate": 3.0201969656945196e-05, "loss": 0.0275, "num_input_tokens_seen": 64831600, "step": 30035 }, { "epoch": 4.900489396411093, "grad_norm": 0.11853750795125961, "learning_rate": 3.0195007261830438e-05, "loss": 0.0452, "num_input_tokens_seen": 64842000, "step": 30040 }, { "epoch": 4.901305057096248, "grad_norm": 1.8368370532989502, "learning_rate": 3.018804444558204e-05, "loss": 0.0742, "num_input_tokens_seen": 64853456, "step": 30045 }, { "epoch": 4.902120717781403, "grad_norm": 0.20973731577396393, "learning_rate": 3.018108120876443e-05, "loss": 0.0836, "num_input_tokens_seen": 64863952, "step": 30050 }, { "epoch": 4.902936378466558, "grad_norm": 8.172491073608398, "learning_rate": 3.0174117551942087e-05, "loss": 0.1811, "num_input_tokens_seen": 64874128, "step": 30055 }, { "epoch": 4.903752039151713, "grad_norm": 0.05342257767915726, "learning_rate": 3.0167153475679527e-05, "loss": 0.0182, "num_input_tokens_seen": 64883856, "step": 30060 }, { "epoch": 4.904567699836868, "grad_norm": 2.8744332790374756, "learning_rate": 3.0160188980541288e-05, "loss": 0.1477, "num_input_tokens_seen": 64894736, "step": 30065 }, { "epoch": 4.9053833605220225, "grad_norm": 0.08954238891601562, "learning_rate": 3.0153224067091952e-05, "loss": 0.0055, "num_input_tokens_seen": 64905808, "step": 30070 }, { "epoch": 4.906199021207177, "grad_norm": 0.4676968455314636, "learning_rate": 3.0146258735896117e-05, "loss": 0.2841, "num_input_tokens_seen": 64917104, "step": 30075 }, { "epoch": 4.907014681892333, "grad_norm": 0.09429818391799927, "learning_rate": 3.0139292987518443e-05, "loss": 0.0045, "num_input_tokens_seen": 64926800, "step": 30080 }, { "epoch": 4.907830342577488, "grad_norm": 0.05108118802309036, "learning_rate": 3.0132326822523606e-05, "loss": 0.1334, "num_input_tokens_seen": 64937904, "step": 30085 }, { "epoch": 4.908646003262643, "grad_norm": 0.02007273957133293, "learning_rate": 3.01253602414763e-05, "loss": 0.2642, "num_input_tokens_seen": 64948592, "step": 30090 }, { "epoch": 4.9094616639477975, "grad_norm": 0.3647564649581909, "learning_rate": 3.0118393244941302e-05, "loss": 0.0283, "num_input_tokens_seen": 64959152, "step": 30095 }, { "epoch": 4.910277324632952, "grad_norm": 0.09454967081546783, "learning_rate": 3.011142583348337e-05, "loss": 0.4647, "num_input_tokens_seen": 64970736, "step": 30100 }, { "epoch": 4.911092985318108, "grad_norm": 6.534158706665039, "learning_rate": 3.010445800766733e-05, "loss": 0.1598, "num_input_tokens_seen": 64982032, "step": 30105 }, { "epoch": 4.911908646003263, "grad_norm": 0.4977462589740753, "learning_rate": 3.0097489768058022e-05, "loss": 0.0744, "num_input_tokens_seen": 64994448, "step": 30110 }, { "epoch": 4.912724306688418, "grad_norm": 0.10170254111289978, "learning_rate": 3.0090521115220327e-05, "loss": 0.1168, "num_input_tokens_seen": 65005744, "step": 30115 }, { "epoch": 4.9135399673735725, "grad_norm": 0.05971534177660942, "learning_rate": 3.0083552049719167e-05, "loss": 0.0047, "num_input_tokens_seen": 65016848, "step": 30120 }, { "epoch": 4.914355628058727, "grad_norm": 0.925605833530426, "learning_rate": 3.0076582572119473e-05, "loss": 0.1789, "num_input_tokens_seen": 65027600, "step": 30125 }, { "epoch": 4.915171288743883, "grad_norm": 0.20594875514507294, "learning_rate": 3.0069612682986247e-05, "loss": 0.1097, "num_input_tokens_seen": 65039120, "step": 30130 }, { "epoch": 4.915986949429038, "grad_norm": 11.441937446594238, "learning_rate": 3.0062642382884494e-05, "loss": 0.0927, "num_input_tokens_seen": 65050512, "step": 30135 }, { "epoch": 4.916802610114193, "grad_norm": 3.2262916564941406, "learning_rate": 3.005567167237926e-05, "loss": 0.1628, "num_input_tokens_seen": 65059888, "step": 30140 }, { "epoch": 4.917618270799347, "grad_norm": 0.2380727231502533, "learning_rate": 3.004870055203562e-05, "loss": 0.0341, "num_input_tokens_seen": 65071120, "step": 30145 }, { "epoch": 4.918433931484502, "grad_norm": 0.08102700114250183, "learning_rate": 3.0041729022418702e-05, "loss": 0.1206, "num_input_tokens_seen": 65081744, "step": 30150 }, { "epoch": 4.919249592169657, "grad_norm": 0.11428258568048477, "learning_rate": 3.003475708409365e-05, "loss": 0.0859, "num_input_tokens_seen": 65093040, "step": 30155 }, { "epoch": 4.920065252854813, "grad_norm": 0.03618955612182617, "learning_rate": 3.0027784737625646e-05, "loss": 0.1927, "num_input_tokens_seen": 65103056, "step": 30160 }, { "epoch": 4.920880913539968, "grad_norm": 0.0515863411128521, "learning_rate": 3.00208119835799e-05, "loss": 0.0133, "num_input_tokens_seen": 65113904, "step": 30165 }, { "epoch": 4.921696574225122, "grad_norm": 3.6284677982330322, "learning_rate": 3.0013838822521655e-05, "loss": 0.2296, "num_input_tokens_seen": 65124016, "step": 30170 }, { "epoch": 4.922512234910277, "grad_norm": 0.09662287682294846, "learning_rate": 3.0006865255016192e-05, "loss": 0.007, "num_input_tokens_seen": 65135120, "step": 30175 }, { "epoch": 4.923327895595432, "grad_norm": 0.07900609076023102, "learning_rate": 2.9999891281628832e-05, "loss": 0.2014, "num_input_tokens_seen": 65144784, "step": 30180 }, { "epoch": 4.924143556280587, "grad_norm": 6.857663154602051, "learning_rate": 2.9992916902924917e-05, "loss": 0.1551, "num_input_tokens_seen": 65154928, "step": 30185 }, { "epoch": 4.924959216965743, "grad_norm": 0.1024380773305893, "learning_rate": 2.998594211946982e-05, "loss": 0.0079, "num_input_tokens_seen": 65165808, "step": 30190 }, { "epoch": 4.925774877650897, "grad_norm": 0.13860879838466644, "learning_rate": 2.9978966931828957e-05, "loss": 0.0472, "num_input_tokens_seen": 65175952, "step": 30195 }, { "epoch": 4.926590538336052, "grad_norm": 0.13814367353916168, "learning_rate": 2.9971991340567773e-05, "loss": 0.1278, "num_input_tokens_seen": 65185904, "step": 30200 }, { "epoch": 4.927406199021207, "grad_norm": 0.0338091216981411, "learning_rate": 2.9965015346251747e-05, "loss": 0.04, "num_input_tokens_seen": 65196848, "step": 30205 }, { "epoch": 4.928221859706362, "grad_norm": 0.2112356722354889, "learning_rate": 2.995803894944637e-05, "loss": 0.2068, "num_input_tokens_seen": 65207440, "step": 30210 }, { "epoch": 4.9290375203915175, "grad_norm": 0.06825273483991623, "learning_rate": 2.9951062150717212e-05, "loss": 0.0128, "num_input_tokens_seen": 65217904, "step": 30215 }, { "epoch": 4.929853181076672, "grad_norm": 2.894876480102539, "learning_rate": 2.994408495062983e-05, "loss": 0.0186, "num_input_tokens_seen": 65228464, "step": 30220 }, { "epoch": 4.930668841761827, "grad_norm": 3.265835762023926, "learning_rate": 2.9937107349749842e-05, "loss": 0.1604, "num_input_tokens_seen": 65239024, "step": 30225 }, { "epoch": 4.931484502446982, "grad_norm": 10.262737274169922, "learning_rate": 2.9930129348642877e-05, "loss": 0.0816, "num_input_tokens_seen": 65249872, "step": 30230 }, { "epoch": 4.932300163132137, "grad_norm": 0.12191098928451538, "learning_rate": 2.9923150947874613e-05, "loss": 0.2379, "num_input_tokens_seen": 65260560, "step": 30235 }, { "epoch": 4.933115823817292, "grad_norm": 1.5043644905090332, "learning_rate": 2.991617214801075e-05, "loss": 0.0125, "num_input_tokens_seen": 65272016, "step": 30240 }, { "epoch": 4.933931484502447, "grad_norm": 0.11814262717962265, "learning_rate": 2.9909192949617036e-05, "loss": 0.0103, "num_input_tokens_seen": 65281136, "step": 30245 }, { "epoch": 4.934747145187602, "grad_norm": 3.2631006240844727, "learning_rate": 2.9902213353259223e-05, "loss": 0.1033, "num_input_tokens_seen": 65290640, "step": 30250 }, { "epoch": 4.935562805872757, "grad_norm": 11.115103721618652, "learning_rate": 2.989523335950313e-05, "loss": 0.2474, "num_input_tokens_seen": 65302032, "step": 30255 }, { "epoch": 4.936378466557912, "grad_norm": 0.1490999013185501, "learning_rate": 2.9888252968914576e-05, "loss": 0.2117, "num_input_tokens_seen": 65313168, "step": 30260 }, { "epoch": 4.937194127243067, "grad_norm": 0.15903590619564056, "learning_rate": 2.988127218205944e-05, "loss": 0.0053, "num_input_tokens_seen": 65324528, "step": 30265 }, { "epoch": 4.938009787928221, "grad_norm": 4.044406890869141, "learning_rate": 2.9874290999503606e-05, "loss": 0.0204, "num_input_tokens_seen": 65335568, "step": 30270 }, { "epoch": 4.938825448613377, "grad_norm": 3.8695523738861084, "learning_rate": 2.9867309421813018e-05, "loss": 0.2925, "num_input_tokens_seen": 65346032, "step": 30275 }, { "epoch": 4.939641109298532, "grad_norm": 0.18034009635448456, "learning_rate": 2.9860327449553626e-05, "loss": 0.1214, "num_input_tokens_seen": 65357712, "step": 30280 }, { "epoch": 4.940456769983687, "grad_norm": 0.035709358751773834, "learning_rate": 2.9853345083291434e-05, "loss": 0.0385, "num_input_tokens_seen": 65369552, "step": 30285 }, { "epoch": 4.941272430668842, "grad_norm": 10.548879623413086, "learning_rate": 2.9846362323592463e-05, "loss": 0.1767, "num_input_tokens_seen": 65380912, "step": 30290 }, { "epoch": 4.942088091353996, "grad_norm": 0.2996707558631897, "learning_rate": 2.9839379171022776e-05, "loss": 0.3709, "num_input_tokens_seen": 65392080, "step": 30295 }, { "epoch": 4.942903752039152, "grad_norm": 0.05010681226849556, "learning_rate": 2.983239562614845e-05, "loss": 0.0053, "num_input_tokens_seen": 65402256, "step": 30300 }, { "epoch": 4.943719412724307, "grad_norm": 1.4723745584487915, "learning_rate": 2.982541168953562e-05, "loss": 0.1478, "num_input_tokens_seen": 65413456, "step": 30305 }, { "epoch": 4.944535073409462, "grad_norm": 0.14409276843070984, "learning_rate": 2.9818427361750434e-05, "loss": 0.0575, "num_input_tokens_seen": 65424912, "step": 30310 }, { "epoch": 4.945350734094617, "grad_norm": 0.11816307157278061, "learning_rate": 2.9811442643359076e-05, "loss": 0.0052, "num_input_tokens_seen": 65434896, "step": 30315 }, { "epoch": 4.946166394779771, "grad_norm": 7.738298416137695, "learning_rate": 2.9804457534927772e-05, "loss": 0.0473, "num_input_tokens_seen": 65445424, "step": 30320 }, { "epoch": 4.946982055464927, "grad_norm": 0.046474017202854156, "learning_rate": 2.9797472037022757e-05, "loss": 0.089, "num_input_tokens_seen": 65454672, "step": 30325 }, { "epoch": 4.947797716150082, "grad_norm": 0.07026860117912292, "learning_rate": 2.9790486150210316e-05, "loss": 0.0908, "num_input_tokens_seen": 65465776, "step": 30330 }, { "epoch": 4.948613376835237, "grad_norm": 5.883081912994385, "learning_rate": 2.9783499875056766e-05, "loss": 0.3138, "num_input_tokens_seen": 65477168, "step": 30335 }, { "epoch": 4.9494290375203915, "grad_norm": 1.2314774990081787, "learning_rate": 2.9776513212128442e-05, "loss": 0.0062, "num_input_tokens_seen": 65487248, "step": 30340 }, { "epoch": 4.950244698205546, "grad_norm": 0.18590541183948517, "learning_rate": 2.976952616199172e-05, "loss": 0.0106, "num_input_tokens_seen": 65497360, "step": 30345 }, { "epoch": 4.951060358890701, "grad_norm": 0.14897702634334564, "learning_rate": 2.9762538725213007e-05, "loss": 0.1456, "num_input_tokens_seen": 65507728, "step": 30350 }, { "epoch": 4.951876019575856, "grad_norm": 0.5946313142776489, "learning_rate": 2.9755550902358737e-05, "loss": 0.0991, "num_input_tokens_seen": 65517104, "step": 30355 }, { "epoch": 4.952691680261012, "grad_norm": 3.074784755706787, "learning_rate": 2.9748562693995386e-05, "loss": 0.204, "num_input_tokens_seen": 65529168, "step": 30360 }, { "epoch": 4.9535073409461665, "grad_norm": 3.872284173965454, "learning_rate": 2.974157410068944e-05, "loss": 0.0447, "num_input_tokens_seen": 65538960, "step": 30365 }, { "epoch": 4.954323001631321, "grad_norm": 0.016039861366152763, "learning_rate": 2.9734585123007446e-05, "loss": 0.0053, "num_input_tokens_seen": 65549392, "step": 30370 }, { "epoch": 4.955138662316476, "grad_norm": 3.6199285984039307, "learning_rate": 2.9727595761515958e-05, "loss": 0.1062, "num_input_tokens_seen": 65559824, "step": 30375 }, { "epoch": 4.955954323001631, "grad_norm": 12.265833854675293, "learning_rate": 2.9720606016781577e-05, "loss": 0.2147, "num_input_tokens_seen": 65570896, "step": 30380 }, { "epoch": 4.956769983686787, "grad_norm": 0.09133773297071457, "learning_rate": 2.9713615889370917e-05, "loss": 0.1078, "num_input_tokens_seen": 65582384, "step": 30385 }, { "epoch": 4.9575856443719415, "grad_norm": 6.277388572692871, "learning_rate": 2.9706625379850627e-05, "loss": 0.3821, "num_input_tokens_seen": 65593264, "step": 30390 }, { "epoch": 4.958401305057096, "grad_norm": 0.1470021903514862, "learning_rate": 2.9699634488787415e-05, "loss": 0.2508, "num_input_tokens_seen": 65604016, "step": 30395 }, { "epoch": 4.959216965742251, "grad_norm": 1.6399931907653809, "learning_rate": 2.9692643216747978e-05, "loss": 0.0532, "num_input_tokens_seen": 65613616, "step": 30400 }, { "epoch": 4.960032626427406, "grad_norm": 3.120595932006836, "learning_rate": 2.9685651564299077e-05, "loss": 0.2575, "num_input_tokens_seen": 65624816, "step": 30405 }, { "epoch": 4.960848287112562, "grad_norm": 0.289460688829422, "learning_rate": 2.9678659532007475e-05, "loss": 0.1482, "num_input_tokens_seen": 65635792, "step": 30410 }, { "epoch": 4.9616639477977165, "grad_norm": 1.108140230178833, "learning_rate": 2.967166712044e-05, "loss": 0.1776, "num_input_tokens_seen": 65646512, "step": 30415 }, { "epoch": 4.962479608482871, "grad_norm": 1.021653413772583, "learning_rate": 2.9664674330163485e-05, "loss": 0.0673, "num_input_tokens_seen": 65656976, "step": 30420 }, { "epoch": 4.963295269168026, "grad_norm": 0.5491104125976562, "learning_rate": 2.96576811617448e-05, "loss": 0.0539, "num_input_tokens_seen": 65667792, "step": 30425 }, { "epoch": 4.964110929853181, "grad_norm": 0.3017660081386566, "learning_rate": 2.9650687615750843e-05, "loss": 0.1247, "num_input_tokens_seen": 65679088, "step": 30430 }, { "epoch": 4.964926590538336, "grad_norm": 1.055377721786499, "learning_rate": 2.964369369274856e-05, "loss": 0.0698, "num_input_tokens_seen": 65689840, "step": 30435 }, { "epoch": 4.9657422512234906, "grad_norm": 0.3545195758342743, "learning_rate": 2.963669939330489e-05, "loss": 0.1265, "num_input_tokens_seen": 65700528, "step": 30440 }, { "epoch": 4.966557911908646, "grad_norm": 0.11180157959461212, "learning_rate": 2.962970471798685e-05, "loss": 0.009, "num_input_tokens_seen": 65710128, "step": 30445 }, { "epoch": 4.967373572593801, "grad_norm": 0.06180586665868759, "learning_rate": 2.9622709667361455e-05, "loss": 0.2663, "num_input_tokens_seen": 65722064, "step": 30450 }, { "epoch": 4.968189233278956, "grad_norm": 0.6542524695396423, "learning_rate": 2.9615714241995758e-05, "loss": 0.0117, "num_input_tokens_seen": 65732336, "step": 30455 }, { "epoch": 4.969004893964111, "grad_norm": 0.08314738422632217, "learning_rate": 2.9608718442456844e-05, "loss": 0.0121, "num_input_tokens_seen": 65743504, "step": 30460 }, { "epoch": 4.9698205546492655, "grad_norm": 0.39737963676452637, "learning_rate": 2.9601722269311827e-05, "loss": 0.0133, "num_input_tokens_seen": 65754608, "step": 30465 }, { "epoch": 4.970636215334421, "grad_norm": 0.9284301996231079, "learning_rate": 2.9594725723127855e-05, "loss": 0.1873, "num_input_tokens_seen": 65765584, "step": 30470 }, { "epoch": 4.971451876019576, "grad_norm": 0.0789085254073143, "learning_rate": 2.9587728804472104e-05, "loss": 0.2233, "num_input_tokens_seen": 65776816, "step": 30475 }, { "epoch": 4.972267536704731, "grad_norm": 3.007207155227661, "learning_rate": 2.9580731513911773e-05, "loss": 0.2211, "num_input_tokens_seen": 65788336, "step": 30480 }, { "epoch": 4.973083197389886, "grad_norm": 0.045617252588272095, "learning_rate": 2.9573733852014112e-05, "loss": 0.22, "num_input_tokens_seen": 65798704, "step": 30485 }, { "epoch": 4.9738988580750405, "grad_norm": 0.6935256123542786, "learning_rate": 2.9566735819346376e-05, "loss": 0.0459, "num_input_tokens_seen": 65808816, "step": 30490 }, { "epoch": 4.974714518760196, "grad_norm": 0.8358042240142822, "learning_rate": 2.9559737416475863e-05, "loss": 0.0905, "num_input_tokens_seen": 65819088, "step": 30495 }, { "epoch": 4.975530179445351, "grad_norm": 2.981520414352417, "learning_rate": 2.9552738643969896e-05, "loss": 0.1319, "num_input_tokens_seen": 65830256, "step": 30500 }, { "epoch": 4.976345840130506, "grad_norm": 0.02853558212518692, "learning_rate": 2.9545739502395835e-05, "loss": 0.0606, "num_input_tokens_seen": 65839792, "step": 30505 }, { "epoch": 4.977161500815661, "grad_norm": 0.12231956422328949, "learning_rate": 2.9538739992321062e-05, "loss": 0.007, "num_input_tokens_seen": 65850512, "step": 30510 }, { "epoch": 4.9779771615008155, "grad_norm": 0.06114667281508446, "learning_rate": 2.9531740114313e-05, "loss": 0.1149, "num_input_tokens_seen": 65861584, "step": 30515 }, { "epoch": 4.97879282218597, "grad_norm": 1.5656062364578247, "learning_rate": 2.9524739868939088e-05, "loss": 0.3395, "num_input_tokens_seen": 65872880, "step": 30520 }, { "epoch": 4.979608482871125, "grad_norm": 0.08660892397165298, "learning_rate": 2.9517739256766803e-05, "loss": 0.0996, "num_input_tokens_seen": 65882544, "step": 30525 }, { "epoch": 4.980424143556281, "grad_norm": 0.043251391500234604, "learning_rate": 2.9510738278363652e-05, "loss": 0.0092, "num_input_tokens_seen": 65893456, "step": 30530 }, { "epoch": 4.981239804241436, "grad_norm": 0.7454972267150879, "learning_rate": 2.950373693429717e-05, "loss": 0.0158, "num_input_tokens_seen": 65905264, "step": 30535 }, { "epoch": 4.9820554649265905, "grad_norm": 4.430110454559326, "learning_rate": 2.949673522513492e-05, "loss": 0.2665, "num_input_tokens_seen": 65916400, "step": 30540 }, { "epoch": 4.982871125611745, "grad_norm": 0.11500086635351181, "learning_rate": 2.9489733151444497e-05, "loss": 0.1166, "num_input_tokens_seen": 65928272, "step": 30545 }, { "epoch": 4.9836867862969, "grad_norm": 0.3506146967411041, "learning_rate": 2.9482730713793526e-05, "loss": 0.0104, "num_input_tokens_seen": 65939792, "step": 30550 }, { "epoch": 4.984502446982056, "grad_norm": 0.10922025144100189, "learning_rate": 2.9475727912749656e-05, "loss": 0.1016, "num_input_tokens_seen": 65950576, "step": 30555 }, { "epoch": 4.985318107667211, "grad_norm": 9.96968936920166, "learning_rate": 2.946872474888058e-05, "loss": 0.3188, "num_input_tokens_seen": 65960240, "step": 30560 }, { "epoch": 4.986133768352365, "grad_norm": 0.38060101866722107, "learning_rate": 2.9461721222753992e-05, "loss": 0.2204, "num_input_tokens_seen": 65970544, "step": 30565 }, { "epoch": 4.98694942903752, "grad_norm": 0.05818890407681465, "learning_rate": 2.9454717334937638e-05, "loss": 0.0043, "num_input_tokens_seen": 65982768, "step": 30570 }, { "epoch": 4.987765089722675, "grad_norm": 0.23830565810203552, "learning_rate": 2.94477130859993e-05, "loss": 0.0388, "num_input_tokens_seen": 65993680, "step": 30575 }, { "epoch": 4.988580750407831, "grad_norm": 2.3207828998565674, "learning_rate": 2.9440708476506773e-05, "loss": 0.2399, "num_input_tokens_seen": 66005136, "step": 30580 }, { "epoch": 4.989396411092986, "grad_norm": 1.1624802350997925, "learning_rate": 2.943370350702789e-05, "loss": 0.151, "num_input_tokens_seen": 66015376, "step": 30585 }, { "epoch": 4.99021207177814, "grad_norm": 0.3364430069923401, "learning_rate": 2.9426698178130495e-05, "loss": 0.0074, "num_input_tokens_seen": 66025904, "step": 30590 }, { "epoch": 4.991027732463295, "grad_norm": 0.17804332077503204, "learning_rate": 2.9419692490382488e-05, "loss": 0.0859, "num_input_tokens_seen": 66036560, "step": 30595 }, { "epoch": 4.99184339314845, "grad_norm": 1.2029184103012085, "learning_rate": 2.9412686444351782e-05, "loss": 0.1147, "num_input_tokens_seen": 66047856, "step": 30600 }, { "epoch": 4.992659053833605, "grad_norm": 0.0865970030426979, "learning_rate": 2.9405680040606326e-05, "loss": 0.2751, "num_input_tokens_seen": 66058736, "step": 30605 }, { "epoch": 4.993474714518761, "grad_norm": 4.403013706207275, "learning_rate": 2.939867327971409e-05, "loss": 0.0799, "num_input_tokens_seen": 66068848, "step": 30610 }, { "epoch": 4.994290375203915, "grad_norm": 12.007497787475586, "learning_rate": 2.939166616224308e-05, "loss": 0.0594, "num_input_tokens_seen": 66079760, "step": 30615 }, { "epoch": 4.99510603588907, "grad_norm": 0.08808305114507675, "learning_rate": 2.938465868876133e-05, "loss": 0.1133, "num_input_tokens_seen": 66090128, "step": 30620 }, { "epoch": 4.995921696574225, "grad_norm": 15.229327201843262, "learning_rate": 2.9377650859836892e-05, "loss": 0.0856, "num_input_tokens_seen": 66100304, "step": 30625 }, { "epoch": 4.99673735725938, "grad_norm": 1.8007338047027588, "learning_rate": 2.9370642676037867e-05, "loss": 0.2761, "num_input_tokens_seen": 66111696, "step": 30630 }, { "epoch": 4.997553017944535, "grad_norm": 1.2313814163208008, "learning_rate": 2.936363413793237e-05, "loss": 0.2358, "num_input_tokens_seen": 66122064, "step": 30635 }, { "epoch": 4.99836867862969, "grad_norm": 0.06142730638384819, "learning_rate": 2.9356625246088554e-05, "loss": 0.0265, "num_input_tokens_seen": 66131376, "step": 30640 }, { "epoch": 4.999184339314845, "grad_norm": 0.09649068117141724, "learning_rate": 2.9349616001074588e-05, "loss": 0.2084, "num_input_tokens_seen": 66142800, "step": 30645 }, { "epoch": 5.0, "grad_norm": 0.20904871821403503, "learning_rate": 2.934260640345867e-05, "loss": 0.1177, "num_input_tokens_seen": 66152480, "step": 30650 }, { "epoch": 5.0, "eval_loss": 0.15915174782276154, "eval_runtime": 132.9324, "eval_samples_per_second": 20.499, "eval_steps_per_second": 5.13, "num_input_tokens_seen": 66152480, "step": 30650 }, { "epoch": 5.000815660685155, "grad_norm": 1.0603883266448975, "learning_rate": 2.9335596453809055e-05, "loss": 0.01, "num_input_tokens_seen": 66162912, "step": 30655 }, { "epoch": 5.00163132137031, "grad_norm": 6.042734146118164, "learning_rate": 2.9328586152693986e-05, "loss": 0.0805, "num_input_tokens_seen": 66173312, "step": 30660 }, { "epoch": 5.002446982055465, "grad_norm": 0.6006112098693848, "learning_rate": 2.9321575500681757e-05, "loss": 0.1409, "num_input_tokens_seen": 66184832, "step": 30665 }, { "epoch": 5.00326264274062, "grad_norm": 0.22609871625900269, "learning_rate": 2.93145644983407e-05, "loss": 0.0269, "num_input_tokens_seen": 66196544, "step": 30670 }, { "epoch": 5.004078303425775, "grad_norm": 0.08074972778558731, "learning_rate": 2.9307553146239146e-05, "loss": 0.0122, "num_input_tokens_seen": 66207904, "step": 30675 }, { "epoch": 5.00489396411093, "grad_norm": 0.10380563139915466, "learning_rate": 2.930054144494548e-05, "loss": 0.1043, "num_input_tokens_seen": 66218752, "step": 30680 }, { "epoch": 5.005709624796085, "grad_norm": 0.12653124332427979, "learning_rate": 2.9293529395028102e-05, "loss": 0.0058, "num_input_tokens_seen": 66230528, "step": 30685 }, { "epoch": 5.006525285481239, "grad_norm": 5.836536884307861, "learning_rate": 2.928651699705545e-05, "loss": 0.0815, "num_input_tokens_seen": 66240768, "step": 30690 }, { "epoch": 5.007340946166395, "grad_norm": 3.5016517639160156, "learning_rate": 2.927950425159598e-05, "loss": 0.1495, "num_input_tokens_seen": 66252672, "step": 30695 }, { "epoch": 5.00815660685155, "grad_norm": 0.23449905216693878, "learning_rate": 2.927249115921818e-05, "loss": 0.0862, "num_input_tokens_seen": 66263520, "step": 30700 }, { "epoch": 5.008972267536705, "grad_norm": 0.032858956605196, "learning_rate": 2.926547772049057e-05, "loss": 0.0033, "num_input_tokens_seen": 66274752, "step": 30705 }, { "epoch": 5.00978792822186, "grad_norm": 0.19907324016094208, "learning_rate": 2.9258463935981696e-05, "loss": 0.3923, "num_input_tokens_seen": 66285440, "step": 30710 }, { "epoch": 5.010603588907014, "grad_norm": 0.20320872962474823, "learning_rate": 2.9251449806260122e-05, "loss": 0.0063, "num_input_tokens_seen": 66295040, "step": 30715 }, { "epoch": 5.011419249592169, "grad_norm": 0.09244179725646973, "learning_rate": 2.9244435331894454e-05, "loss": 0.007, "num_input_tokens_seen": 66304384, "step": 30720 }, { "epoch": 5.012234910277325, "grad_norm": 0.0688881129026413, "learning_rate": 2.9237420513453328e-05, "loss": 0.1721, "num_input_tokens_seen": 66315648, "step": 30725 }, { "epoch": 5.01305057096248, "grad_norm": 0.08274415135383606, "learning_rate": 2.9230405351505386e-05, "loss": 0.0121, "num_input_tokens_seen": 66325664, "step": 30730 }, { "epoch": 5.013866231647635, "grad_norm": 0.14355804026126862, "learning_rate": 2.9223389846619326e-05, "loss": 0.0457, "num_input_tokens_seen": 66336544, "step": 30735 }, { "epoch": 5.014681892332789, "grad_norm": 0.4062250852584839, "learning_rate": 2.921637399936386e-05, "loss": 0.0097, "num_input_tokens_seen": 66347360, "step": 30740 }, { "epoch": 5.015497553017944, "grad_norm": 0.1017821654677391, "learning_rate": 2.920935781030772e-05, "loss": 0.0094, "num_input_tokens_seen": 66356992, "step": 30745 }, { "epoch": 5.0163132137031, "grad_norm": 0.06947299093008041, "learning_rate": 2.9202341280019675e-05, "loss": 0.0073, "num_input_tokens_seen": 66368032, "step": 30750 }, { "epoch": 5.017128874388255, "grad_norm": 1.3397068977355957, "learning_rate": 2.9195324409068525e-05, "loss": 0.1538, "num_input_tokens_seen": 66379424, "step": 30755 }, { "epoch": 5.0179445350734095, "grad_norm": 0.055242735892534256, "learning_rate": 2.9188307198023095e-05, "loss": 0.0166, "num_input_tokens_seen": 66390752, "step": 30760 }, { "epoch": 5.018760195758564, "grad_norm": 0.14783857762813568, "learning_rate": 2.918128964745223e-05, "loss": 0.0706, "num_input_tokens_seen": 66403200, "step": 30765 }, { "epoch": 5.019575856443719, "grad_norm": 0.038159240037202835, "learning_rate": 2.9174271757924814e-05, "loss": 0.0716, "num_input_tokens_seen": 66412768, "step": 30770 }, { "epoch": 5.020391517128874, "grad_norm": 3.5265026092529297, "learning_rate": 2.9167253530009748e-05, "loss": 0.1267, "num_input_tokens_seen": 66424448, "step": 30775 }, { "epoch": 5.02120717781403, "grad_norm": 0.13227763772010803, "learning_rate": 2.9160234964275963e-05, "loss": 0.218, "num_input_tokens_seen": 66434400, "step": 30780 }, { "epoch": 5.0220228384991845, "grad_norm": 0.04481755942106247, "learning_rate": 2.915321606129242e-05, "loss": 0.298, "num_input_tokens_seen": 66445440, "step": 30785 }, { "epoch": 5.022838499184339, "grad_norm": 2.932248115539551, "learning_rate": 2.9146196821628113e-05, "loss": 0.238, "num_input_tokens_seen": 66455776, "step": 30790 }, { "epoch": 5.023654159869494, "grad_norm": 0.03845330327749252, "learning_rate": 2.9139177245852056e-05, "loss": 0.0043, "num_input_tokens_seen": 66467616, "step": 30795 }, { "epoch": 5.024469820554649, "grad_norm": 0.18302397429943085, "learning_rate": 2.9132157334533295e-05, "loss": 0.0971, "num_input_tokens_seen": 66477056, "step": 30800 }, { "epoch": 5.025285481239805, "grad_norm": 0.14941471815109253, "learning_rate": 2.9125137088240885e-05, "loss": 0.1101, "num_input_tokens_seen": 66488352, "step": 30805 }, { "epoch": 5.0261011419249595, "grad_norm": 0.18477576971054077, "learning_rate": 2.9118116507543936e-05, "loss": 0.1074, "num_input_tokens_seen": 66497824, "step": 30810 }, { "epoch": 5.026916802610114, "grad_norm": 0.09586291015148163, "learning_rate": 2.9111095593011567e-05, "loss": 0.0776, "num_input_tokens_seen": 66509920, "step": 30815 }, { "epoch": 5.027732463295269, "grad_norm": 0.32964301109313965, "learning_rate": 2.9104074345212933e-05, "loss": 0.0103, "num_input_tokens_seen": 66519520, "step": 30820 }, { "epoch": 5.028548123980424, "grad_norm": 0.11387702077627182, "learning_rate": 2.9097052764717196e-05, "loss": 0.0161, "num_input_tokens_seen": 66529792, "step": 30825 }, { "epoch": 5.029363784665579, "grad_norm": 2.490086078643799, "learning_rate": 2.9090030852093586e-05, "loss": 0.1166, "num_input_tokens_seen": 66540416, "step": 30830 }, { "epoch": 5.0301794453507345, "grad_norm": 10.41690444946289, "learning_rate": 2.9083008607911322e-05, "loss": 0.0685, "num_input_tokens_seen": 66549280, "step": 30835 }, { "epoch": 5.030995106035889, "grad_norm": 0.26020798087120056, "learning_rate": 2.9075986032739656e-05, "loss": 0.0096, "num_input_tokens_seen": 66560640, "step": 30840 }, { "epoch": 5.031810766721044, "grad_norm": 0.4090158939361572, "learning_rate": 2.9068963127147886e-05, "loss": 0.1129, "num_input_tokens_seen": 66571808, "step": 30845 }, { "epoch": 5.032626427406199, "grad_norm": 11.681133270263672, "learning_rate": 2.906193989170532e-05, "loss": 0.2865, "num_input_tokens_seen": 66583424, "step": 30850 }, { "epoch": 5.033442088091354, "grad_norm": 23.934995651245117, "learning_rate": 2.9054916326981297e-05, "loss": 0.2309, "num_input_tokens_seen": 66593984, "step": 30855 }, { "epoch": 5.034257748776509, "grad_norm": 0.1975080817937851, "learning_rate": 2.9047892433545176e-05, "loss": 0.0191, "num_input_tokens_seen": 66604704, "step": 30860 }, { "epoch": 5.035073409461664, "grad_norm": 3.619813919067383, "learning_rate": 2.9040868211966364e-05, "loss": 0.2261, "num_input_tokens_seen": 66615808, "step": 30865 }, { "epoch": 5.035889070146819, "grad_norm": 0.3861522078514099, "learning_rate": 2.903384366281427e-05, "loss": 0.008, "num_input_tokens_seen": 66627008, "step": 30870 }, { "epoch": 5.036704730831974, "grad_norm": 0.08608371764421463, "learning_rate": 2.902681878665834e-05, "loss": 0.088, "num_input_tokens_seen": 66638464, "step": 30875 }, { "epoch": 5.037520391517129, "grad_norm": 0.2712276577949524, "learning_rate": 2.9019793584068046e-05, "loss": 0.1403, "num_input_tokens_seen": 66648544, "step": 30880 }, { "epoch": 5.0383360522022835, "grad_norm": 0.07102425396442413, "learning_rate": 2.9012768055612887e-05, "loss": 0.1543, "num_input_tokens_seen": 66660384, "step": 30885 }, { "epoch": 5.039151712887439, "grad_norm": 0.1890052855014801, "learning_rate": 2.9005742201862385e-05, "loss": 0.1326, "num_input_tokens_seen": 66671136, "step": 30890 }, { "epoch": 5.039967373572594, "grad_norm": 2.965296506881714, "learning_rate": 2.8998716023386096e-05, "loss": 0.201, "num_input_tokens_seen": 66681664, "step": 30895 }, { "epoch": 5.040783034257749, "grad_norm": 0.07390269637107849, "learning_rate": 2.8991689520753605e-05, "loss": 0.0141, "num_input_tokens_seen": 66691968, "step": 30900 }, { "epoch": 5.041598694942904, "grad_norm": 2.248039722442627, "learning_rate": 2.8984662694534504e-05, "loss": 0.0174, "num_input_tokens_seen": 66703488, "step": 30905 }, { "epoch": 5.0424143556280585, "grad_norm": 0.05766557902097702, "learning_rate": 2.897763554529842e-05, "loss": 0.0839, "num_input_tokens_seen": 66714304, "step": 30910 }, { "epoch": 5.043230016313213, "grad_norm": 2.0388343334198, "learning_rate": 2.8970608073615026e-05, "loss": 0.1338, "num_input_tokens_seen": 66724864, "step": 30915 }, { "epoch": 5.044045676998369, "grad_norm": 0.09663654118776321, "learning_rate": 2.8963580280053992e-05, "loss": 0.1101, "num_input_tokens_seen": 66736736, "step": 30920 }, { "epoch": 5.044861337683524, "grad_norm": 0.6017317771911621, "learning_rate": 2.8956552165185023e-05, "loss": 0.1653, "num_input_tokens_seen": 66748128, "step": 30925 }, { "epoch": 5.045676998368679, "grad_norm": 0.11222974956035614, "learning_rate": 2.894952372957787e-05, "loss": 0.0072, "num_input_tokens_seen": 66759200, "step": 30930 }, { "epoch": 5.0464926590538335, "grad_norm": 0.08082642406225204, "learning_rate": 2.894249497380228e-05, "loss": 0.1505, "num_input_tokens_seen": 66770720, "step": 30935 }, { "epoch": 5.047308319738988, "grad_norm": 16.161096572875977, "learning_rate": 2.893546589842805e-05, "loss": 0.1574, "num_input_tokens_seen": 66781312, "step": 30940 }, { "epoch": 5.048123980424143, "grad_norm": 0.13353952765464783, "learning_rate": 2.892843650402497e-05, "loss": 0.0102, "num_input_tokens_seen": 66792864, "step": 30945 }, { "epoch": 5.048939641109299, "grad_norm": 8.055357933044434, "learning_rate": 2.8921406791162902e-05, "loss": 0.2173, "num_input_tokens_seen": 66802976, "step": 30950 }, { "epoch": 5.049755301794454, "grad_norm": 0.23622195422649384, "learning_rate": 2.891437676041171e-05, "loss": 0.0082, "num_input_tokens_seen": 66812768, "step": 30955 }, { "epoch": 5.0505709624796085, "grad_norm": 0.10315897315740585, "learning_rate": 2.890734641234127e-05, "loss": 0.0993, "num_input_tokens_seen": 66823072, "step": 30960 }, { "epoch": 5.051386623164763, "grad_norm": 0.9988123178482056, "learning_rate": 2.8900315747521507e-05, "loss": 0.0113, "num_input_tokens_seen": 66835200, "step": 30965 }, { "epoch": 5.052202283849918, "grad_norm": 0.21393397450447083, "learning_rate": 2.8893284766522353e-05, "loss": 0.009, "num_input_tokens_seen": 66846528, "step": 30970 }, { "epoch": 5.053017944535074, "grad_norm": 1.640389084815979, "learning_rate": 2.8886253469913787e-05, "loss": 0.1719, "num_input_tokens_seen": 66858080, "step": 30975 }, { "epoch": 5.053833605220229, "grad_norm": 0.2589954435825348, "learning_rate": 2.8879221858265794e-05, "loss": 0.0182, "num_input_tokens_seen": 66869888, "step": 30980 }, { "epoch": 5.054649265905383, "grad_norm": 18.620851516723633, "learning_rate": 2.8872189932148392e-05, "loss": 0.2544, "num_input_tokens_seen": 66880256, "step": 30985 }, { "epoch": 5.055464926590538, "grad_norm": 0.7262380719184875, "learning_rate": 2.8865157692131633e-05, "loss": 0.1503, "num_input_tokens_seen": 66890784, "step": 30990 }, { "epoch": 5.056280587275693, "grad_norm": 0.15699176490306854, "learning_rate": 2.8858125138785568e-05, "loss": 0.0112, "num_input_tokens_seen": 66901920, "step": 30995 }, { "epoch": 5.057096247960848, "grad_norm": 0.6760200262069702, "learning_rate": 2.8851092272680313e-05, "loss": 0.1433, "num_input_tokens_seen": 66912608, "step": 31000 }, { "epoch": 5.057911908646004, "grad_norm": 0.12168726325035095, "learning_rate": 2.8844059094385977e-05, "loss": 0.1217, "num_input_tokens_seen": 66922336, "step": 31005 }, { "epoch": 5.058727569331158, "grad_norm": 0.2774670422077179, "learning_rate": 2.883702560447271e-05, "loss": 0.1797, "num_input_tokens_seen": 66932960, "step": 31010 }, { "epoch": 5.059543230016313, "grad_norm": 0.12740923464298248, "learning_rate": 2.8829991803510675e-05, "loss": 0.1272, "num_input_tokens_seen": 66944448, "step": 31015 }, { "epoch": 5.060358890701468, "grad_norm": 0.5450809001922607, "learning_rate": 2.8822957692070073e-05, "loss": 0.0596, "num_input_tokens_seen": 66955264, "step": 31020 }, { "epoch": 5.061174551386623, "grad_norm": 0.08875981718301773, "learning_rate": 2.8815923270721124e-05, "loss": 0.005, "num_input_tokens_seen": 66965024, "step": 31025 }, { "epoch": 5.061990212071779, "grad_norm": 0.08391879498958588, "learning_rate": 2.8808888540034067e-05, "loss": 0.0838, "num_input_tokens_seen": 66975072, "step": 31030 }, { "epoch": 5.062805872756933, "grad_norm": 0.03846724331378937, "learning_rate": 2.8801853500579183e-05, "loss": 0.1336, "num_input_tokens_seen": 66987232, "step": 31035 }, { "epoch": 5.063621533442088, "grad_norm": 2.5398213863372803, "learning_rate": 2.879481815292676e-05, "loss": 0.2049, "num_input_tokens_seen": 66995968, "step": 31040 }, { "epoch": 5.064437194127243, "grad_norm": 4.764420032501221, "learning_rate": 2.878778249764713e-05, "loss": 0.2104, "num_input_tokens_seen": 67006912, "step": 31045 }, { "epoch": 5.065252854812398, "grad_norm": 0.1985485851764679, "learning_rate": 2.8780746535310623e-05, "loss": 0.0223, "num_input_tokens_seen": 67017824, "step": 31050 }, { "epoch": 5.066068515497553, "grad_norm": 0.08837421983480453, "learning_rate": 2.8773710266487623e-05, "loss": 0.008, "num_input_tokens_seen": 67028736, "step": 31055 }, { "epoch": 5.066884176182708, "grad_norm": 0.10470012575387955, "learning_rate": 2.8766673691748524e-05, "loss": 0.048, "num_input_tokens_seen": 67039936, "step": 31060 }, { "epoch": 5.067699836867863, "grad_norm": 0.11755790561437607, "learning_rate": 2.875963681166373e-05, "loss": 0.0116, "num_input_tokens_seen": 67051616, "step": 31065 }, { "epoch": 5.068515497553018, "grad_norm": 0.062179673463106155, "learning_rate": 2.8752599626803717e-05, "loss": 0.0514, "num_input_tokens_seen": 67062240, "step": 31070 }, { "epoch": 5.069331158238173, "grad_norm": 0.05381360277533531, "learning_rate": 2.874556213773893e-05, "loss": 0.0173, "num_input_tokens_seen": 67073408, "step": 31075 }, { "epoch": 5.070146818923328, "grad_norm": 0.10158298909664154, "learning_rate": 2.8738524345039876e-05, "loss": 0.1015, "num_input_tokens_seen": 67084896, "step": 31080 }, { "epoch": 5.0709624796084825, "grad_norm": 0.07138735055923462, "learning_rate": 2.8731486249277062e-05, "loss": 0.005, "num_input_tokens_seen": 67095040, "step": 31085 }, { "epoch": 5.071778140293638, "grad_norm": 0.0807000994682312, "learning_rate": 2.8724447851021047e-05, "loss": 0.0992, "num_input_tokens_seen": 67106336, "step": 31090 }, { "epoch": 5.072593800978793, "grad_norm": 0.09665209800004959, "learning_rate": 2.871740915084239e-05, "loss": 0.2611, "num_input_tokens_seen": 67116832, "step": 31095 }, { "epoch": 5.073409461663948, "grad_norm": 11.138023376464844, "learning_rate": 2.871037014931168e-05, "loss": 0.1062, "num_input_tokens_seen": 67128736, "step": 31100 }, { "epoch": 5.074225122349103, "grad_norm": 3.8535048961639404, "learning_rate": 2.870333084699954e-05, "loss": 0.0221, "num_input_tokens_seen": 67140128, "step": 31105 }, { "epoch": 5.075040783034257, "grad_norm": 0.25655168294906616, "learning_rate": 2.8696291244476613e-05, "loss": 0.0776, "num_input_tokens_seen": 67151232, "step": 31110 }, { "epoch": 5.075856443719413, "grad_norm": 0.08060576021671295, "learning_rate": 2.8689251342313562e-05, "loss": 0.1069, "num_input_tokens_seen": 67161984, "step": 31115 }, { "epoch": 5.076672104404568, "grad_norm": 1.157850980758667, "learning_rate": 2.8682211141081084e-05, "loss": 0.1153, "num_input_tokens_seen": 67172224, "step": 31120 }, { "epoch": 5.077487765089723, "grad_norm": 0.04678581282496452, "learning_rate": 2.867517064134988e-05, "loss": 0.0092, "num_input_tokens_seen": 67182560, "step": 31125 }, { "epoch": 5.078303425774878, "grad_norm": 0.0874355211853981, "learning_rate": 2.8668129843690704e-05, "loss": 0.0052, "num_input_tokens_seen": 67193024, "step": 31130 }, { "epoch": 5.079119086460032, "grad_norm": 0.08548480272293091, "learning_rate": 2.86610887486743e-05, "loss": 0.2387, "num_input_tokens_seen": 67204096, "step": 31135 }, { "epoch": 5.079934747145187, "grad_norm": 0.10910390317440033, "learning_rate": 2.8654047356871473e-05, "loss": 0.0158, "num_input_tokens_seen": 67214432, "step": 31140 }, { "epoch": 5.080750407830343, "grad_norm": 0.32175755500793457, "learning_rate": 2.8647005668853027e-05, "loss": 0.0158, "num_input_tokens_seen": 67225792, "step": 31145 }, { "epoch": 5.081566068515498, "grad_norm": 1.7120707035064697, "learning_rate": 2.8639963685189795e-05, "loss": 0.2072, "num_input_tokens_seen": 67236128, "step": 31150 }, { "epoch": 5.082381729200653, "grad_norm": 0.13650250434875488, "learning_rate": 2.8632921406452635e-05, "loss": 0.0841, "num_input_tokens_seen": 67247072, "step": 31155 }, { "epoch": 5.083197389885807, "grad_norm": 0.14863349497318268, "learning_rate": 2.862587883321244e-05, "loss": 0.0944, "num_input_tokens_seen": 67257344, "step": 31160 }, { "epoch": 5.084013050570962, "grad_norm": 6.298161506652832, "learning_rate": 2.8618835966040104e-05, "loss": 0.0831, "num_input_tokens_seen": 67266848, "step": 31165 }, { "epoch": 5.084828711256117, "grad_norm": 0.17926152050495148, "learning_rate": 2.8611792805506565e-05, "loss": 0.2324, "num_input_tokens_seen": 67277056, "step": 31170 }, { "epoch": 5.085644371941273, "grad_norm": 0.08532644063234329, "learning_rate": 2.8604749352182776e-05, "loss": 0.065, "num_input_tokens_seen": 67288672, "step": 31175 }, { "epoch": 5.0864600326264275, "grad_norm": 0.06849771738052368, "learning_rate": 2.8597705606639707e-05, "loss": 0.0989, "num_input_tokens_seen": 67299424, "step": 31180 }, { "epoch": 5.087275693311582, "grad_norm": 0.16645334661006927, "learning_rate": 2.8590661569448372e-05, "loss": 0.013, "num_input_tokens_seen": 67309056, "step": 31185 }, { "epoch": 5.088091353996737, "grad_norm": 0.2268446981906891, "learning_rate": 2.8583617241179796e-05, "loss": 0.0084, "num_input_tokens_seen": 67320320, "step": 31190 }, { "epoch": 5.088907014681892, "grad_norm": 0.165228009223938, "learning_rate": 2.8576572622405017e-05, "loss": 0.2108, "num_input_tokens_seen": 67330944, "step": 31195 }, { "epoch": 5.089722675367048, "grad_norm": 0.0722666010260582, "learning_rate": 2.856952771369512e-05, "loss": 0.0045, "num_input_tokens_seen": 67342720, "step": 31200 }, { "epoch": 5.0905383360522025, "grad_norm": 4.162609100341797, "learning_rate": 2.856248251562119e-05, "loss": 0.1851, "num_input_tokens_seen": 67353856, "step": 31205 }, { "epoch": 5.091353996737357, "grad_norm": 0.02475403994321823, "learning_rate": 2.8555437028754352e-05, "loss": 0.2968, "num_input_tokens_seen": 67364384, "step": 31210 }, { "epoch": 5.092169657422512, "grad_norm": 0.10372800379991531, "learning_rate": 2.8548391253665746e-05, "loss": 0.2329, "num_input_tokens_seen": 67375328, "step": 31215 }, { "epoch": 5.092985318107667, "grad_norm": 0.17950288951396942, "learning_rate": 2.854134519092654e-05, "loss": 0.0115, "num_input_tokens_seen": 67386240, "step": 31220 }, { "epoch": 5.093800978792822, "grad_norm": 0.15762710571289062, "learning_rate": 2.8534298841107925e-05, "loss": 0.0208, "num_input_tokens_seen": 67397760, "step": 31225 }, { "epoch": 5.0946166394779775, "grad_norm": 0.045956000685691833, "learning_rate": 2.8527252204781117e-05, "loss": 0.0207, "num_input_tokens_seen": 67408768, "step": 31230 }, { "epoch": 5.095432300163132, "grad_norm": 7.062222480773926, "learning_rate": 2.852020528251735e-05, "loss": 0.141, "num_input_tokens_seen": 67419456, "step": 31235 }, { "epoch": 5.096247960848287, "grad_norm": 0.2240462750196457, "learning_rate": 2.8513158074887875e-05, "loss": 0.1079, "num_input_tokens_seen": 67430528, "step": 31240 }, { "epoch": 5.097063621533442, "grad_norm": 0.3925155997276306, "learning_rate": 2.8506110582463984e-05, "loss": 0.0875, "num_input_tokens_seen": 67441472, "step": 31245 }, { "epoch": 5.097879282218597, "grad_norm": 3.969595432281494, "learning_rate": 2.8499062805816984e-05, "loss": 0.249, "num_input_tokens_seen": 67451808, "step": 31250 }, { "epoch": 5.0986949429037525, "grad_norm": 0.21328844130039215, "learning_rate": 2.8492014745518192e-05, "loss": 0.0764, "num_input_tokens_seen": 67463104, "step": 31255 }, { "epoch": 5.099510603588907, "grad_norm": 0.07602519541978836, "learning_rate": 2.8484966402138968e-05, "loss": 0.0097, "num_input_tokens_seen": 67474144, "step": 31260 }, { "epoch": 5.100326264274062, "grad_norm": 0.20172946155071259, "learning_rate": 2.8477917776250683e-05, "loss": 0.0227, "num_input_tokens_seen": 67484096, "step": 31265 }, { "epoch": 5.101141924959217, "grad_norm": 0.03285258263349533, "learning_rate": 2.847086886842474e-05, "loss": 0.1259, "num_input_tokens_seen": 67493664, "step": 31270 }, { "epoch": 5.101957585644372, "grad_norm": 1.429862141609192, "learning_rate": 2.8463819679232555e-05, "loss": 0.3046, "num_input_tokens_seen": 67504768, "step": 31275 }, { "epoch": 5.102773246329527, "grad_norm": 0.3224422335624695, "learning_rate": 2.845677020924557e-05, "loss": 0.0953, "num_input_tokens_seen": 67514848, "step": 31280 }, { "epoch": 5.103588907014682, "grad_norm": 0.0700070783495903, "learning_rate": 2.8449720459035256e-05, "loss": 0.0796, "num_input_tokens_seen": 67525248, "step": 31285 }, { "epoch": 5.104404567699837, "grad_norm": 2.38926100730896, "learning_rate": 2.8442670429173098e-05, "loss": 0.0528, "num_input_tokens_seen": 67534656, "step": 31290 }, { "epoch": 5.105220228384992, "grad_norm": 5.326434135437012, "learning_rate": 2.8435620120230595e-05, "loss": 0.2816, "num_input_tokens_seen": 67545024, "step": 31295 }, { "epoch": 5.106035889070147, "grad_norm": 6.8628644943237305, "learning_rate": 2.84285695327793e-05, "loss": 0.2522, "num_input_tokens_seen": 67556640, "step": 31300 }, { "epoch": 5.1068515497553015, "grad_norm": 0.13472425937652588, "learning_rate": 2.842151866739077e-05, "loss": 0.0066, "num_input_tokens_seen": 67568032, "step": 31305 }, { "epoch": 5.107667210440456, "grad_norm": 0.11969398707151413, "learning_rate": 2.8414467524636568e-05, "loss": 0.0087, "num_input_tokens_seen": 67578688, "step": 31310 }, { "epoch": 5.108482871125612, "grad_norm": 0.055297184735536575, "learning_rate": 2.8407416105088304e-05, "loss": 0.0137, "num_input_tokens_seen": 67589664, "step": 31315 }, { "epoch": 5.109298531810767, "grad_norm": 0.9463332295417786, "learning_rate": 2.8400364409317604e-05, "loss": 0.1658, "num_input_tokens_seen": 67600736, "step": 31320 }, { "epoch": 5.110114192495922, "grad_norm": 0.21723707020282745, "learning_rate": 2.8393312437896112e-05, "loss": 0.2344, "num_input_tokens_seen": 67612288, "step": 31325 }, { "epoch": 5.1109298531810765, "grad_norm": 3.053849220275879, "learning_rate": 2.8386260191395497e-05, "loss": 0.2375, "num_input_tokens_seen": 67622592, "step": 31330 }, { "epoch": 5.111745513866231, "grad_norm": 0.13577203452587128, "learning_rate": 2.837920767038745e-05, "loss": 0.1754, "num_input_tokens_seen": 67633568, "step": 31335 }, { "epoch": 5.112561174551387, "grad_norm": 0.09780284017324448, "learning_rate": 2.837215487544368e-05, "loss": 0.1429, "num_input_tokens_seen": 67644000, "step": 31340 }, { "epoch": 5.113376835236542, "grad_norm": 7.872146129608154, "learning_rate": 2.836510180713593e-05, "loss": 0.2021, "num_input_tokens_seen": 67654816, "step": 31345 }, { "epoch": 5.114192495921697, "grad_norm": 2.2904276847839355, "learning_rate": 2.835804846603595e-05, "loss": 0.3047, "num_input_tokens_seen": 67666304, "step": 31350 }, { "epoch": 5.1150081566068515, "grad_norm": 4.582032203674316, "learning_rate": 2.8350994852715522e-05, "loss": 0.1335, "num_input_tokens_seen": 67677536, "step": 31355 }, { "epoch": 5.115823817292006, "grad_norm": 0.11408593505620956, "learning_rate": 2.8343940967746453e-05, "loss": 0.1285, "num_input_tokens_seen": 67688608, "step": 31360 }, { "epoch": 5.116639477977161, "grad_norm": 19.40458106994629, "learning_rate": 2.8336886811700548e-05, "loss": 0.1566, "num_input_tokens_seen": 67699200, "step": 31365 }, { "epoch": 5.117455138662317, "grad_norm": 0.1368633657693863, "learning_rate": 2.8329832385149678e-05, "loss": 0.022, "num_input_tokens_seen": 67709568, "step": 31370 }, { "epoch": 5.118270799347472, "grad_norm": 2.977189540863037, "learning_rate": 2.8322777688665704e-05, "loss": 0.1955, "num_input_tokens_seen": 67719360, "step": 31375 }, { "epoch": 5.1190864600326265, "grad_norm": 0.13537751138210297, "learning_rate": 2.83157227228205e-05, "loss": 0.065, "num_input_tokens_seen": 67729472, "step": 31380 }, { "epoch": 5.119902120717781, "grad_norm": 5.512944221496582, "learning_rate": 2.830866748818599e-05, "loss": 0.0586, "num_input_tokens_seen": 67739424, "step": 31385 }, { "epoch": 5.120717781402936, "grad_norm": 13.24183177947998, "learning_rate": 2.8301611985334115e-05, "loss": 0.0685, "num_input_tokens_seen": 67750624, "step": 31390 }, { "epoch": 5.121533442088092, "grad_norm": 0.27740323543548584, "learning_rate": 2.8294556214836813e-05, "loss": 0.0966, "num_input_tokens_seen": 67760960, "step": 31395 }, { "epoch": 5.122349102773247, "grad_norm": 0.24310491979122162, "learning_rate": 2.828750017726607e-05, "loss": 0.0623, "num_input_tokens_seen": 67771232, "step": 31400 }, { "epoch": 5.123164763458401, "grad_norm": 3.1149415969848633, "learning_rate": 2.8280443873193884e-05, "loss": 0.0122, "num_input_tokens_seen": 67781408, "step": 31405 }, { "epoch": 5.123980424143556, "grad_norm": 0.09213629364967346, "learning_rate": 2.8273387303192266e-05, "loss": 0.1982, "num_input_tokens_seen": 67793472, "step": 31410 }, { "epoch": 5.124796084828711, "grad_norm": 0.2630845904350281, "learning_rate": 2.8266330467833274e-05, "loss": 0.0068, "num_input_tokens_seen": 67804704, "step": 31415 }, { "epoch": 5.125611745513866, "grad_norm": 0.10485062748193741, "learning_rate": 2.8259273367688954e-05, "loss": 0.1833, "num_input_tokens_seen": 67815840, "step": 31420 }, { "epoch": 5.126427406199022, "grad_norm": 0.11413134634494781, "learning_rate": 2.8252216003331395e-05, "loss": 0.0076, "num_input_tokens_seen": 67827008, "step": 31425 }, { "epoch": 5.127243066884176, "grad_norm": 2.9589438438415527, "learning_rate": 2.824515837533271e-05, "loss": 0.3522, "num_input_tokens_seen": 67837024, "step": 31430 }, { "epoch": 5.128058727569331, "grad_norm": 0.2477777749300003, "learning_rate": 2.823810048426502e-05, "loss": 0.0164, "num_input_tokens_seen": 67847360, "step": 31435 }, { "epoch": 5.128874388254486, "grad_norm": 0.05343763157725334, "learning_rate": 2.8231042330700473e-05, "loss": 0.341, "num_input_tokens_seen": 67857152, "step": 31440 }, { "epoch": 5.129690048939641, "grad_norm": 3.411064386367798, "learning_rate": 2.822398391521125e-05, "loss": 0.1293, "num_input_tokens_seen": 67867680, "step": 31445 }, { "epoch": 5.130505709624796, "grad_norm": 0.09728110581636429, "learning_rate": 2.8216925238369518e-05, "loss": 0.0085, "num_input_tokens_seen": 67877632, "step": 31450 }, { "epoch": 5.131321370309951, "grad_norm": 0.0923701673746109, "learning_rate": 2.820986630074751e-05, "loss": 0.1656, "num_input_tokens_seen": 67886912, "step": 31455 }, { "epoch": 5.132137030995106, "grad_norm": 0.14128021895885468, "learning_rate": 2.8202807102917457e-05, "loss": 0.0103, "num_input_tokens_seen": 67895040, "step": 31460 }, { "epoch": 5.132952691680261, "grad_norm": 0.20301920175552368, "learning_rate": 2.8195747645451605e-05, "loss": 0.0616, "num_input_tokens_seen": 67906336, "step": 31465 }, { "epoch": 5.133768352365416, "grad_norm": 0.07375723868608475, "learning_rate": 2.8188687928922237e-05, "loss": 0.1307, "num_input_tokens_seen": 67916992, "step": 31470 }, { "epoch": 5.134584013050571, "grad_norm": 0.12706264853477478, "learning_rate": 2.818162795390164e-05, "loss": 0.0992, "num_input_tokens_seen": 67928352, "step": 31475 }, { "epoch": 5.135399673735726, "grad_norm": 0.8841932415962219, "learning_rate": 2.817456772096214e-05, "loss": 0.2177, "num_input_tokens_seen": 67940160, "step": 31480 }, { "epoch": 5.136215334420881, "grad_norm": 1.2765356302261353, "learning_rate": 2.8167507230676077e-05, "loss": 0.0696, "num_input_tokens_seen": 67950304, "step": 31485 }, { "epoch": 5.137030995106036, "grad_norm": 0.15975184738636017, "learning_rate": 2.8160446483615804e-05, "loss": 0.1641, "num_input_tokens_seen": 67960224, "step": 31490 }, { "epoch": 5.137846655791191, "grad_norm": 2.555025577545166, "learning_rate": 2.8153385480353705e-05, "loss": 0.1185, "num_input_tokens_seen": 67971616, "step": 31495 }, { "epoch": 5.138662316476346, "grad_norm": 0.24455052614212036, "learning_rate": 2.814632422146218e-05, "loss": 0.1452, "num_input_tokens_seen": 67982048, "step": 31500 }, { "epoch": 5.1394779771615005, "grad_norm": 0.1774958372116089, "learning_rate": 2.8139262707513647e-05, "loss": 0.1026, "num_input_tokens_seen": 67991584, "step": 31505 }, { "epoch": 5.140293637846656, "grad_norm": 6.054966449737549, "learning_rate": 2.813220093908055e-05, "loss": 0.1188, "num_input_tokens_seen": 68002208, "step": 31510 }, { "epoch": 5.141109298531811, "grad_norm": 0.17508752644062042, "learning_rate": 2.812513891673535e-05, "loss": 0.0089, "num_input_tokens_seen": 68013344, "step": 31515 }, { "epoch": 5.141924959216966, "grad_norm": 1.0667266845703125, "learning_rate": 2.8118076641050535e-05, "loss": 0.1915, "num_input_tokens_seen": 68025056, "step": 31520 }, { "epoch": 5.142740619902121, "grad_norm": 0.3312191069126129, "learning_rate": 2.81110141125986e-05, "loss": 0.0102, "num_input_tokens_seen": 68035296, "step": 31525 }, { "epoch": 5.143556280587275, "grad_norm": 2.1736700534820557, "learning_rate": 2.8103951331952083e-05, "loss": 0.0901, "num_input_tokens_seen": 68046080, "step": 31530 }, { "epoch": 5.14437194127243, "grad_norm": 0.08169755339622498, "learning_rate": 2.8096888299683515e-05, "loss": 0.0885, "num_input_tokens_seen": 68057344, "step": 31535 }, { "epoch": 5.145187601957586, "grad_norm": 0.13582096993923187, "learning_rate": 2.8089825016365478e-05, "loss": 0.127, "num_input_tokens_seen": 68067488, "step": 31540 }, { "epoch": 5.146003262642741, "grad_norm": 14.365918159484863, "learning_rate": 2.808276148257054e-05, "loss": 0.151, "num_input_tokens_seen": 68079584, "step": 31545 }, { "epoch": 5.146818923327896, "grad_norm": 0.16848717629909515, "learning_rate": 2.807569769887132e-05, "loss": 0.2657, "num_input_tokens_seen": 68090816, "step": 31550 }, { "epoch": 5.14763458401305, "grad_norm": 0.10022691637277603, "learning_rate": 2.8068633665840438e-05, "loss": 0.0765, "num_input_tokens_seen": 68102592, "step": 31555 }, { "epoch": 5.148450244698205, "grad_norm": 0.2540022134780884, "learning_rate": 2.806156938405054e-05, "loss": 0.0084, "num_input_tokens_seen": 68113664, "step": 31560 }, { "epoch": 5.149265905383361, "grad_norm": 0.15770824253559113, "learning_rate": 2.8054504854074293e-05, "loss": 0.0684, "num_input_tokens_seen": 68124480, "step": 31565 }, { "epoch": 5.150081566068516, "grad_norm": 0.11198746412992477, "learning_rate": 2.8047440076484383e-05, "loss": 0.093, "num_input_tokens_seen": 68135424, "step": 31570 }, { "epoch": 5.150897226753671, "grad_norm": 0.7075061202049255, "learning_rate": 2.8040375051853522e-05, "loss": 0.1386, "num_input_tokens_seen": 68146784, "step": 31575 }, { "epoch": 5.151712887438825, "grad_norm": 0.45923352241516113, "learning_rate": 2.803330978075443e-05, "loss": 0.1452, "num_input_tokens_seen": 68155744, "step": 31580 }, { "epoch": 5.15252854812398, "grad_norm": 0.06245943903923035, "learning_rate": 2.802624426375985e-05, "loss": 0.0054, "num_input_tokens_seen": 68167168, "step": 31585 }, { "epoch": 5.153344208809135, "grad_norm": 3.847533702850342, "learning_rate": 2.801917850144256e-05, "loss": 0.1779, "num_input_tokens_seen": 68178496, "step": 31590 }, { "epoch": 5.154159869494291, "grad_norm": 0.1625690907239914, "learning_rate": 2.8012112494375342e-05, "loss": 0.0659, "num_input_tokens_seen": 68189568, "step": 31595 }, { "epoch": 5.1549755301794455, "grad_norm": 0.17873123288154602, "learning_rate": 2.8005046243131005e-05, "loss": 0.0522, "num_input_tokens_seen": 68198624, "step": 31600 }, { "epoch": 5.1557911908646, "grad_norm": 0.1790742725133896, "learning_rate": 2.7997979748282364e-05, "loss": 0.0064, "num_input_tokens_seen": 68210528, "step": 31605 }, { "epoch": 5.156606851549755, "grad_norm": 0.09914734214544296, "learning_rate": 2.7990913010402282e-05, "loss": 0.0718, "num_input_tokens_seen": 68220384, "step": 31610 }, { "epoch": 5.15742251223491, "grad_norm": 0.21887736022472382, "learning_rate": 2.798384603006361e-05, "loss": 0.1765, "num_input_tokens_seen": 68230464, "step": 31615 }, { "epoch": 5.158238172920065, "grad_norm": 0.1654205173254013, "learning_rate": 2.7976778807839245e-05, "loss": 0.1839, "num_input_tokens_seen": 68240384, "step": 31620 }, { "epoch": 5.1590538336052205, "grad_norm": 0.12954449653625488, "learning_rate": 2.796971134430208e-05, "loss": 0.0274, "num_input_tokens_seen": 68251840, "step": 31625 }, { "epoch": 5.159869494290375, "grad_norm": 1.3999621868133545, "learning_rate": 2.7962643640025044e-05, "loss": 0.1217, "num_input_tokens_seen": 68263008, "step": 31630 }, { "epoch": 5.16068515497553, "grad_norm": 0.18048496544361115, "learning_rate": 2.7955575695581083e-05, "loss": 0.0967, "num_input_tokens_seen": 68273920, "step": 31635 }, { "epoch": 5.161500815660685, "grad_norm": 1.6067979335784912, "learning_rate": 2.794850751154316e-05, "loss": 0.1178, "num_input_tokens_seen": 68284256, "step": 31640 }, { "epoch": 5.16231647634584, "grad_norm": 0.06456427276134491, "learning_rate": 2.794143908848426e-05, "loss": 0.1148, "num_input_tokens_seen": 68296000, "step": 31645 }, { "epoch": 5.1631321370309955, "grad_norm": 0.11345984786748886, "learning_rate": 2.7934370426977385e-05, "loss": 0.013, "num_input_tokens_seen": 68308224, "step": 31650 }, { "epoch": 5.16394779771615, "grad_norm": 0.10962020605802536, "learning_rate": 2.792730152759555e-05, "loss": 0.0314, "num_input_tokens_seen": 68318560, "step": 31655 }, { "epoch": 5.164763458401305, "grad_norm": 0.11335140466690063, "learning_rate": 2.7920232390911805e-05, "loss": 0.1019, "num_input_tokens_seen": 68328256, "step": 31660 }, { "epoch": 5.16557911908646, "grad_norm": 0.08123160898685455, "learning_rate": 2.79131630174992e-05, "loss": 0.0973, "num_input_tokens_seen": 68339648, "step": 31665 }, { "epoch": 5.166394779771615, "grad_norm": 12.271551132202148, "learning_rate": 2.790609340793082e-05, "loss": 0.1062, "num_input_tokens_seen": 68351072, "step": 31670 }, { "epoch": 5.16721044045677, "grad_norm": 10.418120384216309, "learning_rate": 2.789902356277977e-05, "loss": 0.054, "num_input_tokens_seen": 68360832, "step": 31675 }, { "epoch": 5.168026101141925, "grad_norm": 2.957167387008667, "learning_rate": 2.7891953482619148e-05, "loss": 0.137, "num_input_tokens_seen": 68370016, "step": 31680 }, { "epoch": 5.16884176182708, "grad_norm": 0.08972841501235962, "learning_rate": 2.788488316802211e-05, "loss": 0.064, "num_input_tokens_seen": 68380768, "step": 31685 }, { "epoch": 5.169657422512235, "grad_norm": 0.12127567827701569, "learning_rate": 2.78778126195618e-05, "loss": 0.0072, "num_input_tokens_seen": 68392448, "step": 31690 }, { "epoch": 5.17047308319739, "grad_norm": 0.649427592754364, "learning_rate": 2.7870741837811404e-05, "loss": 0.1453, "num_input_tokens_seen": 68402016, "step": 31695 }, { "epoch": 5.171288743882545, "grad_norm": 0.31435349583625793, "learning_rate": 2.7863670823344106e-05, "loss": 0.0956, "num_input_tokens_seen": 68412704, "step": 31700 }, { "epoch": 5.1721044045677, "grad_norm": 0.11511395126581192, "learning_rate": 2.7856599576733124e-05, "loss": 0.1296, "num_input_tokens_seen": 68424448, "step": 31705 }, { "epoch": 5.172920065252855, "grad_norm": 0.131520077586174, "learning_rate": 2.7849528098551682e-05, "loss": 0.212, "num_input_tokens_seen": 68435168, "step": 31710 }, { "epoch": 5.17373572593801, "grad_norm": 0.3217465579509735, "learning_rate": 2.7842456389373032e-05, "loss": 0.1175, "num_input_tokens_seen": 68445696, "step": 31715 }, { "epoch": 5.174551386623165, "grad_norm": 3.1008524894714355, "learning_rate": 2.783538444977045e-05, "loss": 0.0683, "num_input_tokens_seen": 68455936, "step": 31720 }, { "epoch": 5.1753670473083195, "grad_norm": 0.08708109706640244, "learning_rate": 2.7828312280317214e-05, "loss": 0.0753, "num_input_tokens_seen": 68465344, "step": 31725 }, { "epoch": 5.176182707993474, "grad_norm": 0.11733467131853104, "learning_rate": 2.782123988158664e-05, "loss": 0.0063, "num_input_tokens_seen": 68475936, "step": 31730 }, { "epoch": 5.17699836867863, "grad_norm": 0.33614835143089294, "learning_rate": 2.781416725415204e-05, "loss": 0.0123, "num_input_tokens_seen": 68486496, "step": 31735 }, { "epoch": 5.177814029363785, "grad_norm": 4.399515151977539, "learning_rate": 2.780709439858677e-05, "loss": 0.019, "num_input_tokens_seen": 68497408, "step": 31740 }, { "epoch": 5.17862969004894, "grad_norm": 3.38046932220459, "learning_rate": 2.7800021315464176e-05, "loss": 0.0222, "num_input_tokens_seen": 68506720, "step": 31745 }, { "epoch": 5.1794453507340945, "grad_norm": 3.1816532611846924, "learning_rate": 2.779294800535765e-05, "loss": 0.1885, "num_input_tokens_seen": 68516800, "step": 31750 }, { "epoch": 5.180261011419249, "grad_norm": 3.5422580242156982, "learning_rate": 2.778587446884059e-05, "loss": 0.0761, "num_input_tokens_seen": 68525120, "step": 31755 }, { "epoch": 5.181076672104404, "grad_norm": 0.10872621089220047, "learning_rate": 2.777880070648641e-05, "loss": 0.063, "num_input_tokens_seen": 68535872, "step": 31760 }, { "epoch": 5.18189233278956, "grad_norm": 0.17329590022563934, "learning_rate": 2.777172671886854e-05, "loss": 0.0395, "num_input_tokens_seen": 68545760, "step": 31765 }, { "epoch": 5.182707993474715, "grad_norm": 0.13874921202659607, "learning_rate": 2.776465250656044e-05, "loss": 0.0084, "num_input_tokens_seen": 68555552, "step": 31770 }, { "epoch": 5.1835236541598695, "grad_norm": 9.618099212646484, "learning_rate": 2.7757578070135588e-05, "loss": 0.0973, "num_input_tokens_seen": 68565760, "step": 31775 }, { "epoch": 5.184339314845024, "grad_norm": 0.17414389550685883, "learning_rate": 2.775050341016746e-05, "loss": 0.0045, "num_input_tokens_seen": 68576160, "step": 31780 }, { "epoch": 5.185154975530179, "grad_norm": 3.377487897872925, "learning_rate": 2.774342852722957e-05, "loss": 0.3322, "num_input_tokens_seen": 68587360, "step": 31785 }, { "epoch": 5.185970636215335, "grad_norm": 0.9131451845169067, "learning_rate": 2.7736353421895445e-05, "loss": 0.0215, "num_input_tokens_seen": 68598336, "step": 31790 }, { "epoch": 5.18678629690049, "grad_norm": 0.06972074508666992, "learning_rate": 2.772927809473862e-05, "loss": 0.1982, "num_input_tokens_seen": 68610560, "step": 31795 }, { "epoch": 5.1876019575856445, "grad_norm": 0.153375044465065, "learning_rate": 2.7722202546332676e-05, "loss": 0.0079, "num_input_tokens_seen": 68621472, "step": 31800 }, { "epoch": 5.188417618270799, "grad_norm": 3.2225635051727295, "learning_rate": 2.7715126777251177e-05, "loss": 0.2064, "num_input_tokens_seen": 68632064, "step": 31805 }, { "epoch": 5.189233278955954, "grad_norm": 0.3280313313007355, "learning_rate": 2.7708050788067724e-05, "loss": 0.112, "num_input_tokens_seen": 68642784, "step": 31810 }, { "epoch": 5.190048939641109, "grad_norm": 0.27965638041496277, "learning_rate": 2.7700974579355933e-05, "loss": 0.098, "num_input_tokens_seen": 68652448, "step": 31815 }, { "epoch": 5.190864600326265, "grad_norm": 3.0123000144958496, "learning_rate": 2.769389815168944e-05, "loss": 0.1138, "num_input_tokens_seen": 68662656, "step": 31820 }, { "epoch": 5.191680261011419, "grad_norm": 0.11257915198802948, "learning_rate": 2.7686821505641893e-05, "loss": 0.0073, "num_input_tokens_seen": 68673248, "step": 31825 }, { "epoch": 5.192495921696574, "grad_norm": 2.494020462036133, "learning_rate": 2.7679744641786963e-05, "loss": 0.0867, "num_input_tokens_seen": 68683776, "step": 31830 }, { "epoch": 5.193311582381729, "grad_norm": 0.20498842000961304, "learning_rate": 2.7672667560698328e-05, "loss": 0.0584, "num_input_tokens_seen": 68694112, "step": 31835 }, { "epoch": 5.194127243066884, "grad_norm": 4.805168628692627, "learning_rate": 2.7665590262949707e-05, "loss": 0.2257, "num_input_tokens_seen": 68704512, "step": 31840 }, { "epoch": 5.19494290375204, "grad_norm": 0.2649928033351898, "learning_rate": 2.7658512749114816e-05, "loss": 0.1219, "num_input_tokens_seen": 68715776, "step": 31845 }, { "epoch": 5.195758564437194, "grad_norm": 0.6095231771469116, "learning_rate": 2.7651435019767384e-05, "loss": 0.1598, "num_input_tokens_seen": 68725888, "step": 31850 }, { "epoch": 5.196574225122349, "grad_norm": 0.1108047366142273, "learning_rate": 2.764435707548118e-05, "loss": 0.1038, "num_input_tokens_seen": 68736864, "step": 31855 }, { "epoch": 5.197389885807504, "grad_norm": 0.11246475577354431, "learning_rate": 2.7637278916829977e-05, "loss": 0.0156, "num_input_tokens_seen": 68747264, "step": 31860 }, { "epoch": 5.198205546492659, "grad_norm": 0.062301430851221085, "learning_rate": 2.7630200544387562e-05, "loss": 0.2893, "num_input_tokens_seen": 68757792, "step": 31865 }, { "epoch": 5.199021207177814, "grad_norm": 0.531463623046875, "learning_rate": 2.762312195872775e-05, "loss": 0.1987, "num_input_tokens_seen": 68769440, "step": 31870 }, { "epoch": 5.199836867862969, "grad_norm": 2.7355659008026123, "learning_rate": 2.761604316042436e-05, "loss": 0.1167, "num_input_tokens_seen": 68780224, "step": 31875 }, { "epoch": 5.200652528548124, "grad_norm": 6.167346954345703, "learning_rate": 2.760896415005123e-05, "loss": 0.0425, "num_input_tokens_seen": 68791008, "step": 31880 }, { "epoch": 5.201468189233279, "grad_norm": 0.10047260671854019, "learning_rate": 2.7601884928182238e-05, "loss": 0.0165, "num_input_tokens_seen": 68801088, "step": 31885 }, { "epoch": 5.202283849918434, "grad_norm": 0.17573612928390503, "learning_rate": 2.759480549539125e-05, "loss": 0.0088, "num_input_tokens_seen": 68812128, "step": 31890 }, { "epoch": 5.203099510603589, "grad_norm": 2.3614935874938965, "learning_rate": 2.758772585225216e-05, "loss": 0.026, "num_input_tokens_seen": 68822688, "step": 31895 }, { "epoch": 5.2039151712887435, "grad_norm": 30.16387939453125, "learning_rate": 2.7580645999338885e-05, "loss": 0.2327, "num_input_tokens_seen": 68832896, "step": 31900 }, { "epoch": 5.204730831973899, "grad_norm": 0.14647795259952545, "learning_rate": 2.757356593722534e-05, "loss": 0.0248, "num_input_tokens_seen": 68844224, "step": 31905 }, { "epoch": 5.205546492659054, "grad_norm": 0.11921296268701553, "learning_rate": 2.7566485666485496e-05, "loss": 0.1069, "num_input_tokens_seen": 68856512, "step": 31910 }, { "epoch": 5.206362153344209, "grad_norm": 0.13035118579864502, "learning_rate": 2.75594051876933e-05, "loss": 0.0794, "num_input_tokens_seen": 68867552, "step": 31915 }, { "epoch": 5.207177814029364, "grad_norm": 0.6677807569503784, "learning_rate": 2.755232450142272e-05, "loss": 0.0437, "num_input_tokens_seen": 68877536, "step": 31920 }, { "epoch": 5.2079934747145185, "grad_norm": 5.915046215057373, "learning_rate": 2.754524360824778e-05, "loss": 0.3672, "num_input_tokens_seen": 68887360, "step": 31925 }, { "epoch": 5.208809135399674, "grad_norm": 0.9929408431053162, "learning_rate": 2.7538162508742472e-05, "loss": 0.1934, "num_input_tokens_seen": 68899328, "step": 31930 }, { "epoch": 5.209624796084829, "grad_norm": 0.0498029850423336, "learning_rate": 2.7531081203480834e-05, "loss": 0.023, "num_input_tokens_seen": 68909888, "step": 31935 }, { "epoch": 5.210440456769984, "grad_norm": 12.45026683807373, "learning_rate": 2.7523999693036916e-05, "loss": 0.189, "num_input_tokens_seen": 68920416, "step": 31940 }, { "epoch": 5.211256117455139, "grad_norm": 0.17802025377750397, "learning_rate": 2.7516917977984773e-05, "loss": 0.0187, "num_input_tokens_seen": 68931936, "step": 31945 }, { "epoch": 5.212071778140293, "grad_norm": 12.071422576904297, "learning_rate": 2.7509836058898487e-05, "loss": 0.2195, "num_input_tokens_seen": 68942912, "step": 31950 }, { "epoch": 5.212887438825448, "grad_norm": 0.10195551812648773, "learning_rate": 2.750275393635215e-05, "loss": 0.314, "num_input_tokens_seen": 68954400, "step": 31955 }, { "epoch": 5.213703099510604, "grad_norm": 4.505454063415527, "learning_rate": 2.7495671610919886e-05, "loss": 0.1015, "num_input_tokens_seen": 68964032, "step": 31960 }, { "epoch": 5.214518760195759, "grad_norm": 0.11572358757257462, "learning_rate": 2.748858908317582e-05, "loss": 0.0083, "num_input_tokens_seen": 68974944, "step": 31965 }, { "epoch": 5.215334420880914, "grad_norm": 0.1790105402469635, "learning_rate": 2.7481506353694092e-05, "loss": 0.021, "num_input_tokens_seen": 68985856, "step": 31970 }, { "epoch": 5.216150081566068, "grad_norm": 0.15279224514961243, "learning_rate": 2.7474423423048873e-05, "loss": 0.0088, "num_input_tokens_seen": 68996768, "step": 31975 }, { "epoch": 5.216965742251223, "grad_norm": 0.16828513145446777, "learning_rate": 2.746734029181433e-05, "loss": 0.0227, "num_input_tokens_seen": 69008064, "step": 31980 }, { "epoch": 5.217781402936378, "grad_norm": 0.07797053456306458, "learning_rate": 2.7460256960564668e-05, "loss": 0.1128, "num_input_tokens_seen": 69018240, "step": 31985 }, { "epoch": 5.218597063621534, "grad_norm": 0.19876162707805634, "learning_rate": 2.7453173429874096e-05, "loss": 0.1801, "num_input_tokens_seen": 69028128, "step": 31990 }, { "epoch": 5.219412724306689, "grad_norm": 0.5721792578697205, "learning_rate": 2.744608970031683e-05, "loss": 0.0687, "num_input_tokens_seen": 69038720, "step": 31995 }, { "epoch": 5.220228384991843, "grad_norm": 0.0984506830573082, "learning_rate": 2.7439005772467126e-05, "loss": 0.004, "num_input_tokens_seen": 69050368, "step": 32000 }, { "epoch": 5.221044045676998, "grad_norm": 0.04918161779642105, "learning_rate": 2.743192164689924e-05, "loss": 0.0043, "num_input_tokens_seen": 69060672, "step": 32005 }, { "epoch": 5.221859706362153, "grad_norm": 0.16197675466537476, "learning_rate": 2.742483732418744e-05, "loss": 0.2101, "num_input_tokens_seen": 69072480, "step": 32010 }, { "epoch": 5.222675367047309, "grad_norm": 0.11991266161203384, "learning_rate": 2.7417752804906027e-05, "loss": 0.0746, "num_input_tokens_seen": 69083232, "step": 32015 }, { "epoch": 5.2234910277324635, "grad_norm": 0.42736032605171204, "learning_rate": 2.7410668089629304e-05, "loss": 0.008, "num_input_tokens_seen": 69094912, "step": 32020 }, { "epoch": 5.224306688417618, "grad_norm": 0.07446371018886566, "learning_rate": 2.7403583178931597e-05, "loss": 0.0042, "num_input_tokens_seen": 69106432, "step": 32025 }, { "epoch": 5.225122349102773, "grad_norm": 7.186981201171875, "learning_rate": 2.7396498073387245e-05, "loss": 0.0339, "num_input_tokens_seen": 69117184, "step": 32030 }, { "epoch": 5.225938009787928, "grad_norm": 0.08522886037826538, "learning_rate": 2.7389412773570595e-05, "loss": 0.0258, "num_input_tokens_seen": 69127520, "step": 32035 }, { "epoch": 5.226753670473083, "grad_norm": 0.02839471399784088, "learning_rate": 2.738232728005602e-05, "loss": 0.0092, "num_input_tokens_seen": 69139104, "step": 32040 }, { "epoch": 5.2275693311582385, "grad_norm": 3.8303990364074707, "learning_rate": 2.737524159341791e-05, "loss": 0.114, "num_input_tokens_seen": 69149312, "step": 32045 }, { "epoch": 5.228384991843393, "grad_norm": 0.08804535120725632, "learning_rate": 2.7368155714230663e-05, "loss": 0.1289, "num_input_tokens_seen": 69160576, "step": 32050 }, { "epoch": 5.229200652528548, "grad_norm": 0.2264700084924698, "learning_rate": 2.7361069643068698e-05, "loss": 0.0056, "num_input_tokens_seen": 69171360, "step": 32055 }, { "epoch": 5.230016313213703, "grad_norm": 0.2846857011318207, "learning_rate": 2.7353983380506444e-05, "loss": 0.3607, "num_input_tokens_seen": 69183360, "step": 32060 }, { "epoch": 5.230831973898858, "grad_norm": 0.09883857518434525, "learning_rate": 2.734689692711836e-05, "loss": 0.1259, "num_input_tokens_seen": 69195392, "step": 32065 }, { "epoch": 5.231647634584013, "grad_norm": 0.09415154159069061, "learning_rate": 2.73398102834789e-05, "loss": 0.0037, "num_input_tokens_seen": 69206816, "step": 32070 }, { "epoch": 5.232463295269168, "grad_norm": 0.13084068894386292, "learning_rate": 2.7332723450162544e-05, "loss": 0.0041, "num_input_tokens_seen": 69217248, "step": 32075 }, { "epoch": 5.233278955954323, "grad_norm": 4.655344009399414, "learning_rate": 2.7325636427743788e-05, "loss": 0.2244, "num_input_tokens_seen": 69228480, "step": 32080 }, { "epoch": 5.234094616639478, "grad_norm": 10.137812614440918, "learning_rate": 2.731854921679715e-05, "loss": 0.2596, "num_input_tokens_seen": 69240832, "step": 32085 }, { "epoch": 5.234910277324633, "grad_norm": 0.17076519131660461, "learning_rate": 2.7311461817897143e-05, "loss": 0.0161, "num_input_tokens_seen": 69250976, "step": 32090 }, { "epoch": 5.235725938009788, "grad_norm": 0.07735753059387207, "learning_rate": 2.7304374231618318e-05, "loss": 0.0434, "num_input_tokens_seen": 69261184, "step": 32095 }, { "epoch": 5.236541598694943, "grad_norm": 0.6737622618675232, "learning_rate": 2.729728645853522e-05, "loss": 0.2213, "num_input_tokens_seen": 69273024, "step": 32100 }, { "epoch": 5.237357259380098, "grad_norm": 0.08265198767185211, "learning_rate": 2.729019849922243e-05, "loss": 0.0046, "num_input_tokens_seen": 69285280, "step": 32105 }, { "epoch": 5.238172920065253, "grad_norm": 0.08026156574487686, "learning_rate": 2.7283110354254526e-05, "loss": 0.0283, "num_input_tokens_seen": 69295680, "step": 32110 }, { "epoch": 5.238988580750408, "grad_norm": 2.73895001411438, "learning_rate": 2.727602202420611e-05, "loss": 0.1615, "num_input_tokens_seen": 69305888, "step": 32115 }, { "epoch": 5.239804241435563, "grad_norm": 3.35025954246521, "learning_rate": 2.7268933509651806e-05, "loss": 0.0164, "num_input_tokens_seen": 69317856, "step": 32120 }, { "epoch": 5.240619902120717, "grad_norm": 13.326385498046875, "learning_rate": 2.7261844811166236e-05, "loss": 0.2848, "num_input_tokens_seen": 69329088, "step": 32125 }, { "epoch": 5.241435562805873, "grad_norm": 2.5656604766845703, "learning_rate": 2.725475592932405e-05, "loss": 0.0907, "num_input_tokens_seen": 69340544, "step": 32130 }, { "epoch": 5.242251223491028, "grad_norm": 2.8082115650177, "learning_rate": 2.724766686469991e-05, "loss": 0.2859, "num_input_tokens_seen": 69351104, "step": 32135 }, { "epoch": 5.243066884176183, "grad_norm": 0.03363499417901039, "learning_rate": 2.724057761786849e-05, "loss": 0.1132, "num_input_tokens_seen": 69362848, "step": 32140 }, { "epoch": 5.2438825448613375, "grad_norm": 2.582084894180298, "learning_rate": 2.7233488189404478e-05, "loss": 0.1285, "num_input_tokens_seen": 69373504, "step": 32145 }, { "epoch": 5.244698205546492, "grad_norm": 0.06234988570213318, "learning_rate": 2.7226398579882573e-05, "loss": 0.008, "num_input_tokens_seen": 69384256, "step": 32150 }, { "epoch": 5.245513866231648, "grad_norm": 0.13269774615764618, "learning_rate": 2.7219308789877513e-05, "loss": 0.1043, "num_input_tokens_seen": 69394880, "step": 32155 }, { "epoch": 5.246329526916803, "grad_norm": 0.15573489665985107, "learning_rate": 2.7212218819964013e-05, "loss": 0.1152, "num_input_tokens_seen": 69405120, "step": 32160 }, { "epoch": 5.247145187601958, "grad_norm": 0.040256571024656296, "learning_rate": 2.720512867071684e-05, "loss": 0.0387, "num_input_tokens_seen": 69416640, "step": 32165 }, { "epoch": 5.2479608482871125, "grad_norm": 1.0483318567276, "learning_rate": 2.719803834271074e-05, "loss": 0.2663, "num_input_tokens_seen": 69426912, "step": 32170 }, { "epoch": 5.248776508972267, "grad_norm": 0.050562601536512375, "learning_rate": 2.7190947836520502e-05, "loss": 0.1559, "num_input_tokens_seen": 69436512, "step": 32175 }, { "epoch": 5.249592169657422, "grad_norm": 0.1760358065366745, "learning_rate": 2.718385715272092e-05, "loss": 0.4186, "num_input_tokens_seen": 69447424, "step": 32180 }, { "epoch": 5.250407830342578, "grad_norm": 6.044267654418945, "learning_rate": 2.7176766291886792e-05, "loss": 0.1837, "num_input_tokens_seen": 69459136, "step": 32185 }, { "epoch": 5.251223491027733, "grad_norm": 12.450900077819824, "learning_rate": 2.7169675254592947e-05, "loss": 0.1447, "num_input_tokens_seen": 69470048, "step": 32190 }, { "epoch": 5.2520391517128875, "grad_norm": 5.102044582366943, "learning_rate": 2.716258404141421e-05, "loss": 0.0608, "num_input_tokens_seen": 69480992, "step": 32195 }, { "epoch": 5.252854812398042, "grad_norm": 21.830202102661133, "learning_rate": 2.7155492652925446e-05, "loss": 0.0556, "num_input_tokens_seen": 69491936, "step": 32200 }, { "epoch": 5.253670473083197, "grad_norm": 2.6970579624176025, "learning_rate": 2.714840108970151e-05, "loss": 0.2538, "num_input_tokens_seen": 69500896, "step": 32205 }, { "epoch": 5.254486133768353, "grad_norm": 8.061768531799316, "learning_rate": 2.7141309352317278e-05, "loss": 0.0383, "num_input_tokens_seen": 69509888, "step": 32210 }, { "epoch": 5.255301794453508, "grad_norm": 0.07931341230869293, "learning_rate": 2.7134217441347647e-05, "loss": 0.0095, "num_input_tokens_seen": 69519840, "step": 32215 }, { "epoch": 5.2561174551386625, "grad_norm": 11.895001411437988, "learning_rate": 2.7127125357367515e-05, "loss": 0.0714, "num_input_tokens_seen": 69530400, "step": 32220 }, { "epoch": 5.256933115823817, "grad_norm": 0.06481499969959259, "learning_rate": 2.7120033100951814e-05, "loss": 0.0065, "num_input_tokens_seen": 69540992, "step": 32225 }, { "epoch": 5.257748776508972, "grad_norm": 0.3081865608692169, "learning_rate": 2.7112940672675473e-05, "loss": 0.0856, "num_input_tokens_seen": 69552704, "step": 32230 }, { "epoch": 5.258564437194127, "grad_norm": 1.779091477394104, "learning_rate": 2.7105848073113433e-05, "loss": 0.0184, "num_input_tokens_seen": 69563648, "step": 32235 }, { "epoch": 5.259380097879283, "grad_norm": 3.2973453998565674, "learning_rate": 2.709875530284067e-05, "loss": 0.0693, "num_input_tokens_seen": 69574880, "step": 32240 }, { "epoch": 5.260195758564437, "grad_norm": 6.793567180633545, "learning_rate": 2.7091662362432153e-05, "loss": 0.2804, "num_input_tokens_seen": 69586496, "step": 32245 }, { "epoch": 5.261011419249592, "grad_norm": 1.9933561086654663, "learning_rate": 2.7084569252462873e-05, "loss": 0.0954, "num_input_tokens_seen": 69596704, "step": 32250 }, { "epoch": 5.261827079934747, "grad_norm": 4.946976661682129, "learning_rate": 2.7077475973507832e-05, "loss": 0.235, "num_input_tokens_seen": 69606912, "step": 32255 }, { "epoch": 5.262642740619902, "grad_norm": 6.156367778778076, "learning_rate": 2.7070382526142045e-05, "loss": 0.3343, "num_input_tokens_seen": 69618880, "step": 32260 }, { "epoch": 5.263458401305057, "grad_norm": 2.610217809677124, "learning_rate": 2.706328891094055e-05, "loss": 0.075, "num_input_tokens_seen": 69629440, "step": 32265 }, { "epoch": 5.264274061990212, "grad_norm": 0.1646733283996582, "learning_rate": 2.7056195128478384e-05, "loss": 0.0419, "num_input_tokens_seen": 69640128, "step": 32270 }, { "epoch": 5.265089722675367, "grad_norm": 1.4181983470916748, "learning_rate": 2.7049101179330605e-05, "loss": 0.0451, "num_input_tokens_seen": 69650816, "step": 32275 }, { "epoch": 5.265905383360522, "grad_norm": 0.19968171417713165, "learning_rate": 2.7042007064072288e-05, "loss": 0.1225, "num_input_tokens_seen": 69661376, "step": 32280 }, { "epoch": 5.266721044045677, "grad_norm": 3.8542120456695557, "learning_rate": 2.703491278327852e-05, "loss": 0.0796, "num_input_tokens_seen": 69671200, "step": 32285 }, { "epoch": 5.267536704730832, "grad_norm": 2.813113212585449, "learning_rate": 2.7027818337524396e-05, "loss": 0.182, "num_input_tokens_seen": 69682048, "step": 32290 }, { "epoch": 5.268352365415987, "grad_norm": 0.17513194680213928, "learning_rate": 2.7020723727385027e-05, "loss": 0.0331, "num_input_tokens_seen": 69694208, "step": 32295 }, { "epoch": 5.269168026101142, "grad_norm": 0.10825838148593903, "learning_rate": 2.7013628953435544e-05, "loss": 0.0772, "num_input_tokens_seen": 69702848, "step": 32300 }, { "epoch": 5.269983686786297, "grad_norm": 0.9168631434440613, "learning_rate": 2.7006534016251072e-05, "loss": 0.0105, "num_input_tokens_seen": 69714080, "step": 32305 }, { "epoch": 5.270799347471452, "grad_norm": 0.14680083096027374, "learning_rate": 2.6999438916406777e-05, "loss": 0.0493, "num_input_tokens_seen": 69725408, "step": 32310 }, { "epoch": 5.271615008156607, "grad_norm": 0.29254454374313354, "learning_rate": 2.6992343654477825e-05, "loss": 0.008, "num_input_tokens_seen": 69736704, "step": 32315 }, { "epoch": 5.2724306688417615, "grad_norm": 0.31529340147972107, "learning_rate": 2.6985248231039378e-05, "loss": 0.0111, "num_input_tokens_seen": 69747264, "step": 32320 }, { "epoch": 5.273246329526917, "grad_norm": 0.7765026688575745, "learning_rate": 2.6978152646666644e-05, "loss": 0.0907, "num_input_tokens_seen": 69758592, "step": 32325 }, { "epoch": 5.274061990212072, "grad_norm": 0.2740999460220337, "learning_rate": 2.697105690193481e-05, "loss": 0.0996, "num_input_tokens_seen": 69769088, "step": 32330 }, { "epoch": 5.274877650897227, "grad_norm": 0.20411568880081177, "learning_rate": 2.696396099741911e-05, "loss": 0.0103, "num_input_tokens_seen": 69780064, "step": 32335 }, { "epoch": 5.275693311582382, "grad_norm": 0.0828116163611412, "learning_rate": 2.695686493369476e-05, "loss": 0.1759, "num_input_tokens_seen": 69792064, "step": 32340 }, { "epoch": 5.2765089722675365, "grad_norm": 0.8827752470970154, "learning_rate": 2.6949768711337015e-05, "loss": 0.1659, "num_input_tokens_seen": 69802304, "step": 32345 }, { "epoch": 5.277324632952691, "grad_norm": 0.10121121257543564, "learning_rate": 2.6942672330921124e-05, "loss": 0.0183, "num_input_tokens_seen": 69811104, "step": 32350 }, { "epoch": 5.278140293637847, "grad_norm": 0.07794969528913498, "learning_rate": 2.693557579302236e-05, "loss": 0.0108, "num_input_tokens_seen": 69823072, "step": 32355 }, { "epoch": 5.278955954323002, "grad_norm": 1.5654137134552002, "learning_rate": 2.6928479098216e-05, "loss": 0.076, "num_input_tokens_seen": 69835104, "step": 32360 }, { "epoch": 5.279771615008157, "grad_norm": 0.2712656557559967, "learning_rate": 2.6921382247077336e-05, "loss": 0.0235, "num_input_tokens_seen": 69846784, "step": 32365 }, { "epoch": 5.280587275693311, "grad_norm": 0.18609561026096344, "learning_rate": 2.691428524018168e-05, "loss": 0.0113, "num_input_tokens_seen": 69857024, "step": 32370 }, { "epoch": 5.281402936378466, "grad_norm": 0.08994683623313904, "learning_rate": 2.6907188078104352e-05, "loss": 0.0045, "num_input_tokens_seen": 69866624, "step": 32375 }, { "epoch": 5.282218597063622, "grad_norm": 0.39006930589675903, "learning_rate": 2.690009076142067e-05, "loss": 0.0389, "num_input_tokens_seen": 69878816, "step": 32380 }, { "epoch": 5.283034257748777, "grad_norm": 0.05270082503557205, "learning_rate": 2.6892993290706e-05, "loss": 0.0967, "num_input_tokens_seen": 69889184, "step": 32385 }, { "epoch": 5.283849918433932, "grad_norm": 0.22735679149627686, "learning_rate": 2.6885895666535684e-05, "loss": 0.1762, "num_input_tokens_seen": 69900160, "step": 32390 }, { "epoch": 5.284665579119086, "grad_norm": 0.024947430938482285, "learning_rate": 2.6878797889485096e-05, "loss": 0.1381, "num_input_tokens_seen": 69910848, "step": 32395 }, { "epoch": 5.285481239804241, "grad_norm": 0.04396042972803116, "learning_rate": 2.687169996012962e-05, "loss": 0.0053, "num_input_tokens_seen": 69921856, "step": 32400 }, { "epoch": 5.286296900489396, "grad_norm": 0.05487280711531639, "learning_rate": 2.6864601879044653e-05, "loss": 0.0675, "num_input_tokens_seen": 69932768, "step": 32405 }, { "epoch": 5.287112561174552, "grad_norm": 2.912031888961792, "learning_rate": 2.6857503646805593e-05, "loss": 0.1006, "num_input_tokens_seen": 69943488, "step": 32410 }, { "epoch": 5.287928221859707, "grad_norm": 9.795567512512207, "learning_rate": 2.6850405263987867e-05, "loss": 0.101, "num_input_tokens_seen": 69953312, "step": 32415 }, { "epoch": 5.288743882544861, "grad_norm": 4.187163352966309, "learning_rate": 2.6843306731166894e-05, "loss": 0.0863, "num_input_tokens_seen": 69963072, "step": 32420 }, { "epoch": 5.289559543230016, "grad_norm": 0.03776213154196739, "learning_rate": 2.6836208048918132e-05, "loss": 0.1004, "num_input_tokens_seen": 69973600, "step": 32425 }, { "epoch": 5.290375203915171, "grad_norm": 0.17811918258666992, "learning_rate": 2.682910921781702e-05, "loss": 0.0504, "num_input_tokens_seen": 69984384, "step": 32430 }, { "epoch": 5.291190864600326, "grad_norm": 0.16918852925300598, "learning_rate": 2.682201023843904e-05, "loss": 0.0149, "num_input_tokens_seen": 69995616, "step": 32435 }, { "epoch": 5.2920065252854815, "grad_norm": 2.7108592987060547, "learning_rate": 2.6814911111359665e-05, "loss": 0.0701, "num_input_tokens_seen": 70006432, "step": 32440 }, { "epoch": 5.292822185970636, "grad_norm": 3.337555408477783, "learning_rate": 2.6807811837154383e-05, "loss": 0.2409, "num_input_tokens_seen": 70017120, "step": 32445 }, { "epoch": 5.293637846655791, "grad_norm": 0.2774198651313782, "learning_rate": 2.6800712416398705e-05, "loss": 0.1345, "num_input_tokens_seen": 70027744, "step": 32450 }, { "epoch": 5.294453507340946, "grad_norm": 4.527461051940918, "learning_rate": 2.6793612849668138e-05, "loss": 0.0884, "num_input_tokens_seen": 70039360, "step": 32455 }, { "epoch": 5.295269168026101, "grad_norm": 0.035836637020111084, "learning_rate": 2.6786513137538216e-05, "loss": 0.249, "num_input_tokens_seen": 70049952, "step": 32460 }, { "epoch": 5.2960848287112565, "grad_norm": 0.10114829242229462, "learning_rate": 2.677941328058447e-05, "loss": 0.1256, "num_input_tokens_seen": 70059904, "step": 32465 }, { "epoch": 5.296900489396411, "grad_norm": 0.07702568918466568, "learning_rate": 2.677231327938246e-05, "loss": 0.1912, "num_input_tokens_seen": 70070464, "step": 32470 }, { "epoch": 5.297716150081566, "grad_norm": 1.3145924806594849, "learning_rate": 2.676521313450774e-05, "loss": 0.0132, "num_input_tokens_seen": 70081472, "step": 32475 }, { "epoch": 5.298531810766721, "grad_norm": 0.15544812381267548, "learning_rate": 2.6758112846535888e-05, "loss": 0.0579, "num_input_tokens_seen": 70091360, "step": 32480 }, { "epoch": 5.299347471451876, "grad_norm": 0.7376506328582764, "learning_rate": 2.6751012416042487e-05, "loss": 0.1002, "num_input_tokens_seen": 70101088, "step": 32485 }, { "epoch": 5.300163132137031, "grad_norm": 0.09198694676160812, "learning_rate": 2.674391184360313e-05, "loss": 0.039, "num_input_tokens_seen": 70112608, "step": 32490 }, { "epoch": 5.300978792822186, "grad_norm": 0.05273007974028587, "learning_rate": 2.6736811129793438e-05, "loss": 0.0071, "num_input_tokens_seen": 70124128, "step": 32495 }, { "epoch": 5.301794453507341, "grad_norm": 3.622174024581909, "learning_rate": 2.6729710275189024e-05, "loss": 0.2176, "num_input_tokens_seen": 70136736, "step": 32500 }, { "epoch": 5.302610114192496, "grad_norm": 0.08061046898365021, "learning_rate": 2.672260928036552e-05, "loss": 0.064, "num_input_tokens_seen": 70148768, "step": 32505 }, { "epoch": 5.303425774877651, "grad_norm": 3.023188352584839, "learning_rate": 2.671550814589856e-05, "loss": 0.1831, "num_input_tokens_seen": 70158592, "step": 32510 }, { "epoch": 5.304241435562806, "grad_norm": 0.07232240587472916, "learning_rate": 2.6708406872363813e-05, "loss": 0.0991, "num_input_tokens_seen": 70169856, "step": 32515 }, { "epoch": 5.30505709624796, "grad_norm": 0.1896428018808365, "learning_rate": 2.670130546033693e-05, "loss": 0.0068, "num_input_tokens_seen": 70179680, "step": 32520 }, { "epoch": 5.305872756933116, "grad_norm": 0.2053537368774414, "learning_rate": 2.6694203910393594e-05, "loss": 0.1245, "num_input_tokens_seen": 70191104, "step": 32525 }, { "epoch": 5.306688417618271, "grad_norm": 0.10377363860607147, "learning_rate": 2.66871022231095e-05, "loss": 0.1242, "num_input_tokens_seen": 70201728, "step": 32530 }, { "epoch": 5.307504078303426, "grad_norm": 0.12116867303848267, "learning_rate": 2.6680000399060327e-05, "loss": 0.0038, "num_input_tokens_seen": 70213088, "step": 32535 }, { "epoch": 5.308319738988581, "grad_norm": 0.14606209099292755, "learning_rate": 2.6672898438821808e-05, "loss": 0.0074, "num_input_tokens_seen": 70223648, "step": 32540 }, { "epoch": 5.309135399673735, "grad_norm": 0.9420191049575806, "learning_rate": 2.666579634296965e-05, "loss": 0.0239, "num_input_tokens_seen": 70234272, "step": 32545 }, { "epoch": 5.309951060358891, "grad_norm": 0.267930805683136, "learning_rate": 2.6658694112079586e-05, "loss": 0.1197, "num_input_tokens_seen": 70244800, "step": 32550 }, { "epoch": 5.310766721044046, "grad_norm": 0.0489763505756855, "learning_rate": 2.6651591746727363e-05, "loss": 0.1675, "num_input_tokens_seen": 70255360, "step": 32555 }, { "epoch": 5.311582381729201, "grad_norm": 0.06418640166521072, "learning_rate": 2.6644489247488735e-05, "loss": 0.022, "num_input_tokens_seen": 70265888, "step": 32560 }, { "epoch": 5.3123980424143555, "grad_norm": 0.09622830152511597, "learning_rate": 2.6637386614939464e-05, "loss": 0.1332, "num_input_tokens_seen": 70276544, "step": 32565 }, { "epoch": 5.31321370309951, "grad_norm": 0.08779608458280563, "learning_rate": 2.6630283849655326e-05, "loss": 0.0889, "num_input_tokens_seen": 70287904, "step": 32570 }, { "epoch": 5.314029363784665, "grad_norm": 0.14219705760478973, "learning_rate": 2.6623180952212106e-05, "loss": 0.0728, "num_input_tokens_seen": 70299872, "step": 32575 }, { "epoch": 5.314845024469821, "grad_norm": 0.05864930897951126, "learning_rate": 2.66160779231856e-05, "loss": 0.1205, "num_input_tokens_seen": 70310624, "step": 32580 }, { "epoch": 5.315660685154976, "grad_norm": 0.11006398499011993, "learning_rate": 2.660897476315162e-05, "loss": 0.0054, "num_input_tokens_seen": 70321312, "step": 32585 }, { "epoch": 5.3164763458401305, "grad_norm": 0.12669697403907776, "learning_rate": 2.6601871472685985e-05, "loss": 0.0715, "num_input_tokens_seen": 70332384, "step": 32590 }, { "epoch": 5.317292006525285, "grad_norm": 1.7560433149337769, "learning_rate": 2.659476805236451e-05, "loss": 0.0444, "num_input_tokens_seen": 70343232, "step": 32595 }, { "epoch": 5.31810766721044, "grad_norm": 3.5992536544799805, "learning_rate": 2.6587664502763054e-05, "loss": 0.4086, "num_input_tokens_seen": 70355104, "step": 32600 }, { "epoch": 5.318923327895595, "grad_norm": 3.6785888671875, "learning_rate": 2.6580560824457457e-05, "loss": 0.2076, "num_input_tokens_seen": 70366400, "step": 32605 }, { "epoch": 5.319738988580751, "grad_norm": 0.10741819441318512, "learning_rate": 2.657345701802358e-05, "loss": 0.1342, "num_input_tokens_seen": 70378400, "step": 32610 }, { "epoch": 5.3205546492659055, "grad_norm": 0.1747598648071289, "learning_rate": 2.6566353084037295e-05, "loss": 0.0091, "num_input_tokens_seen": 70389952, "step": 32615 }, { "epoch": 5.32137030995106, "grad_norm": 0.151988685131073, "learning_rate": 2.6559249023074474e-05, "loss": 0.0898, "num_input_tokens_seen": 70401408, "step": 32620 }, { "epoch": 5.322185970636215, "grad_norm": 0.061704110354185104, "learning_rate": 2.6552144835711017e-05, "loss": 0.1951, "num_input_tokens_seen": 70412640, "step": 32625 }, { "epoch": 5.32300163132137, "grad_norm": 0.3974476456642151, "learning_rate": 2.6545040522522828e-05, "loss": 0.0756, "num_input_tokens_seen": 70422592, "step": 32630 }, { "epoch": 5.323817292006526, "grad_norm": 0.20766876637935638, "learning_rate": 2.653793608408582e-05, "loss": 0.1215, "num_input_tokens_seen": 70433856, "step": 32635 }, { "epoch": 5.3246329526916805, "grad_norm": 9.215160369873047, "learning_rate": 2.6530831520975903e-05, "loss": 0.163, "num_input_tokens_seen": 70445376, "step": 32640 }, { "epoch": 5.325448613376835, "grad_norm": 4.210348129272461, "learning_rate": 2.652372683376902e-05, "loss": 0.2986, "num_input_tokens_seen": 70456384, "step": 32645 }, { "epoch": 5.32626427406199, "grad_norm": 0.09094883501529694, "learning_rate": 2.65166220230411e-05, "loss": 0.1897, "num_input_tokens_seen": 70467040, "step": 32650 }, { "epoch": 5.327079934747145, "grad_norm": 0.2546117901802063, "learning_rate": 2.650951708936811e-05, "loss": 0.1962, "num_input_tokens_seen": 70477248, "step": 32655 }, { "epoch": 5.327895595432301, "grad_norm": 0.08630216121673584, "learning_rate": 2.6502412033326e-05, "loss": 0.1757, "num_input_tokens_seen": 70487648, "step": 32660 }, { "epoch": 5.328711256117455, "grad_norm": 6.063177108764648, "learning_rate": 2.6495306855490754e-05, "loss": 0.1292, "num_input_tokens_seen": 70499296, "step": 32665 }, { "epoch": 5.32952691680261, "grad_norm": 2.3382294178009033, "learning_rate": 2.6488201556438346e-05, "loss": 0.088, "num_input_tokens_seen": 70509664, "step": 32670 }, { "epoch": 5.330342577487765, "grad_norm": 0.6118956804275513, "learning_rate": 2.648109613674477e-05, "loss": 0.017, "num_input_tokens_seen": 70520064, "step": 32675 }, { "epoch": 5.33115823817292, "grad_norm": 2.301363945007324, "learning_rate": 2.647399059698602e-05, "loss": 0.1289, "num_input_tokens_seen": 70531104, "step": 32680 }, { "epoch": 5.331973898858075, "grad_norm": 0.10155824571847916, "learning_rate": 2.646688493773812e-05, "loss": 0.0855, "num_input_tokens_seen": 70542528, "step": 32685 }, { "epoch": 5.33278955954323, "grad_norm": 2.3470518589019775, "learning_rate": 2.6459779159577077e-05, "loss": 0.0781, "num_input_tokens_seen": 70553216, "step": 32690 }, { "epoch": 5.333605220228385, "grad_norm": 3.9175291061401367, "learning_rate": 2.645267326307893e-05, "loss": 0.2545, "num_input_tokens_seen": 70565504, "step": 32695 }, { "epoch": 5.33442088091354, "grad_norm": 0.26578038930892944, "learning_rate": 2.6445567248819726e-05, "loss": 0.112, "num_input_tokens_seen": 70577376, "step": 32700 }, { "epoch": 5.335236541598695, "grad_norm": 0.4504992961883545, "learning_rate": 2.643846111737549e-05, "loss": 0.1135, "num_input_tokens_seen": 70588960, "step": 32705 }, { "epoch": 5.33605220228385, "grad_norm": 0.229661762714386, "learning_rate": 2.643135486932231e-05, "loss": 0.0112, "num_input_tokens_seen": 70600192, "step": 32710 }, { "epoch": 5.3368678629690045, "grad_norm": 0.12986430525779724, "learning_rate": 2.642424850523624e-05, "loss": 0.1258, "num_input_tokens_seen": 70610816, "step": 32715 }, { "epoch": 5.33768352365416, "grad_norm": 3.3342740535736084, "learning_rate": 2.641714202569336e-05, "loss": 0.3, "num_input_tokens_seen": 70621280, "step": 32720 }, { "epoch": 5.338499184339315, "grad_norm": 0.16055022180080414, "learning_rate": 2.6410035431269754e-05, "loss": 0.14, "num_input_tokens_seen": 70631968, "step": 32725 }, { "epoch": 5.33931484502447, "grad_norm": 0.05201416462659836, "learning_rate": 2.6402928722541524e-05, "loss": 0.0999, "num_input_tokens_seen": 70643296, "step": 32730 }, { "epoch": 5.340130505709625, "grad_norm": 0.1715608537197113, "learning_rate": 2.6395821900084772e-05, "loss": 0.0128, "num_input_tokens_seen": 70652576, "step": 32735 }, { "epoch": 5.3409461663947795, "grad_norm": 0.15612873435020447, "learning_rate": 2.638871496447562e-05, "loss": 0.0635, "num_input_tokens_seen": 70663424, "step": 32740 }, { "epoch": 5.341761827079935, "grad_norm": 3.142871856689453, "learning_rate": 2.638160791629018e-05, "loss": 0.0558, "num_input_tokens_seen": 70673536, "step": 32745 }, { "epoch": 5.34257748776509, "grad_norm": 1.802825689315796, "learning_rate": 2.6374500756104594e-05, "loss": 0.0178, "num_input_tokens_seen": 70683712, "step": 32750 }, { "epoch": 5.343393148450245, "grad_norm": 2.878145933151245, "learning_rate": 2.6367393484494994e-05, "loss": 0.1787, "num_input_tokens_seen": 70693856, "step": 32755 }, { "epoch": 5.3442088091354, "grad_norm": 0.1917629837989807, "learning_rate": 2.636028610203755e-05, "loss": 0.0087, "num_input_tokens_seen": 70705024, "step": 32760 }, { "epoch": 5.3450244698205545, "grad_norm": 0.16410884261131287, "learning_rate": 2.635317860930841e-05, "loss": 0.0105, "num_input_tokens_seen": 70715040, "step": 32765 }, { "epoch": 5.345840130505709, "grad_norm": 12.318873405456543, "learning_rate": 2.6346071006883748e-05, "loss": 0.0714, "num_input_tokens_seen": 70726080, "step": 32770 }, { "epoch": 5.346655791190865, "grad_norm": 0.12560924887657166, "learning_rate": 2.6338963295339737e-05, "loss": 0.0146, "num_input_tokens_seen": 70736800, "step": 32775 }, { "epoch": 5.34747145187602, "grad_norm": 19.746686935424805, "learning_rate": 2.633185547525257e-05, "loss": 0.0734, "num_input_tokens_seen": 70748928, "step": 32780 }, { "epoch": 5.348287112561175, "grad_norm": 0.23885926604270935, "learning_rate": 2.6324747547198443e-05, "loss": 0.0997, "num_input_tokens_seen": 70760736, "step": 32785 }, { "epoch": 5.349102773246329, "grad_norm": 2.257115364074707, "learning_rate": 2.631763951175355e-05, "loss": 0.1691, "num_input_tokens_seen": 70771552, "step": 32790 }, { "epoch": 5.349918433931484, "grad_norm": 1.4539097547531128, "learning_rate": 2.6310531369494118e-05, "loss": 0.0141, "num_input_tokens_seen": 70782656, "step": 32795 }, { "epoch": 5.350734094616639, "grad_norm": 13.407401084899902, "learning_rate": 2.630342312099637e-05, "loss": 0.1118, "num_input_tokens_seen": 70792736, "step": 32800 }, { "epoch": 5.351549755301795, "grad_norm": 0.21261943876743317, "learning_rate": 2.629631476683652e-05, "loss": 0.1466, "num_input_tokens_seen": 70804032, "step": 32805 }, { "epoch": 5.35236541598695, "grad_norm": 1.5478466749191284, "learning_rate": 2.6289206307590815e-05, "loss": 0.0089, "num_input_tokens_seen": 70814208, "step": 32810 }, { "epoch": 5.353181076672104, "grad_norm": 2.2880988121032715, "learning_rate": 2.6282097743835517e-05, "loss": 0.1866, "num_input_tokens_seen": 70825312, "step": 32815 }, { "epoch": 5.353996737357259, "grad_norm": 0.7886966466903687, "learning_rate": 2.627498907614686e-05, "loss": 0.1134, "num_input_tokens_seen": 70835680, "step": 32820 }, { "epoch": 5.354812398042414, "grad_norm": 0.09569641202688217, "learning_rate": 2.6267880305101127e-05, "loss": 0.0423, "num_input_tokens_seen": 70846336, "step": 32825 }, { "epoch": 5.35562805872757, "grad_norm": 2.4989962577819824, "learning_rate": 2.626077143127458e-05, "loss": 0.1074, "num_input_tokens_seen": 70858016, "step": 32830 }, { "epoch": 5.356443719412725, "grad_norm": 3.8092448711395264, "learning_rate": 2.6253662455243504e-05, "loss": 0.2957, "num_input_tokens_seen": 70868704, "step": 32835 }, { "epoch": 5.357259380097879, "grad_norm": 0.07583662122488022, "learning_rate": 2.6246553377584186e-05, "loss": 0.0413, "num_input_tokens_seen": 70879392, "step": 32840 }, { "epoch": 5.358075040783034, "grad_norm": 2.4972646236419678, "learning_rate": 2.623944419887293e-05, "loss": 0.1723, "num_input_tokens_seen": 70890848, "step": 32845 }, { "epoch": 5.358890701468189, "grad_norm": 0.3224732577800751, "learning_rate": 2.6232334919686035e-05, "loss": 0.0123, "num_input_tokens_seen": 70901472, "step": 32850 }, { "epoch": 5.359706362153344, "grad_norm": 10.250856399536133, "learning_rate": 2.6225225540599825e-05, "loss": 0.2318, "num_input_tokens_seen": 70911904, "step": 32855 }, { "epoch": 5.3605220228384995, "grad_norm": 0.19227977097034454, "learning_rate": 2.6218116062190605e-05, "loss": 0.0105, "num_input_tokens_seen": 70922880, "step": 32860 }, { "epoch": 5.361337683523654, "grad_norm": 0.1123371422290802, "learning_rate": 2.621100648503472e-05, "loss": 0.0705, "num_input_tokens_seen": 70933472, "step": 32865 }, { "epoch": 5.362153344208809, "grad_norm": 0.0657767653465271, "learning_rate": 2.6203896809708512e-05, "loss": 0.1193, "num_input_tokens_seen": 70944864, "step": 32870 }, { "epoch": 5.362969004893964, "grad_norm": 8.928384780883789, "learning_rate": 2.619678703678832e-05, "loss": 0.0632, "num_input_tokens_seen": 70956512, "step": 32875 }, { "epoch": 5.363784665579119, "grad_norm": 0.13426414132118225, "learning_rate": 2.618967716685049e-05, "loss": 0.0059, "num_input_tokens_seen": 70966400, "step": 32880 }, { "epoch": 5.364600326264274, "grad_norm": 13.635554313659668, "learning_rate": 2.6182567200471396e-05, "loss": 0.0216, "num_input_tokens_seen": 70977120, "step": 32885 }, { "epoch": 5.365415986949429, "grad_norm": 0.16418886184692383, "learning_rate": 2.6175457138227404e-05, "loss": 0.1141, "num_input_tokens_seen": 70987712, "step": 32890 }, { "epoch": 5.366231647634584, "grad_norm": 0.2703512907028198, "learning_rate": 2.6168346980694896e-05, "loss": 0.0088, "num_input_tokens_seen": 70998080, "step": 32895 }, { "epoch": 5.367047308319739, "grad_norm": 0.12325150519609451, "learning_rate": 2.6161236728450257e-05, "loss": 0.2325, "num_input_tokens_seen": 71009568, "step": 32900 }, { "epoch": 5.367862969004894, "grad_norm": 3.2493736743927, "learning_rate": 2.6154126382069866e-05, "loss": 0.2135, "num_input_tokens_seen": 71021728, "step": 32905 }, { "epoch": 5.368678629690049, "grad_norm": 0.23775611817836761, "learning_rate": 2.6147015942130143e-05, "loss": 0.0112, "num_input_tokens_seen": 71032704, "step": 32910 }, { "epoch": 5.369494290375204, "grad_norm": 0.13886013627052307, "learning_rate": 2.6139905409207475e-05, "loss": 0.0079, "num_input_tokens_seen": 71043808, "step": 32915 }, { "epoch": 5.370309951060359, "grad_norm": 0.10095523297786713, "learning_rate": 2.61327947838783e-05, "loss": 0.2003, "num_input_tokens_seen": 71054752, "step": 32920 }, { "epoch": 5.371125611745514, "grad_norm": 0.23826496303081512, "learning_rate": 2.6125684066719036e-05, "loss": 0.1355, "num_input_tokens_seen": 71065216, "step": 32925 }, { "epoch": 5.371941272430669, "grad_norm": 0.20730678737163544, "learning_rate": 2.6118573258306106e-05, "loss": 0.1139, "num_input_tokens_seen": 71074752, "step": 32930 }, { "epoch": 5.372756933115824, "grad_norm": 0.11309021711349487, "learning_rate": 2.6111462359215944e-05, "loss": 0.0195, "num_input_tokens_seen": 71084864, "step": 32935 }, { "epoch": 5.373572593800978, "grad_norm": 0.2552671730518341, "learning_rate": 2.6104351370025014e-05, "loss": 0.1077, "num_input_tokens_seen": 71095392, "step": 32940 }, { "epoch": 5.374388254486134, "grad_norm": 0.07608038187026978, "learning_rate": 2.6097240291309756e-05, "loss": 0.1251, "num_input_tokens_seen": 71105472, "step": 32945 }, { "epoch": 5.375203915171289, "grad_norm": 0.12142956256866455, "learning_rate": 2.6090129123646633e-05, "loss": 0.1885, "num_input_tokens_seen": 71116448, "step": 32950 }, { "epoch": 5.376019575856444, "grad_norm": 0.08452937006950378, "learning_rate": 2.6083017867612115e-05, "loss": 0.0086, "num_input_tokens_seen": 71128032, "step": 32955 }, { "epoch": 5.376835236541599, "grad_norm": 0.1397397518157959, "learning_rate": 2.6075906523782666e-05, "loss": 0.0198, "num_input_tokens_seen": 71140448, "step": 32960 }, { "epoch": 5.377650897226753, "grad_norm": 0.1688927561044693, "learning_rate": 2.6068795092734783e-05, "loss": 0.0051, "num_input_tokens_seen": 71150720, "step": 32965 }, { "epoch": 5.378466557911908, "grad_norm": 0.32195374369621277, "learning_rate": 2.6061683575044937e-05, "loss": 0.0985, "num_input_tokens_seen": 71161920, "step": 32970 }, { "epoch": 5.379282218597064, "grad_norm": 0.14487984776496887, "learning_rate": 2.605457197128964e-05, "loss": 0.0065, "num_input_tokens_seen": 71173888, "step": 32975 }, { "epoch": 5.380097879282219, "grad_norm": 0.05538102984428406, "learning_rate": 2.6047460282045388e-05, "loss": 0.0078, "num_input_tokens_seen": 71185984, "step": 32980 }, { "epoch": 5.3809135399673735, "grad_norm": 0.16724319756031036, "learning_rate": 2.604034850788869e-05, "loss": 0.0074, "num_input_tokens_seen": 71196640, "step": 32985 }, { "epoch": 5.381729200652528, "grad_norm": 0.04707736149430275, "learning_rate": 2.6033236649396063e-05, "loss": 0.0034, "num_input_tokens_seen": 71208000, "step": 32990 }, { "epoch": 5.382544861337683, "grad_norm": 0.18590906262397766, "learning_rate": 2.6026124707144033e-05, "loss": 0.0074, "num_input_tokens_seen": 71219616, "step": 32995 }, { "epoch": 5.383360522022839, "grad_norm": 0.07881730049848557, "learning_rate": 2.6019012681709127e-05, "loss": 0.1255, "num_input_tokens_seen": 71230880, "step": 33000 }, { "epoch": 5.384176182707994, "grad_norm": 3.265533685684204, "learning_rate": 2.601190057366788e-05, "loss": 0.248, "num_input_tokens_seen": 71240800, "step": 33005 }, { "epoch": 5.3849918433931485, "grad_norm": 0.11597271263599396, "learning_rate": 2.600478838359684e-05, "loss": 0.1397, "num_input_tokens_seen": 71250784, "step": 33010 }, { "epoch": 5.385807504078303, "grad_norm": 0.06675510108470917, "learning_rate": 2.5997676112072557e-05, "loss": 0.0938, "num_input_tokens_seen": 71260576, "step": 33015 }, { "epoch": 5.386623164763458, "grad_norm": 0.37705445289611816, "learning_rate": 2.5990563759671575e-05, "loss": 0.0073, "num_input_tokens_seen": 71270976, "step": 33020 }, { "epoch": 5.387438825448613, "grad_norm": 0.04514387622475624, "learning_rate": 2.598345132697048e-05, "loss": 0.082, "num_input_tokens_seen": 71281216, "step": 33025 }, { "epoch": 5.388254486133769, "grad_norm": 0.4911499321460724, "learning_rate": 2.597633881454583e-05, "loss": 0.2121, "num_input_tokens_seen": 71292064, "step": 33030 }, { "epoch": 5.3890701468189235, "grad_norm": 0.19001545011997223, "learning_rate": 2.5969226222974196e-05, "loss": 0.1146, "num_input_tokens_seen": 71303392, "step": 33035 }, { "epoch": 5.389885807504078, "grad_norm": 0.0863967165350914, "learning_rate": 2.5962113552832173e-05, "loss": 0.0243, "num_input_tokens_seen": 71314016, "step": 33040 }, { "epoch": 5.390701468189233, "grad_norm": 0.10328985005617142, "learning_rate": 2.5955000804696345e-05, "loss": 0.0575, "num_input_tokens_seen": 71326016, "step": 33045 }, { "epoch": 5.391517128874388, "grad_norm": 1.9994709491729736, "learning_rate": 2.5947887979143304e-05, "loss": 0.1321, "num_input_tokens_seen": 71335840, "step": 33050 }, { "epoch": 5.392332789559543, "grad_norm": 0.18004560470581055, "learning_rate": 2.594077507674965e-05, "loss": 0.0847, "num_input_tokens_seen": 71347072, "step": 33055 }, { "epoch": 5.3931484502446985, "grad_norm": 11.753279685974121, "learning_rate": 2.5933662098091997e-05, "loss": 0.069, "num_input_tokens_seen": 71358464, "step": 33060 }, { "epoch": 5.393964110929853, "grad_norm": 9.915629386901855, "learning_rate": 2.5926549043746962e-05, "loss": 0.0235, "num_input_tokens_seen": 71367776, "step": 33065 }, { "epoch": 5.394779771615008, "grad_norm": 0.18175454437732697, "learning_rate": 2.591943591429115e-05, "loss": 0.0044, "num_input_tokens_seen": 71378368, "step": 33070 }, { "epoch": 5.395595432300163, "grad_norm": 3.6783509254455566, "learning_rate": 2.5912322710301202e-05, "loss": 0.1407, "num_input_tokens_seen": 71389632, "step": 33075 }, { "epoch": 5.396411092985318, "grad_norm": 0.12951412796974182, "learning_rate": 2.590520943235375e-05, "loss": 0.0037, "num_input_tokens_seen": 71400064, "step": 33080 }, { "epoch": 5.397226753670473, "grad_norm": 0.09587570279836655, "learning_rate": 2.5898096081025424e-05, "loss": 0.0096, "num_input_tokens_seen": 71410784, "step": 33085 }, { "epoch": 5.398042414355628, "grad_norm": 5.226691246032715, "learning_rate": 2.589098265689287e-05, "loss": 0.227, "num_input_tokens_seen": 71421632, "step": 33090 }, { "epoch": 5.398858075040783, "grad_norm": 0.03244301676750183, "learning_rate": 2.5883869160532743e-05, "loss": 0.2446, "num_input_tokens_seen": 71432640, "step": 33095 }, { "epoch": 5.399673735725938, "grad_norm": 0.13412439823150635, "learning_rate": 2.58767555925217e-05, "loss": 0.0039, "num_input_tokens_seen": 71442816, "step": 33100 }, { "epoch": 5.400489396411093, "grad_norm": 0.16147126257419586, "learning_rate": 2.5869641953436402e-05, "loss": 0.0198, "num_input_tokens_seen": 71453568, "step": 33105 }, { "epoch": 5.401305057096248, "grad_norm": 0.09940747916698456, "learning_rate": 2.5862528243853513e-05, "loss": 0.0154, "num_input_tokens_seen": 71464448, "step": 33110 }, { "epoch": 5.402120717781403, "grad_norm": 0.026617012917995453, "learning_rate": 2.5855414464349707e-05, "loss": 0.1371, "num_input_tokens_seen": 71475264, "step": 33115 }, { "epoch": 5.402936378466558, "grad_norm": 0.08824633061885834, "learning_rate": 2.5848300615501663e-05, "loss": 0.0712, "num_input_tokens_seen": 71484704, "step": 33120 }, { "epoch": 5.403752039151713, "grad_norm": 10.036717414855957, "learning_rate": 2.5841186697886065e-05, "loss": 0.1291, "num_input_tokens_seen": 71494592, "step": 33125 }, { "epoch": 5.404567699836868, "grad_norm": 3.598909378051758, "learning_rate": 2.583407271207961e-05, "loss": 0.2294, "num_input_tokens_seen": 71505568, "step": 33130 }, { "epoch": 5.4053833605220225, "grad_norm": 0.037693921476602554, "learning_rate": 2.582695865865899e-05, "loss": 0.1229, "num_input_tokens_seen": 71515872, "step": 33135 }, { "epoch": 5.406199021207178, "grad_norm": 0.03884103149175644, "learning_rate": 2.5819844538200906e-05, "loss": 0.0429, "num_input_tokens_seen": 71526720, "step": 33140 }, { "epoch": 5.407014681892333, "grad_norm": 0.0533999465405941, "learning_rate": 2.5812730351282056e-05, "loss": 0.096, "num_input_tokens_seen": 71537856, "step": 33145 }, { "epoch": 5.407830342577488, "grad_norm": 0.11679085344076157, "learning_rate": 2.5805616098479167e-05, "loss": 0.0177, "num_input_tokens_seen": 71546624, "step": 33150 }, { "epoch": 5.408646003262643, "grad_norm": 0.03162815421819687, "learning_rate": 2.5798501780368944e-05, "loss": 0.201, "num_input_tokens_seen": 71556864, "step": 33155 }, { "epoch": 5.4094616639477975, "grad_norm": 1.8352751731872559, "learning_rate": 2.5791387397528123e-05, "loss": 0.0315, "num_input_tokens_seen": 71566848, "step": 33160 }, { "epoch": 5.410277324632952, "grad_norm": 2.4174234867095947, "learning_rate": 2.578427295053341e-05, "loss": 0.1758, "num_input_tokens_seen": 71576544, "step": 33165 }, { "epoch": 5.411092985318108, "grad_norm": 0.07857690751552582, "learning_rate": 2.5777158439961564e-05, "loss": 0.0328, "num_input_tokens_seen": 71586816, "step": 33170 }, { "epoch": 5.411908646003263, "grad_norm": 6.798092842102051, "learning_rate": 2.577004386638931e-05, "loss": 0.0316, "num_input_tokens_seen": 71596480, "step": 33175 }, { "epoch": 5.412724306688418, "grad_norm": 0.40470385551452637, "learning_rate": 2.576292923039339e-05, "loss": 0.0332, "num_input_tokens_seen": 71607968, "step": 33180 }, { "epoch": 5.4135399673735725, "grad_norm": 3.3327794075012207, "learning_rate": 2.5755814532550553e-05, "loss": 0.2206, "num_input_tokens_seen": 71618560, "step": 33185 }, { "epoch": 5.414355628058727, "grad_norm": 0.14963901042938232, "learning_rate": 2.574869977343756e-05, "loss": 0.0054, "num_input_tokens_seen": 71629952, "step": 33190 }, { "epoch": 5.415171288743883, "grad_norm": 0.42876407504081726, "learning_rate": 2.574158495363117e-05, "loss": 0.0304, "num_input_tokens_seen": 71639296, "step": 33195 }, { "epoch": 5.415986949429038, "grad_norm": 0.11662980914115906, "learning_rate": 2.5734470073708133e-05, "loss": 0.0899, "num_input_tokens_seen": 71649760, "step": 33200 }, { "epoch": 5.416802610114193, "grad_norm": 0.3094348907470703, "learning_rate": 2.572735513424523e-05, "loss": 0.0565, "num_input_tokens_seen": 71660800, "step": 33205 }, { "epoch": 5.417618270799347, "grad_norm": 0.10273489356040955, "learning_rate": 2.5720240135819223e-05, "loss": 0.0554, "num_input_tokens_seen": 71671104, "step": 33210 }, { "epoch": 5.418433931484502, "grad_norm": 3.2516162395477295, "learning_rate": 2.57131250790069e-05, "loss": 0.1653, "num_input_tokens_seen": 71681568, "step": 33215 }, { "epoch": 5.419249592169657, "grad_norm": 0.19021402299404144, "learning_rate": 2.570600996438504e-05, "loss": 0.1874, "num_input_tokens_seen": 71691008, "step": 33220 }, { "epoch": 5.420065252854813, "grad_norm": 0.20147915184497833, "learning_rate": 2.5698894792530432e-05, "loss": 0.0104, "num_input_tokens_seen": 71702112, "step": 33225 }, { "epoch": 5.420880913539968, "grad_norm": 0.3195922374725342, "learning_rate": 2.5691779564019862e-05, "loss": 0.0191, "num_input_tokens_seen": 71713696, "step": 33230 }, { "epoch": 5.421696574225122, "grad_norm": 0.41486889123916626, "learning_rate": 2.5684664279430125e-05, "loss": 0.2334, "num_input_tokens_seen": 71724800, "step": 33235 }, { "epoch": 5.422512234910277, "grad_norm": 10.956829071044922, "learning_rate": 2.5677548939338035e-05, "loss": 0.0639, "num_input_tokens_seen": 71736128, "step": 33240 }, { "epoch": 5.423327895595432, "grad_norm": 0.10004019737243652, "learning_rate": 2.5670433544320388e-05, "loss": 0.0899, "num_input_tokens_seen": 71746912, "step": 33245 }, { "epoch": 5.424143556280587, "grad_norm": 3.6930129528045654, "learning_rate": 2.5663318094953997e-05, "loss": 0.2368, "num_input_tokens_seen": 71757888, "step": 33250 }, { "epoch": 5.424959216965743, "grad_norm": 0.07590536773204803, "learning_rate": 2.5656202591815675e-05, "loss": 0.1169, "num_input_tokens_seen": 71769504, "step": 33255 }, { "epoch": 5.425774877650897, "grad_norm": 11.46955394744873, "learning_rate": 2.5649087035482243e-05, "loss": 0.0598, "num_input_tokens_seen": 71780352, "step": 33260 }, { "epoch": 5.426590538336052, "grad_norm": 2.9528470039367676, "learning_rate": 2.5641971426530525e-05, "loss": 0.1247, "num_input_tokens_seen": 71791488, "step": 33265 }, { "epoch": 5.427406199021207, "grad_norm": 0.3688742220401764, "learning_rate": 2.5634855765537347e-05, "loss": 0.0069, "num_input_tokens_seen": 71801728, "step": 33270 }, { "epoch": 5.428221859706362, "grad_norm": 11.852249145507812, "learning_rate": 2.5627740053079534e-05, "loss": 0.1007, "num_input_tokens_seen": 71812096, "step": 33275 }, { "epoch": 5.4290375203915175, "grad_norm": 0.11403562873601913, "learning_rate": 2.562062428973393e-05, "loss": 0.007, "num_input_tokens_seen": 71822752, "step": 33280 }, { "epoch": 5.429853181076672, "grad_norm": 0.026809722185134888, "learning_rate": 2.5613508476077365e-05, "loss": 0.0073, "num_input_tokens_seen": 71833056, "step": 33285 }, { "epoch": 5.430668841761827, "grad_norm": 0.06105045601725578, "learning_rate": 2.5606392612686697e-05, "loss": 0.1306, "num_input_tokens_seen": 71842528, "step": 33290 }, { "epoch": 5.431484502446982, "grad_norm": 0.19938939809799194, "learning_rate": 2.5599276700138764e-05, "loss": 0.0688, "num_input_tokens_seen": 71853440, "step": 33295 }, { "epoch": 5.432300163132137, "grad_norm": 0.07202395796775818, "learning_rate": 2.5592160739010425e-05, "loss": 0.0405, "num_input_tokens_seen": 71863040, "step": 33300 }, { "epoch": 5.433115823817292, "grad_norm": 0.044230278581380844, "learning_rate": 2.5585044729878526e-05, "loss": 0.0284, "num_input_tokens_seen": 71873792, "step": 33305 }, { "epoch": 5.433931484502447, "grad_norm": 4.130042552947998, "learning_rate": 2.557792867331994e-05, "loss": 0.0519, "num_input_tokens_seen": 71885856, "step": 33310 }, { "epoch": 5.434747145187602, "grad_norm": 0.1673230081796646, "learning_rate": 2.5570812569911518e-05, "loss": 0.007, "num_input_tokens_seen": 71896416, "step": 33315 }, { "epoch": 5.435562805872757, "grad_norm": 0.05843973904848099, "learning_rate": 2.556369642023013e-05, "loss": 0.0845, "num_input_tokens_seen": 71906976, "step": 33320 }, { "epoch": 5.436378466557912, "grad_norm": 0.03759612888097763, "learning_rate": 2.5556580224852655e-05, "loss": 0.037, "num_input_tokens_seen": 71916864, "step": 33325 }, { "epoch": 5.437194127243067, "grad_norm": 0.16183309257030487, "learning_rate": 2.5549463984355964e-05, "loss": 0.0048, "num_input_tokens_seen": 71928288, "step": 33330 }, { "epoch": 5.438009787928221, "grad_norm": 0.09352131932973862, "learning_rate": 2.5542347699316933e-05, "loss": 0.0933, "num_input_tokens_seen": 71939104, "step": 33335 }, { "epoch": 5.438825448613377, "grad_norm": 0.7050575017929077, "learning_rate": 2.553523137031244e-05, "loss": 0.0098, "num_input_tokens_seen": 71950208, "step": 33340 }, { "epoch": 5.439641109298532, "grad_norm": 3.3034229278564453, "learning_rate": 2.5528114997919384e-05, "loss": 0.3338, "num_input_tokens_seen": 71961312, "step": 33345 }, { "epoch": 5.440456769983687, "grad_norm": 3.80537748336792, "learning_rate": 2.5520998582714645e-05, "loss": 0.28, "num_input_tokens_seen": 71973248, "step": 33350 }, { "epoch": 5.441272430668842, "grad_norm": 0.05213324353098869, "learning_rate": 2.5513882125275113e-05, "loss": 0.0399, "num_input_tokens_seen": 71983648, "step": 33355 }, { "epoch": 5.442088091353996, "grad_norm": 0.021955057978630066, "learning_rate": 2.5506765626177697e-05, "loss": 0.0565, "num_input_tokens_seen": 71995072, "step": 33360 }, { "epoch": 5.442903752039152, "grad_norm": 18.607219696044922, "learning_rate": 2.5499649085999282e-05, "loss": 0.1689, "num_input_tokens_seen": 72006112, "step": 33365 }, { "epoch": 5.443719412724307, "grad_norm": 0.10573983937501907, "learning_rate": 2.549253250531678e-05, "loss": 0.007, "num_input_tokens_seen": 72017184, "step": 33370 }, { "epoch": 5.444535073409462, "grad_norm": 3.3179361820220947, "learning_rate": 2.548541588470709e-05, "loss": 0.0748, "num_input_tokens_seen": 72028352, "step": 33375 }, { "epoch": 5.445350734094617, "grad_norm": 0.037115976214408875, "learning_rate": 2.547829922474713e-05, "loss": 0.0365, "num_input_tokens_seen": 72037664, "step": 33380 }, { "epoch": 5.446166394779771, "grad_norm": 0.151457279920578, "learning_rate": 2.5471182526013805e-05, "loss": 0.0537, "num_input_tokens_seen": 72049120, "step": 33385 }, { "epoch": 5.446982055464926, "grad_norm": 0.39002159237861633, "learning_rate": 2.546406578908403e-05, "loss": 0.1165, "num_input_tokens_seen": 72059104, "step": 33390 }, { "epoch": 5.447797716150082, "grad_norm": 3.467186450958252, "learning_rate": 2.545694901453473e-05, "loss": 0.1031, "num_input_tokens_seen": 72070048, "step": 33395 }, { "epoch": 5.448613376835237, "grad_norm": 0.12871865928173065, "learning_rate": 2.5449832202942832e-05, "loss": 0.265, "num_input_tokens_seen": 72080928, "step": 33400 }, { "epoch": 5.4494290375203915, "grad_norm": 0.06301422417163849, "learning_rate": 2.5442715354885237e-05, "loss": 0.0519, "num_input_tokens_seen": 72091488, "step": 33405 }, { "epoch": 5.450244698205546, "grad_norm": 5.510315895080566, "learning_rate": 2.5435598470938903e-05, "loss": 0.0918, "num_input_tokens_seen": 72102528, "step": 33410 }, { "epoch": 5.451060358890701, "grad_norm": 1.7421666383743286, "learning_rate": 2.5428481551680745e-05, "loss": 0.0615, "num_input_tokens_seen": 72112992, "step": 33415 }, { "epoch": 5.451876019575856, "grad_norm": 0.5023473501205444, "learning_rate": 2.5421364597687696e-05, "loss": 0.0449, "num_input_tokens_seen": 72124096, "step": 33420 }, { "epoch": 5.452691680261012, "grad_norm": 0.3875141739845276, "learning_rate": 2.5414247609536696e-05, "loss": 0.044, "num_input_tokens_seen": 72134624, "step": 33425 }, { "epoch": 5.4535073409461665, "grad_norm": 2.727522611618042, "learning_rate": 2.5407130587804685e-05, "loss": 0.2095, "num_input_tokens_seen": 72145344, "step": 33430 }, { "epoch": 5.454323001631321, "grad_norm": 0.17522837221622467, "learning_rate": 2.5400013533068594e-05, "loss": 0.0596, "num_input_tokens_seen": 72157312, "step": 33435 }, { "epoch": 5.455138662316476, "grad_norm": 16.254186630249023, "learning_rate": 2.5392896445905385e-05, "loss": 0.2993, "num_input_tokens_seen": 72168032, "step": 33440 }, { "epoch": 5.455954323001631, "grad_norm": 0.13682357966899872, "learning_rate": 2.538577932689199e-05, "loss": 0.0372, "num_input_tokens_seen": 72178944, "step": 33445 }, { "epoch": 5.456769983686787, "grad_norm": 5.391839027404785, "learning_rate": 2.537866217660537e-05, "loss": 0.0823, "num_input_tokens_seen": 72188640, "step": 33450 }, { "epoch": 5.4575856443719415, "grad_norm": 0.13826815783977509, "learning_rate": 2.5371544995622472e-05, "loss": 0.1648, "num_input_tokens_seen": 72198720, "step": 33455 }, { "epoch": 5.458401305057096, "grad_norm": 0.025393128395080566, "learning_rate": 2.536442778452025e-05, "loss": 0.0092, "num_input_tokens_seen": 72208704, "step": 33460 }, { "epoch": 5.459216965742251, "grad_norm": 0.035799961537122726, "learning_rate": 2.5357310543875667e-05, "loss": 0.1064, "num_input_tokens_seen": 72218688, "step": 33465 }, { "epoch": 5.460032626427406, "grad_norm": 0.5198113918304443, "learning_rate": 2.5350193274265678e-05, "loss": 0.0613, "num_input_tokens_seen": 72230336, "step": 33470 }, { "epoch": 5.460848287112561, "grad_norm": 0.08530508726835251, "learning_rate": 2.5343075976267234e-05, "loss": 0.0067, "num_input_tokens_seen": 72241024, "step": 33475 }, { "epoch": 5.4616639477977165, "grad_norm": 1.217314600944519, "learning_rate": 2.533595865045732e-05, "loss": 0.0833, "num_input_tokens_seen": 72252064, "step": 33480 }, { "epoch": 5.462479608482871, "grad_norm": 2.1794791221618652, "learning_rate": 2.532884129741289e-05, "loss": 0.2085, "num_input_tokens_seen": 72263616, "step": 33485 }, { "epoch": 5.463295269168026, "grad_norm": 4.713282108306885, "learning_rate": 2.5321723917710923e-05, "loss": 0.2499, "num_input_tokens_seen": 72274784, "step": 33490 }, { "epoch": 5.464110929853181, "grad_norm": 0.04681149125099182, "learning_rate": 2.531460651192838e-05, "loss": 0.1314, "num_input_tokens_seen": 72284064, "step": 33495 }, { "epoch": 5.464926590538336, "grad_norm": 2.854098320007324, "learning_rate": 2.5307489080642227e-05, "loss": 0.1856, "num_input_tokens_seen": 72295616, "step": 33500 }, { "epoch": 5.465742251223491, "grad_norm": 0.08297780901193619, "learning_rate": 2.530037162442946e-05, "loss": 0.1858, "num_input_tokens_seen": 72305952, "step": 33505 }, { "epoch": 5.466557911908646, "grad_norm": 0.11627074331045151, "learning_rate": 2.529325414386704e-05, "loss": 0.0695, "num_input_tokens_seen": 72316512, "step": 33510 }, { "epoch": 5.467373572593801, "grad_norm": 0.10977844893932343, "learning_rate": 2.5286136639531956e-05, "loss": 0.1574, "num_input_tokens_seen": 72327328, "step": 33515 }, { "epoch": 5.468189233278956, "grad_norm": 4.253857135772705, "learning_rate": 2.527901911200118e-05, "loss": 0.0769, "num_input_tokens_seen": 72338560, "step": 33520 }, { "epoch": 5.469004893964111, "grad_norm": 4.271973133087158, "learning_rate": 2.5271901561851703e-05, "loss": 0.0998, "num_input_tokens_seen": 72349312, "step": 33525 }, { "epoch": 5.4698205546492655, "grad_norm": 8.721693992614746, "learning_rate": 2.52647839896605e-05, "loss": 0.0377, "num_input_tokens_seen": 72361024, "step": 33530 }, { "epoch": 5.470636215334421, "grad_norm": 0.10647906363010406, "learning_rate": 2.525766639600457e-05, "loss": 0.2605, "num_input_tokens_seen": 72372352, "step": 33535 }, { "epoch": 5.471451876019576, "grad_norm": 0.11971849203109741, "learning_rate": 2.525054878146089e-05, "loss": 0.0552, "num_input_tokens_seen": 72383136, "step": 33540 }, { "epoch": 5.472267536704731, "grad_norm": 0.08364076167345047, "learning_rate": 2.5243431146606456e-05, "loss": 0.0049, "num_input_tokens_seen": 72392224, "step": 33545 }, { "epoch": 5.473083197389886, "grad_norm": 0.12079733610153198, "learning_rate": 2.5236313492018254e-05, "loss": 0.0166, "num_input_tokens_seen": 72402176, "step": 33550 }, { "epoch": 5.4738988580750405, "grad_norm": 10.084371566772461, "learning_rate": 2.5229195818273284e-05, "loss": 0.135, "num_input_tokens_seen": 72413792, "step": 33555 }, { "epoch": 5.474714518760196, "grad_norm": 9.428359031677246, "learning_rate": 2.5222078125948534e-05, "loss": 0.1116, "num_input_tokens_seen": 72425792, "step": 33560 }, { "epoch": 5.475530179445351, "grad_norm": 0.036408375948667526, "learning_rate": 2.5214960415621007e-05, "loss": 0.0036, "num_input_tokens_seen": 72436864, "step": 33565 }, { "epoch": 5.476345840130506, "grad_norm": 0.14001904428005219, "learning_rate": 2.5207842687867705e-05, "loss": 0.0056, "num_input_tokens_seen": 72448096, "step": 33570 }, { "epoch": 5.477161500815661, "grad_norm": 0.4629044234752655, "learning_rate": 2.5200724943265614e-05, "loss": 0.0066, "num_input_tokens_seen": 72459776, "step": 33575 }, { "epoch": 5.4779771615008155, "grad_norm": 3.62270450592041, "learning_rate": 2.519360718239174e-05, "loss": 0.2214, "num_input_tokens_seen": 72470528, "step": 33580 }, { "epoch": 5.47879282218597, "grad_norm": 11.740579605102539, "learning_rate": 2.5186489405823087e-05, "loss": 0.435, "num_input_tokens_seen": 72482144, "step": 33585 }, { "epoch": 5.479608482871126, "grad_norm": 4.328428745269775, "learning_rate": 2.517937161413666e-05, "loss": 0.1729, "num_input_tokens_seen": 72493504, "step": 33590 }, { "epoch": 5.480424143556281, "grad_norm": 0.04599461331963539, "learning_rate": 2.517225380790946e-05, "loss": 0.1162, "num_input_tokens_seen": 72503904, "step": 33595 }, { "epoch": 5.481239804241436, "grad_norm": 4.032310485839844, "learning_rate": 2.5165135987718486e-05, "loss": 0.2707, "num_input_tokens_seen": 72514560, "step": 33600 }, { "epoch": 5.4820554649265905, "grad_norm": 0.14988665282726288, "learning_rate": 2.515801815414075e-05, "loss": 0.1035, "num_input_tokens_seen": 72526080, "step": 33605 }, { "epoch": 5.482871125611745, "grad_norm": 0.06683401018381119, "learning_rate": 2.5150900307753267e-05, "loss": 0.0934, "num_input_tokens_seen": 72536992, "step": 33610 }, { "epoch": 5.4836867862969, "grad_norm": 0.2525116801261902, "learning_rate": 2.5143782449133036e-05, "loss": 0.1114, "num_input_tokens_seen": 72548224, "step": 33615 }, { "epoch": 5.484502446982056, "grad_norm": 0.0889025330543518, "learning_rate": 2.5136664578857072e-05, "loss": 0.0169, "num_input_tokens_seen": 72559744, "step": 33620 }, { "epoch": 5.485318107667211, "grad_norm": 2.2922513484954834, "learning_rate": 2.5129546697502382e-05, "loss": 0.0187, "num_input_tokens_seen": 72570528, "step": 33625 }, { "epoch": 5.486133768352365, "grad_norm": 0.5152546167373657, "learning_rate": 2.512242880564598e-05, "loss": 0.0991, "num_input_tokens_seen": 72582112, "step": 33630 }, { "epoch": 5.48694942903752, "grad_norm": 0.09374929219484329, "learning_rate": 2.5115310903864874e-05, "loss": 0.1365, "num_input_tokens_seen": 72592736, "step": 33635 }, { "epoch": 5.487765089722675, "grad_norm": 0.0640566498041153, "learning_rate": 2.510819299273609e-05, "loss": 0.0885, "num_input_tokens_seen": 72604096, "step": 33640 }, { "epoch": 5.488580750407831, "grad_norm": 2.2651100158691406, "learning_rate": 2.510107507283663e-05, "loss": 0.0139, "num_input_tokens_seen": 72614560, "step": 33645 }, { "epoch": 5.489396411092986, "grad_norm": 4.540492534637451, "learning_rate": 2.5093957144743507e-05, "loss": 0.2888, "num_input_tokens_seen": 72625536, "step": 33650 }, { "epoch": 5.49021207177814, "grad_norm": 0.2596532702445984, "learning_rate": 2.5086839209033747e-05, "loss": 0.1229, "num_input_tokens_seen": 72636384, "step": 33655 }, { "epoch": 5.491027732463295, "grad_norm": 9.449118614196777, "learning_rate": 2.507972126628435e-05, "loss": 0.2381, "num_input_tokens_seen": 72646272, "step": 33660 }, { "epoch": 5.49184339314845, "grad_norm": 0.06537148356437683, "learning_rate": 2.5072603317072353e-05, "loss": 0.0975, "num_input_tokens_seen": 72657184, "step": 33665 }, { "epoch": 5.492659053833605, "grad_norm": 0.25678354501724243, "learning_rate": 2.5065485361974754e-05, "loss": 0.1541, "num_input_tokens_seen": 72668544, "step": 33670 }, { "epoch": 5.493474714518761, "grad_norm": 0.0459088459610939, "learning_rate": 2.505836740156859e-05, "loss": 0.0104, "num_input_tokens_seen": 72679104, "step": 33675 }, { "epoch": 5.494290375203915, "grad_norm": 1.9900023937225342, "learning_rate": 2.5051249436430862e-05, "loss": 0.2362, "num_input_tokens_seen": 72689856, "step": 33680 }, { "epoch": 5.49510603588907, "grad_norm": 0.3316431939601898, "learning_rate": 2.5044131467138597e-05, "loss": 0.0106, "num_input_tokens_seen": 72701312, "step": 33685 }, { "epoch": 5.495921696574225, "grad_norm": 0.058297768235206604, "learning_rate": 2.5037013494268814e-05, "loss": 0.0049, "num_input_tokens_seen": 72711808, "step": 33690 }, { "epoch": 5.49673735725938, "grad_norm": 0.16702599823474884, "learning_rate": 2.502989551839852e-05, "loss": 0.0562, "num_input_tokens_seen": 72722528, "step": 33695 }, { "epoch": 5.497553017944535, "grad_norm": 10.069947242736816, "learning_rate": 2.5022777540104752e-05, "loss": 0.1806, "num_input_tokens_seen": 72733408, "step": 33700 }, { "epoch": 5.49836867862969, "grad_norm": 11.7423677444458, "learning_rate": 2.5015659559964516e-05, "loss": 0.0934, "num_input_tokens_seen": 72744544, "step": 33705 }, { "epoch": 5.499184339314845, "grad_norm": 0.18311762809753418, "learning_rate": 2.5008541578554838e-05, "loss": 0.0133, "num_input_tokens_seen": 72755712, "step": 33710 }, { "epoch": 5.5, "grad_norm": 0.5082947015762329, "learning_rate": 2.5001423596452738e-05, "loss": 0.0486, "num_input_tokens_seen": 72765696, "step": 33715 }, { "epoch": 5.5, "eval_loss": 0.16719458997249603, "eval_runtime": 132.9383, "eval_samples_per_second": 20.498, "eval_steps_per_second": 5.13, "num_input_tokens_seen": 72765696, "step": 33715 }, { "epoch": 5.500815660685155, "grad_norm": 0.16994374990463257, "learning_rate": 2.4994305614235228e-05, "loss": 0.0077, "num_input_tokens_seen": 72776576, "step": 33720 }, { "epoch": 5.50163132137031, "grad_norm": 0.14518482983112335, "learning_rate": 2.498718763247934e-05, "loss": 0.0109, "num_input_tokens_seen": 72786112, "step": 33725 }, { "epoch": 5.502446982055465, "grad_norm": 0.035523671656847, "learning_rate": 2.4980069651762085e-05, "loss": 0.0184, "num_input_tokens_seen": 72796384, "step": 33730 }, { "epoch": 5.50326264274062, "grad_norm": 0.06263341754674911, "learning_rate": 2.4972951672660487e-05, "loss": 0.0131, "num_input_tokens_seen": 72806720, "step": 33735 }, { "epoch": 5.504078303425775, "grad_norm": 6.641420841217041, "learning_rate": 2.4965833695751563e-05, "loss": 0.0889, "num_input_tokens_seen": 72817408, "step": 33740 }, { "epoch": 5.50489396411093, "grad_norm": 4.616932392120361, "learning_rate": 2.4958715721612335e-05, "loss": 0.1256, "num_input_tokens_seen": 72828096, "step": 33745 }, { "epoch": 5.505709624796085, "grad_norm": 0.09794651716947556, "learning_rate": 2.495159775081982e-05, "loss": 0.0048, "num_input_tokens_seen": 72839040, "step": 33750 }, { "epoch": 5.506525285481239, "grad_norm": 1.5033702850341797, "learning_rate": 2.4944479783951037e-05, "loss": 0.0201, "num_input_tokens_seen": 72849728, "step": 33755 }, { "epoch": 5.507340946166395, "grad_norm": 0.0542055144906044, "learning_rate": 2.4937361821583e-05, "loss": 0.1689, "num_input_tokens_seen": 72859872, "step": 33760 }, { "epoch": 5.50815660685155, "grad_norm": 12.139034271240234, "learning_rate": 2.4930243864292736e-05, "loss": 0.3785, "num_input_tokens_seen": 72871648, "step": 33765 }, { "epoch": 5.508972267536705, "grad_norm": 0.08845608681440353, "learning_rate": 2.492312591265726e-05, "loss": 0.0111, "num_input_tokens_seen": 72883136, "step": 33770 }, { "epoch": 5.50978792822186, "grad_norm": 0.36479949951171875, "learning_rate": 2.4916007967253576e-05, "loss": 0.0066, "num_input_tokens_seen": 72892896, "step": 33775 }, { "epoch": 5.510603588907014, "grad_norm": 0.020641397684812546, "learning_rate": 2.490889002865872e-05, "loss": 0.1182, "num_input_tokens_seen": 72904576, "step": 33780 }, { "epoch": 5.511419249592169, "grad_norm": 0.07545993477106094, "learning_rate": 2.4901772097449703e-05, "loss": 0.0762, "num_input_tokens_seen": 72914016, "step": 33785 }, { "epoch": 5.512234910277325, "grad_norm": 0.06554381549358368, "learning_rate": 2.4894654174203535e-05, "loss": 0.0166, "num_input_tokens_seen": 72925152, "step": 33790 }, { "epoch": 5.51305057096248, "grad_norm": 0.7819629311561584, "learning_rate": 2.488753625949723e-05, "loss": 0.0069, "num_input_tokens_seen": 72936320, "step": 33795 }, { "epoch": 5.513866231647635, "grad_norm": 0.027263063937425613, "learning_rate": 2.488041835390781e-05, "loss": 0.005, "num_input_tokens_seen": 72946848, "step": 33800 }, { "epoch": 5.514681892332789, "grad_norm": 0.08070861548185349, "learning_rate": 2.4873300458012285e-05, "loss": 0.0044, "num_input_tokens_seen": 72958208, "step": 33805 }, { "epoch": 5.515497553017944, "grad_norm": 4.997661113739014, "learning_rate": 2.486618257238767e-05, "loss": 0.0146, "num_input_tokens_seen": 72969152, "step": 33810 }, { "epoch": 5.5163132137031, "grad_norm": 0.07219992578029633, "learning_rate": 2.4859064697610977e-05, "loss": 0.237, "num_input_tokens_seen": 72981664, "step": 33815 }, { "epoch": 5.517128874388255, "grad_norm": 0.29128319025039673, "learning_rate": 2.485194683425921e-05, "loss": 0.0061, "num_input_tokens_seen": 72992224, "step": 33820 }, { "epoch": 5.5179445350734095, "grad_norm": 0.06747906655073166, "learning_rate": 2.4844828982909388e-05, "loss": 0.0388, "num_input_tokens_seen": 73003712, "step": 33825 }, { "epoch": 5.518760195758564, "grad_norm": 0.09900006651878357, "learning_rate": 2.4837711144138514e-05, "loss": 0.259, "num_input_tokens_seen": 73015328, "step": 33830 }, { "epoch": 5.519575856443719, "grad_norm": 3.314185857772827, "learning_rate": 2.48305933185236e-05, "loss": 0.1504, "num_input_tokens_seen": 73026304, "step": 33835 }, { "epoch": 5.520391517128875, "grad_norm": 0.2923809587955475, "learning_rate": 2.4823475506641646e-05, "loss": 0.2657, "num_input_tokens_seen": 73038688, "step": 33840 }, { "epoch": 5.52120717781403, "grad_norm": 0.23544256389141083, "learning_rate": 2.481635770906967e-05, "loss": 0.0075, "num_input_tokens_seen": 73049824, "step": 33845 }, { "epoch": 5.5220228384991845, "grad_norm": 0.06933756172657013, "learning_rate": 2.4809239926384664e-05, "loss": 0.1956, "num_input_tokens_seen": 73061760, "step": 33850 }, { "epoch": 5.522838499184339, "grad_norm": 0.17645519971847534, "learning_rate": 2.480212215916364e-05, "loss": 0.1016, "num_input_tokens_seen": 73072704, "step": 33855 }, { "epoch": 5.523654159869494, "grad_norm": 3.188039541244507, "learning_rate": 2.4795004407983593e-05, "loss": 0.1201, "num_input_tokens_seen": 73084032, "step": 33860 }, { "epoch": 5.524469820554649, "grad_norm": 0.21747814118862152, "learning_rate": 2.4787886673421536e-05, "loss": 0.0762, "num_input_tokens_seen": 73094592, "step": 33865 }, { "epoch": 5.525285481239804, "grad_norm": 0.2958066165447235, "learning_rate": 2.4780768956054457e-05, "loss": 0.1048, "num_input_tokens_seen": 73106304, "step": 33870 }, { "epoch": 5.5261011419249595, "grad_norm": 0.08281079679727554, "learning_rate": 2.477365125645936e-05, "loss": 0.0069, "num_input_tokens_seen": 73117088, "step": 33875 }, { "epoch": 5.526916802610114, "grad_norm": 0.08794347196817398, "learning_rate": 2.4766533575213242e-05, "loss": 0.0521, "num_input_tokens_seen": 73127776, "step": 33880 }, { "epoch": 5.527732463295269, "grad_norm": 1.4676828384399414, "learning_rate": 2.4759415912893096e-05, "loss": 0.0064, "num_input_tokens_seen": 73139904, "step": 33885 }, { "epoch": 5.528548123980424, "grad_norm": 0.09515149891376495, "learning_rate": 2.4752298270075918e-05, "loss": 0.0054, "num_input_tokens_seen": 73150688, "step": 33890 }, { "epoch": 5.529363784665579, "grad_norm": 0.16909362375736237, "learning_rate": 2.47451806473387e-05, "loss": 0.0183, "num_input_tokens_seen": 73160576, "step": 33895 }, { "epoch": 5.5301794453507345, "grad_norm": 0.14664624631404877, "learning_rate": 2.4738063045258415e-05, "loss": 0.1083, "num_input_tokens_seen": 73170272, "step": 33900 }, { "epoch": 5.530995106035889, "grad_norm": 0.5503304600715637, "learning_rate": 2.4730945464412085e-05, "loss": 0.23, "num_input_tokens_seen": 73181760, "step": 33905 }, { "epoch": 5.531810766721044, "grad_norm": 0.024620776996016502, "learning_rate": 2.472382790537668e-05, "loss": 0.0087, "num_input_tokens_seen": 73192096, "step": 33910 }, { "epoch": 5.532626427406199, "grad_norm": 3.65716290473938, "learning_rate": 2.4716710368729187e-05, "loss": 0.1757, "num_input_tokens_seen": 73201408, "step": 33915 }, { "epoch": 5.533442088091354, "grad_norm": 0.3498358130455017, "learning_rate": 2.4709592855046587e-05, "loss": 0.0301, "num_input_tokens_seen": 73211936, "step": 33920 }, { "epoch": 5.5342577487765094, "grad_norm": 16.28522491455078, "learning_rate": 2.4702475364905864e-05, "loss": 0.1952, "num_input_tokens_seen": 73223392, "step": 33925 }, { "epoch": 5.535073409461664, "grad_norm": 4.879078388214111, "learning_rate": 2.4695357898883998e-05, "loss": 0.1818, "num_input_tokens_seen": 73233920, "step": 33930 }, { "epoch": 5.535889070146819, "grad_norm": 0.27701303362846375, "learning_rate": 2.4688240457557967e-05, "loss": 0.0074, "num_input_tokens_seen": 73244288, "step": 33935 }, { "epoch": 5.536704730831974, "grad_norm": 2.499258279800415, "learning_rate": 2.4681123041504746e-05, "loss": 0.3055, "num_input_tokens_seen": 73256256, "step": 33940 }, { "epoch": 5.537520391517129, "grad_norm": 2.6599695682525635, "learning_rate": 2.46740056513013e-05, "loss": 0.2024, "num_input_tokens_seen": 73267264, "step": 33945 }, { "epoch": 5.5383360522022835, "grad_norm": 0.18573834002017975, "learning_rate": 2.466688828752462e-05, "loss": 0.1245, "num_input_tokens_seen": 73279168, "step": 33950 }, { "epoch": 5.539151712887438, "grad_norm": 2.9579179286956787, "learning_rate": 2.4659770950751666e-05, "loss": 0.1006, "num_input_tokens_seen": 73288128, "step": 33955 }, { "epoch": 5.539967373572594, "grad_norm": 5.895270824432373, "learning_rate": 2.4652653641559404e-05, "loss": 0.263, "num_input_tokens_seen": 73297472, "step": 33960 }, { "epoch": 5.540783034257749, "grad_norm": 0.10312424600124359, "learning_rate": 2.46455363605248e-05, "loss": 0.0883, "num_input_tokens_seen": 73309504, "step": 33965 }, { "epoch": 5.541598694942904, "grad_norm": 0.10254838317632675, "learning_rate": 2.4638419108224817e-05, "loss": 0.0273, "num_input_tokens_seen": 73320192, "step": 33970 }, { "epoch": 5.5424143556280585, "grad_norm": 0.04222483187913895, "learning_rate": 2.4631301885236415e-05, "loss": 0.1335, "num_input_tokens_seen": 73331168, "step": 33975 }, { "epoch": 5.543230016313213, "grad_norm": 10.71635627746582, "learning_rate": 2.4624184692136554e-05, "loss": 0.1051, "num_input_tokens_seen": 73342944, "step": 33980 }, { "epoch": 5.544045676998369, "grad_norm": 0.07811756432056427, "learning_rate": 2.4617067529502188e-05, "loss": 0.0046, "num_input_tokens_seen": 73353920, "step": 33985 }, { "epoch": 5.544861337683524, "grad_norm": 3.223130226135254, "learning_rate": 2.460995039791027e-05, "loss": 0.3277, "num_input_tokens_seen": 73366432, "step": 33990 }, { "epoch": 5.545676998368679, "grad_norm": 0.07836192101240158, "learning_rate": 2.4602833297937755e-05, "loss": 0.0763, "num_input_tokens_seen": 73377824, "step": 33995 }, { "epoch": 5.5464926590538335, "grad_norm": 0.31072986125946045, "learning_rate": 2.4595716230161586e-05, "loss": 0.0137, "num_input_tokens_seen": 73387840, "step": 34000 }, { "epoch": 5.547308319738988, "grad_norm": 0.13992208242416382, "learning_rate": 2.45885991951587e-05, "loss": 0.0282, "num_input_tokens_seen": 73398304, "step": 34005 }, { "epoch": 5.548123980424144, "grad_norm": 8.604073524475098, "learning_rate": 2.458148219350606e-05, "loss": 0.2722, "num_input_tokens_seen": 73410016, "step": 34010 }, { "epoch": 5.548939641109299, "grad_norm": 33.65258026123047, "learning_rate": 2.45743652257806e-05, "loss": 0.0335, "num_input_tokens_seen": 73420960, "step": 34015 }, { "epoch": 5.549755301794454, "grad_norm": 0.2566872537136078, "learning_rate": 2.4567248292559253e-05, "loss": 0.0085, "num_input_tokens_seen": 73431712, "step": 34020 }, { "epoch": 5.5505709624796085, "grad_norm": 4.51474142074585, "learning_rate": 2.4560131394418958e-05, "loss": 0.2098, "num_input_tokens_seen": 73441376, "step": 34025 }, { "epoch": 5.551386623164763, "grad_norm": 0.10575133562088013, "learning_rate": 2.4553014531936632e-05, "loss": 0.0309, "num_input_tokens_seen": 73450752, "step": 34030 }, { "epoch": 5.552202283849918, "grad_norm": 3.058199167251587, "learning_rate": 2.4545897705689223e-05, "loss": 0.1061, "num_input_tokens_seen": 73461952, "step": 34035 }, { "epoch": 5.553017944535073, "grad_norm": 6.396117210388184, "learning_rate": 2.4538780916253657e-05, "loss": 0.0286, "num_input_tokens_seen": 73472064, "step": 34040 }, { "epoch": 5.553833605220229, "grad_norm": 0.06466473639011383, "learning_rate": 2.4531664164206843e-05, "loss": 0.0063, "num_input_tokens_seen": 73483200, "step": 34045 }, { "epoch": 5.554649265905383, "grad_norm": 0.3745845556259155, "learning_rate": 2.4524547450125713e-05, "loss": 0.0039, "num_input_tokens_seen": 73494080, "step": 34050 }, { "epoch": 5.555464926590538, "grad_norm": 0.0476202592253685, "learning_rate": 2.4517430774587174e-05, "loss": 0.2585, "num_input_tokens_seen": 73503616, "step": 34055 }, { "epoch": 5.556280587275693, "grad_norm": 0.22952498495578766, "learning_rate": 2.4510314138168146e-05, "loss": 0.0652, "num_input_tokens_seen": 73515328, "step": 34060 }, { "epoch": 5.557096247960848, "grad_norm": 0.07610568404197693, "learning_rate": 2.4503197541445545e-05, "loss": 0.1514, "num_input_tokens_seen": 73527520, "step": 34065 }, { "epoch": 5.557911908646004, "grad_norm": 0.9586765170097351, "learning_rate": 2.4496080984996264e-05, "loss": 0.179, "num_input_tokens_seen": 73538688, "step": 34070 }, { "epoch": 5.558727569331158, "grad_norm": 4.535788059234619, "learning_rate": 2.448896446939722e-05, "loss": 0.0607, "num_input_tokens_seen": 73548544, "step": 34075 }, { "epoch": 5.559543230016313, "grad_norm": 0.20011794567108154, "learning_rate": 2.4481847995225307e-05, "loss": 0.0705, "num_input_tokens_seen": 73559712, "step": 34080 }, { "epoch": 5.560358890701468, "grad_norm": 0.09488652646541595, "learning_rate": 2.4474731563057426e-05, "loss": 0.0051, "num_input_tokens_seen": 73570304, "step": 34085 }, { "epoch": 5.561174551386623, "grad_norm": 2.0041866302490234, "learning_rate": 2.446761517347046e-05, "loss": 0.1299, "num_input_tokens_seen": 73581056, "step": 34090 }, { "epoch": 5.561990212071779, "grad_norm": 0.039340272545814514, "learning_rate": 2.446049882704132e-05, "loss": 0.0084, "num_input_tokens_seen": 73592064, "step": 34095 }, { "epoch": 5.562805872756933, "grad_norm": 0.13631676137447357, "learning_rate": 2.4453382524346882e-05, "loss": 0.0044, "num_input_tokens_seen": 73603296, "step": 34100 }, { "epoch": 5.563621533442088, "grad_norm": 0.2204047441482544, "learning_rate": 2.444626626596403e-05, "loss": 0.0689, "num_input_tokens_seen": 73612704, "step": 34105 }, { "epoch": 5.564437194127243, "grad_norm": 3.499032735824585, "learning_rate": 2.4439150052469644e-05, "loss": 0.1091, "num_input_tokens_seen": 73623168, "step": 34110 }, { "epoch": 5.565252854812398, "grad_norm": 0.06525424122810364, "learning_rate": 2.4432033884440585e-05, "loss": 0.3741, "num_input_tokens_seen": 73633088, "step": 34115 }, { "epoch": 5.566068515497553, "grad_norm": 0.24328632652759552, "learning_rate": 2.4424917762453757e-05, "loss": 0.0066, "num_input_tokens_seen": 73644512, "step": 34120 }, { "epoch": 5.566884176182708, "grad_norm": 0.05231905356049538, "learning_rate": 2.4417801687086013e-05, "loss": 0.0376, "num_input_tokens_seen": 73655008, "step": 34125 }, { "epoch": 5.567699836867863, "grad_norm": 0.5238412618637085, "learning_rate": 2.4410685658914213e-05, "loss": 0.0184, "num_input_tokens_seen": 73665216, "step": 34130 }, { "epoch": 5.568515497553018, "grad_norm": 0.07202116400003433, "learning_rate": 2.4403569678515227e-05, "loss": 0.0051, "num_input_tokens_seen": 73676384, "step": 34135 }, { "epoch": 5.569331158238173, "grad_norm": 0.1429869532585144, "learning_rate": 2.4396453746465912e-05, "loss": 0.2107, "num_input_tokens_seen": 73687456, "step": 34140 }, { "epoch": 5.570146818923328, "grad_norm": 0.3126033544540405, "learning_rate": 2.4389337863343117e-05, "loss": 0.0808, "num_input_tokens_seen": 73699424, "step": 34145 }, { "epoch": 5.5709624796084825, "grad_norm": 0.1548863798379898, "learning_rate": 2.4382222029723693e-05, "loss": 0.0059, "num_input_tokens_seen": 73710400, "step": 34150 }, { "epoch": 5.571778140293638, "grad_norm": 0.1327655166387558, "learning_rate": 2.4375106246184484e-05, "loss": 0.3141, "num_input_tokens_seen": 73721184, "step": 34155 }, { "epoch": 5.572593800978793, "grad_norm": 0.3220544457435608, "learning_rate": 2.4367990513302336e-05, "loss": 0.047, "num_input_tokens_seen": 73732032, "step": 34160 }, { "epoch": 5.573409461663948, "grad_norm": 4.705662727355957, "learning_rate": 2.4360874831654083e-05, "loss": 0.2368, "num_input_tokens_seen": 73742944, "step": 34165 }, { "epoch": 5.574225122349103, "grad_norm": 0.08677390217781067, "learning_rate": 2.4353759201816555e-05, "loss": 0.0184, "num_input_tokens_seen": 73753312, "step": 34170 }, { "epoch": 5.575040783034257, "grad_norm": 0.255723237991333, "learning_rate": 2.4346643624366586e-05, "loss": 0.0963, "num_input_tokens_seen": 73764480, "step": 34175 }, { "epoch": 5.575856443719413, "grad_norm": 4.120612621307373, "learning_rate": 2.4339528099881e-05, "loss": 0.1556, "num_input_tokens_seen": 73774496, "step": 34180 }, { "epoch": 5.576672104404568, "grad_norm": 0.13706041872501373, "learning_rate": 2.433241262893662e-05, "loss": 0.0919, "num_input_tokens_seen": 73786688, "step": 34185 }, { "epoch": 5.577487765089723, "grad_norm": 0.13830256462097168, "learning_rate": 2.432529721211026e-05, "loss": 0.134, "num_input_tokens_seen": 73798080, "step": 34190 }, { "epoch": 5.578303425774878, "grad_norm": 0.3326249420642853, "learning_rate": 2.4318181849978733e-05, "loss": 0.005, "num_input_tokens_seen": 73808736, "step": 34195 }, { "epoch": 5.579119086460032, "grad_norm": 0.2934521436691284, "learning_rate": 2.4311066543118842e-05, "loss": 0.0188, "num_input_tokens_seen": 73819232, "step": 34200 }, { "epoch": 5.579934747145187, "grad_norm": 0.2670780420303345, "learning_rate": 2.4303951292107395e-05, "loss": 0.0091, "num_input_tokens_seen": 73829312, "step": 34205 }, { "epoch": 5.580750407830343, "grad_norm": 0.0827723890542984, "learning_rate": 2.4296836097521186e-05, "loss": 0.01, "num_input_tokens_seen": 73840320, "step": 34210 }, { "epoch": 5.581566068515498, "grad_norm": 0.13001371920108795, "learning_rate": 2.4289720959937008e-05, "loss": 0.0435, "num_input_tokens_seen": 73850464, "step": 34215 }, { "epoch": 5.582381729200653, "grad_norm": 0.05852844938635826, "learning_rate": 2.4282605879931647e-05, "loss": 0.0074, "num_input_tokens_seen": 73861152, "step": 34220 }, { "epoch": 5.583197389885807, "grad_norm": 0.036576732993125916, "learning_rate": 2.4275490858081903e-05, "loss": 0.0025, "num_input_tokens_seen": 73872288, "step": 34225 }, { "epoch": 5.584013050570962, "grad_norm": 0.10926913470029831, "learning_rate": 2.4268375894964544e-05, "loss": 0.0077, "num_input_tokens_seen": 73882720, "step": 34230 }, { "epoch": 5.584828711256117, "grad_norm": 0.08241516351699829, "learning_rate": 2.426126099115635e-05, "loss": 0.1167, "num_input_tokens_seen": 73892448, "step": 34235 }, { "epoch": 5.585644371941273, "grad_norm": 4.633983135223389, "learning_rate": 2.4254146147234087e-05, "loss": 0.2502, "num_input_tokens_seen": 73904224, "step": 34240 }, { "epoch": 5.5864600326264275, "grad_norm": 3.368659496307373, "learning_rate": 2.4247031363774523e-05, "loss": 0.1073, "num_input_tokens_seen": 73915424, "step": 34245 }, { "epoch": 5.587275693311582, "grad_norm": 0.18018129467964172, "learning_rate": 2.4239916641354417e-05, "loss": 0.1531, "num_input_tokens_seen": 73925344, "step": 34250 }, { "epoch": 5.588091353996737, "grad_norm": 0.1320713758468628, "learning_rate": 2.4232801980550523e-05, "loss": 0.0566, "num_input_tokens_seen": 73935904, "step": 34255 }, { "epoch": 5.588907014681892, "grad_norm": 0.05851376801729202, "learning_rate": 2.422568738193959e-05, "loss": 0.0033, "num_input_tokens_seen": 73947296, "step": 34260 }, { "epoch": 5.589722675367048, "grad_norm": 0.072971411049366, "learning_rate": 2.421857284609837e-05, "loss": 0.0053, "num_input_tokens_seen": 73955680, "step": 34265 }, { "epoch": 5.5905383360522025, "grad_norm": 3.2490286827087402, "learning_rate": 2.42114583736036e-05, "loss": 0.02, "num_input_tokens_seen": 73966304, "step": 34270 }, { "epoch": 5.591353996737357, "grad_norm": 0.03564507141709328, "learning_rate": 2.4204343965032015e-05, "loss": 0.0049, "num_input_tokens_seen": 73977120, "step": 34275 }, { "epoch": 5.592169657422512, "grad_norm": 0.06907259672880173, "learning_rate": 2.4197229620960347e-05, "loss": 0.0692, "num_input_tokens_seen": 73987648, "step": 34280 }, { "epoch": 5.592985318107667, "grad_norm": 0.8568238615989685, "learning_rate": 2.4190115341965316e-05, "loss": 0.0646, "num_input_tokens_seen": 73997312, "step": 34285 }, { "epoch": 5.593800978792823, "grad_norm": 0.08649122714996338, "learning_rate": 2.418300112862365e-05, "loss": 0.1882, "num_input_tokens_seen": 74009056, "step": 34290 }, { "epoch": 5.5946166394779775, "grad_norm": 6.20982027053833, "learning_rate": 2.4175886981512054e-05, "loss": 0.0242, "num_input_tokens_seen": 74019488, "step": 34295 }, { "epoch": 5.595432300163132, "grad_norm": 0.0463801734149456, "learning_rate": 2.416877290120724e-05, "loss": 0.0038, "num_input_tokens_seen": 74030880, "step": 34300 }, { "epoch": 5.596247960848287, "grad_norm": 0.05491442233324051, "learning_rate": 2.4161658888285916e-05, "loss": 0.1812, "num_input_tokens_seen": 74042080, "step": 34305 }, { "epoch": 5.597063621533442, "grad_norm": 0.015205527655780315, "learning_rate": 2.4154544943324772e-05, "loss": 0.0934, "num_input_tokens_seen": 74052448, "step": 34310 }, { "epoch": 5.597879282218597, "grad_norm": 11.813061714172363, "learning_rate": 2.414743106690051e-05, "loss": 0.0904, "num_input_tokens_seen": 74063392, "step": 34315 }, { "epoch": 5.598694942903752, "grad_norm": 14.201874732971191, "learning_rate": 2.41403172595898e-05, "loss": 0.1326, "num_input_tokens_seen": 74074720, "step": 34320 }, { "epoch": 5.599510603588907, "grad_norm": 4.2750959396362305, "learning_rate": 2.413320352196934e-05, "loss": 0.2027, "num_input_tokens_seen": 74084512, "step": 34325 }, { "epoch": 5.600326264274062, "grad_norm": 0.09404604882001877, "learning_rate": 2.4126089854615802e-05, "loss": 0.0753, "num_input_tokens_seen": 74095520, "step": 34330 }, { "epoch": 5.601141924959217, "grad_norm": 0.12843450903892517, "learning_rate": 2.411897625810586e-05, "loss": 0.1805, "num_input_tokens_seen": 74105792, "step": 34335 }, { "epoch": 5.601957585644372, "grad_norm": 0.033789101988077164, "learning_rate": 2.4111862733016164e-05, "loss": 0.0054, "num_input_tokens_seen": 74116224, "step": 34340 }, { "epoch": 5.602773246329527, "grad_norm": 3.307924509048462, "learning_rate": 2.4104749279923383e-05, "loss": 0.0375, "num_input_tokens_seen": 74127232, "step": 34345 }, { "epoch": 5.603588907014682, "grad_norm": 3.4159066677093506, "learning_rate": 2.409763589940417e-05, "loss": 0.1128, "num_input_tokens_seen": 74137088, "step": 34350 }, { "epoch": 5.604404567699837, "grad_norm": 0.04170740395784378, "learning_rate": 2.4090522592035172e-05, "loss": 0.0971, "num_input_tokens_seen": 74148128, "step": 34355 }, { "epoch": 5.605220228384992, "grad_norm": 0.06444224715232849, "learning_rate": 2.408340935839303e-05, "loss": 0.0072, "num_input_tokens_seen": 74158432, "step": 34360 }, { "epoch": 5.606035889070147, "grad_norm": 9.898969650268555, "learning_rate": 2.407629619905437e-05, "loss": 0.1866, "num_input_tokens_seen": 74169376, "step": 34365 }, { "epoch": 5.6068515497553015, "grad_norm": 4.78087043762207, "learning_rate": 2.406918311459583e-05, "loss": 0.0765, "num_input_tokens_seen": 74180352, "step": 34370 }, { "epoch": 5.607667210440457, "grad_norm": 0.11153988540172577, "learning_rate": 2.406207010559403e-05, "loss": 0.0068, "num_input_tokens_seen": 74190304, "step": 34375 }, { "epoch": 5.608482871125612, "grad_norm": 4.656994819641113, "learning_rate": 2.4054957172625584e-05, "loss": 0.2249, "num_input_tokens_seen": 74201504, "step": 34380 }, { "epoch": 5.609298531810767, "grad_norm": 3.565504550933838, "learning_rate": 2.4047844316267104e-05, "loss": 0.388, "num_input_tokens_seen": 74211968, "step": 34385 }, { "epoch": 5.610114192495922, "grad_norm": 0.014209095388650894, "learning_rate": 2.40407315370952e-05, "loss": 0.1318, "num_input_tokens_seen": 74223232, "step": 34390 }, { "epoch": 5.6109298531810765, "grad_norm": 3.1884076595306396, "learning_rate": 2.4033618835686462e-05, "loss": 0.1181, "num_input_tokens_seen": 74234272, "step": 34395 }, { "epoch": 5.611745513866231, "grad_norm": 0.039724018424749374, "learning_rate": 2.4026506212617485e-05, "loss": 0.0118, "num_input_tokens_seen": 74244608, "step": 34400 }, { "epoch": 5.612561174551386, "grad_norm": 4.550290107727051, "learning_rate": 2.4019393668464846e-05, "loss": 0.165, "num_input_tokens_seen": 74255104, "step": 34405 }, { "epoch": 5.613376835236542, "grad_norm": 0.07191260904073715, "learning_rate": 2.4012281203805138e-05, "loss": 0.008, "num_input_tokens_seen": 74265792, "step": 34410 }, { "epoch": 5.614192495921697, "grad_norm": 0.23330551385879517, "learning_rate": 2.4005168819214926e-05, "loss": 0.191, "num_input_tokens_seen": 74277120, "step": 34415 }, { "epoch": 5.6150081566068515, "grad_norm": 0.023702984675765038, "learning_rate": 2.3998056515270782e-05, "loss": 0.0083, "num_input_tokens_seen": 74286848, "step": 34420 }, { "epoch": 5.615823817292006, "grad_norm": 0.6659714579582214, "learning_rate": 2.3990944292549257e-05, "loss": 0.0075, "num_input_tokens_seen": 74296992, "step": 34425 }, { "epoch": 5.616639477977161, "grad_norm": 0.07955005764961243, "learning_rate": 2.3983832151626897e-05, "loss": 0.0786, "num_input_tokens_seen": 74307680, "step": 34430 }, { "epoch": 5.617455138662317, "grad_norm": 3.4622609615325928, "learning_rate": 2.397672009308027e-05, "loss": 0.3743, "num_input_tokens_seen": 74318368, "step": 34435 }, { "epoch": 5.618270799347472, "grad_norm": 0.051373761147260666, "learning_rate": 2.3969608117485906e-05, "loss": 0.0758, "num_input_tokens_seen": 74328544, "step": 34440 }, { "epoch": 5.6190864600326265, "grad_norm": 0.12117135524749756, "learning_rate": 2.3962496225420335e-05, "loss": 0.0345, "num_input_tokens_seen": 74339904, "step": 34445 }, { "epoch": 5.619902120717781, "grad_norm": 0.08636009693145752, "learning_rate": 2.3955384417460084e-05, "loss": 0.0128, "num_input_tokens_seen": 74350784, "step": 34450 }, { "epoch": 5.620717781402936, "grad_norm": 0.04565194249153137, "learning_rate": 2.3948272694181673e-05, "loss": 0.1465, "num_input_tokens_seen": 74361408, "step": 34455 }, { "epoch": 5.621533442088092, "grad_norm": 0.06937927007675171, "learning_rate": 2.3941161056161612e-05, "loss": 0.2219, "num_input_tokens_seen": 74371904, "step": 34460 }, { "epoch": 5.622349102773247, "grad_norm": 0.06344190984964371, "learning_rate": 2.393404950397641e-05, "loss": 0.2713, "num_input_tokens_seen": 74383232, "step": 34465 }, { "epoch": 5.623164763458401, "grad_norm": 0.1508975625038147, "learning_rate": 2.3926938038202565e-05, "loss": 0.0963, "num_input_tokens_seen": 74394240, "step": 34470 }, { "epoch": 5.623980424143556, "grad_norm": 0.060986749827861786, "learning_rate": 2.3919826659416564e-05, "loss": 0.0041, "num_input_tokens_seen": 74404160, "step": 34475 }, { "epoch": 5.624796084828711, "grad_norm": 9.748682975769043, "learning_rate": 2.3912715368194895e-05, "loss": 0.3195, "num_input_tokens_seen": 74415904, "step": 34480 }, { "epoch": 5.625611745513866, "grad_norm": 0.14794349670410156, "learning_rate": 2.3905604165114038e-05, "loss": 0.044, "num_input_tokens_seen": 74426336, "step": 34485 }, { "epoch": 5.626427406199021, "grad_norm": 0.2329591065645218, "learning_rate": 2.3898493050750453e-05, "loss": 0.0738, "num_input_tokens_seen": 74434592, "step": 34490 }, { "epoch": 5.627243066884176, "grad_norm": 0.11005127429962158, "learning_rate": 2.3891382025680616e-05, "loss": 0.0661, "num_input_tokens_seen": 74446464, "step": 34495 }, { "epoch": 5.628058727569331, "grad_norm": 0.14500364661216736, "learning_rate": 2.388427109048098e-05, "loss": 0.1189, "num_input_tokens_seen": 74457056, "step": 34500 }, { "epoch": 5.628874388254486, "grad_norm": 1.0034968852996826, "learning_rate": 2.3877160245727988e-05, "loss": 0.0059, "num_input_tokens_seen": 74467584, "step": 34505 }, { "epoch": 5.629690048939641, "grad_norm": 0.17740269005298615, "learning_rate": 2.3870049491998082e-05, "loss": 0.0201, "num_input_tokens_seen": 74477376, "step": 34510 }, { "epoch": 5.630505709624796, "grad_norm": 0.6840254664421082, "learning_rate": 2.3862938829867698e-05, "loss": 0.229, "num_input_tokens_seen": 74487648, "step": 34515 }, { "epoch": 5.631321370309951, "grad_norm": 4.9663166999816895, "learning_rate": 2.3855828259913262e-05, "loss": 0.0929, "num_input_tokens_seen": 74498496, "step": 34520 }, { "epoch": 5.632137030995106, "grad_norm": 13.157565116882324, "learning_rate": 2.3848717782711194e-05, "loss": 0.1202, "num_input_tokens_seen": 74509664, "step": 34525 }, { "epoch": 5.632952691680261, "grad_norm": 0.19897565245628357, "learning_rate": 2.3841607398837902e-05, "loss": 0.0507, "num_input_tokens_seen": 74519808, "step": 34530 }, { "epoch": 5.633768352365416, "grad_norm": 2.7781496047973633, "learning_rate": 2.3834497108869797e-05, "loss": 0.0264, "num_input_tokens_seen": 74531840, "step": 34535 }, { "epoch": 5.634584013050571, "grad_norm": 0.18439719080924988, "learning_rate": 2.3827386913383254e-05, "loss": 0.1879, "num_input_tokens_seen": 74543168, "step": 34540 }, { "epoch": 5.635399673735726, "grad_norm": 0.28179651498794556, "learning_rate": 2.3820276812954688e-05, "loss": 0.0074, "num_input_tokens_seen": 74553792, "step": 34545 }, { "epoch": 5.636215334420881, "grad_norm": 0.07636716961860657, "learning_rate": 2.3813166808160472e-05, "loss": 0.085, "num_input_tokens_seen": 74565312, "step": 34550 }, { "epoch": 5.637030995106036, "grad_norm": 0.07693570852279663, "learning_rate": 2.3806056899576978e-05, "loss": 0.3763, "num_input_tokens_seen": 74576256, "step": 34555 }, { "epoch": 5.637846655791191, "grad_norm": 0.07625547051429749, "learning_rate": 2.3798947087780567e-05, "loss": 0.1703, "num_input_tokens_seen": 74586560, "step": 34560 }, { "epoch": 5.638662316476346, "grad_norm": 7.158646106719971, "learning_rate": 2.37918373733476e-05, "loss": 0.0801, "num_input_tokens_seen": 74597792, "step": 34565 }, { "epoch": 5.6394779771615005, "grad_norm": 33.091461181640625, "learning_rate": 2.3784727756854425e-05, "loss": 0.3652, "num_input_tokens_seen": 74609504, "step": 34570 }, { "epoch": 5.640293637846656, "grad_norm": 0.06495015323162079, "learning_rate": 2.377761823887738e-05, "loss": 0.1862, "num_input_tokens_seen": 74620160, "step": 34575 }, { "epoch": 5.641109298531811, "grad_norm": 23.01750946044922, "learning_rate": 2.3770508819992807e-05, "loss": 0.0941, "num_input_tokens_seen": 74631072, "step": 34580 }, { "epoch": 5.641924959216966, "grad_norm": 0.04679006710648537, "learning_rate": 2.376339950077703e-05, "loss": 0.0067, "num_input_tokens_seen": 74641088, "step": 34585 }, { "epoch": 5.642740619902121, "grad_norm": 0.220841184258461, "learning_rate": 2.3756290281806358e-05, "loss": 0.0244, "num_input_tokens_seen": 74650560, "step": 34590 }, { "epoch": 5.643556280587275, "grad_norm": 0.10817943513393402, "learning_rate": 2.3749181163657114e-05, "loss": 0.1005, "num_input_tokens_seen": 74660608, "step": 34595 }, { "epoch": 5.64437194127243, "grad_norm": 0.15172620117664337, "learning_rate": 2.3742072146905587e-05, "loss": 0.0074, "num_input_tokens_seen": 74672064, "step": 34600 }, { "epoch": 5.645187601957586, "grad_norm": 4.3850908279418945, "learning_rate": 2.3734963232128072e-05, "loss": 0.0824, "num_input_tokens_seen": 74683200, "step": 34605 }, { "epoch": 5.646003262642741, "grad_norm": 0.14718104898929596, "learning_rate": 2.372785441990086e-05, "loss": 0.1096, "num_input_tokens_seen": 74694016, "step": 34610 }, { "epoch": 5.646818923327896, "grad_norm": 1.600773811340332, "learning_rate": 2.3720745710800225e-05, "loss": 0.4462, "num_input_tokens_seen": 74704096, "step": 34615 }, { "epoch": 5.64763458401305, "grad_norm": 0.0689341202378273, "learning_rate": 2.371363710540243e-05, "loss": 0.1558, "num_input_tokens_seen": 74714528, "step": 34620 }, { "epoch": 5.648450244698205, "grad_norm": 0.4660826027393341, "learning_rate": 2.370652860428374e-05, "loss": 0.281, "num_input_tokens_seen": 74725888, "step": 34625 }, { "epoch": 5.649265905383361, "grad_norm": 0.4790792167186737, "learning_rate": 2.3699420208020403e-05, "loss": 0.0261, "num_input_tokens_seen": 74736384, "step": 34630 }, { "epoch": 5.650081566068516, "grad_norm": 3.3586387634277344, "learning_rate": 2.3692311917188658e-05, "loss": 0.2205, "num_input_tokens_seen": 74748064, "step": 34635 }, { "epoch": 5.650897226753671, "grad_norm": 0.03131727874279022, "learning_rate": 2.3685203732364754e-05, "loss": 0.0594, "num_input_tokens_seen": 74758720, "step": 34640 }, { "epoch": 5.651712887438825, "grad_norm": 0.06394805014133453, "learning_rate": 2.3678095654124893e-05, "loss": 0.0058, "num_input_tokens_seen": 74770208, "step": 34645 }, { "epoch": 5.65252854812398, "grad_norm": 0.09772054105997086, "learning_rate": 2.3670987683045317e-05, "loss": 0.0046, "num_input_tokens_seen": 74781568, "step": 34650 }, { "epoch": 5.653344208809135, "grad_norm": 0.07734900712966919, "learning_rate": 2.366387981970222e-05, "loss": 0.1028, "num_input_tokens_seen": 74792608, "step": 34655 }, { "epoch": 5.654159869494291, "grad_norm": 0.30056944489479065, "learning_rate": 2.36567720646718e-05, "loss": 0.0053, "num_input_tokens_seen": 74801760, "step": 34660 }, { "epoch": 5.6549755301794455, "grad_norm": 0.09153929352760315, "learning_rate": 2.3649664418530258e-05, "loss": 0.0277, "num_input_tokens_seen": 74812256, "step": 34665 }, { "epoch": 5.6557911908646, "grad_norm": 0.07793822884559631, "learning_rate": 2.364255688185377e-05, "loss": 0.0704, "num_input_tokens_seen": 74822848, "step": 34670 }, { "epoch": 5.656606851549755, "grad_norm": 0.06883666664361954, "learning_rate": 2.3635449455218506e-05, "loss": 0.0998, "num_input_tokens_seen": 74832992, "step": 34675 }, { "epoch": 5.65742251223491, "grad_norm": 0.08330200612545013, "learning_rate": 2.3628342139200636e-05, "loss": 0.0607, "num_input_tokens_seen": 74843136, "step": 34680 }, { "epoch": 5.658238172920065, "grad_norm": 0.017148641869425774, "learning_rate": 2.362123493437631e-05, "loss": 0.0042, "num_input_tokens_seen": 74852800, "step": 34685 }, { "epoch": 5.6590538336052205, "grad_norm": 0.19309087097644806, "learning_rate": 2.3614127841321677e-05, "loss": 0.073, "num_input_tokens_seen": 74864512, "step": 34690 }, { "epoch": 5.659869494290375, "grad_norm": 6.053650856018066, "learning_rate": 2.3607020860612872e-05, "loss": 0.0083, "num_input_tokens_seen": 74875936, "step": 34695 }, { "epoch": 5.66068515497553, "grad_norm": 0.07050441205501556, "learning_rate": 2.3599913992826023e-05, "loss": 0.1225, "num_input_tokens_seen": 74885504, "step": 34700 }, { "epoch": 5.661500815660685, "grad_norm": 10.587239265441895, "learning_rate": 2.3592807238537253e-05, "loss": 0.1015, "num_input_tokens_seen": 74897632, "step": 34705 }, { "epoch": 5.66231647634584, "grad_norm": 5.930193901062012, "learning_rate": 2.3585700598322665e-05, "loss": 0.0922, "num_input_tokens_seen": 74908672, "step": 34710 }, { "epoch": 5.6631321370309955, "grad_norm": 3.4469075202941895, "learning_rate": 2.3578594072758363e-05, "loss": 0.1463, "num_input_tokens_seen": 74919104, "step": 34715 }, { "epoch": 5.66394779771615, "grad_norm": 10.311992645263672, "learning_rate": 2.3571487662420433e-05, "loss": 0.0571, "num_input_tokens_seen": 74929952, "step": 34720 }, { "epoch": 5.664763458401305, "grad_norm": 11.221634864807129, "learning_rate": 2.3564381367884965e-05, "loss": 0.1078, "num_input_tokens_seen": 74941760, "step": 34725 }, { "epoch": 5.66557911908646, "grad_norm": 0.6332096457481384, "learning_rate": 2.3557275189728032e-05, "loss": 0.0771, "num_input_tokens_seen": 74952000, "step": 34730 }, { "epoch": 5.666394779771615, "grad_norm": 0.05423618480563164, "learning_rate": 2.3550169128525688e-05, "loss": 0.0086, "num_input_tokens_seen": 74963008, "step": 34735 }, { "epoch": 5.6672104404567705, "grad_norm": 1.9471640586853027, "learning_rate": 2.3543063184853994e-05, "loss": 0.0095, "num_input_tokens_seen": 74973408, "step": 34740 }, { "epoch": 5.668026101141925, "grad_norm": 0.691474974155426, "learning_rate": 2.353595735928899e-05, "loss": 0.4548, "num_input_tokens_seen": 74983744, "step": 34745 }, { "epoch": 5.66884176182708, "grad_norm": 0.10274942964315414, "learning_rate": 2.3528851652406697e-05, "loss": 0.1179, "num_input_tokens_seen": 74993952, "step": 34750 }, { "epoch": 5.669657422512235, "grad_norm": 0.07681412994861603, "learning_rate": 2.3521746064783168e-05, "loss": 0.0111, "num_input_tokens_seen": 75004128, "step": 34755 }, { "epoch": 5.67047308319739, "grad_norm": 2.1961638927459717, "learning_rate": 2.3514640596994404e-05, "loss": 0.0706, "num_input_tokens_seen": 75014496, "step": 34760 }, { "epoch": 5.671288743882545, "grad_norm": 0.0980810821056366, "learning_rate": 2.350753524961641e-05, "loss": 0.3276, "num_input_tokens_seen": 75024704, "step": 34765 }, { "epoch": 5.672104404567699, "grad_norm": 0.07521381974220276, "learning_rate": 2.3500430023225174e-05, "loss": 0.0969, "num_input_tokens_seen": 75035488, "step": 34770 }, { "epoch": 5.672920065252855, "grad_norm": 0.062481507658958435, "learning_rate": 2.3493324918396696e-05, "loss": 0.0655, "num_input_tokens_seen": 75048192, "step": 34775 }, { "epoch": 5.67373572593801, "grad_norm": 0.060710132122039795, "learning_rate": 2.3486219935706944e-05, "loss": 0.1326, "num_input_tokens_seen": 75059680, "step": 34780 }, { "epoch": 5.674551386623165, "grad_norm": 6.397144794464111, "learning_rate": 2.3479115075731886e-05, "loss": 0.0355, "num_input_tokens_seen": 75069696, "step": 34785 }, { "epoch": 5.6753670473083195, "grad_norm": 0.37986424565315247, "learning_rate": 2.3472010339047474e-05, "loss": 0.292, "num_input_tokens_seen": 75080512, "step": 34790 }, { "epoch": 5.676182707993474, "grad_norm": 1.0515350103378296, "learning_rate": 2.3464905726229657e-05, "loss": 0.1161, "num_input_tokens_seen": 75092000, "step": 34795 }, { "epoch": 5.67699836867863, "grad_norm": 0.05977501720190048, "learning_rate": 2.3457801237854367e-05, "loss": 0.114, "num_input_tokens_seen": 75102208, "step": 34800 }, { "epoch": 5.677814029363785, "grad_norm": 0.07490808516740799, "learning_rate": 2.345069687449754e-05, "loss": 0.1285, "num_input_tokens_seen": 75112960, "step": 34805 }, { "epoch": 5.67862969004894, "grad_norm": 0.1772504597902298, "learning_rate": 2.3443592636735085e-05, "loss": 0.0438, "num_input_tokens_seen": 75125824, "step": 34810 }, { "epoch": 5.6794453507340945, "grad_norm": 0.06352439522743225, "learning_rate": 2.3436488525142906e-05, "loss": 0.151, "num_input_tokens_seen": 75136896, "step": 34815 }, { "epoch": 5.680261011419249, "grad_norm": 8.625276565551758, "learning_rate": 2.3429384540296902e-05, "loss": 0.0414, "num_input_tokens_seen": 75146528, "step": 34820 }, { "epoch": 5.681076672104405, "grad_norm": 0.10543076694011688, "learning_rate": 2.3422280682772953e-05, "loss": 0.0634, "num_input_tokens_seen": 75157184, "step": 34825 }, { "epoch": 5.68189233278956, "grad_norm": 9.880266189575195, "learning_rate": 2.341517695314694e-05, "loss": 0.0378, "num_input_tokens_seen": 75166016, "step": 34830 }, { "epoch": 5.682707993474715, "grad_norm": 0.06467192620038986, "learning_rate": 2.3408073351994726e-05, "loss": 0.2627, "num_input_tokens_seen": 75177824, "step": 34835 }, { "epoch": 5.6835236541598695, "grad_norm": 0.27344152331352234, "learning_rate": 2.340096987989216e-05, "loss": 0.1237, "num_input_tokens_seen": 75188128, "step": 34840 }, { "epoch": 5.684339314845024, "grad_norm": 0.054349880665540695, "learning_rate": 2.3393866537415093e-05, "loss": 0.064, "num_input_tokens_seen": 75198016, "step": 34845 }, { "epoch": 5.685154975530179, "grad_norm": 0.05116664618253708, "learning_rate": 2.3386763325139353e-05, "loss": 0.0093, "num_input_tokens_seen": 75208768, "step": 34850 }, { "epoch": 5.685970636215334, "grad_norm": 0.057738080620765686, "learning_rate": 2.337966024364076e-05, "loss": 0.0105, "num_input_tokens_seen": 75220064, "step": 34855 }, { "epoch": 5.68678629690049, "grad_norm": 3.193617582321167, "learning_rate": 2.337255729349512e-05, "loss": 0.0188, "num_input_tokens_seen": 75231552, "step": 34860 }, { "epoch": 5.6876019575856445, "grad_norm": 1.0187528133392334, "learning_rate": 2.3365454475278257e-05, "loss": 0.0601, "num_input_tokens_seen": 75242080, "step": 34865 }, { "epoch": 5.688417618270799, "grad_norm": 0.03958727419376373, "learning_rate": 2.3358351789565945e-05, "loss": 0.0723, "num_input_tokens_seen": 75253312, "step": 34870 }, { "epoch": 5.689233278955954, "grad_norm": 0.12234574556350708, "learning_rate": 2.335124923693397e-05, "loss": 0.0066, "num_input_tokens_seen": 75265376, "step": 34875 }, { "epoch": 5.690048939641109, "grad_norm": 0.1843361258506775, "learning_rate": 2.334414681795809e-05, "loss": 0.2009, "num_input_tokens_seen": 75275872, "step": 34880 }, { "epoch": 5.690864600326265, "grad_norm": 0.19655779004096985, "learning_rate": 2.3337044533214068e-05, "loss": 0.005, "num_input_tokens_seen": 75288096, "step": 34885 }, { "epoch": 5.691680261011419, "grad_norm": 0.36933252215385437, "learning_rate": 2.3329942383277665e-05, "loss": 0.0082, "num_input_tokens_seen": 75298112, "step": 34890 }, { "epoch": 5.692495921696574, "grad_norm": 0.1846073865890503, "learning_rate": 2.3322840368724598e-05, "loss": 0.1659, "num_input_tokens_seen": 75308992, "step": 34895 }, { "epoch": 5.693311582381729, "grad_norm": 0.1440029889345169, "learning_rate": 2.3315738490130606e-05, "loss": 0.1509, "num_input_tokens_seen": 75320352, "step": 34900 }, { "epoch": 5.694127243066884, "grad_norm": 0.1343105584383011, "learning_rate": 2.3308636748071395e-05, "loss": 0.0064, "num_input_tokens_seen": 75333056, "step": 34905 }, { "epoch": 5.69494290375204, "grad_norm": 0.07324358075857162, "learning_rate": 2.3301535143122675e-05, "loss": 0.1334, "num_input_tokens_seen": 75345248, "step": 34910 }, { "epoch": 5.695758564437194, "grad_norm": 10.516416549682617, "learning_rate": 2.3294433675860134e-05, "loss": 0.2258, "num_input_tokens_seen": 75356576, "step": 34915 }, { "epoch": 5.696574225122349, "grad_norm": 0.055170800536870956, "learning_rate": 2.328733234685945e-05, "loss": 0.0173, "num_input_tokens_seen": 75368000, "step": 34920 }, { "epoch": 5.697389885807504, "grad_norm": 1.2303236722946167, "learning_rate": 2.3280231156696297e-05, "loss": 0.019, "num_input_tokens_seen": 75378208, "step": 34925 }, { "epoch": 5.698205546492659, "grad_norm": 6.7660231590271, "learning_rate": 2.3273130105946333e-05, "loss": 0.087, "num_input_tokens_seen": 75390112, "step": 34930 }, { "epoch": 5.699021207177814, "grad_norm": 0.06041080877184868, "learning_rate": 2.3266029195185204e-05, "loss": 0.0052, "num_input_tokens_seen": 75401824, "step": 34935 }, { "epoch": 5.699836867862969, "grad_norm": 2.1426451206207275, "learning_rate": 2.3258928424988548e-05, "loss": 0.1177, "num_input_tokens_seen": 75412384, "step": 34940 }, { "epoch": 5.700652528548124, "grad_norm": 0.7164055109024048, "learning_rate": 2.325182779593198e-05, "loss": 0.007, "num_input_tokens_seen": 75423648, "step": 34945 }, { "epoch": 5.701468189233279, "grad_norm": 5.581310272216797, "learning_rate": 2.3244727308591126e-05, "loss": 0.2702, "num_input_tokens_seen": 75435232, "step": 34950 }, { "epoch": 5.702283849918434, "grad_norm": 1.0950038433074951, "learning_rate": 2.3237626963541588e-05, "loss": 0.0081, "num_input_tokens_seen": 75445952, "step": 34955 }, { "epoch": 5.703099510603589, "grad_norm": 0.08078707754611969, "learning_rate": 2.3230526761358944e-05, "loss": 0.0047, "num_input_tokens_seen": 75457696, "step": 34960 }, { "epoch": 5.7039151712887435, "grad_norm": 0.10439500957727432, "learning_rate": 2.3223426702618776e-05, "loss": 0.0796, "num_input_tokens_seen": 75467424, "step": 34965 }, { "epoch": 5.704730831973899, "grad_norm": 0.06901131570339203, "learning_rate": 2.3216326787896652e-05, "loss": 0.1305, "num_input_tokens_seen": 75476928, "step": 34970 }, { "epoch": 5.705546492659054, "grad_norm": 11.365278244018555, "learning_rate": 2.3209227017768137e-05, "loss": 0.0197, "num_input_tokens_seen": 75486624, "step": 34975 }, { "epoch": 5.706362153344209, "grad_norm": 0.10638049244880676, "learning_rate": 2.3202127392808768e-05, "loss": 0.1906, "num_input_tokens_seen": 75497888, "step": 34980 }, { "epoch": 5.707177814029364, "grad_norm": 0.10540467500686646, "learning_rate": 2.319502791359407e-05, "loss": 0.0617, "num_input_tokens_seen": 75508352, "step": 34985 }, { "epoch": 5.7079934747145185, "grad_norm": 0.16034981608390808, "learning_rate": 2.3187928580699573e-05, "loss": 0.0047, "num_input_tokens_seen": 75520416, "step": 34990 }, { "epoch": 5.708809135399674, "grad_norm": 0.09266732633113861, "learning_rate": 2.3180829394700775e-05, "loss": 0.1377, "num_input_tokens_seen": 75531168, "step": 34995 }, { "epoch": 5.709624796084829, "grad_norm": 0.029344908893108368, "learning_rate": 2.317373035617318e-05, "loss": 0.0652, "num_input_tokens_seen": 75541632, "step": 35000 }, { "epoch": 5.710440456769984, "grad_norm": 0.09086069464683533, "learning_rate": 2.3166631465692264e-05, "loss": 0.07, "num_input_tokens_seen": 75552224, "step": 35005 }, { "epoch": 5.711256117455139, "grad_norm": 0.066799096763134, "learning_rate": 2.3159532723833508e-05, "loss": 0.0038, "num_input_tokens_seen": 75562080, "step": 35010 }, { "epoch": 5.712071778140293, "grad_norm": 0.09885165840387344, "learning_rate": 2.3152434131172368e-05, "loss": 0.0062, "num_input_tokens_seen": 75571616, "step": 35015 }, { "epoch": 5.712887438825448, "grad_norm": 0.08804667741060257, "learning_rate": 2.3145335688284288e-05, "loss": 0.0512, "num_input_tokens_seen": 75581952, "step": 35020 }, { "epoch": 5.713703099510604, "grad_norm": 0.07176681607961655, "learning_rate": 2.3138237395744712e-05, "loss": 0.0781, "num_input_tokens_seen": 75592448, "step": 35025 }, { "epoch": 5.714518760195759, "grad_norm": 0.2840748727321625, "learning_rate": 2.313113925412905e-05, "loss": 0.0096, "num_input_tokens_seen": 75603872, "step": 35030 }, { "epoch": 5.715334420880914, "grad_norm": 0.04150667041540146, "learning_rate": 2.312404126401273e-05, "loss": 0.1128, "num_input_tokens_seen": 75614752, "step": 35035 }, { "epoch": 5.716150081566068, "grad_norm": 0.07693462073802948, "learning_rate": 2.3116943425971144e-05, "loss": 0.004, "num_input_tokens_seen": 75625600, "step": 35040 }, { "epoch": 5.716965742251223, "grad_norm": 5.008595943450928, "learning_rate": 2.3109845740579676e-05, "loss": 0.2178, "num_input_tokens_seen": 75635648, "step": 35045 }, { "epoch": 5.717781402936378, "grad_norm": 0.06779158115386963, "learning_rate": 2.3102748208413706e-05, "loss": 0.089, "num_input_tokens_seen": 75647680, "step": 35050 }, { "epoch": 5.718597063621534, "grad_norm": 0.12235770374536514, "learning_rate": 2.3095650830048595e-05, "loss": 0.0835, "num_input_tokens_seen": 75658080, "step": 35055 }, { "epoch": 5.719412724306689, "grad_norm": 0.05873510614037514, "learning_rate": 2.3088553606059686e-05, "loss": 0.1574, "num_input_tokens_seen": 75669920, "step": 35060 }, { "epoch": 5.720228384991843, "grad_norm": 0.04165768623352051, "learning_rate": 2.308145653702232e-05, "loss": 0.0149, "num_input_tokens_seen": 75681792, "step": 35065 }, { "epoch": 5.721044045676998, "grad_norm": 0.04350676387548447, "learning_rate": 2.307435962351181e-05, "loss": 0.0038, "num_input_tokens_seen": 75692992, "step": 35070 }, { "epoch": 5.721859706362153, "grad_norm": 0.036871396005153656, "learning_rate": 2.3067262866103492e-05, "loss": 0.0045, "num_input_tokens_seen": 75704608, "step": 35075 }, { "epoch": 5.722675367047309, "grad_norm": 0.3714873790740967, "learning_rate": 2.3060166265372654e-05, "loss": 0.0238, "num_input_tokens_seen": 75715328, "step": 35080 }, { "epoch": 5.7234910277324635, "grad_norm": 3.1402053833007812, "learning_rate": 2.3053069821894578e-05, "loss": 0.1874, "num_input_tokens_seen": 75725376, "step": 35085 }, { "epoch": 5.724306688417618, "grad_norm": 0.044017672538757324, "learning_rate": 2.3045973536244543e-05, "loss": 0.0289, "num_input_tokens_seen": 75736512, "step": 35090 }, { "epoch": 5.725122349102773, "grad_norm": 0.032331086695194244, "learning_rate": 2.303887740899781e-05, "loss": 0.0105, "num_input_tokens_seen": 75747552, "step": 35095 }, { "epoch": 5.725938009787928, "grad_norm": 2.916252613067627, "learning_rate": 2.3031781440729623e-05, "loss": 0.1272, "num_input_tokens_seen": 75759136, "step": 35100 }, { "epoch": 5.726753670473083, "grad_norm": 0.052721064537763596, "learning_rate": 2.3024685632015218e-05, "loss": 0.1054, "num_input_tokens_seen": 75770208, "step": 35105 }, { "epoch": 5.7275693311582385, "grad_norm": 0.08456026017665863, "learning_rate": 2.3017589983429817e-05, "loss": 0.0069, "num_input_tokens_seen": 75780800, "step": 35110 }, { "epoch": 5.728384991843393, "grad_norm": 0.09905166923999786, "learning_rate": 2.301049449554863e-05, "loss": 0.0044, "num_input_tokens_seen": 75791328, "step": 35115 }, { "epoch": 5.729200652528548, "grad_norm": 0.20791390538215637, "learning_rate": 2.3003399168946855e-05, "loss": 0.3981, "num_input_tokens_seen": 75802688, "step": 35120 }, { "epoch": 5.730016313213703, "grad_norm": 0.06286616623401642, "learning_rate": 2.2996304004199677e-05, "loss": 0.1911, "num_input_tokens_seen": 75812640, "step": 35125 }, { "epoch": 5.730831973898858, "grad_norm": 0.29400545358657837, "learning_rate": 2.298920900188226e-05, "loss": 0.0224, "num_input_tokens_seen": 75823680, "step": 35130 }, { "epoch": 5.731647634584013, "grad_norm": 0.09266781806945801, "learning_rate": 2.2982114162569766e-05, "loss": 0.0076, "num_input_tokens_seen": 75834208, "step": 35135 }, { "epoch": 5.732463295269168, "grad_norm": 0.027921630069613457, "learning_rate": 2.2975019486837334e-05, "loss": 0.0931, "num_input_tokens_seen": 75846432, "step": 35140 }, { "epoch": 5.733278955954323, "grad_norm": 0.08629529178142548, "learning_rate": 2.29679249752601e-05, "loss": 0.202, "num_input_tokens_seen": 75856960, "step": 35145 }, { "epoch": 5.734094616639478, "grad_norm": 0.16682440042495728, "learning_rate": 2.2960830628413175e-05, "loss": 0.0054, "num_input_tokens_seen": 75868256, "step": 35150 }, { "epoch": 5.734910277324633, "grad_norm": 0.09931493550539017, "learning_rate": 2.295373644687167e-05, "loss": 0.0618, "num_input_tokens_seen": 75879808, "step": 35155 }, { "epoch": 5.735725938009788, "grad_norm": 5.3806843757629395, "learning_rate": 2.294664243121067e-05, "loss": 0.3896, "num_input_tokens_seen": 75891168, "step": 35160 }, { "epoch": 5.736541598694943, "grad_norm": 0.0984605923295021, "learning_rate": 2.2939548582005253e-05, "loss": 0.2172, "num_input_tokens_seen": 75902080, "step": 35165 }, { "epoch": 5.737357259380098, "grad_norm": 1.2241793870925903, "learning_rate": 2.293245489983048e-05, "loss": 0.014, "num_input_tokens_seen": 75913568, "step": 35170 }, { "epoch": 5.738172920065253, "grad_norm": 0.05884980410337448, "learning_rate": 2.2925361385261402e-05, "loss": 0.1283, "num_input_tokens_seen": 75923552, "step": 35175 }, { "epoch": 5.738988580750408, "grad_norm": 14.16159439086914, "learning_rate": 2.2918268038873055e-05, "loss": 0.1801, "num_input_tokens_seen": 75933856, "step": 35180 }, { "epoch": 5.739804241435563, "grad_norm": 0.08795402199029922, "learning_rate": 2.291117486124047e-05, "loss": 0.1888, "num_input_tokens_seen": 75944896, "step": 35185 }, { "epoch": 5.740619902120718, "grad_norm": 2.2796897888183594, "learning_rate": 2.290408185293865e-05, "loss": 0.1177, "num_input_tokens_seen": 75954880, "step": 35190 }, { "epoch": 5.741435562805873, "grad_norm": 0.11136547476053238, "learning_rate": 2.2896989014542584e-05, "loss": 0.0575, "num_input_tokens_seen": 75964800, "step": 35195 }, { "epoch": 5.742251223491028, "grad_norm": 5.454838752746582, "learning_rate": 2.2889896346627256e-05, "loss": 0.1566, "num_input_tokens_seen": 75975264, "step": 35200 }, { "epoch": 5.743066884176183, "grad_norm": 0.059465523809194565, "learning_rate": 2.2882803849767646e-05, "loss": 0.008, "num_input_tokens_seen": 75986976, "step": 35205 }, { "epoch": 5.7438825448613375, "grad_norm": 0.1806742250919342, "learning_rate": 2.2875711524538697e-05, "loss": 0.0201, "num_input_tokens_seen": 75998752, "step": 35210 }, { "epoch": 5.744698205546492, "grad_norm": 0.9701160788536072, "learning_rate": 2.2868619371515348e-05, "loss": 0.0964, "num_input_tokens_seen": 76010688, "step": 35215 }, { "epoch": 5.745513866231647, "grad_norm": 2.5485148429870605, "learning_rate": 2.2861527391272526e-05, "loss": 0.2846, "num_input_tokens_seen": 76021408, "step": 35220 }, { "epoch": 5.746329526916803, "grad_norm": 0.0871826633810997, "learning_rate": 2.285443558438515e-05, "loss": 0.1911, "num_input_tokens_seen": 76032480, "step": 35225 }, { "epoch": 5.747145187601958, "grad_norm": 2.6616318225860596, "learning_rate": 2.2847343951428106e-05, "loss": 0.1337, "num_input_tokens_seen": 76043072, "step": 35230 }, { "epoch": 5.7479608482871125, "grad_norm": 0.1521041989326477, "learning_rate": 2.284025249297629e-05, "loss": 0.1055, "num_input_tokens_seen": 76054208, "step": 35235 }, { "epoch": 5.748776508972267, "grad_norm": 0.1400117129087448, "learning_rate": 2.2833161209604557e-05, "loss": 0.0237, "num_input_tokens_seen": 76066080, "step": 35240 }, { "epoch": 5.749592169657422, "grad_norm": 8.059832572937012, "learning_rate": 2.2826070101887777e-05, "loss": 0.11, "num_input_tokens_seen": 76077088, "step": 35245 }, { "epoch": 5.750407830342578, "grad_norm": 0.08581706136465073, "learning_rate": 2.2818979170400785e-05, "loss": 0.1219, "num_input_tokens_seen": 76087104, "step": 35250 }, { "epoch": 5.751223491027733, "grad_norm": 4.261509418487549, "learning_rate": 2.2811888415718405e-05, "loss": 0.0208, "num_input_tokens_seen": 76098560, "step": 35255 }, { "epoch": 5.7520391517128875, "grad_norm": 0.7076281905174255, "learning_rate": 2.2804797838415448e-05, "loss": 0.0129, "num_input_tokens_seen": 76109472, "step": 35260 }, { "epoch": 5.752854812398042, "grad_norm": 6.184848308563232, "learning_rate": 2.2797707439066724e-05, "loss": 0.0707, "num_input_tokens_seen": 76119744, "step": 35265 }, { "epoch": 5.753670473083197, "grad_norm": 0.08045632392168045, "learning_rate": 2.2790617218247005e-05, "loss": 0.0201, "num_input_tokens_seen": 76129664, "step": 35270 }, { "epoch": 5.754486133768353, "grad_norm": 0.09801807254552841, "learning_rate": 2.278352717653107e-05, "loss": 0.0051, "num_input_tokens_seen": 76140384, "step": 35275 }, { "epoch": 5.755301794453508, "grad_norm": 2.68274188041687, "learning_rate": 2.2776437314493666e-05, "loss": 0.1434, "num_input_tokens_seen": 76150624, "step": 35280 }, { "epoch": 5.7561174551386625, "grad_norm": 0.15872056782245636, "learning_rate": 2.2769347632709523e-05, "loss": 0.0085, "num_input_tokens_seen": 76161952, "step": 35285 }, { "epoch": 5.756933115823817, "grad_norm": 0.19446246325969696, "learning_rate": 2.276225813175339e-05, "loss": 0.0688, "num_input_tokens_seen": 76173600, "step": 35290 }, { "epoch": 5.757748776508972, "grad_norm": 0.1902131289243698, "learning_rate": 2.275516881219997e-05, "loss": 0.0055, "num_input_tokens_seen": 76183808, "step": 35295 }, { "epoch": 5.758564437194127, "grad_norm": 3.389545202255249, "learning_rate": 2.2748079674623954e-05, "loss": 0.1351, "num_input_tokens_seen": 76194752, "step": 35300 }, { "epoch": 5.759380097879282, "grad_norm": 8.538814544677734, "learning_rate": 2.2740990719600026e-05, "loss": 0.1226, "num_input_tokens_seen": 76205696, "step": 35305 }, { "epoch": 5.760195758564437, "grad_norm": 0.15769486129283905, "learning_rate": 2.2733901947702852e-05, "loss": 0.1312, "num_input_tokens_seen": 76215072, "step": 35310 }, { "epoch": 5.761011419249592, "grad_norm": 0.24559558928012848, "learning_rate": 2.2726813359507084e-05, "loss": 0.1406, "num_input_tokens_seen": 76225248, "step": 35315 }, { "epoch": 5.761827079934747, "grad_norm": 7.306979179382324, "learning_rate": 2.271972495558736e-05, "loss": 0.1356, "num_input_tokens_seen": 76235552, "step": 35320 }, { "epoch": 5.762642740619902, "grad_norm": 3.5641539096832275, "learning_rate": 2.27126367365183e-05, "loss": 0.1759, "num_input_tokens_seen": 76246624, "step": 35325 }, { "epoch": 5.763458401305057, "grad_norm": 33.02423858642578, "learning_rate": 2.2705548702874512e-05, "loss": 0.1133, "num_input_tokens_seen": 76256352, "step": 35330 }, { "epoch": 5.764274061990212, "grad_norm": 0.10651031136512756, "learning_rate": 2.269846085523059e-05, "loss": 0.1168, "num_input_tokens_seen": 76266912, "step": 35335 }, { "epoch": 5.765089722675367, "grad_norm": 0.11089621484279633, "learning_rate": 2.2691373194161107e-05, "loss": 0.2604, "num_input_tokens_seen": 76277344, "step": 35340 }, { "epoch": 5.765905383360522, "grad_norm": 0.3724483847618103, "learning_rate": 2.2684285720240624e-05, "loss": 0.2363, "num_input_tokens_seen": 76288544, "step": 35345 }, { "epoch": 5.766721044045677, "grad_norm": 5.472805976867676, "learning_rate": 2.2677198434043695e-05, "loss": 0.1657, "num_input_tokens_seen": 76299744, "step": 35350 }, { "epoch": 5.767536704730832, "grad_norm": 0.9301806688308716, "learning_rate": 2.2670111336144844e-05, "loss": 0.1473, "num_input_tokens_seen": 76309280, "step": 35355 }, { "epoch": 5.768352365415987, "grad_norm": 0.2854974865913391, "learning_rate": 2.2663024427118592e-05, "loss": 0.071, "num_input_tokens_seen": 76319072, "step": 35360 }, { "epoch": 5.769168026101142, "grad_norm": 7.550900459289551, "learning_rate": 2.2655937707539437e-05, "loss": 0.0871, "num_input_tokens_seen": 76329696, "step": 35365 }, { "epoch": 5.769983686786297, "grad_norm": 0.12121465802192688, "learning_rate": 2.2648851177981868e-05, "loss": 0.1767, "num_input_tokens_seen": 76339904, "step": 35370 }, { "epoch": 5.770799347471452, "grad_norm": 0.18952710926532745, "learning_rate": 2.264176483902035e-05, "loss": 0.0636, "num_input_tokens_seen": 76351776, "step": 35375 }, { "epoch": 5.771615008156607, "grad_norm": 0.11591441184282303, "learning_rate": 2.263467869122934e-05, "loss": 0.1627, "num_input_tokens_seen": 76362208, "step": 35380 }, { "epoch": 5.7724306688417615, "grad_norm": 0.19922132790088654, "learning_rate": 2.262759273518327e-05, "loss": 0.0083, "num_input_tokens_seen": 76374624, "step": 35385 }, { "epoch": 5.773246329526917, "grad_norm": 0.07203475385904312, "learning_rate": 2.262050697145657e-05, "loss": 0.025, "num_input_tokens_seen": 76384064, "step": 35390 }, { "epoch": 5.774061990212072, "grad_norm": 8.112296104431152, "learning_rate": 2.2613421400623653e-05, "loss": 0.0815, "num_input_tokens_seen": 76394976, "step": 35395 }, { "epoch": 5.774877650897227, "grad_norm": 1.2074600458145142, "learning_rate": 2.2606336023258907e-05, "loss": 0.2279, "num_input_tokens_seen": 76407136, "step": 35400 }, { "epoch": 5.775693311582382, "grad_norm": 1.015274167060852, "learning_rate": 2.259925083993671e-05, "loss": 0.0182, "num_input_tokens_seen": 76417536, "step": 35405 }, { "epoch": 5.7765089722675365, "grad_norm": 0.033531554043293, "learning_rate": 2.2592165851231423e-05, "loss": 0.0066, "num_input_tokens_seen": 76428576, "step": 35410 }, { "epoch": 5.777324632952691, "grad_norm": 2.3847498893737793, "learning_rate": 2.2585081057717387e-05, "loss": 0.0937, "num_input_tokens_seen": 76439040, "step": 35415 }, { "epoch": 5.778140293637847, "grad_norm": 0.7646045088768005, "learning_rate": 2.2577996459968935e-05, "loss": 0.2439, "num_input_tokens_seen": 76450624, "step": 35420 }, { "epoch": 5.778955954323002, "grad_norm": 0.11400401592254639, "learning_rate": 2.2570912058560375e-05, "loss": 0.1651, "num_input_tokens_seen": 76459840, "step": 35425 }, { "epoch": 5.779771615008157, "grad_norm": 5.900925159454346, "learning_rate": 2.2563827854066007e-05, "loss": 0.2483, "num_input_tokens_seen": 76470688, "step": 35430 }, { "epoch": 5.780587275693311, "grad_norm": 0.12469127029180527, "learning_rate": 2.2556743847060118e-05, "loss": 0.2205, "num_input_tokens_seen": 76481728, "step": 35435 }, { "epoch": 5.781402936378466, "grad_norm": 4.723505020141602, "learning_rate": 2.254966003811697e-05, "loss": 0.1852, "num_input_tokens_seen": 76492032, "step": 35440 }, { "epoch": 5.782218597063622, "grad_norm": 0.492357075214386, "learning_rate": 2.2542576427810813e-05, "loss": 0.0147, "num_input_tokens_seen": 76502784, "step": 35445 }, { "epoch": 5.783034257748777, "grad_norm": 0.08330883085727692, "learning_rate": 2.2535493016715882e-05, "loss": 0.0254, "num_input_tokens_seen": 76513856, "step": 35450 }, { "epoch": 5.783849918433932, "grad_norm": 0.22224728763103485, "learning_rate": 2.2528409805406388e-05, "loss": 0.0112, "num_input_tokens_seen": 76525056, "step": 35455 }, { "epoch": 5.784665579119086, "grad_norm": 4.219114780426025, "learning_rate": 2.2521326794456537e-05, "loss": 0.1197, "num_input_tokens_seen": 76535584, "step": 35460 }, { "epoch": 5.785481239804241, "grad_norm": 12.117701530456543, "learning_rate": 2.2514243984440512e-05, "loss": 0.1731, "num_input_tokens_seen": 76547168, "step": 35465 }, { "epoch": 5.786296900489396, "grad_norm": 0.08506445586681366, "learning_rate": 2.2507161375932484e-05, "loss": 0.0174, "num_input_tokens_seen": 76557888, "step": 35470 }, { "epoch": 5.787112561174552, "grad_norm": 0.16435596346855164, "learning_rate": 2.25000789695066e-05, "loss": 0.0159, "num_input_tokens_seen": 76568864, "step": 35475 }, { "epoch": 5.787928221859707, "grad_norm": 0.21710620820522308, "learning_rate": 2.2492996765737004e-05, "loss": 0.012, "num_input_tokens_seen": 76579296, "step": 35480 }, { "epoch": 5.788743882544861, "grad_norm": 0.04540694132447243, "learning_rate": 2.2485914765197807e-05, "loss": 0.0091, "num_input_tokens_seen": 76589824, "step": 35485 }, { "epoch": 5.789559543230016, "grad_norm": 0.17728549242019653, "learning_rate": 2.247883296846311e-05, "loss": 0.1958, "num_input_tokens_seen": 76599744, "step": 35490 }, { "epoch": 5.790375203915171, "grad_norm": 0.3285267949104309, "learning_rate": 2.2471751376107006e-05, "loss": 0.0901, "num_input_tokens_seen": 76612000, "step": 35495 }, { "epoch": 5.791190864600326, "grad_norm": 0.1349720060825348, "learning_rate": 2.246466998870357e-05, "loss": 0.0504, "num_input_tokens_seen": 76623936, "step": 35500 }, { "epoch": 5.7920065252854815, "grad_norm": 0.16432398557662964, "learning_rate": 2.245758880682685e-05, "loss": 0.1017, "num_input_tokens_seen": 76634240, "step": 35505 }, { "epoch": 5.792822185970636, "grad_norm": 0.15062400698661804, "learning_rate": 2.2450507831050876e-05, "loss": 0.0936, "num_input_tokens_seen": 76645344, "step": 35510 }, { "epoch": 5.793637846655791, "grad_norm": 0.05078652873635292, "learning_rate": 2.2443427061949672e-05, "loss": 0.0044, "num_input_tokens_seen": 76656000, "step": 35515 }, { "epoch": 5.794453507340946, "grad_norm": 4.0731916427612305, "learning_rate": 2.2436346500097247e-05, "loss": 0.1775, "num_input_tokens_seen": 76666976, "step": 35520 }, { "epoch": 5.795269168026101, "grad_norm": 19.542570114135742, "learning_rate": 2.2429266146067582e-05, "loss": 0.1237, "num_input_tokens_seen": 76676192, "step": 35525 }, { "epoch": 5.7960848287112565, "grad_norm": 1.1487821340560913, "learning_rate": 2.242218600043465e-05, "loss": 0.0618, "num_input_tokens_seen": 76687104, "step": 35530 }, { "epoch": 5.796900489396411, "grad_norm": 12.550699234008789, "learning_rate": 2.2415106063772394e-05, "loss": 0.0512, "num_input_tokens_seen": 76699104, "step": 35535 }, { "epoch": 5.797716150081566, "grad_norm": 0.05929354205727577, "learning_rate": 2.240802633665476e-05, "loss": 0.2031, "num_input_tokens_seen": 76710464, "step": 35540 }, { "epoch": 5.798531810766721, "grad_norm": 0.07123995572328568, "learning_rate": 2.2400946819655663e-05, "loss": 0.009, "num_input_tokens_seen": 76720864, "step": 35545 }, { "epoch": 5.799347471451876, "grad_norm": 0.061252374202013016, "learning_rate": 2.2393867513349002e-05, "loss": 0.1249, "num_input_tokens_seen": 76731520, "step": 35550 }, { "epoch": 5.800163132137031, "grad_norm": 3.8870437145233154, "learning_rate": 2.238678841830867e-05, "loss": 0.1198, "num_input_tokens_seen": 76741536, "step": 35555 }, { "epoch": 5.800978792822186, "grad_norm": 0.067466601729393, "learning_rate": 2.2379709535108524e-05, "loss": 0.0693, "num_input_tokens_seen": 76751904, "step": 35560 }, { "epoch": 5.801794453507341, "grad_norm": 2.9204559326171875, "learning_rate": 2.2372630864322416e-05, "loss": 0.2492, "num_input_tokens_seen": 76761504, "step": 35565 }, { "epoch": 5.802610114192496, "grad_norm": 0.11660844832658768, "learning_rate": 2.2365552406524183e-05, "loss": 0.0035, "num_input_tokens_seen": 76771968, "step": 35570 }, { "epoch": 5.803425774877651, "grad_norm": 4.045225143432617, "learning_rate": 2.2358474162287635e-05, "loss": 0.1328, "num_input_tokens_seen": 76782432, "step": 35575 }, { "epoch": 5.804241435562806, "grad_norm": 10.254171371459961, "learning_rate": 2.235139613218658e-05, "loss": 0.1031, "num_input_tokens_seen": 76793152, "step": 35580 }, { "epoch": 5.80505709624796, "grad_norm": 9.46228313446045, "learning_rate": 2.234431831679479e-05, "loss": 0.1023, "num_input_tokens_seen": 76804000, "step": 35585 }, { "epoch": 5.805872756933116, "grad_norm": 0.13323760032653809, "learning_rate": 2.2337240716686035e-05, "loss": 0.0071, "num_input_tokens_seen": 76814432, "step": 35590 }, { "epoch": 5.806688417618271, "grad_norm": 3.207540512084961, "learning_rate": 2.2330163332434056e-05, "loss": 0.0341, "num_input_tokens_seen": 76825472, "step": 35595 }, { "epoch": 5.807504078303426, "grad_norm": 0.17430220544338226, "learning_rate": 2.2323086164612584e-05, "loss": 0.0048, "num_input_tokens_seen": 76837568, "step": 35600 }, { "epoch": 5.808319738988581, "grad_norm": 0.1959105283021927, "learning_rate": 2.2316009213795323e-05, "loss": 0.006, "num_input_tokens_seen": 76848768, "step": 35605 }, { "epoch": 5.809135399673735, "grad_norm": 3.450186014175415, "learning_rate": 2.230893248055598e-05, "loss": 0.0121, "num_input_tokens_seen": 76860000, "step": 35610 }, { "epoch": 5.809951060358891, "grad_norm": 0.04870615154504776, "learning_rate": 2.2301855965468226e-05, "loss": 0.0984, "num_input_tokens_seen": 76871904, "step": 35615 }, { "epoch": 5.810766721044046, "grad_norm": 0.1246744841337204, "learning_rate": 2.2294779669105716e-05, "loss": 0.0748, "num_input_tokens_seen": 76881504, "step": 35620 }, { "epoch": 5.811582381729201, "grad_norm": 6.546163082122803, "learning_rate": 2.2287703592042096e-05, "loss": 0.2706, "num_input_tokens_seen": 76894080, "step": 35625 }, { "epoch": 5.8123980424143555, "grad_norm": 0.32414501905441284, "learning_rate": 2.2280627734850984e-05, "loss": 0.0746, "num_input_tokens_seen": 76904672, "step": 35630 }, { "epoch": 5.81321370309951, "grad_norm": 0.26913467049598694, "learning_rate": 2.2273552098105983e-05, "loss": 0.0074, "num_input_tokens_seen": 76914976, "step": 35635 }, { "epoch": 5.814029363784666, "grad_norm": 15.564455032348633, "learning_rate": 2.2266476682380685e-05, "loss": 0.2012, "num_input_tokens_seen": 76925824, "step": 35640 }, { "epoch": 5.814845024469821, "grad_norm": 0.3071824312210083, "learning_rate": 2.2259401488248658e-05, "loss": 0.0614, "num_input_tokens_seen": 76936160, "step": 35645 }, { "epoch": 5.815660685154976, "grad_norm": 7.369630336761475, "learning_rate": 2.225232651628345e-05, "loss": 0.1465, "num_input_tokens_seen": 76946016, "step": 35650 }, { "epoch": 5.8164763458401305, "grad_norm": 0.2431706339120865, "learning_rate": 2.2245251767058595e-05, "loss": 0.0753, "num_input_tokens_seen": 76956704, "step": 35655 }, { "epoch": 5.817292006525285, "grad_norm": 0.23439978063106537, "learning_rate": 2.2238177241147607e-05, "loss": 0.0147, "num_input_tokens_seen": 76967360, "step": 35660 }, { "epoch": 5.81810766721044, "grad_norm": 0.7164652347564697, "learning_rate": 2.223110293912399e-05, "loss": 0.0953, "num_input_tokens_seen": 76979008, "step": 35665 }, { "epoch": 5.818923327895595, "grad_norm": 0.04254661127924919, "learning_rate": 2.2224028861561215e-05, "loss": 0.0941, "num_input_tokens_seen": 76988800, "step": 35670 }, { "epoch": 5.819738988580751, "grad_norm": 19.6538028717041, "learning_rate": 2.2216955009032747e-05, "loss": 0.2615, "num_input_tokens_seen": 76999616, "step": 35675 }, { "epoch": 5.8205546492659055, "grad_norm": 0.13810043036937714, "learning_rate": 2.2209881382112026e-05, "loss": 0.0046, "num_input_tokens_seen": 77010656, "step": 35680 }, { "epoch": 5.82137030995106, "grad_norm": 0.025951286777853966, "learning_rate": 2.220280798137248e-05, "loss": 0.0232, "num_input_tokens_seen": 77021920, "step": 35685 }, { "epoch": 5.822185970636215, "grad_norm": 6.147768497467041, "learning_rate": 2.219573480738751e-05, "loss": 0.012, "num_input_tokens_seen": 77032416, "step": 35690 }, { "epoch": 5.82300163132137, "grad_norm": 5.249512195587158, "learning_rate": 2.2188661860730507e-05, "loss": 0.0562, "num_input_tokens_seen": 77042944, "step": 35695 }, { "epoch": 5.823817292006526, "grad_norm": 0.30161696672439575, "learning_rate": 2.2181589141974836e-05, "loss": 0.0366, "num_input_tokens_seen": 77051584, "step": 35700 }, { "epoch": 5.8246329526916805, "grad_norm": 3.3952252864837646, "learning_rate": 2.217451665169385e-05, "loss": 0.1948, "num_input_tokens_seen": 77062048, "step": 35705 }, { "epoch": 5.825448613376835, "grad_norm": 0.2144870162010193, "learning_rate": 2.216744439046087e-05, "loss": 0.1756, "num_input_tokens_seen": 77074080, "step": 35710 }, { "epoch": 5.82626427406199, "grad_norm": 0.20196601748466492, "learning_rate": 2.2160372358849234e-05, "loss": 0.3953, "num_input_tokens_seen": 77085632, "step": 35715 }, { "epoch": 5.827079934747145, "grad_norm": 0.05781104043126106, "learning_rate": 2.215330055743222e-05, "loss": 0.0032, "num_input_tokens_seen": 77097664, "step": 35720 }, { "epoch": 5.827895595432301, "grad_norm": 0.18307341635227203, "learning_rate": 2.2146228986783105e-05, "loss": 0.0092, "num_input_tokens_seen": 77109568, "step": 35725 }, { "epoch": 5.828711256117455, "grad_norm": 4.371525764465332, "learning_rate": 2.213915764747515e-05, "loss": 0.1864, "num_input_tokens_seen": 77119648, "step": 35730 }, { "epoch": 5.82952691680261, "grad_norm": 1.8286266326904297, "learning_rate": 2.2132086540081593e-05, "loss": 0.0056, "num_input_tokens_seen": 77132064, "step": 35735 }, { "epoch": 5.830342577487765, "grad_norm": 0.1642422080039978, "learning_rate": 2.212501566517565e-05, "loss": 0.0221, "num_input_tokens_seen": 77142880, "step": 35740 }, { "epoch": 5.83115823817292, "grad_norm": 3.268336534500122, "learning_rate": 2.211794502333052e-05, "loss": 0.1043, "num_input_tokens_seen": 77154272, "step": 35745 }, { "epoch": 5.831973898858075, "grad_norm": 3.634614944458008, "learning_rate": 2.2110874615119396e-05, "loss": 0.3382, "num_input_tokens_seen": 77165280, "step": 35750 }, { "epoch": 5.8327895595432295, "grad_norm": 0.4623981714248657, "learning_rate": 2.2103804441115434e-05, "loss": 0.007, "num_input_tokens_seen": 77176032, "step": 35755 }, { "epoch": 5.833605220228385, "grad_norm": 0.05410349369049072, "learning_rate": 2.209673450189178e-05, "loss": 0.1039, "num_input_tokens_seen": 77186880, "step": 35760 }, { "epoch": 5.83442088091354, "grad_norm": 8.331046104431152, "learning_rate": 2.208966479802156e-05, "loss": 0.1679, "num_input_tokens_seen": 77196704, "step": 35765 }, { "epoch": 5.835236541598695, "grad_norm": 1.2005671262741089, "learning_rate": 2.2082595330077878e-05, "loss": 0.011, "num_input_tokens_seen": 77208064, "step": 35770 }, { "epoch": 5.83605220228385, "grad_norm": 0.37973007559776306, "learning_rate": 2.2075526098633816e-05, "loss": 0.0972, "num_input_tokens_seen": 77219328, "step": 35775 }, { "epoch": 5.8368678629690045, "grad_norm": 0.15165221691131592, "learning_rate": 2.206845710426245e-05, "loss": 0.0747, "num_input_tokens_seen": 77230592, "step": 35780 }, { "epoch": 5.83768352365416, "grad_norm": 0.22922126948833466, "learning_rate": 2.2061388347536828e-05, "loss": 0.1342, "num_input_tokens_seen": 77240992, "step": 35785 }, { "epoch": 5.838499184339315, "grad_norm": 0.3719537854194641, "learning_rate": 2.2054319829029975e-05, "loss": 0.0888, "num_input_tokens_seen": 77250144, "step": 35790 }, { "epoch": 5.83931484502447, "grad_norm": 0.06579183042049408, "learning_rate": 2.2047251549314907e-05, "loss": 0.1123, "num_input_tokens_seen": 77262464, "step": 35795 }, { "epoch": 5.840130505709625, "grad_norm": 0.586622416973114, "learning_rate": 2.20401835089646e-05, "loss": 0.1145, "num_input_tokens_seen": 77273248, "step": 35800 }, { "epoch": 5.8409461663947795, "grad_norm": 6.7092413902282715, "learning_rate": 2.2033115708552044e-05, "loss": 0.18, "num_input_tokens_seen": 77284032, "step": 35805 }, { "epoch": 5.841761827079935, "grad_norm": 2.1209661960601807, "learning_rate": 2.202604814865018e-05, "loss": 0.0114, "num_input_tokens_seen": 77295072, "step": 35810 }, { "epoch": 5.84257748776509, "grad_norm": 5.148712158203125, "learning_rate": 2.201898082983194e-05, "loss": 0.2185, "num_input_tokens_seen": 77306208, "step": 35815 }, { "epoch": 5.843393148450245, "grad_norm": 0.052786875516176224, "learning_rate": 2.2011913752670242e-05, "loss": 0.0066, "num_input_tokens_seen": 77317024, "step": 35820 }, { "epoch": 5.8442088091354, "grad_norm": 2.9350016117095947, "learning_rate": 2.2004846917737978e-05, "loss": 0.1611, "num_input_tokens_seen": 77327648, "step": 35825 }, { "epoch": 5.8450244698205545, "grad_norm": 0.11455246806144714, "learning_rate": 2.1997780325608013e-05, "loss": 0.0899, "num_input_tokens_seen": 77338304, "step": 35830 }, { "epoch": 5.845840130505709, "grad_norm": 0.07974403351545334, "learning_rate": 2.1990713976853216e-05, "loss": 0.0844, "num_input_tokens_seen": 77348768, "step": 35835 }, { "epoch": 5.846655791190865, "grad_norm": 0.0672159492969513, "learning_rate": 2.1983647872046412e-05, "loss": 0.1166, "num_input_tokens_seen": 77358208, "step": 35840 }, { "epoch": 5.84747145187602, "grad_norm": 0.10648999363183975, "learning_rate": 2.1976582011760415e-05, "loss": 0.008, "num_input_tokens_seen": 77368896, "step": 35845 }, { "epoch": 5.848287112561175, "grad_norm": 0.3450216054916382, "learning_rate": 2.1969516396568023e-05, "loss": 0.008, "num_input_tokens_seen": 77379616, "step": 35850 }, { "epoch": 5.849102773246329, "grad_norm": 6.794552326202393, "learning_rate": 2.196245102704201e-05, "loss": 0.0232, "num_input_tokens_seen": 77390400, "step": 35855 }, { "epoch": 5.849918433931484, "grad_norm": 4.051516056060791, "learning_rate": 2.1955385903755127e-05, "loss": 0.0822, "num_input_tokens_seen": 77400544, "step": 35860 }, { "epoch": 5.850734094616639, "grad_norm": 0.050954125821590424, "learning_rate": 2.1948321027280108e-05, "loss": 0.0322, "num_input_tokens_seen": 77411136, "step": 35865 }, { "epoch": 5.851549755301795, "grad_norm": 0.21890828013420105, "learning_rate": 2.1941256398189676e-05, "loss": 0.1633, "num_input_tokens_seen": 77422240, "step": 35870 }, { "epoch": 5.85236541598695, "grad_norm": 0.1725703775882721, "learning_rate": 2.1934192017056515e-05, "loss": 0.0885, "num_input_tokens_seen": 77431648, "step": 35875 }, { "epoch": 5.853181076672104, "grad_norm": 0.2586349546909332, "learning_rate": 2.1927127884453307e-05, "loss": 0.0061, "num_input_tokens_seen": 77443680, "step": 35880 }, { "epoch": 5.853996737357259, "grad_norm": 0.13698704540729523, "learning_rate": 2.19200640009527e-05, "loss": 0.1708, "num_input_tokens_seen": 77453824, "step": 35885 }, { "epoch": 5.854812398042414, "grad_norm": 0.05802655220031738, "learning_rate": 2.1913000367127337e-05, "loss": 0.1927, "num_input_tokens_seen": 77464512, "step": 35890 }, { "epoch": 5.85562805872757, "grad_norm": 10.350048065185547, "learning_rate": 2.190593698354983e-05, "loss": 0.2048, "num_input_tokens_seen": 77475584, "step": 35895 }, { "epoch": 5.856443719412725, "grad_norm": 12.542141914367676, "learning_rate": 2.1898873850792768e-05, "loss": 0.0516, "num_input_tokens_seen": 77486464, "step": 35900 }, { "epoch": 5.857259380097879, "grad_norm": 3.6151926517486572, "learning_rate": 2.1891810969428724e-05, "loss": 0.1128, "num_input_tokens_seen": 77495168, "step": 35905 }, { "epoch": 5.858075040783034, "grad_norm": 9.008210182189941, "learning_rate": 2.1884748340030255e-05, "loss": 0.1673, "num_input_tokens_seen": 77505632, "step": 35910 }, { "epoch": 5.858890701468189, "grad_norm": 0.17407682538032532, "learning_rate": 2.1877685963169893e-05, "loss": 0.0957, "num_input_tokens_seen": 77516512, "step": 35915 }, { "epoch": 5.859706362153344, "grad_norm": 20.76967430114746, "learning_rate": 2.187062383942015e-05, "loss": 0.0564, "num_input_tokens_seen": 77528352, "step": 35920 }, { "epoch": 5.8605220228384995, "grad_norm": 0.37421369552612305, "learning_rate": 2.186356196935351e-05, "loss": 0.069, "num_input_tokens_seen": 77537728, "step": 35925 }, { "epoch": 5.861337683523654, "grad_norm": 0.04438697174191475, "learning_rate": 2.185650035354245e-05, "loss": 0.1387, "num_input_tokens_seen": 77549216, "step": 35930 }, { "epoch": 5.862153344208809, "grad_norm": 3.2634708881378174, "learning_rate": 2.184943899255943e-05, "loss": 0.2825, "num_input_tokens_seen": 77560288, "step": 35935 }, { "epoch": 5.862969004893964, "grad_norm": 16.22699737548828, "learning_rate": 2.1842377886976873e-05, "loss": 0.0552, "num_input_tokens_seen": 77571968, "step": 35940 }, { "epoch": 5.863784665579119, "grad_norm": 0.07784358412027359, "learning_rate": 2.183531703736718e-05, "loss": 0.005, "num_input_tokens_seen": 77583872, "step": 35945 }, { "epoch": 5.864600326264274, "grad_norm": 0.07082092016935349, "learning_rate": 2.182825644430275e-05, "loss": 0.1043, "num_input_tokens_seen": 77595232, "step": 35950 }, { "epoch": 5.865415986949429, "grad_norm": 0.057193465530872345, "learning_rate": 2.1821196108355944e-05, "loss": 0.004, "num_input_tokens_seen": 77605696, "step": 35955 }, { "epoch": 5.866231647634584, "grad_norm": 0.2835041582584381, "learning_rate": 2.181413603009911e-05, "loss": 0.0368, "num_input_tokens_seen": 77616192, "step": 35960 }, { "epoch": 5.867047308319739, "grad_norm": 1.8883839845657349, "learning_rate": 2.1807076210104575e-05, "loss": 0.0098, "num_input_tokens_seen": 77626688, "step": 35965 }, { "epoch": 5.867862969004894, "grad_norm": 0.06178255379199982, "learning_rate": 2.1800016648944638e-05, "loss": 0.0253, "num_input_tokens_seen": 77637152, "step": 35970 }, { "epoch": 5.868678629690049, "grad_norm": 3.3733088970184326, "learning_rate": 2.1792957347191594e-05, "loss": 0.0982, "num_input_tokens_seen": 77647328, "step": 35975 }, { "epoch": 5.869494290375204, "grad_norm": 0.06617254763841629, "learning_rate": 2.1785898305417698e-05, "loss": 0.0047, "num_input_tokens_seen": 77657824, "step": 35980 }, { "epoch": 5.870309951060359, "grad_norm": 0.06605422496795654, "learning_rate": 2.1778839524195195e-05, "loss": 0.0493, "num_input_tokens_seen": 77668768, "step": 35985 }, { "epoch": 5.871125611745514, "grad_norm": 0.24393099546432495, "learning_rate": 2.1771781004096304e-05, "loss": 0.1428, "num_input_tokens_seen": 77679040, "step": 35990 }, { "epoch": 5.871941272430669, "grad_norm": 0.06658919900655746, "learning_rate": 2.1764722745693223e-05, "loss": 0.0349, "num_input_tokens_seen": 77690304, "step": 35995 }, { "epoch": 5.872756933115824, "grad_norm": 3.923738479614258, "learning_rate": 2.1757664749558132e-05, "loss": 0.1157, "num_input_tokens_seen": 77700480, "step": 36000 }, { "epoch": 5.873572593800979, "grad_norm": 0.4210411012172699, "learning_rate": 2.1750607016263192e-05, "loss": 0.0086, "num_input_tokens_seen": 77711552, "step": 36005 }, { "epoch": 5.874388254486134, "grad_norm": 0.21074756979942322, "learning_rate": 2.1743549546380527e-05, "loss": 0.1148, "num_input_tokens_seen": 77722944, "step": 36010 }, { "epoch": 5.875203915171289, "grad_norm": 0.02160373516380787, "learning_rate": 2.1736492340482267e-05, "loss": 0.0082, "num_input_tokens_seen": 77735072, "step": 36015 }, { "epoch": 5.876019575856444, "grad_norm": 0.042782049626111984, "learning_rate": 2.172943539914049e-05, "loss": 0.0038, "num_input_tokens_seen": 77746848, "step": 36020 }, { "epoch": 5.876835236541599, "grad_norm": 0.07420703768730164, "learning_rate": 2.172237872292728e-05, "loss": 0.1185, "num_input_tokens_seen": 77757664, "step": 36025 }, { "epoch": 5.877650897226753, "grad_norm": 0.049979452043771744, "learning_rate": 2.1715322312414664e-05, "loss": 0.0241, "num_input_tokens_seen": 77768288, "step": 36030 }, { "epoch": 5.878466557911908, "grad_norm": 0.027508003637194633, "learning_rate": 2.1708266168174703e-05, "loss": 0.1088, "num_input_tokens_seen": 77778080, "step": 36035 }, { "epoch": 5.879282218597064, "grad_norm": 3.2788803577423096, "learning_rate": 2.170121029077939e-05, "loss": 0.2079, "num_input_tokens_seen": 77790784, "step": 36040 }, { "epoch": 5.880097879282219, "grad_norm": 0.21825259923934937, "learning_rate": 2.1694154680800706e-05, "loss": 0.0829, "num_input_tokens_seen": 77801344, "step": 36045 }, { "epoch": 5.8809135399673735, "grad_norm": 0.8757532238960266, "learning_rate": 2.168709933881062e-05, "loss": 0.0593, "num_input_tokens_seen": 77811392, "step": 36050 }, { "epoch": 5.881729200652528, "grad_norm": 5.703310489654541, "learning_rate": 2.168004426538106e-05, "loss": 0.1182, "num_input_tokens_seen": 77821312, "step": 36055 }, { "epoch": 5.882544861337683, "grad_norm": 0.0797046422958374, "learning_rate": 2.1672989461083964e-05, "loss": 0.2148, "num_input_tokens_seen": 77831872, "step": 36060 }, { "epoch": 5.883360522022839, "grad_norm": 0.18410950899124146, "learning_rate": 2.1665934926491226e-05, "loss": 0.1024, "num_input_tokens_seen": 77842752, "step": 36065 }, { "epoch": 5.884176182707994, "grad_norm": 0.05765313655138016, "learning_rate": 2.1658880662174717e-05, "loss": 0.0069, "num_input_tokens_seen": 77853696, "step": 36070 }, { "epoch": 5.8849918433931485, "grad_norm": 31.744882583618164, "learning_rate": 2.1651826668706297e-05, "loss": 0.2001, "num_input_tokens_seen": 77864288, "step": 36075 }, { "epoch": 5.885807504078303, "grad_norm": 0.1185903251171112, "learning_rate": 2.1644772946657795e-05, "loss": 0.0989, "num_input_tokens_seen": 77874880, "step": 36080 }, { "epoch": 5.886623164763458, "grad_norm": 0.07939574122428894, "learning_rate": 2.163771949660102e-05, "loss": 0.0043, "num_input_tokens_seen": 77886688, "step": 36085 }, { "epoch": 5.887438825448614, "grad_norm": 0.15546125173568726, "learning_rate": 2.1630666319107767e-05, "loss": 0.0839, "num_input_tokens_seen": 77897376, "step": 36090 }, { "epoch": 5.888254486133769, "grad_norm": 0.21585629880428314, "learning_rate": 2.1623613414749797e-05, "loss": 0.0233, "num_input_tokens_seen": 77908736, "step": 36095 }, { "epoch": 5.8890701468189235, "grad_norm": 0.05212084576487541, "learning_rate": 2.1616560784098856e-05, "loss": 0.2501, "num_input_tokens_seen": 77920352, "step": 36100 }, { "epoch": 5.889885807504078, "grad_norm": 0.07802629470825195, "learning_rate": 2.160950842772666e-05, "loss": 0.2069, "num_input_tokens_seen": 77931360, "step": 36105 }, { "epoch": 5.890701468189233, "grad_norm": 0.07027492672204971, "learning_rate": 2.160245634620492e-05, "loss": 0.0748, "num_input_tokens_seen": 77942208, "step": 36110 }, { "epoch": 5.891517128874388, "grad_norm": 0.032340358942747116, "learning_rate": 2.1595404540105295e-05, "loss": 0.0042, "num_input_tokens_seen": 77953248, "step": 36115 }, { "epoch": 5.892332789559543, "grad_norm": 0.056712131947278976, "learning_rate": 2.1588353009999464e-05, "loss": 0.0043, "num_input_tokens_seen": 77963584, "step": 36120 }, { "epoch": 5.8931484502446985, "grad_norm": 21.15962028503418, "learning_rate": 2.158130175645905e-05, "loss": 0.0409, "num_input_tokens_seen": 77973920, "step": 36125 }, { "epoch": 5.893964110929853, "grad_norm": 0.0940650999546051, "learning_rate": 2.1574250780055654e-05, "loss": 0.0596, "num_input_tokens_seen": 77984736, "step": 36130 }, { "epoch": 5.894779771615008, "grad_norm": 3.097771406173706, "learning_rate": 2.156720008136087e-05, "loss": 0.1741, "num_input_tokens_seen": 77996160, "step": 36135 }, { "epoch": 5.895595432300163, "grad_norm": 1.6625806093215942, "learning_rate": 2.156014966094627e-05, "loss": 0.1153, "num_input_tokens_seen": 78006592, "step": 36140 }, { "epoch": 5.896411092985318, "grad_norm": 0.02249511331319809, "learning_rate": 2.1553099519383394e-05, "loss": 0.0952, "num_input_tokens_seen": 78015936, "step": 36145 }, { "epoch": 5.897226753670473, "grad_norm": 3.918808698654175, "learning_rate": 2.154604965724376e-05, "loss": 0.3321, "num_input_tokens_seen": 78028160, "step": 36150 }, { "epoch": 5.898042414355628, "grad_norm": 0.044293925166130066, "learning_rate": 2.1539000075098868e-05, "loss": 0.0118, "num_input_tokens_seen": 78039712, "step": 36155 }, { "epoch": 5.898858075040783, "grad_norm": 0.05846496298909187, "learning_rate": 2.1531950773520187e-05, "loss": 0.01, "num_input_tokens_seen": 78051936, "step": 36160 }, { "epoch": 5.899673735725938, "grad_norm": 0.14292560517787933, "learning_rate": 2.1524901753079176e-05, "loss": 0.2394, "num_input_tokens_seen": 78062464, "step": 36165 }, { "epoch": 5.900489396411093, "grad_norm": 0.3116625249385834, "learning_rate": 2.1517853014347262e-05, "loss": 0.0095, "num_input_tokens_seen": 78072352, "step": 36170 }, { "epoch": 5.901305057096248, "grad_norm": 0.06515955924987793, "learning_rate": 2.1510804557895847e-05, "loss": 0.0567, "num_input_tokens_seen": 78082816, "step": 36175 }, { "epoch": 5.902120717781403, "grad_norm": 2.8720502853393555, "learning_rate": 2.1503756384296323e-05, "loss": 0.2682, "num_input_tokens_seen": 78092288, "step": 36180 }, { "epoch": 5.902936378466558, "grad_norm": 0.4986286759376526, "learning_rate": 2.1496708494120043e-05, "loss": 0.0928, "num_input_tokens_seen": 78104608, "step": 36185 }, { "epoch": 5.903752039151713, "grad_norm": 0.01598019152879715, "learning_rate": 2.148966088793835e-05, "loss": 0.0145, "num_input_tokens_seen": 78116160, "step": 36190 }, { "epoch": 5.904567699836868, "grad_norm": 1.459521770477295, "learning_rate": 2.1482613566322558e-05, "loss": 0.2332, "num_input_tokens_seen": 78127200, "step": 36195 }, { "epoch": 5.9053833605220225, "grad_norm": 4.28659200668335, "learning_rate": 2.147556652984395e-05, "loss": 0.2466, "num_input_tokens_seen": 78136928, "step": 36200 }, { "epoch": 5.906199021207177, "grad_norm": 0.06842447817325592, "learning_rate": 2.1468519779073805e-05, "loss": 0.0482, "num_input_tokens_seen": 78148704, "step": 36205 }, { "epoch": 5.907014681892333, "grad_norm": 0.8169599175453186, "learning_rate": 2.146147331458337e-05, "loss": 0.0213, "num_input_tokens_seen": 78158752, "step": 36210 }, { "epoch": 5.907830342577488, "grad_norm": 0.37449151277542114, "learning_rate": 2.1454427136943858e-05, "loss": 0.3498, "num_input_tokens_seen": 78167968, "step": 36215 }, { "epoch": 5.908646003262643, "grad_norm": 3.2668418884277344, "learning_rate": 2.1447381246726473e-05, "loss": 0.1727, "num_input_tokens_seen": 78178688, "step": 36220 }, { "epoch": 5.9094616639477975, "grad_norm": 3.2942874431610107, "learning_rate": 2.144033564450239e-05, "loss": 0.1158, "num_input_tokens_seen": 78189376, "step": 36225 }, { "epoch": 5.910277324632952, "grad_norm": 0.04981905594468117, "learning_rate": 2.1433290330842764e-05, "loss": 0.0791, "num_input_tokens_seen": 78200000, "step": 36230 }, { "epoch": 5.911092985318108, "grad_norm": 6.152901649475098, "learning_rate": 2.142624530631872e-05, "loss": 0.1143, "num_input_tokens_seen": 78210592, "step": 36235 }, { "epoch": 5.911908646003263, "grad_norm": 3.7300734519958496, "learning_rate": 2.1419200571501363e-05, "loss": 0.0654, "num_input_tokens_seen": 78220896, "step": 36240 }, { "epoch": 5.912724306688418, "grad_norm": 11.314860343933105, "learning_rate": 2.141215612696177e-05, "loss": 0.0129, "num_input_tokens_seen": 78233088, "step": 36245 }, { "epoch": 5.9135399673735725, "grad_norm": 6.528377056121826, "learning_rate": 2.140511197327101e-05, "loss": 0.2043, "num_input_tokens_seen": 78243680, "step": 36250 }, { "epoch": 5.914355628058727, "grad_norm": 0.046376537531614304, "learning_rate": 2.139806811100012e-05, "loss": 0.0192, "num_input_tokens_seen": 78253344, "step": 36255 }, { "epoch": 5.915171288743883, "grad_norm": 0.03144540637731552, "learning_rate": 2.1391024540720102e-05, "loss": 0.1141, "num_input_tokens_seen": 78264864, "step": 36260 }, { "epoch": 5.915986949429038, "grad_norm": 0.15380115807056427, "learning_rate": 2.1383981263001947e-05, "loss": 0.0134, "num_input_tokens_seen": 78276000, "step": 36265 }, { "epoch": 5.916802610114193, "grad_norm": 3.3430352210998535, "learning_rate": 2.1376938278416615e-05, "loss": 0.1266, "num_input_tokens_seen": 78287360, "step": 36270 }, { "epoch": 5.917618270799347, "grad_norm": 0.07964406907558441, "learning_rate": 2.1369895587535052e-05, "loss": 0.0051, "num_input_tokens_seen": 78296608, "step": 36275 }, { "epoch": 5.918433931484502, "grad_norm": 2.8070991039276123, "learning_rate": 2.1362853190928172e-05, "loss": 0.0666, "num_input_tokens_seen": 78306848, "step": 36280 }, { "epoch": 5.919249592169657, "grad_norm": 0.9714038968086243, "learning_rate": 2.135581108916686e-05, "loss": 0.0085, "num_input_tokens_seen": 78317504, "step": 36285 }, { "epoch": 5.920065252854813, "grad_norm": 2.707551956176758, "learning_rate": 2.1348769282822e-05, "loss": 0.136, "num_input_tokens_seen": 78327360, "step": 36290 }, { "epoch": 5.920880913539968, "grad_norm": 0.10459740459918976, "learning_rate": 2.1341727772464425e-05, "loss": 0.1302, "num_input_tokens_seen": 78338656, "step": 36295 }, { "epoch": 5.921696574225122, "grad_norm": 1.3902908563613892, "learning_rate": 2.1334686558664964e-05, "loss": 0.0922, "num_input_tokens_seen": 78350464, "step": 36300 }, { "epoch": 5.922512234910277, "grad_norm": 3.597879648208618, "learning_rate": 2.1327645641994404e-05, "loss": 0.1789, "num_input_tokens_seen": 78360480, "step": 36305 }, { "epoch": 5.923327895595432, "grad_norm": 4.231083869934082, "learning_rate": 2.1320605023023522e-05, "loss": 0.0886, "num_input_tokens_seen": 78371264, "step": 36310 }, { "epoch": 5.924143556280587, "grad_norm": 3.2040674686431885, "learning_rate": 2.1313564702323064e-05, "loss": 0.1933, "num_input_tokens_seen": 78382176, "step": 36315 }, { "epoch": 5.924959216965743, "grad_norm": 0.34265559911727905, "learning_rate": 2.1306524680463758e-05, "loss": 0.2899, "num_input_tokens_seen": 78392704, "step": 36320 }, { "epoch": 5.925774877650897, "grad_norm": 0.8359548449516296, "learning_rate": 2.12994849580163e-05, "loss": 0.2695, "num_input_tokens_seen": 78404544, "step": 36325 }, { "epoch": 5.926590538336052, "grad_norm": 0.05384374409914017, "learning_rate": 2.129244553555137e-05, "loss": 0.0999, "num_input_tokens_seen": 78414400, "step": 36330 }, { "epoch": 5.927406199021207, "grad_norm": 0.1213223859667778, "learning_rate": 2.1285406413639616e-05, "loss": 0.0138, "num_input_tokens_seen": 78425664, "step": 36335 }, { "epoch": 5.928221859706362, "grad_norm": 0.28292325139045715, "learning_rate": 2.1278367592851668e-05, "loss": 0.01, "num_input_tokens_seen": 78437120, "step": 36340 }, { "epoch": 5.9290375203915175, "grad_norm": 3.51906156539917, "learning_rate": 2.1271329073758118e-05, "loss": 0.0215, "num_input_tokens_seen": 78447456, "step": 36345 }, { "epoch": 5.929853181076672, "grad_norm": 0.1483282446861267, "learning_rate": 2.1264290856929553e-05, "loss": 0.1043, "num_input_tokens_seen": 78457760, "step": 36350 }, { "epoch": 5.930668841761827, "grad_norm": 0.12273486703634262, "learning_rate": 2.125725294293653e-05, "loss": 0.0047, "num_input_tokens_seen": 78469248, "step": 36355 }, { "epoch": 5.931484502446982, "grad_norm": 0.05951046571135521, "learning_rate": 2.1250215332349575e-05, "loss": 0.1769, "num_input_tokens_seen": 78480864, "step": 36360 }, { "epoch": 5.932300163132137, "grad_norm": 3.0192644596099854, "learning_rate": 2.1243178025739193e-05, "loss": 0.1787, "num_input_tokens_seen": 78492224, "step": 36365 }, { "epoch": 5.933115823817292, "grad_norm": 0.5985738039016724, "learning_rate": 2.1236141023675855e-05, "loss": 0.0061, "num_input_tokens_seen": 78503072, "step": 36370 }, { "epoch": 5.933931484502447, "grad_norm": 0.15368081629276276, "learning_rate": 2.122910432673003e-05, "loss": 0.1992, "num_input_tokens_seen": 78511744, "step": 36375 }, { "epoch": 5.934747145187602, "grad_norm": 0.7402132153511047, "learning_rate": 2.122206793547214e-05, "loss": 0.1226, "num_input_tokens_seen": 78522176, "step": 36380 }, { "epoch": 5.935562805872757, "grad_norm": 0.0874972864985466, "learning_rate": 2.1215031850472593e-05, "loss": 0.0079, "num_input_tokens_seen": 78532928, "step": 36385 }, { "epoch": 5.936378466557912, "grad_norm": 0.14864759147167206, "learning_rate": 2.120799607230177e-05, "loss": 0.1188, "num_input_tokens_seen": 78543552, "step": 36390 }, { "epoch": 5.937194127243067, "grad_norm": 0.14092212915420532, "learning_rate": 2.1200960601530022e-05, "loss": 0.0116, "num_input_tokens_seen": 78553824, "step": 36395 }, { "epoch": 5.938009787928221, "grad_norm": 4.489637851715088, "learning_rate": 2.119392543872769e-05, "loss": 0.1303, "num_input_tokens_seen": 78565248, "step": 36400 }, { "epoch": 5.938825448613377, "grad_norm": 0.9154524803161621, "learning_rate": 2.1186890584465068e-05, "loss": 0.1014, "num_input_tokens_seen": 78577440, "step": 36405 }, { "epoch": 5.939641109298532, "grad_norm": 0.2515009641647339, "learning_rate": 2.1179856039312446e-05, "loss": 0.1446, "num_input_tokens_seen": 78588160, "step": 36410 }, { "epoch": 5.940456769983687, "grad_norm": 4.479397773742676, "learning_rate": 2.1172821803840077e-05, "loss": 0.1346, "num_input_tokens_seen": 78598976, "step": 36415 }, { "epoch": 5.941272430668842, "grad_norm": 1.1386377811431885, "learning_rate": 2.116578787861819e-05, "loss": 0.1998, "num_input_tokens_seen": 78608768, "step": 36420 }, { "epoch": 5.942088091353996, "grad_norm": 0.6867659687995911, "learning_rate": 2.1158754264216992e-05, "loss": 0.0703, "num_input_tokens_seen": 78620736, "step": 36425 }, { "epoch": 5.942903752039152, "grad_norm": 0.16088555753231049, "learning_rate": 2.1151720961206657e-05, "loss": 0.1494, "num_input_tokens_seen": 78629216, "step": 36430 }, { "epoch": 5.943719412724307, "grad_norm": 0.11808604747056961, "learning_rate": 2.1144687970157357e-05, "loss": 0.0134, "num_input_tokens_seen": 78639040, "step": 36435 }, { "epoch": 5.944535073409462, "grad_norm": 0.10915657877922058, "learning_rate": 2.1137655291639206e-05, "loss": 0.039, "num_input_tokens_seen": 78650624, "step": 36440 }, { "epoch": 5.945350734094617, "grad_norm": 0.13303032517433167, "learning_rate": 2.113062292622232e-05, "loss": 0.0145, "num_input_tokens_seen": 78662368, "step": 36445 }, { "epoch": 5.946166394779771, "grad_norm": 3.116680860519409, "learning_rate": 2.112359087447677e-05, "loss": 0.2113, "num_input_tokens_seen": 78673984, "step": 36450 }, { "epoch": 5.946982055464927, "grad_norm": 0.17844155430793762, "learning_rate": 2.11165591369726e-05, "loss": 0.0083, "num_input_tokens_seen": 78683328, "step": 36455 }, { "epoch": 5.947797716150082, "grad_norm": 17.343334197998047, "learning_rate": 2.110952771427986e-05, "loss": 0.039, "num_input_tokens_seen": 78694496, "step": 36460 }, { "epoch": 5.948613376835237, "grad_norm": 0.1877734214067459, "learning_rate": 2.110249660696855e-05, "loss": 0.0085, "num_input_tokens_seen": 78704832, "step": 36465 }, { "epoch": 5.9494290375203915, "grad_norm": 13.953911781311035, "learning_rate": 2.1095465815608637e-05, "loss": 0.132, "num_input_tokens_seen": 78715840, "step": 36470 }, { "epoch": 5.950244698205546, "grad_norm": 0.11150404065847397, "learning_rate": 2.1088435340770074e-05, "loss": 0.0852, "num_input_tokens_seen": 78726592, "step": 36475 }, { "epoch": 5.951060358890701, "grad_norm": 0.060464903712272644, "learning_rate": 2.108140518302279e-05, "loss": 0.1296, "num_input_tokens_seen": 78738048, "step": 36480 }, { "epoch": 5.951876019575856, "grad_norm": 6.174735069274902, "learning_rate": 2.107437534293669e-05, "loss": 0.112, "num_input_tokens_seen": 78749312, "step": 36485 }, { "epoch": 5.952691680261012, "grad_norm": 0.06503239274024963, "learning_rate": 2.106734582108164e-05, "loss": 0.2002, "num_input_tokens_seen": 78760512, "step": 36490 }, { "epoch": 5.9535073409461665, "grad_norm": 0.266341894865036, "learning_rate": 2.1060316618027493e-05, "loss": 0.0518, "num_input_tokens_seen": 78771072, "step": 36495 }, { "epoch": 5.954323001631321, "grad_norm": 0.05134322866797447, "learning_rate": 2.105328773434407e-05, "loss": 0.0064, "num_input_tokens_seen": 78781376, "step": 36500 }, { "epoch": 5.955138662316476, "grad_norm": 0.18325527012348175, "learning_rate": 2.1046259170601167e-05, "loss": 0.1604, "num_input_tokens_seen": 78792640, "step": 36505 }, { "epoch": 5.955954323001631, "grad_norm": 2.8084986209869385, "learning_rate": 2.1039230927368556e-05, "loss": 0.1192, "num_input_tokens_seen": 78803776, "step": 36510 }, { "epoch": 5.956769983686787, "grad_norm": 3.9901211261749268, "learning_rate": 2.1032203005215978e-05, "loss": 0.0995, "num_input_tokens_seen": 78813440, "step": 36515 }, { "epoch": 5.9575856443719415, "grad_norm": 0.11165003478527069, "learning_rate": 2.1025175404713167e-05, "loss": 0.2142, "num_input_tokens_seen": 78824736, "step": 36520 }, { "epoch": 5.958401305057096, "grad_norm": 0.04973118007183075, "learning_rate": 2.1018148126429797e-05, "loss": 0.1009, "num_input_tokens_seen": 78833920, "step": 36525 }, { "epoch": 5.959216965742251, "grad_norm": 0.05530374497175217, "learning_rate": 2.101112117093555e-05, "loss": 0.0538, "num_input_tokens_seen": 78845376, "step": 36530 }, { "epoch": 5.960032626427406, "grad_norm": 5.85557746887207, "learning_rate": 2.1004094538800058e-05, "loss": 0.0815, "num_input_tokens_seen": 78858016, "step": 36535 }, { "epoch": 5.960848287112562, "grad_norm": 0.14480511844158173, "learning_rate": 2.099706823059294e-05, "loss": 0.0884, "num_input_tokens_seen": 78869216, "step": 36540 }, { "epoch": 5.9616639477977165, "grad_norm": 0.5307287573814392, "learning_rate": 2.0990042246883777e-05, "loss": 0.0812, "num_input_tokens_seen": 78880096, "step": 36545 }, { "epoch": 5.962479608482871, "grad_norm": 0.09958609193563461, "learning_rate": 2.0983016588242145e-05, "loss": 0.1191, "num_input_tokens_seen": 78889664, "step": 36550 }, { "epoch": 5.963295269168026, "grad_norm": 0.035562992095947266, "learning_rate": 2.0975991255237562e-05, "loss": 0.0854, "num_input_tokens_seen": 78902240, "step": 36555 }, { "epoch": 5.964110929853181, "grad_norm": 5.524087905883789, "learning_rate": 2.0968966248439536e-05, "loss": 0.0837, "num_input_tokens_seen": 78911872, "step": 36560 }, { "epoch": 5.964926590538336, "grad_norm": 0.17915554344654083, "learning_rate": 2.096194156841757e-05, "loss": 0.0053, "num_input_tokens_seen": 78922304, "step": 36565 }, { "epoch": 5.9657422512234906, "grad_norm": 0.11609411239624023, "learning_rate": 2.0954917215741113e-05, "loss": 0.0124, "num_input_tokens_seen": 78933408, "step": 36570 }, { "epoch": 5.966557911908646, "grad_norm": 0.09525105357170105, "learning_rate": 2.0947893190979588e-05, "loss": 0.1277, "num_input_tokens_seen": 78944704, "step": 36575 }, { "epoch": 5.967373572593801, "grad_norm": 0.12271397560834885, "learning_rate": 2.09408694947024e-05, "loss": 0.0382, "num_input_tokens_seen": 78955584, "step": 36580 }, { "epoch": 5.968189233278956, "grad_norm": 2.0783212184906006, "learning_rate": 2.0933846127478928e-05, "loss": 0.0751, "num_input_tokens_seen": 78966272, "step": 36585 }, { "epoch": 5.969004893964111, "grad_norm": 0.09040044993162155, "learning_rate": 2.092682308987852e-05, "loss": 0.1292, "num_input_tokens_seen": 78976608, "step": 36590 }, { "epoch": 5.9698205546492655, "grad_norm": 0.09519308060407639, "learning_rate": 2.0919800382470503e-05, "loss": 0.0067, "num_input_tokens_seen": 78987360, "step": 36595 }, { "epoch": 5.970636215334421, "grad_norm": 0.10772380232810974, "learning_rate": 2.091277800582416e-05, "loss": 0.0503, "num_input_tokens_seen": 78999040, "step": 36600 }, { "epoch": 5.971451876019576, "grad_norm": 0.14227809011936188, "learning_rate": 2.0905755960508778e-05, "loss": 0.0045, "num_input_tokens_seen": 79010112, "step": 36605 }, { "epoch": 5.972267536704731, "grad_norm": 0.15200011432170868, "learning_rate": 2.0898734247093592e-05, "loss": 0.0059, "num_input_tokens_seen": 79020416, "step": 36610 }, { "epoch": 5.973083197389886, "grad_norm": 7.089992046356201, "learning_rate": 2.0891712866147812e-05, "loss": 0.2296, "num_input_tokens_seen": 79031648, "step": 36615 }, { "epoch": 5.9738988580750405, "grad_norm": 0.09132542461156845, "learning_rate": 2.088469181824064e-05, "loss": 0.0704, "num_input_tokens_seen": 79042560, "step": 36620 }, { "epoch": 5.974714518760196, "grad_norm": 0.3920039236545563, "learning_rate": 2.0877671103941228e-05, "loss": 0.1477, "num_input_tokens_seen": 79052672, "step": 36625 }, { "epoch": 5.975530179445351, "grad_norm": 2.6713600158691406, "learning_rate": 2.0870650723818706e-05, "loss": 0.3068, "num_input_tokens_seen": 79063744, "step": 36630 }, { "epoch": 5.976345840130506, "grad_norm": 3.448038101196289, "learning_rate": 2.0863630678442196e-05, "loss": 0.0742, "num_input_tokens_seen": 79075424, "step": 36635 }, { "epoch": 5.977161500815661, "grad_norm": 0.18815290927886963, "learning_rate": 2.0856610968380768e-05, "loss": 0.0229, "num_input_tokens_seen": 79085792, "step": 36640 }, { "epoch": 5.9779771615008155, "grad_norm": 0.16216164827346802, "learning_rate": 2.0849591594203482e-05, "loss": 0.0118, "num_input_tokens_seen": 79096704, "step": 36645 }, { "epoch": 5.97879282218597, "grad_norm": 2.9226410388946533, "learning_rate": 2.0842572556479355e-05, "loss": 0.2137, "num_input_tokens_seen": 79107328, "step": 36650 }, { "epoch": 5.979608482871125, "grad_norm": 0.35585007071495056, "learning_rate": 2.0835553855777396e-05, "loss": 0.0105, "num_input_tokens_seen": 79118144, "step": 36655 }, { "epoch": 5.980424143556281, "grad_norm": 0.021596642211079597, "learning_rate": 2.082853549266656e-05, "loss": 0.1753, "num_input_tokens_seen": 79129568, "step": 36660 }, { "epoch": 5.981239804241436, "grad_norm": 0.1883329153060913, "learning_rate": 2.0821517467715815e-05, "loss": 0.0965, "num_input_tokens_seen": 79139936, "step": 36665 }, { "epoch": 5.9820554649265905, "grad_norm": 0.22896647453308105, "learning_rate": 2.0814499781494057e-05, "loss": 0.0597, "num_input_tokens_seen": 79149664, "step": 36670 }, { "epoch": 5.982871125611745, "grad_norm": 0.12242002040147781, "learning_rate": 2.0807482434570187e-05, "loss": 0.0531, "num_input_tokens_seen": 79161280, "step": 36675 }, { "epoch": 5.9836867862969, "grad_norm": 0.0653303861618042, "learning_rate": 2.080046542751307e-05, "loss": 0.1036, "num_input_tokens_seen": 79171488, "step": 36680 }, { "epoch": 5.984502446982056, "grad_norm": 0.14149434864521027, "learning_rate": 2.079344876089152e-05, "loss": 0.0097, "num_input_tokens_seen": 79182656, "step": 36685 }, { "epoch": 5.985318107667211, "grad_norm": 0.21198756992816925, "learning_rate": 2.078643243527437e-05, "loss": 0.2048, "num_input_tokens_seen": 79194016, "step": 36690 }, { "epoch": 5.986133768352365, "grad_norm": 1.6793668270111084, "learning_rate": 2.0779416451230382e-05, "loss": 0.1456, "num_input_tokens_seen": 79205472, "step": 36695 }, { "epoch": 5.98694942903752, "grad_norm": 0.07833819836378098, "learning_rate": 2.0772400809328314e-05, "loss": 0.008, "num_input_tokens_seen": 79215904, "step": 36700 }, { "epoch": 5.987765089722675, "grad_norm": 0.06340375542640686, "learning_rate": 2.0765385510136884e-05, "loss": 0.0598, "num_input_tokens_seen": 79227776, "step": 36705 }, { "epoch": 5.988580750407831, "grad_norm": 0.02490225061774254, "learning_rate": 2.0758370554224793e-05, "loss": 0.1367, "num_input_tokens_seen": 79239488, "step": 36710 }, { "epoch": 5.989396411092986, "grad_norm": 11.206292152404785, "learning_rate": 2.0751355942160706e-05, "loss": 0.1416, "num_input_tokens_seen": 79249600, "step": 36715 }, { "epoch": 5.99021207177814, "grad_norm": 0.42250674962997437, "learning_rate": 2.0744341674513264e-05, "loss": 0.1338, "num_input_tokens_seen": 79259840, "step": 36720 }, { "epoch": 5.991027732463295, "grad_norm": 0.16194994747638702, "learning_rate": 2.0737327751851075e-05, "loss": 0.0856, "num_input_tokens_seen": 79271488, "step": 36725 }, { "epoch": 5.99184339314845, "grad_norm": 0.08617598563432693, "learning_rate": 2.0730314174742733e-05, "loss": 0.0053, "num_input_tokens_seen": 79282752, "step": 36730 }, { "epoch": 5.992659053833605, "grad_norm": 2.626254081726074, "learning_rate": 2.0723300943756783e-05, "loss": 0.0878, "num_input_tokens_seen": 79294336, "step": 36735 }, { "epoch": 5.993474714518761, "grad_norm": 2.470318555831909, "learning_rate": 2.0716288059461764e-05, "loss": 0.3177, "num_input_tokens_seen": 79306272, "step": 36740 }, { "epoch": 5.994290375203915, "grad_norm": 0.18605534732341766, "learning_rate": 2.0709275522426158e-05, "loss": 0.0187, "num_input_tokens_seen": 79317120, "step": 36745 }, { "epoch": 5.99510603588907, "grad_norm": 0.06405258178710938, "learning_rate": 2.070226333321846e-05, "loss": 0.0039, "num_input_tokens_seen": 79327200, "step": 36750 }, { "epoch": 5.995921696574225, "grad_norm": 8.52464771270752, "learning_rate": 2.06952514924071e-05, "loss": 0.1118, "num_input_tokens_seen": 79337888, "step": 36755 }, { "epoch": 5.99673735725938, "grad_norm": 0.3218190371990204, "learning_rate": 2.0688240000560498e-05, "loss": 0.1347, "num_input_tokens_seen": 79348480, "step": 36760 }, { "epoch": 5.997553017944535, "grad_norm": 0.3105626106262207, "learning_rate": 2.0681228858247038e-05, "loss": 0.1995, "num_input_tokens_seen": 79360480, "step": 36765 }, { "epoch": 5.99836867862969, "grad_norm": 0.2892025411128998, "learning_rate": 2.067421806603508e-05, "loss": 0.0095, "num_input_tokens_seen": 79370912, "step": 36770 }, { "epoch": 5.999184339314845, "grad_norm": 0.10774438828229904, "learning_rate": 2.0667207624492943e-05, "loss": 0.007, "num_input_tokens_seen": 79380288, "step": 36775 }, { "epoch": 6.0, "grad_norm": 0.07041917741298676, "learning_rate": 2.0660197534188952e-05, "loss": 0.0755, "num_input_tokens_seen": 79389648, "step": 36780 }, { "epoch": 6.0, "eval_loss": 0.17715702950954437, "eval_runtime": 132.8411, "eval_samples_per_second": 20.513, "eval_steps_per_second": 5.134, "num_input_tokens_seen": 79389648, "step": 36780 }, { "epoch": 6.000815660685155, "grad_norm": 0.17613597214221954, "learning_rate": 2.065318779569137e-05, "loss": 0.0061, "num_input_tokens_seen": 79400656, "step": 36785 }, { "epoch": 6.00163132137031, "grad_norm": 14.462206840515137, "learning_rate": 2.064617840956844e-05, "loss": 0.0414, "num_input_tokens_seen": 79412656, "step": 36790 }, { "epoch": 6.002446982055465, "grad_norm": 0.06690766662359238, "learning_rate": 2.063916937638838e-05, "loss": 0.0082, "num_input_tokens_seen": 79424048, "step": 36795 }, { "epoch": 6.00326264274062, "grad_norm": 0.13082489371299744, "learning_rate": 2.063216069671937e-05, "loss": 0.0717, "num_input_tokens_seen": 79434896, "step": 36800 }, { "epoch": 6.004078303425775, "grad_norm": 0.05276201665401459, "learning_rate": 2.0625152371129585e-05, "loss": 0.1992, "num_input_tokens_seen": 79445680, "step": 36805 }, { "epoch": 6.00489396411093, "grad_norm": 0.7351275682449341, "learning_rate": 2.0618144400187142e-05, "loss": 0.0084, "num_input_tokens_seen": 79456592, "step": 36810 }, { "epoch": 6.005709624796085, "grad_norm": 5.1559271812438965, "learning_rate": 2.061113678446015e-05, "loss": 0.0637, "num_input_tokens_seen": 79466992, "step": 36815 }, { "epoch": 6.006525285481239, "grad_norm": 0.0848996490240097, "learning_rate": 2.0604129524516676e-05, "loss": 0.0058, "num_input_tokens_seen": 79477168, "step": 36820 }, { "epoch": 6.007340946166395, "grad_norm": 6.075421333312988, "learning_rate": 2.059712262092477e-05, "loss": 0.1392, "num_input_tokens_seen": 79488368, "step": 36825 }, { "epoch": 6.00815660685155, "grad_norm": 0.07987889647483826, "learning_rate": 2.0590116074252438e-05, "loss": 0.0938, "num_input_tokens_seen": 79498736, "step": 36830 }, { "epoch": 6.008972267536705, "grad_norm": 0.07463080435991287, "learning_rate": 2.058310988506768e-05, "loss": 0.2583, "num_input_tokens_seen": 79508656, "step": 36835 }, { "epoch": 6.00978792822186, "grad_norm": 2.687781572341919, "learning_rate": 2.057610405393844e-05, "loss": 0.21, "num_input_tokens_seen": 79519696, "step": 36840 }, { "epoch": 6.010603588907014, "grad_norm": 0.08558306097984314, "learning_rate": 2.0569098581432655e-05, "loss": 0.2366, "num_input_tokens_seen": 79529776, "step": 36845 }, { "epoch": 6.011419249592169, "grad_norm": 2.86208438873291, "learning_rate": 2.0562093468118225e-05, "loss": 0.1569, "num_input_tokens_seen": 79541392, "step": 36850 }, { "epoch": 6.012234910277325, "grad_norm": 1.8412063121795654, "learning_rate": 2.055508871456301e-05, "loss": 0.0642, "num_input_tokens_seen": 79552720, "step": 36855 }, { "epoch": 6.01305057096248, "grad_norm": 0.2715950012207031, "learning_rate": 2.054808432133486e-05, "loss": 0.2212, "num_input_tokens_seen": 79563888, "step": 36860 }, { "epoch": 6.013866231647635, "grad_norm": 0.06896059960126877, "learning_rate": 2.0541080289001584e-05, "loss": 0.1458, "num_input_tokens_seen": 79575760, "step": 36865 }, { "epoch": 6.014681892332789, "grad_norm": 0.25679296255111694, "learning_rate": 2.0534076618130965e-05, "loss": 0.0063, "num_input_tokens_seen": 79587408, "step": 36870 }, { "epoch": 6.015497553017944, "grad_norm": 2.22693133354187, "learning_rate": 2.0527073309290755e-05, "loss": 0.0755, "num_input_tokens_seen": 79598352, "step": 36875 }, { "epoch": 6.0163132137031, "grad_norm": 3.286787509918213, "learning_rate": 2.0520070363048667e-05, "loss": 0.1579, "num_input_tokens_seen": 79609520, "step": 36880 }, { "epoch": 6.017128874388255, "grad_norm": 0.23035003244876862, "learning_rate": 2.0513067779972415e-05, "loss": 0.0243, "num_input_tokens_seen": 79621712, "step": 36885 }, { "epoch": 6.0179445350734095, "grad_norm": 0.1889280080795288, "learning_rate": 2.0506065560629655e-05, "loss": 0.0503, "num_input_tokens_seen": 79630576, "step": 36890 }, { "epoch": 6.018760195758564, "grad_norm": 0.10859799385070801, "learning_rate": 2.0499063705588024e-05, "loss": 0.1151, "num_input_tokens_seen": 79641360, "step": 36895 }, { "epoch": 6.019575856443719, "grad_norm": 0.1558310091495514, "learning_rate": 2.0492062215415125e-05, "loss": 0.0075, "num_input_tokens_seen": 79651792, "step": 36900 }, { "epoch": 6.020391517128874, "grad_norm": 0.08306483179330826, "learning_rate": 2.048506109067854e-05, "loss": 0.0338, "num_input_tokens_seen": 79661488, "step": 36905 }, { "epoch": 6.02120717781403, "grad_norm": 0.15285880863666534, "learning_rate": 2.047806033194581e-05, "loss": 0.1687, "num_input_tokens_seen": 79671856, "step": 36910 }, { "epoch": 6.0220228384991845, "grad_norm": 0.028413239866495132, "learning_rate": 2.0471059939784447e-05, "loss": 0.0067, "num_input_tokens_seen": 79681232, "step": 36915 }, { "epoch": 6.022838499184339, "grad_norm": 9.373331069946289, "learning_rate": 2.046405991476195e-05, "loss": 0.3035, "num_input_tokens_seen": 79691728, "step": 36920 }, { "epoch": 6.023654159869494, "grad_norm": 0.07682496309280396, "learning_rate": 2.045706025744577e-05, "loss": 0.0141, "num_input_tokens_seen": 79702672, "step": 36925 }, { "epoch": 6.024469820554649, "grad_norm": 0.07791969925165176, "learning_rate": 2.045006096840334e-05, "loss": 0.0059, "num_input_tokens_seen": 79713552, "step": 36930 }, { "epoch": 6.025285481239805, "grad_norm": 0.03607427701354027, "learning_rate": 2.044306204820205e-05, "loss": 0.0057, "num_input_tokens_seen": 79725296, "step": 36935 }, { "epoch": 6.0261011419249595, "grad_norm": 3.750582218170166, "learning_rate": 2.0436063497409274e-05, "loss": 0.1002, "num_input_tokens_seen": 79735792, "step": 36940 }, { "epoch": 6.026916802610114, "grad_norm": 0.12163159996271133, "learning_rate": 2.042906531659235e-05, "loss": 0.0039, "num_input_tokens_seen": 79746576, "step": 36945 }, { "epoch": 6.027732463295269, "grad_norm": 3.5392658710479736, "learning_rate": 2.042206750631858e-05, "loss": 0.1053, "num_input_tokens_seen": 79756496, "step": 36950 }, { "epoch": 6.028548123980424, "grad_norm": 0.13679811358451843, "learning_rate": 2.041507006715525e-05, "loss": 0.0346, "num_input_tokens_seen": 79767600, "step": 36955 }, { "epoch": 6.029363784665579, "grad_norm": 0.03501264750957489, "learning_rate": 2.0408072999669604e-05, "loss": 0.1284, "num_input_tokens_seen": 79779056, "step": 36960 }, { "epoch": 6.0301794453507345, "grad_norm": 4.524255752563477, "learning_rate": 2.0401076304428857e-05, "loss": 0.2575, "num_input_tokens_seen": 79790672, "step": 36965 }, { "epoch": 6.030995106035889, "grad_norm": 23.41586685180664, "learning_rate": 2.0394079982000198e-05, "loss": 0.1424, "num_input_tokens_seen": 79800144, "step": 36970 }, { "epoch": 6.031810766721044, "grad_norm": 4.102931022644043, "learning_rate": 2.0387084032950787e-05, "loss": 0.2014, "num_input_tokens_seen": 79811056, "step": 36975 }, { "epoch": 6.032626427406199, "grad_norm": 0.0753416195511818, "learning_rate": 2.038008845784775e-05, "loss": 0.12, "num_input_tokens_seen": 79821616, "step": 36980 }, { "epoch": 6.033442088091354, "grad_norm": 0.20337048172950745, "learning_rate": 2.0373093257258184e-05, "loss": 0.1088, "num_input_tokens_seen": 79833104, "step": 36985 }, { "epoch": 6.034257748776509, "grad_norm": 0.09057068079710007, "learning_rate": 2.0366098431749152e-05, "loss": 0.0044, "num_input_tokens_seen": 79842832, "step": 36990 }, { "epoch": 6.035073409461664, "grad_norm": 0.07087525725364685, "learning_rate": 2.0359103981887695e-05, "loss": 0.0056, "num_input_tokens_seen": 79853840, "step": 36995 }, { "epoch": 6.035889070146819, "grad_norm": 0.17854323983192444, "learning_rate": 2.035210990824082e-05, "loss": 0.095, "num_input_tokens_seen": 79865488, "step": 37000 }, { "epoch": 6.036704730831974, "grad_norm": 0.583503246307373, "learning_rate": 2.0345116211375496e-05, "loss": 0.2521, "num_input_tokens_seen": 79874640, "step": 37005 }, { "epoch": 6.037520391517129, "grad_norm": 0.37706518173217773, "learning_rate": 2.0338122891858677e-05, "loss": 0.0065, "num_input_tokens_seen": 79885552, "step": 37010 }, { "epoch": 6.0383360522022835, "grad_norm": 0.09744422882795334, "learning_rate": 2.0331129950257266e-05, "loss": 0.0044, "num_input_tokens_seen": 79896336, "step": 37015 }, { "epoch": 6.039151712887439, "grad_norm": 10.079150199890137, "learning_rate": 2.0324137387138152e-05, "loss": 0.1087, "num_input_tokens_seen": 79907184, "step": 37020 }, { "epoch": 6.039967373572594, "grad_norm": 5.770218372344971, "learning_rate": 2.031714520306819e-05, "loss": 0.1118, "num_input_tokens_seen": 79917936, "step": 37025 }, { "epoch": 6.040783034257749, "grad_norm": 17.64241600036621, "learning_rate": 2.0310153398614192e-05, "loss": 0.2081, "num_input_tokens_seen": 79930000, "step": 37030 }, { "epoch": 6.041598694942904, "grad_norm": 0.13271000981330872, "learning_rate": 2.030316197434296e-05, "loss": 0.0051, "num_input_tokens_seen": 79941456, "step": 37035 }, { "epoch": 6.0424143556280585, "grad_norm": 0.11974164098501205, "learning_rate": 2.0296170930821245e-05, "loss": 0.018, "num_input_tokens_seen": 79951440, "step": 37040 }, { "epoch": 6.043230016313213, "grad_norm": 0.050197869539260864, "learning_rate": 2.028918026861579e-05, "loss": 0.0085, "num_input_tokens_seen": 79962064, "step": 37045 }, { "epoch": 6.044045676998369, "grad_norm": 0.09011278301477432, "learning_rate": 2.0282189988293276e-05, "loss": 0.0035, "num_input_tokens_seen": 79974160, "step": 37050 }, { "epoch": 6.044861337683524, "grad_norm": 0.041562121361494064, "learning_rate": 2.0275200090420376e-05, "loss": 0.0028, "num_input_tokens_seen": 79985072, "step": 37055 }, { "epoch": 6.045676998368679, "grad_norm": 4.7424421310424805, "learning_rate": 2.026821057556374e-05, "loss": 0.0817, "num_input_tokens_seen": 79996784, "step": 37060 }, { "epoch": 6.0464926590538335, "grad_norm": 0.10275337845087051, "learning_rate": 2.026122144428996e-05, "loss": 0.0043, "num_input_tokens_seen": 80008496, "step": 37065 }, { "epoch": 6.047308319738988, "grad_norm": 3.4851765632629395, "learning_rate": 2.0254232697165616e-05, "loss": 0.1343, "num_input_tokens_seen": 80019600, "step": 37070 }, { "epoch": 6.048123980424143, "grad_norm": 0.07727508246898651, "learning_rate": 2.0247244334757248e-05, "loss": 0.0035, "num_input_tokens_seen": 80030864, "step": 37075 }, { "epoch": 6.048939641109299, "grad_norm": 2.237130880355835, "learning_rate": 2.0240256357631367e-05, "loss": 0.0088, "num_input_tokens_seen": 80041936, "step": 37080 }, { "epoch": 6.049755301794454, "grad_norm": 0.9212360382080078, "learning_rate": 2.023326876635446e-05, "loss": 0.0063, "num_input_tokens_seen": 80053552, "step": 37085 }, { "epoch": 6.0505709624796085, "grad_norm": 2.9676239490509033, "learning_rate": 2.022628156149297e-05, "loss": 0.2996, "num_input_tokens_seen": 80064688, "step": 37090 }, { "epoch": 6.051386623164763, "grad_norm": 0.2867843210697174, "learning_rate": 2.021929474361331e-05, "loss": 0.0047, "num_input_tokens_seen": 80076176, "step": 37095 }, { "epoch": 6.052202283849918, "grad_norm": 0.06813370436429977, "learning_rate": 2.0212308313281886e-05, "loss": 0.0049, "num_input_tokens_seen": 80085968, "step": 37100 }, { "epoch": 6.053017944535074, "grad_norm": 2.913398027420044, "learning_rate": 2.0205322271065042e-05, "loss": 0.3051, "num_input_tokens_seen": 80096048, "step": 37105 }, { "epoch": 6.053833605220229, "grad_norm": 37.044898986816406, "learning_rate": 2.01983366175291e-05, "loss": 0.0585, "num_input_tokens_seen": 80106608, "step": 37110 }, { "epoch": 6.054649265905383, "grad_norm": 0.02120068296790123, "learning_rate": 2.0191351353240363e-05, "loss": 0.1304, "num_input_tokens_seen": 80116400, "step": 37115 }, { "epoch": 6.055464926590538, "grad_norm": 11.320032119750977, "learning_rate": 2.0184366478765078e-05, "loss": 0.1768, "num_input_tokens_seen": 80127024, "step": 37120 }, { "epoch": 6.056280587275693, "grad_norm": 0.06965632736682892, "learning_rate": 2.017738199466948e-05, "loss": 0.1013, "num_input_tokens_seen": 80137936, "step": 37125 }, { "epoch": 6.057096247960848, "grad_norm": 6.816587448120117, "learning_rate": 2.0170397901519766e-05, "loss": 0.0887, "num_input_tokens_seen": 80148752, "step": 37130 }, { "epoch": 6.057911908646004, "grad_norm": 0.11578565090894699, "learning_rate": 2.016341419988211e-05, "loss": 0.1381, "num_input_tokens_seen": 80159856, "step": 37135 }, { "epoch": 6.058727569331158, "grad_norm": 2.500230312347412, "learning_rate": 2.0156430890322627e-05, "loss": 0.0974, "num_input_tokens_seen": 80172656, "step": 37140 }, { "epoch": 6.059543230016313, "grad_norm": 0.04798300191760063, "learning_rate": 2.0149447973407443e-05, "loss": 0.0823, "num_input_tokens_seen": 80184240, "step": 37145 }, { "epoch": 6.060358890701468, "grad_norm": 0.13084805011749268, "learning_rate": 2.0142465449702612e-05, "loss": 0.024, "num_input_tokens_seen": 80195184, "step": 37150 }, { "epoch": 6.061174551386623, "grad_norm": 0.17297035455703735, "learning_rate": 2.0135483319774183e-05, "loss": 0.0075, "num_input_tokens_seen": 80205296, "step": 37155 }, { "epoch": 6.061990212071779, "grad_norm": 0.02693340927362442, "learning_rate": 2.012850158418816e-05, "loss": 0.0184, "num_input_tokens_seen": 80216496, "step": 37160 }, { "epoch": 6.062805872756933, "grad_norm": 13.927870750427246, "learning_rate": 2.0121520243510512e-05, "loss": 0.0787, "num_input_tokens_seen": 80227280, "step": 37165 }, { "epoch": 6.063621533442088, "grad_norm": 0.19517157971858978, "learning_rate": 2.0114539298307188e-05, "loss": 0.1433, "num_input_tokens_seen": 80238704, "step": 37170 }, { "epoch": 6.064437194127243, "grad_norm": 3.330983877182007, "learning_rate": 2.0107558749144096e-05, "loss": 0.1432, "num_input_tokens_seen": 80248560, "step": 37175 }, { "epoch": 6.065252854812398, "grad_norm": 0.045291390269994736, "learning_rate": 2.0100578596587116e-05, "loss": 0.0066, "num_input_tokens_seen": 80258448, "step": 37180 }, { "epoch": 6.066068515497553, "grad_norm": 0.05958879366517067, "learning_rate": 2.0093598841202092e-05, "loss": 0.0055, "num_input_tokens_seen": 80269168, "step": 37185 }, { "epoch": 6.066884176182708, "grad_norm": 0.08888333290815353, "learning_rate": 2.0086619483554847e-05, "loss": 0.0052, "num_input_tokens_seen": 80278672, "step": 37190 }, { "epoch": 6.067699836867863, "grad_norm": 0.04317786172032356, "learning_rate": 2.0079640524211153e-05, "loss": 0.0201, "num_input_tokens_seen": 80290128, "step": 37195 }, { "epoch": 6.068515497553018, "grad_norm": 0.04700427129864693, "learning_rate": 2.0072661963736752e-05, "loss": 0.2334, "num_input_tokens_seen": 80300816, "step": 37200 }, { "epoch": 6.069331158238173, "grad_norm": 0.13251464068889618, "learning_rate": 2.006568380269739e-05, "loss": 0.0027, "num_input_tokens_seen": 80311120, "step": 37205 }, { "epoch": 6.070146818923328, "grad_norm": 0.3197738528251648, "learning_rate": 2.005870604165873e-05, "loss": 0.0059, "num_input_tokens_seen": 80321968, "step": 37210 }, { "epoch": 6.0709624796084825, "grad_norm": 3.1156668663024902, "learning_rate": 2.005172868118643e-05, "loss": 0.1079, "num_input_tokens_seen": 80332176, "step": 37215 }, { "epoch": 6.071778140293638, "grad_norm": 0.3527423143386841, "learning_rate": 2.004475172184611e-05, "loss": 0.0043, "num_input_tokens_seen": 80343376, "step": 37220 }, { "epoch": 6.072593800978793, "grad_norm": 0.1022476777434349, "learning_rate": 2.0037775164203356e-05, "loss": 0.0034, "num_input_tokens_seen": 80354032, "step": 37225 }, { "epoch": 6.073409461663948, "grad_norm": 0.08814200013875961, "learning_rate": 2.0030799008823727e-05, "loss": 0.0036, "num_input_tokens_seen": 80365072, "step": 37230 }, { "epoch": 6.074225122349103, "grad_norm": 0.051949575543403625, "learning_rate": 2.0023823256272748e-05, "loss": 0.0057, "num_input_tokens_seen": 80374512, "step": 37235 }, { "epoch": 6.075040783034257, "grad_norm": 0.028245143592357635, "learning_rate": 2.00168479071159e-05, "loss": 0.0777, "num_input_tokens_seen": 80384368, "step": 37240 }, { "epoch": 6.075856443719413, "grad_norm": 5.9619951248168945, "learning_rate": 2.0009872961918648e-05, "loss": 0.0166, "num_input_tokens_seen": 80395152, "step": 37245 }, { "epoch": 6.076672104404568, "grad_norm": 2.1715729236602783, "learning_rate": 2.0002898421246414e-05, "loss": 0.016, "num_input_tokens_seen": 80406288, "step": 37250 }, { "epoch": 6.077487765089723, "grad_norm": 0.043432507663965225, "learning_rate": 1.9995924285664587e-05, "loss": 0.0255, "num_input_tokens_seen": 80417040, "step": 37255 }, { "epoch": 6.078303425774878, "grad_norm": 0.0427730567753315, "learning_rate": 1.9988950555738528e-05, "loss": 0.1516, "num_input_tokens_seen": 80428112, "step": 37260 }, { "epoch": 6.079119086460032, "grad_norm": 0.06148674339056015, "learning_rate": 1.9981977232033563e-05, "loss": 0.0062, "num_input_tokens_seen": 80438288, "step": 37265 }, { "epoch": 6.079934747145187, "grad_norm": 0.1648724526166916, "learning_rate": 1.9975004315114988e-05, "loss": 0.5308, "num_input_tokens_seen": 80449360, "step": 37270 }, { "epoch": 6.080750407830343, "grad_norm": 1.8730205297470093, "learning_rate": 1.9968031805548056e-05, "loss": 0.006, "num_input_tokens_seen": 80460752, "step": 37275 }, { "epoch": 6.081566068515498, "grad_norm": 5.308781147003174, "learning_rate": 1.9961059703898e-05, "loss": 0.0523, "num_input_tokens_seen": 80472048, "step": 37280 }, { "epoch": 6.082381729200653, "grad_norm": 0.09825458377599716, "learning_rate": 1.9954088010730003e-05, "loss": 0.0878, "num_input_tokens_seen": 80481840, "step": 37285 }, { "epoch": 6.083197389885807, "grad_norm": 0.14295031130313873, "learning_rate": 1.994711672660924e-05, "loss": 0.0027, "num_input_tokens_seen": 80493648, "step": 37290 }, { "epoch": 6.084013050570962, "grad_norm": 0.1177147701382637, "learning_rate": 1.9940145852100836e-05, "loss": 0.1494, "num_input_tokens_seen": 80503024, "step": 37295 }, { "epoch": 6.084828711256117, "grad_norm": 0.327179491519928, "learning_rate": 1.993317538776988e-05, "loss": 0.1426, "num_input_tokens_seen": 80513808, "step": 37300 }, { "epoch": 6.085644371941273, "grad_norm": 0.3292040526866913, "learning_rate": 1.9926205334181443e-05, "loss": 0.2491, "num_input_tokens_seen": 80524176, "step": 37305 }, { "epoch": 6.0864600326264275, "grad_norm": 0.0920260027050972, "learning_rate": 1.9919235691900526e-05, "loss": 0.0907, "num_input_tokens_seen": 80536176, "step": 37310 }, { "epoch": 6.087275693311582, "grad_norm": 0.026817047968506813, "learning_rate": 1.991226646149216e-05, "loss": 0.0177, "num_input_tokens_seen": 80546960, "step": 37315 }, { "epoch": 6.088091353996737, "grad_norm": 4.843094825744629, "learning_rate": 1.9905297643521287e-05, "loss": 0.1909, "num_input_tokens_seen": 80557936, "step": 37320 }, { "epoch": 6.088907014681892, "grad_norm": 8.50800609588623, "learning_rate": 1.9898329238552838e-05, "loss": 0.0974, "num_input_tokens_seen": 80567728, "step": 37325 }, { "epoch": 6.089722675367048, "grad_norm": 0.040082208812236786, "learning_rate": 1.9891361247151706e-05, "loss": 0.1036, "num_input_tokens_seen": 80579184, "step": 37330 }, { "epoch": 6.0905383360522025, "grad_norm": 6.693157196044922, "learning_rate": 1.9884393669882752e-05, "loss": 0.0201, "num_input_tokens_seen": 80589872, "step": 37335 }, { "epoch": 6.091353996737357, "grad_norm": 0.11779052764177322, "learning_rate": 1.9877426507310802e-05, "loss": 0.0045, "num_input_tokens_seen": 80599888, "step": 37340 }, { "epoch": 6.092169657422512, "grad_norm": 4.852125644683838, "learning_rate": 1.9870459760000654e-05, "loss": 0.1395, "num_input_tokens_seen": 80610384, "step": 37345 }, { "epoch": 6.092985318107667, "grad_norm": 0.09516799449920654, "learning_rate": 1.9863493428517066e-05, "loss": 0.1118, "num_input_tokens_seen": 80621456, "step": 37350 }, { "epoch": 6.093800978792822, "grad_norm": 0.06654660403728485, "learning_rate": 1.985652751342476e-05, "loss": 0.0973, "num_input_tokens_seen": 80632208, "step": 37355 }, { "epoch": 6.0946166394779775, "grad_norm": 0.09963220357894897, "learning_rate": 1.984956201528843e-05, "loss": 0.0038, "num_input_tokens_seen": 80642736, "step": 37360 }, { "epoch": 6.095432300163132, "grad_norm": 3.8635575771331787, "learning_rate": 1.984259693467274e-05, "loss": 0.2489, "num_input_tokens_seen": 80654064, "step": 37365 }, { "epoch": 6.096247960848287, "grad_norm": 7.422365188598633, "learning_rate": 1.9835632272142305e-05, "loss": 0.0577, "num_input_tokens_seen": 80664880, "step": 37370 }, { "epoch": 6.097063621533442, "grad_norm": 0.2181326150894165, "learning_rate": 1.9828668028261726e-05, "loss": 0.0038, "num_input_tokens_seen": 80676720, "step": 37375 }, { "epoch": 6.097879282218597, "grad_norm": 0.5527575612068176, "learning_rate": 1.9821704203595554e-05, "loss": 0.124, "num_input_tokens_seen": 80688240, "step": 37380 }, { "epoch": 6.0986949429037525, "grad_norm": 0.1534019410610199, "learning_rate": 1.9814740798708316e-05, "loss": 0.0278, "num_input_tokens_seen": 80699216, "step": 37385 }, { "epoch": 6.099510603588907, "grad_norm": 0.16948339343070984, "learning_rate": 1.98077778141645e-05, "loss": 0.174, "num_input_tokens_seen": 80710192, "step": 37390 }, { "epoch": 6.100326264274062, "grad_norm": 3.207892894744873, "learning_rate": 1.9800815250528557e-05, "loss": 0.2233, "num_input_tokens_seen": 80720880, "step": 37395 }, { "epoch": 6.101141924959217, "grad_norm": 0.2363218069076538, "learning_rate": 1.979385310836491e-05, "loss": 0.0034, "num_input_tokens_seen": 80731760, "step": 37400 }, { "epoch": 6.101957585644372, "grad_norm": 0.11209364235401154, "learning_rate": 1.9786891388237945e-05, "loss": 0.006, "num_input_tokens_seen": 80741712, "step": 37405 }, { "epoch": 6.102773246329527, "grad_norm": 5.0404815673828125, "learning_rate": 1.9779930090712017e-05, "loss": 0.2451, "num_input_tokens_seen": 80753264, "step": 37410 }, { "epoch": 6.103588907014682, "grad_norm": 4.946499824523926, "learning_rate": 1.9772969216351433e-05, "loss": 0.0371, "num_input_tokens_seen": 80764816, "step": 37415 }, { "epoch": 6.104404567699837, "grad_norm": 12.903267860412598, "learning_rate": 1.9766008765720493e-05, "loss": 0.3257, "num_input_tokens_seen": 80774320, "step": 37420 }, { "epoch": 6.105220228384992, "grad_norm": 0.01140553504228592, "learning_rate": 1.975904873938344e-05, "loss": 0.0061, "num_input_tokens_seen": 80784976, "step": 37425 }, { "epoch": 6.106035889070147, "grad_norm": 0.08069048821926117, "learning_rate": 1.9752089137904492e-05, "loss": 0.0974, "num_input_tokens_seen": 80794672, "step": 37430 }, { "epoch": 6.1068515497553015, "grad_norm": 0.1736743301153183, "learning_rate": 1.9745129961847824e-05, "loss": 0.0065, "num_input_tokens_seen": 80805424, "step": 37435 }, { "epoch": 6.107667210440456, "grad_norm": 5.19975471496582, "learning_rate": 1.9738171211777584e-05, "loss": 0.0125, "num_input_tokens_seen": 80816240, "step": 37440 }, { "epoch": 6.108482871125612, "grad_norm": 0.08598069101572037, "learning_rate": 1.9731212888257883e-05, "loss": 0.1493, "num_input_tokens_seen": 80825744, "step": 37445 }, { "epoch": 6.109298531810767, "grad_norm": 2.454923629760742, "learning_rate": 1.97242549918528e-05, "loss": 0.0927, "num_input_tokens_seen": 80835984, "step": 37450 }, { "epoch": 6.110114192495922, "grad_norm": 16.96182632446289, "learning_rate": 1.9717297523126373e-05, "loss": 0.0505, "num_input_tokens_seen": 80847184, "step": 37455 }, { "epoch": 6.1109298531810765, "grad_norm": 0.0514359325170517, "learning_rate": 1.9710340482642615e-05, "loss": 0.0323, "num_input_tokens_seen": 80858608, "step": 37460 }, { "epoch": 6.111745513866231, "grad_norm": 0.05719045549631119, "learning_rate": 1.9703383870965496e-05, "loss": 0.1911, "num_input_tokens_seen": 80869584, "step": 37465 }, { "epoch": 6.112561174551387, "grad_norm": 3.358921766281128, "learning_rate": 1.969642768865896e-05, "loss": 0.1088, "num_input_tokens_seen": 80879408, "step": 37470 }, { "epoch": 6.113376835236542, "grad_norm": 0.10545467585325241, "learning_rate": 1.9689471936286902e-05, "loss": 0.1177, "num_input_tokens_seen": 80890032, "step": 37475 }, { "epoch": 6.114192495921697, "grad_norm": 0.15293854475021362, "learning_rate": 1.9682516614413194e-05, "loss": 0.1575, "num_input_tokens_seen": 80902160, "step": 37480 }, { "epoch": 6.1150081566068515, "grad_norm": 0.06207267940044403, "learning_rate": 1.967556172360167e-05, "loss": 0.2369, "num_input_tokens_seen": 80914032, "step": 37485 }, { "epoch": 6.115823817292006, "grad_norm": 0.340839684009552, "learning_rate": 1.966860726441613e-05, "loss": 0.0175, "num_input_tokens_seen": 80924912, "step": 37490 }, { "epoch": 6.116639477977161, "grad_norm": 0.25051626563072205, "learning_rate": 1.9661653237420337e-05, "loss": 0.1899, "num_input_tokens_seen": 80934544, "step": 37495 }, { "epoch": 6.117455138662317, "grad_norm": 4.973474979400635, "learning_rate": 1.9654699643178016e-05, "loss": 0.1761, "num_input_tokens_seen": 80945456, "step": 37500 }, { "epoch": 6.118270799347472, "grad_norm": 0.14203764498233795, "learning_rate": 1.9647746482252866e-05, "loss": 0.0928, "num_input_tokens_seen": 80956368, "step": 37505 }, { "epoch": 6.1190864600326265, "grad_norm": 0.04542820528149605, "learning_rate": 1.9640793755208542e-05, "loss": 0.0037, "num_input_tokens_seen": 80967280, "step": 37510 }, { "epoch": 6.119902120717781, "grad_norm": 0.0741165429353714, "learning_rate": 1.9633841462608664e-05, "loss": 0.0102, "num_input_tokens_seen": 80978320, "step": 37515 }, { "epoch": 6.120717781402936, "grad_norm": 2.9917116165161133, "learning_rate": 1.9626889605016827e-05, "loss": 0.1145, "num_input_tokens_seen": 80988624, "step": 37520 }, { "epoch": 6.121533442088092, "grad_norm": 0.1031784862279892, "learning_rate": 1.9619938182996585e-05, "loss": 0.1684, "num_input_tokens_seen": 81000560, "step": 37525 }, { "epoch": 6.122349102773247, "grad_norm": 0.08577301353216171, "learning_rate": 1.961298719711145e-05, "loss": 0.003, "num_input_tokens_seen": 81011408, "step": 37530 }, { "epoch": 6.123164763458401, "grad_norm": 0.047973308712244034, "learning_rate": 1.9606036647924907e-05, "loss": 0.1376, "num_input_tokens_seen": 81023472, "step": 37535 }, { "epoch": 6.123980424143556, "grad_norm": 5.070557594299316, "learning_rate": 1.95990865360004e-05, "loss": 0.2248, "num_input_tokens_seen": 81033968, "step": 37540 }, { "epoch": 6.124796084828711, "grad_norm": 0.14994460344314575, "learning_rate": 1.9592136861901344e-05, "loss": 0.0054, "num_input_tokens_seen": 81044304, "step": 37545 }, { "epoch": 6.125611745513866, "grad_norm": 0.12661321461200714, "learning_rate": 1.9585187626191113e-05, "loss": 0.0299, "num_input_tokens_seen": 81055056, "step": 37550 }, { "epoch": 6.126427406199022, "grad_norm": 4.4868364334106445, "learning_rate": 1.9578238829433048e-05, "loss": 0.0798, "num_input_tokens_seen": 81066544, "step": 37555 }, { "epoch": 6.127243066884176, "grad_norm": 0.13730965554714203, "learning_rate": 1.9571290472190456e-05, "loss": 0.0144, "num_input_tokens_seen": 81077360, "step": 37560 }, { "epoch": 6.128058727569331, "grad_norm": 4.3978071212768555, "learning_rate": 1.95643425550266e-05, "loss": 0.0086, "num_input_tokens_seen": 81088816, "step": 37565 }, { "epoch": 6.128874388254486, "grad_norm": 0.03902905806899071, "learning_rate": 1.955739507850472e-05, "loss": 0.1386, "num_input_tokens_seen": 81101488, "step": 37570 }, { "epoch": 6.129690048939641, "grad_norm": 4.845416069030762, "learning_rate": 1.9550448043188007e-05, "loss": 0.0309, "num_input_tokens_seen": 81111568, "step": 37575 }, { "epoch": 6.130505709624796, "grad_norm": 0.12704837322235107, "learning_rate": 1.954350144963963e-05, "loss": 0.0099, "num_input_tokens_seen": 81121840, "step": 37580 }, { "epoch": 6.131321370309951, "grad_norm": 0.33101361989974976, "learning_rate": 1.9536555298422706e-05, "loss": 0.0079, "num_input_tokens_seen": 81132720, "step": 37585 }, { "epoch": 6.132137030995106, "grad_norm": 3.2499988079071045, "learning_rate": 1.9529609590100337e-05, "loss": 0.0928, "num_input_tokens_seen": 81143536, "step": 37590 }, { "epoch": 6.132952691680261, "grad_norm": 8.56700611114502, "learning_rate": 1.952266432523557e-05, "loss": 0.2774, "num_input_tokens_seen": 81154096, "step": 37595 }, { "epoch": 6.133768352365416, "grad_norm": 0.16566801071166992, "learning_rate": 1.9515719504391415e-05, "loss": 0.0043, "num_input_tokens_seen": 81165104, "step": 37600 }, { "epoch": 6.134584013050571, "grad_norm": 19.968868255615234, "learning_rate": 1.950877512813087e-05, "loss": 0.2139, "num_input_tokens_seen": 81175952, "step": 37605 }, { "epoch": 6.135399673735726, "grad_norm": 3.177694082260132, "learning_rate": 1.950183119701688e-05, "loss": 0.2195, "num_input_tokens_seen": 81186608, "step": 37610 }, { "epoch": 6.136215334420881, "grad_norm": 0.065876305103302, "learning_rate": 1.949488771161235e-05, "loss": 0.0079, "num_input_tokens_seen": 81198128, "step": 37615 }, { "epoch": 6.137030995106036, "grad_norm": 2.483712911605835, "learning_rate": 1.948794467248015e-05, "loss": 0.12, "num_input_tokens_seen": 81207792, "step": 37620 }, { "epoch": 6.137846655791191, "grad_norm": 0.3169977366924286, "learning_rate": 1.9481002080183114e-05, "loss": 0.0072, "num_input_tokens_seen": 81219152, "step": 37625 }, { "epoch": 6.138662316476346, "grad_norm": 0.12151922285556793, "learning_rate": 1.947405993528406e-05, "loss": 0.0042, "num_input_tokens_seen": 81230800, "step": 37630 }, { "epoch": 6.1394779771615005, "grad_norm": 0.17825913429260254, "learning_rate": 1.9467118238345752e-05, "loss": 0.1631, "num_input_tokens_seen": 81243536, "step": 37635 }, { "epoch": 6.140293637846656, "grad_norm": 0.031340572983026505, "learning_rate": 1.946017698993091e-05, "loss": 0.0049, "num_input_tokens_seen": 81255344, "step": 37640 }, { "epoch": 6.141109298531811, "grad_norm": 0.10641069710254669, "learning_rate": 1.945323619060223e-05, "loss": 0.0057, "num_input_tokens_seen": 81266544, "step": 37645 }, { "epoch": 6.141924959216966, "grad_norm": 10.570899963378906, "learning_rate": 1.944629584092237e-05, "loss": 0.1512, "num_input_tokens_seen": 81276976, "step": 37650 }, { "epoch": 6.142740619902121, "grad_norm": 0.10499405115842819, "learning_rate": 1.943935594145395e-05, "loss": 0.0875, "num_input_tokens_seen": 81288656, "step": 37655 }, { "epoch": 6.143556280587275, "grad_norm": 3.573042154312134, "learning_rate": 1.9432416492759548e-05, "loss": 0.2075, "num_input_tokens_seen": 81298928, "step": 37660 }, { "epoch": 6.14437194127243, "grad_norm": 0.08196359872817993, "learning_rate": 1.9425477495401716e-05, "loss": 0.1534, "num_input_tokens_seen": 81310224, "step": 37665 }, { "epoch": 6.145187601957586, "grad_norm": 0.10421831905841827, "learning_rate": 1.9418538949942962e-05, "loss": 0.0048, "num_input_tokens_seen": 81321616, "step": 37670 }, { "epoch": 6.146003262642741, "grad_norm": 0.13622203469276428, "learning_rate": 1.9411600856945763e-05, "loss": 0.1136, "num_input_tokens_seen": 81332656, "step": 37675 }, { "epoch": 6.146818923327896, "grad_norm": 5.768846035003662, "learning_rate": 1.940466321697255e-05, "loss": 0.1429, "num_input_tokens_seen": 81343440, "step": 37680 }, { "epoch": 6.14763458401305, "grad_norm": 1.4352920055389404, "learning_rate": 1.9397726030585726e-05, "loss": 0.2253, "num_input_tokens_seen": 81354672, "step": 37685 }, { "epoch": 6.148450244698205, "grad_norm": 0.26600053906440735, "learning_rate": 1.939078929834766e-05, "loss": 0.0978, "num_input_tokens_seen": 81365808, "step": 37690 }, { "epoch": 6.149265905383361, "grad_norm": 0.7572985887527466, "learning_rate": 1.9383853020820674e-05, "loss": 0.0255, "num_input_tokens_seen": 81375856, "step": 37695 }, { "epoch": 6.150081566068516, "grad_norm": 0.920245885848999, "learning_rate": 1.9376917198567058e-05, "loss": 0.0044, "num_input_tokens_seen": 81387184, "step": 37700 }, { "epoch": 6.150897226753671, "grad_norm": 0.09726397693157196, "learning_rate": 1.9369981832149064e-05, "loss": 0.0946, "num_input_tokens_seen": 81397712, "step": 37705 }, { "epoch": 6.151712887438825, "grad_norm": 0.1214255541563034, "learning_rate": 1.936304692212891e-05, "loss": 0.1656, "num_input_tokens_seen": 81408592, "step": 37710 }, { "epoch": 6.15252854812398, "grad_norm": 7.828940391540527, "learning_rate": 1.9356112469068776e-05, "loss": 0.1424, "num_input_tokens_seen": 81420176, "step": 37715 }, { "epoch": 6.153344208809135, "grad_norm": 0.5535727143287659, "learning_rate": 1.93491784735308e-05, "loss": 0.116, "num_input_tokens_seen": 81431920, "step": 37720 }, { "epoch": 6.154159869494291, "grad_norm": 0.5543473958969116, "learning_rate": 1.934224493607709e-05, "loss": 0.2108, "num_input_tokens_seen": 81443312, "step": 37725 }, { "epoch": 6.1549755301794455, "grad_norm": 0.05757756158709526, "learning_rate": 1.933531185726971e-05, "loss": 0.0058, "num_input_tokens_seen": 81454736, "step": 37730 }, { "epoch": 6.1557911908646, "grad_norm": 4.306589603424072, "learning_rate": 1.9328379237670684e-05, "loss": 0.4875, "num_input_tokens_seen": 81465968, "step": 37735 }, { "epoch": 6.156606851549755, "grad_norm": 0.12160925567150116, "learning_rate": 1.9321447077842026e-05, "loss": 0.1139, "num_input_tokens_seen": 81477264, "step": 37740 }, { "epoch": 6.15742251223491, "grad_norm": 0.12456405162811279, "learning_rate": 1.931451537834568e-05, "loss": 0.0267, "num_input_tokens_seen": 81488080, "step": 37745 }, { "epoch": 6.158238172920065, "grad_norm": 0.10891862213611603, "learning_rate": 1.9307584139743564e-05, "loss": 0.185, "num_input_tokens_seen": 81499536, "step": 37750 }, { "epoch": 6.1590538336052205, "grad_norm": 0.15445049107074738, "learning_rate": 1.9300653362597564e-05, "loss": 0.013, "num_input_tokens_seen": 81511184, "step": 37755 }, { "epoch": 6.159869494290375, "grad_norm": 0.16697649657726288, "learning_rate": 1.929372304746952e-05, "loss": 0.4588, "num_input_tokens_seen": 81521648, "step": 37760 }, { "epoch": 6.16068515497553, "grad_norm": 0.06527233868837357, "learning_rate": 1.928679319492124e-05, "loss": 0.1165, "num_input_tokens_seen": 81532016, "step": 37765 }, { "epoch": 6.161500815660685, "grad_norm": 6.2396416664123535, "learning_rate": 1.9279863805514482e-05, "loss": 0.0152, "num_input_tokens_seen": 81543440, "step": 37770 }, { "epoch": 6.16231647634584, "grad_norm": 4.716691970825195, "learning_rate": 1.9272934879810994e-05, "loss": 0.1288, "num_input_tokens_seen": 81553680, "step": 37775 }, { "epoch": 6.1631321370309955, "grad_norm": 0.41060569882392883, "learning_rate": 1.9266006418372464e-05, "loss": 0.0061, "num_input_tokens_seen": 81564944, "step": 37780 }, { "epoch": 6.16394779771615, "grad_norm": 0.46858978271484375, "learning_rate": 1.925907842176055e-05, "loss": 0.1307, "num_input_tokens_seen": 81575376, "step": 37785 }, { "epoch": 6.164763458401305, "grad_norm": 4.114612102508545, "learning_rate": 1.925215089053687e-05, "loss": 0.1889, "num_input_tokens_seen": 81585840, "step": 37790 }, { "epoch": 6.16557911908646, "grad_norm": 6.160334587097168, "learning_rate": 1.9245223825262997e-05, "loss": 0.046, "num_input_tokens_seen": 81597072, "step": 37795 }, { "epoch": 6.166394779771615, "grad_norm": 0.13958598673343658, "learning_rate": 1.9238297226500483e-05, "loss": 0.1977, "num_input_tokens_seen": 81608336, "step": 37800 }, { "epoch": 6.16721044045677, "grad_norm": 0.1675865799188614, "learning_rate": 1.923137109481083e-05, "loss": 0.1069, "num_input_tokens_seen": 81619440, "step": 37805 }, { "epoch": 6.168026101141925, "grad_norm": 0.4309624433517456, "learning_rate": 1.9224445430755507e-05, "loss": 0.1008, "num_input_tokens_seen": 81629648, "step": 37810 }, { "epoch": 6.16884176182708, "grad_norm": 0.09904403984546661, "learning_rate": 1.9217520234895943e-05, "loss": 0.271, "num_input_tokens_seen": 81639408, "step": 37815 }, { "epoch": 6.169657422512235, "grad_norm": 0.6715161800384521, "learning_rate": 1.9210595507793526e-05, "loss": 0.0853, "num_input_tokens_seen": 81650704, "step": 37820 }, { "epoch": 6.17047308319739, "grad_norm": 0.08807911723852158, "learning_rate": 1.9203671250009612e-05, "loss": 0.0105, "num_input_tokens_seen": 81661328, "step": 37825 }, { "epoch": 6.171288743882545, "grad_norm": 0.11950883269309998, "learning_rate": 1.9196747462105517e-05, "loss": 0.0106, "num_input_tokens_seen": 81671728, "step": 37830 }, { "epoch": 6.1721044045677, "grad_norm": 4.313073635101318, "learning_rate": 1.918982414464252e-05, "loss": 0.0249, "num_input_tokens_seen": 81681936, "step": 37835 }, { "epoch": 6.172920065252855, "grad_norm": 12.713224411010742, "learning_rate": 1.918290129818185e-05, "loss": 0.1223, "num_input_tokens_seen": 81692592, "step": 37840 }, { "epoch": 6.17373572593801, "grad_norm": 10.112577438354492, "learning_rate": 1.9175978923284727e-05, "loss": 0.0976, "num_input_tokens_seen": 81703696, "step": 37845 }, { "epoch": 6.174551386623165, "grad_norm": 0.14462950825691223, "learning_rate": 1.91690570205123e-05, "loss": 0.0598, "num_input_tokens_seen": 81714448, "step": 37850 }, { "epoch": 6.1753670473083195, "grad_norm": 0.09021087735891342, "learning_rate": 1.916213559042569e-05, "loss": 0.1472, "num_input_tokens_seen": 81724560, "step": 37855 }, { "epoch": 6.176182707993474, "grad_norm": 0.21085111796855927, "learning_rate": 1.9155214633586e-05, "loss": 0.0051, "num_input_tokens_seen": 81734384, "step": 37860 }, { "epoch": 6.17699836867863, "grad_norm": 0.23198255896568298, "learning_rate": 1.9148294150554266e-05, "loss": 0.0678, "num_input_tokens_seen": 81745552, "step": 37865 }, { "epoch": 6.177814029363785, "grad_norm": 0.14588935673236847, "learning_rate": 1.9141374141891498e-05, "loss": 0.0212, "num_input_tokens_seen": 81756144, "step": 37870 }, { "epoch": 6.17862969004894, "grad_norm": 4.942633152008057, "learning_rate": 1.913445460815867e-05, "loss": 0.1059, "num_input_tokens_seen": 81767024, "step": 37875 }, { "epoch": 6.1794453507340945, "grad_norm": 6.1705217361450195, "learning_rate": 1.9127535549916715e-05, "loss": 0.1619, "num_input_tokens_seen": 81777488, "step": 37880 }, { "epoch": 6.180261011419249, "grad_norm": 0.34688469767570496, "learning_rate": 1.912061696772652e-05, "loss": 0.2104, "num_input_tokens_seen": 81788624, "step": 37885 }, { "epoch": 6.181076672104404, "grad_norm": 0.09854024648666382, "learning_rate": 1.911369886214895e-05, "loss": 0.0537, "num_input_tokens_seen": 81798224, "step": 37890 }, { "epoch": 6.18189233278956, "grad_norm": 0.08823223412036896, "learning_rate": 1.9106781233744813e-05, "loss": 0.0121, "num_input_tokens_seen": 81808976, "step": 37895 }, { "epoch": 6.182707993474715, "grad_norm": 4.729548931121826, "learning_rate": 1.9099864083074892e-05, "loss": 0.1242, "num_input_tokens_seen": 81819760, "step": 37900 }, { "epoch": 6.1835236541598695, "grad_norm": 0.09086601436138153, "learning_rate": 1.9092947410699927e-05, "loss": 0.1173, "num_input_tokens_seen": 81830416, "step": 37905 }, { "epoch": 6.184339314845024, "grad_norm": 2.592741012573242, "learning_rate": 1.9086031217180618e-05, "loss": 0.1232, "num_input_tokens_seen": 81842416, "step": 37910 }, { "epoch": 6.185154975530179, "grad_norm": 0.08322153985500336, "learning_rate": 1.9079115503077617e-05, "loss": 0.0227, "num_input_tokens_seen": 81852080, "step": 37915 }, { "epoch": 6.185970636215335, "grad_norm": 0.15700191259384155, "learning_rate": 1.9072200268951562e-05, "loss": 0.1221, "num_input_tokens_seen": 81862096, "step": 37920 }, { "epoch": 6.18678629690049, "grad_norm": 6.111955642700195, "learning_rate": 1.906528551536303e-05, "loss": 0.0185, "num_input_tokens_seen": 81872816, "step": 37925 }, { "epoch": 6.1876019575856445, "grad_norm": 3.117820978164673, "learning_rate": 1.905837124287257e-05, "loss": 0.1134, "num_input_tokens_seen": 81883120, "step": 37930 }, { "epoch": 6.188417618270799, "grad_norm": 0.17419977486133575, "learning_rate": 1.905145745204068e-05, "loss": 0.0966, "num_input_tokens_seen": 81893904, "step": 37935 }, { "epoch": 6.189233278955954, "grad_norm": 0.053510863333940506, "learning_rate": 1.9044544143427832e-05, "loss": 0.1066, "num_input_tokens_seen": 81904368, "step": 37940 }, { "epoch": 6.190048939641109, "grad_norm": 0.12100216001272202, "learning_rate": 1.9037631317594445e-05, "loss": 0.0067, "num_input_tokens_seen": 81915664, "step": 37945 }, { "epoch": 6.190864600326265, "grad_norm": 0.14096873998641968, "learning_rate": 1.9030718975100927e-05, "loss": 0.0047, "num_input_tokens_seen": 81926736, "step": 37950 }, { "epoch": 6.191680261011419, "grad_norm": 0.059876978397369385, "learning_rate": 1.9023807116507615e-05, "loss": 0.0085, "num_input_tokens_seen": 81937552, "step": 37955 }, { "epoch": 6.192495921696574, "grad_norm": 0.12152976542711258, "learning_rate": 1.9016895742374824e-05, "loss": 0.2442, "num_input_tokens_seen": 81947120, "step": 37960 }, { "epoch": 6.193311582381729, "grad_norm": 2.926598072052002, "learning_rate": 1.900998485326282e-05, "loss": 0.1074, "num_input_tokens_seen": 81955216, "step": 37965 }, { "epoch": 6.194127243066884, "grad_norm": 0.1282525509595871, "learning_rate": 1.9003074449731835e-05, "loss": 0.224, "num_input_tokens_seen": 81965360, "step": 37970 }, { "epoch": 6.19494290375204, "grad_norm": 15.548892974853516, "learning_rate": 1.8996164532342065e-05, "loss": 0.1526, "num_input_tokens_seen": 81976112, "step": 37975 }, { "epoch": 6.195758564437194, "grad_norm": 4.837267875671387, "learning_rate": 1.8989255101653662e-05, "loss": 0.0853, "num_input_tokens_seen": 81987568, "step": 37980 }, { "epoch": 6.196574225122349, "grad_norm": 4.110472679138184, "learning_rate": 1.898234615822674e-05, "loss": 0.191, "num_input_tokens_seen": 81998096, "step": 37985 }, { "epoch": 6.197389885807504, "grad_norm": 0.08067582547664642, "learning_rate": 1.8975437702621368e-05, "loss": 0.0045, "num_input_tokens_seen": 82009872, "step": 37990 }, { "epoch": 6.198205546492659, "grad_norm": 0.13552775979042053, "learning_rate": 1.8968529735397582e-05, "loss": 0.093, "num_input_tokens_seen": 82020080, "step": 37995 }, { "epoch": 6.199021207177814, "grad_norm": 0.049594298005104065, "learning_rate": 1.896162225711538e-05, "loss": 0.0054, "num_input_tokens_seen": 82030416, "step": 38000 }, { "epoch": 6.199836867862969, "grad_norm": 0.14990796148777008, "learning_rate": 1.895471526833472e-05, "loss": 0.0102, "num_input_tokens_seen": 82041296, "step": 38005 }, { "epoch": 6.200652528548124, "grad_norm": 0.11612722277641296, "learning_rate": 1.8947808769615512e-05, "loss": 0.051, "num_input_tokens_seen": 82051024, "step": 38010 }, { "epoch": 6.201468189233279, "grad_norm": 0.11736045777797699, "learning_rate": 1.8940902761517638e-05, "loss": 0.3306, "num_input_tokens_seen": 82062288, "step": 38015 }, { "epoch": 6.202283849918434, "grad_norm": 0.05441810190677643, "learning_rate": 1.8933997244600923e-05, "loss": 0.0059, "num_input_tokens_seen": 82073808, "step": 38020 }, { "epoch": 6.203099510603589, "grad_norm": 0.08446001261472702, "learning_rate": 1.8927092219425174e-05, "loss": 0.0974, "num_input_tokens_seen": 82083728, "step": 38025 }, { "epoch": 6.2039151712887435, "grad_norm": 7.407721519470215, "learning_rate": 1.892018768655014e-05, "loss": 0.2053, "num_input_tokens_seen": 82095600, "step": 38030 }, { "epoch": 6.204730831973899, "grad_norm": 0.10007230192422867, "learning_rate": 1.8913283646535547e-05, "loss": 0.059, "num_input_tokens_seen": 82105136, "step": 38035 }, { "epoch": 6.205546492659054, "grad_norm": 0.07161112129688263, "learning_rate": 1.890638009994106e-05, "loss": 0.0057, "num_input_tokens_seen": 82115504, "step": 38040 }, { "epoch": 6.206362153344209, "grad_norm": 2.146233558654785, "learning_rate": 1.889947704732632e-05, "loss": 0.2134, "num_input_tokens_seen": 82126864, "step": 38045 }, { "epoch": 6.207177814029364, "grad_norm": 0.19240115582942963, "learning_rate": 1.889257448925093e-05, "loss": 0.14, "num_input_tokens_seen": 82137520, "step": 38050 }, { "epoch": 6.2079934747145185, "grad_norm": 0.16441723704338074, "learning_rate": 1.8885672426274424e-05, "loss": 0.0147, "num_input_tokens_seen": 82148976, "step": 38055 }, { "epoch": 6.208809135399674, "grad_norm": 0.1422182321548462, "learning_rate": 1.8878770858956353e-05, "loss": 0.0508, "num_input_tokens_seen": 82160496, "step": 38060 }, { "epoch": 6.209624796084829, "grad_norm": 3.2957746982574463, "learning_rate": 1.8871869787856166e-05, "loss": 0.1198, "num_input_tokens_seen": 82171440, "step": 38065 }, { "epoch": 6.210440456769984, "grad_norm": 0.1262081265449524, "learning_rate": 1.886496921353331e-05, "loss": 0.2352, "num_input_tokens_seen": 82182704, "step": 38070 }, { "epoch": 6.211256117455139, "grad_norm": 0.09778767079114914, "learning_rate": 1.8858069136547186e-05, "loss": 0.0512, "num_input_tokens_seen": 82193392, "step": 38075 }, { "epoch": 6.212071778140293, "grad_norm": 4.510008811950684, "learning_rate": 1.8851169557457128e-05, "loss": 0.224, "num_input_tokens_seen": 82204080, "step": 38080 }, { "epoch": 6.212887438825448, "grad_norm": 0.08541733026504517, "learning_rate": 1.8844270476822473e-05, "loss": 0.0058, "num_input_tokens_seen": 82215440, "step": 38085 }, { "epoch": 6.213703099510604, "grad_norm": 2.2472951412200928, "learning_rate": 1.883737189520249e-05, "loss": 0.1546, "num_input_tokens_seen": 82226832, "step": 38090 }, { "epoch": 6.214518760195759, "grad_norm": 0.031898822635412216, "learning_rate": 1.88304738131564e-05, "loss": 0.0042, "num_input_tokens_seen": 82237296, "step": 38095 }, { "epoch": 6.215334420880914, "grad_norm": 0.3057255446910858, "learning_rate": 1.8823576231243418e-05, "loss": 0.1126, "num_input_tokens_seen": 82248400, "step": 38100 }, { "epoch": 6.216150081566068, "grad_norm": 0.13367220759391785, "learning_rate": 1.881667915002268e-05, "loss": 0.1254, "num_input_tokens_seen": 82259088, "step": 38105 }, { "epoch": 6.216965742251223, "grad_norm": 0.06978476792573929, "learning_rate": 1.8809782570053304e-05, "loss": 0.1414, "num_input_tokens_seen": 82269424, "step": 38110 }, { "epoch": 6.217781402936378, "grad_norm": 24.588245391845703, "learning_rate": 1.880288649189436e-05, "loss": 0.246, "num_input_tokens_seen": 82280144, "step": 38115 }, { "epoch": 6.218597063621534, "grad_norm": 0.10955842584371567, "learning_rate": 1.8795990916104886e-05, "loss": 0.0894, "num_input_tokens_seen": 82290672, "step": 38120 }, { "epoch": 6.219412724306689, "grad_norm": 0.8793649077415466, "learning_rate": 1.8789095843243863e-05, "loss": 0.133, "num_input_tokens_seen": 82301456, "step": 38125 }, { "epoch": 6.220228384991843, "grad_norm": 0.9195159673690796, "learning_rate": 1.878220127387025e-05, "loss": 0.0103, "num_input_tokens_seen": 82311696, "step": 38130 }, { "epoch": 6.221044045676998, "grad_norm": 4.79920768737793, "learning_rate": 1.8775307208542946e-05, "loss": 0.0985, "num_input_tokens_seen": 82322320, "step": 38135 }, { "epoch": 6.221859706362153, "grad_norm": 1.8701971769332886, "learning_rate": 1.8768413647820817e-05, "loss": 0.0674, "num_input_tokens_seen": 82332976, "step": 38140 }, { "epoch": 6.222675367047309, "grad_norm": 6.5250244140625, "learning_rate": 1.8761520592262704e-05, "loss": 0.3446, "num_input_tokens_seen": 82343920, "step": 38145 }, { "epoch": 6.2234910277324635, "grad_norm": 0.5002713799476624, "learning_rate": 1.8754628042427387e-05, "loss": 0.0098, "num_input_tokens_seen": 82354928, "step": 38150 }, { "epoch": 6.224306688417618, "grad_norm": 0.15075264871120453, "learning_rate": 1.8747735998873604e-05, "loss": 0.0087, "num_input_tokens_seen": 82366544, "step": 38155 }, { "epoch": 6.225122349102773, "grad_norm": 0.19868357479572296, "learning_rate": 1.8740844462160064e-05, "loss": 0.0044, "num_input_tokens_seen": 82377520, "step": 38160 }, { "epoch": 6.225938009787928, "grad_norm": 0.09077688306570053, "learning_rate": 1.873395343284543e-05, "loss": 0.008, "num_input_tokens_seen": 82388240, "step": 38165 }, { "epoch": 6.226753670473083, "grad_norm": 0.11144839972257614, "learning_rate": 1.872706291148833e-05, "loss": 0.2211, "num_input_tokens_seen": 82399984, "step": 38170 }, { "epoch": 6.2275693311582385, "grad_norm": 0.09272262454032898, "learning_rate": 1.8720172898647338e-05, "loss": 0.1108, "num_input_tokens_seen": 82410992, "step": 38175 }, { "epoch": 6.228384991843393, "grad_norm": 0.11452114582061768, "learning_rate": 1.8713283394880993e-05, "loss": 0.0093, "num_input_tokens_seen": 82422224, "step": 38180 }, { "epoch": 6.229200652528548, "grad_norm": 15.37417984008789, "learning_rate": 1.8706394400747796e-05, "loss": 0.0592, "num_input_tokens_seen": 82432624, "step": 38185 }, { "epoch": 6.230016313213703, "grad_norm": 0.6620145440101624, "learning_rate": 1.8699505916806205e-05, "loss": 0.1835, "num_input_tokens_seen": 82443696, "step": 38190 }, { "epoch": 6.230831973898858, "grad_norm": 0.16932103037834167, "learning_rate": 1.869261794361463e-05, "loss": 0.049, "num_input_tokens_seen": 82454736, "step": 38195 }, { "epoch": 6.231647634584013, "grad_norm": 2.5973780155181885, "learning_rate": 1.8685730481731444e-05, "loss": 0.2247, "num_input_tokens_seen": 82465072, "step": 38200 }, { "epoch": 6.232463295269168, "grad_norm": 0.0736318901181221, "learning_rate": 1.867884353171499e-05, "loss": 0.0141, "num_input_tokens_seen": 82475760, "step": 38205 }, { "epoch": 6.233278955954323, "grad_norm": 0.14232690632343292, "learning_rate": 1.867195709412355e-05, "loss": 0.1354, "num_input_tokens_seen": 82487056, "step": 38210 }, { "epoch": 6.234094616639478, "grad_norm": 0.03688379377126694, "learning_rate": 1.8665071169515375e-05, "loss": 0.1439, "num_input_tokens_seen": 82498544, "step": 38215 }, { "epoch": 6.234910277324633, "grad_norm": 0.058887895196676254, "learning_rate": 1.8658185758448676e-05, "loss": 0.2208, "num_input_tokens_seen": 82510096, "step": 38220 }, { "epoch": 6.235725938009788, "grad_norm": 0.29461947083473206, "learning_rate": 1.8651300861481614e-05, "loss": 0.007, "num_input_tokens_seen": 82520752, "step": 38225 }, { "epoch": 6.236541598694943, "grad_norm": 0.17150340974330902, "learning_rate": 1.8644416479172316e-05, "loss": 0.0939, "num_input_tokens_seen": 82531760, "step": 38230 }, { "epoch": 6.237357259380098, "grad_norm": 0.05188937112689018, "learning_rate": 1.8637532612078872e-05, "loss": 0.0077, "num_input_tokens_seen": 82542576, "step": 38235 }, { "epoch": 6.238172920065253, "grad_norm": 0.08637218177318573, "learning_rate": 1.8630649260759315e-05, "loss": 0.1278, "num_input_tokens_seen": 82553168, "step": 38240 }, { "epoch": 6.238988580750408, "grad_norm": 0.11446234583854675, "learning_rate": 1.8623766425771648e-05, "loss": 0.0941, "num_input_tokens_seen": 82563280, "step": 38245 }, { "epoch": 6.239804241435563, "grad_norm": 0.056999627500772476, "learning_rate": 1.8616884107673823e-05, "loss": 0.1353, "num_input_tokens_seen": 82575120, "step": 38250 }, { "epoch": 6.240619902120717, "grad_norm": 22.947662353515625, "learning_rate": 1.8610002307023767e-05, "loss": 0.1476, "num_input_tokens_seen": 82586032, "step": 38255 }, { "epoch": 6.241435562805873, "grad_norm": 16.191913604736328, "learning_rate": 1.860312102437934e-05, "loss": 0.155, "num_input_tokens_seen": 82597040, "step": 38260 }, { "epoch": 6.242251223491028, "grad_norm": 0.0527874194085598, "learning_rate": 1.859624026029837e-05, "loss": 0.0062, "num_input_tokens_seen": 82608400, "step": 38265 }, { "epoch": 6.243066884176183, "grad_norm": 5.220399379730225, "learning_rate": 1.8589360015338668e-05, "loss": 0.1912, "num_input_tokens_seen": 82618000, "step": 38270 }, { "epoch": 6.2438825448613375, "grad_norm": 0.774243950843811, "learning_rate": 1.8582480290057975e-05, "loss": 0.121, "num_input_tokens_seen": 82628880, "step": 38275 }, { "epoch": 6.244698205546492, "grad_norm": 0.09298164397478104, "learning_rate": 1.8575601085013988e-05, "loss": 0.0856, "num_input_tokens_seen": 82639216, "step": 38280 }, { "epoch": 6.245513866231648, "grad_norm": 0.14385484158992767, "learning_rate": 1.8568722400764377e-05, "loss": 0.014, "num_input_tokens_seen": 82649264, "step": 38285 }, { "epoch": 6.246329526916803, "grad_norm": 0.10688867419958115, "learning_rate": 1.8561844237866756e-05, "loss": 0.1382, "num_input_tokens_seen": 82660112, "step": 38290 }, { "epoch": 6.247145187601958, "grad_norm": 0.13206766545772552, "learning_rate": 1.855496659687871e-05, "loss": 0.0149, "num_input_tokens_seen": 82671408, "step": 38295 }, { "epoch": 6.2479608482871125, "grad_norm": 0.20640970766544342, "learning_rate": 1.8548089478357774e-05, "loss": 0.0078, "num_input_tokens_seen": 82682928, "step": 38300 }, { "epoch": 6.248776508972267, "grad_norm": 0.174403578042984, "learning_rate": 1.8541212882861442e-05, "loss": 0.0919, "num_input_tokens_seen": 82694864, "step": 38305 }, { "epoch": 6.249592169657422, "grad_norm": 0.18106922507286072, "learning_rate": 1.853433681094716e-05, "loss": 0.0087, "num_input_tokens_seen": 82705296, "step": 38310 }, { "epoch": 6.250407830342578, "grad_norm": 0.28391170501708984, "learning_rate": 1.8527461263172346e-05, "loss": 0.0968, "num_input_tokens_seen": 82716304, "step": 38315 }, { "epoch": 6.251223491027733, "grad_norm": 0.04587554931640625, "learning_rate": 1.852058624009436e-05, "loss": 0.0088, "num_input_tokens_seen": 82728272, "step": 38320 }, { "epoch": 6.2520391517128875, "grad_norm": 2.93485164642334, "learning_rate": 1.8513711742270535e-05, "loss": 0.1192, "num_input_tokens_seen": 82738896, "step": 38325 }, { "epoch": 6.252854812398042, "grad_norm": 0.26074308156967163, "learning_rate": 1.8506837770258147e-05, "loss": 0.0063, "num_input_tokens_seen": 82749712, "step": 38330 }, { "epoch": 6.253670473083197, "grad_norm": 0.06070653349161148, "learning_rate": 1.8499964324614434e-05, "loss": 0.0089, "num_input_tokens_seen": 82758608, "step": 38335 }, { "epoch": 6.254486133768353, "grad_norm": 0.1550820916891098, "learning_rate": 1.8493091405896595e-05, "loss": 0.0104, "num_input_tokens_seen": 82770288, "step": 38340 }, { "epoch": 6.255301794453508, "grad_norm": 3.547386646270752, "learning_rate": 1.8486219014661782e-05, "loss": 0.1853, "num_input_tokens_seen": 82782448, "step": 38345 }, { "epoch": 6.2561174551386625, "grad_norm": 0.12808428704738617, "learning_rate": 1.8479347151467106e-05, "loss": 0.0066, "num_input_tokens_seen": 82793264, "step": 38350 }, { "epoch": 6.256933115823817, "grad_norm": 0.10375116765499115, "learning_rate": 1.8472475816869634e-05, "loss": 0.0068, "num_input_tokens_seen": 82804688, "step": 38355 }, { "epoch": 6.257748776508972, "grad_norm": 0.1232416108250618, "learning_rate": 1.8465605011426395e-05, "loss": 0.003, "num_input_tokens_seen": 82816336, "step": 38360 }, { "epoch": 6.258564437194127, "grad_norm": 0.08774631470441818, "learning_rate": 1.8458734735694366e-05, "loss": 0.0063, "num_input_tokens_seen": 82827408, "step": 38365 }, { "epoch": 6.259380097879283, "grad_norm": 3.3483340740203857, "learning_rate": 1.8451864990230488e-05, "loss": 0.1968, "num_input_tokens_seen": 82838128, "step": 38370 }, { "epoch": 6.260195758564437, "grad_norm": 0.05783867463469505, "learning_rate": 1.8444995775591654e-05, "loss": 0.0254, "num_input_tokens_seen": 82849072, "step": 38375 }, { "epoch": 6.261011419249592, "grad_norm": 0.1645868420600891, "learning_rate": 1.8438127092334732e-05, "loss": 0.0048, "num_input_tokens_seen": 82859088, "step": 38380 }, { "epoch": 6.261827079934747, "grad_norm": 0.08044086396694183, "learning_rate": 1.843125894101652e-05, "loss": 0.0045, "num_input_tokens_seen": 82870480, "step": 38385 }, { "epoch": 6.262642740619902, "grad_norm": 3.3642702102661133, "learning_rate": 1.8424391322193787e-05, "loss": 0.1054, "num_input_tokens_seen": 82880720, "step": 38390 }, { "epoch": 6.263458401305057, "grad_norm": 0.07266976684331894, "learning_rate": 1.8417524236423257e-05, "loss": 0.0084, "num_input_tokens_seen": 82891536, "step": 38395 }, { "epoch": 6.264274061990212, "grad_norm": 13.846406936645508, "learning_rate": 1.8410657684261613e-05, "loss": 0.2432, "num_input_tokens_seen": 82903056, "step": 38400 }, { "epoch": 6.265089722675367, "grad_norm": 6.636152744293213, "learning_rate": 1.840379166626549e-05, "loss": 0.1464, "num_input_tokens_seen": 82912912, "step": 38405 }, { "epoch": 6.265905383360522, "grad_norm": 7.255515098571777, "learning_rate": 1.8396926182991485e-05, "loss": 0.1737, "num_input_tokens_seen": 82923664, "step": 38410 }, { "epoch": 6.266721044045677, "grad_norm": 0.1796352118253708, "learning_rate": 1.8390061234996147e-05, "loss": 0.0075, "num_input_tokens_seen": 82934256, "step": 38415 }, { "epoch": 6.267536704730832, "grad_norm": 4.006227016448975, "learning_rate": 1.8383196822835984e-05, "loss": 0.2556, "num_input_tokens_seen": 82945712, "step": 38420 }, { "epoch": 6.268352365415987, "grad_norm": 0.11999749392271042, "learning_rate": 1.837633294706746e-05, "loss": 0.01, "num_input_tokens_seen": 82956720, "step": 38425 }, { "epoch": 6.269168026101142, "grad_norm": 0.09215438365936279, "learning_rate": 1.8369469608246993e-05, "loss": 0.0551, "num_input_tokens_seen": 82967856, "step": 38430 }, { "epoch": 6.269983686786297, "grad_norm": 0.3104093372821808, "learning_rate": 1.8362606806930964e-05, "loss": 0.0051, "num_input_tokens_seen": 82979472, "step": 38435 }, { "epoch": 6.270799347471452, "grad_norm": 0.18833382427692413, "learning_rate": 1.835574454367571e-05, "loss": 0.0042, "num_input_tokens_seen": 82989264, "step": 38440 }, { "epoch": 6.271615008156607, "grad_norm": 0.8444291949272156, "learning_rate": 1.834888281903751e-05, "loss": 0.1443, "num_input_tokens_seen": 83000368, "step": 38445 }, { "epoch": 6.2724306688417615, "grad_norm": 0.038760919123888016, "learning_rate": 1.8342021633572617e-05, "loss": 0.0711, "num_input_tokens_seen": 83011376, "step": 38450 }, { "epoch": 6.273246329526917, "grad_norm": 0.12532900273799896, "learning_rate": 1.833516098783723e-05, "loss": 0.1228, "num_input_tokens_seen": 83021008, "step": 38455 }, { "epoch": 6.274061990212072, "grad_norm": 0.0743989646434784, "learning_rate": 1.832830088238751e-05, "loss": 0.1924, "num_input_tokens_seen": 83031888, "step": 38460 }, { "epoch": 6.274877650897227, "grad_norm": 0.13158778846263885, "learning_rate": 1.832144131777958e-05, "loss": 0.1937, "num_input_tokens_seen": 83042032, "step": 38465 }, { "epoch": 6.275693311582382, "grad_norm": 0.07835894078016281, "learning_rate": 1.83145822945695e-05, "loss": 0.2449, "num_input_tokens_seen": 83051920, "step": 38470 }, { "epoch": 6.2765089722675365, "grad_norm": 0.09954798221588135, "learning_rate": 1.8307723813313298e-05, "loss": 0.2882, "num_input_tokens_seen": 83063280, "step": 38475 }, { "epoch": 6.277324632952691, "grad_norm": 0.15163902938365936, "learning_rate": 1.8300865874566953e-05, "loss": 0.0783, "num_input_tokens_seen": 83072112, "step": 38480 }, { "epoch": 6.278140293637847, "grad_norm": 0.15219746530056, "learning_rate": 1.829400847888642e-05, "loss": 0.0051, "num_input_tokens_seen": 83083728, "step": 38485 }, { "epoch": 6.278955954323002, "grad_norm": 27.522193908691406, "learning_rate": 1.8287151626827586e-05, "loss": 0.3392, "num_input_tokens_seen": 83094256, "step": 38490 }, { "epoch": 6.279771615008157, "grad_norm": 0.11280357092618942, "learning_rate": 1.8280295318946304e-05, "loss": 0.1118, "num_input_tokens_seen": 83105552, "step": 38495 }, { "epoch": 6.280587275693311, "grad_norm": 0.0732632502913475, "learning_rate": 1.827343955579838e-05, "loss": 0.0165, "num_input_tokens_seen": 83117296, "step": 38500 }, { "epoch": 6.281402936378466, "grad_norm": 0.12993024289608002, "learning_rate": 1.8266584337939568e-05, "loss": 0.1247, "num_input_tokens_seen": 83127824, "step": 38505 }, { "epoch": 6.282218597063622, "grad_norm": 0.354735404253006, "learning_rate": 1.82597296659256e-05, "loss": 0.173, "num_input_tokens_seen": 83138608, "step": 38510 }, { "epoch": 6.283034257748777, "grad_norm": 0.7837084531784058, "learning_rate": 1.8252875540312143e-05, "loss": 0.1228, "num_input_tokens_seen": 83149936, "step": 38515 }, { "epoch": 6.283849918433932, "grad_norm": 3.745943069458008, "learning_rate": 1.824602196165483e-05, "loss": 0.1244, "num_input_tokens_seen": 83160528, "step": 38520 }, { "epoch": 6.284665579119086, "grad_norm": 0.27920007705688477, "learning_rate": 1.823916893050925e-05, "loss": 0.1329, "num_input_tokens_seen": 83170640, "step": 38525 }, { "epoch": 6.285481239804241, "grad_norm": 0.08819136768579483, "learning_rate": 1.8232316447430936e-05, "loss": 0.0157, "num_input_tokens_seen": 83182128, "step": 38530 }, { "epoch": 6.286296900489396, "grad_norm": 0.5468014478683472, "learning_rate": 1.822546451297539e-05, "loss": 0.1199, "num_input_tokens_seen": 83192368, "step": 38535 }, { "epoch": 6.287112561174552, "grad_norm": 0.15090271830558777, "learning_rate": 1.8218613127698058e-05, "loss": 0.1052, "num_input_tokens_seen": 83203024, "step": 38540 }, { "epoch": 6.287928221859707, "grad_norm": 2.385889768600464, "learning_rate": 1.8211762292154362e-05, "loss": 0.2688, "num_input_tokens_seen": 83214160, "step": 38545 }, { "epoch": 6.288743882544861, "grad_norm": 0.05856755003333092, "learning_rate": 1.820491200689966e-05, "loss": 0.1684, "num_input_tokens_seen": 83224784, "step": 38550 }, { "epoch": 6.289559543230016, "grad_norm": 0.28382059931755066, "learning_rate": 1.8198062272489263e-05, "loss": 0.0408, "num_input_tokens_seen": 83235344, "step": 38555 }, { "epoch": 6.290375203915171, "grad_norm": 0.16582660377025604, "learning_rate": 1.8191213089478455e-05, "loss": 0.2156, "num_input_tokens_seen": 83246384, "step": 38560 }, { "epoch": 6.291190864600326, "grad_norm": 0.6182851791381836, "learning_rate": 1.818436445842246e-05, "loss": 0.2381, "num_input_tokens_seen": 83257520, "step": 38565 }, { "epoch": 6.2920065252854815, "grad_norm": 0.17569006979465485, "learning_rate": 1.8177516379876463e-05, "loss": 0.1326, "num_input_tokens_seen": 83267824, "step": 38570 }, { "epoch": 6.292822185970636, "grad_norm": 0.11772775650024414, "learning_rate": 1.817066885439561e-05, "loss": 0.0093, "num_input_tokens_seen": 83277904, "step": 38575 }, { "epoch": 6.293637846655791, "grad_norm": 0.06668476015329361, "learning_rate": 1.8163821882534986e-05, "loss": 0.0043, "num_input_tokens_seen": 83287280, "step": 38580 }, { "epoch": 6.294453507340946, "grad_norm": 2.960515022277832, "learning_rate": 1.815697546484964e-05, "loss": 0.2435, "num_input_tokens_seen": 83297520, "step": 38585 }, { "epoch": 6.295269168026101, "grad_norm": 0.06321658194065094, "learning_rate": 1.8150129601894592e-05, "loss": 0.012, "num_input_tokens_seen": 83308400, "step": 38590 }, { "epoch": 6.2960848287112565, "grad_norm": 0.2736243009567261, "learning_rate": 1.8143284294224794e-05, "loss": 0.1795, "num_input_tokens_seen": 83321008, "step": 38595 }, { "epoch": 6.296900489396411, "grad_norm": 0.1401585191488266, "learning_rate": 1.813643954239516e-05, "loss": 0.1012, "num_input_tokens_seen": 83333264, "step": 38600 }, { "epoch": 6.297716150081566, "grad_norm": 3.16827392578125, "learning_rate": 1.8129595346960568e-05, "loss": 0.0562, "num_input_tokens_seen": 83343856, "step": 38605 }, { "epoch": 6.298531810766721, "grad_norm": 0.158759206533432, "learning_rate": 1.812275170847583e-05, "loss": 0.0087, "num_input_tokens_seen": 83354832, "step": 38610 }, { "epoch": 6.299347471451876, "grad_norm": 0.03607413172721863, "learning_rate": 1.8115908627495742e-05, "loss": 0.0905, "num_input_tokens_seen": 83365488, "step": 38615 }, { "epoch": 6.300163132137031, "grad_norm": 1.4858367443084717, "learning_rate": 1.8109066104575023e-05, "loss": 0.119, "num_input_tokens_seen": 83376688, "step": 38620 }, { "epoch": 6.300978792822186, "grad_norm": 12.213780403137207, "learning_rate": 1.810222414026837e-05, "loss": 0.0787, "num_input_tokens_seen": 83387056, "step": 38625 }, { "epoch": 6.301794453507341, "grad_norm": 2.129650592803955, "learning_rate": 1.809538273513043e-05, "loss": 0.0181, "num_input_tokens_seen": 83398224, "step": 38630 }, { "epoch": 6.302610114192496, "grad_norm": 0.9154722690582275, "learning_rate": 1.8088541889715795e-05, "loss": 0.0067, "num_input_tokens_seen": 83408880, "step": 38635 }, { "epoch": 6.303425774877651, "grad_norm": 0.13156336545944214, "learning_rate": 1.8081701604579025e-05, "loss": 0.2049, "num_input_tokens_seen": 83419504, "step": 38640 }, { "epoch": 6.304241435562806, "grad_norm": 0.0758514329791069, "learning_rate": 1.807486188027463e-05, "loss": 0.0055, "num_input_tokens_seen": 83429296, "step": 38645 }, { "epoch": 6.30505709624796, "grad_norm": 0.018398450687527657, "learning_rate": 1.8068022717357066e-05, "loss": 0.005, "num_input_tokens_seen": 83440912, "step": 38650 }, { "epoch": 6.305872756933116, "grad_norm": 0.160930797457695, "learning_rate": 1.8061184116380754e-05, "loss": 0.0898, "num_input_tokens_seen": 83451568, "step": 38655 }, { "epoch": 6.306688417618271, "grad_norm": 0.0657789558172226, "learning_rate": 1.8054346077900065e-05, "loss": 0.0196, "num_input_tokens_seen": 83461200, "step": 38660 }, { "epoch": 6.307504078303426, "grad_norm": 0.06973383575677872, "learning_rate": 1.8047508602469322e-05, "loss": 0.0092, "num_input_tokens_seen": 83472016, "step": 38665 }, { "epoch": 6.308319738988581, "grad_norm": 0.2847124934196472, "learning_rate": 1.804067169064281e-05, "loss": 0.1104, "num_input_tokens_seen": 83482896, "step": 38670 }, { "epoch": 6.309135399673735, "grad_norm": 3.6392717361450195, "learning_rate": 1.8033835342974763e-05, "loss": 0.3532, "num_input_tokens_seen": 83494384, "step": 38675 }, { "epoch": 6.309951060358891, "grad_norm": 0.10749179869890213, "learning_rate": 1.8026999560019366e-05, "loss": 0.0044, "num_input_tokens_seen": 83504848, "step": 38680 }, { "epoch": 6.310766721044046, "grad_norm": 0.03465574234724045, "learning_rate": 1.8020164342330763e-05, "loss": 0.0922, "num_input_tokens_seen": 83516208, "step": 38685 }, { "epoch": 6.311582381729201, "grad_norm": 9.664535522460938, "learning_rate": 1.8013329690463056e-05, "loss": 0.1194, "num_input_tokens_seen": 83527056, "step": 38690 }, { "epoch": 6.3123980424143555, "grad_norm": 0.35942038893699646, "learning_rate": 1.8006495604970295e-05, "loss": 0.0083, "num_input_tokens_seen": 83537392, "step": 38695 }, { "epoch": 6.31321370309951, "grad_norm": 0.09744260460138321, "learning_rate": 1.7999662086406484e-05, "loss": 0.0749, "num_input_tokens_seen": 83548944, "step": 38700 }, { "epoch": 6.314029363784665, "grad_norm": 0.19223381578922272, "learning_rate": 1.799282913532559e-05, "loss": 0.0115, "num_input_tokens_seen": 83558832, "step": 38705 }, { "epoch": 6.314845024469821, "grad_norm": 0.1924734115600586, "learning_rate": 1.798599675228151e-05, "loss": 0.1044, "num_input_tokens_seen": 83567696, "step": 38710 }, { "epoch": 6.315660685154976, "grad_norm": 0.03546510264277458, "learning_rate": 1.7979164937828127e-05, "loss": 0.0055, "num_input_tokens_seen": 83578480, "step": 38715 }, { "epoch": 6.3164763458401305, "grad_norm": 0.02767333947122097, "learning_rate": 1.797233369251926e-05, "loss": 0.1412, "num_input_tokens_seen": 83588496, "step": 38720 }, { "epoch": 6.317292006525285, "grad_norm": 0.07581031322479248, "learning_rate": 1.796550301690868e-05, "loss": 0.063, "num_input_tokens_seen": 83599376, "step": 38725 }, { "epoch": 6.31810766721044, "grad_norm": 0.11516465246677399, "learning_rate": 1.7958672911550117e-05, "loss": 0.0805, "num_input_tokens_seen": 83610032, "step": 38730 }, { "epoch": 6.318923327895595, "grad_norm": 0.08749879896640778, "learning_rate": 1.7951843376997256e-05, "loss": 0.0044, "num_input_tokens_seen": 83621392, "step": 38735 }, { "epoch": 6.319738988580751, "grad_norm": 4.46632194519043, "learning_rate": 1.7945014413803737e-05, "loss": 0.0197, "num_input_tokens_seen": 83632368, "step": 38740 }, { "epoch": 6.3205546492659055, "grad_norm": 0.09072194993495941, "learning_rate": 1.7938186022523144e-05, "loss": 0.003, "num_input_tokens_seen": 83642800, "step": 38745 }, { "epoch": 6.32137030995106, "grad_norm": 1.0660722255706787, "learning_rate": 1.793135820370902e-05, "loss": 0.0938, "num_input_tokens_seen": 83653552, "step": 38750 }, { "epoch": 6.322185970636215, "grad_norm": 0.13588771224021912, "learning_rate": 1.792453095791487e-05, "loss": 0.0076, "num_input_tokens_seen": 83664624, "step": 38755 }, { "epoch": 6.32300163132137, "grad_norm": 0.11264606565237045, "learning_rate": 1.791770428569414e-05, "loss": 0.0045, "num_input_tokens_seen": 83675536, "step": 38760 }, { "epoch": 6.323817292006526, "grad_norm": 0.019025340676307678, "learning_rate": 1.7910878187600232e-05, "loss": 0.0034, "num_input_tokens_seen": 83685712, "step": 38765 }, { "epoch": 6.3246329526916805, "grad_norm": 0.05009276419878006, "learning_rate": 1.790405266418651e-05, "loss": 0.0983, "num_input_tokens_seen": 83696112, "step": 38770 }, { "epoch": 6.325448613376835, "grad_norm": 0.08625942468643188, "learning_rate": 1.789722771600628e-05, "loss": 0.0044, "num_input_tokens_seen": 83706608, "step": 38775 }, { "epoch": 6.32626427406199, "grad_norm": 0.0683288499712944, "learning_rate": 1.789040334361282e-05, "loss": 0.1019, "num_input_tokens_seen": 83718288, "step": 38780 }, { "epoch": 6.327079934747145, "grad_norm": 6.77628755569458, "learning_rate": 1.788357954755933e-05, "loss": 0.0862, "num_input_tokens_seen": 83729328, "step": 38785 }, { "epoch": 6.327895595432301, "grad_norm": 3.364013671875, "learning_rate": 1.7876756328398998e-05, "loss": 0.2365, "num_input_tokens_seen": 83740016, "step": 38790 }, { "epoch": 6.328711256117455, "grad_norm": 4.250802993774414, "learning_rate": 1.7869933686684938e-05, "loss": 0.0146, "num_input_tokens_seen": 83751120, "step": 38795 }, { "epoch": 6.32952691680261, "grad_norm": 0.09552743285894394, "learning_rate": 1.786311162297022e-05, "loss": 0.1188, "num_input_tokens_seen": 83760880, "step": 38800 }, { "epoch": 6.330342577487765, "grad_norm": 0.10108266025781631, "learning_rate": 1.7856290137807893e-05, "loss": 0.1115, "num_input_tokens_seen": 83771440, "step": 38805 }, { "epoch": 6.33115823817292, "grad_norm": 0.03804726153612137, "learning_rate": 1.7849469231750936e-05, "loss": 0.0989, "num_input_tokens_seen": 83781424, "step": 38810 }, { "epoch": 6.331973898858075, "grad_norm": 0.0366843044757843, "learning_rate": 1.784264890535229e-05, "loss": 0.1035, "num_input_tokens_seen": 83792848, "step": 38815 }, { "epoch": 6.33278955954323, "grad_norm": 0.2906888723373413, "learning_rate": 1.7835829159164835e-05, "loss": 0.1995, "num_input_tokens_seen": 83803152, "step": 38820 }, { "epoch": 6.333605220228385, "grad_norm": 0.18760044872760773, "learning_rate": 1.7829009993741418e-05, "loss": 0.0818, "num_input_tokens_seen": 83814672, "step": 38825 }, { "epoch": 6.33442088091354, "grad_norm": 0.08920400589704514, "learning_rate": 1.782219140963484e-05, "loss": 0.1291, "num_input_tokens_seen": 83825904, "step": 38830 }, { "epoch": 6.335236541598695, "grad_norm": 0.0731261745095253, "learning_rate": 1.781537340739784e-05, "loss": 0.0027, "num_input_tokens_seen": 83837936, "step": 38835 }, { "epoch": 6.33605220228385, "grad_norm": 5.2528228759765625, "learning_rate": 1.780855598758313e-05, "loss": 0.1678, "num_input_tokens_seen": 83848944, "step": 38840 }, { "epoch": 6.3368678629690045, "grad_norm": 4.945896148681641, "learning_rate": 1.7801739150743363e-05, "loss": 0.113, "num_input_tokens_seen": 83861136, "step": 38845 }, { "epoch": 6.33768352365416, "grad_norm": 5.178552150726318, "learning_rate": 1.7794922897431145e-05, "loss": 0.1642, "num_input_tokens_seen": 83873136, "step": 38850 }, { "epoch": 6.338499184339315, "grad_norm": 0.13779862225055695, "learning_rate": 1.7788107228199023e-05, "loss": 0.0867, "num_input_tokens_seen": 83884496, "step": 38855 }, { "epoch": 6.33931484502447, "grad_norm": 0.08722608536481857, "learning_rate": 1.7781292143599532e-05, "loss": 0.0036, "num_input_tokens_seen": 83895664, "step": 38860 }, { "epoch": 6.340130505709625, "grad_norm": 0.6137920618057251, "learning_rate": 1.7774477644185125e-05, "loss": 0.1541, "num_input_tokens_seen": 83907664, "step": 38865 }, { "epoch": 6.3409461663947795, "grad_norm": 0.1464776247739792, "learning_rate": 1.7767663730508222e-05, "loss": 0.1053, "num_input_tokens_seen": 83918896, "step": 38870 }, { "epoch": 6.341761827079935, "grad_norm": 0.21143722534179688, "learning_rate": 1.7760850403121195e-05, "loss": 0.081, "num_input_tokens_seen": 83929008, "step": 38875 }, { "epoch": 6.34257748776509, "grad_norm": 0.07302909344434738, "learning_rate": 1.7754037662576365e-05, "loss": 0.0509, "num_input_tokens_seen": 83937776, "step": 38880 }, { "epoch": 6.343393148450245, "grad_norm": 0.4305923879146576, "learning_rate": 1.7747225509426008e-05, "loss": 0.0052, "num_input_tokens_seen": 83948816, "step": 38885 }, { "epoch": 6.3442088091354, "grad_norm": 0.07678848505020142, "learning_rate": 1.774041394422235e-05, "loss": 0.1067, "num_input_tokens_seen": 83959440, "step": 38890 }, { "epoch": 6.3450244698205545, "grad_norm": 0.060962971299886703, "learning_rate": 1.7733602967517578e-05, "loss": 0.004, "num_input_tokens_seen": 83971248, "step": 38895 }, { "epoch": 6.345840130505709, "grad_norm": 0.05522932857275009, "learning_rate": 1.772679257986381e-05, "loss": 0.0049, "num_input_tokens_seen": 83981584, "step": 38900 }, { "epoch": 6.346655791190865, "grad_norm": 0.15104833245277405, "learning_rate": 1.7719982781813135e-05, "loss": 0.1058, "num_input_tokens_seen": 83992112, "step": 38905 }, { "epoch": 6.34747145187602, "grad_norm": 0.43337759375572205, "learning_rate": 1.7713173573917603e-05, "loss": 0.0049, "num_input_tokens_seen": 84003472, "step": 38910 }, { "epoch": 6.348287112561175, "grad_norm": 0.06316080689430237, "learning_rate": 1.7706364956729195e-05, "loss": 0.0724, "num_input_tokens_seen": 84013840, "step": 38915 }, { "epoch": 6.349102773246329, "grad_norm": 0.0270449910312891, "learning_rate": 1.769955693079985e-05, "loss": 0.0408, "num_input_tokens_seen": 84024912, "step": 38920 }, { "epoch": 6.349918433931484, "grad_norm": 0.011509484611451626, "learning_rate": 1.769274949668146e-05, "loss": 0.0884, "num_input_tokens_seen": 84035632, "step": 38925 }, { "epoch": 6.350734094616639, "grad_norm": 3.131605863571167, "learning_rate": 1.7685942654925876e-05, "loss": 0.0764, "num_input_tokens_seen": 84046448, "step": 38930 }, { "epoch": 6.351549755301795, "grad_norm": 0.15195944905281067, "learning_rate": 1.767913640608489e-05, "loss": 0.1173, "num_input_tokens_seen": 84057648, "step": 38935 }, { "epoch": 6.35236541598695, "grad_norm": 4.266190528869629, "learning_rate": 1.7672330750710247e-05, "loss": 0.1128, "num_input_tokens_seen": 84068880, "step": 38940 }, { "epoch": 6.353181076672104, "grad_norm": 0.09167669713497162, "learning_rate": 1.766552568935366e-05, "loss": 0.0069, "num_input_tokens_seen": 84079696, "step": 38945 }, { "epoch": 6.353996737357259, "grad_norm": 3.26637864112854, "learning_rate": 1.7658721222566775e-05, "loss": 0.1575, "num_input_tokens_seen": 84089456, "step": 38950 }, { "epoch": 6.354812398042414, "grad_norm": 0.09386517852544785, "learning_rate": 1.76519173509012e-05, "loss": 0.0065, "num_input_tokens_seen": 84099440, "step": 38955 }, { "epoch": 6.35562805872757, "grad_norm": 0.8102290034294128, "learning_rate": 1.764511407490848e-05, "loss": 0.242, "num_input_tokens_seen": 84109232, "step": 38960 }, { "epoch": 6.356443719412725, "grad_norm": 0.09411946684122086, "learning_rate": 1.763831139514014e-05, "loss": 0.1275, "num_input_tokens_seen": 84121712, "step": 38965 }, { "epoch": 6.357259380097879, "grad_norm": 0.06968369334936142, "learning_rate": 1.7631509312147626e-05, "loss": 0.052, "num_input_tokens_seen": 84132688, "step": 38970 }, { "epoch": 6.358075040783034, "grad_norm": 9.786787986755371, "learning_rate": 1.7624707826482356e-05, "loss": 0.0797, "num_input_tokens_seen": 84142544, "step": 38975 }, { "epoch": 6.358890701468189, "grad_norm": 6.882516384124756, "learning_rate": 1.7617906938695694e-05, "loss": 0.2608, "num_input_tokens_seen": 84151312, "step": 38980 }, { "epoch": 6.359706362153344, "grad_norm": 2.8158648014068604, "learning_rate": 1.761110664933895e-05, "loss": 0.1112, "num_input_tokens_seen": 84162512, "step": 38985 }, { "epoch": 6.3605220228384995, "grad_norm": 0.48013320565223694, "learning_rate": 1.760430695896339e-05, "loss": 0.0105, "num_input_tokens_seen": 84174480, "step": 38990 }, { "epoch": 6.361337683523654, "grad_norm": 0.06856291741132736, "learning_rate": 1.7597507868120227e-05, "loss": 0.0101, "num_input_tokens_seen": 84185712, "step": 38995 }, { "epoch": 6.362153344208809, "grad_norm": 0.05834678187966347, "learning_rate": 1.7590709377360648e-05, "loss": 0.1768, "num_input_tokens_seen": 84195824, "step": 39000 }, { "epoch": 6.362969004893964, "grad_norm": 2.5577142238616943, "learning_rate": 1.7583911487235753e-05, "loss": 0.1976, "num_input_tokens_seen": 84207344, "step": 39005 }, { "epoch": 6.363784665579119, "grad_norm": 0.03688184916973114, "learning_rate": 1.7577114198296623e-05, "loss": 0.0036, "num_input_tokens_seen": 84218512, "step": 39010 }, { "epoch": 6.364600326264274, "grad_norm": 0.2032366544008255, "learning_rate": 1.757031751109428e-05, "loss": 0.006, "num_input_tokens_seen": 84229520, "step": 39015 }, { "epoch": 6.365415986949429, "grad_norm": 0.01645730994641781, "learning_rate": 1.75635214261797e-05, "loss": 0.1333, "num_input_tokens_seen": 84240752, "step": 39020 }, { "epoch": 6.366231647634584, "grad_norm": 0.12931299209594727, "learning_rate": 1.7556725944103803e-05, "loss": 0.1052, "num_input_tokens_seen": 84250224, "step": 39025 }, { "epoch": 6.367047308319739, "grad_norm": 0.09920920431613922, "learning_rate": 1.754993106541747e-05, "loss": 0.0026, "num_input_tokens_seen": 84260784, "step": 39030 }, { "epoch": 6.367862969004894, "grad_norm": 0.21248573064804077, "learning_rate": 1.7543136790671524e-05, "loss": 0.0563, "num_input_tokens_seen": 84272560, "step": 39035 }, { "epoch": 6.368678629690049, "grad_norm": 0.04769877716898918, "learning_rate": 1.753634312041675e-05, "loss": 0.006, "num_input_tokens_seen": 84282640, "step": 39040 }, { "epoch": 6.369494290375204, "grad_norm": 0.1672208607196808, "learning_rate": 1.752955005520387e-05, "loss": 0.2046, "num_input_tokens_seen": 84293936, "step": 39045 }, { "epoch": 6.370309951060359, "grad_norm": 0.2145567387342453, "learning_rate": 1.7522757595583567e-05, "loss": 0.229, "num_input_tokens_seen": 84303376, "step": 39050 }, { "epoch": 6.371125611745514, "grad_norm": 0.07079474627971649, "learning_rate": 1.751596574210647e-05, "loss": 0.11, "num_input_tokens_seen": 84313968, "step": 39055 }, { "epoch": 6.371941272430669, "grad_norm": 0.6985549926757812, "learning_rate": 1.750917449532317e-05, "loss": 0.1168, "num_input_tokens_seen": 84325520, "step": 39060 }, { "epoch": 6.372756933115824, "grad_norm": 0.1702076643705368, "learning_rate": 1.7502383855784187e-05, "loss": 0.0043, "num_input_tokens_seen": 84336496, "step": 39065 }, { "epoch": 6.373572593800978, "grad_norm": 0.08414614200592041, "learning_rate": 1.7495593824040014e-05, "loss": 0.1257, "num_input_tokens_seen": 84346928, "step": 39070 }, { "epoch": 6.374388254486134, "grad_norm": 13.162398338317871, "learning_rate": 1.7488804400641084e-05, "loss": 0.0405, "num_input_tokens_seen": 84358032, "step": 39075 }, { "epoch": 6.375203915171289, "grad_norm": 0.10966675728559494, "learning_rate": 1.7482015586137774e-05, "loss": 0.0055, "num_input_tokens_seen": 84368816, "step": 39080 }, { "epoch": 6.376019575856444, "grad_norm": 0.1741647720336914, "learning_rate": 1.7475227381080434e-05, "loss": 0.0069, "num_input_tokens_seen": 84378800, "step": 39085 }, { "epoch": 6.376835236541599, "grad_norm": 0.05604543536901474, "learning_rate": 1.746843978601934e-05, "loss": 0.0057, "num_input_tokens_seen": 84389584, "step": 39090 }, { "epoch": 6.377650897226753, "grad_norm": 0.08365931361913681, "learning_rate": 1.746165280150473e-05, "loss": 0.2504, "num_input_tokens_seen": 84398768, "step": 39095 }, { "epoch": 6.378466557911908, "grad_norm": 0.09252443164587021, "learning_rate": 1.7454866428086797e-05, "loss": 0.005, "num_input_tokens_seen": 84408464, "step": 39100 }, { "epoch": 6.379282218597064, "grad_norm": 0.052541542798280716, "learning_rate": 1.7448080666315675e-05, "loss": 0.0031, "num_input_tokens_seen": 84419600, "step": 39105 }, { "epoch": 6.380097879282219, "grad_norm": 3.5105550289154053, "learning_rate": 1.744129551674145e-05, "loss": 0.0097, "num_input_tokens_seen": 84428784, "step": 39110 }, { "epoch": 6.3809135399673735, "grad_norm": 0.08369387686252594, "learning_rate": 1.7434510979914166e-05, "loss": 0.008, "num_input_tokens_seen": 84439696, "step": 39115 }, { "epoch": 6.381729200652528, "grad_norm": 2.1654927730560303, "learning_rate": 1.7427727056383795e-05, "loss": 0.1293, "num_input_tokens_seen": 84451280, "step": 39120 }, { "epoch": 6.382544861337683, "grad_norm": 3.6602649688720703, "learning_rate": 1.74209437467003e-05, "loss": 0.1376, "num_input_tokens_seen": 84462512, "step": 39125 }, { "epoch": 6.383360522022839, "grad_norm": 0.12523126602172852, "learning_rate": 1.7414161051413565e-05, "loss": 0.0077, "num_input_tokens_seen": 84473616, "step": 39130 }, { "epoch": 6.384176182707994, "grad_norm": 0.3088226020336151, "learning_rate": 1.740737897107342e-05, "loss": 0.0184, "num_input_tokens_seen": 84485776, "step": 39135 }, { "epoch": 6.3849918433931485, "grad_norm": 0.13221058249473572, "learning_rate": 1.7400597506229667e-05, "loss": 0.0338, "num_input_tokens_seen": 84498064, "step": 39140 }, { "epoch": 6.385807504078303, "grad_norm": 0.04836839437484741, "learning_rate": 1.739381665743203e-05, "loss": 0.2277, "num_input_tokens_seen": 84509296, "step": 39145 }, { "epoch": 6.386623164763458, "grad_norm": 7.938859939575195, "learning_rate": 1.7387036425230214e-05, "loss": 0.0136, "num_input_tokens_seen": 84519888, "step": 39150 }, { "epoch": 6.387438825448613, "grad_norm": 0.42295539379119873, "learning_rate": 1.7380256810173854e-05, "loss": 0.104, "num_input_tokens_seen": 84530352, "step": 39155 }, { "epoch": 6.388254486133769, "grad_norm": 0.05365495756268501, "learning_rate": 1.7373477812812538e-05, "loss": 0.1059, "num_input_tokens_seen": 84540912, "step": 39160 }, { "epoch": 6.3890701468189235, "grad_norm": 0.09460272639989853, "learning_rate": 1.73666994336958e-05, "loss": 0.0038, "num_input_tokens_seen": 84551760, "step": 39165 }, { "epoch": 6.389885807504078, "grad_norm": 0.10183282941579819, "learning_rate": 1.735992167337314e-05, "loss": 0.0425, "num_input_tokens_seen": 84562960, "step": 39170 }, { "epoch": 6.390701468189233, "grad_norm": 0.26614123582839966, "learning_rate": 1.7353144532394e-05, "loss": 0.1172, "num_input_tokens_seen": 84573264, "step": 39175 }, { "epoch": 6.391517128874388, "grad_norm": 0.1066962406039238, "learning_rate": 1.734636801130776e-05, "loss": 0.0045, "num_input_tokens_seen": 84583632, "step": 39180 }, { "epoch": 6.392332789559543, "grad_norm": 4.822566509246826, "learning_rate": 1.7339592110663768e-05, "loss": 0.4313, "num_input_tokens_seen": 84594960, "step": 39185 }, { "epoch": 6.3931484502446985, "grad_norm": 0.14477436244487762, "learning_rate": 1.7332816831011307e-05, "loss": 0.081, "num_input_tokens_seen": 84604912, "step": 39190 }, { "epoch": 6.393964110929853, "grad_norm": 0.11814351379871368, "learning_rate": 1.7326042172899616e-05, "loss": 0.1229, "num_input_tokens_seen": 84615344, "step": 39195 }, { "epoch": 6.394779771615008, "grad_norm": 1.5876661539077759, "learning_rate": 1.731926813687788e-05, "loss": 0.0108, "num_input_tokens_seen": 84625744, "step": 39200 }, { "epoch": 6.395595432300163, "grad_norm": 2.3105623722076416, "learning_rate": 1.7312494723495243e-05, "loss": 0.0859, "num_input_tokens_seen": 84636784, "step": 39205 }, { "epoch": 6.396411092985318, "grad_norm": 3.1567940711975098, "learning_rate": 1.730572193330079e-05, "loss": 0.0882, "num_input_tokens_seen": 84648656, "step": 39210 }, { "epoch": 6.397226753670473, "grad_norm": 0.052460797131061554, "learning_rate": 1.7298949766843558e-05, "loss": 0.0041, "num_input_tokens_seen": 84659728, "step": 39215 }, { "epoch": 6.398042414355628, "grad_norm": 0.10526681691408157, "learning_rate": 1.7292178224672528e-05, "loss": 0.0048, "num_input_tokens_seen": 84670672, "step": 39220 }, { "epoch": 6.398858075040783, "grad_norm": 7.077473163604736, "learning_rate": 1.7285407307336636e-05, "loss": 0.1098, "num_input_tokens_seen": 84679952, "step": 39225 }, { "epoch": 6.399673735725938, "grad_norm": 0.07950261980295181, "learning_rate": 1.727863701538478e-05, "loss": 0.0831, "num_input_tokens_seen": 84691024, "step": 39230 }, { "epoch": 6.400489396411093, "grad_norm": 0.08905600011348724, "learning_rate": 1.7271867349365782e-05, "loss": 0.1561, "num_input_tokens_seen": 84700976, "step": 39235 }, { "epoch": 6.401305057096248, "grad_norm": 0.06938436627388, "learning_rate": 1.7265098309828433e-05, "loss": 0.0041, "num_input_tokens_seen": 84710896, "step": 39240 }, { "epoch": 6.402120717781403, "grad_norm": 0.11138947308063507, "learning_rate": 1.725832989732146e-05, "loss": 0.1157, "num_input_tokens_seen": 84722448, "step": 39245 }, { "epoch": 6.402936378466558, "grad_norm": 5.175948619842529, "learning_rate": 1.725156211239354e-05, "loss": 0.2433, "num_input_tokens_seen": 84733168, "step": 39250 }, { "epoch": 6.403752039151713, "grad_norm": 3.1428029537200928, "learning_rate": 1.7244794955593316e-05, "loss": 0.0252, "num_input_tokens_seen": 84744176, "step": 39255 }, { "epoch": 6.404567699836868, "grad_norm": 0.12407192587852478, "learning_rate": 1.7238028427469363e-05, "loss": 0.0062, "num_input_tokens_seen": 84754704, "step": 39260 }, { "epoch": 6.4053833605220225, "grad_norm": 2.8569209575653076, "learning_rate": 1.7231262528570207e-05, "loss": 0.1144, "num_input_tokens_seen": 84765584, "step": 39265 }, { "epoch": 6.406199021207178, "grad_norm": 4.624263763427734, "learning_rate": 1.7224497259444334e-05, "loss": 0.0716, "num_input_tokens_seen": 84776144, "step": 39270 }, { "epoch": 6.407014681892333, "grad_norm": 0.07321374118328094, "learning_rate": 1.7217732620640163e-05, "loss": 0.1548, "num_input_tokens_seen": 84785488, "step": 39275 }, { "epoch": 6.407830342577488, "grad_norm": 5.30738639831543, "learning_rate": 1.721096861270607e-05, "loss": 0.1574, "num_input_tokens_seen": 84796080, "step": 39280 }, { "epoch": 6.408646003262643, "grad_norm": 0.20457549393177032, "learning_rate": 1.7204205236190385e-05, "loss": 0.1102, "num_input_tokens_seen": 84805520, "step": 39285 }, { "epoch": 6.4094616639477975, "grad_norm": 0.08466614037752151, "learning_rate": 1.719744249164138e-05, "loss": 0.0071, "num_input_tokens_seen": 84816464, "step": 39290 }, { "epoch": 6.410277324632952, "grad_norm": 0.17640598118305206, "learning_rate": 1.7190680379607278e-05, "loss": 0.12, "num_input_tokens_seen": 84826640, "step": 39295 }, { "epoch": 6.411092985318108, "grad_norm": 1.222657561302185, "learning_rate": 1.718391890063624e-05, "loss": 0.225, "num_input_tokens_seen": 84836304, "step": 39300 }, { "epoch": 6.411908646003263, "grad_norm": 0.1656293421983719, "learning_rate": 1.7177158055276405e-05, "loss": 0.0208, "num_input_tokens_seen": 84847280, "step": 39305 }, { "epoch": 6.412724306688418, "grad_norm": 0.08772674947977066, "learning_rate": 1.717039784407582e-05, "loss": 0.2311, "num_input_tokens_seen": 84859696, "step": 39310 }, { "epoch": 6.4135399673735725, "grad_norm": 0.2088811844587326, "learning_rate": 1.7163638267582516e-05, "loss": 0.0081, "num_input_tokens_seen": 84870576, "step": 39315 }, { "epoch": 6.414355628058727, "grad_norm": 0.04336438328027725, "learning_rate": 1.715687932634446e-05, "loss": 0.0867, "num_input_tokens_seen": 84882224, "step": 39320 }, { "epoch": 6.415171288743883, "grad_norm": 5.473361492156982, "learning_rate": 1.715012102090956e-05, "loss": 0.1718, "num_input_tokens_seen": 84892496, "step": 39325 }, { "epoch": 6.415986949429038, "grad_norm": 0.15484346449375153, "learning_rate": 1.714336335182567e-05, "loss": 0.1921, "num_input_tokens_seen": 84903440, "step": 39330 }, { "epoch": 6.416802610114193, "grad_norm": 0.5261359810829163, "learning_rate": 1.7136606319640616e-05, "loss": 0.0083, "num_input_tokens_seen": 84914608, "step": 39335 }, { "epoch": 6.417618270799347, "grad_norm": 0.11928585916757584, "learning_rate": 1.7129849924902157e-05, "loss": 0.0054, "num_input_tokens_seen": 84926768, "step": 39340 }, { "epoch": 6.418433931484502, "grad_norm": 3.9390835762023926, "learning_rate": 1.7123094168157994e-05, "loss": 0.1103, "num_input_tokens_seen": 84937488, "step": 39345 }, { "epoch": 6.419249592169657, "grad_norm": 0.059945400804281235, "learning_rate": 1.7116339049955788e-05, "loss": 0.1013, "num_input_tokens_seen": 84949840, "step": 39350 }, { "epoch": 6.420065252854813, "grad_norm": 0.22643250226974487, "learning_rate": 1.7109584570843136e-05, "loss": 0.182, "num_input_tokens_seen": 84959088, "step": 39355 }, { "epoch": 6.420880913539968, "grad_norm": 0.08659963309764862, "learning_rate": 1.7102830731367593e-05, "loss": 0.0172, "num_input_tokens_seen": 84969232, "step": 39360 }, { "epoch": 6.421696574225122, "grad_norm": 0.10305386781692505, "learning_rate": 1.7096077532076666e-05, "loss": 0.0115, "num_input_tokens_seen": 84981488, "step": 39365 }, { "epoch": 6.422512234910277, "grad_norm": 2.7358717918395996, "learning_rate": 1.7089324973517794e-05, "loss": 0.1204, "num_input_tokens_seen": 84992688, "step": 39370 }, { "epoch": 6.423327895595432, "grad_norm": 0.08726153522729874, "learning_rate": 1.708257305623838e-05, "loss": 0.0053, "num_input_tokens_seen": 85004144, "step": 39375 }, { "epoch": 6.424143556280587, "grad_norm": 11.125823974609375, "learning_rate": 1.7075821780785766e-05, "loss": 0.1448, "num_input_tokens_seen": 85015408, "step": 39380 }, { "epoch": 6.424959216965743, "grad_norm": 0.11659998446702957, "learning_rate": 1.7069071147707248e-05, "loss": 0.0063, "num_input_tokens_seen": 85026064, "step": 39385 }, { "epoch": 6.425774877650897, "grad_norm": 0.13623273372650146, "learning_rate": 1.706232115755006e-05, "loss": 0.0875, "num_input_tokens_seen": 85037232, "step": 39390 }, { "epoch": 6.426590538336052, "grad_norm": 3.484802007675171, "learning_rate": 1.705557181086139e-05, "loss": 0.2018, "num_input_tokens_seen": 85047888, "step": 39395 }, { "epoch": 6.427406199021207, "grad_norm": 0.16145901381969452, "learning_rate": 1.704882310818839e-05, "loss": 0.1123, "num_input_tokens_seen": 85058768, "step": 39400 }, { "epoch": 6.428221859706362, "grad_norm": 2.734320878982544, "learning_rate": 1.704207505007813e-05, "loss": 0.0654, "num_input_tokens_seen": 85069296, "step": 39405 }, { "epoch": 6.4290375203915175, "grad_norm": 0.09999395906925201, "learning_rate": 1.703532763707764e-05, "loss": 0.1187, "num_input_tokens_seen": 85080528, "step": 39410 }, { "epoch": 6.429853181076672, "grad_norm": 0.06624927371740341, "learning_rate": 1.7028580869733905e-05, "loss": 0.075, "num_input_tokens_seen": 85091024, "step": 39415 }, { "epoch": 6.430668841761827, "grad_norm": 0.10834236443042755, "learning_rate": 1.702183474859385e-05, "loss": 0.0677, "num_input_tokens_seen": 85100656, "step": 39420 }, { "epoch": 6.431484502446982, "grad_norm": 0.18645232915878296, "learning_rate": 1.7015089274204354e-05, "loss": 0.0069, "num_input_tokens_seen": 85111856, "step": 39425 }, { "epoch": 6.432300163132137, "grad_norm": 2.205200433731079, "learning_rate": 1.7008344447112238e-05, "loss": 0.1318, "num_input_tokens_seen": 85120944, "step": 39430 }, { "epoch": 6.433115823817292, "grad_norm": 13.182847023010254, "learning_rate": 1.7001600267864266e-05, "loss": 0.0437, "num_input_tokens_seen": 85131440, "step": 39435 }, { "epoch": 6.433931484502447, "grad_norm": 0.16795465350151062, "learning_rate": 1.6994856737007154e-05, "loss": 0.0846, "num_input_tokens_seen": 85140720, "step": 39440 }, { "epoch": 6.434747145187602, "grad_norm": 1.0521131753921509, "learning_rate": 1.698811385508758e-05, "loss": 0.1491, "num_input_tokens_seen": 85151152, "step": 39445 }, { "epoch": 6.435562805872757, "grad_norm": 4.9170050621032715, "learning_rate": 1.698137162265215e-05, "loss": 0.084, "num_input_tokens_seen": 85161392, "step": 39450 }, { "epoch": 6.436378466557912, "grad_norm": 0.37777280807495117, "learning_rate": 1.6974630040247425e-05, "loss": 0.017, "num_input_tokens_seen": 85172656, "step": 39455 }, { "epoch": 6.437194127243067, "grad_norm": 0.09394480288028717, "learning_rate": 1.6967889108419903e-05, "loss": 0.0075, "num_input_tokens_seen": 85184784, "step": 39460 }, { "epoch": 6.438009787928221, "grad_norm": 0.31116095185279846, "learning_rate": 1.696114882771605e-05, "loss": 0.0886, "num_input_tokens_seen": 85195248, "step": 39465 }, { "epoch": 6.438825448613377, "grad_norm": 0.1480165719985962, "learning_rate": 1.695440919868226e-05, "loss": 0.0989, "num_input_tokens_seen": 85205040, "step": 39470 }, { "epoch": 6.439641109298532, "grad_norm": 0.08026082068681717, "learning_rate": 1.694767022186488e-05, "loss": 0.006, "num_input_tokens_seen": 85215344, "step": 39475 }, { "epoch": 6.440456769983687, "grad_norm": 0.1445658802986145, "learning_rate": 1.6940931897810208e-05, "loss": 0.0797, "num_input_tokens_seen": 85226352, "step": 39480 }, { "epoch": 6.441272430668842, "grad_norm": 0.275978684425354, "learning_rate": 1.693419422706449e-05, "loss": 0.0173, "num_input_tokens_seen": 85235408, "step": 39485 }, { "epoch": 6.442088091353996, "grad_norm": 0.07860295474529266, "learning_rate": 1.6927457210173915e-05, "loss": 0.2019, "num_input_tokens_seen": 85246352, "step": 39490 }, { "epoch": 6.442903752039152, "grad_norm": 0.24194453656673431, "learning_rate": 1.6920720847684617e-05, "loss": 0.1945, "num_input_tokens_seen": 85256368, "step": 39495 }, { "epoch": 6.443719412724307, "grad_norm": 0.029847651720046997, "learning_rate": 1.6913985140142682e-05, "loss": 0.0047, "num_input_tokens_seen": 85267856, "step": 39500 }, { "epoch": 6.444535073409462, "grad_norm": 0.25842732191085815, "learning_rate": 1.690725008809414e-05, "loss": 0.1997, "num_input_tokens_seen": 85280112, "step": 39505 }, { "epoch": 6.445350734094617, "grad_norm": 0.16432420909404755, "learning_rate": 1.6900515692084966e-05, "loss": 0.0064, "num_input_tokens_seen": 85288688, "step": 39510 }, { "epoch": 6.446166394779771, "grad_norm": 0.12695054709911346, "learning_rate": 1.689378195266109e-05, "loss": 0.1807, "num_input_tokens_seen": 85299888, "step": 39515 }, { "epoch": 6.446982055464926, "grad_norm": 0.10619836300611496, "learning_rate": 1.6887048870368377e-05, "loss": 0.0095, "num_input_tokens_seen": 85310128, "step": 39520 }, { "epoch": 6.447797716150082, "grad_norm": 0.16509254276752472, "learning_rate": 1.688031644575265e-05, "loss": 0.0054, "num_input_tokens_seen": 85321072, "step": 39525 }, { "epoch": 6.448613376835237, "grad_norm": 2.5598042011260986, "learning_rate": 1.6873584679359665e-05, "loss": 0.0671, "num_input_tokens_seen": 85333008, "step": 39530 }, { "epoch": 6.4494290375203915, "grad_norm": 0.24327965080738068, "learning_rate": 1.686685357173514e-05, "loss": 0.0348, "num_input_tokens_seen": 85344048, "step": 39535 }, { "epoch": 6.450244698205546, "grad_norm": 0.04776002839207649, "learning_rate": 1.6860123123424733e-05, "loss": 0.1062, "num_input_tokens_seen": 85353136, "step": 39540 }, { "epoch": 6.451060358890701, "grad_norm": 2.810372829437256, "learning_rate": 1.6853393334974044e-05, "loss": 0.088, "num_input_tokens_seen": 85364016, "step": 39545 }, { "epoch": 6.451876019575856, "grad_norm": 0.2053820639848709, "learning_rate": 1.684666420692863e-05, "loss": 0.0082, "num_input_tokens_seen": 85374928, "step": 39550 }, { "epoch": 6.452691680261012, "grad_norm": 0.31872835755348206, "learning_rate": 1.6839935739833986e-05, "loss": 0.0413, "num_input_tokens_seen": 85383888, "step": 39555 }, { "epoch": 6.4535073409461665, "grad_norm": 12.280909538269043, "learning_rate": 1.683320793423555e-05, "loss": 0.2488, "num_input_tokens_seen": 85394736, "step": 39560 }, { "epoch": 6.454323001631321, "grad_norm": 0.08581593632698059, "learning_rate": 1.6826480790678718e-05, "loss": 0.0832, "num_input_tokens_seen": 85405232, "step": 39565 }, { "epoch": 6.455138662316476, "grad_norm": 0.34644970297813416, "learning_rate": 1.681975430970883e-05, "loss": 0.0947, "num_input_tokens_seen": 85416176, "step": 39570 }, { "epoch": 6.455954323001631, "grad_norm": 1.6972507238388062, "learning_rate": 1.681302849187116e-05, "loss": 0.0154, "num_input_tokens_seen": 85426800, "step": 39575 }, { "epoch": 6.456769983686787, "grad_norm": 1.9096978902816772, "learning_rate": 1.6806303337710942e-05, "loss": 0.0099, "num_input_tokens_seen": 85437520, "step": 39580 }, { "epoch": 6.4575856443719415, "grad_norm": 0.1439582258462906, "learning_rate": 1.679957884777335e-05, "loss": 0.0729, "num_input_tokens_seen": 85447888, "step": 39585 }, { "epoch": 6.458401305057096, "grad_norm": 0.03080224059522152, "learning_rate": 1.6792855022603508e-05, "loss": 0.0125, "num_input_tokens_seen": 85459344, "step": 39590 }, { "epoch": 6.459216965742251, "grad_norm": 0.29059574007987976, "learning_rate": 1.678613186274648e-05, "loss": 0.1, "num_input_tokens_seen": 85469744, "step": 39595 }, { "epoch": 6.460032626427406, "grad_norm": 0.1453476995229721, "learning_rate": 1.6779409368747274e-05, "loss": 0.0051, "num_input_tokens_seen": 85481168, "step": 39600 }, { "epoch": 6.460848287112561, "grad_norm": 0.05045260116457939, "learning_rate": 1.677268754115086e-05, "loss": 0.0118, "num_input_tokens_seen": 85492976, "step": 39605 }, { "epoch": 6.4616639477977165, "grad_norm": 0.7699715495109558, "learning_rate": 1.676596638050214e-05, "loss": 0.0819, "num_input_tokens_seen": 85502896, "step": 39610 }, { "epoch": 6.462479608482871, "grad_norm": 3.924278736114502, "learning_rate": 1.6759245887345966e-05, "loss": 0.3947, "num_input_tokens_seen": 85513680, "step": 39615 }, { "epoch": 6.463295269168026, "grad_norm": 0.11376676708459854, "learning_rate": 1.6752526062227127e-05, "loss": 0.1543, "num_input_tokens_seen": 85524784, "step": 39620 }, { "epoch": 6.464110929853181, "grad_norm": 0.5455036759376526, "learning_rate": 1.674580690569037e-05, "loss": 0.1193, "num_input_tokens_seen": 85536208, "step": 39625 }, { "epoch": 6.464926590538336, "grad_norm": 0.059227459132671356, "learning_rate": 1.6739088418280395e-05, "loss": 0.0086, "num_input_tokens_seen": 85545936, "step": 39630 }, { "epoch": 6.465742251223491, "grad_norm": 0.07577138394117355, "learning_rate": 1.6732370600541823e-05, "loss": 0.0985, "num_input_tokens_seen": 85556080, "step": 39635 }, { "epoch": 6.466557911908646, "grad_norm": 5.8748779296875, "learning_rate": 1.6725653453019244e-05, "loss": 0.2306, "num_input_tokens_seen": 85565840, "step": 39640 }, { "epoch": 6.467373572593801, "grad_norm": 0.06989464908838272, "learning_rate": 1.6718936976257177e-05, "loss": 0.1748, "num_input_tokens_seen": 85576912, "step": 39645 }, { "epoch": 6.468189233278956, "grad_norm": 0.04279068857431412, "learning_rate": 1.6712221170800087e-05, "loss": 0.0391, "num_input_tokens_seen": 85587760, "step": 39650 }, { "epoch": 6.469004893964111, "grad_norm": 0.025165436789393425, "learning_rate": 1.670550603719241e-05, "loss": 0.018, "num_input_tokens_seen": 85597552, "step": 39655 }, { "epoch": 6.4698205546492655, "grad_norm": 3.684547185897827, "learning_rate": 1.66987915759785e-05, "loss": 0.0706, "num_input_tokens_seen": 85608368, "step": 39660 }, { "epoch": 6.470636215334421, "grad_norm": 0.04838266968727112, "learning_rate": 1.6692077787702666e-05, "loss": 0.0086, "num_input_tokens_seen": 85618096, "step": 39665 }, { "epoch": 6.471451876019576, "grad_norm": 0.11319730430841446, "learning_rate": 1.6685364672909163e-05, "loss": 0.0038, "num_input_tokens_seen": 85628656, "step": 39670 }, { "epoch": 6.472267536704731, "grad_norm": 0.1918221116065979, "learning_rate": 1.6678652232142185e-05, "loss": 0.0051, "num_input_tokens_seen": 85639632, "step": 39675 }, { "epoch": 6.473083197389886, "grad_norm": 0.0761067196726799, "learning_rate": 1.667194046594588e-05, "loss": 0.1456, "num_input_tokens_seen": 85650832, "step": 39680 }, { "epoch": 6.4738988580750405, "grad_norm": 0.16649939119815826, "learning_rate": 1.666522937486433e-05, "loss": 0.1, "num_input_tokens_seen": 85661296, "step": 39685 }, { "epoch": 6.474714518760196, "grad_norm": 0.25331294536590576, "learning_rate": 1.6658518959441584e-05, "loss": 0.0984, "num_input_tokens_seen": 85672368, "step": 39690 }, { "epoch": 6.475530179445351, "grad_norm": 0.01694580353796482, "learning_rate": 1.6651809220221614e-05, "loss": 0.0746, "num_input_tokens_seen": 85682160, "step": 39695 }, { "epoch": 6.476345840130506, "grad_norm": 13.475996017456055, "learning_rate": 1.664510015774835e-05, "loss": 0.0439, "num_input_tokens_seen": 85692656, "step": 39700 }, { "epoch": 6.477161500815661, "grad_norm": 29.608610153198242, "learning_rate": 1.6638391772565658e-05, "loss": 0.018, "num_input_tokens_seen": 85703600, "step": 39705 }, { "epoch": 6.4779771615008155, "grad_norm": 7.677433013916016, "learning_rate": 1.6631684065217344e-05, "loss": 0.1436, "num_input_tokens_seen": 85714512, "step": 39710 }, { "epoch": 6.47879282218597, "grad_norm": 0.29250794649124146, "learning_rate": 1.662497703624719e-05, "loss": 0.155, "num_input_tokens_seen": 85723888, "step": 39715 }, { "epoch": 6.479608482871126, "grad_norm": 0.07598171383142471, "learning_rate": 1.6618270686198895e-05, "loss": 0.0032, "num_input_tokens_seen": 85734992, "step": 39720 }, { "epoch": 6.480424143556281, "grad_norm": 0.2042560577392578, "learning_rate": 1.6611565015616106e-05, "loss": 0.0063, "num_input_tokens_seen": 85746768, "step": 39725 }, { "epoch": 6.481239804241436, "grad_norm": 0.14365331828594208, "learning_rate": 1.6604860025042412e-05, "loss": 0.0052, "num_input_tokens_seen": 85758352, "step": 39730 }, { "epoch": 6.4820554649265905, "grad_norm": 0.04808463901281357, "learning_rate": 1.6598155715021368e-05, "loss": 0.0027, "num_input_tokens_seen": 85769424, "step": 39735 }, { "epoch": 6.482871125611745, "grad_norm": 0.14188307523727417, "learning_rate": 1.6591452086096448e-05, "loss": 0.077, "num_input_tokens_seen": 85780816, "step": 39740 }, { "epoch": 6.4836867862969, "grad_norm": 0.06382682919502258, "learning_rate": 1.658474913881109e-05, "loss": 0.1917, "num_input_tokens_seen": 85792528, "step": 39745 }, { "epoch": 6.484502446982056, "grad_norm": 0.11008022725582123, "learning_rate": 1.6578046873708663e-05, "loss": 0.0052, "num_input_tokens_seen": 85802448, "step": 39750 }, { "epoch": 6.485318107667211, "grad_norm": 0.25377973914146423, "learning_rate": 1.657134529133248e-05, "loss": 0.0046, "num_input_tokens_seen": 85813104, "step": 39755 }, { "epoch": 6.486133768352365, "grad_norm": 0.17505641281604767, "learning_rate": 1.6564644392225824e-05, "loss": 0.1367, "num_input_tokens_seen": 85823248, "step": 39760 }, { "epoch": 6.48694942903752, "grad_norm": 0.04072978347539902, "learning_rate": 1.6557944176931894e-05, "loss": 0.2781, "num_input_tokens_seen": 85834384, "step": 39765 }, { "epoch": 6.487765089722675, "grad_norm": 0.038568198680877686, "learning_rate": 1.6551244645993847e-05, "loss": 0.0132, "num_input_tokens_seen": 85844848, "step": 39770 }, { "epoch": 6.488580750407831, "grad_norm": 2.7014260292053223, "learning_rate": 1.654454579995477e-05, "loss": 0.2609, "num_input_tokens_seen": 85856528, "step": 39775 }, { "epoch": 6.489396411092986, "grad_norm": 0.07561331242322922, "learning_rate": 1.653784763935772e-05, "loss": 0.0027, "num_input_tokens_seen": 85868112, "step": 39780 }, { "epoch": 6.49021207177814, "grad_norm": 0.1559215486049652, "learning_rate": 1.6531150164745674e-05, "loss": 0.1453, "num_input_tokens_seen": 85878256, "step": 39785 }, { "epoch": 6.491027732463295, "grad_norm": 0.0716010108590126, "learning_rate": 1.6524453376661568e-05, "loss": 0.0033, "num_input_tokens_seen": 85887440, "step": 39790 }, { "epoch": 6.49184339314845, "grad_norm": 28.87321662902832, "learning_rate": 1.6517757275648267e-05, "loss": 0.0391, "num_input_tokens_seen": 85897872, "step": 39795 }, { "epoch": 6.492659053833605, "grad_norm": 0.023398280143737793, "learning_rate": 1.6511061862248605e-05, "loss": 0.1002, "num_input_tokens_seen": 85909776, "step": 39800 }, { "epoch": 6.493474714518761, "grad_norm": 3.4813265800476074, "learning_rate": 1.6504367137005344e-05, "loss": 0.2511, "num_input_tokens_seen": 85919984, "step": 39805 }, { "epoch": 6.494290375203915, "grad_norm": 0.14940987527370453, "learning_rate": 1.649767310046119e-05, "loss": 0.0044, "num_input_tokens_seen": 85931632, "step": 39810 }, { "epoch": 6.49510603588907, "grad_norm": 0.2095457911491394, "learning_rate": 1.649097975315879e-05, "loss": 0.0763, "num_input_tokens_seen": 85944176, "step": 39815 }, { "epoch": 6.495921696574225, "grad_norm": 0.09960245341062546, "learning_rate": 1.648428709564075e-05, "loss": 0.0039, "num_input_tokens_seen": 85956016, "step": 39820 }, { "epoch": 6.49673735725938, "grad_norm": 0.10123313218355179, "learning_rate": 1.6477595128449605e-05, "loss": 0.0062, "num_input_tokens_seen": 85966704, "step": 39825 }, { "epoch": 6.497553017944535, "grad_norm": 0.029882868751883507, "learning_rate": 1.647090385212784e-05, "loss": 0.2219, "num_input_tokens_seen": 85976048, "step": 39830 }, { "epoch": 6.49836867862969, "grad_norm": 0.04088251665234566, "learning_rate": 1.6464213267217888e-05, "loss": 0.1234, "num_input_tokens_seen": 85985424, "step": 39835 }, { "epoch": 6.499184339314845, "grad_norm": 0.08394760638475418, "learning_rate": 1.6457523374262117e-05, "loss": 0.1349, "num_input_tokens_seen": 85996976, "step": 39840 }, { "epoch": 6.5, "grad_norm": 0.040804579854011536, "learning_rate": 1.645083417380284e-05, "loss": 0.0772, "num_input_tokens_seen": 86008784, "step": 39845 }, { "epoch": 6.5, "eval_loss": 0.19124996662139893, "eval_runtime": 132.9357, "eval_samples_per_second": 20.499, "eval_steps_per_second": 5.13, "num_input_tokens_seen": 86008784, "step": 39845 }, { "epoch": 6.500815660685155, "grad_norm": 6.5842061042785645, "learning_rate": 1.644414566638233e-05, "loss": 0.1526, "num_input_tokens_seen": 86019568, "step": 39850 }, { "epoch": 6.50163132137031, "grad_norm": 0.17074818909168243, "learning_rate": 1.643745785254278e-05, "loss": 0.019, "num_input_tokens_seen": 86030448, "step": 39855 }, { "epoch": 6.502446982055465, "grad_norm": 20.010343551635742, "learning_rate": 1.6430770732826346e-05, "loss": 0.1846, "num_input_tokens_seen": 86041040, "step": 39860 }, { "epoch": 6.50326264274062, "grad_norm": 0.07051025331020355, "learning_rate": 1.6424084307775107e-05, "loss": 0.0026, "num_input_tokens_seen": 86052624, "step": 39865 }, { "epoch": 6.504078303425775, "grad_norm": 0.05950159206986427, "learning_rate": 1.6417398577931116e-05, "loss": 0.1023, "num_input_tokens_seen": 86063472, "step": 39870 }, { "epoch": 6.50489396411093, "grad_norm": 0.0516861230134964, "learning_rate": 1.6410713543836342e-05, "loss": 0.0062, "num_input_tokens_seen": 86074320, "step": 39875 }, { "epoch": 6.505709624796085, "grad_norm": 0.06555325537919998, "learning_rate": 1.6404029206032708e-05, "loss": 0.0344, "num_input_tokens_seen": 86086320, "step": 39880 }, { "epoch": 6.506525285481239, "grad_norm": 0.04884251207113266, "learning_rate": 1.6397345565062082e-05, "loss": 0.1264, "num_input_tokens_seen": 86098000, "step": 39885 }, { "epoch": 6.507340946166395, "grad_norm": 0.09672345966100693, "learning_rate": 1.639066262146628e-05, "loss": 0.0865, "num_input_tokens_seen": 86107088, "step": 39890 }, { "epoch": 6.50815660685155, "grad_norm": 0.01781996712088585, "learning_rate": 1.6383980375787044e-05, "loss": 0.1243, "num_input_tokens_seen": 86118096, "step": 39895 }, { "epoch": 6.508972267536705, "grad_norm": 20.226181030273438, "learning_rate": 1.637729882856608e-05, "loss": 0.1434, "num_input_tokens_seen": 86129136, "step": 39900 }, { "epoch": 6.50978792822186, "grad_norm": 3.394411325454712, "learning_rate": 1.6370617980345022e-05, "loss": 0.1982, "num_input_tokens_seen": 86140816, "step": 39905 }, { "epoch": 6.510603588907014, "grad_norm": 0.07280711084604263, "learning_rate": 1.6363937831665458e-05, "loss": 0.0051, "num_input_tokens_seen": 86151664, "step": 39910 }, { "epoch": 6.511419249592169, "grad_norm": 0.14743319153785706, "learning_rate": 1.635725838306891e-05, "loss": 0.0221, "num_input_tokens_seen": 86161744, "step": 39915 }, { "epoch": 6.512234910277325, "grad_norm": 3.7788338661193848, "learning_rate": 1.6350579635096852e-05, "loss": 0.0837, "num_input_tokens_seen": 86171312, "step": 39920 }, { "epoch": 6.51305057096248, "grad_norm": 0.04754680395126343, "learning_rate": 1.6343901588290695e-05, "loss": 0.0461, "num_input_tokens_seen": 86181392, "step": 39925 }, { "epoch": 6.513866231647635, "grad_norm": 0.19998221099376678, "learning_rate": 1.633722424319179e-05, "loss": 0.0321, "num_input_tokens_seen": 86191728, "step": 39930 }, { "epoch": 6.514681892332789, "grad_norm": 0.0944763571023941, "learning_rate": 1.633054760034145e-05, "loss": 0.0039, "num_input_tokens_seen": 86201712, "step": 39935 }, { "epoch": 6.515497553017944, "grad_norm": 0.1905948519706726, "learning_rate": 1.6323871660280904e-05, "loss": 0.3161, "num_input_tokens_seen": 86212464, "step": 39940 }, { "epoch": 6.5163132137031, "grad_norm": 18.064556121826172, "learning_rate": 1.6317196423551347e-05, "loss": 0.1353, "num_input_tokens_seen": 86224208, "step": 39945 }, { "epoch": 6.517128874388255, "grad_norm": 12.106965065002441, "learning_rate": 1.6310521890693904e-05, "loss": 0.0226, "num_input_tokens_seen": 86234544, "step": 39950 }, { "epoch": 6.5179445350734095, "grad_norm": 2.743065595626831, "learning_rate": 1.6303848062249643e-05, "loss": 0.1557, "num_input_tokens_seen": 86245776, "step": 39955 }, { "epoch": 6.518760195758564, "grad_norm": 0.08378918468952179, "learning_rate": 1.6297174938759584e-05, "loss": 0.1994, "num_input_tokens_seen": 86256976, "step": 39960 }, { "epoch": 6.519575856443719, "grad_norm": 0.12126666307449341, "learning_rate": 1.6290502520764687e-05, "loss": 0.2295, "num_input_tokens_seen": 86267568, "step": 39965 }, { "epoch": 6.520391517128875, "grad_norm": 0.15529920160770416, "learning_rate": 1.6283830808805832e-05, "loss": 0.0047, "num_input_tokens_seen": 86278768, "step": 39970 }, { "epoch": 6.52120717781403, "grad_norm": 0.20121616125106812, "learning_rate": 1.6277159803423888e-05, "loss": 0.0369, "num_input_tokens_seen": 86288080, "step": 39975 }, { "epoch": 6.5220228384991845, "grad_norm": 0.12498260289430618, "learning_rate": 1.627048950515963e-05, "loss": 0.0067, "num_input_tokens_seen": 86299120, "step": 39980 }, { "epoch": 6.522838499184339, "grad_norm": 0.11112265288829803, "learning_rate": 1.6263819914553786e-05, "loss": 0.0152, "num_input_tokens_seen": 86311056, "step": 39985 }, { "epoch": 6.523654159869494, "grad_norm": 10.087481498718262, "learning_rate": 1.6257151032147028e-05, "loss": 0.1839, "num_input_tokens_seen": 86321904, "step": 39990 }, { "epoch": 6.524469820554649, "grad_norm": 0.02778514288365841, "learning_rate": 1.6250482858479964e-05, "loss": 0.1932, "num_input_tokens_seen": 86332880, "step": 39995 }, { "epoch": 6.525285481239804, "grad_norm": 0.33237969875335693, "learning_rate": 1.624381539409316e-05, "loss": 0.0069, "num_input_tokens_seen": 86344080, "step": 40000 }, { "epoch": 6.5261011419249595, "grad_norm": 0.25928637385368347, "learning_rate": 1.6237148639527106e-05, "loss": 0.0295, "num_input_tokens_seen": 86356240, "step": 40005 }, { "epoch": 6.526916802610114, "grad_norm": 6.4005537033081055, "learning_rate": 1.6230482595322244e-05, "loss": 0.0609, "num_input_tokens_seen": 86367824, "step": 40010 }, { "epoch": 6.527732463295269, "grad_norm": 0.1521807760000229, "learning_rate": 1.6223817262018958e-05, "loss": 0.2179, "num_input_tokens_seen": 86379632, "step": 40015 }, { "epoch": 6.528548123980424, "grad_norm": 0.05539650097489357, "learning_rate": 1.6217152640157577e-05, "loss": 0.0226, "num_input_tokens_seen": 86391248, "step": 40020 }, { "epoch": 6.529363784665579, "grad_norm": 0.11192484945058823, "learning_rate": 1.621048873027836e-05, "loss": 0.1208, "num_input_tokens_seen": 86401104, "step": 40025 }, { "epoch": 6.5301794453507345, "grad_norm": 3.5296645164489746, "learning_rate": 1.6203825532921533e-05, "loss": 0.2194, "num_input_tokens_seen": 86412528, "step": 40030 }, { "epoch": 6.530995106035889, "grad_norm": 0.1914474219083786, "learning_rate": 1.6197163048627237e-05, "loss": 0.0661, "num_input_tokens_seen": 86422800, "step": 40035 }, { "epoch": 6.531810766721044, "grad_norm": 0.0820464938879013, "learning_rate": 1.619050127793557e-05, "loss": 0.0039, "num_input_tokens_seen": 86434448, "step": 40040 }, { "epoch": 6.532626427406199, "grad_norm": 0.08316051214933395, "learning_rate": 1.6183840221386567e-05, "loss": 0.0044, "num_input_tokens_seen": 86447312, "step": 40045 }, { "epoch": 6.533442088091354, "grad_norm": 7.715195655822754, "learning_rate": 1.617717987952021e-05, "loss": 0.0774, "num_input_tokens_seen": 86458800, "step": 40050 }, { "epoch": 6.5342577487765094, "grad_norm": 2.059783697128296, "learning_rate": 1.6170520252876416e-05, "loss": 0.1462, "num_input_tokens_seen": 86470928, "step": 40055 }, { "epoch": 6.535073409461664, "grad_norm": 0.06226469203829765, "learning_rate": 1.616386134199505e-05, "loss": 0.095, "num_input_tokens_seen": 86481520, "step": 40060 }, { "epoch": 6.535889070146819, "grad_norm": 0.19757278263568878, "learning_rate": 1.6157203147415923e-05, "loss": 0.0083, "num_input_tokens_seen": 86492080, "step": 40065 }, { "epoch": 6.536704730831974, "grad_norm": 0.0957663506269455, "learning_rate": 1.6150545669678773e-05, "loss": 0.0601, "num_input_tokens_seen": 86503280, "step": 40070 }, { "epoch": 6.537520391517129, "grad_norm": 14.515009880065918, "learning_rate": 1.6143888909323286e-05, "loss": 0.1451, "num_input_tokens_seen": 86513808, "step": 40075 }, { "epoch": 6.5383360522022835, "grad_norm": 0.060574427247047424, "learning_rate": 1.6137232866889107e-05, "loss": 0.0187, "num_input_tokens_seen": 86524944, "step": 40080 }, { "epoch": 6.539151712887438, "grad_norm": 0.021970512345433235, "learning_rate": 1.6130577542915798e-05, "loss": 0.2364, "num_input_tokens_seen": 86535824, "step": 40085 }, { "epoch": 6.539967373572594, "grad_norm": 8.479681015014648, "learning_rate": 1.6123922937942883e-05, "loss": 0.1004, "num_input_tokens_seen": 86546992, "step": 40090 }, { "epoch": 6.540783034257749, "grad_norm": 0.028135670349001884, "learning_rate": 1.6117269052509803e-05, "loss": 0.0025, "num_input_tokens_seen": 86558352, "step": 40095 }, { "epoch": 6.541598694942904, "grad_norm": 0.06854220479726791, "learning_rate": 1.6110615887155972e-05, "loss": 0.0821, "num_input_tokens_seen": 86569424, "step": 40100 }, { "epoch": 6.5424143556280585, "grad_norm": 0.35423922538757324, "learning_rate": 1.6103963442420717e-05, "loss": 0.1701, "num_input_tokens_seen": 86579024, "step": 40105 }, { "epoch": 6.543230016313213, "grad_norm": 0.07758591324090958, "learning_rate": 1.6097311718843322e-05, "loss": 0.1315, "num_input_tokens_seen": 86590832, "step": 40110 }, { "epoch": 6.544045676998369, "grad_norm": 7.269680976867676, "learning_rate": 1.6090660716963014e-05, "loss": 0.0177, "num_input_tokens_seen": 86602736, "step": 40115 }, { "epoch": 6.544861337683524, "grad_norm": 4.9752373695373535, "learning_rate": 1.608401043731895e-05, "loss": 0.0565, "num_input_tokens_seen": 86613712, "step": 40120 }, { "epoch": 6.545676998368679, "grad_norm": 0.049367327243089676, "learning_rate": 1.6077360880450244e-05, "loss": 0.2344, "num_input_tokens_seen": 86624112, "step": 40125 }, { "epoch": 6.5464926590538335, "grad_norm": 0.042861804366111755, "learning_rate": 1.6070712046895936e-05, "loss": 0.0039, "num_input_tokens_seen": 86634000, "step": 40130 }, { "epoch": 6.547308319738988, "grad_norm": 28.21422004699707, "learning_rate": 1.6064063937195017e-05, "loss": 0.2787, "num_input_tokens_seen": 86644816, "step": 40135 }, { "epoch": 6.548123980424144, "grad_norm": 17.396093368530273, "learning_rate": 1.6057416551886418e-05, "loss": 0.2133, "num_input_tokens_seen": 86656496, "step": 40140 }, { "epoch": 6.548939641109299, "grad_norm": 0.055646199733018875, "learning_rate": 1.6050769891509005e-05, "loss": 0.1225, "num_input_tokens_seen": 86668400, "step": 40145 }, { "epoch": 6.549755301794454, "grad_norm": 0.18408013880252838, "learning_rate": 1.6044123956601593e-05, "loss": 0.0061, "num_input_tokens_seen": 86678832, "step": 40150 }, { "epoch": 6.5505709624796085, "grad_norm": 5.477224349975586, "learning_rate": 1.6037478747702932e-05, "loss": 0.0607, "num_input_tokens_seen": 86689040, "step": 40155 }, { "epoch": 6.551386623164763, "grad_norm": 0.05760050565004349, "learning_rate": 1.6030834265351724e-05, "loss": 0.1876, "num_input_tokens_seen": 86699952, "step": 40160 }, { "epoch": 6.552202283849918, "grad_norm": 7.261889934539795, "learning_rate": 1.602419051008659e-05, "loss": 0.0195, "num_input_tokens_seen": 86710320, "step": 40165 }, { "epoch": 6.553017944535073, "grad_norm": 0.09678348898887634, "learning_rate": 1.6017547482446127e-05, "loss": 0.0087, "num_input_tokens_seen": 86721648, "step": 40170 }, { "epoch": 6.553833605220229, "grad_norm": 0.29837679862976074, "learning_rate": 1.6010905182968837e-05, "loss": 0.0772, "num_input_tokens_seen": 86732560, "step": 40175 }, { "epoch": 6.554649265905383, "grad_norm": 0.06096908822655678, "learning_rate": 1.6004263612193182e-05, "loss": 0.0037, "num_input_tokens_seen": 86743408, "step": 40180 }, { "epoch": 6.555464926590538, "grad_norm": 0.37986379861831665, "learning_rate": 1.599762277065756e-05, "loss": 0.1418, "num_input_tokens_seen": 86754384, "step": 40185 }, { "epoch": 6.556280587275693, "grad_norm": 0.5057947635650635, "learning_rate": 1.599098265890031e-05, "loss": 0.0047, "num_input_tokens_seen": 86764592, "step": 40190 }, { "epoch": 6.557096247960848, "grad_norm": 0.09633487462997437, "learning_rate": 1.598434327745973e-05, "loss": 0.0865, "num_input_tokens_seen": 86776176, "step": 40195 }, { "epoch": 6.557911908646004, "grad_norm": 5.3133320808410645, "learning_rate": 1.5977704626874023e-05, "loss": 0.1039, "num_input_tokens_seen": 86786320, "step": 40200 }, { "epoch": 6.558727569331158, "grad_norm": 0.04846911132335663, "learning_rate": 1.597106670768136e-05, "loss": 0.0037, "num_input_tokens_seen": 86798288, "step": 40205 }, { "epoch": 6.559543230016313, "grad_norm": 0.03388461098074913, "learning_rate": 1.5964429520419836e-05, "loss": 0.1113, "num_input_tokens_seen": 86809456, "step": 40210 }, { "epoch": 6.560358890701468, "grad_norm": 0.18287904560565948, "learning_rate": 1.595779306562751e-05, "loss": 0.0074, "num_input_tokens_seen": 86819824, "step": 40215 }, { "epoch": 6.561174551386623, "grad_norm": 6.340177536010742, "learning_rate": 1.5951157343842352e-05, "loss": 0.1767, "num_input_tokens_seen": 86829872, "step": 40220 }, { "epoch": 6.561990212071779, "grad_norm": 0.07319547235965729, "learning_rate": 1.5944522355602297e-05, "loss": 0.1392, "num_input_tokens_seen": 86839472, "step": 40225 }, { "epoch": 6.562805872756933, "grad_norm": 0.2601003646850586, "learning_rate": 1.59378881014452e-05, "loss": 0.0076, "num_input_tokens_seen": 86851184, "step": 40230 }, { "epoch": 6.563621533442088, "grad_norm": 0.035068362951278687, "learning_rate": 1.5931254581908882e-05, "loss": 0.0967, "num_input_tokens_seen": 86861136, "step": 40235 }, { "epoch": 6.564437194127243, "grad_norm": 0.04618893191218376, "learning_rate": 1.592462179753108e-05, "loss": 0.1418, "num_input_tokens_seen": 86871344, "step": 40240 }, { "epoch": 6.565252854812398, "grad_norm": 3.044525146484375, "learning_rate": 1.591798974884948e-05, "loss": 0.0782, "num_input_tokens_seen": 86881488, "step": 40245 }, { "epoch": 6.566068515497553, "grad_norm": 0.13312385976314545, "learning_rate": 1.5911358436401708e-05, "loss": 0.1363, "num_input_tokens_seen": 86893264, "step": 40250 }, { "epoch": 6.566884176182708, "grad_norm": 0.05873178318142891, "learning_rate": 1.5904727860725344e-05, "loss": 0.0663, "num_input_tokens_seen": 86904208, "step": 40255 }, { "epoch": 6.567699836867863, "grad_norm": 0.6661403179168701, "learning_rate": 1.589809802235789e-05, "loss": 0.011, "num_input_tokens_seen": 86916080, "step": 40260 }, { "epoch": 6.568515497553018, "grad_norm": 0.07328290492296219, "learning_rate": 1.589146892183679e-05, "loss": 0.0072, "num_input_tokens_seen": 86926640, "step": 40265 }, { "epoch": 6.569331158238173, "grad_norm": 0.05128014460206032, "learning_rate": 1.5884840559699436e-05, "loss": 0.205, "num_input_tokens_seen": 86936688, "step": 40270 }, { "epoch": 6.570146818923328, "grad_norm": 0.050329506397247314, "learning_rate": 1.5878212936483156e-05, "loss": 0.0086, "num_input_tokens_seen": 86948272, "step": 40275 }, { "epoch": 6.5709624796084825, "grad_norm": 8.136697769165039, "learning_rate": 1.5871586052725216e-05, "loss": 0.1234, "num_input_tokens_seen": 86959184, "step": 40280 }, { "epoch": 6.571778140293638, "grad_norm": 0.24187760055065155, "learning_rate": 1.5864959908962832e-05, "loss": 0.0698, "num_input_tokens_seen": 86970032, "step": 40285 }, { "epoch": 6.572593800978793, "grad_norm": 0.05244762822985649, "learning_rate": 1.5858334505733137e-05, "loss": 0.0918, "num_input_tokens_seen": 86980048, "step": 40290 }, { "epoch": 6.573409461663948, "grad_norm": 0.05218328908085823, "learning_rate": 1.585170984357324e-05, "loss": 0.0058, "num_input_tokens_seen": 86991888, "step": 40295 }, { "epoch": 6.574225122349103, "grad_norm": 0.09422808885574341, "learning_rate": 1.5845085923020165e-05, "loss": 0.0041, "num_input_tokens_seen": 87003056, "step": 40300 }, { "epoch": 6.575040783034257, "grad_norm": 0.9793643951416016, "learning_rate": 1.5838462744610872e-05, "loss": 0.0053, "num_input_tokens_seen": 87014192, "step": 40305 }, { "epoch": 6.575856443719413, "grad_norm": 0.214778870344162, "learning_rate": 1.5831840308882276e-05, "loss": 0.1694, "num_input_tokens_seen": 87024112, "step": 40310 }, { "epoch": 6.576672104404568, "grad_norm": 0.04145102947950363, "learning_rate": 1.5825218616371224e-05, "loss": 0.088, "num_input_tokens_seen": 87034896, "step": 40315 }, { "epoch": 6.577487765089723, "grad_norm": 0.6659348607063293, "learning_rate": 1.5818597667614503e-05, "loss": 0.1042, "num_input_tokens_seen": 87045392, "step": 40320 }, { "epoch": 6.578303425774878, "grad_norm": 0.1729201376438141, "learning_rate": 1.581197746314884e-05, "loss": 0.0058, "num_input_tokens_seen": 87056432, "step": 40325 }, { "epoch": 6.579119086460032, "grad_norm": 0.08217202872037888, "learning_rate": 1.5805358003510902e-05, "loss": 0.0949, "num_input_tokens_seen": 87067440, "step": 40330 }, { "epoch": 6.579934747145187, "grad_norm": 0.03858206793665886, "learning_rate": 1.5798739289237298e-05, "loss": 0.1033, "num_input_tokens_seen": 87078064, "step": 40335 }, { "epoch": 6.580750407830343, "grad_norm": 0.11875259131193161, "learning_rate": 1.5792121320864573e-05, "loss": 0.0035, "num_input_tokens_seen": 87087920, "step": 40340 }, { "epoch": 6.581566068515498, "grad_norm": 0.06483682245016098, "learning_rate": 1.5785504098929217e-05, "loss": 0.004, "num_input_tokens_seen": 87098032, "step": 40345 }, { "epoch": 6.582381729200653, "grad_norm": 0.20756934583187103, "learning_rate": 1.5778887623967654e-05, "loss": 0.1881, "num_input_tokens_seen": 87108304, "step": 40350 }, { "epoch": 6.583197389885807, "grad_norm": 0.1414896547794342, "learning_rate": 1.5772271896516245e-05, "loss": 0.0131, "num_input_tokens_seen": 87119088, "step": 40355 }, { "epoch": 6.584013050570962, "grad_norm": 0.5052565336227417, "learning_rate": 1.57656569171113e-05, "loss": 0.215, "num_input_tokens_seen": 87130032, "step": 40360 }, { "epoch": 6.584828711256117, "grad_norm": 0.08235272020101547, "learning_rate": 1.5759042686289056e-05, "loss": 0.2778, "num_input_tokens_seen": 87141104, "step": 40365 }, { "epoch": 6.585644371941273, "grad_norm": 0.22911354899406433, "learning_rate": 1.5752429204585702e-05, "loss": 0.1331, "num_input_tokens_seen": 87151472, "step": 40370 }, { "epoch": 6.5864600326264275, "grad_norm": 5.025905609130859, "learning_rate": 1.5745816472537355e-05, "loss": 0.1019, "num_input_tokens_seen": 87161680, "step": 40375 }, { "epoch": 6.587275693311582, "grad_norm": 0.27062445878982544, "learning_rate": 1.5739204490680085e-05, "loss": 0.0069, "num_input_tokens_seen": 87172080, "step": 40380 }, { "epoch": 6.588091353996737, "grad_norm": 0.38178327679634094, "learning_rate": 1.5732593259549885e-05, "loss": 0.0073, "num_input_tokens_seen": 87181168, "step": 40385 }, { "epoch": 6.588907014681892, "grad_norm": 0.05766642466187477, "learning_rate": 1.57259827796827e-05, "loss": 0.0917, "num_input_tokens_seen": 87192144, "step": 40390 }, { "epoch": 6.589722675367048, "grad_norm": 0.1844758838415146, "learning_rate": 1.5719373051614393e-05, "loss": 0.1106, "num_input_tokens_seen": 87203344, "step": 40395 }, { "epoch": 6.5905383360522025, "grad_norm": 2.9884049892425537, "learning_rate": 1.571276407588081e-05, "loss": 0.0066, "num_input_tokens_seen": 87214416, "step": 40400 }, { "epoch": 6.591353996737357, "grad_norm": 0.058588188141584396, "learning_rate": 1.570615585301769e-05, "loss": 0.0818, "num_input_tokens_seen": 87225168, "step": 40405 }, { "epoch": 6.592169657422512, "grad_norm": 0.17275620996952057, "learning_rate": 1.5699548383560736e-05, "loss": 0.1647, "num_input_tokens_seen": 87237232, "step": 40410 }, { "epoch": 6.592985318107667, "grad_norm": 16.883686065673828, "learning_rate": 1.569294166804558e-05, "loss": 0.4044, "num_input_tokens_seen": 87247696, "step": 40415 }, { "epoch": 6.593800978792823, "grad_norm": 3.278045177459717, "learning_rate": 1.5686335707007794e-05, "loss": 0.2006, "num_input_tokens_seen": 87258448, "step": 40420 }, { "epoch": 6.5946166394779775, "grad_norm": 0.420439213514328, "learning_rate": 1.5679730500982892e-05, "loss": 0.0061, "num_input_tokens_seen": 87268880, "step": 40425 }, { "epoch": 6.595432300163132, "grad_norm": 2.7619595527648926, "learning_rate": 1.5673126050506327e-05, "loss": 0.0093, "num_input_tokens_seen": 87279568, "step": 40430 }, { "epoch": 6.596247960848287, "grad_norm": 0.09254773706197739, "learning_rate": 1.5666522356113488e-05, "loss": 0.1424, "num_input_tokens_seen": 87289776, "step": 40435 }, { "epoch": 6.597063621533442, "grad_norm": 0.029101338237524033, "learning_rate": 1.5659919418339707e-05, "loss": 0.1755, "num_input_tokens_seen": 87301488, "step": 40440 }, { "epoch": 6.597879282218597, "grad_norm": 0.014889135956764221, "learning_rate": 1.565331723772025e-05, "loss": 0.0076, "num_input_tokens_seen": 87312112, "step": 40445 }, { "epoch": 6.598694942903752, "grad_norm": 0.3790788948535919, "learning_rate": 1.5646715814790318e-05, "loss": 0.0089, "num_input_tokens_seen": 87322416, "step": 40450 }, { "epoch": 6.599510603588907, "grad_norm": 0.11865105479955673, "learning_rate": 1.5640115150085067e-05, "loss": 0.0084, "num_input_tokens_seen": 87333040, "step": 40455 }, { "epoch": 6.600326264274062, "grad_norm": 0.03850835561752319, "learning_rate": 1.5633515244139567e-05, "loss": 0.1136, "num_input_tokens_seen": 87344112, "step": 40460 }, { "epoch": 6.601141924959217, "grad_norm": 0.07840988039970398, "learning_rate": 1.562691609748885e-05, "loss": 0.0066, "num_input_tokens_seen": 87353680, "step": 40465 }, { "epoch": 6.601957585644372, "grad_norm": 0.13444779813289642, "learning_rate": 1.562031771066787e-05, "loss": 0.1358, "num_input_tokens_seen": 87363536, "step": 40470 }, { "epoch": 6.602773246329527, "grad_norm": 0.09518253803253174, "learning_rate": 1.561372008421153e-05, "loss": 0.0678, "num_input_tokens_seen": 87374992, "step": 40475 }, { "epoch": 6.603588907014682, "grad_norm": 0.01630476862192154, "learning_rate": 1.560712321865466e-05, "loss": 0.052, "num_input_tokens_seen": 87386448, "step": 40480 }, { "epoch": 6.604404567699837, "grad_norm": 1.866984248161316, "learning_rate": 1.5600527114532042e-05, "loss": 0.3241, "num_input_tokens_seen": 87395760, "step": 40485 }, { "epoch": 6.605220228384992, "grad_norm": 0.2067229002714157, "learning_rate": 1.5593931772378395e-05, "loss": 0.0071, "num_input_tokens_seen": 87407088, "step": 40490 }, { "epoch": 6.606035889070147, "grad_norm": 1.2671343088150024, "learning_rate": 1.5587337192728365e-05, "loss": 0.0068, "num_input_tokens_seen": 87418256, "step": 40495 }, { "epoch": 6.6068515497553015, "grad_norm": 0.12746548652648926, "learning_rate": 1.5580743376116536e-05, "loss": 0.0112, "num_input_tokens_seen": 87427984, "step": 40500 }, { "epoch": 6.607667210440457, "grad_norm": 0.029078975319862366, "learning_rate": 1.5574150323077432e-05, "loss": 0.1386, "num_input_tokens_seen": 87437424, "step": 40505 }, { "epoch": 6.608482871125612, "grad_norm": 0.23436488211154938, "learning_rate": 1.556755803414554e-05, "loss": 0.0287, "num_input_tokens_seen": 87447504, "step": 40510 }, { "epoch": 6.609298531810767, "grad_norm": 0.10258165746927261, "learning_rate": 1.5560966509855256e-05, "loss": 0.0888, "num_input_tokens_seen": 87457456, "step": 40515 }, { "epoch": 6.610114192495922, "grad_norm": 5.345180034637451, "learning_rate": 1.5554375750740917e-05, "loss": 0.0074, "num_input_tokens_seen": 87468464, "step": 40520 }, { "epoch": 6.6109298531810765, "grad_norm": 0.16994620859622955, "learning_rate": 1.554778575733681e-05, "loss": 0.0253, "num_input_tokens_seen": 87479504, "step": 40525 }, { "epoch": 6.611745513866231, "grad_norm": 7.721708297729492, "learning_rate": 1.5541196530177148e-05, "loss": 0.0619, "num_input_tokens_seen": 87489840, "step": 40530 }, { "epoch": 6.612561174551386, "grad_norm": 0.07135919481515884, "learning_rate": 1.5534608069796085e-05, "loss": 0.0042, "num_input_tokens_seen": 87501424, "step": 40535 }, { "epoch": 6.613376835236542, "grad_norm": 0.06724556535482407, "learning_rate": 1.5528020376727725e-05, "loss": 0.0055, "num_input_tokens_seen": 87512688, "step": 40540 }, { "epoch": 6.614192495921697, "grad_norm": 0.07323489338159561, "learning_rate": 1.5521433451506088e-05, "loss": 0.1014, "num_input_tokens_seen": 87523056, "step": 40545 }, { "epoch": 6.6150081566068515, "grad_norm": 0.1892024278640747, "learning_rate": 1.5514847294665152e-05, "loss": 0.0923, "num_input_tokens_seen": 87532784, "step": 40550 }, { "epoch": 6.615823817292006, "grad_norm": 0.2373567819595337, "learning_rate": 1.5508261906738824e-05, "loss": 0.0598, "num_input_tokens_seen": 87542224, "step": 40555 }, { "epoch": 6.616639477977161, "grad_norm": 19.97500991821289, "learning_rate": 1.5501677288260943e-05, "loss": 0.0474, "num_input_tokens_seen": 87554224, "step": 40560 }, { "epoch": 6.617455138662317, "grad_norm": 18.255889892578125, "learning_rate": 1.549509343976529e-05, "loss": 0.0801, "num_input_tokens_seen": 87564944, "step": 40565 }, { "epoch": 6.618270799347472, "grad_norm": 0.02366139553487301, "learning_rate": 1.5488510361785597e-05, "loss": 0.0125, "num_input_tokens_seen": 87575696, "step": 40570 }, { "epoch": 6.6190864600326265, "grad_norm": 0.08095688372850418, "learning_rate": 1.5481928054855512e-05, "loss": 0.0815, "num_input_tokens_seen": 87587216, "step": 40575 }, { "epoch": 6.619902120717781, "grad_norm": 0.04319370537996292, "learning_rate": 1.5475346519508637e-05, "loss": 0.093, "num_input_tokens_seen": 87597872, "step": 40580 }, { "epoch": 6.620717781402936, "grad_norm": 0.10578560084104538, "learning_rate": 1.5468765756278498e-05, "loss": 0.0729, "num_input_tokens_seen": 87608816, "step": 40585 }, { "epoch": 6.621533442088092, "grad_norm": 0.1031089797616005, "learning_rate": 1.5462185765698568e-05, "loss": 0.1104, "num_input_tokens_seen": 87619600, "step": 40590 }, { "epoch": 6.622349102773247, "grad_norm": 3.261246919631958, "learning_rate": 1.5455606548302253e-05, "loss": 0.283, "num_input_tokens_seen": 87629168, "step": 40595 }, { "epoch": 6.623164763458401, "grad_norm": 0.06577812135219574, "learning_rate": 1.5449028104622905e-05, "loss": 0.0057, "num_input_tokens_seen": 87640144, "step": 40600 }, { "epoch": 6.623980424143556, "grad_norm": 4.057028293609619, "learning_rate": 1.5442450435193795e-05, "loss": 0.0098, "num_input_tokens_seen": 87650128, "step": 40605 }, { "epoch": 6.624796084828711, "grad_norm": 0.23669201135635376, "learning_rate": 1.5435873540548135e-05, "loss": 0.1881, "num_input_tokens_seen": 87661264, "step": 40610 }, { "epoch": 6.625611745513866, "grad_norm": 0.04815550148487091, "learning_rate": 1.5429297421219107e-05, "loss": 0.0274, "num_input_tokens_seen": 87672464, "step": 40615 }, { "epoch": 6.626427406199021, "grad_norm": 0.16993646323680878, "learning_rate": 1.5422722077739794e-05, "loss": 0.1498, "num_input_tokens_seen": 87684400, "step": 40620 }, { "epoch": 6.627243066884176, "grad_norm": 0.2493388056755066, "learning_rate": 1.541614751064322e-05, "loss": 0.2131, "num_input_tokens_seen": 87695728, "step": 40625 }, { "epoch": 6.628058727569331, "grad_norm": 0.01710120402276516, "learning_rate": 1.5409573720462357e-05, "loss": 0.0041, "num_input_tokens_seen": 87705456, "step": 40630 }, { "epoch": 6.628874388254486, "grad_norm": 0.039348945021629333, "learning_rate": 1.540300070773011e-05, "loss": 0.0051, "num_input_tokens_seen": 87716816, "step": 40635 }, { "epoch": 6.629690048939641, "grad_norm": 0.020207500085234642, "learning_rate": 1.539642847297932e-05, "loss": 0.0528, "num_input_tokens_seen": 87727216, "step": 40640 }, { "epoch": 6.630505709624796, "grad_norm": 0.2506010830402374, "learning_rate": 1.5389857016742764e-05, "loss": 0.0074, "num_input_tokens_seen": 87737040, "step": 40645 }, { "epoch": 6.631321370309951, "grad_norm": 1.0845345258712769, "learning_rate": 1.538328633955316e-05, "loss": 0.0064, "num_input_tokens_seen": 87746928, "step": 40650 }, { "epoch": 6.632137030995106, "grad_norm": 0.03723951801657677, "learning_rate": 1.5376716441943162e-05, "loss": 0.0958, "num_input_tokens_seen": 87757424, "step": 40655 }, { "epoch": 6.632952691680261, "grad_norm": 12.476139068603516, "learning_rate": 1.5370147324445354e-05, "loss": 0.0884, "num_input_tokens_seen": 87768816, "step": 40660 }, { "epoch": 6.633768352365416, "grad_norm": 0.04074963927268982, "learning_rate": 1.536357898759227e-05, "loss": 0.1074, "num_input_tokens_seen": 87780912, "step": 40665 }, { "epoch": 6.634584013050571, "grad_norm": 4.669559001922607, "learning_rate": 1.535701143191637e-05, "loss": 0.3011, "num_input_tokens_seen": 87791920, "step": 40670 }, { "epoch": 6.635399673735726, "grad_norm": 0.04353715851902962, "learning_rate": 1.535044465795005e-05, "loss": 0.0058, "num_input_tokens_seen": 87802352, "step": 40675 }, { "epoch": 6.636215334420881, "grad_norm": 0.022636588662862778, "learning_rate": 1.534387866622564e-05, "loss": 0.0021, "num_input_tokens_seen": 87813488, "step": 40680 }, { "epoch": 6.637030995106036, "grad_norm": 0.1221766248345375, "learning_rate": 1.5337313457275428e-05, "loss": 0.3061, "num_input_tokens_seen": 87824016, "step": 40685 }, { "epoch": 6.637846655791191, "grad_norm": 0.10250678658485413, "learning_rate": 1.533074903163161e-05, "loss": 0.4608, "num_input_tokens_seen": 87835120, "step": 40690 }, { "epoch": 6.638662316476346, "grad_norm": 0.03980075195431709, "learning_rate": 1.5324185389826338e-05, "loss": 0.1061, "num_input_tokens_seen": 87846480, "step": 40695 }, { "epoch": 6.6394779771615005, "grad_norm": 0.10017578303813934, "learning_rate": 1.5317622532391694e-05, "loss": 0.0365, "num_input_tokens_seen": 87856304, "step": 40700 }, { "epoch": 6.640293637846656, "grad_norm": 4.295700550079346, "learning_rate": 1.53110604598597e-05, "loss": 0.1861, "num_input_tokens_seen": 87868080, "step": 40705 }, { "epoch": 6.641109298531811, "grad_norm": 0.11103116720914841, "learning_rate": 1.5304499172762293e-05, "loss": 0.0411, "num_input_tokens_seen": 87878416, "step": 40710 }, { "epoch": 6.641924959216966, "grad_norm": 0.033165328204631805, "learning_rate": 1.5297938671631386e-05, "loss": 0.1826, "num_input_tokens_seen": 87888720, "step": 40715 }, { "epoch": 6.642740619902121, "grad_norm": 0.18278607726097107, "learning_rate": 1.5291378956998793e-05, "loss": 0.0148, "num_input_tokens_seen": 87900528, "step": 40720 }, { "epoch": 6.643556280587275, "grad_norm": 2.620253562927246, "learning_rate": 1.528482002939629e-05, "loss": 0.2184, "num_input_tokens_seen": 87911984, "step": 40725 }, { "epoch": 6.64437194127243, "grad_norm": 2.582531690597534, "learning_rate": 1.5278261889355568e-05, "loss": 0.1257, "num_input_tokens_seen": 87922608, "step": 40730 }, { "epoch": 6.645187601957586, "grad_norm": 0.1558326929807663, "learning_rate": 1.527170453740826e-05, "loss": 0.1289, "num_input_tokens_seen": 87933936, "step": 40735 }, { "epoch": 6.646003262642741, "grad_norm": 0.0765334889292717, "learning_rate": 1.5265147974085947e-05, "loss": 0.0199, "num_input_tokens_seen": 87945488, "step": 40740 }, { "epoch": 6.646818923327896, "grad_norm": 0.06828638911247253, "learning_rate": 1.5258592199920135e-05, "loss": 0.1816, "num_input_tokens_seen": 87956112, "step": 40745 }, { "epoch": 6.64763458401305, "grad_norm": 3.1311378479003906, "learning_rate": 1.5252037215442266e-05, "loss": 0.1812, "num_input_tokens_seen": 87965584, "step": 40750 }, { "epoch": 6.648450244698205, "grad_norm": 0.07298579066991806, "learning_rate": 1.5245483021183722e-05, "loss": 0.0043, "num_input_tokens_seen": 87975056, "step": 40755 }, { "epoch": 6.649265905383361, "grad_norm": 0.019851306453347206, "learning_rate": 1.5238929617675817e-05, "loss": 0.0954, "num_input_tokens_seen": 87985712, "step": 40760 }, { "epoch": 6.650081566068516, "grad_norm": 4.504660129547119, "learning_rate": 1.5232377005449805e-05, "loss": 0.0127, "num_input_tokens_seen": 87996336, "step": 40765 }, { "epoch": 6.650897226753671, "grad_norm": 0.20256765186786652, "learning_rate": 1.5225825185036874e-05, "loss": 0.0573, "num_input_tokens_seen": 88007184, "step": 40770 }, { "epoch": 6.651712887438825, "grad_norm": 0.33704063296318054, "learning_rate": 1.5219274156968143e-05, "loss": 0.2346, "num_input_tokens_seen": 88017008, "step": 40775 }, { "epoch": 6.65252854812398, "grad_norm": 0.22632668912410736, "learning_rate": 1.521272392177468e-05, "loss": 0.0071, "num_input_tokens_seen": 88027760, "step": 40780 }, { "epoch": 6.653344208809135, "grad_norm": 16.721834182739258, "learning_rate": 1.5206174479987475e-05, "loss": 0.1451, "num_input_tokens_seen": 88039696, "step": 40785 }, { "epoch": 6.654159869494291, "grad_norm": 0.07452557235956192, "learning_rate": 1.5199625832137459e-05, "loss": 0.0636, "num_input_tokens_seen": 88050544, "step": 40790 }, { "epoch": 6.6549755301794455, "grad_norm": 0.050509676337242126, "learning_rate": 1.5193077978755499e-05, "loss": 0.005, "num_input_tokens_seen": 88061840, "step": 40795 }, { "epoch": 6.6557911908646, "grad_norm": 0.6662347912788391, "learning_rate": 1.5186530920372399e-05, "loss": 0.0087, "num_input_tokens_seen": 88072816, "step": 40800 }, { "epoch": 6.656606851549755, "grad_norm": 0.13393627107143402, "learning_rate": 1.5179984657518895e-05, "loss": 0.1109, "num_input_tokens_seen": 88083536, "step": 40805 }, { "epoch": 6.65742251223491, "grad_norm": 13.53052806854248, "learning_rate": 1.5173439190725663e-05, "loss": 0.2261, "num_input_tokens_seen": 88094704, "step": 40810 }, { "epoch": 6.658238172920065, "grad_norm": 4.617764472961426, "learning_rate": 1.5166894520523305e-05, "loss": 0.1982, "num_input_tokens_seen": 88106288, "step": 40815 }, { "epoch": 6.6590538336052205, "grad_norm": 0.02458474598824978, "learning_rate": 1.5160350647442367e-05, "loss": 0.0167, "num_input_tokens_seen": 88114960, "step": 40820 }, { "epoch": 6.659869494290375, "grad_norm": 0.12827251851558685, "learning_rate": 1.5153807572013338e-05, "loss": 0.0916, "num_input_tokens_seen": 88124944, "step": 40825 }, { "epoch": 6.66068515497553, "grad_norm": 0.15690867602825165, "learning_rate": 1.5147265294766624e-05, "loss": 0.0042, "num_input_tokens_seen": 88135056, "step": 40830 }, { "epoch": 6.661500815660685, "grad_norm": 0.16413655877113342, "learning_rate": 1.5140723816232583e-05, "loss": 0.0103, "num_input_tokens_seen": 88146768, "step": 40835 }, { "epoch": 6.66231647634584, "grad_norm": 0.045289475470781326, "learning_rate": 1.5134183136941487e-05, "loss": 0.3974, "num_input_tokens_seen": 88157968, "step": 40840 }, { "epoch": 6.6631321370309955, "grad_norm": 0.02990245632827282, "learning_rate": 1.5127643257423572e-05, "loss": 0.0989, "num_input_tokens_seen": 88168912, "step": 40845 }, { "epoch": 6.66394779771615, "grad_norm": 0.5999934077262878, "learning_rate": 1.5121104178208984e-05, "loss": 0.0082, "num_input_tokens_seen": 88179632, "step": 40850 }, { "epoch": 6.664763458401305, "grad_norm": 0.05874243006110191, "learning_rate": 1.5114565899827815e-05, "loss": 0.007, "num_input_tokens_seen": 88191472, "step": 40855 }, { "epoch": 6.66557911908646, "grad_norm": 0.04252789542078972, "learning_rate": 1.5108028422810094e-05, "loss": 0.021, "num_input_tokens_seen": 88202800, "step": 40860 }, { "epoch": 6.666394779771615, "grad_norm": 2.9396400451660156, "learning_rate": 1.510149174768578e-05, "loss": 0.1816, "num_input_tokens_seen": 88212848, "step": 40865 }, { "epoch": 6.6672104404567705, "grad_norm": 0.07159342616796494, "learning_rate": 1.5094955874984767e-05, "loss": 0.1988, "num_input_tokens_seen": 88224112, "step": 40870 }, { "epoch": 6.668026101141925, "grad_norm": 0.09100861102342606, "learning_rate": 1.5088420805236892e-05, "loss": 0.1974, "num_input_tokens_seen": 88234704, "step": 40875 }, { "epoch": 6.66884176182708, "grad_norm": 0.057788897305727005, "learning_rate": 1.5081886538971911e-05, "loss": 0.0077, "num_input_tokens_seen": 88247216, "step": 40880 }, { "epoch": 6.669657422512235, "grad_norm": 0.4550677537918091, "learning_rate": 1.5075353076719536e-05, "loss": 0.1011, "num_input_tokens_seen": 88259664, "step": 40885 }, { "epoch": 6.67047308319739, "grad_norm": 0.662420392036438, "learning_rate": 1.50688204190094e-05, "loss": 0.1004, "num_input_tokens_seen": 88270128, "step": 40890 }, { "epoch": 6.671288743882545, "grad_norm": 0.08329256623983383, "learning_rate": 1.5062288566371069e-05, "loss": 0.0041, "num_input_tokens_seen": 88280176, "step": 40895 }, { "epoch": 6.672104404567699, "grad_norm": 7.898841857910156, "learning_rate": 1.5055757519334048e-05, "loss": 0.1837, "num_input_tokens_seen": 88290288, "step": 40900 }, { "epoch": 6.672920065252855, "grad_norm": 0.05887956917285919, "learning_rate": 1.5049227278427782e-05, "loss": 0.0879, "num_input_tokens_seen": 88301296, "step": 40905 }, { "epoch": 6.67373572593801, "grad_norm": 2.963451862335205, "learning_rate": 1.504269784418164e-05, "loss": 0.0772, "num_input_tokens_seen": 88312752, "step": 40910 }, { "epoch": 6.674551386623165, "grad_norm": 0.16043300926685333, "learning_rate": 1.5036169217124938e-05, "loss": 0.0051, "num_input_tokens_seen": 88324144, "step": 40915 }, { "epoch": 6.6753670473083195, "grad_norm": 0.09560660272836685, "learning_rate": 1.5029641397786912e-05, "loss": 0.0822, "num_input_tokens_seen": 88335088, "step": 40920 }, { "epoch": 6.676182707993474, "grad_norm": 0.162864550948143, "learning_rate": 1.5023114386696746e-05, "loss": 0.007, "num_input_tokens_seen": 88345360, "step": 40925 }, { "epoch": 6.67699836867863, "grad_norm": 3.8639090061187744, "learning_rate": 1.5016588184383536e-05, "loss": 0.2153, "num_input_tokens_seen": 88355888, "step": 40930 }, { "epoch": 6.677814029363785, "grad_norm": 0.04227403923869133, "learning_rate": 1.5010062791376355e-05, "loss": 0.1095, "num_input_tokens_seen": 88366832, "step": 40935 }, { "epoch": 6.67862969004894, "grad_norm": 5.405738353729248, "learning_rate": 1.5003538208204173e-05, "loss": 0.2397, "num_input_tokens_seen": 88378320, "step": 40940 }, { "epoch": 6.6794453507340945, "grad_norm": 3.2369942665100098, "learning_rate": 1.4997014435395906e-05, "loss": 0.0957, "num_input_tokens_seen": 88388656, "step": 40945 }, { "epoch": 6.680261011419249, "grad_norm": 3.6323814392089844, "learning_rate": 1.4990491473480403e-05, "loss": 0.3306, "num_input_tokens_seen": 88399632, "step": 40950 }, { "epoch": 6.681076672104405, "grad_norm": 0.3965166509151459, "learning_rate": 1.4983969322986446e-05, "loss": 0.0038, "num_input_tokens_seen": 88410032, "step": 40955 }, { "epoch": 6.68189233278956, "grad_norm": 0.416993111371994, "learning_rate": 1.497744798444276e-05, "loss": 0.0125, "num_input_tokens_seen": 88421008, "step": 40960 }, { "epoch": 6.682707993474715, "grad_norm": 0.1507231742143631, "learning_rate": 1.497092745837799e-05, "loss": 0.0068, "num_input_tokens_seen": 88432144, "step": 40965 }, { "epoch": 6.6835236541598695, "grad_norm": 0.09753464162349701, "learning_rate": 1.496440774532073e-05, "loss": 0.0503, "num_input_tokens_seen": 88442576, "step": 40970 }, { "epoch": 6.684339314845024, "grad_norm": 0.6746512055397034, "learning_rate": 1.49578888457995e-05, "loss": 0.1647, "num_input_tokens_seen": 88452048, "step": 40975 }, { "epoch": 6.685154975530179, "grad_norm": 0.06260435283184052, "learning_rate": 1.4951370760342754e-05, "loss": 0.288, "num_input_tokens_seen": 88463280, "step": 40980 }, { "epoch": 6.685970636215334, "grad_norm": 0.0902155265212059, "learning_rate": 1.4944853489478878e-05, "loss": 0.04, "num_input_tokens_seen": 88473488, "step": 40985 }, { "epoch": 6.68678629690049, "grad_norm": 0.03820433467626572, "learning_rate": 1.4938337033736196e-05, "loss": 0.0031, "num_input_tokens_seen": 88484304, "step": 40990 }, { "epoch": 6.6876019575856445, "grad_norm": 0.04262197017669678, "learning_rate": 1.4931821393642969e-05, "loss": 0.1728, "num_input_tokens_seen": 88495088, "step": 40995 }, { "epoch": 6.688417618270799, "grad_norm": 0.04590749740600586, "learning_rate": 1.4925306569727385e-05, "loss": 0.2215, "num_input_tokens_seen": 88506512, "step": 41000 }, { "epoch": 6.689233278955954, "grad_norm": 0.8048055768013, "learning_rate": 1.491879256251757e-05, "loss": 0.0059, "num_input_tokens_seen": 88518192, "step": 41005 }, { "epoch": 6.690048939641109, "grad_norm": 0.17440329492092133, "learning_rate": 1.4912279372541577e-05, "loss": 0.0108, "num_input_tokens_seen": 88526928, "step": 41010 }, { "epoch": 6.690864600326265, "grad_norm": 0.10211269557476044, "learning_rate": 1.4905767000327409e-05, "loss": 0.1417, "num_input_tokens_seen": 88537616, "step": 41015 }, { "epoch": 6.691680261011419, "grad_norm": 0.033806685358285904, "learning_rate": 1.4899255446402982e-05, "loss": 0.1643, "num_input_tokens_seen": 88547408, "step": 41020 }, { "epoch": 6.692495921696574, "grad_norm": 0.044099051505327225, "learning_rate": 1.4892744711296152e-05, "loss": 0.0039, "num_input_tokens_seen": 88557232, "step": 41025 }, { "epoch": 6.693311582381729, "grad_norm": 0.11672758311033249, "learning_rate": 1.488623479553473e-05, "loss": 0.0699, "num_input_tokens_seen": 88567632, "step": 41030 }, { "epoch": 6.694127243066884, "grad_norm": 0.11783801764249802, "learning_rate": 1.4879725699646424e-05, "loss": 0.055, "num_input_tokens_seen": 88579312, "step": 41035 }, { "epoch": 6.69494290375204, "grad_norm": 2.3646669387817383, "learning_rate": 1.4873217424158906e-05, "loss": 0.0096, "num_input_tokens_seen": 88590608, "step": 41040 }, { "epoch": 6.695758564437194, "grad_norm": 0.20109257102012634, "learning_rate": 1.4866709969599767e-05, "loss": 0.0073, "num_input_tokens_seen": 88601904, "step": 41045 }, { "epoch": 6.696574225122349, "grad_norm": 0.02105763740837574, "learning_rate": 1.486020333649653e-05, "loss": 0.0045, "num_input_tokens_seen": 88611088, "step": 41050 }, { "epoch": 6.697389885807504, "grad_norm": 4.496692180633545, "learning_rate": 1.4853697525376665e-05, "loss": 0.2224, "num_input_tokens_seen": 88623120, "step": 41055 }, { "epoch": 6.698205546492659, "grad_norm": 1.6089199781417847, "learning_rate": 1.484719253676756e-05, "loss": 0.2066, "num_input_tokens_seen": 88634064, "step": 41060 }, { "epoch": 6.699021207177814, "grad_norm": 0.14514349400997162, "learning_rate": 1.4840688371196543e-05, "loss": 0.0042, "num_input_tokens_seen": 88645392, "step": 41065 }, { "epoch": 6.699836867862969, "grad_norm": 0.1685430258512497, "learning_rate": 1.4834185029190873e-05, "loss": 0.0049, "num_input_tokens_seen": 88657808, "step": 41070 }, { "epoch": 6.700652528548124, "grad_norm": 0.14244039356708527, "learning_rate": 1.4827682511277746e-05, "loss": 0.0054, "num_input_tokens_seen": 88668048, "step": 41075 }, { "epoch": 6.701468189233279, "grad_norm": 0.05467163026332855, "learning_rate": 1.4821180817984288e-05, "loss": 0.0053, "num_input_tokens_seen": 88677456, "step": 41080 }, { "epoch": 6.702283849918434, "grad_norm": 5.239294528961182, "learning_rate": 1.4814679949837563e-05, "loss": 0.3309, "num_input_tokens_seen": 88688592, "step": 41085 }, { "epoch": 6.703099510603589, "grad_norm": 0.11179067194461823, "learning_rate": 1.4808179907364555e-05, "loss": 0.0051, "num_input_tokens_seen": 88699152, "step": 41090 }, { "epoch": 6.7039151712887435, "grad_norm": 0.09347817301750183, "learning_rate": 1.48016806910922e-05, "loss": 0.1037, "num_input_tokens_seen": 88709776, "step": 41095 }, { "epoch": 6.704730831973899, "grad_norm": 10.745227813720703, "learning_rate": 1.4795182301547356e-05, "loss": 0.2152, "num_input_tokens_seen": 88721328, "step": 41100 }, { "epoch": 6.705546492659054, "grad_norm": 3.5864243507385254, "learning_rate": 1.4788684739256808e-05, "loss": 0.2329, "num_input_tokens_seen": 88732432, "step": 41105 }, { "epoch": 6.706362153344209, "grad_norm": 0.04763638973236084, "learning_rate": 1.4782188004747289e-05, "loss": 0.0087, "num_input_tokens_seen": 88743312, "step": 41110 }, { "epoch": 6.707177814029364, "grad_norm": 4.77978515625, "learning_rate": 1.4775692098545451e-05, "loss": 0.016, "num_input_tokens_seen": 88754096, "step": 41115 }, { "epoch": 6.7079934747145185, "grad_norm": 3.673560380935669, "learning_rate": 1.4769197021177896e-05, "loss": 0.0635, "num_input_tokens_seen": 88765200, "step": 41120 }, { "epoch": 6.708809135399674, "grad_norm": 0.10142073780298233, "learning_rate": 1.476270277317114e-05, "loss": 0.1696, "num_input_tokens_seen": 88777136, "step": 41125 }, { "epoch": 6.709624796084829, "grad_norm": 16.93016242980957, "learning_rate": 1.475620935505164e-05, "loss": 0.1775, "num_input_tokens_seen": 88789040, "step": 41130 }, { "epoch": 6.710440456769984, "grad_norm": 0.30513206124305725, "learning_rate": 1.4749716767345784e-05, "loss": 0.2889, "num_input_tokens_seen": 88799792, "step": 41135 }, { "epoch": 6.711256117455139, "grad_norm": 1.2527740001678467, "learning_rate": 1.4743225010579889e-05, "loss": 0.0834, "num_input_tokens_seen": 88810224, "step": 41140 }, { "epoch": 6.712071778140293, "grad_norm": 0.1730339080095291, "learning_rate": 1.4736734085280226e-05, "loss": 0.2215, "num_input_tokens_seen": 88820112, "step": 41145 }, { "epoch": 6.712887438825448, "grad_norm": 14.265073776245117, "learning_rate": 1.4730243991972976e-05, "loss": 0.075, "num_input_tokens_seen": 88830864, "step": 41150 }, { "epoch": 6.713703099510604, "grad_norm": 0.20754019916057587, "learning_rate": 1.4723754731184253e-05, "loss": 0.006, "num_input_tokens_seen": 88842096, "step": 41155 }, { "epoch": 6.714518760195759, "grad_norm": 6.739881992340088, "learning_rate": 1.4717266303440113e-05, "loss": 0.1074, "num_input_tokens_seen": 88853232, "step": 41160 }, { "epoch": 6.715334420880914, "grad_norm": 0.1268126666545868, "learning_rate": 1.471077870926654e-05, "loss": 0.102, "num_input_tokens_seen": 88863408, "step": 41165 }, { "epoch": 6.716150081566068, "grad_norm": 2.2937026023864746, "learning_rate": 1.4704291949189452e-05, "loss": 0.1632, "num_input_tokens_seen": 88873648, "step": 41170 }, { "epoch": 6.716965742251223, "grad_norm": 0.015560554340481758, "learning_rate": 1.46978060237347e-05, "loss": 0.1059, "num_input_tokens_seen": 88884720, "step": 41175 }, { "epoch": 6.717781402936378, "grad_norm": 0.11674555391073227, "learning_rate": 1.4691320933428066e-05, "loss": 0.0063, "num_input_tokens_seen": 88895632, "step": 41180 }, { "epoch": 6.718597063621534, "grad_norm": 0.16365036368370056, "learning_rate": 1.4684836678795259e-05, "loss": 0.1994, "num_input_tokens_seen": 88905456, "step": 41185 }, { "epoch": 6.719412724306689, "grad_norm": 0.12613674998283386, "learning_rate": 1.4678353260361927e-05, "loss": 0.1081, "num_input_tokens_seen": 88916848, "step": 41190 }, { "epoch": 6.720228384991843, "grad_norm": 0.10388201475143433, "learning_rate": 1.4671870678653653e-05, "loss": 0.0087, "num_input_tokens_seen": 88928240, "step": 41195 }, { "epoch": 6.721044045676998, "grad_norm": 0.19808514416217804, "learning_rate": 1.466538893419595e-05, "loss": 0.0811, "num_input_tokens_seen": 88940016, "step": 41200 }, { "epoch": 6.721859706362153, "grad_norm": 0.07586171478033066, "learning_rate": 1.4658908027514256e-05, "loss": 0.1817, "num_input_tokens_seen": 88950544, "step": 41205 }, { "epoch": 6.722675367047309, "grad_norm": 0.21056191623210907, "learning_rate": 1.4652427959133947e-05, "loss": 0.1585, "num_input_tokens_seen": 88961520, "step": 41210 }, { "epoch": 6.7234910277324635, "grad_norm": 0.1211901530623436, "learning_rate": 1.4645948729580331e-05, "loss": 0.1525, "num_input_tokens_seen": 88971952, "step": 41215 }, { "epoch": 6.724306688417618, "grad_norm": 0.38611674308776855, "learning_rate": 1.4639470339378647e-05, "loss": 0.087, "num_input_tokens_seen": 88981840, "step": 41220 }, { "epoch": 6.725122349102773, "grad_norm": 12.179522514343262, "learning_rate": 1.4632992789054064e-05, "loss": 0.1006, "num_input_tokens_seen": 88992656, "step": 41225 }, { "epoch": 6.725938009787928, "grad_norm": 3.596252679824829, "learning_rate": 1.4626516079131692e-05, "loss": 0.3164, "num_input_tokens_seen": 89004432, "step": 41230 }, { "epoch": 6.726753670473083, "grad_norm": 0.08227868378162384, "learning_rate": 1.4620040210136557e-05, "loss": 0.0592, "num_input_tokens_seen": 89014736, "step": 41235 }, { "epoch": 6.7275693311582385, "grad_norm": 0.05020721256732941, "learning_rate": 1.461356518259363e-05, "loss": 0.1544, "num_input_tokens_seen": 89024976, "step": 41240 }, { "epoch": 6.728384991843393, "grad_norm": 0.09834147989749908, "learning_rate": 1.4607090997027812e-05, "loss": 0.0034, "num_input_tokens_seen": 89035440, "step": 41245 }, { "epoch": 6.729200652528548, "grad_norm": 7.497015476226807, "learning_rate": 1.4600617653963918e-05, "loss": 0.1151, "num_input_tokens_seen": 89046608, "step": 41250 }, { "epoch": 6.730016313213703, "grad_norm": 3.570956230163574, "learning_rate": 1.4594145153926737e-05, "loss": 0.1994, "num_input_tokens_seen": 89056912, "step": 41255 }, { "epoch": 6.730831973898858, "grad_norm": 0.0936165452003479, "learning_rate": 1.4587673497440946e-05, "loss": 0.1032, "num_input_tokens_seen": 89068048, "step": 41260 }, { "epoch": 6.731647634584013, "grad_norm": 0.18565014004707336, "learning_rate": 1.458120268503117e-05, "loss": 0.1172, "num_input_tokens_seen": 89078544, "step": 41265 }, { "epoch": 6.732463295269168, "grad_norm": 5.346120357513428, "learning_rate": 1.4574732717221972e-05, "loss": 0.105, "num_input_tokens_seen": 89090768, "step": 41270 }, { "epoch": 6.733278955954323, "grad_norm": 0.6010496020317078, "learning_rate": 1.456826359453784e-05, "loss": 0.1191, "num_input_tokens_seen": 89101040, "step": 41275 }, { "epoch": 6.734094616639478, "grad_norm": 5.97984504699707, "learning_rate": 1.4561795317503185e-05, "loss": 0.029, "num_input_tokens_seen": 89112112, "step": 41280 }, { "epoch": 6.734910277324633, "grad_norm": 0.1399141401052475, "learning_rate": 1.455532788664237e-05, "loss": 0.1909, "num_input_tokens_seen": 89123056, "step": 41285 }, { "epoch": 6.735725938009788, "grad_norm": 4.638060569763184, "learning_rate": 1.4548861302479672e-05, "loss": 0.1535, "num_input_tokens_seen": 89135120, "step": 41290 }, { "epoch": 6.736541598694943, "grad_norm": 0.05866732448339462, "learning_rate": 1.4542395565539302e-05, "loss": 0.1598, "num_input_tokens_seen": 89147184, "step": 41295 }, { "epoch": 6.737357259380098, "grad_norm": 6.314901351928711, "learning_rate": 1.453593067634541e-05, "loss": 0.065, "num_input_tokens_seen": 89156912, "step": 41300 }, { "epoch": 6.738172920065253, "grad_norm": 0.07997670024633408, "learning_rate": 1.4529466635422063e-05, "loss": 0.0063, "num_input_tokens_seen": 89167184, "step": 41305 }, { "epoch": 6.738988580750408, "grad_norm": 0.11030901223421097, "learning_rate": 1.4523003443293285e-05, "loss": 0.0767, "num_input_tokens_seen": 89178448, "step": 41310 }, { "epoch": 6.739804241435563, "grad_norm": 0.14322669804096222, "learning_rate": 1.4516541100483008e-05, "loss": 0.0584, "num_input_tokens_seen": 89189072, "step": 41315 }, { "epoch": 6.740619902120718, "grad_norm": 0.036629918962717056, "learning_rate": 1.4510079607515104e-05, "loss": 0.0064, "num_input_tokens_seen": 89200496, "step": 41320 }, { "epoch": 6.741435562805873, "grad_norm": 0.09099335968494415, "learning_rate": 1.4503618964913368e-05, "loss": 0.0633, "num_input_tokens_seen": 89209840, "step": 41325 }, { "epoch": 6.742251223491028, "grad_norm": 9.764974594116211, "learning_rate": 1.4497159173201541e-05, "loss": 0.2554, "num_input_tokens_seen": 89220240, "step": 41330 }, { "epoch": 6.743066884176183, "grad_norm": 0.08768236637115479, "learning_rate": 1.4490700232903281e-05, "loss": 0.0054, "num_input_tokens_seen": 89232400, "step": 41335 }, { "epoch": 6.7438825448613375, "grad_norm": 0.12594589591026306, "learning_rate": 1.4484242144542184e-05, "loss": 0.2616, "num_input_tokens_seen": 89242608, "step": 41340 }, { "epoch": 6.744698205546492, "grad_norm": 0.053064584732055664, "learning_rate": 1.4477784908641775e-05, "loss": 0.0098, "num_input_tokens_seen": 89253328, "step": 41345 }, { "epoch": 6.745513866231647, "grad_norm": 0.07828322052955627, "learning_rate": 1.4471328525725512e-05, "loss": 0.1012, "num_input_tokens_seen": 89263984, "step": 41350 }, { "epoch": 6.746329526916803, "grad_norm": 7.68565034866333, "learning_rate": 1.446487299631677e-05, "loss": 0.0523, "num_input_tokens_seen": 89275856, "step": 41355 }, { "epoch": 6.747145187601958, "grad_norm": 5.093059062957764, "learning_rate": 1.4458418320938886e-05, "loss": 0.0586, "num_input_tokens_seen": 89287024, "step": 41360 }, { "epoch": 6.7479608482871125, "grad_norm": 0.12296286225318909, "learning_rate": 1.4451964500115101e-05, "loss": 0.1454, "num_input_tokens_seen": 89298032, "step": 41365 }, { "epoch": 6.748776508972267, "grad_norm": 0.1551298350095749, "learning_rate": 1.4445511534368595e-05, "loss": 0.006, "num_input_tokens_seen": 89309424, "step": 41370 }, { "epoch": 6.749592169657422, "grad_norm": 0.21927422285079956, "learning_rate": 1.4439059424222474e-05, "loss": 0.0085, "num_input_tokens_seen": 89320880, "step": 41375 }, { "epoch": 6.750407830342578, "grad_norm": 1.674319863319397, "learning_rate": 1.4432608170199785e-05, "loss": 0.184, "num_input_tokens_seen": 89332624, "step": 41380 }, { "epoch": 6.751223491027733, "grad_norm": 3.81581449508667, "learning_rate": 1.4426157772823495e-05, "loss": 0.0857, "num_input_tokens_seen": 89343344, "step": 41385 }, { "epoch": 6.7520391517128875, "grad_norm": 3.1091864109039307, "learning_rate": 1.4419708232616508e-05, "loss": 0.1051, "num_input_tokens_seen": 89355344, "step": 41390 }, { "epoch": 6.752854812398042, "grad_norm": 0.2115515172481537, "learning_rate": 1.4413259550101654e-05, "loss": 0.1125, "num_input_tokens_seen": 89366160, "step": 41395 }, { "epoch": 6.753670473083197, "grad_norm": 0.17943738400936127, "learning_rate": 1.4406811725801696e-05, "loss": 0.0703, "num_input_tokens_seen": 89376976, "step": 41400 }, { "epoch": 6.754486133768353, "grad_norm": 7.111088275909424, "learning_rate": 1.4400364760239333e-05, "loss": 0.1543, "num_input_tokens_seen": 89388176, "step": 41405 }, { "epoch": 6.755301794453508, "grad_norm": 0.24837414920330048, "learning_rate": 1.4393918653937183e-05, "loss": 0.0209, "num_input_tokens_seen": 89399152, "step": 41410 }, { "epoch": 6.7561174551386625, "grad_norm": 20.415496826171875, "learning_rate": 1.4387473407417801e-05, "loss": 0.2762, "num_input_tokens_seen": 89409968, "step": 41415 }, { "epoch": 6.756933115823817, "grad_norm": 0.060787633061409, "learning_rate": 1.438102902120367e-05, "loss": 0.0688, "num_input_tokens_seen": 89420560, "step": 41420 }, { "epoch": 6.757748776508972, "grad_norm": 7.669675350189209, "learning_rate": 1.437458549581721e-05, "loss": 0.1184, "num_input_tokens_seen": 89431952, "step": 41425 }, { "epoch": 6.758564437194127, "grad_norm": 0.10015132278203964, "learning_rate": 1.4368142831780763e-05, "loss": 0.0103, "num_input_tokens_seen": 89443152, "step": 41430 }, { "epoch": 6.759380097879282, "grad_norm": 0.23086363077163696, "learning_rate": 1.4361701029616598e-05, "loss": 0.0093, "num_input_tokens_seen": 89454576, "step": 41435 }, { "epoch": 6.760195758564437, "grad_norm": 0.07244721055030823, "learning_rate": 1.4355260089846931e-05, "loss": 0.0045, "num_input_tokens_seen": 89465840, "step": 41440 }, { "epoch": 6.761011419249592, "grad_norm": 0.056052159518003464, "learning_rate": 1.434882001299389e-05, "loss": 0.0043, "num_input_tokens_seen": 89476400, "step": 41445 }, { "epoch": 6.761827079934747, "grad_norm": 0.4685874879360199, "learning_rate": 1.4342380799579533e-05, "loss": 0.2262, "num_input_tokens_seen": 89487568, "step": 41450 }, { "epoch": 6.762642740619902, "grad_norm": 0.2143806368112564, "learning_rate": 1.4335942450125872e-05, "loss": 0.0116, "num_input_tokens_seen": 89496400, "step": 41455 }, { "epoch": 6.763458401305057, "grad_norm": 0.03215618431568146, "learning_rate": 1.4329504965154827e-05, "loss": 0.0177, "num_input_tokens_seen": 89507440, "step": 41460 }, { "epoch": 6.764274061990212, "grad_norm": 0.19878214597702026, "learning_rate": 1.4323068345188253e-05, "loss": 0.1351, "num_input_tokens_seen": 89516624, "step": 41465 }, { "epoch": 6.765089722675367, "grad_norm": 0.01781969517469406, "learning_rate": 1.431663259074793e-05, "loss": 0.0676, "num_input_tokens_seen": 89526352, "step": 41470 }, { "epoch": 6.765905383360522, "grad_norm": 0.06626878678798676, "learning_rate": 1.4310197702355572e-05, "loss": 0.1046, "num_input_tokens_seen": 89538128, "step": 41475 }, { "epoch": 6.766721044045677, "grad_norm": 0.23306971788406372, "learning_rate": 1.430376368053283e-05, "loss": 0.0868, "num_input_tokens_seen": 89549136, "step": 41480 }, { "epoch": 6.767536704730832, "grad_norm": 0.0779542401432991, "learning_rate": 1.429733052580128e-05, "loss": 0.009, "num_input_tokens_seen": 89559728, "step": 41485 }, { "epoch": 6.768352365415987, "grad_norm": 0.12217021733522415, "learning_rate": 1.4290898238682421e-05, "loss": 0.0192, "num_input_tokens_seen": 89569904, "step": 41490 }, { "epoch": 6.769168026101142, "grad_norm": 0.02566716820001602, "learning_rate": 1.428446681969769e-05, "loss": 0.0951, "num_input_tokens_seen": 89581008, "step": 41495 }, { "epoch": 6.769983686786297, "grad_norm": 0.14071223139762878, "learning_rate": 1.427803626936845e-05, "loss": 0.315, "num_input_tokens_seen": 89591728, "step": 41500 }, { "epoch": 6.770799347471452, "grad_norm": 0.05853862687945366, "learning_rate": 1.4271606588215988e-05, "loss": 0.0044, "num_input_tokens_seen": 89601456, "step": 41505 }, { "epoch": 6.771615008156607, "grad_norm": 0.203269362449646, "learning_rate": 1.4265177776761534e-05, "loss": 0.0807, "num_input_tokens_seen": 89612368, "step": 41510 }, { "epoch": 6.7724306688417615, "grad_norm": 0.41243788599967957, "learning_rate": 1.4258749835526235e-05, "loss": 0.0117, "num_input_tokens_seen": 89623312, "step": 41515 }, { "epoch": 6.773246329526917, "grad_norm": 3.9026591777801514, "learning_rate": 1.4252322765031179e-05, "loss": 0.1886, "num_input_tokens_seen": 89633936, "step": 41520 }, { "epoch": 6.774061990212072, "grad_norm": 0.14258667826652527, "learning_rate": 1.4245896565797373e-05, "loss": 0.0959, "num_input_tokens_seen": 89643696, "step": 41525 }, { "epoch": 6.774877650897227, "grad_norm": 0.23517794907093048, "learning_rate": 1.4239471238345753e-05, "loss": 0.0128, "num_input_tokens_seen": 89653936, "step": 41530 }, { "epoch": 6.775693311582382, "grad_norm": 0.13160397112369537, "learning_rate": 1.4233046783197195e-05, "loss": 0.013, "num_input_tokens_seen": 89664752, "step": 41535 }, { "epoch": 6.7765089722675365, "grad_norm": 0.06285054236650467, "learning_rate": 1.4226623200872496e-05, "loss": 0.0047, "num_input_tokens_seen": 89675984, "step": 41540 }, { "epoch": 6.777324632952691, "grad_norm": 0.12321215122938156, "learning_rate": 1.4220200491892383e-05, "loss": 0.0031, "num_input_tokens_seen": 89686704, "step": 41545 }, { "epoch": 6.778140293637847, "grad_norm": 1.5065103769302368, "learning_rate": 1.4213778656777515e-05, "loss": 0.094, "num_input_tokens_seen": 89697040, "step": 41550 }, { "epoch": 6.778955954323002, "grad_norm": 0.07444112747907639, "learning_rate": 1.4207357696048479e-05, "loss": 0.0066, "num_input_tokens_seen": 89708208, "step": 41555 }, { "epoch": 6.779771615008157, "grad_norm": 0.07855966687202454, "learning_rate": 1.4200937610225787e-05, "loss": 0.0073, "num_input_tokens_seen": 89719120, "step": 41560 }, { "epoch": 6.780587275693311, "grad_norm": 0.09173068404197693, "learning_rate": 1.4194518399829887e-05, "loss": 0.0724, "num_input_tokens_seen": 89730384, "step": 41565 }, { "epoch": 6.781402936378466, "grad_norm": 0.08013544976711273, "learning_rate": 1.4188100065381144e-05, "loss": 0.0035, "num_input_tokens_seen": 89742128, "step": 41570 }, { "epoch": 6.782218597063622, "grad_norm": 0.03320332244038582, "learning_rate": 1.4181682607399877e-05, "loss": 0.1222, "num_input_tokens_seen": 89752944, "step": 41575 }, { "epoch": 6.783034257748777, "grad_norm": 0.058275748044252396, "learning_rate": 1.4175266026406308e-05, "loss": 0.2549, "num_input_tokens_seen": 89762448, "step": 41580 }, { "epoch": 6.783849918433932, "grad_norm": 0.07470196485519409, "learning_rate": 1.4168850322920602e-05, "loss": 0.1627, "num_input_tokens_seen": 89773200, "step": 41585 }, { "epoch": 6.784665579119086, "grad_norm": 0.08631259202957153, "learning_rate": 1.4162435497462842e-05, "loss": 0.0104, "num_input_tokens_seen": 89783952, "step": 41590 }, { "epoch": 6.785481239804241, "grad_norm": 0.16717347502708435, "learning_rate": 1.415602155055305e-05, "loss": 0.0285, "num_input_tokens_seen": 89796528, "step": 41595 }, { "epoch": 6.786296900489396, "grad_norm": 0.13983605802059174, "learning_rate": 1.4149608482711177e-05, "loss": 0.0049, "num_input_tokens_seen": 89807664, "step": 41600 }, { "epoch": 6.787112561174552, "grad_norm": 5.405811309814453, "learning_rate": 1.4143196294457092e-05, "loss": 0.0504, "num_input_tokens_seen": 89817552, "step": 41605 }, { "epoch": 6.787928221859707, "grad_norm": 0.14414404332637787, "learning_rate": 1.4136784986310603e-05, "loss": 0.0279, "num_input_tokens_seen": 89829008, "step": 41610 }, { "epoch": 6.788743882544861, "grad_norm": 0.06346258521080017, "learning_rate": 1.4130374558791442e-05, "loss": 0.1968, "num_input_tokens_seen": 89839856, "step": 41615 }, { "epoch": 6.789559543230016, "grad_norm": 0.22917187213897705, "learning_rate": 1.412396501241926e-05, "loss": 0.0934, "num_input_tokens_seen": 89851312, "step": 41620 }, { "epoch": 6.790375203915171, "grad_norm": 0.09620549529790878, "learning_rate": 1.411755634771367e-05, "loss": 0.1002, "num_input_tokens_seen": 89863248, "step": 41625 }, { "epoch": 6.791190864600326, "grad_norm": 0.09660594910383224, "learning_rate": 1.411114856519418e-05, "loss": 0.0886, "num_input_tokens_seen": 89875344, "step": 41630 }, { "epoch": 6.7920065252854815, "grad_norm": 0.053771648555994034, "learning_rate": 1.4104741665380236e-05, "loss": 0.0038, "num_input_tokens_seen": 89887056, "step": 41635 }, { "epoch": 6.792822185970636, "grad_norm": 0.07990163564682007, "learning_rate": 1.4098335648791216e-05, "loss": 0.1245, "num_input_tokens_seen": 89898064, "step": 41640 }, { "epoch": 6.793637846655791, "grad_norm": 0.12838514149188995, "learning_rate": 1.4091930515946422e-05, "loss": 0.1057, "num_input_tokens_seen": 89908816, "step": 41645 }, { "epoch": 6.794453507340946, "grad_norm": 0.1484857052564621, "learning_rate": 1.4085526267365084e-05, "loss": 0.0046, "num_input_tokens_seen": 89918896, "step": 41650 }, { "epoch": 6.795269168026101, "grad_norm": 2.432342290878296, "learning_rate": 1.4079122903566371e-05, "loss": 0.0083, "num_input_tokens_seen": 89928880, "step": 41655 }, { "epoch": 6.7960848287112565, "grad_norm": 0.1658189594745636, "learning_rate": 1.4072720425069364e-05, "loss": 0.0115, "num_input_tokens_seen": 89939952, "step": 41660 }, { "epoch": 6.796900489396411, "grad_norm": 4.673414707183838, "learning_rate": 1.4066318832393086e-05, "loss": 0.2901, "num_input_tokens_seen": 89949360, "step": 41665 }, { "epoch": 6.797716150081566, "grad_norm": 0.26324915885925293, "learning_rate": 1.4059918126056478e-05, "loss": 0.0923, "num_input_tokens_seen": 89960144, "step": 41670 }, { "epoch": 6.798531810766721, "grad_norm": 3.5412778854370117, "learning_rate": 1.405351830657841e-05, "loss": 0.2425, "num_input_tokens_seen": 89971184, "step": 41675 }, { "epoch": 6.799347471451876, "grad_norm": 0.0986003503203392, "learning_rate": 1.4047119374477696e-05, "loss": 0.0439, "num_input_tokens_seen": 89982288, "step": 41680 }, { "epoch": 6.800163132137031, "grad_norm": 0.0386163592338562, "learning_rate": 1.4040721330273062e-05, "loss": 0.0072, "num_input_tokens_seen": 89993744, "step": 41685 }, { "epoch": 6.800978792822186, "grad_norm": 6.793071746826172, "learning_rate": 1.4034324174483166e-05, "loss": 0.0158, "num_input_tokens_seen": 90005232, "step": 41690 }, { "epoch": 6.801794453507341, "grad_norm": 0.28747808933258057, "learning_rate": 1.4027927907626586e-05, "loss": 0.0381, "num_input_tokens_seen": 90016240, "step": 41695 }, { "epoch": 6.802610114192496, "grad_norm": 0.035620465874671936, "learning_rate": 1.4021532530221846e-05, "loss": 0.0047, "num_input_tokens_seen": 90026736, "step": 41700 }, { "epoch": 6.803425774877651, "grad_norm": 0.2508222460746765, "learning_rate": 1.4015138042787381e-05, "loss": 0.0764, "num_input_tokens_seen": 90037296, "step": 41705 }, { "epoch": 6.804241435562806, "grad_norm": 0.18858025968074799, "learning_rate": 1.4008744445841566e-05, "loss": 0.0094, "num_input_tokens_seen": 90048176, "step": 41710 }, { "epoch": 6.80505709624796, "grad_norm": 0.03687680512666702, "learning_rate": 1.4002351739902691e-05, "loss": 0.0067, "num_input_tokens_seen": 90059056, "step": 41715 }, { "epoch": 6.805872756933116, "grad_norm": 0.13411162793636322, "learning_rate": 1.3995959925488988e-05, "loss": 0.0035, "num_input_tokens_seen": 90069456, "step": 41720 }, { "epoch": 6.806688417618271, "grad_norm": 0.06725441664457321, "learning_rate": 1.3989569003118609e-05, "loss": 0.0025, "num_input_tokens_seen": 90081840, "step": 41725 }, { "epoch": 6.807504078303426, "grad_norm": 0.20298072695732117, "learning_rate": 1.398317897330963e-05, "loss": 0.0803, "num_input_tokens_seen": 90091600, "step": 41730 }, { "epoch": 6.808319738988581, "grad_norm": 0.11427846550941467, "learning_rate": 1.3976789836580062e-05, "loss": 0.07, "num_input_tokens_seen": 90103344, "step": 41735 }, { "epoch": 6.809135399673735, "grad_norm": 0.9090158343315125, "learning_rate": 1.3970401593447843e-05, "loss": 0.0066, "num_input_tokens_seen": 90114352, "step": 41740 }, { "epoch": 6.809951060358891, "grad_norm": 11.22180461883545, "learning_rate": 1.3964014244430834e-05, "loss": 0.0845, "num_input_tokens_seen": 90125584, "step": 41745 }, { "epoch": 6.810766721044046, "grad_norm": 0.09490711987018585, "learning_rate": 1.3957627790046826e-05, "loss": 0.1107, "num_input_tokens_seen": 90136304, "step": 41750 }, { "epoch": 6.811582381729201, "grad_norm": 0.18473337590694427, "learning_rate": 1.3951242230813538e-05, "loss": 0.0164, "num_input_tokens_seen": 90147088, "step": 41755 }, { "epoch": 6.8123980424143555, "grad_norm": 0.04754093289375305, "learning_rate": 1.3944857567248615e-05, "loss": 0.2755, "num_input_tokens_seen": 90158512, "step": 41760 }, { "epoch": 6.81321370309951, "grad_norm": 1.2219489812850952, "learning_rate": 1.3938473799869622e-05, "loss": 0.0719, "num_input_tokens_seen": 90168880, "step": 41765 }, { "epoch": 6.814029363784666, "grad_norm": 0.24588292837142944, "learning_rate": 1.3932090929194075e-05, "loss": 0.0057, "num_input_tokens_seen": 90179696, "step": 41770 }, { "epoch": 6.814845024469821, "grad_norm": 2.426314353942871, "learning_rate": 1.39257089557394e-05, "loss": 0.0891, "num_input_tokens_seen": 90190960, "step": 41775 }, { "epoch": 6.815660685154976, "grad_norm": 3.5945773124694824, "learning_rate": 1.3919327880022945e-05, "loss": 0.1121, "num_input_tokens_seen": 90202288, "step": 41780 }, { "epoch": 6.8164763458401305, "grad_norm": 0.07115895301103592, "learning_rate": 1.3912947702561995e-05, "loss": 0.0038, "num_input_tokens_seen": 90214288, "step": 41785 }, { "epoch": 6.817292006525285, "grad_norm": 4.984953880310059, "learning_rate": 1.390656842387375e-05, "loss": 0.2038, "num_input_tokens_seen": 90225168, "step": 41790 }, { "epoch": 6.81810766721044, "grad_norm": 0.02814999222755432, "learning_rate": 1.3900190044475364e-05, "loss": 0.0937, "num_input_tokens_seen": 90235760, "step": 41795 }, { "epoch": 6.818923327895595, "grad_norm": 0.02262449637055397, "learning_rate": 1.3893812564883896e-05, "loss": 0.009, "num_input_tokens_seen": 90246864, "step": 41800 }, { "epoch": 6.819738988580751, "grad_norm": 0.050474390387535095, "learning_rate": 1.3887435985616332e-05, "loss": 0.2542, "num_input_tokens_seen": 90257680, "step": 41805 }, { "epoch": 6.8205546492659055, "grad_norm": 0.050444345921278, "learning_rate": 1.3881060307189592e-05, "loss": 0.2553, "num_input_tokens_seen": 90268976, "step": 41810 }, { "epoch": 6.82137030995106, "grad_norm": 2.1560733318328857, "learning_rate": 1.387468553012052e-05, "loss": 0.1103, "num_input_tokens_seen": 90279696, "step": 41815 }, { "epoch": 6.822185970636215, "grad_norm": 0.0733235776424408, "learning_rate": 1.386831165492589e-05, "loss": 0.0283, "num_input_tokens_seen": 90288624, "step": 41820 }, { "epoch": 6.82300163132137, "grad_norm": 0.024894678965210915, "learning_rate": 1.3861938682122396e-05, "loss": 0.0065, "num_input_tokens_seen": 90299984, "step": 41825 }, { "epoch": 6.823817292006526, "grad_norm": 0.06028543412685394, "learning_rate": 1.3855566612226666e-05, "loss": 0.0026, "num_input_tokens_seen": 90309264, "step": 41830 }, { "epoch": 6.8246329526916805, "grad_norm": 1.5850801467895508, "learning_rate": 1.3849195445755258e-05, "loss": 0.131, "num_input_tokens_seen": 90320528, "step": 41835 }, { "epoch": 6.825448613376835, "grad_norm": 0.035399507731199265, "learning_rate": 1.3842825183224642e-05, "loss": 0.0648, "num_input_tokens_seen": 90331184, "step": 41840 }, { "epoch": 6.82626427406199, "grad_norm": 0.030352847650647163, "learning_rate": 1.3836455825151229e-05, "loss": 0.0048, "num_input_tokens_seen": 90341200, "step": 41845 }, { "epoch": 6.827079934747145, "grad_norm": 0.06919962167739868, "learning_rate": 1.3830087372051347e-05, "loss": 0.1411, "num_input_tokens_seen": 90350992, "step": 41850 }, { "epoch": 6.827895595432301, "grad_norm": 3.3427679538726807, "learning_rate": 1.3823719824441262e-05, "loss": 0.1648, "num_input_tokens_seen": 90361520, "step": 41855 }, { "epoch": 6.828711256117455, "grad_norm": 0.37280991673469543, "learning_rate": 1.3817353182837151e-05, "loss": 0.1894, "num_input_tokens_seen": 90372112, "step": 41860 }, { "epoch": 6.82952691680261, "grad_norm": 0.3510379195213318, "learning_rate": 1.3810987447755136e-05, "loss": 0.2406, "num_input_tokens_seen": 90383856, "step": 41865 }, { "epoch": 6.830342577487765, "grad_norm": 0.110009104013443, "learning_rate": 1.380462261971125e-05, "loss": 0.0737, "num_input_tokens_seen": 90393552, "step": 41870 }, { "epoch": 6.83115823817292, "grad_norm": 4.968212604522705, "learning_rate": 1.3798258699221456e-05, "loss": 0.2073, "num_input_tokens_seen": 90403056, "step": 41875 }, { "epoch": 6.831973898858075, "grad_norm": 0.08078257739543915, "learning_rate": 1.3791895686801653e-05, "loss": 0.1244, "num_input_tokens_seen": 90414160, "step": 41880 }, { "epoch": 6.8327895595432295, "grad_norm": 3.059316635131836, "learning_rate": 1.3785533582967642e-05, "loss": 0.1318, "num_input_tokens_seen": 90426160, "step": 41885 }, { "epoch": 6.833605220228385, "grad_norm": 4.10181999206543, "learning_rate": 1.3779172388235192e-05, "loss": 0.4578, "num_input_tokens_seen": 90436816, "step": 41890 }, { "epoch": 6.83442088091354, "grad_norm": 0.15112021565437317, "learning_rate": 1.3772812103119964e-05, "loss": 0.0875, "num_input_tokens_seen": 90448144, "step": 41895 }, { "epoch": 6.835236541598695, "grad_norm": 4.008183002471924, "learning_rate": 1.376645272813755e-05, "loss": 0.0098, "num_input_tokens_seen": 90458352, "step": 41900 }, { "epoch": 6.83605220228385, "grad_norm": 3.7091166973114014, "learning_rate": 1.3760094263803474e-05, "loss": 0.1987, "num_input_tokens_seen": 90469264, "step": 41905 }, { "epoch": 6.8368678629690045, "grad_norm": 0.07551616430282593, "learning_rate": 1.3753736710633192e-05, "loss": 0.1271, "num_input_tokens_seen": 90478832, "step": 41910 }, { "epoch": 6.83768352365416, "grad_norm": 0.03895062580704689, "learning_rate": 1.3747380069142073e-05, "loss": 0.0048, "num_input_tokens_seen": 90489840, "step": 41915 }, { "epoch": 6.838499184339315, "grad_norm": 3.905212640762329, "learning_rate": 1.3741024339845426e-05, "loss": 0.1333, "num_input_tokens_seen": 90501232, "step": 41920 }, { "epoch": 6.83931484502447, "grad_norm": 0.134656623005867, "learning_rate": 1.373466952325847e-05, "loss": 0.0041, "num_input_tokens_seen": 90512560, "step": 41925 }, { "epoch": 6.840130505709625, "grad_norm": 0.07766005396842957, "learning_rate": 1.372831561989636e-05, "loss": 0.0088, "num_input_tokens_seen": 90523856, "step": 41930 }, { "epoch": 6.8409461663947795, "grad_norm": 0.042651377618312836, "learning_rate": 1.3721962630274171e-05, "loss": 0.1277, "num_input_tokens_seen": 90532720, "step": 41935 }, { "epoch": 6.841761827079935, "grad_norm": 0.0998101532459259, "learning_rate": 1.3715610554906922e-05, "loss": 0.147, "num_input_tokens_seen": 90543888, "step": 41940 }, { "epoch": 6.84257748776509, "grad_norm": 6.188546657562256, "learning_rate": 1.370925939430954e-05, "loss": 0.1926, "num_input_tokens_seen": 90553712, "step": 41945 }, { "epoch": 6.843393148450245, "grad_norm": 8.392866134643555, "learning_rate": 1.3702909148996878e-05, "loss": 0.0173, "num_input_tokens_seen": 90563920, "step": 41950 }, { "epoch": 6.8442088091354, "grad_norm": 0.11783468723297119, "learning_rate": 1.3696559819483722e-05, "loss": 0.0079, "num_input_tokens_seen": 90574544, "step": 41955 }, { "epoch": 6.8450244698205545, "grad_norm": 0.43951496481895447, "learning_rate": 1.3690211406284784e-05, "loss": 0.0281, "num_input_tokens_seen": 90586384, "step": 41960 }, { "epoch": 6.845840130505709, "grad_norm": 0.09904633462429047, "learning_rate": 1.368386390991469e-05, "loss": 0.0976, "num_input_tokens_seen": 90597776, "step": 41965 }, { "epoch": 6.846655791190865, "grad_norm": 0.14969505369663239, "learning_rate": 1.3677517330888007e-05, "loss": 0.0062, "num_input_tokens_seen": 90610128, "step": 41970 }, { "epoch": 6.84747145187602, "grad_norm": 0.09711721539497375, "learning_rate": 1.3671171669719218e-05, "loss": 0.0949, "num_input_tokens_seen": 90621168, "step": 41975 }, { "epoch": 6.848287112561175, "grad_norm": 0.2408400923013687, "learning_rate": 1.3664826926922736e-05, "loss": 0.0131, "num_input_tokens_seen": 90631536, "step": 41980 }, { "epoch": 6.849102773246329, "grad_norm": 10.967161178588867, "learning_rate": 1.3658483103012898e-05, "loss": 0.1604, "num_input_tokens_seen": 90641168, "step": 41985 }, { "epoch": 6.849918433931484, "grad_norm": 0.03806580603122711, "learning_rate": 1.3652140198503966e-05, "loss": 0.0056, "num_input_tokens_seen": 90651632, "step": 41990 }, { "epoch": 6.850734094616639, "grad_norm": 0.08589682728052139, "learning_rate": 1.364579821391012e-05, "loss": 0.0065, "num_input_tokens_seen": 90662128, "step": 41995 }, { "epoch": 6.851549755301795, "grad_norm": 0.08156748116016388, "learning_rate": 1.3639457149745489e-05, "loss": 0.0117, "num_input_tokens_seen": 90673808, "step": 42000 }, { "epoch": 6.85236541598695, "grad_norm": 2.5086851119995117, "learning_rate": 1.3633117006524102e-05, "loss": 0.0272, "num_input_tokens_seen": 90684752, "step": 42005 }, { "epoch": 6.853181076672104, "grad_norm": 0.07278906553983688, "learning_rate": 1.3626777784759925e-05, "loss": 0.0085, "num_input_tokens_seen": 90695312, "step": 42010 }, { "epoch": 6.853996737357259, "grad_norm": 4.181410789489746, "learning_rate": 1.3620439484966851e-05, "loss": 0.0125, "num_input_tokens_seen": 90705520, "step": 42015 }, { "epoch": 6.854812398042414, "grad_norm": 0.05359490588307381, "learning_rate": 1.3614102107658693e-05, "loss": 0.0038, "num_input_tokens_seen": 90717584, "step": 42020 }, { "epoch": 6.85562805872757, "grad_norm": 0.11791082471609116, "learning_rate": 1.3607765653349185e-05, "loss": 0.0124, "num_input_tokens_seen": 90728496, "step": 42025 }, { "epoch": 6.856443719412725, "grad_norm": 3.67244029045105, "learning_rate": 1.3601430122552e-05, "loss": 0.0962, "num_input_tokens_seen": 90738960, "step": 42030 }, { "epoch": 6.857259380097879, "grad_norm": 0.059158798307180405, "learning_rate": 1.3595095515780726e-05, "loss": 0.0693, "num_input_tokens_seen": 90750352, "step": 42035 }, { "epoch": 6.858075040783034, "grad_norm": 0.04778822138905525, "learning_rate": 1.3588761833548875e-05, "loss": 0.1266, "num_input_tokens_seen": 90761040, "step": 42040 }, { "epoch": 6.858890701468189, "grad_norm": 6.498258113861084, "learning_rate": 1.358242907636989e-05, "loss": 0.0789, "num_input_tokens_seen": 90770992, "step": 42045 }, { "epoch": 6.859706362153344, "grad_norm": 0.565824568271637, "learning_rate": 1.3576097244757138e-05, "loss": 0.0371, "num_input_tokens_seen": 90779088, "step": 42050 }, { "epoch": 6.8605220228384995, "grad_norm": 0.04997954145073891, "learning_rate": 1.3569766339223907e-05, "loss": 0.232, "num_input_tokens_seen": 90789616, "step": 42055 }, { "epoch": 6.861337683523654, "grad_norm": 3.0516979694366455, "learning_rate": 1.3563436360283412e-05, "loss": 0.0071, "num_input_tokens_seen": 90800496, "step": 42060 }, { "epoch": 6.862153344208809, "grad_norm": 31.080303192138672, "learning_rate": 1.3557107308448796e-05, "loss": 0.2309, "num_input_tokens_seen": 90809808, "step": 42065 }, { "epoch": 6.862969004893964, "grad_norm": 0.1256628930568695, "learning_rate": 1.355077918423312e-05, "loss": 0.1463, "num_input_tokens_seen": 90820880, "step": 42070 }, { "epoch": 6.863784665579119, "grad_norm": 0.07377687096595764, "learning_rate": 1.3544451988149376e-05, "loss": 0.0047, "num_input_tokens_seen": 90831696, "step": 42075 }, { "epoch": 6.864600326264274, "grad_norm": 0.10171285271644592, "learning_rate": 1.353812572071047e-05, "loss": 0.0035, "num_input_tokens_seen": 90842512, "step": 42080 }, { "epoch": 6.865415986949429, "grad_norm": 3.5421407222747803, "learning_rate": 1.353180038242926e-05, "loss": 0.0799, "num_input_tokens_seen": 90852784, "step": 42085 }, { "epoch": 6.866231647634584, "grad_norm": 0.4722101390361786, "learning_rate": 1.3525475973818502e-05, "loss": 0.1369, "num_input_tokens_seen": 90863280, "step": 42090 }, { "epoch": 6.867047308319739, "grad_norm": 0.054875221103429794, "learning_rate": 1.351915249539088e-05, "loss": 0.2059, "num_input_tokens_seen": 90874160, "step": 42095 }, { "epoch": 6.867862969004894, "grad_norm": 0.10923886299133301, "learning_rate": 1.3512829947659011e-05, "loss": 0.2623, "num_input_tokens_seen": 90884752, "step": 42100 }, { "epoch": 6.868678629690049, "grad_norm": 0.11440841108560562, "learning_rate": 1.350650833113542e-05, "loss": 0.1509, "num_input_tokens_seen": 90894704, "step": 42105 }, { "epoch": 6.869494290375204, "grad_norm": 0.06628463417291641, "learning_rate": 1.3500187646332593e-05, "loss": 0.1294, "num_input_tokens_seen": 90904528, "step": 42110 }, { "epoch": 6.870309951060359, "grad_norm": 0.19406814873218536, "learning_rate": 1.3493867893762904e-05, "loss": 0.0051, "num_input_tokens_seen": 90915696, "step": 42115 }, { "epoch": 6.871125611745514, "grad_norm": 0.2199442833662033, "learning_rate": 1.3487549073938666e-05, "loss": 0.0075, "num_input_tokens_seen": 90925776, "step": 42120 }, { "epoch": 6.871941272430669, "grad_norm": 0.09250465780496597, "learning_rate": 1.3481231187372111e-05, "loss": 0.0033, "num_input_tokens_seen": 90936144, "step": 42125 }, { "epoch": 6.872756933115824, "grad_norm": 0.23881153762340546, "learning_rate": 1.3474914234575406e-05, "loss": 0.1522, "num_input_tokens_seen": 90946480, "step": 42130 }, { "epoch": 6.873572593800979, "grad_norm": 0.8363812565803528, "learning_rate": 1.346859821606063e-05, "loss": 0.0061, "num_input_tokens_seen": 90956720, "step": 42135 }, { "epoch": 6.874388254486134, "grad_norm": 0.14776800572872162, "learning_rate": 1.3462283132339787e-05, "loss": 0.0839, "num_input_tokens_seen": 90967312, "step": 42140 }, { "epoch": 6.875203915171289, "grad_norm": 10.871723175048828, "learning_rate": 1.3455968983924822e-05, "loss": 0.2272, "num_input_tokens_seen": 90978000, "step": 42145 }, { "epoch": 6.876019575856444, "grad_norm": 0.21014146506786346, "learning_rate": 1.344965577132758e-05, "loss": 0.0062, "num_input_tokens_seen": 90988304, "step": 42150 }, { "epoch": 6.876835236541599, "grad_norm": 12.95048999786377, "learning_rate": 1.344334349505985e-05, "loss": 0.0869, "num_input_tokens_seen": 90998288, "step": 42155 }, { "epoch": 6.877650897226753, "grad_norm": 0.05227712169289589, "learning_rate": 1.3437032155633333e-05, "loss": 0.0193, "num_input_tokens_seen": 91009936, "step": 42160 }, { "epoch": 6.878466557911908, "grad_norm": 0.6685779094696045, "learning_rate": 1.343072175355966e-05, "loss": 0.0092, "num_input_tokens_seen": 91021456, "step": 42165 }, { "epoch": 6.879282218597064, "grad_norm": 0.1707424372434616, "learning_rate": 1.342441228935038e-05, "loss": 0.1517, "num_input_tokens_seen": 91032784, "step": 42170 }, { "epoch": 6.880097879282219, "grad_norm": 11.286739349365234, "learning_rate": 1.3418103763516979e-05, "loss": 0.0617, "num_input_tokens_seen": 91043952, "step": 42175 }, { "epoch": 6.8809135399673735, "grad_norm": 2.9973649978637695, "learning_rate": 1.3411796176570852e-05, "loss": 0.1612, "num_input_tokens_seen": 91054704, "step": 42180 }, { "epoch": 6.881729200652528, "grad_norm": 0.3737064599990845, "learning_rate": 1.3405489529023322e-05, "loss": 0.1331, "num_input_tokens_seen": 91064176, "step": 42185 }, { "epoch": 6.882544861337683, "grad_norm": 5.6691741943359375, "learning_rate": 1.339918382138564e-05, "loss": 0.1819, "num_input_tokens_seen": 91074768, "step": 42190 }, { "epoch": 6.883360522022839, "grad_norm": 2.48360276222229, "learning_rate": 1.3392879054168983e-05, "loss": 0.155, "num_input_tokens_seen": 91085520, "step": 42195 }, { "epoch": 6.884176182707994, "grad_norm": 0.31449663639068604, "learning_rate": 1.3386575227884443e-05, "loss": 0.1577, "num_input_tokens_seen": 91094800, "step": 42200 }, { "epoch": 6.8849918433931485, "grad_norm": 0.4666809141635895, "learning_rate": 1.3380272343043032e-05, "loss": 0.0081, "num_input_tokens_seen": 91105840, "step": 42205 }, { "epoch": 6.885807504078303, "grad_norm": 0.06634185463190079, "learning_rate": 1.337397040015571e-05, "loss": 0.2304, "num_input_tokens_seen": 91116624, "step": 42210 }, { "epoch": 6.886623164763458, "grad_norm": 9.664610862731934, "learning_rate": 1.336766939973334e-05, "loss": 0.1508, "num_input_tokens_seen": 91128176, "step": 42215 }, { "epoch": 6.887438825448614, "grad_norm": 0.19557489454746246, "learning_rate": 1.3361369342286706e-05, "loss": 0.219, "num_input_tokens_seen": 91138544, "step": 42220 }, { "epoch": 6.888254486133769, "grad_norm": 0.1942441165447235, "learning_rate": 1.3355070228326533e-05, "loss": 0.1022, "num_input_tokens_seen": 91148624, "step": 42225 }, { "epoch": 6.8890701468189235, "grad_norm": 0.08193667978048325, "learning_rate": 1.3348772058363448e-05, "loss": 0.075, "num_input_tokens_seen": 91158672, "step": 42230 }, { "epoch": 6.889885807504078, "grad_norm": 1.206705093383789, "learning_rate": 1.334247483290802e-05, "loss": 0.0067, "num_input_tokens_seen": 91169648, "step": 42235 }, { "epoch": 6.890701468189233, "grad_norm": 0.5195077061653137, "learning_rate": 1.3336178552470729e-05, "loss": 0.2202, "num_input_tokens_seen": 91180496, "step": 42240 }, { "epoch": 6.891517128874388, "grad_norm": 10.173349380493164, "learning_rate": 1.332988321756198e-05, "loss": 0.0865, "num_input_tokens_seen": 91191440, "step": 42245 }, { "epoch": 6.892332789559543, "grad_norm": 0.12923379242420197, "learning_rate": 1.3323588828692119e-05, "loss": 0.107, "num_input_tokens_seen": 91201584, "step": 42250 }, { "epoch": 6.8931484502446985, "grad_norm": 0.28006529808044434, "learning_rate": 1.3317295386371396e-05, "loss": 0.0053, "num_input_tokens_seen": 91213104, "step": 42255 }, { "epoch": 6.893964110929853, "grad_norm": 0.3573981523513794, "learning_rate": 1.3311002891109981e-05, "loss": 0.0077, "num_input_tokens_seen": 91224560, "step": 42260 }, { "epoch": 6.894779771615008, "grad_norm": 0.05163750424981117, "learning_rate": 1.3304711343417985e-05, "loss": 0.1145, "num_input_tokens_seen": 91235824, "step": 42265 }, { "epoch": 6.895595432300163, "grad_norm": 12.171839714050293, "learning_rate": 1.329842074380543e-05, "loss": 0.0396, "num_input_tokens_seen": 91247600, "step": 42270 }, { "epoch": 6.896411092985318, "grad_norm": 0.12578676640987396, "learning_rate": 1.3292131092782259e-05, "loss": 0.0811, "num_input_tokens_seen": 91259120, "step": 42275 }, { "epoch": 6.897226753670473, "grad_norm": 0.23984800279140472, "learning_rate": 1.328584239085835e-05, "loss": 0.0057, "num_input_tokens_seen": 91269040, "step": 42280 }, { "epoch": 6.898042414355628, "grad_norm": 0.13835729658603668, "learning_rate": 1.327955463854349e-05, "loss": 0.0879, "num_input_tokens_seen": 91280208, "step": 42285 }, { "epoch": 6.898858075040783, "grad_norm": 0.08634456992149353, "learning_rate": 1.3273267836347403e-05, "loss": 0.009, "num_input_tokens_seen": 91291088, "step": 42290 }, { "epoch": 6.899673735725938, "grad_norm": 0.10134554654359818, "learning_rate": 1.3266981984779725e-05, "loss": 0.0348, "num_input_tokens_seen": 91302032, "step": 42295 }, { "epoch": 6.900489396411093, "grad_norm": 0.05540751665830612, "learning_rate": 1.3260697084350018e-05, "loss": 0.0072, "num_input_tokens_seen": 91313552, "step": 42300 }, { "epoch": 6.901305057096248, "grad_norm": 0.057743240147829056, "learning_rate": 1.3254413135567773e-05, "loss": 0.0076, "num_input_tokens_seen": 91323728, "step": 42305 }, { "epoch": 6.902120717781403, "grad_norm": 0.11667633801698685, "learning_rate": 1.3248130138942394e-05, "loss": 0.1382, "num_input_tokens_seen": 91334960, "step": 42310 }, { "epoch": 6.902936378466558, "grad_norm": 0.08370362222194672, "learning_rate": 1.3241848094983206e-05, "loss": 0.005, "num_input_tokens_seen": 91346896, "step": 42315 }, { "epoch": 6.903752039151713, "grad_norm": 0.03880749270319939, "learning_rate": 1.3235567004199481e-05, "loss": 0.0251, "num_input_tokens_seen": 91357168, "step": 42320 }, { "epoch": 6.904567699836868, "grad_norm": 5.930319786071777, "learning_rate": 1.3229286867100388e-05, "loss": 0.1583, "num_input_tokens_seen": 91368688, "step": 42325 }, { "epoch": 6.9053833605220225, "grad_norm": 0.18616336584091187, "learning_rate": 1.3223007684195021e-05, "loss": 0.0103, "num_input_tokens_seen": 91379600, "step": 42330 }, { "epoch": 6.906199021207177, "grad_norm": 0.08522260934114456, "learning_rate": 1.321672945599241e-05, "loss": 0.0102, "num_input_tokens_seen": 91389872, "step": 42335 }, { "epoch": 6.907014681892333, "grad_norm": 4.693546295166016, "learning_rate": 1.3210452183001497e-05, "loss": 0.2053, "num_input_tokens_seen": 91400944, "step": 42340 }, { "epoch": 6.907830342577488, "grad_norm": 2.370194673538208, "learning_rate": 1.320417586573115e-05, "loss": 0.2174, "num_input_tokens_seen": 91412080, "step": 42345 }, { "epoch": 6.908646003262643, "grad_norm": 0.07777683436870575, "learning_rate": 1.3197900504690161e-05, "loss": 0.0052, "num_input_tokens_seen": 91423952, "step": 42350 }, { "epoch": 6.9094616639477975, "grad_norm": 0.10456414520740509, "learning_rate": 1.3191626100387238e-05, "loss": 0.182, "num_input_tokens_seen": 91434480, "step": 42355 }, { "epoch": 6.910277324632952, "grad_norm": 1.8496055603027344, "learning_rate": 1.3185352653331018e-05, "loss": 0.1313, "num_input_tokens_seen": 91444464, "step": 42360 }, { "epoch": 6.911092985318108, "grad_norm": 0.10061901807785034, "learning_rate": 1.3179080164030059e-05, "loss": 0.1324, "num_input_tokens_seen": 91453104, "step": 42365 }, { "epoch": 6.911908646003263, "grad_norm": 0.04748468101024628, "learning_rate": 1.317280863299284e-05, "loss": 0.0069, "num_input_tokens_seen": 91463120, "step": 42370 }, { "epoch": 6.912724306688418, "grad_norm": 0.10149021446704865, "learning_rate": 1.3166538060727765e-05, "loss": 0.0154, "num_input_tokens_seen": 91473104, "step": 42375 }, { "epoch": 6.9135399673735725, "grad_norm": 4.338562488555908, "learning_rate": 1.3160268447743157e-05, "loss": 0.3162, "num_input_tokens_seen": 91484304, "step": 42380 }, { "epoch": 6.914355628058727, "grad_norm": 0.06958422064781189, "learning_rate": 1.3153999794547261e-05, "loss": 0.0054, "num_input_tokens_seen": 91495536, "step": 42385 }, { "epoch": 6.915171288743883, "grad_norm": 0.06562705338001251, "learning_rate": 1.3147732101648242e-05, "loss": 0.0159, "num_input_tokens_seen": 91505392, "step": 42390 }, { "epoch": 6.915986949429038, "grad_norm": 0.04994618520140648, "learning_rate": 1.31414653695542e-05, "loss": 0.0056, "num_input_tokens_seen": 91517200, "step": 42395 }, { "epoch": 6.916802610114193, "grad_norm": 0.043880391865968704, "learning_rate": 1.3135199598773152e-05, "loss": 0.0593, "num_input_tokens_seen": 91528592, "step": 42400 }, { "epoch": 6.917618270799347, "grad_norm": 0.11109756678342819, "learning_rate": 1.3128934789813021e-05, "loss": 0.0039, "num_input_tokens_seen": 91539216, "step": 42405 }, { "epoch": 6.918433931484502, "grad_norm": 0.024895496666431427, "learning_rate": 1.312267094318167e-05, "loss": 0.268, "num_input_tokens_seen": 91549616, "step": 42410 }, { "epoch": 6.919249592169657, "grad_norm": 0.08413500338792801, "learning_rate": 1.3116408059386881e-05, "loss": 0.1347, "num_input_tokens_seen": 91560496, "step": 42415 }, { "epoch": 6.920065252854813, "grad_norm": 0.23153434693813324, "learning_rate": 1.3110146138936335e-05, "loss": 0.0992, "num_input_tokens_seen": 91571408, "step": 42420 }, { "epoch": 6.920880913539968, "grad_norm": 0.1570703238248825, "learning_rate": 1.3103885182337688e-05, "loss": 0.0072, "num_input_tokens_seen": 91580496, "step": 42425 }, { "epoch": 6.921696574225122, "grad_norm": 0.08957034349441528, "learning_rate": 1.3097625190098464e-05, "loss": 0.1445, "num_input_tokens_seen": 91591984, "step": 42430 }, { "epoch": 6.922512234910277, "grad_norm": 11.272125244140625, "learning_rate": 1.3091366162726136e-05, "loss": 0.0367, "num_input_tokens_seen": 91602672, "step": 42435 }, { "epoch": 6.923327895595432, "grad_norm": 27.047489166259766, "learning_rate": 1.3085108100728089e-05, "loss": 0.0175, "num_input_tokens_seen": 91612752, "step": 42440 }, { "epoch": 6.924143556280587, "grad_norm": 0.09969615936279297, "learning_rate": 1.3078851004611636e-05, "loss": 0.0091, "num_input_tokens_seen": 91623184, "step": 42445 }, { "epoch": 6.924959216965743, "grad_norm": 0.0777190700173378, "learning_rate": 1.3072594874884008e-05, "loss": 0.0797, "num_input_tokens_seen": 91633488, "step": 42450 }, { "epoch": 6.925774877650897, "grad_norm": 0.27353769540786743, "learning_rate": 1.3066339712052359e-05, "loss": 0.2706, "num_input_tokens_seen": 91645456, "step": 42455 }, { "epoch": 6.926590538336052, "grad_norm": 0.11094507575035095, "learning_rate": 1.3060085516623763e-05, "loss": 0.1277, "num_input_tokens_seen": 91656016, "step": 42460 }, { "epoch": 6.927406199021207, "grad_norm": 4.023979663848877, "learning_rate": 1.3053832289105216e-05, "loss": 0.1015, "num_input_tokens_seen": 91667728, "step": 42465 }, { "epoch": 6.928221859706362, "grad_norm": 0.09130234271287918, "learning_rate": 1.3047580030003642e-05, "loss": 0.0736, "num_input_tokens_seen": 91679280, "step": 42470 }, { "epoch": 6.9290375203915175, "grad_norm": 0.11625941097736359, "learning_rate": 1.3041328739825873e-05, "loss": 0.0064, "num_input_tokens_seen": 91690032, "step": 42475 }, { "epoch": 6.929853181076672, "grad_norm": 4.393698692321777, "learning_rate": 1.3035078419078675e-05, "loss": 0.0745, "num_input_tokens_seen": 91700944, "step": 42480 }, { "epoch": 6.930668841761827, "grad_norm": 5.182872772216797, "learning_rate": 1.302882906826873e-05, "loss": 0.0418, "num_input_tokens_seen": 91710096, "step": 42485 }, { "epoch": 6.931484502446982, "grad_norm": 0.2051047831773758, "learning_rate": 1.3022580687902641e-05, "loss": 0.1138, "num_input_tokens_seen": 91719856, "step": 42490 }, { "epoch": 6.932300163132137, "grad_norm": 3.923438787460327, "learning_rate": 1.3016333278486936e-05, "loss": 0.2626, "num_input_tokens_seen": 91730448, "step": 42495 }, { "epoch": 6.933115823817292, "grad_norm": 4.099771499633789, "learning_rate": 1.301008684052806e-05, "loss": 0.2064, "num_input_tokens_seen": 91742256, "step": 42500 }, { "epoch": 6.933931484502447, "grad_norm": 19.56888771057129, "learning_rate": 1.300384137453238e-05, "loss": 0.0806, "num_input_tokens_seen": 91752848, "step": 42505 }, { "epoch": 6.934747145187602, "grad_norm": 0.1936420500278473, "learning_rate": 1.2997596881006185e-05, "loss": 0.0705, "num_input_tokens_seen": 91764240, "step": 42510 }, { "epoch": 6.935562805872757, "grad_norm": 0.12515167891979218, "learning_rate": 1.2991353360455688e-05, "loss": 0.0289, "num_input_tokens_seen": 91775696, "step": 42515 }, { "epoch": 6.936378466557912, "grad_norm": 0.05261050537228584, "learning_rate": 1.298511081338702e-05, "loss": 0.0052, "num_input_tokens_seen": 91786576, "step": 42520 }, { "epoch": 6.937194127243067, "grad_norm": 0.09552355855703354, "learning_rate": 1.2978869240306219e-05, "loss": 0.0104, "num_input_tokens_seen": 91796880, "step": 42525 }, { "epoch": 6.938009787928221, "grad_norm": 0.08294784277677536, "learning_rate": 1.2972628641719286e-05, "loss": 0.1157, "num_input_tokens_seen": 91807344, "step": 42530 }, { "epoch": 6.938825448613377, "grad_norm": 0.05821099877357483, "learning_rate": 1.2966389018132097e-05, "loss": 0.0066, "num_input_tokens_seen": 91818640, "step": 42535 }, { "epoch": 6.939641109298532, "grad_norm": 0.10651091486215591, "learning_rate": 1.2960150370050475e-05, "loss": 0.0031, "num_input_tokens_seen": 91829616, "step": 42540 }, { "epoch": 6.940456769983687, "grad_norm": 4.577032089233398, "learning_rate": 1.2953912697980152e-05, "loss": 0.104, "num_input_tokens_seen": 91839376, "step": 42545 }, { "epoch": 6.941272430668842, "grad_norm": 0.03873153030872345, "learning_rate": 1.2947676002426789e-05, "loss": 0.1172, "num_input_tokens_seen": 91849776, "step": 42550 }, { "epoch": 6.942088091353996, "grad_norm": 10.644501686096191, "learning_rate": 1.2941440283895961e-05, "loss": 0.301, "num_input_tokens_seen": 91861520, "step": 42555 }, { "epoch": 6.942903752039152, "grad_norm": 0.07911406457424164, "learning_rate": 1.2935205542893158e-05, "loss": 0.1147, "num_input_tokens_seen": 91872880, "step": 42560 }, { "epoch": 6.943719412724307, "grad_norm": 0.13447058200836182, "learning_rate": 1.2928971779923821e-05, "loss": 0.1414, "num_input_tokens_seen": 91884016, "step": 42565 }, { "epoch": 6.944535073409462, "grad_norm": 8.785443305969238, "learning_rate": 1.2922738995493277e-05, "loss": 0.3427, "num_input_tokens_seen": 91895504, "step": 42570 }, { "epoch": 6.945350734094617, "grad_norm": 0.05817743390798569, "learning_rate": 1.2916507190106792e-05, "loss": 0.0049, "num_input_tokens_seen": 91906288, "step": 42575 }, { "epoch": 6.946166394779771, "grad_norm": 3.8127334117889404, "learning_rate": 1.2910276364269546e-05, "loss": 0.2059, "num_input_tokens_seen": 91916752, "step": 42580 }, { "epoch": 6.946982055464927, "grad_norm": 4.929322242736816, "learning_rate": 1.2904046518486637e-05, "loss": 0.1455, "num_input_tokens_seen": 91926992, "step": 42585 }, { "epoch": 6.947797716150082, "grad_norm": 0.04898161441087723, "learning_rate": 1.2897817653263095e-05, "loss": 0.0042, "num_input_tokens_seen": 91935504, "step": 42590 }, { "epoch": 6.948613376835237, "grad_norm": 4.509163856506348, "learning_rate": 1.2891589769103856e-05, "loss": 0.375, "num_input_tokens_seen": 91946320, "step": 42595 }, { "epoch": 6.9494290375203915, "grad_norm": 7.143014907836914, "learning_rate": 1.288536286651379e-05, "loss": 0.0135, "num_input_tokens_seen": 91957008, "step": 42600 }, { "epoch": 6.950244698205546, "grad_norm": 3.1989221572875977, "learning_rate": 1.2879136945997677e-05, "loss": 0.1246, "num_input_tokens_seen": 91967824, "step": 42605 }, { "epoch": 6.951060358890701, "grad_norm": 4.313570022583008, "learning_rate": 1.2872912008060228e-05, "loss": 0.1304, "num_input_tokens_seen": 91979024, "step": 42610 }, { "epoch": 6.951876019575856, "grad_norm": 0.044972293078899384, "learning_rate": 1.286668805320606e-05, "loss": 0.1614, "num_input_tokens_seen": 91990288, "step": 42615 }, { "epoch": 6.952691680261012, "grad_norm": 0.11908325552940369, "learning_rate": 1.2860465081939727e-05, "loss": 0.1117, "num_input_tokens_seen": 92000592, "step": 42620 }, { "epoch": 6.9535073409461665, "grad_norm": 0.04871833324432373, "learning_rate": 1.2854243094765683e-05, "loss": 0.1082, "num_input_tokens_seen": 92009520, "step": 42625 }, { "epoch": 6.954323001631321, "grad_norm": 0.04810227081179619, "learning_rate": 1.2848022092188328e-05, "loss": 0.0067, "num_input_tokens_seen": 92020848, "step": 42630 }, { "epoch": 6.955138662316476, "grad_norm": 4.308464050292969, "learning_rate": 1.2841802074711945e-05, "loss": 0.1247, "num_input_tokens_seen": 92030384, "step": 42635 }, { "epoch": 6.955954323001631, "grad_norm": 3.530879020690918, "learning_rate": 1.2835583042840788e-05, "loss": 0.2087, "num_input_tokens_seen": 92040528, "step": 42640 }, { "epoch": 6.956769983686787, "grad_norm": 0.21271125972270966, "learning_rate": 1.282936499707899e-05, "loss": 0.0844, "num_input_tokens_seen": 92052368, "step": 42645 }, { "epoch": 6.9575856443719415, "grad_norm": 0.08056087046861649, "learning_rate": 1.282314793793062e-05, "loss": 0.1743, "num_input_tokens_seen": 92063152, "step": 42650 }, { "epoch": 6.958401305057096, "grad_norm": 0.16285933554172516, "learning_rate": 1.2816931865899662e-05, "loss": 0.004, "num_input_tokens_seen": 92073296, "step": 42655 }, { "epoch": 6.959216965742251, "grad_norm": 1.8300361633300781, "learning_rate": 1.2810716781490024e-05, "loss": 0.0709, "num_input_tokens_seen": 92084176, "step": 42660 }, { "epoch": 6.960032626427406, "grad_norm": 0.6765586733818054, "learning_rate": 1.2804502685205532e-05, "loss": 0.0549, "num_input_tokens_seen": 92095344, "step": 42665 }, { "epoch": 6.960848287112562, "grad_norm": 0.2839624285697937, "learning_rate": 1.2798289577549932e-05, "loss": 0.006, "num_input_tokens_seen": 92106384, "step": 42670 }, { "epoch": 6.9616639477977165, "grad_norm": 0.16660496592521667, "learning_rate": 1.2792077459026886e-05, "loss": 0.007, "num_input_tokens_seen": 92117104, "step": 42675 }, { "epoch": 6.962479608482871, "grad_norm": 0.31639882922172546, "learning_rate": 1.278586633013999e-05, "loss": 0.027, "num_input_tokens_seen": 92127600, "step": 42680 }, { "epoch": 6.963295269168026, "grad_norm": 0.11238505691289902, "learning_rate": 1.2779656191392736e-05, "loss": 0.0747, "num_input_tokens_seen": 92138448, "step": 42685 }, { "epoch": 6.964110929853181, "grad_norm": 0.08163265883922577, "learning_rate": 1.2773447043288561e-05, "loss": 0.004, "num_input_tokens_seen": 92148144, "step": 42690 }, { "epoch": 6.964926590538336, "grad_norm": 4.075665473937988, "learning_rate": 1.2767238886330805e-05, "loss": 0.2935, "num_input_tokens_seen": 92158640, "step": 42695 }, { "epoch": 6.9657422512234906, "grad_norm": 0.09558428823947906, "learning_rate": 1.2761031721022732e-05, "loss": 0.2357, "num_input_tokens_seen": 92169008, "step": 42700 }, { "epoch": 6.966557911908646, "grad_norm": 0.06287585943937302, "learning_rate": 1.2754825547867519e-05, "loss": 0.0745, "num_input_tokens_seen": 92180432, "step": 42705 }, { "epoch": 6.967373572593801, "grad_norm": 0.18176084756851196, "learning_rate": 1.2748620367368286e-05, "loss": 0.1636, "num_input_tokens_seen": 92191376, "step": 42710 }, { "epoch": 6.968189233278956, "grad_norm": 0.0947713851928711, "learning_rate": 1.2742416180028053e-05, "loss": 0.0105, "num_input_tokens_seen": 92202256, "step": 42715 }, { "epoch": 6.969004893964111, "grad_norm": 0.1489112675189972, "learning_rate": 1.2736212986349755e-05, "loss": 0.0027, "num_input_tokens_seen": 92212400, "step": 42720 }, { "epoch": 6.9698205546492655, "grad_norm": 3.682657241821289, "learning_rate": 1.2730010786836261e-05, "loss": 0.3949, "num_input_tokens_seen": 92223216, "step": 42725 }, { "epoch": 6.970636215334421, "grad_norm": 0.04183153063058853, "learning_rate": 1.2723809581990348e-05, "loss": 0.0655, "num_input_tokens_seen": 92234928, "step": 42730 }, { "epoch": 6.971451876019576, "grad_norm": 0.15970873832702637, "learning_rate": 1.271760937231472e-05, "loss": 0.1058, "num_input_tokens_seen": 92245392, "step": 42735 }, { "epoch": 6.972267536704731, "grad_norm": 4.388833045959473, "learning_rate": 1.2711410158311987e-05, "loss": 0.1514, "num_input_tokens_seen": 92255824, "step": 42740 }, { "epoch": 6.973083197389886, "grad_norm": 0.14887931942939758, "learning_rate": 1.2705211940484707e-05, "loss": 0.0071, "num_input_tokens_seen": 92265904, "step": 42745 }, { "epoch": 6.9738988580750405, "grad_norm": 0.0658448114991188, "learning_rate": 1.2699014719335329e-05, "loss": 0.0976, "num_input_tokens_seen": 92276176, "step": 42750 }, { "epoch": 6.974714518760196, "grad_norm": 0.06925688683986664, "learning_rate": 1.2692818495366236e-05, "loss": 0.0708, "num_input_tokens_seen": 92285744, "step": 42755 }, { "epoch": 6.975530179445351, "grad_norm": 0.17164693772792816, "learning_rate": 1.2686623269079717e-05, "loss": 0.0052, "num_input_tokens_seen": 92298128, "step": 42760 }, { "epoch": 6.976345840130506, "grad_norm": 0.07573288679122925, "learning_rate": 1.2680429040977998e-05, "loss": 0.005, "num_input_tokens_seen": 92308752, "step": 42765 }, { "epoch": 6.977161500815661, "grad_norm": 3.413907527923584, "learning_rate": 1.2674235811563206e-05, "loss": 0.2298, "num_input_tokens_seen": 92319632, "step": 42770 }, { "epoch": 6.9779771615008155, "grad_norm": 0.15602654218673706, "learning_rate": 1.2668043581337401e-05, "loss": 0.0099, "num_input_tokens_seen": 92330576, "step": 42775 }, { "epoch": 6.97879282218597, "grad_norm": 2.75740909576416, "learning_rate": 1.2661852350802556e-05, "loss": 0.0679, "num_input_tokens_seen": 92340304, "step": 42780 }, { "epoch": 6.979608482871125, "grad_norm": 0.07793080806732178, "learning_rate": 1.2655662120460564e-05, "loss": 0.1154, "num_input_tokens_seen": 92350576, "step": 42785 }, { "epoch": 6.980424143556281, "grad_norm": 1.0050971508026123, "learning_rate": 1.2649472890813232e-05, "loss": 0.0084, "num_input_tokens_seen": 92359664, "step": 42790 }, { "epoch": 6.981239804241436, "grad_norm": 0.11205387860536575, "learning_rate": 1.2643284662362295e-05, "loss": 0.0571, "num_input_tokens_seen": 92370032, "step": 42795 }, { "epoch": 6.9820554649265905, "grad_norm": 0.12443529069423676, "learning_rate": 1.2637097435609402e-05, "loss": 0.203, "num_input_tokens_seen": 92380976, "step": 42800 }, { "epoch": 6.982871125611745, "grad_norm": 0.21364746987819672, "learning_rate": 1.2630911211056116e-05, "loss": 0.1235, "num_input_tokens_seen": 92393136, "step": 42805 }, { "epoch": 6.9836867862969, "grad_norm": 0.053184982389211655, "learning_rate": 1.2624725989203929e-05, "loss": 0.126, "num_input_tokens_seen": 92404208, "step": 42810 }, { "epoch": 6.984502446982056, "grad_norm": 0.05223655328154564, "learning_rate": 1.2618541770554243e-05, "loss": 0.1379, "num_input_tokens_seen": 92415152, "step": 42815 }, { "epoch": 6.985318107667211, "grad_norm": 0.09687217324972153, "learning_rate": 1.2612358555608388e-05, "loss": 0.0041, "num_input_tokens_seen": 92426192, "step": 42820 }, { "epoch": 6.986133768352365, "grad_norm": 0.08545750379562378, "learning_rate": 1.2606176344867598e-05, "loss": 0.0904, "num_input_tokens_seen": 92438064, "step": 42825 }, { "epoch": 6.98694942903752, "grad_norm": 0.6400485634803772, "learning_rate": 1.2599995138833043e-05, "loss": 0.0087, "num_input_tokens_seen": 92447280, "step": 42830 }, { "epoch": 6.987765089722675, "grad_norm": 0.17474797368049622, "learning_rate": 1.25938149380058e-05, "loss": 0.1147, "num_input_tokens_seen": 92458416, "step": 42835 }, { "epoch": 6.988580750407831, "grad_norm": 0.10868343710899353, "learning_rate": 1.258763574288686e-05, "loss": 0.0848, "num_input_tokens_seen": 92469680, "step": 42840 }, { "epoch": 6.989396411092986, "grad_norm": 0.5525995492935181, "learning_rate": 1.2581457553977144e-05, "loss": 0.0097, "num_input_tokens_seen": 92480592, "step": 42845 }, { "epoch": 6.99021207177814, "grad_norm": 0.22638683021068573, "learning_rate": 1.2575280371777496e-05, "loss": 0.0082, "num_input_tokens_seen": 92492336, "step": 42850 }, { "epoch": 6.991027732463295, "grad_norm": 0.4266146719455719, "learning_rate": 1.2569104196788665e-05, "loss": 0.0065, "num_input_tokens_seen": 92502608, "step": 42855 }, { "epoch": 6.99184339314845, "grad_norm": 0.11602754145860672, "learning_rate": 1.256292902951132e-05, "loss": 0.0084, "num_input_tokens_seen": 92513520, "step": 42860 }, { "epoch": 6.992659053833605, "grad_norm": 0.20767870545387268, "learning_rate": 1.2556754870446053e-05, "loss": 0.0556, "num_input_tokens_seen": 92524656, "step": 42865 }, { "epoch": 6.993474714518761, "grad_norm": 7.158193111419678, "learning_rate": 1.255058172009337e-05, "loss": 0.0401, "num_input_tokens_seen": 92535408, "step": 42870 }, { "epoch": 6.994290375203915, "grad_norm": 0.16819290816783905, "learning_rate": 1.2544409578953697e-05, "loss": 0.0084, "num_input_tokens_seen": 92545744, "step": 42875 }, { "epoch": 6.99510603588907, "grad_norm": 5.802420139312744, "learning_rate": 1.253823844752739e-05, "loss": 0.1006, "num_input_tokens_seen": 92556432, "step": 42880 }, { "epoch": 6.995921696574225, "grad_norm": 0.24903814494609833, "learning_rate": 1.25320683263147e-05, "loss": 0.016, "num_input_tokens_seen": 92568592, "step": 42885 }, { "epoch": 6.99673735725938, "grad_norm": 0.1295042634010315, "learning_rate": 1.2525899215815818e-05, "loss": 0.0064, "num_input_tokens_seen": 92579632, "step": 42890 }, { "epoch": 6.997553017944535, "grad_norm": 0.09052778035402298, "learning_rate": 1.2519731116530837e-05, "loss": 0.0067, "num_input_tokens_seen": 92591344, "step": 42895 }, { "epoch": 6.99836867862969, "grad_norm": 0.0461747832596302, "learning_rate": 1.2513564028959777e-05, "loss": 0.0991, "num_input_tokens_seen": 92601264, "step": 42900 }, { "epoch": 6.999184339314845, "grad_norm": 0.690471351146698, "learning_rate": 1.2507397953602574e-05, "loss": 0.1705, "num_input_tokens_seen": 92612464, "step": 42905 }, { "epoch": 7.0, "grad_norm": 0.1497431844472885, "learning_rate": 1.2501232890959075e-05, "loss": 0.0286, "num_input_tokens_seen": 92621824, "step": 42910 }, { "epoch": 7.0, "eval_loss": 0.18837185204029083, "eval_runtime": 132.8718, "eval_samples_per_second": 20.508, "eval_steps_per_second": 5.133, "num_input_tokens_seen": 92621824, "step": 42910 }, { "epoch": 7.000815660685155, "grad_norm": 0.0485120564699173, "learning_rate": 1.2495068841529058e-05, "loss": 0.0976, "num_input_tokens_seen": 92633024, "step": 42915 }, { "epoch": 7.00163132137031, "grad_norm": 0.08123528957366943, "learning_rate": 1.248890580581221e-05, "loss": 0.0077, "num_input_tokens_seen": 92644768, "step": 42920 }, { "epoch": 7.002446982055465, "grad_norm": 0.024659737944602966, "learning_rate": 1.248274378430814e-05, "loss": 0.0064, "num_input_tokens_seen": 92654240, "step": 42925 }, { "epoch": 7.00326264274062, "grad_norm": 0.18686439096927643, "learning_rate": 1.2476582777516368e-05, "loss": 0.0041, "num_input_tokens_seen": 92664288, "step": 42930 }, { "epoch": 7.004078303425775, "grad_norm": 0.0459061935544014, "learning_rate": 1.2470422785936339e-05, "loss": 0.174, "num_input_tokens_seen": 92675104, "step": 42935 }, { "epoch": 7.00489396411093, "grad_norm": 0.1195080503821373, "learning_rate": 1.2464263810067417e-05, "loss": 0.0044, "num_input_tokens_seen": 92685056, "step": 42940 }, { "epoch": 7.005709624796085, "grad_norm": 17.653226852416992, "learning_rate": 1.2458105850408874e-05, "loss": 0.0493, "num_input_tokens_seen": 92696000, "step": 42945 }, { "epoch": 7.006525285481239, "grad_norm": 0.033694829791784286, "learning_rate": 1.2451948907459907e-05, "loss": 0.1407, "num_input_tokens_seen": 92706272, "step": 42950 }, { "epoch": 7.007340946166395, "grad_norm": 0.027108220383524895, "learning_rate": 1.2445792981719622e-05, "loss": 0.1758, "num_input_tokens_seen": 92717120, "step": 42955 }, { "epoch": 7.00815660685155, "grad_norm": 0.09192747622728348, "learning_rate": 1.2439638073687065e-05, "loss": 0.0031, "num_input_tokens_seen": 92726432, "step": 42960 }, { "epoch": 7.008972267536705, "grad_norm": 0.1266232281923294, "learning_rate": 1.2433484183861178e-05, "loss": 0.1223, "num_input_tokens_seen": 92736896, "step": 42965 }, { "epoch": 7.00978792822186, "grad_norm": 0.0833207368850708, "learning_rate": 1.2427331312740822e-05, "loss": 0.0136, "num_input_tokens_seen": 92747904, "step": 42970 }, { "epoch": 7.010603588907014, "grad_norm": 0.15957383811473846, "learning_rate": 1.2421179460824787e-05, "loss": 0.1064, "num_input_tokens_seen": 92757920, "step": 42975 }, { "epoch": 7.011419249592169, "grad_norm": 0.02515152096748352, "learning_rate": 1.2415028628611767e-05, "loss": 0.0064, "num_input_tokens_seen": 92769664, "step": 42980 }, { "epoch": 7.012234910277325, "grad_norm": 0.10467785596847534, "learning_rate": 1.2408878816600384e-05, "loss": 0.0065, "num_input_tokens_seen": 92780256, "step": 42985 }, { "epoch": 7.01305057096248, "grad_norm": 12.657437324523926, "learning_rate": 1.2402730025289166e-05, "loss": 0.0618, "num_input_tokens_seen": 92791744, "step": 42990 }, { "epoch": 7.013866231647635, "grad_norm": 11.175701141357422, "learning_rate": 1.2396582255176575e-05, "loss": 0.0851, "num_input_tokens_seen": 92800928, "step": 42995 }, { "epoch": 7.014681892332789, "grad_norm": 0.08104083687067032, "learning_rate": 1.2390435506760973e-05, "loss": 0.0479, "num_input_tokens_seen": 92812352, "step": 43000 }, { "epoch": 7.015497553017944, "grad_norm": 0.2647564709186554, "learning_rate": 1.238428978054065e-05, "loss": 0.0471, "num_input_tokens_seen": 92822944, "step": 43005 }, { "epoch": 7.0163132137031, "grad_norm": 0.14446577429771423, "learning_rate": 1.2378145077013808e-05, "loss": 0.1382, "num_input_tokens_seen": 92832960, "step": 43010 }, { "epoch": 7.017128874388255, "grad_norm": 0.05882642790675163, "learning_rate": 1.237200139667857e-05, "loss": 0.2418, "num_input_tokens_seen": 92843584, "step": 43015 }, { "epoch": 7.0179445350734095, "grad_norm": 0.14573557674884796, "learning_rate": 1.2365858740032962e-05, "loss": 0.0055, "num_input_tokens_seen": 92853440, "step": 43020 }, { "epoch": 7.018760195758564, "grad_norm": 0.1150248795747757, "learning_rate": 1.2359717107574959e-05, "loss": 0.0524, "num_input_tokens_seen": 92862112, "step": 43025 }, { "epoch": 7.019575856443719, "grad_norm": 0.024813566356897354, "learning_rate": 1.2353576499802425e-05, "loss": 0.0084, "num_input_tokens_seen": 92872000, "step": 43030 }, { "epoch": 7.020391517128874, "grad_norm": 0.09132643789052963, "learning_rate": 1.2347436917213145e-05, "loss": 0.0039, "num_input_tokens_seen": 92881856, "step": 43035 }, { "epoch": 7.02120717781403, "grad_norm": 0.10275115072727203, "learning_rate": 1.2341298360304828e-05, "loss": 0.0038, "num_input_tokens_seen": 92892768, "step": 43040 }, { "epoch": 7.0220228384991845, "grad_norm": 0.059842515736818314, "learning_rate": 1.2335160829575096e-05, "loss": 0.0961, "num_input_tokens_seen": 92904384, "step": 43045 }, { "epoch": 7.022838499184339, "grad_norm": 0.16072654724121094, "learning_rate": 1.2329024325521488e-05, "loss": 0.0097, "num_input_tokens_seen": 92916160, "step": 43050 }, { "epoch": 7.023654159869494, "grad_norm": 0.14826837182044983, "learning_rate": 1.2322888848641458e-05, "loss": 0.12, "num_input_tokens_seen": 92924640, "step": 43055 }, { "epoch": 7.024469820554649, "grad_norm": 0.04622304067015648, "learning_rate": 1.2316754399432374e-05, "loss": 0.0033, "num_input_tokens_seen": 92935264, "step": 43060 }, { "epoch": 7.025285481239805, "grad_norm": 0.05373028293251991, "learning_rate": 1.231062097839154e-05, "loss": 0.0033, "num_input_tokens_seen": 92947232, "step": 43065 }, { "epoch": 7.0261011419249595, "grad_norm": 0.031497180461883545, "learning_rate": 1.2304488586016156e-05, "loss": 0.1275, "num_input_tokens_seen": 92958816, "step": 43070 }, { "epoch": 7.026916802610114, "grad_norm": 2.944779634475708, "learning_rate": 1.2298357222803341e-05, "loss": 0.0124, "num_input_tokens_seen": 92969824, "step": 43075 }, { "epoch": 7.027732463295269, "grad_norm": 0.08053677529096603, "learning_rate": 1.2292226889250142e-05, "loss": 0.0715, "num_input_tokens_seen": 92980928, "step": 43080 }, { "epoch": 7.028548123980424, "grad_norm": 0.4775858521461487, "learning_rate": 1.2286097585853507e-05, "loss": 0.0756, "num_input_tokens_seen": 92992480, "step": 43085 }, { "epoch": 7.029363784665579, "grad_norm": 0.06219562515616417, "learning_rate": 1.2279969313110313e-05, "loss": 0.1079, "num_input_tokens_seen": 93003232, "step": 43090 }, { "epoch": 7.0301794453507345, "grad_norm": 5.719261169433594, "learning_rate": 1.2273842071517344e-05, "loss": 0.1312, "num_input_tokens_seen": 93013952, "step": 43095 }, { "epoch": 7.030995106035889, "grad_norm": 37.29789733886719, "learning_rate": 1.2267715861571311e-05, "loss": 0.0797, "num_input_tokens_seen": 93024064, "step": 43100 }, { "epoch": 7.031810766721044, "grad_norm": 0.1041211411356926, "learning_rate": 1.2261590683768831e-05, "loss": 0.003, "num_input_tokens_seen": 93036128, "step": 43105 }, { "epoch": 7.032626427406199, "grad_norm": 9.541900634765625, "learning_rate": 1.2255466538606447e-05, "loss": 0.1325, "num_input_tokens_seen": 93047904, "step": 43110 }, { "epoch": 7.033442088091354, "grad_norm": 0.06114305928349495, "learning_rate": 1.224934342658061e-05, "loss": 0.2022, "num_input_tokens_seen": 93057568, "step": 43115 }, { "epoch": 7.034257748776509, "grad_norm": 0.09569920599460602, "learning_rate": 1.224322134818769e-05, "loss": 0.1111, "num_input_tokens_seen": 93068192, "step": 43120 }, { "epoch": 7.035073409461664, "grad_norm": 0.04119197651743889, "learning_rate": 1.2237100303923977e-05, "loss": 0.0036, "num_input_tokens_seen": 93080032, "step": 43125 }, { "epoch": 7.035889070146819, "grad_norm": 18.058048248291016, "learning_rate": 1.2230980294285669e-05, "loss": 0.1067, "num_input_tokens_seen": 93090784, "step": 43130 }, { "epoch": 7.036704730831974, "grad_norm": 5.157989025115967, "learning_rate": 1.2224861319768887e-05, "loss": 0.0084, "num_input_tokens_seen": 93102176, "step": 43135 }, { "epoch": 7.037520391517129, "grad_norm": 0.05058105289936066, "learning_rate": 1.2218743380869669e-05, "loss": 0.0046, "num_input_tokens_seen": 93112928, "step": 43140 }, { "epoch": 7.0383360522022835, "grad_norm": 0.07497899979352951, "learning_rate": 1.2212626478083964e-05, "loss": 0.0596, "num_input_tokens_seen": 93123872, "step": 43145 }, { "epoch": 7.039151712887439, "grad_norm": 0.08604057133197784, "learning_rate": 1.2206510611907638e-05, "loss": 0.011, "num_input_tokens_seen": 93134368, "step": 43150 }, { "epoch": 7.039967373572594, "grad_norm": 4.297252655029297, "learning_rate": 1.2200395782836477e-05, "loss": 0.0941, "num_input_tokens_seen": 93146720, "step": 43155 }, { "epoch": 7.040783034257749, "grad_norm": 0.10059039294719696, "learning_rate": 1.2194281991366176e-05, "loss": 0.0042, "num_input_tokens_seen": 93157632, "step": 43160 }, { "epoch": 7.041598694942904, "grad_norm": 0.06647174060344696, "learning_rate": 1.2188169237992345e-05, "loss": 0.0033, "num_input_tokens_seen": 93167264, "step": 43165 }, { "epoch": 7.0424143556280585, "grad_norm": 0.44864416122436523, "learning_rate": 1.218205752321053e-05, "loss": 0.0305, "num_input_tokens_seen": 93179136, "step": 43170 }, { "epoch": 7.043230016313213, "grad_norm": 0.0658923089504242, "learning_rate": 1.217594684751617e-05, "loss": 0.1252, "num_input_tokens_seen": 93189536, "step": 43175 }, { "epoch": 7.044045676998369, "grad_norm": 0.04984398931264877, "learning_rate": 1.2169837211404627e-05, "loss": 0.0014, "num_input_tokens_seen": 93201568, "step": 43180 }, { "epoch": 7.044861337683524, "grad_norm": 0.034007783979177475, "learning_rate": 1.2163728615371181e-05, "loss": 0.0191, "num_input_tokens_seen": 93212128, "step": 43185 }, { "epoch": 7.045676998368679, "grad_norm": 6.431211471557617, "learning_rate": 1.2157621059911014e-05, "loss": 0.2947, "num_input_tokens_seen": 93222784, "step": 43190 }, { "epoch": 7.0464926590538335, "grad_norm": 0.05510122701525688, "learning_rate": 1.2151514545519254e-05, "loss": 0.0028, "num_input_tokens_seen": 93234016, "step": 43195 }, { "epoch": 7.047308319738988, "grad_norm": 4.3655595779418945, "learning_rate": 1.214540907269092e-05, "loss": 0.1949, "num_input_tokens_seen": 93243200, "step": 43200 }, { "epoch": 7.048123980424143, "grad_norm": 0.061375848948955536, "learning_rate": 1.2139304641920946e-05, "loss": 0.0016, "num_input_tokens_seen": 93254432, "step": 43205 }, { "epoch": 7.048939641109299, "grad_norm": 0.0294011402875185, "learning_rate": 1.2133201253704196e-05, "loss": 0.0037, "num_input_tokens_seen": 93263360, "step": 43210 }, { "epoch": 7.049755301794454, "grad_norm": 0.03514668717980385, "learning_rate": 1.2127098908535434e-05, "loss": 0.1108, "num_input_tokens_seen": 93273376, "step": 43215 }, { "epoch": 7.0505709624796085, "grad_norm": 0.09905792772769928, "learning_rate": 1.2120997606909354e-05, "loss": 0.091, "num_input_tokens_seen": 93284416, "step": 43220 }, { "epoch": 7.051386623164763, "grad_norm": 0.1089557558298111, "learning_rate": 1.2114897349320553e-05, "loss": 0.0242, "num_input_tokens_seen": 93295776, "step": 43225 }, { "epoch": 7.052202283849918, "grad_norm": 8.377816200256348, "learning_rate": 1.2108798136263555e-05, "loss": 0.0256, "num_input_tokens_seen": 93305408, "step": 43230 }, { "epoch": 7.053017944535074, "grad_norm": 7.742931842803955, "learning_rate": 1.2102699968232787e-05, "loss": 0.4057, "num_input_tokens_seen": 93316192, "step": 43235 }, { "epoch": 7.053833605220229, "grad_norm": 0.05888286232948303, "learning_rate": 1.2096602845722598e-05, "loss": 0.132, "num_input_tokens_seen": 93327072, "step": 43240 }, { "epoch": 7.054649265905383, "grad_norm": 0.2003031075000763, "learning_rate": 1.2090506769227256e-05, "loss": 0.005, "num_input_tokens_seen": 93338368, "step": 43245 }, { "epoch": 7.055464926590538, "grad_norm": 0.4757084250450134, "learning_rate": 1.2084411739240936e-05, "loss": 0.011, "num_input_tokens_seen": 93349952, "step": 43250 }, { "epoch": 7.056280587275693, "grad_norm": 3.10579514503479, "learning_rate": 1.2078317756257735e-05, "loss": 0.1048, "num_input_tokens_seen": 93360480, "step": 43255 }, { "epoch": 7.057096247960848, "grad_norm": 8.451691627502441, "learning_rate": 1.207222482077166e-05, "loss": 0.0438, "num_input_tokens_seen": 93371104, "step": 43260 }, { "epoch": 7.057911908646004, "grad_norm": 10.191644668579102, "learning_rate": 1.2066132933276636e-05, "loss": 0.2268, "num_input_tokens_seen": 93381792, "step": 43265 }, { "epoch": 7.058727569331158, "grad_norm": 0.05645562708377838, "learning_rate": 1.2060042094266495e-05, "loss": 0.0019, "num_input_tokens_seen": 93393024, "step": 43270 }, { "epoch": 7.059543230016313, "grad_norm": 0.03519802913069725, "learning_rate": 1.2053952304235002e-05, "loss": 0.0435, "num_input_tokens_seen": 93403968, "step": 43275 }, { "epoch": 7.060358890701468, "grad_norm": 0.07684657722711563, "learning_rate": 1.2047863563675826e-05, "loss": 0.0085, "num_input_tokens_seen": 93414336, "step": 43280 }, { "epoch": 7.061174551386623, "grad_norm": 0.0919998362660408, "learning_rate": 1.204177587308255e-05, "loss": 0.0042, "num_input_tokens_seen": 93425312, "step": 43285 }, { "epoch": 7.061990212071779, "grad_norm": 3.51995587348938, "learning_rate": 1.2035689232948669e-05, "loss": 0.1389, "num_input_tokens_seen": 93434368, "step": 43290 }, { "epoch": 7.062805872756933, "grad_norm": 4.85670804977417, "learning_rate": 1.20296036437676e-05, "loss": 0.1302, "num_input_tokens_seen": 93445696, "step": 43295 }, { "epoch": 7.063621533442088, "grad_norm": 0.11340179294347763, "learning_rate": 1.2023519106032672e-05, "loss": 0.0035, "num_input_tokens_seen": 93456768, "step": 43300 }, { "epoch": 7.064437194127243, "grad_norm": 0.027684850618243217, "learning_rate": 1.2017435620237125e-05, "loss": 0.1451, "num_input_tokens_seen": 93468320, "step": 43305 }, { "epoch": 7.065252854812398, "grad_norm": 0.04040546715259552, "learning_rate": 1.201135318687412e-05, "loss": 0.0833, "num_input_tokens_seen": 93480832, "step": 43310 }, { "epoch": 7.066068515497553, "grad_norm": 0.05264955013990402, "learning_rate": 1.2005271806436727e-05, "loss": 0.0303, "num_input_tokens_seen": 93490880, "step": 43315 }, { "epoch": 7.066884176182708, "grad_norm": 0.030342532321810722, "learning_rate": 1.199919147941794e-05, "loss": 0.0019, "num_input_tokens_seen": 93501600, "step": 43320 }, { "epoch": 7.067699836867863, "grad_norm": 0.29926538467407227, "learning_rate": 1.1993112206310656e-05, "loss": 0.0794, "num_input_tokens_seen": 93512832, "step": 43325 }, { "epoch": 7.068515497553018, "grad_norm": 0.18785369396209717, "learning_rate": 1.1987033987607681e-05, "loss": 0.0204, "num_input_tokens_seen": 93524416, "step": 43330 }, { "epoch": 7.069331158238173, "grad_norm": 0.02750498801469803, "learning_rate": 1.198095682380177e-05, "loss": 0.0923, "num_input_tokens_seen": 93535744, "step": 43335 }, { "epoch": 7.070146818923328, "grad_norm": 0.06883826106786728, "learning_rate": 1.1974880715385557e-05, "loss": 0.0036, "num_input_tokens_seen": 93546016, "step": 43340 }, { "epoch": 7.0709624796084825, "grad_norm": 3.369436502456665, "learning_rate": 1.1968805662851601e-05, "loss": 0.0084, "num_input_tokens_seen": 93555040, "step": 43345 }, { "epoch": 7.071778140293638, "grad_norm": 3.999937057495117, "learning_rate": 1.1962731666692378e-05, "loss": 0.1094, "num_input_tokens_seen": 93565184, "step": 43350 }, { "epoch": 7.072593800978793, "grad_norm": 0.06369879096746445, "learning_rate": 1.1956658727400277e-05, "loss": 0.0048, "num_input_tokens_seen": 93576352, "step": 43355 }, { "epoch": 7.073409461663948, "grad_norm": 0.036028988659381866, "learning_rate": 1.1950586845467602e-05, "loss": 0.2048, "num_input_tokens_seen": 93586880, "step": 43360 }, { "epoch": 7.074225122349103, "grad_norm": 0.20576518774032593, "learning_rate": 1.1944516021386565e-05, "loss": 0.074, "num_input_tokens_seen": 93598560, "step": 43365 }, { "epoch": 7.075040783034257, "grad_norm": 8.174752235412598, "learning_rate": 1.1938446255649305e-05, "loss": 0.2437, "num_input_tokens_seen": 93608928, "step": 43370 }, { "epoch": 7.075856443719413, "grad_norm": 0.08758328855037689, "learning_rate": 1.1932377548747867e-05, "loss": 0.266, "num_input_tokens_seen": 93619552, "step": 43375 }, { "epoch": 7.076672104404568, "grad_norm": 0.20168966054916382, "learning_rate": 1.1926309901174196e-05, "loss": 0.0036, "num_input_tokens_seen": 93631264, "step": 43380 }, { "epoch": 7.077487765089723, "grad_norm": 9.433473587036133, "learning_rate": 1.192024331342019e-05, "loss": 0.0733, "num_input_tokens_seen": 93641792, "step": 43385 }, { "epoch": 7.078303425774878, "grad_norm": 0.04689888656139374, "learning_rate": 1.1914177785977629e-05, "loss": 0.0873, "num_input_tokens_seen": 93652992, "step": 43390 }, { "epoch": 7.079119086460032, "grad_norm": 0.12399861961603165, "learning_rate": 1.1908113319338212e-05, "loss": 0.0029, "num_input_tokens_seen": 93663552, "step": 43395 }, { "epoch": 7.079934747145187, "grad_norm": 1.831578254699707, "learning_rate": 1.1902049913993558e-05, "loss": 0.1098, "num_input_tokens_seen": 93674272, "step": 43400 }, { "epoch": 7.080750407830343, "grad_norm": 0.038481879979372025, "learning_rate": 1.1895987570435196e-05, "loss": 0.0035, "num_input_tokens_seen": 93685536, "step": 43405 }, { "epoch": 7.081566068515498, "grad_norm": 0.17415465414524078, "learning_rate": 1.188992628915457e-05, "loss": 0.0071, "num_input_tokens_seen": 93696640, "step": 43410 }, { "epoch": 7.082381729200653, "grad_norm": 29.55493927001953, "learning_rate": 1.1883866070643041e-05, "loss": 0.053, "num_input_tokens_seen": 93707008, "step": 43415 }, { "epoch": 7.083197389885807, "grad_norm": 0.9635235667228699, "learning_rate": 1.1877806915391875e-05, "loss": 0.1307, "num_input_tokens_seen": 93717376, "step": 43420 }, { "epoch": 7.084013050570962, "grad_norm": 18.65500831604004, "learning_rate": 1.1871748823892264e-05, "loss": 0.028, "num_input_tokens_seen": 93728032, "step": 43425 }, { "epoch": 7.084828711256117, "grad_norm": 0.0626855194568634, "learning_rate": 1.1865691796635306e-05, "loss": 0.1448, "num_input_tokens_seen": 93737792, "step": 43430 }, { "epoch": 7.085644371941273, "grad_norm": 0.1425950974225998, "learning_rate": 1.1859635834112012e-05, "loss": 0.0047, "num_input_tokens_seen": 93749088, "step": 43435 }, { "epoch": 7.0864600326264275, "grad_norm": 4.997335910797119, "learning_rate": 1.1853580936813313e-05, "loss": 0.0561, "num_input_tokens_seen": 93760512, "step": 43440 }, { "epoch": 7.087275693311582, "grad_norm": 0.13220566511154175, "learning_rate": 1.1847527105230047e-05, "loss": 0.1945, "num_input_tokens_seen": 93771712, "step": 43445 }, { "epoch": 7.088091353996737, "grad_norm": 0.06322714686393738, "learning_rate": 1.1841474339852968e-05, "loss": 0.0813, "num_input_tokens_seen": 93783008, "step": 43450 }, { "epoch": 7.088907014681892, "grad_norm": 5.114476680755615, "learning_rate": 1.1835422641172744e-05, "loss": 0.0604, "num_input_tokens_seen": 93792928, "step": 43455 }, { "epoch": 7.089722675367048, "grad_norm": 0.04860479384660721, "learning_rate": 1.182937200967996e-05, "loss": 0.0047, "num_input_tokens_seen": 93803872, "step": 43460 }, { "epoch": 7.0905383360522025, "grad_norm": 1.2112655639648438, "learning_rate": 1.1823322445865103e-05, "loss": 0.0438, "num_input_tokens_seen": 93815584, "step": 43465 }, { "epoch": 7.091353996737357, "grad_norm": 0.4199487864971161, "learning_rate": 1.1817273950218591e-05, "loss": 0.0869, "num_input_tokens_seen": 93827040, "step": 43470 }, { "epoch": 7.092169657422512, "grad_norm": 0.050442811101675034, "learning_rate": 1.1811226523230731e-05, "loss": 0.086, "num_input_tokens_seen": 93836800, "step": 43475 }, { "epoch": 7.092985318107667, "grad_norm": 14.030454635620117, "learning_rate": 1.1805180165391774e-05, "loss": 0.093, "num_input_tokens_seen": 93848032, "step": 43480 }, { "epoch": 7.093800978792822, "grad_norm": 4.291860103607178, "learning_rate": 1.1799134877191867e-05, "loss": 0.1784, "num_input_tokens_seen": 93858720, "step": 43485 }, { "epoch": 7.0946166394779775, "grad_norm": 0.052896466106176376, "learning_rate": 1.1793090659121065e-05, "loss": 0.0646, "num_input_tokens_seen": 93869696, "step": 43490 }, { "epoch": 7.095432300163132, "grad_norm": 0.11294753104448318, "learning_rate": 1.1787047511669347e-05, "loss": 0.1258, "num_input_tokens_seen": 93880672, "step": 43495 }, { "epoch": 7.096247960848287, "grad_norm": 3.0636801719665527, "learning_rate": 1.178100543532659e-05, "loss": 0.0719, "num_input_tokens_seen": 93891840, "step": 43500 }, { "epoch": 7.097063621533442, "grad_norm": 0.06495791673660278, "learning_rate": 1.1774964430582614e-05, "loss": 0.1446, "num_input_tokens_seen": 93900928, "step": 43505 }, { "epoch": 7.097879282218597, "grad_norm": 4.291179656982422, "learning_rate": 1.1768924497927123e-05, "loss": 0.0927, "num_input_tokens_seen": 93912480, "step": 43510 }, { "epoch": 7.0986949429037525, "grad_norm": 0.0967203751206398, "learning_rate": 1.1762885637849746e-05, "loss": 0.0064, "num_input_tokens_seen": 93924192, "step": 43515 }, { "epoch": 7.099510603588907, "grad_norm": 0.10868646204471588, "learning_rate": 1.1756847850840024e-05, "loss": 0.0052, "num_input_tokens_seen": 93934304, "step": 43520 }, { "epoch": 7.100326264274062, "grad_norm": 0.04877633601427078, "learning_rate": 1.1750811137387414e-05, "loss": 0.0986, "num_input_tokens_seen": 93944032, "step": 43525 }, { "epoch": 7.101141924959217, "grad_norm": 0.857296884059906, "learning_rate": 1.1744775497981273e-05, "loss": 0.0053, "num_input_tokens_seen": 93954304, "step": 43530 }, { "epoch": 7.101957585644372, "grad_norm": 0.02552896924316883, "learning_rate": 1.173874093311089e-05, "loss": 0.0902, "num_input_tokens_seen": 93964864, "step": 43535 }, { "epoch": 7.102773246329527, "grad_norm": 1.6049836874008179, "learning_rate": 1.1732707443265453e-05, "loss": 0.0067, "num_input_tokens_seen": 93976448, "step": 43540 }, { "epoch": 7.103588907014682, "grad_norm": 0.12095363438129425, "learning_rate": 1.172667502893407e-05, "loss": 0.0066, "num_input_tokens_seen": 93986848, "step": 43545 }, { "epoch": 7.104404567699837, "grad_norm": 0.09445792436599731, "learning_rate": 1.1720643690605754e-05, "loss": 0.0081, "num_input_tokens_seen": 93997472, "step": 43550 }, { "epoch": 7.105220228384992, "grad_norm": 0.01909150369465351, "learning_rate": 1.1714613428769442e-05, "loss": 0.0024, "num_input_tokens_seen": 94007744, "step": 43555 }, { "epoch": 7.106035889070147, "grad_norm": 12.791037559509277, "learning_rate": 1.1708584243913972e-05, "loss": 0.3366, "num_input_tokens_seen": 94018880, "step": 43560 }, { "epoch": 7.1068515497553015, "grad_norm": 0.028335576876997948, "learning_rate": 1.1702556136528106e-05, "loss": 0.0035, "num_input_tokens_seen": 94030112, "step": 43565 }, { "epoch": 7.107667210440456, "grad_norm": 0.1255069226026535, "learning_rate": 1.169652910710051e-05, "loss": 0.1141, "num_input_tokens_seen": 94041120, "step": 43570 }, { "epoch": 7.108482871125612, "grad_norm": 0.20495577156543732, "learning_rate": 1.1690503156119764e-05, "loss": 0.0487, "num_input_tokens_seen": 94052928, "step": 43575 }, { "epoch": 7.109298531810767, "grad_norm": 0.05670922249555588, "learning_rate": 1.1684478284074365e-05, "loss": 0.0772, "num_input_tokens_seen": 94064480, "step": 43580 }, { "epoch": 7.110114192495922, "grad_norm": 0.06790587306022644, "learning_rate": 1.1678454491452717e-05, "loss": 0.0988, "num_input_tokens_seen": 94076064, "step": 43585 }, { "epoch": 7.1109298531810765, "grad_norm": 0.05412507802248001, "learning_rate": 1.1672431778743133e-05, "loss": 0.0257, "num_input_tokens_seen": 94085216, "step": 43590 }, { "epoch": 7.111745513866231, "grad_norm": 3.800899028778076, "learning_rate": 1.1666410146433861e-05, "loss": 0.2593, "num_input_tokens_seen": 94095904, "step": 43595 }, { "epoch": 7.112561174551387, "grad_norm": 2.408957004547119, "learning_rate": 1.1660389595013038e-05, "loss": 0.1236, "num_input_tokens_seen": 94108288, "step": 43600 }, { "epoch": 7.113376835236542, "grad_norm": 0.08000515401363373, "learning_rate": 1.165437012496872e-05, "loss": 0.005, "num_input_tokens_seen": 94117440, "step": 43605 }, { "epoch": 7.114192495921697, "grad_norm": 0.02687612548470497, "learning_rate": 1.1648351736788871e-05, "loss": 0.0875, "num_input_tokens_seen": 94127488, "step": 43610 }, { "epoch": 7.1150081566068515, "grad_norm": 0.21786536276340485, "learning_rate": 1.1642334430961377e-05, "loss": 0.0863, "num_input_tokens_seen": 94137824, "step": 43615 }, { "epoch": 7.115823817292006, "grad_norm": 0.27159833908081055, "learning_rate": 1.163631820797403e-05, "loss": 0.2043, "num_input_tokens_seen": 94149632, "step": 43620 }, { "epoch": 7.116639477977161, "grad_norm": 0.08399226516485214, "learning_rate": 1.1630303068314538e-05, "loss": 0.0145, "num_input_tokens_seen": 94161184, "step": 43625 }, { "epoch": 7.117455138662317, "grad_norm": 0.07265343517065048, "learning_rate": 1.1624289012470513e-05, "loss": 0.0459, "num_input_tokens_seen": 94170656, "step": 43630 }, { "epoch": 7.118270799347472, "grad_norm": 0.04176503047347069, "learning_rate": 1.161827604092949e-05, "loss": 0.0045, "num_input_tokens_seen": 94180384, "step": 43635 }, { "epoch": 7.1190864600326265, "grad_norm": 0.036481671035289764, "learning_rate": 1.161226415417891e-05, "loss": 0.0033, "num_input_tokens_seen": 94191456, "step": 43640 }, { "epoch": 7.119902120717781, "grad_norm": 0.08592111617326736, "learning_rate": 1.1606253352706118e-05, "loss": 0.1274, "num_input_tokens_seen": 94202112, "step": 43645 }, { "epoch": 7.120717781402936, "grad_norm": 3.752108573913574, "learning_rate": 1.1600243636998396e-05, "loss": 0.2679, "num_input_tokens_seen": 94213312, "step": 43650 }, { "epoch": 7.121533442088092, "grad_norm": 0.04887622222304344, "learning_rate": 1.1594235007542914e-05, "loss": 0.0061, "num_input_tokens_seen": 94225248, "step": 43655 }, { "epoch": 7.122349102773247, "grad_norm": 3.3832032680511475, "learning_rate": 1.1588227464826763e-05, "loss": 0.1378, "num_input_tokens_seen": 94236416, "step": 43660 }, { "epoch": 7.123164763458401, "grad_norm": 0.09872160851955414, "learning_rate": 1.1582221009336944e-05, "loss": 0.0412, "num_input_tokens_seen": 94247040, "step": 43665 }, { "epoch": 7.123980424143556, "grad_norm": 1.166061520576477, "learning_rate": 1.157621564156037e-05, "loss": 0.0091, "num_input_tokens_seen": 94257472, "step": 43670 }, { "epoch": 7.124796084828711, "grad_norm": 3.643521547317505, "learning_rate": 1.157021136198387e-05, "loss": 0.1308, "num_input_tokens_seen": 94267776, "step": 43675 }, { "epoch": 7.125611745513866, "grad_norm": 3.188948392868042, "learning_rate": 1.156420817109418e-05, "loss": 0.0884, "num_input_tokens_seen": 94279552, "step": 43680 }, { "epoch": 7.126427406199022, "grad_norm": 0.033812519162893295, "learning_rate": 1.1558206069377945e-05, "loss": 0.0273, "num_input_tokens_seen": 94291424, "step": 43685 }, { "epoch": 7.127243066884176, "grad_norm": 0.656554639339447, "learning_rate": 1.155220505732173e-05, "loss": 0.0067, "num_input_tokens_seen": 94302080, "step": 43690 }, { "epoch": 7.128058727569331, "grad_norm": 0.08332250267267227, "learning_rate": 1.1546205135412008e-05, "loss": 0.0918, "num_input_tokens_seen": 94313440, "step": 43695 }, { "epoch": 7.128874388254486, "grad_norm": 0.11393269151449203, "learning_rate": 1.1540206304135152e-05, "loss": 0.0604, "num_input_tokens_seen": 94322784, "step": 43700 }, { "epoch": 7.129690048939641, "grad_norm": 7.067556381225586, "learning_rate": 1.1534208563977475e-05, "loss": 0.1465, "num_input_tokens_seen": 94333280, "step": 43705 }, { "epoch": 7.130505709624796, "grad_norm": 5.529153347015381, "learning_rate": 1.1528211915425177e-05, "loss": 0.0854, "num_input_tokens_seen": 94344032, "step": 43710 }, { "epoch": 7.131321370309951, "grad_norm": 0.07599502056837082, "learning_rate": 1.1522216358964377e-05, "loss": 0.145, "num_input_tokens_seen": 94354816, "step": 43715 }, { "epoch": 7.132137030995106, "grad_norm": 0.10091648250818253, "learning_rate": 1.1516221895081104e-05, "loss": 0.1838, "num_input_tokens_seen": 94364896, "step": 43720 }, { "epoch": 7.132952691680261, "grad_norm": 0.20659762620925903, "learning_rate": 1.1510228524261302e-05, "loss": 0.3342, "num_input_tokens_seen": 94375648, "step": 43725 }, { "epoch": 7.133768352365416, "grad_norm": 0.0431620255112648, "learning_rate": 1.1504236246990819e-05, "loss": 0.1825, "num_input_tokens_seen": 94386816, "step": 43730 }, { "epoch": 7.134584013050571, "grad_norm": 0.1603519767522812, "learning_rate": 1.1498245063755425e-05, "loss": 0.0096, "num_input_tokens_seen": 94397856, "step": 43735 }, { "epoch": 7.135399673735726, "grad_norm": 3.901911735534668, "learning_rate": 1.1492254975040792e-05, "loss": 0.1674, "num_input_tokens_seen": 94408128, "step": 43740 }, { "epoch": 7.136215334420881, "grad_norm": 0.023856377229094505, "learning_rate": 1.148626598133251e-05, "loss": 0.1777, "num_input_tokens_seen": 94418912, "step": 43745 }, { "epoch": 7.137030995106036, "grad_norm": 0.08474874496459961, "learning_rate": 1.1480278083116074e-05, "loss": 0.0062, "num_input_tokens_seen": 94428576, "step": 43750 }, { "epoch": 7.137846655791191, "grad_norm": 0.03850866109132767, "learning_rate": 1.1474291280876894e-05, "loss": 0.2975, "num_input_tokens_seen": 94439232, "step": 43755 }, { "epoch": 7.138662316476346, "grad_norm": 5.355396747589111, "learning_rate": 1.1468305575100294e-05, "loss": 0.0709, "num_input_tokens_seen": 94450144, "step": 43760 }, { "epoch": 7.1394779771615005, "grad_norm": 0.050152700394392014, "learning_rate": 1.1462320966271503e-05, "loss": 0.0068, "num_input_tokens_seen": 94461664, "step": 43765 }, { "epoch": 7.140293637846656, "grad_norm": 0.057023610919713974, "learning_rate": 1.1456337454875663e-05, "loss": 0.0032, "num_input_tokens_seen": 94471392, "step": 43770 }, { "epoch": 7.141109298531811, "grad_norm": 0.15296074748039246, "learning_rate": 1.1450355041397829e-05, "loss": 0.1481, "num_input_tokens_seen": 94482848, "step": 43775 }, { "epoch": 7.141924959216966, "grad_norm": 0.09270456433296204, "learning_rate": 1.1444373726322966e-05, "loss": 0.1107, "num_input_tokens_seen": 94493792, "step": 43780 }, { "epoch": 7.142740619902121, "grad_norm": 0.08430933207273483, "learning_rate": 1.143839351013595e-05, "loss": 0.0044, "num_input_tokens_seen": 94504448, "step": 43785 }, { "epoch": 7.143556280587275, "grad_norm": 0.2279704511165619, "learning_rate": 1.1432414393321556e-05, "loss": 0.1016, "num_input_tokens_seen": 94515296, "step": 43790 }, { "epoch": 7.14437194127243, "grad_norm": 16.787500381469727, "learning_rate": 1.1426436376364502e-05, "loss": 0.0347, "num_input_tokens_seen": 94525984, "step": 43795 }, { "epoch": 7.145187601957586, "grad_norm": 0.16307874023914337, "learning_rate": 1.1420459459749385e-05, "loss": 0.0947, "num_input_tokens_seen": 94536512, "step": 43800 }, { "epoch": 7.146003262642741, "grad_norm": 0.0962166041135788, "learning_rate": 1.141448364396073e-05, "loss": 0.0046, "num_input_tokens_seen": 94546208, "step": 43805 }, { "epoch": 7.146818923327896, "grad_norm": 0.22677689790725708, "learning_rate": 1.1408508929482961e-05, "loss": 0.0061, "num_input_tokens_seen": 94558656, "step": 43810 }, { "epoch": 7.14763458401305, "grad_norm": 0.35085445642471313, "learning_rate": 1.1402535316800414e-05, "loss": 0.0051, "num_input_tokens_seen": 94568832, "step": 43815 }, { "epoch": 7.148450244698205, "grad_norm": 3.750436305999756, "learning_rate": 1.1396562806397354e-05, "loss": 0.1152, "num_input_tokens_seen": 94580512, "step": 43820 }, { "epoch": 7.149265905383361, "grad_norm": 4.464899063110352, "learning_rate": 1.1390591398757935e-05, "loss": 0.1694, "num_input_tokens_seen": 94592608, "step": 43825 }, { "epoch": 7.150081566068516, "grad_norm": 20.690568923950195, "learning_rate": 1.1384621094366232e-05, "loss": 0.0443, "num_input_tokens_seen": 94603200, "step": 43830 }, { "epoch": 7.150897226753671, "grad_norm": 0.12341461330652237, "learning_rate": 1.1378651893706227e-05, "loss": 0.0258, "num_input_tokens_seen": 94614656, "step": 43835 }, { "epoch": 7.151712887438825, "grad_norm": 0.2521572709083557, "learning_rate": 1.1372683797261814e-05, "loss": 0.0068, "num_input_tokens_seen": 94624512, "step": 43840 }, { "epoch": 7.15252854812398, "grad_norm": 0.051836270838975906, "learning_rate": 1.1366716805516794e-05, "loss": 0.1523, "num_input_tokens_seen": 94635360, "step": 43845 }, { "epoch": 7.153344208809135, "grad_norm": 0.06448886543512344, "learning_rate": 1.1360750918954887e-05, "loss": 0.0077, "num_input_tokens_seen": 94645440, "step": 43850 }, { "epoch": 7.154159869494291, "grad_norm": 0.042032964527606964, "learning_rate": 1.1354786138059715e-05, "loss": 0.2204, "num_input_tokens_seen": 94656832, "step": 43855 }, { "epoch": 7.1549755301794455, "grad_norm": 0.07452675700187683, "learning_rate": 1.1348822463314815e-05, "loss": 0.004, "num_input_tokens_seen": 94666816, "step": 43860 }, { "epoch": 7.1557911908646, "grad_norm": 0.13034167885780334, "learning_rate": 1.1342859895203629e-05, "loss": 0.102, "num_input_tokens_seen": 94677888, "step": 43865 }, { "epoch": 7.156606851549755, "grad_norm": 1.121642827987671, "learning_rate": 1.1336898434209517e-05, "loss": 0.0067, "num_input_tokens_seen": 94689088, "step": 43870 }, { "epoch": 7.15742251223491, "grad_norm": 0.04426287114620209, "learning_rate": 1.1330938080815743e-05, "loss": 0.0964, "num_input_tokens_seen": 94700992, "step": 43875 }, { "epoch": 7.158238172920065, "grad_norm": 0.2092503011226654, "learning_rate": 1.1324978835505483e-05, "loss": 0.0131, "num_input_tokens_seen": 94710112, "step": 43880 }, { "epoch": 7.1590538336052205, "grad_norm": 0.04995310306549072, "learning_rate": 1.1319020698761828e-05, "loss": 0.1868, "num_input_tokens_seen": 94720544, "step": 43885 }, { "epoch": 7.159869494290375, "grad_norm": 0.11302343010902405, "learning_rate": 1.1313063671067769e-05, "loss": 0.012, "num_input_tokens_seen": 94730496, "step": 43890 }, { "epoch": 7.16068515497553, "grad_norm": 0.07707829773426056, "learning_rate": 1.1307107752906218e-05, "loss": 0.2294, "num_input_tokens_seen": 94741696, "step": 43895 }, { "epoch": 7.161500815660685, "grad_norm": 0.0993894636631012, "learning_rate": 1.1301152944759988e-05, "loss": 0.0075, "num_input_tokens_seen": 94752000, "step": 43900 }, { "epoch": 7.16231647634584, "grad_norm": 0.08817077428102493, "learning_rate": 1.129519924711181e-05, "loss": 0.0066, "num_input_tokens_seen": 94763264, "step": 43905 }, { "epoch": 7.1631321370309955, "grad_norm": 3.8644626140594482, "learning_rate": 1.1289246660444306e-05, "loss": 0.2182, "num_input_tokens_seen": 94773504, "step": 43910 }, { "epoch": 7.16394779771615, "grad_norm": 0.10693071782588959, "learning_rate": 1.1283295185240048e-05, "loss": 0.1286, "num_input_tokens_seen": 94783328, "step": 43915 }, { "epoch": 7.164763458401305, "grad_norm": 0.1141429990530014, "learning_rate": 1.1277344821981475e-05, "loss": 0.0042, "num_input_tokens_seen": 94795584, "step": 43920 }, { "epoch": 7.16557911908646, "grad_norm": 0.03173322603106499, "learning_rate": 1.1271395571150964e-05, "loss": 0.0419, "num_input_tokens_seen": 94805408, "step": 43925 }, { "epoch": 7.166394779771615, "grad_norm": 0.1410226970911026, "learning_rate": 1.1265447433230784e-05, "loss": 0.0809, "num_input_tokens_seen": 94817024, "step": 43930 }, { "epoch": 7.16721044045677, "grad_norm": 4.855565547943115, "learning_rate": 1.1259500408703124e-05, "loss": 0.3698, "num_input_tokens_seen": 94828480, "step": 43935 }, { "epoch": 7.168026101141925, "grad_norm": 0.32295486330986023, "learning_rate": 1.1253554498050078e-05, "loss": 0.1422, "num_input_tokens_seen": 94839552, "step": 43940 }, { "epoch": 7.16884176182708, "grad_norm": 0.09155721217393875, "learning_rate": 1.1247609701753656e-05, "loss": 0.127, "num_input_tokens_seen": 94851200, "step": 43945 }, { "epoch": 7.169657422512235, "grad_norm": 0.08383266627788544, "learning_rate": 1.1241666020295768e-05, "loss": 0.2916, "num_input_tokens_seen": 94863744, "step": 43950 }, { "epoch": 7.17047308319739, "grad_norm": 0.3444756865501404, "learning_rate": 1.1235723454158242e-05, "loss": 0.084, "num_input_tokens_seen": 94873504, "step": 43955 }, { "epoch": 7.171288743882545, "grad_norm": 0.026276571676135063, "learning_rate": 1.1229782003822803e-05, "loss": 0.0992, "num_input_tokens_seen": 94883232, "step": 43960 }, { "epoch": 7.1721044045677, "grad_norm": 0.2174517661333084, "learning_rate": 1.1223841669771113e-05, "loss": 0.0146, "num_input_tokens_seen": 94893984, "step": 43965 }, { "epoch": 7.172920065252855, "grad_norm": 0.09940383583307266, "learning_rate": 1.121790245248472e-05, "loss": 0.1552, "num_input_tokens_seen": 94904960, "step": 43970 }, { "epoch": 7.17373572593801, "grad_norm": 3.099872350692749, "learning_rate": 1.1211964352445078e-05, "loss": 0.1952, "num_input_tokens_seen": 94914656, "step": 43975 }, { "epoch": 7.174551386623165, "grad_norm": 0.16598708927631378, "learning_rate": 1.120602737013357e-05, "loss": 0.0845, "num_input_tokens_seen": 94925632, "step": 43980 }, { "epoch": 7.1753670473083195, "grad_norm": 0.2137947827577591, "learning_rate": 1.120009150603147e-05, "loss": 0.0076, "num_input_tokens_seen": 94937376, "step": 43985 }, { "epoch": 7.176182707993474, "grad_norm": 20.040090560913086, "learning_rate": 1.1194156760619976e-05, "loss": 0.0403, "num_input_tokens_seen": 94948608, "step": 43990 }, { "epoch": 7.17699836867863, "grad_norm": 0.06532534956932068, "learning_rate": 1.1188223134380183e-05, "loss": 0.0057, "num_input_tokens_seen": 94958368, "step": 43995 }, { "epoch": 7.177814029363785, "grad_norm": 0.06551416218280792, "learning_rate": 1.1182290627793105e-05, "loss": 0.0031, "num_input_tokens_seen": 94968576, "step": 44000 }, { "epoch": 7.17862969004894, "grad_norm": 30.835641860961914, "learning_rate": 1.1176359241339656e-05, "loss": 0.1642, "num_input_tokens_seen": 94979552, "step": 44005 }, { "epoch": 7.1794453507340945, "grad_norm": 0.33559077978134155, "learning_rate": 1.1170428975500668e-05, "loss": 0.0065, "num_input_tokens_seen": 94990656, "step": 44010 }, { "epoch": 7.180261011419249, "grad_norm": 0.1564725637435913, "learning_rate": 1.116449983075688e-05, "loss": 0.1024, "num_input_tokens_seen": 95000480, "step": 44015 }, { "epoch": 7.181076672104404, "grad_norm": 0.18704740703105927, "learning_rate": 1.1158571807588924e-05, "loss": 0.0098, "num_input_tokens_seen": 95011584, "step": 44020 }, { "epoch": 7.18189233278956, "grad_norm": 0.07209386676549911, "learning_rate": 1.115264490647738e-05, "loss": 0.1841, "num_input_tokens_seen": 95021824, "step": 44025 }, { "epoch": 7.182707993474715, "grad_norm": 0.079127736389637, "learning_rate": 1.11467191279027e-05, "loss": 0.005, "num_input_tokens_seen": 95033696, "step": 44030 }, { "epoch": 7.1835236541598695, "grad_norm": 5.181544780731201, "learning_rate": 1.1140794472345259e-05, "loss": 0.0906, "num_input_tokens_seen": 95043936, "step": 44035 }, { "epoch": 7.184339314845024, "grad_norm": 0.13229092955589294, "learning_rate": 1.1134870940285339e-05, "loss": 0.0051, "num_input_tokens_seen": 95055040, "step": 44040 }, { "epoch": 7.185154975530179, "grad_norm": 2.937776803970337, "learning_rate": 1.112894853220313e-05, "loss": 0.1312, "num_input_tokens_seen": 95066368, "step": 44045 }, { "epoch": 7.185970636215335, "grad_norm": 2.413372755050659, "learning_rate": 1.1123027248578736e-05, "loss": 0.2587, "num_input_tokens_seen": 95077728, "step": 44050 }, { "epoch": 7.18678629690049, "grad_norm": 0.17712894082069397, "learning_rate": 1.1117107089892162e-05, "loss": 0.0071, "num_input_tokens_seen": 95088160, "step": 44055 }, { "epoch": 7.1876019575856445, "grad_norm": 3.4987986087799072, "learning_rate": 1.1111188056623328e-05, "loss": 0.1675, "num_input_tokens_seen": 95099424, "step": 44060 }, { "epoch": 7.188417618270799, "grad_norm": 0.4862080514431, "learning_rate": 1.1105270149252062e-05, "loss": 0.1127, "num_input_tokens_seen": 95109600, "step": 44065 }, { "epoch": 7.189233278955954, "grad_norm": 0.11356819421052933, "learning_rate": 1.10993533682581e-05, "loss": 0.1124, "num_input_tokens_seen": 95119424, "step": 44070 }, { "epoch": 7.190048939641109, "grad_norm": 18.320451736450195, "learning_rate": 1.1093437714121085e-05, "loss": 0.0375, "num_input_tokens_seen": 95130368, "step": 44075 }, { "epoch": 7.190864600326265, "grad_norm": 7.893442630767822, "learning_rate": 1.108752318732057e-05, "loss": 0.0127, "num_input_tokens_seen": 95142400, "step": 44080 }, { "epoch": 7.191680261011419, "grad_norm": 0.03855288028717041, "learning_rate": 1.1081609788336014e-05, "loss": 0.2769, "num_input_tokens_seen": 95153472, "step": 44085 }, { "epoch": 7.192495921696574, "grad_norm": 9.533473014831543, "learning_rate": 1.1075697517646794e-05, "loss": 0.1633, "num_input_tokens_seen": 95163072, "step": 44090 }, { "epoch": 7.193311582381729, "grad_norm": 0.13133949041366577, "learning_rate": 1.1069786375732181e-05, "loss": 0.0058, "num_input_tokens_seen": 95174656, "step": 44095 }, { "epoch": 7.194127243066884, "grad_norm": 1.044507622718811, "learning_rate": 1.1063876363071368e-05, "loss": 0.0558, "num_input_tokens_seen": 95184960, "step": 44100 }, { "epoch": 7.19494290375204, "grad_norm": 3.8548696041107178, "learning_rate": 1.1057967480143438e-05, "loss": 0.1389, "num_input_tokens_seen": 95194848, "step": 44105 }, { "epoch": 7.195758564437194, "grad_norm": 0.8807480335235596, "learning_rate": 1.1052059727427414e-05, "loss": 0.0106, "num_input_tokens_seen": 95207072, "step": 44110 }, { "epoch": 7.196574225122349, "grad_norm": 0.046531785279512405, "learning_rate": 1.1046153105402199e-05, "loss": 0.1024, "num_input_tokens_seen": 95217920, "step": 44115 }, { "epoch": 7.197389885807504, "grad_norm": 0.1135723739862442, "learning_rate": 1.1040247614546617e-05, "loss": 0.0287, "num_input_tokens_seen": 95228224, "step": 44120 }, { "epoch": 7.198205546492659, "grad_norm": 0.09810398519039154, "learning_rate": 1.1034343255339391e-05, "loss": 0.0989, "num_input_tokens_seen": 95239008, "step": 44125 }, { "epoch": 7.199021207177814, "grad_norm": 0.08268575370311737, "learning_rate": 1.1028440028259154e-05, "loss": 0.293, "num_input_tokens_seen": 95248800, "step": 44130 }, { "epoch": 7.199836867862969, "grad_norm": 0.21904577314853668, "learning_rate": 1.1022537933784472e-05, "loss": 0.121, "num_input_tokens_seen": 95259840, "step": 44135 }, { "epoch": 7.200652528548124, "grad_norm": 10.687139511108398, "learning_rate": 1.1016636972393782e-05, "loss": 0.1494, "num_input_tokens_seen": 95270848, "step": 44140 }, { "epoch": 7.201468189233279, "grad_norm": 0.04208752140402794, "learning_rate": 1.101073714456545e-05, "loss": 0.0104, "num_input_tokens_seen": 95281472, "step": 44145 }, { "epoch": 7.202283849918434, "grad_norm": 0.07681772857904434, "learning_rate": 1.1004838450777747e-05, "loss": 0.1983, "num_input_tokens_seen": 95292160, "step": 44150 }, { "epoch": 7.203099510603589, "grad_norm": 3.470168113708496, "learning_rate": 1.099894089150885e-05, "loss": 0.1027, "num_input_tokens_seen": 95302208, "step": 44155 }, { "epoch": 7.2039151712887435, "grad_norm": 0.12584130465984344, "learning_rate": 1.0993044467236843e-05, "loss": 0.2133, "num_input_tokens_seen": 95312032, "step": 44160 }, { "epoch": 7.204730831973899, "grad_norm": 0.26527076959609985, "learning_rate": 1.0987149178439726e-05, "loss": 0.1062, "num_input_tokens_seen": 95323744, "step": 44165 }, { "epoch": 7.205546492659054, "grad_norm": 0.12256963551044464, "learning_rate": 1.0981255025595394e-05, "loss": 0.1266, "num_input_tokens_seen": 95333856, "step": 44170 }, { "epoch": 7.206362153344209, "grad_norm": 0.020157380029559135, "learning_rate": 1.097536200918166e-05, "loss": 0.0073, "num_input_tokens_seen": 95344864, "step": 44175 }, { "epoch": 7.207177814029364, "grad_norm": 0.18879617750644684, "learning_rate": 1.0969470129676243e-05, "loss": 0.1129, "num_input_tokens_seen": 95355520, "step": 44180 }, { "epoch": 7.2079934747145185, "grad_norm": 0.1413317173719406, "learning_rate": 1.096357938755677e-05, "loss": 0.0101, "num_input_tokens_seen": 95366752, "step": 44185 }, { "epoch": 7.208809135399674, "grad_norm": 0.0799066349864006, "learning_rate": 1.0957689783300767e-05, "loss": 0.0089, "num_input_tokens_seen": 95378304, "step": 44190 }, { "epoch": 7.209624796084829, "grad_norm": 3.785212755203247, "learning_rate": 1.0951801317385682e-05, "loss": 0.1014, "num_input_tokens_seen": 95389024, "step": 44195 }, { "epoch": 7.210440456769984, "grad_norm": 0.11698129028081894, "learning_rate": 1.0945913990288862e-05, "loss": 0.0065, "num_input_tokens_seen": 95398656, "step": 44200 }, { "epoch": 7.211256117455139, "grad_norm": 0.16734565794467926, "learning_rate": 1.0940027802487565e-05, "loss": 0.0894, "num_input_tokens_seen": 95408160, "step": 44205 }, { "epoch": 7.212071778140293, "grad_norm": 0.2755676805973053, "learning_rate": 1.0934142754458954e-05, "loss": 0.0074, "num_input_tokens_seen": 95419936, "step": 44210 }, { "epoch": 7.212887438825448, "grad_norm": 0.10861805826425552, "learning_rate": 1.0928258846680097e-05, "loss": 0.155, "num_input_tokens_seen": 95429952, "step": 44215 }, { "epoch": 7.213703099510604, "grad_norm": 0.13752296566963196, "learning_rate": 1.092237607962798e-05, "loss": 0.0071, "num_input_tokens_seen": 95441024, "step": 44220 }, { "epoch": 7.214518760195759, "grad_norm": 0.36296749114990234, "learning_rate": 1.0916494453779489e-05, "loss": 0.0056, "num_input_tokens_seen": 95452448, "step": 44225 }, { "epoch": 7.215334420880914, "grad_norm": 0.06946375966072083, "learning_rate": 1.0910613969611406e-05, "loss": 0.0038, "num_input_tokens_seen": 95462432, "step": 44230 }, { "epoch": 7.216150081566068, "grad_norm": 1.6671926975250244, "learning_rate": 1.0904734627600448e-05, "loss": 0.0131, "num_input_tokens_seen": 95473312, "step": 44235 }, { "epoch": 7.216965742251223, "grad_norm": 11.202441215515137, "learning_rate": 1.0898856428223225e-05, "loss": 0.1112, "num_input_tokens_seen": 95482624, "step": 44240 }, { "epoch": 7.217781402936378, "grad_norm": 0.06486225128173828, "learning_rate": 1.0892979371956246e-05, "loss": 0.0032, "num_input_tokens_seen": 95494304, "step": 44245 }, { "epoch": 7.218597063621534, "grad_norm": 0.12349742650985718, "learning_rate": 1.088710345927594e-05, "loss": 0.006, "num_input_tokens_seen": 95505024, "step": 44250 }, { "epoch": 7.219412724306689, "grad_norm": 0.8538283109664917, "learning_rate": 1.0881228690658634e-05, "loss": 0.0042, "num_input_tokens_seen": 95516224, "step": 44255 }, { "epoch": 7.220228384991843, "grad_norm": 0.049786556512117386, "learning_rate": 1.087535506658057e-05, "loss": 0.0028, "num_input_tokens_seen": 95527168, "step": 44260 }, { "epoch": 7.221044045676998, "grad_norm": 0.08179260045289993, "learning_rate": 1.086948258751789e-05, "loss": 0.0795, "num_input_tokens_seen": 95537152, "step": 44265 }, { "epoch": 7.221859706362153, "grad_norm": 10.488143920898438, "learning_rate": 1.0863611253946651e-05, "loss": 0.0797, "num_input_tokens_seen": 95549248, "step": 44270 }, { "epoch": 7.222675367047309, "grad_norm": 4.596770763397217, "learning_rate": 1.08577410663428e-05, "loss": 0.0929, "num_input_tokens_seen": 95560704, "step": 44275 }, { "epoch": 7.2234910277324635, "grad_norm": 0.021962575614452362, "learning_rate": 1.0851872025182225e-05, "loss": 0.0155, "num_input_tokens_seen": 95572864, "step": 44280 }, { "epoch": 7.224306688417618, "grad_norm": 0.9708815217018127, "learning_rate": 1.084600413094069e-05, "loss": 0.0589, "num_input_tokens_seen": 95582944, "step": 44285 }, { "epoch": 7.225122349102773, "grad_norm": 0.07511153817176819, "learning_rate": 1.0840137384093876e-05, "loss": 0.0054, "num_input_tokens_seen": 95593472, "step": 44290 }, { "epoch": 7.225938009787928, "grad_norm": 1.9997107982635498, "learning_rate": 1.0834271785117376e-05, "loss": 0.115, "num_input_tokens_seen": 95604064, "step": 44295 }, { "epoch": 7.226753670473083, "grad_norm": 0.026333212852478027, "learning_rate": 1.0828407334486676e-05, "loss": 0.0038, "num_input_tokens_seen": 95616192, "step": 44300 }, { "epoch": 7.2275693311582385, "grad_norm": 0.3405097424983978, "learning_rate": 1.0822544032677187e-05, "loss": 0.0114, "num_input_tokens_seen": 95626976, "step": 44305 }, { "epoch": 7.228384991843393, "grad_norm": 0.08290238678455353, "learning_rate": 1.0816681880164215e-05, "loss": 0.0041, "num_input_tokens_seen": 95635328, "step": 44310 }, { "epoch": 7.229200652528548, "grad_norm": 0.12996086478233337, "learning_rate": 1.0810820877422973e-05, "loss": 0.0345, "num_input_tokens_seen": 95645248, "step": 44315 }, { "epoch": 7.230016313213703, "grad_norm": 0.2046303153038025, "learning_rate": 1.0804961024928587e-05, "loss": 0.1687, "num_input_tokens_seen": 95656928, "step": 44320 }, { "epoch": 7.230831973898858, "grad_norm": 0.09022431820631027, "learning_rate": 1.0799102323156082e-05, "loss": 0.0028, "num_input_tokens_seen": 95667200, "step": 44325 }, { "epoch": 7.231647634584013, "grad_norm": 0.1085621789097786, "learning_rate": 1.0793244772580402e-05, "loss": 0.2045, "num_input_tokens_seen": 95677248, "step": 44330 }, { "epoch": 7.232463295269168, "grad_norm": 0.1027090772986412, "learning_rate": 1.0787388373676374e-05, "loss": 0.0984, "num_input_tokens_seen": 95687104, "step": 44335 }, { "epoch": 7.233278955954323, "grad_norm": 0.12635067105293274, "learning_rate": 1.0781533126918767e-05, "loss": 0.107, "num_input_tokens_seen": 95697792, "step": 44340 }, { "epoch": 7.234094616639478, "grad_norm": 0.08168936520814896, "learning_rate": 1.077567903278223e-05, "loss": 0.1504, "num_input_tokens_seen": 95708800, "step": 44345 }, { "epoch": 7.234910277324633, "grad_norm": 0.057498153299093246, "learning_rate": 1.0769826091741323e-05, "loss": 0.0113, "num_input_tokens_seen": 95719296, "step": 44350 }, { "epoch": 7.235725938009788, "grad_norm": 0.045036740601062775, "learning_rate": 1.0763974304270516e-05, "loss": 0.0309, "num_input_tokens_seen": 95730336, "step": 44355 }, { "epoch": 7.236541598694943, "grad_norm": 0.09333490580320358, "learning_rate": 1.0758123670844186e-05, "loss": 0.356, "num_input_tokens_seen": 95741760, "step": 44360 }, { "epoch": 7.237357259380098, "grad_norm": 0.07128450274467468, "learning_rate": 1.0752274191936611e-05, "loss": 0.1928, "num_input_tokens_seen": 95752896, "step": 44365 }, { "epoch": 7.238172920065253, "grad_norm": 20.4687442779541, "learning_rate": 1.0746425868021986e-05, "loss": 0.3655, "num_input_tokens_seen": 95763968, "step": 44370 }, { "epoch": 7.238988580750408, "grad_norm": 0.08805769681930542, "learning_rate": 1.07405786995744e-05, "loss": 0.0604, "num_input_tokens_seen": 95774528, "step": 44375 }, { "epoch": 7.239804241435563, "grad_norm": 0.07074488699436188, "learning_rate": 1.0734732687067856e-05, "loss": 0.0038, "num_input_tokens_seen": 95785600, "step": 44380 }, { "epoch": 7.240619902120717, "grad_norm": 0.7448371052742004, "learning_rate": 1.0728887830976261e-05, "loss": 0.1705, "num_input_tokens_seen": 95795104, "step": 44385 }, { "epoch": 7.241435562805873, "grad_norm": 0.060588471591472626, "learning_rate": 1.0723044131773433e-05, "loss": 0.0794, "num_input_tokens_seen": 95807104, "step": 44390 }, { "epoch": 7.242251223491028, "grad_norm": 0.10568378120660782, "learning_rate": 1.0717201589933085e-05, "loss": 0.1016, "num_input_tokens_seen": 95816832, "step": 44395 }, { "epoch": 7.243066884176183, "grad_norm": 0.09632561355829239, "learning_rate": 1.0711360205928847e-05, "loss": 0.004, "num_input_tokens_seen": 95826688, "step": 44400 }, { "epoch": 7.2438825448613375, "grad_norm": 0.04835936427116394, "learning_rate": 1.070551998023425e-05, "loss": 0.1001, "num_input_tokens_seen": 95835904, "step": 44405 }, { "epoch": 7.244698205546492, "grad_norm": 3.753042459487915, "learning_rate": 1.0699680913322736e-05, "loss": 0.1757, "num_input_tokens_seen": 95847136, "step": 44410 }, { "epoch": 7.245513866231648, "grad_norm": 0.05000664293766022, "learning_rate": 1.0693843005667633e-05, "loss": 0.2018, "num_input_tokens_seen": 95858368, "step": 44415 }, { "epoch": 7.246329526916803, "grad_norm": 0.04885876178741455, "learning_rate": 1.0688006257742214e-05, "loss": 0.133, "num_input_tokens_seen": 95868544, "step": 44420 }, { "epoch": 7.247145187601958, "grad_norm": 0.11001405119895935, "learning_rate": 1.0682170670019628e-05, "loss": 0.0048, "num_input_tokens_seen": 95878944, "step": 44425 }, { "epoch": 7.2479608482871125, "grad_norm": 4.917969226837158, "learning_rate": 1.0676336242972934e-05, "loss": 0.0092, "num_input_tokens_seen": 95889792, "step": 44430 }, { "epoch": 7.248776508972267, "grad_norm": 0.060306940227746964, "learning_rate": 1.0670502977075103e-05, "loss": 0.1261, "num_input_tokens_seen": 95900896, "step": 44435 }, { "epoch": 7.249592169657422, "grad_norm": 0.27811291813850403, "learning_rate": 1.0664670872799006e-05, "loss": 0.0085, "num_input_tokens_seen": 95911808, "step": 44440 }, { "epoch": 7.250407830342578, "grad_norm": 0.09314201772212982, "learning_rate": 1.065883993061742e-05, "loss": 0.1007, "num_input_tokens_seen": 95921696, "step": 44445 }, { "epoch": 7.251223491027733, "grad_norm": 0.112932488322258, "learning_rate": 1.065301015100304e-05, "loss": 0.0883, "num_input_tokens_seen": 95931104, "step": 44450 }, { "epoch": 7.2520391517128875, "grad_norm": 0.17163510620594025, "learning_rate": 1.0647181534428455e-05, "loss": 0.205, "num_input_tokens_seen": 95941376, "step": 44455 }, { "epoch": 7.252854812398042, "grad_norm": 0.21094530820846558, "learning_rate": 1.0641354081366161e-05, "loss": 0.0217, "num_input_tokens_seen": 95950496, "step": 44460 }, { "epoch": 7.253670473083197, "grad_norm": 4.583967685699463, "learning_rate": 1.0635527792288558e-05, "loss": 0.0568, "num_input_tokens_seen": 95961824, "step": 44465 }, { "epoch": 7.254486133768353, "grad_norm": 0.3027603030204773, "learning_rate": 1.0629702667667959e-05, "loss": 0.0733, "num_input_tokens_seen": 95972768, "step": 44470 }, { "epoch": 7.255301794453508, "grad_norm": 0.38592761754989624, "learning_rate": 1.0623878707976575e-05, "loss": 0.0104, "num_input_tokens_seen": 95985024, "step": 44475 }, { "epoch": 7.2561174551386625, "grad_norm": 0.1534576267004013, "learning_rate": 1.0618055913686525e-05, "loss": 0.005, "num_input_tokens_seen": 95996768, "step": 44480 }, { "epoch": 7.256933115823817, "grad_norm": 12.545076370239258, "learning_rate": 1.061223428526984e-05, "loss": 0.0407, "num_input_tokens_seen": 96007328, "step": 44485 }, { "epoch": 7.257748776508972, "grad_norm": 0.14382946491241455, "learning_rate": 1.0606413823198444e-05, "loss": 0.0059, "num_input_tokens_seen": 96017472, "step": 44490 }, { "epoch": 7.258564437194127, "grad_norm": 3.0954854488372803, "learning_rate": 1.0600594527944174e-05, "loss": 0.006, "num_input_tokens_seen": 96026048, "step": 44495 }, { "epoch": 7.259380097879283, "grad_norm": 0.0807352140545845, "learning_rate": 1.0594776399978776e-05, "loss": 0.008, "num_input_tokens_seen": 96037024, "step": 44500 }, { "epoch": 7.260195758564437, "grad_norm": 0.10835316777229309, "learning_rate": 1.0588959439773893e-05, "loss": 0.0039, "num_input_tokens_seen": 96048640, "step": 44505 }, { "epoch": 7.261011419249592, "grad_norm": 0.8585956692695618, "learning_rate": 1.058314364780108e-05, "loss": 0.0794, "num_input_tokens_seen": 96059968, "step": 44510 }, { "epoch": 7.261827079934747, "grad_norm": 0.14995978772640228, "learning_rate": 1.0577329024531792e-05, "loss": 0.1314, "num_input_tokens_seen": 96070592, "step": 44515 }, { "epoch": 7.262642740619902, "grad_norm": 4.186001777648926, "learning_rate": 1.0571515570437396e-05, "loss": 0.0596, "num_input_tokens_seen": 96080768, "step": 44520 }, { "epoch": 7.263458401305057, "grad_norm": 0.19002677500247955, "learning_rate": 1.0565703285989154e-05, "loss": 0.099, "num_input_tokens_seen": 96091584, "step": 44525 }, { "epoch": 7.264274061990212, "grad_norm": 0.21625880897045135, "learning_rate": 1.0559892171658245e-05, "loss": 0.0037, "num_input_tokens_seen": 96103264, "step": 44530 }, { "epoch": 7.265089722675367, "grad_norm": 0.08097948133945465, "learning_rate": 1.0554082227915743e-05, "loss": 0.2587, "num_input_tokens_seen": 96112992, "step": 44535 }, { "epoch": 7.265905383360522, "grad_norm": 0.18207955360412598, "learning_rate": 1.0548273455232634e-05, "loss": 0.1109, "num_input_tokens_seen": 96124480, "step": 44540 }, { "epoch": 7.266721044045677, "grad_norm": 0.4924478530883789, "learning_rate": 1.0542465854079806e-05, "loss": 0.0302, "num_input_tokens_seen": 96134848, "step": 44545 }, { "epoch": 7.267536704730832, "grad_norm": 0.08942606300115585, "learning_rate": 1.0536659424928044e-05, "loss": 0.0074, "num_input_tokens_seen": 96146176, "step": 44550 }, { "epoch": 7.268352365415987, "grad_norm": 0.08673729747533798, "learning_rate": 1.0530854168248064e-05, "loss": 0.017, "num_input_tokens_seen": 96157824, "step": 44555 }, { "epoch": 7.269168026101142, "grad_norm": 0.011961083859205246, "learning_rate": 1.052505008451046e-05, "loss": 0.0036, "num_input_tokens_seen": 96168992, "step": 44560 }, { "epoch": 7.269983686786297, "grad_norm": 1.6557157039642334, "learning_rate": 1.0519247174185742e-05, "loss": 0.0069, "num_input_tokens_seen": 96179712, "step": 44565 }, { "epoch": 7.270799347471452, "grad_norm": 0.4414486885070801, "learning_rate": 1.0513445437744323e-05, "loss": 0.0958, "num_input_tokens_seen": 96190656, "step": 44570 }, { "epoch": 7.271615008156607, "grad_norm": 4.245144367218018, "learning_rate": 1.0507644875656523e-05, "loss": 0.0887, "num_input_tokens_seen": 96201120, "step": 44575 }, { "epoch": 7.2724306688417615, "grad_norm": 0.10975703597068787, "learning_rate": 1.0501845488392558e-05, "loss": 0.0034, "num_input_tokens_seen": 96210304, "step": 44580 }, { "epoch": 7.273246329526917, "grad_norm": 0.08201498538255692, "learning_rate": 1.0496047276422554e-05, "loss": 0.1695, "num_input_tokens_seen": 96221472, "step": 44585 }, { "epoch": 7.274061990212072, "grad_norm": 0.04187267646193504, "learning_rate": 1.0490250240216562e-05, "loss": 0.0037, "num_input_tokens_seen": 96232736, "step": 44590 }, { "epoch": 7.274877650897227, "grad_norm": 0.12512241303920746, "learning_rate": 1.0484454380244505e-05, "loss": 0.2199, "num_input_tokens_seen": 96244352, "step": 44595 }, { "epoch": 7.275693311582382, "grad_norm": 0.023977143689990044, "learning_rate": 1.0478659696976225e-05, "loss": 0.0064, "num_input_tokens_seen": 96254240, "step": 44600 }, { "epoch": 7.2765089722675365, "grad_norm": 0.38099923729896545, "learning_rate": 1.0472866190881473e-05, "loss": 0.0032, "num_input_tokens_seen": 96265696, "step": 44605 }, { "epoch": 7.277324632952691, "grad_norm": 0.1553783416748047, "learning_rate": 1.0467073862429897e-05, "loss": 0.1281, "num_input_tokens_seen": 96277664, "step": 44610 }, { "epoch": 7.278140293637847, "grad_norm": 0.9362660050392151, "learning_rate": 1.0461282712091053e-05, "loss": 0.0076, "num_input_tokens_seen": 96288448, "step": 44615 }, { "epoch": 7.278955954323002, "grad_norm": 0.1234993264079094, "learning_rate": 1.0455492740334399e-05, "loss": 0.0035, "num_input_tokens_seen": 96298432, "step": 44620 }, { "epoch": 7.279771615008157, "grad_norm": 0.007566182874143124, "learning_rate": 1.0449703947629305e-05, "loss": 0.0066, "num_input_tokens_seen": 96309568, "step": 44625 }, { "epoch": 7.280587275693311, "grad_norm": 0.09912440925836563, "learning_rate": 1.0443916334445034e-05, "loss": 0.0019, "num_input_tokens_seen": 96319968, "step": 44630 }, { "epoch": 7.281402936378466, "grad_norm": 0.07563468813896179, "learning_rate": 1.0438129901250762e-05, "loss": 0.0039, "num_input_tokens_seen": 96331232, "step": 44635 }, { "epoch": 7.282218597063622, "grad_norm": 0.3058144450187683, "learning_rate": 1.0432344648515569e-05, "loss": 0.0046, "num_input_tokens_seen": 96341696, "step": 44640 }, { "epoch": 7.283034257748777, "grad_norm": 0.07371566444635391, "learning_rate": 1.042656057670843e-05, "loss": 0.1109, "num_input_tokens_seen": 96352160, "step": 44645 }, { "epoch": 7.283849918433932, "grad_norm": 0.10091523081064224, "learning_rate": 1.042077768629824e-05, "loss": 0.0042, "num_input_tokens_seen": 96362592, "step": 44650 }, { "epoch": 7.284665579119086, "grad_norm": 4.34228515625, "learning_rate": 1.0414995977753772e-05, "loss": 0.0769, "num_input_tokens_seen": 96375424, "step": 44655 }, { "epoch": 7.285481239804241, "grad_norm": 0.24394764006137848, "learning_rate": 1.0409215451543746e-05, "loss": 0.1543, "num_input_tokens_seen": 96387296, "step": 44660 }, { "epoch": 7.286296900489396, "grad_norm": 4.6436004638671875, "learning_rate": 1.0403436108136747e-05, "loss": 0.1301, "num_input_tokens_seen": 96397632, "step": 44665 }, { "epoch": 7.287112561174552, "grad_norm": 0.01952068693935871, "learning_rate": 1.039765794800128e-05, "loss": 0.0031, "num_input_tokens_seen": 96408096, "step": 44670 }, { "epoch": 7.287928221859707, "grad_norm": 0.018300577998161316, "learning_rate": 1.0391880971605749e-05, "loss": 0.0037, "num_input_tokens_seen": 96417408, "step": 44675 }, { "epoch": 7.288743882544861, "grad_norm": 0.2479696422815323, "learning_rate": 1.0386105179418467e-05, "loss": 0.1203, "num_input_tokens_seen": 96427328, "step": 44680 }, { "epoch": 7.289559543230016, "grad_norm": 0.08731896430253983, "learning_rate": 1.0380330571907654e-05, "loss": 0.0028, "num_input_tokens_seen": 96437920, "step": 44685 }, { "epoch": 7.290375203915171, "grad_norm": 4.4533305168151855, "learning_rate": 1.037455714954142e-05, "loss": 0.0815, "num_input_tokens_seen": 96447872, "step": 44690 }, { "epoch": 7.291190864600326, "grad_norm": 0.08045049011707306, "learning_rate": 1.0368784912787794e-05, "loss": 0.0029, "num_input_tokens_seen": 96458592, "step": 44695 }, { "epoch": 7.2920065252854815, "grad_norm": 0.09697709232568741, "learning_rate": 1.03630138621147e-05, "loss": 0.0393, "num_input_tokens_seen": 96469312, "step": 44700 }, { "epoch": 7.292822185970636, "grad_norm": 0.0496763177216053, "learning_rate": 1.035724399798997e-05, "loss": 0.0047, "num_input_tokens_seen": 96478976, "step": 44705 }, { "epoch": 7.293637846655791, "grad_norm": 0.04743843898177147, "learning_rate": 1.035147532088134e-05, "loss": 0.0176, "num_input_tokens_seen": 96489952, "step": 44710 }, { "epoch": 7.294453507340946, "grad_norm": 0.024325326085090637, "learning_rate": 1.0345707831256443e-05, "loss": 0.1312, "num_input_tokens_seen": 96500672, "step": 44715 }, { "epoch": 7.295269168026101, "grad_norm": 5.337332248687744, "learning_rate": 1.0339941529582828e-05, "loss": 0.2575, "num_input_tokens_seen": 96511488, "step": 44720 }, { "epoch": 7.2960848287112565, "grad_norm": 0.0264720618724823, "learning_rate": 1.0334176416327935e-05, "loss": 0.1006, "num_input_tokens_seen": 96522336, "step": 44725 }, { "epoch": 7.296900489396411, "grad_norm": 0.0639074295759201, "learning_rate": 1.0328412491959104e-05, "loss": 0.2508, "num_input_tokens_seen": 96533088, "step": 44730 }, { "epoch": 7.297716150081566, "grad_norm": 0.10456375032663345, "learning_rate": 1.0322649756943611e-05, "loss": 0.0031, "num_input_tokens_seen": 96543552, "step": 44735 }, { "epoch": 7.298531810766721, "grad_norm": 0.02855144441127777, "learning_rate": 1.0316888211748601e-05, "loss": 0.1592, "num_input_tokens_seen": 96554432, "step": 44740 }, { "epoch": 7.299347471451876, "grad_norm": 28.35318374633789, "learning_rate": 1.0311127856841136e-05, "loss": 0.061, "num_input_tokens_seen": 96565536, "step": 44745 }, { "epoch": 7.300163132137031, "grad_norm": 0.12948469817638397, "learning_rate": 1.0305368692688174e-05, "loss": 0.0048, "num_input_tokens_seen": 96576576, "step": 44750 }, { "epoch": 7.300978792822186, "grad_norm": 17.168794631958008, "learning_rate": 1.0299610719756587e-05, "loss": 0.0408, "num_input_tokens_seen": 96588640, "step": 44755 }, { "epoch": 7.301794453507341, "grad_norm": 0.06320144981145859, "learning_rate": 1.0293853938513142e-05, "loss": 0.0941, "num_input_tokens_seen": 96600160, "step": 44760 }, { "epoch": 7.302610114192496, "grad_norm": 0.07234900444746017, "learning_rate": 1.028809834942451e-05, "loss": 0.3319, "num_input_tokens_seen": 96610944, "step": 44765 }, { "epoch": 7.303425774877651, "grad_norm": 2.9788599014282227, "learning_rate": 1.028234395295728e-05, "loss": 0.1095, "num_input_tokens_seen": 96620960, "step": 44770 }, { "epoch": 7.304241435562806, "grad_norm": 0.05842946842312813, "learning_rate": 1.0276590749577924e-05, "loss": 0.2737, "num_input_tokens_seen": 96630592, "step": 44775 }, { "epoch": 7.30505709624796, "grad_norm": 0.01867961883544922, "learning_rate": 1.027083873975283e-05, "loss": 0.0064, "num_input_tokens_seen": 96642368, "step": 44780 }, { "epoch": 7.305872756933116, "grad_norm": 0.20065642893314362, "learning_rate": 1.0265087923948283e-05, "loss": 0.0117, "num_input_tokens_seen": 96654464, "step": 44785 }, { "epoch": 7.306688417618271, "grad_norm": 16.948871612548828, "learning_rate": 1.0259338302630472e-05, "loss": 0.1915, "num_input_tokens_seen": 96664992, "step": 44790 }, { "epoch": 7.307504078303426, "grad_norm": 0.08691882342100143, "learning_rate": 1.025358987626549e-05, "loss": 0.1411, "num_input_tokens_seen": 96675168, "step": 44795 }, { "epoch": 7.308319738988581, "grad_norm": 20.442317962646484, "learning_rate": 1.0247842645319339e-05, "loss": 0.0929, "num_input_tokens_seen": 96684992, "step": 44800 }, { "epoch": 7.309135399673735, "grad_norm": 0.11712927371263504, "learning_rate": 1.0242096610257911e-05, "loss": 0.0043, "num_input_tokens_seen": 96695808, "step": 44805 }, { "epoch": 7.309951060358891, "grad_norm": 0.2057601511478424, "learning_rate": 1.0236351771547014e-05, "loss": 0.0928, "num_input_tokens_seen": 96707840, "step": 44810 }, { "epoch": 7.310766721044046, "grad_norm": 5.6143999099731445, "learning_rate": 1.0230608129652355e-05, "loss": 0.1065, "num_input_tokens_seen": 96718784, "step": 44815 }, { "epoch": 7.311582381729201, "grad_norm": 0.10936146974563599, "learning_rate": 1.022486568503954e-05, "loss": 0.003, "num_input_tokens_seen": 96729376, "step": 44820 }, { "epoch": 7.3123980424143555, "grad_norm": 0.06403083354234695, "learning_rate": 1.0219124438174076e-05, "loss": 0.0043, "num_input_tokens_seen": 96740640, "step": 44825 }, { "epoch": 7.31321370309951, "grad_norm": 0.021619712933897972, "learning_rate": 1.0213384389521385e-05, "loss": 0.0052, "num_input_tokens_seen": 96751104, "step": 44830 }, { "epoch": 7.314029363784665, "grad_norm": 0.05380615219473839, "learning_rate": 1.0207645539546784e-05, "loss": 0.2215, "num_input_tokens_seen": 96760672, "step": 44835 }, { "epoch": 7.314845024469821, "grad_norm": 0.08868793398141861, "learning_rate": 1.020190788871549e-05, "loss": 0.0137, "num_input_tokens_seen": 96771840, "step": 44840 }, { "epoch": 7.315660685154976, "grad_norm": 0.0648154690861702, "learning_rate": 1.0196171437492627e-05, "loss": 0.1151, "num_input_tokens_seen": 96782592, "step": 44845 }, { "epoch": 7.3164763458401305, "grad_norm": 0.03126976639032364, "learning_rate": 1.0190436186343218e-05, "loss": 0.1971, "num_input_tokens_seen": 96794016, "step": 44850 }, { "epoch": 7.317292006525285, "grad_norm": 0.10227061808109283, "learning_rate": 1.01847021357322e-05, "loss": 0.1948, "num_input_tokens_seen": 96803968, "step": 44855 }, { "epoch": 7.31810766721044, "grad_norm": 0.17148098349571228, "learning_rate": 1.0178969286124396e-05, "loss": 0.1504, "num_input_tokens_seen": 96814304, "step": 44860 }, { "epoch": 7.318923327895595, "grad_norm": 0.04482850804924965, "learning_rate": 1.0173237637984542e-05, "loss": 0.0046, "num_input_tokens_seen": 96824128, "step": 44865 }, { "epoch": 7.319738988580751, "grad_norm": 0.06380118429660797, "learning_rate": 1.016750719177727e-05, "loss": 0.113, "num_input_tokens_seen": 96835712, "step": 44870 }, { "epoch": 7.3205546492659055, "grad_norm": 0.08428000658750534, "learning_rate": 1.016177794796713e-05, "loss": 0.0032, "num_input_tokens_seen": 96846272, "step": 44875 }, { "epoch": 7.32137030995106, "grad_norm": 0.07979920506477356, "learning_rate": 1.0156049907018562e-05, "loss": 0.0058, "num_input_tokens_seen": 96857408, "step": 44880 }, { "epoch": 7.322185970636215, "grad_norm": 0.06582298129796982, "learning_rate": 1.0150323069395901e-05, "loss": 0.1055, "num_input_tokens_seen": 96868032, "step": 44885 }, { "epoch": 7.32300163132137, "grad_norm": 0.053582530468702316, "learning_rate": 1.01445974355634e-05, "loss": 0.1086, "num_input_tokens_seen": 96879136, "step": 44890 }, { "epoch": 7.323817292006526, "grad_norm": 0.08738100528717041, "learning_rate": 1.0138873005985208e-05, "loss": 0.0854, "num_input_tokens_seen": 96890016, "step": 44895 }, { "epoch": 7.3246329526916805, "grad_norm": 0.057709500193595886, "learning_rate": 1.0133149781125365e-05, "loss": 0.1186, "num_input_tokens_seen": 96900160, "step": 44900 }, { "epoch": 7.325448613376835, "grad_norm": 0.277441143989563, "learning_rate": 1.0127427761447842e-05, "loss": 0.0941, "num_input_tokens_seen": 96909856, "step": 44905 }, { "epoch": 7.32626427406199, "grad_norm": 0.19494812190532684, "learning_rate": 1.012170694741649e-05, "loss": 0.0052, "num_input_tokens_seen": 96921280, "step": 44910 }, { "epoch": 7.327079934747145, "grad_norm": 6.171853542327881, "learning_rate": 1.0115987339495061e-05, "loss": 0.0378, "num_input_tokens_seen": 96931136, "step": 44915 }, { "epoch": 7.327895595432301, "grad_norm": 0.04178924486041069, "learning_rate": 1.0110268938147222e-05, "loss": 0.0029, "num_input_tokens_seen": 96942112, "step": 44920 }, { "epoch": 7.328711256117455, "grad_norm": 0.10311181098222733, "learning_rate": 1.0104551743836532e-05, "loss": 0.0384, "num_input_tokens_seen": 96953664, "step": 44925 }, { "epoch": 7.32952691680261, "grad_norm": 0.05509466305375099, "learning_rate": 1.0098835757026457e-05, "loss": 0.0775, "num_input_tokens_seen": 96964800, "step": 44930 }, { "epoch": 7.330342577487765, "grad_norm": 0.2352639138698578, "learning_rate": 1.009312097818036e-05, "loss": 0.0084, "num_input_tokens_seen": 96975680, "step": 44935 }, { "epoch": 7.33115823817292, "grad_norm": 0.07198192179203033, "learning_rate": 1.0087407407761515e-05, "loss": 0.0589, "num_input_tokens_seen": 96986496, "step": 44940 }, { "epoch": 7.331973898858075, "grad_norm": 0.13894616067409515, "learning_rate": 1.0081695046233091e-05, "loss": 0.0047, "num_input_tokens_seen": 96997184, "step": 44945 }, { "epoch": 7.33278955954323, "grad_norm": 0.12491953372955322, "learning_rate": 1.0075983894058163e-05, "loss": 0.0037, "num_input_tokens_seen": 97008416, "step": 44950 }, { "epoch": 7.333605220228385, "grad_norm": 0.06016465276479721, "learning_rate": 1.0070273951699704e-05, "loss": 0.0048, "num_input_tokens_seen": 97020000, "step": 44955 }, { "epoch": 7.33442088091354, "grad_norm": 0.557112455368042, "learning_rate": 1.0064565219620593e-05, "loss": 0.0069, "num_input_tokens_seen": 97030752, "step": 44960 }, { "epoch": 7.335236541598695, "grad_norm": 0.12895114719867706, "learning_rate": 1.0058857698283603e-05, "loss": 0.3145, "num_input_tokens_seen": 97040416, "step": 44965 }, { "epoch": 7.33605220228385, "grad_norm": 0.05453105270862579, "learning_rate": 1.0053151388151418e-05, "loss": 0.214, "num_input_tokens_seen": 97051936, "step": 44970 }, { "epoch": 7.3368678629690045, "grad_norm": 0.07821512222290039, "learning_rate": 1.0047446289686615e-05, "loss": 0.0877, "num_input_tokens_seen": 97060864, "step": 44975 }, { "epoch": 7.33768352365416, "grad_norm": 0.22982530295848846, "learning_rate": 1.0041742403351693e-05, "loss": 0.01, "num_input_tokens_seen": 97073376, "step": 44980 }, { "epoch": 7.338499184339315, "grad_norm": 0.11923679709434509, "learning_rate": 1.0036039729609029e-05, "loss": 0.0064, "num_input_tokens_seen": 97085696, "step": 44985 }, { "epoch": 7.33931484502447, "grad_norm": 0.05471346154808998, "learning_rate": 1.0030338268920911e-05, "loss": 0.0111, "num_input_tokens_seen": 97097856, "step": 44990 }, { "epoch": 7.340130505709625, "grad_norm": 0.04758811369538307, "learning_rate": 1.0024638021749527e-05, "loss": 0.0056, "num_input_tokens_seen": 97109280, "step": 44995 }, { "epoch": 7.3409461663947795, "grad_norm": 0.05021590366959572, "learning_rate": 1.001893898855697e-05, "loss": 0.1206, "num_input_tokens_seen": 97121728, "step": 45000 }, { "epoch": 7.341761827079935, "grad_norm": 0.07956917583942413, "learning_rate": 1.0013241169805232e-05, "loss": 0.0037, "num_input_tokens_seen": 97132480, "step": 45005 }, { "epoch": 7.34257748776509, "grad_norm": 0.06605811417102814, "learning_rate": 1.0007544565956206e-05, "loss": 0.0028, "num_input_tokens_seen": 97143904, "step": 45010 }, { "epoch": 7.343393148450245, "grad_norm": 0.0318470299243927, "learning_rate": 1.0001849177471687e-05, "loss": 0.0037, "num_input_tokens_seen": 97154816, "step": 45015 }, { "epoch": 7.3442088091354, "grad_norm": 22.52471923828125, "learning_rate": 9.996155004813376e-06, "loss": 0.0245, "num_input_tokens_seen": 97165664, "step": 45020 }, { "epoch": 7.3450244698205545, "grad_norm": 0.047088466584682465, "learning_rate": 9.99046204844287e-06, "loss": 0.1246, "num_input_tokens_seen": 97174976, "step": 45025 }, { "epoch": 7.345840130505709, "grad_norm": 0.02915360778570175, "learning_rate": 9.984770308821664e-06, "loss": 0.0046, "num_input_tokens_seen": 97186720, "step": 45030 }, { "epoch": 7.346655791190865, "grad_norm": 0.21505773067474365, "learning_rate": 9.979079786411167e-06, "loss": 0.0064, "num_input_tokens_seen": 97196608, "step": 45035 }, { "epoch": 7.34747145187602, "grad_norm": 9.541070938110352, "learning_rate": 9.973390481672676e-06, "loss": 0.0138, "num_input_tokens_seen": 97208352, "step": 45040 }, { "epoch": 7.348287112561175, "grad_norm": 0.13850171864032745, "learning_rate": 9.967702395067388e-06, "loss": 0.0101, "num_input_tokens_seen": 97218048, "step": 45045 }, { "epoch": 7.349102773246329, "grad_norm": 0.03252837806940079, "learning_rate": 9.962015527056429e-06, "loss": 0.1767, "num_input_tokens_seen": 97229920, "step": 45050 }, { "epoch": 7.349918433931484, "grad_norm": 0.04658183827996254, "learning_rate": 9.95632987810079e-06, "loss": 0.0068, "num_input_tokens_seen": 97242816, "step": 45055 }, { "epoch": 7.350734094616639, "grad_norm": 0.05596194788813591, "learning_rate": 9.950645448661381e-06, "loss": 0.0064, "num_input_tokens_seen": 97254656, "step": 45060 }, { "epoch": 7.351549755301795, "grad_norm": 0.015002463944256306, "learning_rate": 9.944962239199013e-06, "loss": 0.1525, "num_input_tokens_seen": 97266432, "step": 45065 }, { "epoch": 7.35236541598695, "grad_norm": 0.7433382272720337, "learning_rate": 9.939280250174396e-06, "loss": 0.0126, "num_input_tokens_seen": 97278080, "step": 45070 }, { "epoch": 7.353181076672104, "grad_norm": 4.4932451248168945, "learning_rate": 9.933599482048136e-06, "loss": 0.0914, "num_input_tokens_seen": 97288672, "step": 45075 }, { "epoch": 7.353996737357259, "grad_norm": 0.0814744308590889, "learning_rate": 9.927919935280752e-06, "loss": 0.0036, "num_input_tokens_seen": 97300736, "step": 45080 }, { "epoch": 7.354812398042414, "grad_norm": 0.06717698276042938, "learning_rate": 9.922241610332641e-06, "loss": 0.0441, "num_input_tokens_seen": 97311808, "step": 45085 }, { "epoch": 7.35562805872757, "grad_norm": 0.036793094128370285, "learning_rate": 9.91656450766414e-06, "loss": 0.1405, "num_input_tokens_seen": 97321120, "step": 45090 }, { "epoch": 7.356443719412725, "grad_norm": 0.11527273803949356, "learning_rate": 9.91088862773545e-06, "loss": 0.1126, "num_input_tokens_seen": 97331712, "step": 45095 }, { "epoch": 7.357259380097879, "grad_norm": 0.12357182055711746, "learning_rate": 9.90521397100669e-06, "loss": 0.003, "num_input_tokens_seen": 97342720, "step": 45100 }, { "epoch": 7.358075040783034, "grad_norm": 0.10513553768396378, "learning_rate": 9.899540537937879e-06, "loss": 0.0063, "num_input_tokens_seen": 97353920, "step": 45105 }, { "epoch": 7.358890701468189, "grad_norm": 0.35938525199890137, "learning_rate": 9.893868328988928e-06, "loss": 0.0045, "num_input_tokens_seen": 97364320, "step": 45110 }, { "epoch": 7.359706362153344, "grad_norm": 0.026927918195724487, "learning_rate": 9.888197344619657e-06, "loss": 0.007, "num_input_tokens_seen": 97374656, "step": 45115 }, { "epoch": 7.3605220228384995, "grad_norm": 0.25100481510162354, "learning_rate": 9.882527585289788e-06, "loss": 0.0053, "num_input_tokens_seen": 97386272, "step": 45120 }, { "epoch": 7.361337683523654, "grad_norm": 4.940239906311035, "learning_rate": 9.876859051458937e-06, "loss": 0.0699, "num_input_tokens_seen": 97396992, "step": 45125 }, { "epoch": 7.362153344208809, "grad_norm": 0.03793177381157875, "learning_rate": 9.871191743586624e-06, "loss": 0.1292, "num_input_tokens_seen": 97407296, "step": 45130 }, { "epoch": 7.362969004893964, "grad_norm": 0.03328360989689827, "learning_rate": 9.865525662132274e-06, "loss": 0.2463, "num_input_tokens_seen": 97417824, "step": 45135 }, { "epoch": 7.363784665579119, "grad_norm": 0.011244897730648518, "learning_rate": 9.859860807555204e-06, "loss": 0.0019, "num_input_tokens_seen": 97428096, "step": 45140 }, { "epoch": 7.364600326264274, "grad_norm": 3.699322462081909, "learning_rate": 9.854197180314639e-06, "loss": 0.0702, "num_input_tokens_seen": 97439680, "step": 45145 }, { "epoch": 7.365415986949429, "grad_norm": 0.06068158149719238, "learning_rate": 9.848534780869698e-06, "loss": 0.1016, "num_input_tokens_seen": 97450880, "step": 45150 }, { "epoch": 7.366231647634584, "grad_norm": 10.884716033935547, "learning_rate": 9.842873609679404e-06, "loss": 0.4667, "num_input_tokens_seen": 97461568, "step": 45155 }, { "epoch": 7.367047308319739, "grad_norm": 0.027186408638954163, "learning_rate": 9.837213667202682e-06, "loss": 0.2195, "num_input_tokens_seen": 97472416, "step": 45160 }, { "epoch": 7.367862969004894, "grad_norm": 0.09181984513998032, "learning_rate": 9.83155495389836e-06, "loss": 0.1054, "num_input_tokens_seen": 97483744, "step": 45165 }, { "epoch": 7.368678629690049, "grad_norm": 0.04382199048995972, "learning_rate": 9.82589747022515e-06, "loss": 0.0064, "num_input_tokens_seen": 97494752, "step": 45170 }, { "epoch": 7.369494290375204, "grad_norm": 0.01950172521173954, "learning_rate": 9.82024121664169e-06, "loss": 0.002, "num_input_tokens_seen": 97505056, "step": 45175 }, { "epoch": 7.370309951060359, "grad_norm": 0.4417515993118286, "learning_rate": 9.814586193606496e-06, "loss": 0.146, "num_input_tokens_seen": 97516352, "step": 45180 }, { "epoch": 7.371125611745514, "grad_norm": 0.030991017818450928, "learning_rate": 9.808932401577994e-06, "loss": 0.1999, "num_input_tokens_seen": 97526784, "step": 45185 }, { "epoch": 7.371941272430669, "grad_norm": 0.46684086322784424, "learning_rate": 9.803279841014501e-06, "loss": 0.0053, "num_input_tokens_seen": 97538368, "step": 45190 }, { "epoch": 7.372756933115824, "grad_norm": 3.5872461795806885, "learning_rate": 9.797628512374262e-06, "loss": 0.1035, "num_input_tokens_seen": 97549440, "step": 45195 }, { "epoch": 7.373572593800978, "grad_norm": 0.10208668559789658, "learning_rate": 9.791978416115393e-06, "loss": 0.0788, "num_input_tokens_seen": 97559264, "step": 45200 }, { "epoch": 7.374388254486134, "grad_norm": 0.15362916886806488, "learning_rate": 9.786329552695916e-06, "loss": 0.0061, "num_input_tokens_seen": 97569312, "step": 45205 }, { "epoch": 7.375203915171289, "grad_norm": 0.10528465360403061, "learning_rate": 9.780681922573759e-06, "loss": 0.0033, "num_input_tokens_seen": 97579712, "step": 45210 }, { "epoch": 7.376019575856444, "grad_norm": 0.09068232774734497, "learning_rate": 9.775035526206741e-06, "loss": 0.0038, "num_input_tokens_seen": 97590912, "step": 45215 }, { "epoch": 7.376835236541599, "grad_norm": 0.060372915118932724, "learning_rate": 9.7693903640526e-06, "loss": 0.0054, "num_input_tokens_seen": 97602176, "step": 45220 }, { "epoch": 7.377650897226753, "grad_norm": 0.1669987291097641, "learning_rate": 9.763746436568957e-06, "loss": 0.1188, "num_input_tokens_seen": 97612128, "step": 45225 }, { "epoch": 7.378466557911908, "grad_norm": 0.1839768886566162, "learning_rate": 9.758103744213334e-06, "loss": 0.1168, "num_input_tokens_seen": 97622528, "step": 45230 }, { "epoch": 7.379282218597064, "grad_norm": 0.06200070306658745, "learning_rate": 9.752462287443163e-06, "loss": 0.0407, "num_input_tokens_seen": 97634304, "step": 45235 }, { "epoch": 7.380097879282219, "grad_norm": 0.2495262175798416, "learning_rate": 9.746822066715757e-06, "loss": 0.1293, "num_input_tokens_seen": 97644864, "step": 45240 }, { "epoch": 7.3809135399673735, "grad_norm": 0.12641514837741852, "learning_rate": 9.741183082488354e-06, "loss": 0.0061, "num_input_tokens_seen": 97656768, "step": 45245 }, { "epoch": 7.381729200652528, "grad_norm": 8.78662109375, "learning_rate": 9.73554533521807e-06, "loss": 0.25, "num_input_tokens_seen": 97668416, "step": 45250 }, { "epoch": 7.382544861337683, "grad_norm": 1.1876436471939087, "learning_rate": 9.729908825361933e-06, "loss": 0.0059, "num_input_tokens_seen": 97679392, "step": 45255 }, { "epoch": 7.383360522022839, "grad_norm": 0.351886510848999, "learning_rate": 9.724273553376864e-06, "loss": 0.2233, "num_input_tokens_seen": 97689312, "step": 45260 }, { "epoch": 7.384176182707994, "grad_norm": 0.07911387830972672, "learning_rate": 9.718639519719695e-06, "loss": 0.003, "num_input_tokens_seen": 97700448, "step": 45265 }, { "epoch": 7.3849918433931485, "grad_norm": 13.688436508178711, "learning_rate": 9.713006724847137e-06, "loss": 0.0675, "num_input_tokens_seen": 97710528, "step": 45270 }, { "epoch": 7.385807504078303, "grad_norm": 0.13461512327194214, "learning_rate": 9.70737516921582e-06, "loss": 0.0056, "num_input_tokens_seen": 97718720, "step": 45275 }, { "epoch": 7.386623164763458, "grad_norm": 0.03747570514678955, "learning_rate": 9.70174485328227e-06, "loss": 0.1638, "num_input_tokens_seen": 97727744, "step": 45280 }, { "epoch": 7.387438825448613, "grad_norm": 0.05718259885907173, "learning_rate": 9.6961157775029e-06, "loss": 0.0022, "num_input_tokens_seen": 97738112, "step": 45285 }, { "epoch": 7.388254486133769, "grad_norm": 0.07959671318531036, "learning_rate": 9.69048794233404e-06, "loss": 0.0056, "num_input_tokens_seen": 97748320, "step": 45290 }, { "epoch": 7.3890701468189235, "grad_norm": 0.05740109831094742, "learning_rate": 9.684861348231897e-06, "loss": 0.0962, "num_input_tokens_seen": 97760480, "step": 45295 }, { "epoch": 7.389885807504078, "grad_norm": 0.1578749567270279, "learning_rate": 9.67923599565261e-06, "loss": 0.1199, "num_input_tokens_seen": 97770912, "step": 45300 }, { "epoch": 7.390701468189233, "grad_norm": 6.2333855628967285, "learning_rate": 9.673611885052189e-06, "loss": 0.4056, "num_input_tokens_seen": 97781984, "step": 45305 }, { "epoch": 7.391517128874388, "grad_norm": 3.5526957511901855, "learning_rate": 9.667989016886555e-06, "loss": 0.1209, "num_input_tokens_seen": 97793152, "step": 45310 }, { "epoch": 7.392332789559543, "grad_norm": 0.04285155236721039, "learning_rate": 9.662367391611526e-06, "loss": 0.1422, "num_input_tokens_seen": 97804896, "step": 45315 }, { "epoch": 7.3931484502446985, "grad_norm": 0.09960167855024338, "learning_rate": 9.656747009682817e-06, "loss": 0.0832, "num_input_tokens_seen": 97815008, "step": 45320 }, { "epoch": 7.393964110929853, "grad_norm": 4.54679012298584, "learning_rate": 9.651127871556049e-06, "loss": 0.0096, "num_input_tokens_seen": 97826400, "step": 45325 }, { "epoch": 7.394779771615008, "grad_norm": 0.09120772033929825, "learning_rate": 9.645509977686731e-06, "loss": 0.1545, "num_input_tokens_seen": 97837024, "step": 45330 }, { "epoch": 7.395595432300163, "grad_norm": 0.05697239562869072, "learning_rate": 9.639893328530283e-06, "loss": 0.0107, "num_input_tokens_seen": 97847616, "step": 45335 }, { "epoch": 7.396411092985318, "grad_norm": 0.03465592488646507, "learning_rate": 9.63427792454202e-06, "loss": 0.0696, "num_input_tokens_seen": 97858464, "step": 45340 }, { "epoch": 7.397226753670473, "grad_norm": 13.479477882385254, "learning_rate": 9.628663766177154e-06, "loss": 0.0887, "num_input_tokens_seen": 97868224, "step": 45345 }, { "epoch": 7.398042414355628, "grad_norm": 0.41337093710899353, "learning_rate": 9.623050853890795e-06, "loss": 0.1268, "num_input_tokens_seen": 97879520, "step": 45350 }, { "epoch": 7.398858075040783, "grad_norm": 0.1932287961244583, "learning_rate": 9.617439188137956e-06, "loss": 0.0045, "num_input_tokens_seen": 97889568, "step": 45355 }, { "epoch": 7.399673735725938, "grad_norm": 0.1612764447927475, "learning_rate": 9.611828769373538e-06, "loss": 0.0047, "num_input_tokens_seen": 97899840, "step": 45360 }, { "epoch": 7.400489396411093, "grad_norm": 3.143439292907715, "learning_rate": 9.606219598052366e-06, "loss": 0.1091, "num_input_tokens_seen": 97910112, "step": 45365 }, { "epoch": 7.401305057096248, "grad_norm": 0.42711320519447327, "learning_rate": 9.600611674629143e-06, "loss": 0.0042, "num_input_tokens_seen": 97921664, "step": 45370 }, { "epoch": 7.402120717781403, "grad_norm": 1.0538039207458496, "learning_rate": 9.595004999558471e-06, "loss": 0.0961, "num_input_tokens_seen": 97934112, "step": 45375 }, { "epoch": 7.402936378466558, "grad_norm": 0.22625838220119476, "learning_rate": 9.58939957329486e-06, "loss": 0.0054, "num_input_tokens_seen": 97945760, "step": 45380 }, { "epoch": 7.403752039151713, "grad_norm": 0.06716348975896835, "learning_rate": 9.58379539629271e-06, "loss": 0.3946, "num_input_tokens_seen": 97956320, "step": 45385 }, { "epoch": 7.404567699836868, "grad_norm": 0.2427644580602646, "learning_rate": 9.578192469006328e-06, "loss": 0.1536, "num_input_tokens_seen": 97966368, "step": 45390 }, { "epoch": 7.4053833605220225, "grad_norm": 0.08670927584171295, "learning_rate": 9.572590791889916e-06, "loss": 0.1181, "num_input_tokens_seen": 97976736, "step": 45395 }, { "epoch": 7.406199021207178, "grad_norm": 0.09502453356981277, "learning_rate": 9.56699036539756e-06, "loss": 0.0772, "num_input_tokens_seen": 97987616, "step": 45400 }, { "epoch": 7.407014681892333, "grad_norm": 0.05197295546531677, "learning_rate": 9.561391189983281e-06, "loss": 0.0732, "num_input_tokens_seen": 97999424, "step": 45405 }, { "epoch": 7.407830342577488, "grad_norm": 0.1300973743200302, "learning_rate": 9.555793266100969e-06, "loss": 0.1041, "num_input_tokens_seen": 98010688, "step": 45410 }, { "epoch": 7.408646003262643, "grad_norm": 32.43983840942383, "learning_rate": 9.550196594204413e-06, "loss": 0.0672, "num_input_tokens_seen": 98022304, "step": 45415 }, { "epoch": 7.4094616639477975, "grad_norm": 0.5860154628753662, "learning_rate": 9.544601174747316e-06, "loss": 0.0052, "num_input_tokens_seen": 98032448, "step": 45420 }, { "epoch": 7.410277324632952, "grad_norm": 0.3548831045627594, "learning_rate": 9.539007008183267e-06, "loss": 0.1053, "num_input_tokens_seen": 98043936, "step": 45425 }, { "epoch": 7.411092985318108, "grad_norm": 0.09791775792837143, "learning_rate": 9.533414094965759e-06, "loss": 0.1139, "num_input_tokens_seen": 98054880, "step": 45430 }, { "epoch": 7.411908646003263, "grad_norm": 0.21384207904338837, "learning_rate": 9.527822435548181e-06, "loss": 0.0045, "num_input_tokens_seen": 98065440, "step": 45435 }, { "epoch": 7.412724306688418, "grad_norm": 0.06359481066465378, "learning_rate": 9.522232030383822e-06, "loss": 0.0055, "num_input_tokens_seen": 98076256, "step": 45440 }, { "epoch": 7.4135399673735725, "grad_norm": 17.562448501586914, "learning_rate": 9.516642879925865e-06, "loss": 0.0194, "num_input_tokens_seen": 98087648, "step": 45445 }, { "epoch": 7.414355628058727, "grad_norm": 0.07706096768379211, "learning_rate": 9.511054984627402e-06, "loss": 0.106, "num_input_tokens_seen": 98097952, "step": 45450 }, { "epoch": 7.415171288743883, "grad_norm": 0.21106182038784027, "learning_rate": 9.50546834494141e-06, "loss": 0.0957, "num_input_tokens_seen": 98109440, "step": 45455 }, { "epoch": 7.415986949429038, "grad_norm": 2.599562168121338, "learning_rate": 9.499882961320771e-06, "loss": 0.0087, "num_input_tokens_seen": 98120160, "step": 45460 }, { "epoch": 7.416802610114193, "grad_norm": 0.1349298506975174, "learning_rate": 9.494298834218268e-06, "loss": 0.0819, "num_input_tokens_seen": 98131488, "step": 45465 }, { "epoch": 7.417618270799347, "grad_norm": 0.030637389048933983, "learning_rate": 9.488715964086575e-06, "loss": 0.0047, "num_input_tokens_seen": 98142528, "step": 45470 }, { "epoch": 7.418433931484502, "grad_norm": 0.07534648478031158, "learning_rate": 9.48313435137827e-06, "loss": 0.0065, "num_input_tokens_seen": 98153472, "step": 45475 }, { "epoch": 7.419249592169657, "grad_norm": 0.07768328487873077, "learning_rate": 9.47755399654583e-06, "loss": 0.1942, "num_input_tokens_seen": 98162784, "step": 45480 }, { "epoch": 7.420065252854813, "grad_norm": 0.10556311905384064, "learning_rate": 9.47197490004162e-06, "loss": 0.1105, "num_input_tokens_seen": 98173376, "step": 45485 }, { "epoch": 7.420880913539968, "grad_norm": 0.1772138774394989, "learning_rate": 9.466397062317911e-06, "loss": 0.1223, "num_input_tokens_seen": 98183872, "step": 45490 }, { "epoch": 7.421696574225122, "grad_norm": 0.07352486997842789, "learning_rate": 9.460820483826874e-06, "loss": 0.1066, "num_input_tokens_seen": 98195200, "step": 45495 }, { "epoch": 7.422512234910277, "grad_norm": 0.30517813563346863, "learning_rate": 9.455245165020565e-06, "loss": 0.1547, "num_input_tokens_seen": 98205344, "step": 45500 }, { "epoch": 7.423327895595432, "grad_norm": 0.0715760663151741, "learning_rate": 9.449671106350966e-06, "loss": 0.07, "num_input_tokens_seen": 98216896, "step": 45505 }, { "epoch": 7.424143556280587, "grad_norm": 0.9469295144081116, "learning_rate": 9.444098308269931e-06, "loss": 0.1897, "num_input_tokens_seen": 98228896, "step": 45510 }, { "epoch": 7.424959216965743, "grad_norm": 3.0780670642852783, "learning_rate": 9.438526771229212e-06, "loss": 0.2251, "num_input_tokens_seen": 98239488, "step": 45515 }, { "epoch": 7.425774877650897, "grad_norm": 0.08582847565412521, "learning_rate": 9.432956495680473e-06, "loss": 0.0055, "num_input_tokens_seen": 98251232, "step": 45520 }, { "epoch": 7.426590538336052, "grad_norm": 1.2630985975265503, "learning_rate": 9.42738748207526e-06, "loss": 0.01, "num_input_tokens_seen": 98262432, "step": 45525 }, { "epoch": 7.427406199021207, "grad_norm": 0.013918722979724407, "learning_rate": 9.421819730865039e-06, "loss": 0.3242, "num_input_tokens_seen": 98272320, "step": 45530 }, { "epoch": 7.428221859706362, "grad_norm": 0.07709713280200958, "learning_rate": 9.416253242501155e-06, "loss": 0.0107, "num_input_tokens_seen": 98283904, "step": 45535 }, { "epoch": 7.4290375203915175, "grad_norm": 3.635129690170288, "learning_rate": 9.410688017434852e-06, "loss": 0.1949, "num_input_tokens_seen": 98295840, "step": 45540 }, { "epoch": 7.429853181076672, "grad_norm": 0.04166160151362419, "learning_rate": 9.40512405611728e-06, "loss": 0.0043, "num_input_tokens_seen": 98307072, "step": 45545 }, { "epoch": 7.430668841761827, "grad_norm": 0.42729049921035767, "learning_rate": 9.399561358999479e-06, "loss": 0.0054, "num_input_tokens_seen": 98318144, "step": 45550 }, { "epoch": 7.431484502446982, "grad_norm": 0.3752131164073944, "learning_rate": 9.393999926532387e-06, "loss": 0.1307, "num_input_tokens_seen": 98329728, "step": 45555 }, { "epoch": 7.432300163132137, "grad_norm": 0.18965347111225128, "learning_rate": 9.38843975916685e-06, "loss": 0.0037, "num_input_tokens_seen": 98340256, "step": 45560 }, { "epoch": 7.433115823817292, "grad_norm": 0.024965984746813774, "learning_rate": 9.382880857353596e-06, "loss": 0.0045, "num_input_tokens_seen": 98350976, "step": 45565 }, { "epoch": 7.433931484502447, "grad_norm": 0.24310645461082458, "learning_rate": 9.37732322154326e-06, "loss": 0.1285, "num_input_tokens_seen": 98361536, "step": 45570 }, { "epoch": 7.434747145187602, "grad_norm": 0.08490164577960968, "learning_rate": 9.371766852186373e-06, "loss": 0.0039, "num_input_tokens_seen": 98372064, "step": 45575 }, { "epoch": 7.435562805872757, "grad_norm": 17.166574478149414, "learning_rate": 9.366211749733361e-06, "loss": 0.1932, "num_input_tokens_seen": 98382688, "step": 45580 }, { "epoch": 7.436378466557912, "grad_norm": 0.17929571866989136, "learning_rate": 9.36065791463455e-06, "loss": 0.0053, "num_input_tokens_seen": 98393312, "step": 45585 }, { "epoch": 7.437194127243067, "grad_norm": 0.12863053381443024, "learning_rate": 9.355105347340163e-06, "loss": 0.1902, "num_input_tokens_seen": 98403136, "step": 45590 }, { "epoch": 7.438009787928221, "grad_norm": 0.02911662869155407, "learning_rate": 9.349554048300316e-06, "loss": 0.0045, "num_input_tokens_seen": 98414752, "step": 45595 }, { "epoch": 7.438825448613377, "grad_norm": 0.019138168543577194, "learning_rate": 9.344004017965027e-06, "loss": 0.005, "num_input_tokens_seen": 98426496, "step": 45600 }, { "epoch": 7.439641109298532, "grad_norm": 0.1090579703450203, "learning_rate": 9.338455256784212e-06, "loss": 0.1387, "num_input_tokens_seen": 98437696, "step": 45605 }, { "epoch": 7.440456769983687, "grad_norm": 0.5317416191101074, "learning_rate": 9.33290776520768e-06, "loss": 0.1597, "num_input_tokens_seen": 98448640, "step": 45610 }, { "epoch": 7.441272430668842, "grad_norm": 0.07390889525413513, "learning_rate": 9.32736154368513e-06, "loss": 0.0033, "num_input_tokens_seen": 98459360, "step": 45615 }, { "epoch": 7.442088091353996, "grad_norm": 0.20405705273151398, "learning_rate": 9.32181659266618e-06, "loss": 0.1511, "num_input_tokens_seen": 98470560, "step": 45620 }, { "epoch": 7.442903752039152, "grad_norm": 0.07471613585948944, "learning_rate": 9.316272912600332e-06, "loss": 0.0046, "num_input_tokens_seen": 98482432, "step": 45625 }, { "epoch": 7.443719412724307, "grad_norm": 17.78740119934082, "learning_rate": 9.310730503936976e-06, "loss": 0.0247, "num_input_tokens_seen": 98493056, "step": 45630 }, { "epoch": 7.444535073409462, "grad_norm": 0.04233328253030777, "learning_rate": 9.305189367125416e-06, "loss": 0.1337, "num_input_tokens_seen": 98503904, "step": 45635 }, { "epoch": 7.445350734094617, "grad_norm": 0.03839850798249245, "learning_rate": 9.299649502614838e-06, "loss": 0.0717, "num_input_tokens_seen": 98514848, "step": 45640 }, { "epoch": 7.446166394779771, "grad_norm": 12.67846393585205, "learning_rate": 9.294110910854337e-06, "loss": 0.0374, "num_input_tokens_seen": 98525696, "step": 45645 }, { "epoch": 7.446982055464926, "grad_norm": 0.03788448125123978, "learning_rate": 9.288573592292893e-06, "loss": 0.0381, "num_input_tokens_seen": 98536672, "step": 45650 }, { "epoch": 7.447797716150082, "grad_norm": 0.08515933156013489, "learning_rate": 9.283037547379394e-06, "loss": 0.0048, "num_input_tokens_seen": 98546912, "step": 45655 }, { "epoch": 7.448613376835237, "grad_norm": 9.248990058898926, "learning_rate": 9.27750277656262e-06, "loss": 0.4888, "num_input_tokens_seen": 98558336, "step": 45660 }, { "epoch": 7.4494290375203915, "grad_norm": 0.059895455837249756, "learning_rate": 9.271969280291243e-06, "loss": 0.0425, "num_input_tokens_seen": 98568096, "step": 45665 }, { "epoch": 7.450244698205546, "grad_norm": 0.034142039716243744, "learning_rate": 9.266437059013834e-06, "loss": 0.0977, "num_input_tokens_seen": 98579072, "step": 45670 }, { "epoch": 7.451060358890701, "grad_norm": 0.27393639087677, "learning_rate": 9.260906113178875e-06, "loss": 0.1577, "num_input_tokens_seen": 98589600, "step": 45675 }, { "epoch": 7.451876019575856, "grad_norm": 3.8771159648895264, "learning_rate": 9.255376443234725e-06, "loss": 0.2061, "num_input_tokens_seen": 98600864, "step": 45680 }, { "epoch": 7.452691680261012, "grad_norm": 0.08974248170852661, "learning_rate": 9.24984804962965e-06, "loss": 0.0052, "num_input_tokens_seen": 98611968, "step": 45685 }, { "epoch": 7.4535073409461665, "grad_norm": 0.11940401792526245, "learning_rate": 9.244320932811806e-06, "loss": 0.1428, "num_input_tokens_seen": 98622912, "step": 45690 }, { "epoch": 7.454323001631321, "grad_norm": 3.759678363800049, "learning_rate": 9.238795093229252e-06, "loss": 0.4739, "num_input_tokens_seen": 98634592, "step": 45695 }, { "epoch": 7.455138662316476, "grad_norm": 0.05598155036568642, "learning_rate": 9.233270531329937e-06, "loss": 0.121, "num_input_tokens_seen": 98644096, "step": 45700 }, { "epoch": 7.455954323001631, "grad_norm": 0.1402728259563446, "learning_rate": 9.227747247561713e-06, "loss": 0.1641, "num_input_tokens_seen": 98654400, "step": 45705 }, { "epoch": 7.456769983686787, "grad_norm": 0.08239126205444336, "learning_rate": 9.222225242372326e-06, "loss": 0.0032, "num_input_tokens_seen": 98665568, "step": 45710 }, { "epoch": 7.4575856443719415, "grad_norm": 0.10888636112213135, "learning_rate": 9.216704516209417e-06, "loss": 0.1002, "num_input_tokens_seen": 98674624, "step": 45715 }, { "epoch": 7.458401305057096, "grad_norm": 13.814203262329102, "learning_rate": 9.211185069520514e-06, "loss": 0.1048, "num_input_tokens_seen": 98685536, "step": 45720 }, { "epoch": 7.459216965742251, "grad_norm": 0.08124645799398422, "learning_rate": 9.205666902753071e-06, "loss": 0.1477, "num_input_tokens_seen": 98696352, "step": 45725 }, { "epoch": 7.460032626427406, "grad_norm": 0.11474943906068802, "learning_rate": 9.200150016354406e-06, "loss": 0.0704, "num_input_tokens_seen": 98707808, "step": 45730 }, { "epoch": 7.460848287112561, "grad_norm": 13.604469299316406, "learning_rate": 9.19463441077175e-06, "loss": 0.0962, "num_input_tokens_seen": 98718112, "step": 45735 }, { "epoch": 7.4616639477977165, "grad_norm": 3.5292141437530518, "learning_rate": 9.189120086452224e-06, "loss": 0.1251, "num_input_tokens_seen": 98729312, "step": 45740 }, { "epoch": 7.462479608482871, "grad_norm": 0.07247146964073181, "learning_rate": 9.183607043842846e-06, "loss": 0.003, "num_input_tokens_seen": 98739168, "step": 45745 }, { "epoch": 7.463295269168026, "grad_norm": 17.904502868652344, "learning_rate": 9.178095283390533e-06, "loss": 0.4384, "num_input_tokens_seen": 98749376, "step": 45750 }, { "epoch": 7.464110929853181, "grad_norm": 0.0919409990310669, "learning_rate": 9.172584805542098e-06, "loss": 0.0044, "num_input_tokens_seen": 98760928, "step": 45755 }, { "epoch": 7.464926590538336, "grad_norm": 0.1532522439956665, "learning_rate": 9.167075610744244e-06, "loss": 0.0898, "num_input_tokens_seen": 98770560, "step": 45760 }, { "epoch": 7.465742251223491, "grad_norm": 0.1458083689212799, "learning_rate": 9.161567699443577e-06, "loss": 0.1232, "num_input_tokens_seen": 98780512, "step": 45765 }, { "epoch": 7.466557911908646, "grad_norm": 0.6249791383743286, "learning_rate": 9.156061072086597e-06, "loss": 0.0134, "num_input_tokens_seen": 98792192, "step": 45770 }, { "epoch": 7.467373572593801, "grad_norm": 6.886234760284424, "learning_rate": 9.150555729119697e-06, "loss": 0.143, "num_input_tokens_seen": 98802752, "step": 45775 }, { "epoch": 7.468189233278956, "grad_norm": 0.17001597583293915, "learning_rate": 9.14505167098917e-06, "loss": 0.0052, "num_input_tokens_seen": 98813600, "step": 45780 }, { "epoch": 7.469004893964111, "grad_norm": 0.011153833009302616, "learning_rate": 9.139548898141198e-06, "loss": 0.0047, "num_input_tokens_seen": 98825280, "step": 45785 }, { "epoch": 7.4698205546492655, "grad_norm": 2.9685122966766357, "learning_rate": 9.13404741102187e-06, "loss": 0.1782, "num_input_tokens_seen": 98836480, "step": 45790 }, { "epoch": 7.470636215334421, "grad_norm": 15.426504135131836, "learning_rate": 9.128547210077162e-06, "loss": 0.0538, "num_input_tokens_seen": 98847424, "step": 45795 }, { "epoch": 7.471451876019576, "grad_norm": 26.57598876953125, "learning_rate": 9.123048295752948e-06, "loss": 0.0576, "num_input_tokens_seen": 98858016, "step": 45800 }, { "epoch": 7.472267536704731, "grad_norm": 0.04468873515725136, "learning_rate": 9.117550668494998e-06, "loss": 0.0039, "num_input_tokens_seen": 98868864, "step": 45805 }, { "epoch": 7.473083197389886, "grad_norm": 4.287403583526611, "learning_rate": 9.112054328748975e-06, "loss": 0.1979, "num_input_tokens_seen": 98879936, "step": 45810 }, { "epoch": 7.4738988580750405, "grad_norm": 0.07600153237581253, "learning_rate": 9.106559276960439e-06, "loss": 0.0038, "num_input_tokens_seen": 98891328, "step": 45815 }, { "epoch": 7.474714518760196, "grad_norm": 0.15871362388134003, "learning_rate": 9.101065513574856e-06, "loss": 0.0059, "num_input_tokens_seen": 98902592, "step": 45820 }, { "epoch": 7.475530179445351, "grad_norm": 0.10289601236581802, "learning_rate": 9.095573039037574e-06, "loss": 0.0084, "num_input_tokens_seen": 98913440, "step": 45825 }, { "epoch": 7.476345840130506, "grad_norm": 0.1871442198753357, "learning_rate": 9.090081853793838e-06, "loss": 0.0861, "num_input_tokens_seen": 98923808, "step": 45830 }, { "epoch": 7.477161500815661, "grad_norm": 0.09045414626598358, "learning_rate": 9.084591958288797e-06, "loss": 0.0043, "num_input_tokens_seen": 98933856, "step": 45835 }, { "epoch": 7.4779771615008155, "grad_norm": 35.73626708984375, "learning_rate": 9.079103352967471e-06, "loss": 0.0645, "num_input_tokens_seen": 98944960, "step": 45840 }, { "epoch": 7.47879282218597, "grad_norm": 6.172349452972412, "learning_rate": 9.073616038274823e-06, "loss": 0.0824, "num_input_tokens_seen": 98956512, "step": 45845 }, { "epoch": 7.479608482871126, "grad_norm": 0.2083701193332672, "learning_rate": 9.068130014655665e-06, "loss": 0.0919, "num_input_tokens_seen": 98967520, "step": 45850 }, { "epoch": 7.480424143556281, "grad_norm": 0.11684663593769073, "learning_rate": 9.06264528255473e-06, "loss": 0.1304, "num_input_tokens_seen": 98977920, "step": 45855 }, { "epoch": 7.481239804241436, "grad_norm": 0.06528680771589279, "learning_rate": 9.057161842416628e-06, "loss": 0.0855, "num_input_tokens_seen": 98987392, "step": 45860 }, { "epoch": 7.4820554649265905, "grad_norm": 0.1682402789592743, "learning_rate": 9.051679694685885e-06, "loss": 0.0051, "num_input_tokens_seen": 98998304, "step": 45865 }, { "epoch": 7.482871125611745, "grad_norm": 0.20442993938922882, "learning_rate": 9.046198839806905e-06, "loss": 0.1825, "num_input_tokens_seen": 99008480, "step": 45870 }, { "epoch": 7.4836867862969, "grad_norm": 0.10275133699178696, "learning_rate": 9.040719278223997e-06, "loss": 0.1771, "num_input_tokens_seen": 99019584, "step": 45875 }, { "epoch": 7.484502446982056, "grad_norm": 4.467346668243408, "learning_rate": 9.03524101038136e-06, "loss": 0.2166, "num_input_tokens_seen": 99029920, "step": 45880 }, { "epoch": 7.485318107667211, "grad_norm": 2.960280656814575, "learning_rate": 9.02976403672309e-06, "loss": 0.3622, "num_input_tokens_seen": 99040288, "step": 45885 }, { "epoch": 7.486133768352365, "grad_norm": 0.10947456955909729, "learning_rate": 9.02428835769318e-06, "loss": 0.0076, "num_input_tokens_seen": 99053024, "step": 45890 }, { "epoch": 7.48694942903752, "grad_norm": 0.026944268494844437, "learning_rate": 9.018813973735515e-06, "loss": 0.112, "num_input_tokens_seen": 99062720, "step": 45895 }, { "epoch": 7.487765089722675, "grad_norm": 0.05024728551506996, "learning_rate": 9.013340885293878e-06, "loss": 0.0046, "num_input_tokens_seen": 99073728, "step": 45900 }, { "epoch": 7.488580750407831, "grad_norm": 0.13830679655075073, "learning_rate": 9.007869092811944e-06, "loss": 0.4181, "num_input_tokens_seen": 99084704, "step": 45905 }, { "epoch": 7.489396411092986, "grad_norm": 4.213515758514404, "learning_rate": 9.002398596733287e-06, "loss": 0.3499, "num_input_tokens_seen": 99095744, "step": 45910 }, { "epoch": 7.49021207177814, "grad_norm": 0.06781812012195587, "learning_rate": 8.996929397501366e-06, "loss": 0.1147, "num_input_tokens_seen": 99107168, "step": 45915 }, { "epoch": 7.491027732463295, "grad_norm": 0.04535306990146637, "learning_rate": 8.99146149555955e-06, "loss": 0.0146, "num_input_tokens_seen": 99118208, "step": 45920 }, { "epoch": 7.49184339314845, "grad_norm": 0.11362104117870331, "learning_rate": 8.98599489135109e-06, "loss": 0.0051, "num_input_tokens_seen": 99129152, "step": 45925 }, { "epoch": 7.492659053833605, "grad_norm": 0.13913558423519135, "learning_rate": 8.980529585319142e-06, "loss": 0.0038, "num_input_tokens_seen": 99140192, "step": 45930 }, { "epoch": 7.493474714518761, "grad_norm": 3.604703187942505, "learning_rate": 8.975065577906735e-06, "loss": 0.2062, "num_input_tokens_seen": 99150368, "step": 45935 }, { "epoch": 7.494290375203915, "grad_norm": 0.04341362416744232, "learning_rate": 8.969602869556834e-06, "loss": 0.1583, "num_input_tokens_seen": 99161728, "step": 45940 }, { "epoch": 7.49510603588907, "grad_norm": 4.822989463806152, "learning_rate": 8.964141460712258e-06, "loss": 0.1043, "num_input_tokens_seen": 99173024, "step": 45945 }, { "epoch": 7.495921696574225, "grad_norm": 1.2872027158737183, "learning_rate": 8.958681351815742e-06, "loss": 0.1475, "num_input_tokens_seen": 99183008, "step": 45950 }, { "epoch": 7.49673735725938, "grad_norm": 0.03739134222269058, "learning_rate": 8.95322254330991e-06, "loss": 0.0044, "num_input_tokens_seen": 99194976, "step": 45955 }, { "epoch": 7.497553017944535, "grad_norm": 5.683506011962891, "learning_rate": 8.947765035637278e-06, "loss": 0.0957, "num_input_tokens_seen": 99205760, "step": 45960 }, { "epoch": 7.49836867862969, "grad_norm": 0.14213387668132782, "learning_rate": 8.942308829240262e-06, "loss": 0.0785, "num_input_tokens_seen": 99215808, "step": 45965 }, { "epoch": 7.499184339314845, "grad_norm": 0.0866631269454956, "learning_rate": 8.936853924561167e-06, "loss": 0.1909, "num_input_tokens_seen": 99226848, "step": 45970 }, { "epoch": 7.5, "grad_norm": 0.13099905848503113, "learning_rate": 8.931400322042193e-06, "loss": 0.1522, "num_input_tokens_seen": 99237152, "step": 45975 }, { "epoch": 7.5, "eval_loss": 0.18871694803237915, "eval_runtime": 132.9402, "eval_samples_per_second": 20.498, "eval_steps_per_second": 5.13, "num_input_tokens_seen": 99237152, "step": 45975 }, { "epoch": 7.500815660685155, "grad_norm": 4.312325954437256, "learning_rate": 8.925948022125446e-06, "loss": 0.2061, "num_input_tokens_seen": 99247936, "step": 45980 }, { "epoch": 7.50163132137031, "grad_norm": 0.1773970127105713, "learning_rate": 8.9204970252529e-06, "loss": 0.0073, "num_input_tokens_seen": 99259456, "step": 45985 }, { "epoch": 7.502446982055465, "grad_norm": 0.06699678301811218, "learning_rate": 8.91504733186646e-06, "loss": 0.0044, "num_input_tokens_seen": 99269760, "step": 45990 }, { "epoch": 7.50326264274062, "grad_norm": 0.09259167313575745, "learning_rate": 8.909598942407898e-06, "loss": 0.0242, "num_input_tokens_seen": 99280256, "step": 45995 }, { "epoch": 7.504078303425775, "grad_norm": 0.12489908188581467, "learning_rate": 8.904151857318888e-06, "loss": 0.0041, "num_input_tokens_seen": 99289600, "step": 46000 }, { "epoch": 7.50489396411093, "grad_norm": 0.06093369424343109, "learning_rate": 8.898706077040997e-06, "loss": 0.2861, "num_input_tokens_seen": 99300128, "step": 46005 }, { "epoch": 7.505709624796085, "grad_norm": 3.503791093826294, "learning_rate": 8.893261602015687e-06, "loss": 0.1008, "num_input_tokens_seen": 99311680, "step": 46010 }, { "epoch": 7.506525285481239, "grad_norm": 0.23494867980480194, "learning_rate": 8.88781843268432e-06, "loss": 0.2047, "num_input_tokens_seen": 99322976, "step": 46015 }, { "epoch": 7.507340946166395, "grad_norm": 0.05065242946147919, "learning_rate": 8.88237656948814e-06, "loss": 0.1271, "num_input_tokens_seen": 99335584, "step": 46020 }, { "epoch": 7.50815660685155, "grad_norm": 0.0380345843732357, "learning_rate": 8.876936012868297e-06, "loss": 0.0798, "num_input_tokens_seen": 99346528, "step": 46025 }, { "epoch": 7.508972267536705, "grad_norm": 0.08900675177574158, "learning_rate": 8.87149676326583e-06, "loss": 0.0399, "num_input_tokens_seen": 99357280, "step": 46030 }, { "epoch": 7.50978792822186, "grad_norm": 0.11268915981054306, "learning_rate": 8.866058821121667e-06, "loss": 0.1257, "num_input_tokens_seen": 99368192, "step": 46035 }, { "epoch": 7.510603588907014, "grad_norm": 0.2183750420808792, "learning_rate": 8.860622186876632e-06, "loss": 0.0063, "num_input_tokens_seen": 99379104, "step": 46040 }, { "epoch": 7.511419249592169, "grad_norm": 29.613920211791992, "learning_rate": 8.855186860971462e-06, "loss": 0.2434, "num_input_tokens_seen": 99391200, "step": 46045 }, { "epoch": 7.512234910277325, "grad_norm": 0.12104889750480652, "learning_rate": 8.849752843846762e-06, "loss": 0.0953, "num_input_tokens_seen": 99402272, "step": 46050 }, { "epoch": 7.51305057096248, "grad_norm": 0.29253631830215454, "learning_rate": 8.844320135943042e-06, "loss": 0.0077, "num_input_tokens_seen": 99414016, "step": 46055 }, { "epoch": 7.513866231647635, "grad_norm": 0.14891695976257324, "learning_rate": 8.838888737700707e-06, "loss": 0.1026, "num_input_tokens_seen": 99423424, "step": 46060 }, { "epoch": 7.514681892332789, "grad_norm": 0.1279086023569107, "learning_rate": 8.833458649560051e-06, "loss": 0.1157, "num_input_tokens_seen": 99434592, "step": 46065 }, { "epoch": 7.515497553017944, "grad_norm": 2.201295852661133, "learning_rate": 8.828029871961263e-06, "loss": 0.0895, "num_input_tokens_seen": 99444096, "step": 46070 }, { "epoch": 7.5163132137031, "grad_norm": 0.08931440860033035, "learning_rate": 8.82260240534443e-06, "loss": 0.1461, "num_input_tokens_seen": 99455328, "step": 46075 }, { "epoch": 7.517128874388255, "grad_norm": 19.845977783203125, "learning_rate": 8.817176250149528e-06, "loss": 0.162, "num_input_tokens_seen": 99465152, "step": 46080 }, { "epoch": 7.5179445350734095, "grad_norm": 0.06507433950901031, "learning_rate": 8.811751406816432e-06, "loss": 0.0029, "num_input_tokens_seen": 99475072, "step": 46085 }, { "epoch": 7.518760195758564, "grad_norm": 0.5574439167976379, "learning_rate": 8.806327875784906e-06, "loss": 0.0802, "num_input_tokens_seen": 99485344, "step": 46090 }, { "epoch": 7.519575856443719, "grad_norm": 0.22275541722774506, "learning_rate": 8.800905657494607e-06, "loss": 0.104, "num_input_tokens_seen": 99496864, "step": 46095 }, { "epoch": 7.520391517128875, "grad_norm": 0.13205687701702118, "learning_rate": 8.795484752385088e-06, "loss": 0.0061, "num_input_tokens_seen": 99507520, "step": 46100 }, { "epoch": 7.52120717781403, "grad_norm": 0.09482310712337494, "learning_rate": 8.790065160895797e-06, "loss": 0.1108, "num_input_tokens_seen": 99517664, "step": 46105 }, { "epoch": 7.5220228384991845, "grad_norm": 0.09053410589694977, "learning_rate": 8.784646883466072e-06, "loss": 0.069, "num_input_tokens_seen": 99527392, "step": 46110 }, { "epoch": 7.522838499184339, "grad_norm": 0.06341391056776047, "learning_rate": 8.779229920535148e-06, "loss": 0.134, "num_input_tokens_seen": 99538784, "step": 46115 }, { "epoch": 7.523654159869494, "grad_norm": 1.591096043586731, "learning_rate": 8.77381427254215e-06, "loss": 0.1277, "num_input_tokens_seen": 99549024, "step": 46120 }, { "epoch": 7.524469820554649, "grad_norm": 0.19831347465515137, "learning_rate": 8.768399939926095e-06, "loss": 0.0041, "num_input_tokens_seen": 99560480, "step": 46125 }, { "epoch": 7.525285481239804, "grad_norm": 2.8108818531036377, "learning_rate": 8.762986923125894e-06, "loss": 0.1244, "num_input_tokens_seen": 99571968, "step": 46130 }, { "epoch": 7.5261011419249595, "grad_norm": 0.07803460955619812, "learning_rate": 8.757575222580364e-06, "loss": 0.1148, "num_input_tokens_seen": 99582112, "step": 46135 }, { "epoch": 7.526916802610114, "grad_norm": 0.1227453425526619, "learning_rate": 8.752164838728203e-06, "loss": 0.1457, "num_input_tokens_seen": 99591360, "step": 46140 }, { "epoch": 7.527732463295269, "grad_norm": 0.37101203203201294, "learning_rate": 8.746755772007998e-06, "loss": 0.0168, "num_input_tokens_seen": 99601696, "step": 46145 }, { "epoch": 7.528548123980424, "grad_norm": 2.539811134338379, "learning_rate": 8.74134802285824e-06, "loss": 0.1978, "num_input_tokens_seen": 99612928, "step": 46150 }, { "epoch": 7.529363784665579, "grad_norm": 0.15568888187408447, "learning_rate": 8.735941591717297e-06, "loss": 0.0187, "num_input_tokens_seen": 99622720, "step": 46155 }, { "epoch": 7.5301794453507345, "grad_norm": 0.15910722315311432, "learning_rate": 8.730536479023463e-06, "loss": 0.3713, "num_input_tokens_seen": 99633888, "step": 46160 }, { "epoch": 7.530995106035889, "grad_norm": 3.501009702682495, "learning_rate": 8.72513268521489e-06, "loss": 0.1287, "num_input_tokens_seen": 99644960, "step": 46165 }, { "epoch": 7.531810766721044, "grad_norm": 0.09123460203409195, "learning_rate": 8.719730210729638e-06, "loss": 0.0071, "num_input_tokens_seen": 99656096, "step": 46170 }, { "epoch": 7.532626427406199, "grad_norm": 3.6207470893859863, "learning_rate": 8.714329056005663e-06, "loss": 0.0933, "num_input_tokens_seen": 99666752, "step": 46175 }, { "epoch": 7.533442088091354, "grad_norm": 14.625901222229004, "learning_rate": 8.708929221480808e-06, "loss": 0.1012, "num_input_tokens_seen": 99678016, "step": 46180 }, { "epoch": 7.5342577487765094, "grad_norm": 0.0866745337843895, "learning_rate": 8.703530707592807e-06, "loss": 0.0051, "num_input_tokens_seen": 99687456, "step": 46185 }, { "epoch": 7.535073409461664, "grad_norm": 0.11230406910181046, "learning_rate": 8.698133514779297e-06, "loss": 0.0058, "num_input_tokens_seen": 99699584, "step": 46190 }, { "epoch": 7.535889070146819, "grad_norm": 0.05673373490571976, "learning_rate": 8.692737643477796e-06, "loss": 0.0043, "num_input_tokens_seen": 99710624, "step": 46195 }, { "epoch": 7.536704730831974, "grad_norm": 12.691792488098145, "learning_rate": 8.687343094125726e-06, "loss": 0.1306, "num_input_tokens_seen": 99720704, "step": 46200 }, { "epoch": 7.537520391517129, "grad_norm": 0.04475383833050728, "learning_rate": 8.681949867160396e-06, "loss": 0.1146, "num_input_tokens_seen": 99729984, "step": 46205 }, { "epoch": 7.5383360522022835, "grad_norm": 0.08457732945680618, "learning_rate": 8.676557963019005e-06, "loss": 0.1244, "num_input_tokens_seen": 99741632, "step": 46210 }, { "epoch": 7.539151712887438, "grad_norm": 0.08247902244329453, "learning_rate": 8.67116738213865e-06, "loss": 0.1136, "num_input_tokens_seen": 99750688, "step": 46215 }, { "epoch": 7.539967373572594, "grad_norm": 0.1298837661743164, "learning_rate": 8.66577812495632e-06, "loss": 0.1652, "num_input_tokens_seen": 99761440, "step": 46220 }, { "epoch": 7.540783034257749, "grad_norm": 7.871242046356201, "learning_rate": 8.660390191908892e-06, "loss": 0.0594, "num_input_tokens_seen": 99770912, "step": 46225 }, { "epoch": 7.541598694942904, "grad_norm": 2.9351773262023926, "learning_rate": 8.655003583433144e-06, "loss": 0.1058, "num_input_tokens_seen": 99782368, "step": 46230 }, { "epoch": 7.5424143556280585, "grad_norm": 0.15129609405994415, "learning_rate": 8.649618299965736e-06, "loss": 0.0652, "num_input_tokens_seen": 99794560, "step": 46235 }, { "epoch": 7.543230016313213, "grad_norm": 0.03578079119324684, "learning_rate": 8.644234341943232e-06, "loss": 0.091, "num_input_tokens_seen": 99805184, "step": 46240 }, { "epoch": 7.544045676998369, "grad_norm": 0.09130024164915085, "learning_rate": 8.638851709802082e-06, "loss": 0.1788, "num_input_tokens_seen": 99816160, "step": 46245 }, { "epoch": 7.544861337683524, "grad_norm": 0.16685356199741364, "learning_rate": 8.633470403978625e-06, "loss": 0.0065, "num_input_tokens_seen": 99826784, "step": 46250 }, { "epoch": 7.545676998368679, "grad_norm": 0.07381764054298401, "learning_rate": 8.628090424909091e-06, "loss": 0.0052, "num_input_tokens_seen": 99837920, "step": 46255 }, { "epoch": 7.5464926590538335, "grad_norm": 41.165855407714844, "learning_rate": 8.62271177302963e-06, "loss": 0.1403, "num_input_tokens_seen": 99848672, "step": 46260 }, { "epoch": 7.547308319738988, "grad_norm": 0.10340555012226105, "learning_rate": 8.617334448776246e-06, "loss": 0.1952, "num_input_tokens_seen": 99861216, "step": 46265 }, { "epoch": 7.548123980424144, "grad_norm": 0.08121224492788315, "learning_rate": 8.611958452584859e-06, "loss": 0.1579, "num_input_tokens_seen": 99870784, "step": 46270 }, { "epoch": 7.548939641109299, "grad_norm": 5.6692094802856445, "learning_rate": 8.60658378489127e-06, "loss": 0.0235, "num_input_tokens_seen": 99882272, "step": 46275 }, { "epoch": 7.549755301794454, "grad_norm": 0.13333682715892792, "learning_rate": 8.60121044613118e-06, "loss": 0.0808, "num_input_tokens_seen": 99892992, "step": 46280 }, { "epoch": 7.5505709624796085, "grad_norm": 0.04906648024916649, "learning_rate": 8.595838436740178e-06, "loss": 0.0047, "num_input_tokens_seen": 99903744, "step": 46285 }, { "epoch": 7.551386623164763, "grad_norm": 0.05480366200208664, "learning_rate": 8.590467757153744e-06, "loss": 0.1292, "num_input_tokens_seen": 99914688, "step": 46290 }, { "epoch": 7.552202283849918, "grad_norm": 0.1344582438468933, "learning_rate": 8.585098407807258e-06, "loss": 0.0082, "num_input_tokens_seen": 99925632, "step": 46295 }, { "epoch": 7.553017944535073, "grad_norm": 0.9332029223442078, "learning_rate": 8.579730389135973e-06, "loss": 0.0619, "num_input_tokens_seen": 99935552, "step": 46300 }, { "epoch": 7.553833605220229, "grad_norm": 0.239906445145607, "learning_rate": 8.574363701575067e-06, "loss": 0.0048, "num_input_tokens_seen": 99946880, "step": 46305 }, { "epoch": 7.554649265905383, "grad_norm": 0.14199669659137726, "learning_rate": 8.568998345559581e-06, "loss": 0.1089, "num_input_tokens_seen": 99957792, "step": 46310 }, { "epoch": 7.555464926590538, "grad_norm": 0.14231956005096436, "learning_rate": 8.56363432152446e-06, "loss": 0.1839, "num_input_tokens_seen": 99967168, "step": 46315 }, { "epoch": 7.556280587275693, "grad_norm": 0.13341651856899261, "learning_rate": 8.55827162990454e-06, "loss": 0.0047, "num_input_tokens_seen": 99978784, "step": 46320 }, { "epoch": 7.557096247960848, "grad_norm": 0.09734013676643372, "learning_rate": 8.552910271134545e-06, "loss": 0.1143, "num_input_tokens_seen": 99990144, "step": 46325 }, { "epoch": 7.557911908646004, "grad_norm": 0.05781449005007744, "learning_rate": 8.547550245649095e-06, "loss": 0.0037, "num_input_tokens_seen": 100001568, "step": 46330 }, { "epoch": 7.558727569331158, "grad_norm": 3.64250111579895, "learning_rate": 8.542191553882701e-06, "loss": 0.1238, "num_input_tokens_seen": 100012128, "step": 46335 }, { "epoch": 7.559543230016313, "grad_norm": 0.06417738646268845, "learning_rate": 8.536834196269766e-06, "loss": 0.1774, "num_input_tokens_seen": 100023936, "step": 46340 }, { "epoch": 7.560358890701468, "grad_norm": 3.0087335109710693, "learning_rate": 8.531478173244583e-06, "loss": 0.0928, "num_input_tokens_seen": 100035552, "step": 46345 }, { "epoch": 7.561174551386623, "grad_norm": 0.16188861429691315, "learning_rate": 8.52612348524134e-06, "loss": 0.0044, "num_input_tokens_seen": 100045408, "step": 46350 }, { "epoch": 7.561990212071779, "grad_norm": 0.13789072632789612, "learning_rate": 8.520770132694118e-06, "loss": 0.1002, "num_input_tokens_seen": 100057184, "step": 46355 }, { "epoch": 7.562805872756933, "grad_norm": 0.15002018213272095, "learning_rate": 8.515418116036872e-06, "loss": 0.0066, "num_input_tokens_seen": 100068384, "step": 46360 }, { "epoch": 7.563621533442088, "grad_norm": 1.1017075777053833, "learning_rate": 8.510067435703484e-06, "loss": 0.1029, "num_input_tokens_seen": 100079488, "step": 46365 }, { "epoch": 7.564437194127243, "grad_norm": 0.10850369185209274, "learning_rate": 8.5047180921277e-06, "loss": 0.1924, "num_input_tokens_seen": 100090720, "step": 46370 }, { "epoch": 7.565252854812398, "grad_norm": 0.33265987038612366, "learning_rate": 8.499370085743163e-06, "loss": 0.0203, "num_input_tokens_seen": 100101248, "step": 46375 }, { "epoch": 7.566068515497553, "grad_norm": 0.06324081122875214, "learning_rate": 8.49402341698341e-06, "loss": 0.0055, "num_input_tokens_seen": 100112224, "step": 46380 }, { "epoch": 7.566884176182708, "grad_norm": 0.10510724782943726, "learning_rate": 8.48867808628187e-06, "loss": 0.0127, "num_input_tokens_seen": 100121632, "step": 46385 }, { "epoch": 7.567699836867863, "grad_norm": 0.13526450097560883, "learning_rate": 8.483334094071862e-06, "loss": 0.083, "num_input_tokens_seen": 100133024, "step": 46390 }, { "epoch": 7.568515497553018, "grad_norm": 0.2898355722427368, "learning_rate": 8.477991440786597e-06, "loss": 0.1588, "num_input_tokens_seen": 100143008, "step": 46395 }, { "epoch": 7.569331158238173, "grad_norm": 0.04432517662644386, "learning_rate": 8.472650126859177e-06, "loss": 0.2262, "num_input_tokens_seen": 100153408, "step": 46400 }, { "epoch": 7.570146818923328, "grad_norm": 14.88448715209961, "learning_rate": 8.467310152722599e-06, "loss": 0.2278, "num_input_tokens_seen": 100163744, "step": 46405 }, { "epoch": 7.5709624796084825, "grad_norm": 0.14386697113513947, "learning_rate": 8.461971518809744e-06, "loss": 0.0539, "num_input_tokens_seen": 100172608, "step": 46410 }, { "epoch": 7.571778140293638, "grad_norm": 0.10405105352401733, "learning_rate": 8.456634225553389e-06, "loss": 0.0043, "num_input_tokens_seen": 100182400, "step": 46415 }, { "epoch": 7.572593800978793, "grad_norm": 0.5253939628601074, "learning_rate": 8.451298273386207e-06, "loss": 0.0125, "num_input_tokens_seen": 100191936, "step": 46420 }, { "epoch": 7.573409461663948, "grad_norm": 0.07708753645420074, "learning_rate": 8.445963662740752e-06, "loss": 0.0891, "num_input_tokens_seen": 100202048, "step": 46425 }, { "epoch": 7.574225122349103, "grad_norm": 0.14142228662967682, "learning_rate": 8.440630394049479e-06, "loss": 0.0964, "num_input_tokens_seen": 100213632, "step": 46430 }, { "epoch": 7.575040783034257, "grad_norm": 0.08559701591730118, "learning_rate": 8.435298467744726e-06, "loss": 0.1407, "num_input_tokens_seen": 100224096, "step": 46435 }, { "epoch": 7.575856443719413, "grad_norm": 0.09634073078632355, "learning_rate": 8.429967884258721e-06, "loss": 0.1044, "num_input_tokens_seen": 100234368, "step": 46440 }, { "epoch": 7.576672104404568, "grad_norm": 0.05761324614286423, "learning_rate": 8.424638644023603e-06, "loss": 0.1281, "num_input_tokens_seen": 100244256, "step": 46445 }, { "epoch": 7.577487765089723, "grad_norm": 0.2969125211238861, "learning_rate": 8.419310747471377e-06, "loss": 0.0093, "num_input_tokens_seen": 100255360, "step": 46450 }, { "epoch": 7.578303425774878, "grad_norm": 0.06729293614625931, "learning_rate": 8.413984195033953e-06, "loss": 0.0615, "num_input_tokens_seen": 100265824, "step": 46455 }, { "epoch": 7.579119086460032, "grad_norm": 0.31247928738594055, "learning_rate": 8.408658987143125e-06, "loss": 0.1944, "num_input_tokens_seen": 100276512, "step": 46460 }, { "epoch": 7.579934747145187, "grad_norm": 0.028771821409463882, "learning_rate": 8.403335124230586e-06, "loss": 0.0041, "num_input_tokens_seen": 100286560, "step": 46465 }, { "epoch": 7.580750407830343, "grad_norm": 0.08469962328672409, "learning_rate": 8.3980126067279e-06, "loss": 0.0056, "num_input_tokens_seen": 100297856, "step": 46470 }, { "epoch": 7.581566068515498, "grad_norm": 0.07814224064350128, "learning_rate": 8.392691435066563e-06, "loss": 0.2223, "num_input_tokens_seen": 100308352, "step": 46475 }, { "epoch": 7.582381729200653, "grad_norm": 0.4001646935939789, "learning_rate": 8.387371609677921e-06, "loss": 0.0137, "num_input_tokens_seen": 100317536, "step": 46480 }, { "epoch": 7.583197389885807, "grad_norm": 0.11351838707923889, "learning_rate": 8.382053130993226e-06, "loss": 0.0055, "num_input_tokens_seen": 100327808, "step": 46485 }, { "epoch": 7.584013050570962, "grad_norm": 0.07939837872982025, "learning_rate": 8.376735999443624e-06, "loss": 0.0928, "num_input_tokens_seen": 100339712, "step": 46490 }, { "epoch": 7.584828711256117, "grad_norm": 0.592789351940155, "learning_rate": 8.371420215460149e-06, "loss": 0.1337, "num_input_tokens_seen": 100349696, "step": 46495 }, { "epoch": 7.585644371941273, "grad_norm": 0.10668357461690903, "learning_rate": 8.366105779473723e-06, "loss": 0.139, "num_input_tokens_seen": 100360160, "step": 46500 }, { "epoch": 7.5864600326264275, "grad_norm": 1.1885261535644531, "learning_rate": 8.360792691915163e-06, "loss": 0.0101, "num_input_tokens_seen": 100370848, "step": 46505 }, { "epoch": 7.587275693311582, "grad_norm": 0.5904202461242676, "learning_rate": 8.35548095321517e-06, "loss": 0.0046, "num_input_tokens_seen": 100381248, "step": 46510 }, { "epoch": 7.588091353996737, "grad_norm": 19.061939239501953, "learning_rate": 8.350170563804349e-06, "loss": 0.0812, "num_input_tokens_seen": 100389888, "step": 46515 }, { "epoch": 7.588907014681892, "grad_norm": 0.1632000356912613, "learning_rate": 8.344861524113178e-06, "loss": 0.0426, "num_input_tokens_seen": 100399840, "step": 46520 }, { "epoch": 7.589722675367048, "grad_norm": 0.202743798494339, "learning_rate": 8.339553834572043e-06, "loss": 0.099, "num_input_tokens_seen": 100410848, "step": 46525 }, { "epoch": 7.5905383360522025, "grad_norm": 4.82681941986084, "learning_rate": 8.334247495611208e-06, "loss": 0.2846, "num_input_tokens_seen": 100422368, "step": 46530 }, { "epoch": 7.591353996737357, "grad_norm": 0.11173932254314423, "learning_rate": 8.32894250766083e-06, "loss": 0.1146, "num_input_tokens_seen": 100433504, "step": 46535 }, { "epoch": 7.592169657422512, "grad_norm": 0.11460330337285995, "learning_rate": 8.323638871150962e-06, "loss": 0.0093, "num_input_tokens_seen": 100443584, "step": 46540 }, { "epoch": 7.592985318107667, "grad_norm": 3.662522315979004, "learning_rate": 8.31833658651154e-06, "loss": 0.1426, "num_input_tokens_seen": 100453216, "step": 46545 }, { "epoch": 7.593800978792823, "grad_norm": 0.18638724088668823, "learning_rate": 8.313035654172399e-06, "loss": 0.0771, "num_input_tokens_seen": 100463296, "step": 46550 }, { "epoch": 7.5946166394779775, "grad_norm": 6.057276248931885, "learning_rate": 8.307736074563257e-06, "loss": 0.2101, "num_input_tokens_seen": 100473728, "step": 46555 }, { "epoch": 7.595432300163132, "grad_norm": 0.34686973690986633, "learning_rate": 8.302437848113722e-06, "loss": 0.1298, "num_input_tokens_seen": 100485344, "step": 46560 }, { "epoch": 7.596247960848287, "grad_norm": 3.977979898452759, "learning_rate": 8.297140975253302e-06, "loss": 0.1283, "num_input_tokens_seen": 100495552, "step": 46565 }, { "epoch": 7.597063621533442, "grad_norm": 0.0953311026096344, "learning_rate": 8.291845456411378e-06, "loss": 0.1078, "num_input_tokens_seen": 100506944, "step": 46570 }, { "epoch": 7.597879282218597, "grad_norm": 0.14356254041194916, "learning_rate": 8.286551292017233e-06, "loss": 0.1116, "num_input_tokens_seen": 100517760, "step": 46575 }, { "epoch": 7.598694942903752, "grad_norm": 0.42716148495674133, "learning_rate": 8.281258482500052e-06, "loss": 0.0942, "num_input_tokens_seen": 100528768, "step": 46580 }, { "epoch": 7.599510603588907, "grad_norm": 5.533297061920166, "learning_rate": 8.275967028288886e-06, "loss": 0.0964, "num_input_tokens_seen": 100539392, "step": 46585 }, { "epoch": 7.600326264274062, "grad_norm": 0.12495343387126923, "learning_rate": 8.270676929812692e-06, "loss": 0.1495, "num_input_tokens_seen": 100550176, "step": 46590 }, { "epoch": 7.601141924959217, "grad_norm": 3.771775722503662, "learning_rate": 8.265388187500309e-06, "loss": 0.1383, "num_input_tokens_seen": 100561216, "step": 46595 }, { "epoch": 7.601957585644372, "grad_norm": 0.0826186090707779, "learning_rate": 8.26010080178047e-06, "loss": 0.0052, "num_input_tokens_seen": 100571648, "step": 46600 }, { "epoch": 7.602773246329527, "grad_norm": 2.264418125152588, "learning_rate": 8.254814773081798e-06, "loss": 0.1827, "num_input_tokens_seen": 100583776, "step": 46605 }, { "epoch": 7.603588907014682, "grad_norm": 0.17515042424201965, "learning_rate": 8.249530101832795e-06, "loss": 0.0135, "num_input_tokens_seen": 100592992, "step": 46610 }, { "epoch": 7.604404567699837, "grad_norm": 0.08349408209323883, "learning_rate": 8.244246788461882e-06, "loss": 0.1155, "num_input_tokens_seen": 100603680, "step": 46615 }, { "epoch": 7.605220228384992, "grad_norm": 3.912369728088379, "learning_rate": 8.238964833397341e-06, "loss": 0.1567, "num_input_tokens_seen": 100615808, "step": 46620 }, { "epoch": 7.606035889070147, "grad_norm": 7.789304733276367, "learning_rate": 8.233684237067358e-06, "loss": 0.1143, "num_input_tokens_seen": 100625696, "step": 46625 }, { "epoch": 7.6068515497553015, "grad_norm": 0.26968061923980713, "learning_rate": 8.2284049999e-06, "loss": 0.1099, "num_input_tokens_seen": 100635968, "step": 46630 }, { "epoch": 7.607667210440457, "grad_norm": 0.10891762375831604, "learning_rate": 8.223127122323231e-06, "loss": 0.0378, "num_input_tokens_seen": 100647104, "step": 46635 }, { "epoch": 7.608482871125612, "grad_norm": 0.026683762669563293, "learning_rate": 8.217850604764903e-06, "loss": 0.004, "num_input_tokens_seen": 100656992, "step": 46640 }, { "epoch": 7.609298531810767, "grad_norm": 4.219992637634277, "learning_rate": 8.212575447652757e-06, "loss": 0.1318, "num_input_tokens_seen": 100668288, "step": 46645 }, { "epoch": 7.610114192495922, "grad_norm": 0.18180352449417114, "learning_rate": 8.207301651414423e-06, "loss": 0.216, "num_input_tokens_seen": 100680736, "step": 46650 }, { "epoch": 7.6109298531810765, "grad_norm": 0.15708036720752716, "learning_rate": 8.202029216477425e-06, "loss": 0.0098, "num_input_tokens_seen": 100692000, "step": 46655 }, { "epoch": 7.611745513866231, "grad_norm": 1.7249727249145508, "learning_rate": 8.196758143269168e-06, "loss": 0.1963, "num_input_tokens_seen": 100701440, "step": 46660 }, { "epoch": 7.612561174551386, "grad_norm": 0.10883460938930511, "learning_rate": 8.191488432216957e-06, "loss": 0.1372, "num_input_tokens_seen": 100712224, "step": 46665 }, { "epoch": 7.613376835236542, "grad_norm": 0.7182273268699646, "learning_rate": 8.186220083747975e-06, "loss": 0.0079, "num_input_tokens_seen": 100722912, "step": 46670 }, { "epoch": 7.614192495921697, "grad_norm": 0.07891812920570374, "learning_rate": 8.18095309828931e-06, "loss": 0.0434, "num_input_tokens_seen": 100732384, "step": 46675 }, { "epoch": 7.6150081566068515, "grad_norm": 4.916957855224609, "learning_rate": 8.175687476267915e-06, "loss": 0.2123, "num_input_tokens_seen": 100743328, "step": 46680 }, { "epoch": 7.615823817292006, "grad_norm": 0.1271146982908249, "learning_rate": 8.170423218110667e-06, "loss": 0.1043, "num_input_tokens_seen": 100754944, "step": 46685 }, { "epoch": 7.616639477977161, "grad_norm": 4.182375431060791, "learning_rate": 8.165160324244305e-06, "loss": 0.319, "num_input_tokens_seen": 100767200, "step": 46690 }, { "epoch": 7.617455138662317, "grad_norm": 4.3861494064331055, "learning_rate": 8.15989879509547e-06, "loss": 0.1397, "num_input_tokens_seen": 100777408, "step": 46695 }, { "epoch": 7.618270799347472, "grad_norm": 0.12118413299322128, "learning_rate": 8.15463863109068e-06, "loss": 0.2319, "num_input_tokens_seen": 100787584, "step": 46700 }, { "epoch": 7.6190864600326265, "grad_norm": 0.24924133718013763, "learning_rate": 8.149379832656356e-06, "loss": 0.0904, "num_input_tokens_seen": 100796800, "step": 46705 }, { "epoch": 7.619902120717781, "grad_norm": 0.3655216693878174, "learning_rate": 8.144122400218804e-06, "loss": 0.0055, "num_input_tokens_seen": 100806528, "step": 46710 }, { "epoch": 7.620717781402936, "grad_norm": 0.18927684426307678, "learning_rate": 8.138866334204215e-06, "loss": 0.0129, "num_input_tokens_seen": 100817120, "step": 46715 }, { "epoch": 7.621533442088092, "grad_norm": 5.13013219833374, "learning_rate": 8.133611635038674e-06, "loss": 0.226, "num_input_tokens_seen": 100828000, "step": 46720 }, { "epoch": 7.622349102773247, "grad_norm": 0.08743203431367874, "learning_rate": 8.12835830314815e-06, "loss": 0.0035, "num_input_tokens_seen": 100839424, "step": 46725 }, { "epoch": 7.623164763458401, "grad_norm": 3.035060405731201, "learning_rate": 8.123106338958511e-06, "loss": 0.0833, "num_input_tokens_seen": 100849472, "step": 46730 }, { "epoch": 7.623980424143556, "grad_norm": 8.867013931274414, "learning_rate": 8.117855742895506e-06, "loss": 0.063, "num_input_tokens_seen": 100859232, "step": 46735 }, { "epoch": 7.624796084828711, "grad_norm": 0.35741615295410156, "learning_rate": 8.112606515384772e-06, "loss": 0.0085, "num_input_tokens_seen": 100869888, "step": 46740 }, { "epoch": 7.625611745513866, "grad_norm": 0.07205919176340103, "learning_rate": 8.107358656851838e-06, "loss": 0.26, "num_input_tokens_seen": 100881440, "step": 46745 }, { "epoch": 7.626427406199021, "grad_norm": 0.17037171125411987, "learning_rate": 8.102112167722125e-06, "loss": 0.1244, "num_input_tokens_seen": 100892160, "step": 46750 }, { "epoch": 7.627243066884176, "grad_norm": 0.18843121826648712, "learning_rate": 8.096867048420932e-06, "loss": 0.1191, "num_input_tokens_seen": 100901280, "step": 46755 }, { "epoch": 7.628058727569331, "grad_norm": 10.705406188964844, "learning_rate": 8.091623299373467e-06, "loss": 0.1362, "num_input_tokens_seen": 100913440, "step": 46760 }, { "epoch": 7.628874388254486, "grad_norm": 0.07794339954853058, "learning_rate": 8.08638092100481e-06, "loss": 0.091, "num_input_tokens_seen": 100923840, "step": 46765 }, { "epoch": 7.629690048939641, "grad_norm": 0.14688681066036224, "learning_rate": 8.081139913739936e-06, "loss": 0.0079, "num_input_tokens_seen": 100933824, "step": 46770 }, { "epoch": 7.630505709624796, "grad_norm": 0.2018391191959381, "learning_rate": 8.075900278003703e-06, "loss": 0.0065, "num_input_tokens_seen": 100943424, "step": 46775 }, { "epoch": 7.631321370309951, "grad_norm": 0.33399730920791626, "learning_rate": 8.07066201422087e-06, "loss": 0.0667, "num_input_tokens_seen": 100955040, "step": 46780 }, { "epoch": 7.632137030995106, "grad_norm": 13.152643203735352, "learning_rate": 8.065425122816061e-06, "loss": 0.0238, "num_input_tokens_seen": 100966848, "step": 46785 }, { "epoch": 7.632952691680261, "grad_norm": 0.0843852087855339, "learning_rate": 8.060189604213827e-06, "loss": 0.0583, "num_input_tokens_seen": 100977888, "step": 46790 }, { "epoch": 7.633768352365416, "grad_norm": 0.12785997986793518, "learning_rate": 8.054955458838576e-06, "loss": 0.0046, "num_input_tokens_seen": 100988736, "step": 46795 }, { "epoch": 7.634584013050571, "grad_norm": 0.10067712515592575, "learning_rate": 8.049722687114611e-06, "loss": 0.0902, "num_input_tokens_seen": 100999360, "step": 46800 }, { "epoch": 7.635399673735726, "grad_norm": 0.03972485288977623, "learning_rate": 8.044491289466133e-06, "loss": 0.006, "num_input_tokens_seen": 101011104, "step": 46805 }, { "epoch": 7.636215334420881, "grad_norm": 0.1417984664440155, "learning_rate": 8.039261266317219e-06, "loss": 0.0045, "num_input_tokens_seen": 101021664, "step": 46810 }, { "epoch": 7.637030995106036, "grad_norm": 0.13314402103424072, "learning_rate": 8.034032618091846e-06, "loss": 0.1178, "num_input_tokens_seen": 101032576, "step": 46815 }, { "epoch": 7.637846655791191, "grad_norm": 0.18647818267345428, "learning_rate": 8.028805345213875e-06, "loss": 0.0855, "num_input_tokens_seen": 101043040, "step": 46820 }, { "epoch": 7.638662316476346, "grad_norm": 0.14198535680770874, "learning_rate": 8.023579448107053e-06, "loss": 0.2243, "num_input_tokens_seen": 101054400, "step": 46825 }, { "epoch": 7.6394779771615005, "grad_norm": 0.14789186418056488, "learning_rate": 8.018354927195017e-06, "loss": 0.0073, "num_input_tokens_seen": 101064896, "step": 46830 }, { "epoch": 7.640293637846656, "grad_norm": 0.11056870967149734, "learning_rate": 8.013131782901295e-06, "loss": 0.0051, "num_input_tokens_seen": 101074848, "step": 46835 }, { "epoch": 7.641109298531811, "grad_norm": 0.03951028361916542, "learning_rate": 8.007910015649304e-06, "loss": 0.1201, "num_input_tokens_seen": 101085312, "step": 46840 }, { "epoch": 7.641924959216966, "grad_norm": 0.09886524826288223, "learning_rate": 8.002689625862342e-06, "loss": 0.0044, "num_input_tokens_seen": 101095584, "step": 46845 }, { "epoch": 7.642740619902121, "grad_norm": 0.10876501351594925, "learning_rate": 7.997470613963601e-06, "loss": 0.0033, "num_input_tokens_seen": 101105760, "step": 46850 }, { "epoch": 7.643556280587275, "grad_norm": 0.07424748688936234, "learning_rate": 7.992252980376164e-06, "loss": 0.0852, "num_input_tokens_seen": 101116576, "step": 46855 }, { "epoch": 7.64437194127243, "grad_norm": 0.09375285357236862, "learning_rate": 7.987036725522995e-06, "loss": 0.0044, "num_input_tokens_seen": 101127840, "step": 46860 }, { "epoch": 7.645187601957586, "grad_norm": 0.15094710886478424, "learning_rate": 7.981821849826954e-06, "loss": 0.135, "num_input_tokens_seen": 101140192, "step": 46865 }, { "epoch": 7.646003262642741, "grad_norm": 2.2446706295013428, "learning_rate": 7.976608353710782e-06, "loss": 0.1672, "num_input_tokens_seen": 101150336, "step": 46870 }, { "epoch": 7.646818923327896, "grad_norm": 0.10287173092365265, "learning_rate": 7.971396237597114e-06, "loss": 0.0075, "num_input_tokens_seen": 101161088, "step": 46875 }, { "epoch": 7.64763458401305, "grad_norm": 13.358624458312988, "learning_rate": 7.966185501908469e-06, "loss": 0.0326, "num_input_tokens_seen": 101171776, "step": 46880 }, { "epoch": 7.648450244698205, "grad_norm": 0.06350710988044739, "learning_rate": 7.960976147067254e-06, "loss": 0.0054, "num_input_tokens_seen": 101180544, "step": 46885 }, { "epoch": 7.649265905383361, "grad_norm": 0.39150890707969666, "learning_rate": 7.95576817349577e-06, "loss": 0.008, "num_input_tokens_seen": 101192448, "step": 46890 }, { "epoch": 7.650081566068516, "grad_norm": 0.07359413802623749, "learning_rate": 7.95056158161619e-06, "loss": 0.0165, "num_input_tokens_seen": 101203168, "step": 46895 }, { "epoch": 7.650897226753671, "grad_norm": 0.5913971662521362, "learning_rate": 7.945356371850604e-06, "loss": 0.0067, "num_input_tokens_seen": 101213216, "step": 46900 }, { "epoch": 7.651712887438825, "grad_norm": 0.238386869430542, "learning_rate": 7.940152544620966e-06, "loss": 0.0444, "num_input_tokens_seen": 101224960, "step": 46905 }, { "epoch": 7.65252854812398, "grad_norm": 3.611781597137451, "learning_rate": 7.934950100349123e-06, "loss": 0.1098, "num_input_tokens_seen": 101235296, "step": 46910 }, { "epoch": 7.653344208809135, "grad_norm": 14.35745620727539, "learning_rate": 7.929749039456813e-06, "loss": 0.1562, "num_input_tokens_seen": 101246784, "step": 46915 }, { "epoch": 7.654159869494291, "grad_norm": 0.1352587789297104, "learning_rate": 7.924549362365658e-06, "loss": 0.1563, "num_input_tokens_seen": 101257440, "step": 46920 }, { "epoch": 7.6549755301794455, "grad_norm": 0.047789525240659714, "learning_rate": 7.919351069497163e-06, "loss": 0.0067, "num_input_tokens_seen": 101268672, "step": 46925 }, { "epoch": 7.6557911908646, "grad_norm": 0.049748245626688004, "learning_rate": 7.914154161272746e-06, "loss": 0.095, "num_input_tokens_seen": 101279168, "step": 46930 }, { "epoch": 7.656606851549755, "grad_norm": 5.892858505249023, "learning_rate": 7.908958638113687e-06, "loss": 0.1287, "num_input_tokens_seen": 101290336, "step": 46935 }, { "epoch": 7.65742251223491, "grad_norm": 0.07121598720550537, "learning_rate": 7.903764500441157e-06, "loss": 0.2274, "num_input_tokens_seen": 101301344, "step": 46940 }, { "epoch": 7.658238172920065, "grad_norm": 0.1507032960653305, "learning_rate": 7.898571748676223e-06, "loss": 0.1458, "num_input_tokens_seen": 101312160, "step": 46945 }, { "epoch": 7.6590538336052205, "grad_norm": 3.4130213260650635, "learning_rate": 7.893380383239835e-06, "loss": 0.1016, "num_input_tokens_seen": 101322720, "step": 46950 }, { "epoch": 7.659869494290375, "grad_norm": 0.13280156254768372, "learning_rate": 7.888190404552832e-06, "loss": 0.0049, "num_input_tokens_seen": 101334048, "step": 46955 }, { "epoch": 7.66068515497553, "grad_norm": 3.296661615371704, "learning_rate": 7.883001813035937e-06, "loss": 0.2718, "num_input_tokens_seen": 101345760, "step": 46960 }, { "epoch": 7.661500815660685, "grad_norm": 3.592658281326294, "learning_rate": 7.877814609109769e-06, "loss": 0.1009, "num_input_tokens_seen": 101356416, "step": 46965 }, { "epoch": 7.66231647634584, "grad_norm": 3.6723384857177734, "learning_rate": 7.872628793194823e-06, "loss": 0.3296, "num_input_tokens_seen": 101367360, "step": 46970 }, { "epoch": 7.6631321370309955, "grad_norm": 0.21846187114715576, "learning_rate": 7.86744436571149e-06, "loss": 0.0058, "num_input_tokens_seen": 101378848, "step": 46975 }, { "epoch": 7.66394779771615, "grad_norm": 0.10069020837545395, "learning_rate": 7.86226132708005e-06, "loss": 0.0951, "num_input_tokens_seen": 101389184, "step": 46980 }, { "epoch": 7.664763458401305, "grad_norm": 0.11880723387002945, "learning_rate": 7.85707967772066e-06, "loss": 0.1587, "num_input_tokens_seen": 101399840, "step": 46985 }, { "epoch": 7.66557911908646, "grad_norm": 0.3594798147678375, "learning_rate": 7.851899418053374e-06, "loss": 0.1191, "num_input_tokens_seen": 101410112, "step": 46990 }, { "epoch": 7.666394779771615, "grad_norm": 0.17258435487747192, "learning_rate": 7.846720548498132e-06, "loss": 0.0814, "num_input_tokens_seen": 101420224, "step": 46995 }, { "epoch": 7.6672104404567705, "grad_norm": 0.15103264153003693, "learning_rate": 7.841543069474747e-06, "loss": 0.0225, "num_input_tokens_seen": 101430080, "step": 47000 }, { "epoch": 7.668026101141925, "grad_norm": 0.15950919687747955, "learning_rate": 7.836366981402951e-06, "loss": 0.0619, "num_input_tokens_seen": 101441856, "step": 47005 }, { "epoch": 7.66884176182708, "grad_norm": 4.0336995124816895, "learning_rate": 7.831192284702334e-06, "loss": 0.2092, "num_input_tokens_seen": 101451232, "step": 47010 }, { "epoch": 7.669657422512235, "grad_norm": 0.1788661777973175, "learning_rate": 7.826018979792385e-06, "loss": 0.0923, "num_input_tokens_seen": 101461792, "step": 47015 }, { "epoch": 7.67047308319739, "grad_norm": 0.2176932394504547, "learning_rate": 7.820847067092477e-06, "loss": 0.2219, "num_input_tokens_seen": 101472096, "step": 47020 }, { "epoch": 7.671288743882545, "grad_norm": 0.24234457314014435, "learning_rate": 7.815676547021871e-06, "loss": 0.0071, "num_input_tokens_seen": 101482784, "step": 47025 }, { "epoch": 7.672104404567699, "grad_norm": 0.15751397609710693, "learning_rate": 7.810507419999716e-06, "loss": 0.0795, "num_input_tokens_seen": 101493248, "step": 47030 }, { "epoch": 7.672920065252855, "grad_norm": 0.13083268702030182, "learning_rate": 7.805339686445051e-06, "loss": 0.004, "num_input_tokens_seen": 101504128, "step": 47035 }, { "epoch": 7.67373572593801, "grad_norm": 0.28730228543281555, "learning_rate": 7.800173346776793e-06, "loss": 0.0085, "num_input_tokens_seen": 101515808, "step": 47040 }, { "epoch": 7.674551386623165, "grad_norm": 0.059784840792417526, "learning_rate": 7.795008401413756e-06, "loss": 0.142, "num_input_tokens_seen": 101526976, "step": 47045 }, { "epoch": 7.6753670473083195, "grad_norm": 0.08952692151069641, "learning_rate": 7.789844850774636e-06, "loss": 0.0042, "num_input_tokens_seen": 101537440, "step": 47050 }, { "epoch": 7.676182707993474, "grad_norm": 0.19022005796432495, "learning_rate": 7.784682695278014e-06, "loss": 0.094, "num_input_tokens_seen": 101549280, "step": 47055 }, { "epoch": 7.67699836867863, "grad_norm": 0.11355360597372055, "learning_rate": 7.779521935342363e-06, "loss": 0.0075, "num_input_tokens_seen": 101558720, "step": 47060 }, { "epoch": 7.677814029363785, "grad_norm": 0.03983641788363457, "learning_rate": 7.77436257138604e-06, "loss": 0.2278, "num_input_tokens_seen": 101570400, "step": 47065 }, { "epoch": 7.67862969004894, "grad_norm": 3.2674293518066406, "learning_rate": 7.769204603827282e-06, "loss": 0.1675, "num_input_tokens_seen": 101581472, "step": 47070 }, { "epoch": 7.6794453507340945, "grad_norm": 0.07456944137811661, "learning_rate": 7.764048033084235e-06, "loss": 0.0033, "num_input_tokens_seen": 101592480, "step": 47075 }, { "epoch": 7.680261011419249, "grad_norm": 6.097672462463379, "learning_rate": 7.758892859574906e-06, "loss": 0.0866, "num_input_tokens_seen": 101601952, "step": 47080 }, { "epoch": 7.681076672104405, "grad_norm": 0.1380632221698761, "learning_rate": 7.753739083717204e-06, "loss": 0.0369, "num_input_tokens_seen": 101612384, "step": 47085 }, { "epoch": 7.68189233278956, "grad_norm": 0.21851445734500885, "learning_rate": 7.748586705928917e-06, "loss": 0.0109, "num_input_tokens_seen": 101623872, "step": 47090 }, { "epoch": 7.682707993474715, "grad_norm": 0.11787327378988266, "learning_rate": 7.743435726627726e-06, "loss": 0.0424, "num_input_tokens_seen": 101635072, "step": 47095 }, { "epoch": 7.6835236541598695, "grad_norm": 0.09346489608287811, "learning_rate": 7.738286146231194e-06, "loss": 0.1052, "num_input_tokens_seen": 101644896, "step": 47100 }, { "epoch": 7.684339314845024, "grad_norm": 0.11581163108348846, "learning_rate": 7.733137965156764e-06, "loss": 0.0059, "num_input_tokens_seen": 101655584, "step": 47105 }, { "epoch": 7.685154975530179, "grad_norm": 0.1141398698091507, "learning_rate": 7.727991183821792e-06, "loss": 0.0172, "num_input_tokens_seen": 101666912, "step": 47110 }, { "epoch": 7.685970636215334, "grad_norm": 0.057288192212581635, "learning_rate": 7.722845802643489e-06, "loss": 0.0039, "num_input_tokens_seen": 101676768, "step": 47115 }, { "epoch": 7.68678629690049, "grad_norm": 0.08691194653511047, "learning_rate": 7.71770182203897e-06, "loss": 0.0041, "num_input_tokens_seen": 101687936, "step": 47120 }, { "epoch": 7.6876019575856445, "grad_norm": 3.4730427265167236, "learning_rate": 7.71255924242523e-06, "loss": 0.394, "num_input_tokens_seen": 101698592, "step": 47125 }, { "epoch": 7.688417618270799, "grad_norm": 0.6738741397857666, "learning_rate": 7.707418064219152e-06, "loss": 0.0054, "num_input_tokens_seen": 101709312, "step": 47130 }, { "epoch": 7.689233278955954, "grad_norm": 0.06230625882744789, "learning_rate": 7.702278287837509e-06, "loss": 0.0915, "num_input_tokens_seen": 101720512, "step": 47135 }, { "epoch": 7.690048939641109, "grad_norm": 0.05752212926745415, "learning_rate": 7.697139913696955e-06, "loss": 0.0314, "num_input_tokens_seen": 101732640, "step": 47140 }, { "epoch": 7.690864600326265, "grad_norm": 0.0707167237997055, "learning_rate": 7.692002942214035e-06, "loss": 0.0088, "num_input_tokens_seen": 101742400, "step": 47145 }, { "epoch": 7.691680261011419, "grad_norm": 0.07063805311918259, "learning_rate": 7.686867373805176e-06, "loss": 0.3276, "num_input_tokens_seen": 101751936, "step": 47150 }, { "epoch": 7.692495921696574, "grad_norm": 8.095948219299316, "learning_rate": 7.681733208886693e-06, "loss": 0.1036, "num_input_tokens_seen": 101763136, "step": 47155 }, { "epoch": 7.693311582381729, "grad_norm": 1.144657850265503, "learning_rate": 7.676600447874788e-06, "loss": 0.0264, "num_input_tokens_seen": 101773440, "step": 47160 }, { "epoch": 7.694127243066884, "grad_norm": 0.17921125888824463, "learning_rate": 7.67146909118555e-06, "loss": 0.2127, "num_input_tokens_seen": 101783264, "step": 47165 }, { "epoch": 7.69494290375204, "grad_norm": 0.08309762924909592, "learning_rate": 7.666339139234949e-06, "loss": 0.0865, "num_input_tokens_seen": 101793856, "step": 47170 }, { "epoch": 7.695758564437194, "grad_norm": 0.13982199132442474, "learning_rate": 7.66121059243885e-06, "loss": 0.1073, "num_input_tokens_seen": 101804832, "step": 47175 }, { "epoch": 7.696574225122349, "grad_norm": 0.1988714635372162, "learning_rate": 7.656083451212995e-06, "loss": 0.1206, "num_input_tokens_seen": 101815808, "step": 47180 }, { "epoch": 7.697389885807504, "grad_norm": 0.04910849779844284, "learning_rate": 7.650957715973017e-06, "loss": 0.0895, "num_input_tokens_seen": 101825120, "step": 47185 }, { "epoch": 7.698205546492659, "grad_norm": 0.6744894981384277, "learning_rate": 7.645833387134437e-06, "loss": 0.0997, "num_input_tokens_seen": 101836416, "step": 47190 }, { "epoch": 7.699021207177814, "grad_norm": 0.16725163161754608, "learning_rate": 7.640710465112654e-06, "loss": 0.0979, "num_input_tokens_seen": 101846240, "step": 47195 }, { "epoch": 7.699836867862969, "grad_norm": 0.12873879075050354, "learning_rate": 7.635588950322964e-06, "loss": 0.0226, "num_input_tokens_seen": 101857152, "step": 47200 }, { "epoch": 7.700652528548124, "grad_norm": 0.1449815183877945, "learning_rate": 7.630468843180538e-06, "loss": 0.0943, "num_input_tokens_seen": 101867296, "step": 47205 }, { "epoch": 7.701468189233279, "grad_norm": 0.20121796429157257, "learning_rate": 7.625350144100441e-06, "loss": 0.0073, "num_input_tokens_seen": 101876000, "step": 47210 }, { "epoch": 7.702283849918434, "grad_norm": 1.4556446075439453, "learning_rate": 7.620232853497611e-06, "loss": 0.0052, "num_input_tokens_seen": 101886080, "step": 47215 }, { "epoch": 7.703099510603589, "grad_norm": 0.19958502054214478, "learning_rate": 7.615116971786895e-06, "loss": 0.0085, "num_input_tokens_seen": 101896480, "step": 47220 }, { "epoch": 7.7039151712887435, "grad_norm": 0.017091643065214157, "learning_rate": 7.610002499383012e-06, "loss": 0.0477, "num_input_tokens_seen": 101907136, "step": 47225 }, { "epoch": 7.704730831973899, "grad_norm": 0.07241757214069366, "learning_rate": 7.60488943670056e-06, "loss": 0.1033, "num_input_tokens_seen": 101918624, "step": 47230 }, { "epoch": 7.705546492659054, "grad_norm": 0.07324308902025223, "learning_rate": 7.59977778415403e-06, "loss": 0.0078, "num_input_tokens_seen": 101929408, "step": 47235 }, { "epoch": 7.706362153344209, "grad_norm": 0.05124499648809433, "learning_rate": 7.594667542157796e-06, "loss": 0.0059, "num_input_tokens_seen": 101940736, "step": 47240 }, { "epoch": 7.707177814029364, "grad_norm": 4.194717884063721, "learning_rate": 7.5895587111261325e-06, "loss": 0.0208, "num_input_tokens_seen": 101950848, "step": 47245 }, { "epoch": 7.7079934747145185, "grad_norm": 15.6417236328125, "learning_rate": 7.584451291473177e-06, "loss": 0.1458, "num_input_tokens_seen": 101961376, "step": 47250 }, { "epoch": 7.708809135399674, "grad_norm": 0.12029201537370682, "learning_rate": 7.579345283612968e-06, "loss": 0.0074, "num_input_tokens_seen": 101972448, "step": 47255 }, { "epoch": 7.709624796084829, "grad_norm": 0.1028146743774414, "learning_rate": 7.574240687959422e-06, "loss": 0.0031, "num_input_tokens_seen": 101983712, "step": 47260 }, { "epoch": 7.710440456769984, "grad_norm": 0.1645936667919159, "learning_rate": 7.56913750492634e-06, "loss": 0.1216, "num_input_tokens_seen": 101993792, "step": 47265 }, { "epoch": 7.711256117455139, "grad_norm": 0.09468498826026917, "learning_rate": 7.564035734927419e-06, "loss": 0.0236, "num_input_tokens_seen": 102004800, "step": 47270 }, { "epoch": 7.712071778140293, "grad_norm": 10.034408569335938, "learning_rate": 7.558935378376228e-06, "loss": 0.1065, "num_input_tokens_seen": 102015936, "step": 47275 }, { "epoch": 7.712887438825448, "grad_norm": 0.3068498373031616, "learning_rate": 7.553836435686232e-06, "loss": 0.1882, "num_input_tokens_seen": 102026592, "step": 47280 }, { "epoch": 7.713703099510604, "grad_norm": 6.699936389923096, "learning_rate": 7.5487389072707744e-06, "loss": 0.0686, "num_input_tokens_seen": 102036448, "step": 47285 }, { "epoch": 7.714518760195759, "grad_norm": 3.9150755405426025, "learning_rate": 7.543642793543088e-06, "loss": 0.1662, "num_input_tokens_seen": 102048800, "step": 47290 }, { "epoch": 7.715334420880914, "grad_norm": 0.05595221742987633, "learning_rate": 7.53854809491629e-06, "loss": 0.0428, "num_input_tokens_seen": 102060000, "step": 47295 }, { "epoch": 7.716150081566068, "grad_norm": 0.1571471095085144, "learning_rate": 7.533454811803381e-06, "loss": 0.0086, "num_input_tokens_seen": 102071104, "step": 47300 }, { "epoch": 7.716965742251223, "grad_norm": 4.362936973571777, "learning_rate": 7.528362944617251e-06, "loss": 0.1903, "num_input_tokens_seen": 102082880, "step": 47305 }, { "epoch": 7.717781402936378, "grad_norm": 0.07679007202386856, "learning_rate": 7.523272493770669e-06, "loss": 0.0853, "num_input_tokens_seen": 102094816, "step": 47310 }, { "epoch": 7.718597063621534, "grad_norm": 0.12573713064193726, "learning_rate": 7.518183459676295e-06, "loss": 0.0322, "num_input_tokens_seen": 102105632, "step": 47315 }, { "epoch": 7.719412724306689, "grad_norm": 3.7796518802642822, "learning_rate": 7.513095842746665e-06, "loss": 0.1788, "num_input_tokens_seen": 102115808, "step": 47320 }, { "epoch": 7.720228384991843, "grad_norm": 8.606292724609375, "learning_rate": 7.5080096433942204e-06, "loss": 0.0794, "num_input_tokens_seen": 102127392, "step": 47325 }, { "epoch": 7.721044045676998, "grad_norm": 0.5781602263450623, "learning_rate": 7.502924862031269e-06, "loss": 0.0055, "num_input_tokens_seen": 102139136, "step": 47330 }, { "epoch": 7.721859706362153, "grad_norm": 0.03822608292102814, "learning_rate": 7.497841499070005e-06, "loss": 0.2096, "num_input_tokens_seen": 102150688, "step": 47335 }, { "epoch": 7.722675367047309, "grad_norm": 3.7553579807281494, "learning_rate": 7.4927595549225155e-06, "loss": 0.411, "num_input_tokens_seen": 102161568, "step": 47340 }, { "epoch": 7.7234910277324635, "grad_norm": 0.05443377047777176, "learning_rate": 7.487679030000769e-06, "loss": 0.0035, "num_input_tokens_seen": 102173440, "step": 47345 }, { "epoch": 7.724306688417618, "grad_norm": 0.08578251302242279, "learning_rate": 7.482599924716613e-06, "loss": 0.0029, "num_input_tokens_seen": 102185376, "step": 47350 }, { "epoch": 7.725122349102773, "grad_norm": 0.20941054821014404, "learning_rate": 7.477522239481793e-06, "loss": 0.0051, "num_input_tokens_seen": 102195456, "step": 47355 }, { "epoch": 7.725938009787928, "grad_norm": 3.4213364124298096, "learning_rate": 7.472445974707928e-06, "loss": 0.0877, "num_input_tokens_seen": 102206080, "step": 47360 }, { "epoch": 7.726753670473083, "grad_norm": 5.20107889175415, "learning_rate": 7.467371130806524e-06, "loss": 0.1009, "num_input_tokens_seen": 102216736, "step": 47365 }, { "epoch": 7.7275693311582385, "grad_norm": 0.06946857273578644, "learning_rate": 7.462297708188978e-06, "loss": 0.0033, "num_input_tokens_seen": 102227904, "step": 47370 }, { "epoch": 7.728384991843393, "grad_norm": 1.2200431823730469, "learning_rate": 7.457225707266566e-06, "loss": 0.0073, "num_input_tokens_seen": 102239200, "step": 47375 }, { "epoch": 7.729200652528548, "grad_norm": 0.06667802482843399, "learning_rate": 7.452155128450447e-06, "loss": 0.005, "num_input_tokens_seen": 102251264, "step": 47380 }, { "epoch": 7.730016313213703, "grad_norm": 0.0479462705552578, "learning_rate": 7.447085972151663e-06, "loss": 0.0991, "num_input_tokens_seen": 102262432, "step": 47385 }, { "epoch": 7.730831973898858, "grad_norm": 0.32065635919570923, "learning_rate": 7.4420182387811596e-06, "loss": 0.0699, "num_input_tokens_seen": 102274912, "step": 47390 }, { "epoch": 7.731647634584013, "grad_norm": 0.13018104434013367, "learning_rate": 7.436951928749747e-06, "loss": 0.0122, "num_input_tokens_seen": 102285696, "step": 47395 }, { "epoch": 7.732463295269168, "grad_norm": 4.171314239501953, "learning_rate": 7.431887042468125e-06, "loss": 0.0592, "num_input_tokens_seen": 102296160, "step": 47400 }, { "epoch": 7.733278955954323, "grad_norm": 0.17030827701091766, "learning_rate": 7.426823580346881e-06, "loss": 0.0143, "num_input_tokens_seen": 102307712, "step": 47405 }, { "epoch": 7.734094616639478, "grad_norm": 0.03789321705698967, "learning_rate": 7.421761542796479e-06, "loss": 0.0053, "num_input_tokens_seen": 102320032, "step": 47410 }, { "epoch": 7.734910277324633, "grad_norm": 0.07053886353969574, "learning_rate": 7.41670093022728e-06, "loss": 0.147, "num_input_tokens_seen": 102330656, "step": 47415 }, { "epoch": 7.735725938009788, "grad_norm": 0.12778106331825256, "learning_rate": 7.411641743049522e-06, "loss": 0.1072, "num_input_tokens_seen": 102340704, "step": 47420 }, { "epoch": 7.736541598694943, "grad_norm": 0.06977693736553192, "learning_rate": 7.406583981673315e-06, "loss": 0.0512, "num_input_tokens_seen": 102350752, "step": 47425 }, { "epoch": 7.737357259380098, "grad_norm": 0.02339315600693226, "learning_rate": 7.401527646508691e-06, "loss": 0.0768, "num_input_tokens_seen": 102360800, "step": 47430 }, { "epoch": 7.738172920065253, "grad_norm": 0.150446355342865, "learning_rate": 7.396472737965526e-06, "loss": 0.0613, "num_input_tokens_seen": 102372096, "step": 47435 }, { "epoch": 7.738988580750408, "grad_norm": 0.06412381678819656, "learning_rate": 7.391419256453602e-06, "loss": 0.3177, "num_input_tokens_seen": 102382112, "step": 47440 }, { "epoch": 7.739804241435563, "grad_norm": 0.7351124882698059, "learning_rate": 7.386367202382577e-06, "loss": 0.0049, "num_input_tokens_seen": 102391392, "step": 47445 }, { "epoch": 7.740619902120718, "grad_norm": 2.7334470748901367, "learning_rate": 7.3813165761619975e-06, "loss": 0.0851, "num_input_tokens_seen": 102402720, "step": 47450 }, { "epoch": 7.741435562805873, "grad_norm": 0.056419868022203445, "learning_rate": 7.376267378201293e-06, "loss": 0.2647, "num_input_tokens_seen": 102412992, "step": 47455 }, { "epoch": 7.742251223491028, "grad_norm": 2.9997851848602295, "learning_rate": 7.371219608909777e-06, "loss": 0.1289, "num_input_tokens_seen": 102424416, "step": 47460 }, { "epoch": 7.743066884176183, "grad_norm": 0.060516271740198135, "learning_rate": 7.366173268696646e-06, "loss": 0.1282, "num_input_tokens_seen": 102435296, "step": 47465 }, { "epoch": 7.7438825448613375, "grad_norm": 4.344020843505859, "learning_rate": 7.3611283579709835e-06, "loss": 0.0793, "num_input_tokens_seen": 102445856, "step": 47470 }, { "epoch": 7.744698205546492, "grad_norm": 0.1005115956068039, "learning_rate": 7.356084877141756e-06, "loss": 0.0025, "num_input_tokens_seen": 102457024, "step": 47475 }, { "epoch": 7.745513866231647, "grad_norm": 9.595219612121582, "learning_rate": 7.35104282661781e-06, "loss": 0.06, "num_input_tokens_seen": 102467744, "step": 47480 }, { "epoch": 7.746329526916803, "grad_norm": 9.318653106689453, "learning_rate": 7.346002206807887e-06, "loss": 0.0219, "num_input_tokens_seen": 102477408, "step": 47485 }, { "epoch": 7.747145187601958, "grad_norm": 5.1646881103515625, "learning_rate": 7.340963018120597e-06, "loss": 0.0741, "num_input_tokens_seen": 102487840, "step": 47490 }, { "epoch": 7.7479608482871125, "grad_norm": 0.06552334129810333, "learning_rate": 7.335925260964446e-06, "loss": 0.0026, "num_input_tokens_seen": 102498624, "step": 47495 }, { "epoch": 7.748776508972267, "grad_norm": 0.14217686653137207, "learning_rate": 7.330888935747821e-06, "loss": 0.0049, "num_input_tokens_seen": 102509952, "step": 47500 }, { "epoch": 7.749592169657422, "grad_norm": 0.03769100457429886, "learning_rate": 7.325854042878991e-06, "loss": 0.0022, "num_input_tokens_seen": 102521120, "step": 47505 }, { "epoch": 7.750407830342578, "grad_norm": 0.14183053374290466, "learning_rate": 7.320820582766108e-06, "loss": 0.2305, "num_input_tokens_seen": 102532832, "step": 47510 }, { "epoch": 7.751223491027733, "grad_norm": 0.407478392124176, "learning_rate": 7.315788555817215e-06, "loss": 0.0058, "num_input_tokens_seen": 102542752, "step": 47515 }, { "epoch": 7.7520391517128875, "grad_norm": 0.07920314371585846, "learning_rate": 7.3107579624402286e-06, "loss": 0.0072, "num_input_tokens_seen": 102553920, "step": 47520 }, { "epoch": 7.752854812398042, "grad_norm": 0.12087984383106232, "learning_rate": 7.305728803042949e-06, "loss": 0.1148, "num_input_tokens_seen": 102565088, "step": 47525 }, { "epoch": 7.753670473083197, "grad_norm": 0.10578302294015884, "learning_rate": 7.3007010780330785e-06, "loss": 0.0042, "num_input_tokens_seen": 102576704, "step": 47530 }, { "epoch": 7.754486133768353, "grad_norm": 0.05380365997552872, "learning_rate": 7.295674787818188e-06, "loss": 0.2452, "num_input_tokens_seen": 102588000, "step": 47535 }, { "epoch": 7.755301794453508, "grad_norm": 1.0776162147521973, "learning_rate": 7.290649932805726e-06, "loss": 0.0142, "num_input_tokens_seen": 102598144, "step": 47540 }, { "epoch": 7.7561174551386625, "grad_norm": 0.0860818549990654, "learning_rate": 7.285626513403038e-06, "loss": 0.0297, "num_input_tokens_seen": 102608256, "step": 47545 }, { "epoch": 7.756933115823817, "grad_norm": 0.14374639093875885, "learning_rate": 7.2806045300173484e-06, "loss": 0.0559, "num_input_tokens_seen": 102618528, "step": 47550 }, { "epoch": 7.757748776508972, "grad_norm": 0.7806813716888428, "learning_rate": 7.275583983055753e-06, "loss": 0.167, "num_input_tokens_seen": 102629632, "step": 47555 }, { "epoch": 7.758564437194127, "grad_norm": 0.09024453163146973, "learning_rate": 7.2705648729252615e-06, "loss": 0.0047, "num_input_tokens_seen": 102639712, "step": 47560 }, { "epoch": 7.759380097879282, "grad_norm": 0.06424576789140701, "learning_rate": 7.265547200032738e-06, "loss": 0.009, "num_input_tokens_seen": 102650272, "step": 47565 }, { "epoch": 7.760195758564437, "grad_norm": 0.13625803589820862, "learning_rate": 7.260530964784945e-06, "loss": 0.1459, "num_input_tokens_seen": 102661216, "step": 47570 }, { "epoch": 7.761011419249592, "grad_norm": 0.09976625442504883, "learning_rate": 7.2555161675885195e-06, "loss": 0.0838, "num_input_tokens_seen": 102671200, "step": 47575 }, { "epoch": 7.761827079934747, "grad_norm": 0.12595100700855255, "learning_rate": 7.250502808849988e-06, "loss": 0.1217, "num_input_tokens_seen": 102682144, "step": 47580 }, { "epoch": 7.762642740619902, "grad_norm": 3.1571035385131836, "learning_rate": 7.2454908889757586e-06, "loss": 0.1977, "num_input_tokens_seen": 102693376, "step": 47585 }, { "epoch": 7.763458401305057, "grad_norm": 0.11334747821092606, "learning_rate": 7.240480408372125e-06, "loss": 0.0052, "num_input_tokens_seen": 102704800, "step": 47590 }, { "epoch": 7.764274061990212, "grad_norm": 2.676896572113037, "learning_rate": 7.235471367445257e-06, "loss": 0.2064, "num_input_tokens_seen": 102715776, "step": 47595 }, { "epoch": 7.765089722675367, "grad_norm": 10.041728973388672, "learning_rate": 7.2304637666012195e-06, "loss": 0.2402, "num_input_tokens_seen": 102727232, "step": 47600 }, { "epoch": 7.765905383360522, "grad_norm": 0.1420239955186844, "learning_rate": 7.22545760624595e-06, "loss": 0.015, "num_input_tokens_seen": 102737568, "step": 47605 }, { "epoch": 7.766721044045677, "grad_norm": 0.06168798729777336, "learning_rate": 7.2204528867852725e-06, "loss": 0.2718, "num_input_tokens_seen": 102749728, "step": 47610 }, { "epoch": 7.767536704730832, "grad_norm": 1.830137848854065, "learning_rate": 7.215449608624899e-06, "loss": 0.0053, "num_input_tokens_seen": 102760256, "step": 47615 }, { "epoch": 7.768352365415987, "grad_norm": 0.15203669667243958, "learning_rate": 7.210447772170418e-06, "loss": 0.0383, "num_input_tokens_seen": 102771648, "step": 47620 }, { "epoch": 7.769168026101142, "grad_norm": 0.1616964191198349, "learning_rate": 7.205447377827301e-06, "loss": 0.1632, "num_input_tokens_seen": 102782912, "step": 47625 }, { "epoch": 7.769983686786297, "grad_norm": 0.06401637196540833, "learning_rate": 7.200448426000911e-06, "loss": 0.1192, "num_input_tokens_seen": 102795200, "step": 47630 }, { "epoch": 7.770799347471452, "grad_norm": 0.3197227418422699, "learning_rate": 7.195450917096483e-06, "loss": 0.1945, "num_input_tokens_seen": 102805600, "step": 47635 }, { "epoch": 7.771615008156607, "grad_norm": 3.529085397720337, "learning_rate": 7.190454851519138e-06, "loss": 0.1202, "num_input_tokens_seen": 102816800, "step": 47640 }, { "epoch": 7.7724306688417615, "grad_norm": 0.09904611110687256, "learning_rate": 7.185460229673893e-06, "loss": 0.105, "num_input_tokens_seen": 102827968, "step": 47645 }, { "epoch": 7.773246329526917, "grad_norm": 0.019798442721366882, "learning_rate": 7.180467051965634e-06, "loss": 0.0173, "num_input_tokens_seen": 102838400, "step": 47650 }, { "epoch": 7.774061990212072, "grad_norm": 0.40912890434265137, "learning_rate": 7.17547531879913e-06, "loss": 0.0053, "num_input_tokens_seen": 102848704, "step": 47655 }, { "epoch": 7.774877650897227, "grad_norm": 0.08395379036664963, "learning_rate": 7.170485030579038e-06, "loss": 0.0054, "num_input_tokens_seen": 102859296, "step": 47660 }, { "epoch": 7.775693311582382, "grad_norm": 0.03093654103577137, "learning_rate": 7.165496187709894e-06, "loss": 0.0088, "num_input_tokens_seen": 102870592, "step": 47665 }, { "epoch": 7.7765089722675365, "grad_norm": 7.923911094665527, "learning_rate": 7.160508790596121e-06, "loss": 0.1545, "num_input_tokens_seen": 102881568, "step": 47670 }, { "epoch": 7.777324632952691, "grad_norm": 12.883262634277344, "learning_rate": 7.155522839642023e-06, "loss": 0.14, "num_input_tokens_seen": 102892320, "step": 47675 }, { "epoch": 7.778140293637847, "grad_norm": 1.4507733583450317, "learning_rate": 7.150538335251786e-06, "loss": 0.1976, "num_input_tokens_seen": 102903456, "step": 47680 }, { "epoch": 7.778955954323002, "grad_norm": 0.08981003612279892, "learning_rate": 7.1455552778294775e-06, "loss": 0.08, "num_input_tokens_seen": 102914208, "step": 47685 }, { "epoch": 7.779771615008157, "grad_norm": 5.994715690612793, "learning_rate": 7.140573667779052e-06, "loss": 0.1083, "num_input_tokens_seen": 102925152, "step": 47690 }, { "epoch": 7.780587275693311, "grad_norm": 0.1755041629076004, "learning_rate": 7.1355935055043314e-06, "loss": 0.1214, "num_input_tokens_seen": 102936064, "step": 47695 }, { "epoch": 7.781402936378466, "grad_norm": 0.062174130231142044, "learning_rate": 7.130614791409057e-06, "loss": 0.0663, "num_input_tokens_seen": 102947200, "step": 47700 }, { "epoch": 7.782218597063622, "grad_norm": 3.0506484508514404, "learning_rate": 7.125637525896814e-06, "loss": 0.1461, "num_input_tokens_seen": 102957216, "step": 47705 }, { "epoch": 7.783034257748777, "grad_norm": 0.035387951880693436, "learning_rate": 7.1206617093710845e-06, "loss": 0.0032, "num_input_tokens_seen": 102968160, "step": 47710 }, { "epoch": 7.783849918433932, "grad_norm": 0.10718975961208344, "learning_rate": 7.115687342235239e-06, "loss": 0.0384, "num_input_tokens_seen": 102978912, "step": 47715 }, { "epoch": 7.784665579119086, "grad_norm": 7.703681468963623, "learning_rate": 7.11071442489252e-06, "loss": 0.374, "num_input_tokens_seen": 102989568, "step": 47720 }, { "epoch": 7.785481239804241, "grad_norm": 0.05116169899702072, "learning_rate": 7.1057429577460584e-06, "loss": 0.0058, "num_input_tokens_seen": 103000512, "step": 47725 }, { "epoch": 7.786296900489396, "grad_norm": 0.051105331629514694, "learning_rate": 7.100772941198869e-06, "loss": 0.125, "num_input_tokens_seen": 103012320, "step": 47730 }, { "epoch": 7.787112561174552, "grad_norm": 0.026301920413970947, "learning_rate": 7.095804375653844e-06, "loss": 0.0046, "num_input_tokens_seen": 103023936, "step": 47735 }, { "epoch": 7.787928221859707, "grad_norm": 11.353914260864258, "learning_rate": 7.090837261513764e-06, "loss": 0.0834, "num_input_tokens_seen": 103033696, "step": 47740 }, { "epoch": 7.788743882544861, "grad_norm": 5.692740440368652, "learning_rate": 7.085871599181274e-06, "loss": 0.1808, "num_input_tokens_seen": 103045280, "step": 47745 }, { "epoch": 7.789559543230016, "grad_norm": 0.1049162745475769, "learning_rate": 7.0809073890589356e-06, "loss": 0.0042, "num_input_tokens_seen": 103055584, "step": 47750 }, { "epoch": 7.790375203915171, "grad_norm": 0.1505405306816101, "learning_rate": 7.075944631549167e-06, "loss": 0.0808, "num_input_tokens_seen": 103066880, "step": 47755 }, { "epoch": 7.791190864600326, "grad_norm": 0.05198020488023758, "learning_rate": 7.07098332705427e-06, "loss": 0.0047, "num_input_tokens_seen": 103077568, "step": 47760 }, { "epoch": 7.7920065252854815, "grad_norm": 0.0652122050523758, "learning_rate": 7.066023475976438e-06, "loss": 0.0054, "num_input_tokens_seen": 103088672, "step": 47765 }, { "epoch": 7.792822185970636, "grad_norm": 5.139396667480469, "learning_rate": 7.061065078717738e-06, "loss": 0.119, "num_input_tokens_seen": 103098496, "step": 47770 }, { "epoch": 7.793637846655791, "grad_norm": 3.967467784881592, "learning_rate": 7.056108135680123e-06, "loss": 0.2102, "num_input_tokens_seen": 103109856, "step": 47775 }, { "epoch": 7.794453507340946, "grad_norm": 0.09969165921211243, "learning_rate": 7.05115264726543e-06, "loss": 0.1134, "num_input_tokens_seen": 103120032, "step": 47780 }, { "epoch": 7.795269168026101, "grad_norm": 4.543612003326416, "learning_rate": 7.046198613875374e-06, "loss": 0.1984, "num_input_tokens_seen": 103131072, "step": 47785 }, { "epoch": 7.7960848287112565, "grad_norm": 3.157122850418091, "learning_rate": 7.0412460359115555e-06, "loss": 0.241, "num_input_tokens_seen": 103142240, "step": 47790 }, { "epoch": 7.796900489396411, "grad_norm": 5.2432541847229, "learning_rate": 7.0362949137754565e-06, "loss": 0.1828, "num_input_tokens_seen": 103152960, "step": 47795 }, { "epoch": 7.797716150081566, "grad_norm": 0.23773658275604248, "learning_rate": 7.031345247868437e-06, "loss": 0.1168, "num_input_tokens_seen": 103164352, "step": 47800 }, { "epoch": 7.798531810766721, "grad_norm": 27.523258209228516, "learning_rate": 7.026397038591745e-06, "loss": 0.1761, "num_input_tokens_seen": 103175680, "step": 47805 }, { "epoch": 7.799347471451876, "grad_norm": 0.08605588227510452, "learning_rate": 7.021450286346503e-06, "loss": 0.0046, "num_input_tokens_seen": 103185952, "step": 47810 }, { "epoch": 7.800163132137031, "grad_norm": 3.771977663040161, "learning_rate": 7.016504991533726e-06, "loss": 0.32, "num_input_tokens_seen": 103195840, "step": 47815 }, { "epoch": 7.800978792822186, "grad_norm": 0.09736517071723938, "learning_rate": 7.011561154554303e-06, "loss": 0.0859, "num_input_tokens_seen": 103207104, "step": 47820 }, { "epoch": 7.801794453507341, "grad_norm": 0.1261928230524063, "learning_rate": 7.006618775809001e-06, "loss": 0.0963, "num_input_tokens_seen": 103218624, "step": 47825 }, { "epoch": 7.802610114192496, "grad_norm": 0.15593859553337097, "learning_rate": 7.001677855698482e-06, "loss": 0.0132, "num_input_tokens_seen": 103230592, "step": 47830 }, { "epoch": 7.803425774877651, "grad_norm": 0.16541704535484314, "learning_rate": 6.996738394623279e-06, "loss": 0.2095, "num_input_tokens_seen": 103241984, "step": 47835 }, { "epoch": 7.804241435562806, "grad_norm": 6.937445163726807, "learning_rate": 6.991800392983799e-06, "loss": 0.2821, "num_input_tokens_seen": 103252800, "step": 47840 }, { "epoch": 7.80505709624796, "grad_norm": 0.06043032184243202, "learning_rate": 6.9868638511803615e-06, "loss": 0.004, "num_input_tokens_seen": 103265440, "step": 47845 }, { "epoch": 7.805872756933116, "grad_norm": 0.2144368439912796, "learning_rate": 6.9819287696131355e-06, "loss": 0.0094, "num_input_tokens_seen": 103274976, "step": 47850 }, { "epoch": 7.806688417618271, "grad_norm": 0.1217925027012825, "learning_rate": 6.9769951486821885e-06, "loss": 0.0168, "num_input_tokens_seen": 103285344, "step": 47855 }, { "epoch": 7.807504078303426, "grad_norm": 0.174509659409523, "learning_rate": 6.972062988787462e-06, "loss": 0.0048, "num_input_tokens_seen": 103296576, "step": 47860 }, { "epoch": 7.808319738988581, "grad_norm": 0.14375446736812592, "learning_rate": 6.9671322903287765e-06, "loss": 0.1212, "num_input_tokens_seen": 103306880, "step": 47865 }, { "epoch": 7.809135399673735, "grad_norm": 0.13414366543293, "learning_rate": 6.962203053705851e-06, "loss": 0.061, "num_input_tokens_seen": 103317760, "step": 47870 }, { "epoch": 7.809951060358891, "grad_norm": 5.264692306518555, "learning_rate": 6.957275279318268e-06, "loss": 0.1892, "num_input_tokens_seen": 103328096, "step": 47875 }, { "epoch": 7.810766721044046, "grad_norm": 4.860972881317139, "learning_rate": 6.9523489675655e-06, "loss": 0.1944, "num_input_tokens_seen": 103338560, "step": 47880 }, { "epoch": 7.811582381729201, "grad_norm": 5.393228054046631, "learning_rate": 6.9474241188468985e-06, "loss": 0.1765, "num_input_tokens_seen": 103349824, "step": 47885 }, { "epoch": 7.8123980424143555, "grad_norm": 0.14536328613758087, "learning_rate": 6.942500733561694e-06, "loss": 0.0046, "num_input_tokens_seen": 103361056, "step": 47890 }, { "epoch": 7.81321370309951, "grad_norm": 0.07104796916246414, "learning_rate": 6.937578812109005e-06, "loss": 0.0186, "num_input_tokens_seen": 103371520, "step": 47895 }, { "epoch": 7.814029363784666, "grad_norm": 1.0618921518325806, "learning_rate": 6.932658354887825e-06, "loss": 0.0074, "num_input_tokens_seen": 103382784, "step": 47900 }, { "epoch": 7.814845024469821, "grad_norm": 0.1532667577266693, "learning_rate": 6.927739362297028e-06, "loss": 0.2538, "num_input_tokens_seen": 103393376, "step": 47905 }, { "epoch": 7.815660685154976, "grad_norm": 0.04023406654596329, "learning_rate": 6.92282183473538e-06, "loss": 0.1004, "num_input_tokens_seen": 103404000, "step": 47910 }, { "epoch": 7.8164763458401305, "grad_norm": 0.021517453715205193, "learning_rate": 6.917905772601516e-06, "loss": 0.0859, "num_input_tokens_seen": 103414848, "step": 47915 }, { "epoch": 7.817292006525285, "grad_norm": 0.06931926310062408, "learning_rate": 6.912991176293957e-06, "loss": 0.0091, "num_input_tokens_seen": 103426144, "step": 47920 }, { "epoch": 7.81810766721044, "grad_norm": 0.09444573521614075, "learning_rate": 6.908078046211105e-06, "loss": 0.082, "num_input_tokens_seen": 103435584, "step": 47925 }, { "epoch": 7.818923327895595, "grad_norm": 0.04047401621937752, "learning_rate": 6.903166382751244e-06, "loss": 0.0055, "num_input_tokens_seen": 103446464, "step": 47930 }, { "epoch": 7.819738988580751, "grad_norm": 0.858544647693634, "learning_rate": 6.8982561863125405e-06, "loss": 0.2054, "num_input_tokens_seen": 103457248, "step": 47935 }, { "epoch": 7.8205546492659055, "grad_norm": 1.1933023929595947, "learning_rate": 6.893347457293036e-06, "loss": 0.1077, "num_input_tokens_seen": 103468128, "step": 47940 }, { "epoch": 7.82137030995106, "grad_norm": 0.04212098568677902, "learning_rate": 6.888440196090659e-06, "loss": 0.1546, "num_input_tokens_seen": 103478656, "step": 47945 }, { "epoch": 7.822185970636215, "grad_norm": 0.17217549681663513, "learning_rate": 6.8835344031032175e-06, "loss": 0.0422, "num_input_tokens_seen": 103489472, "step": 47950 }, { "epoch": 7.82300163132137, "grad_norm": 2.7660112380981445, "learning_rate": 6.878630078728399e-06, "loss": 0.233, "num_input_tokens_seen": 103500384, "step": 47955 }, { "epoch": 7.823817292006526, "grad_norm": 3.457015037536621, "learning_rate": 6.873727223363766e-06, "loss": 0.2738, "num_input_tokens_seen": 103510432, "step": 47960 }, { "epoch": 7.8246329526916805, "grad_norm": 0.10388149321079254, "learning_rate": 6.868825837406784e-06, "loss": 0.0071, "num_input_tokens_seen": 103520544, "step": 47965 }, { "epoch": 7.825448613376835, "grad_norm": 2.991614818572998, "learning_rate": 6.8639259212547764e-06, "loss": 0.1083, "num_input_tokens_seen": 103531264, "step": 47970 }, { "epoch": 7.82626427406199, "grad_norm": 0.08335016667842865, "learning_rate": 6.859027475304955e-06, "loss": 0.0059, "num_input_tokens_seen": 103542176, "step": 47975 }, { "epoch": 7.827079934747145, "grad_norm": 0.11649288237094879, "learning_rate": 6.854130499954411e-06, "loss": 0.0086, "num_input_tokens_seen": 103552608, "step": 47980 }, { "epoch": 7.827895595432301, "grad_norm": 0.1272554099559784, "learning_rate": 6.849234995600121e-06, "loss": 0.1022, "num_input_tokens_seen": 103563552, "step": 47985 }, { "epoch": 7.828711256117455, "grad_norm": 0.14843489229679108, "learning_rate": 6.84434096263894e-06, "loss": 0.1328, "num_input_tokens_seen": 103575296, "step": 47990 }, { "epoch": 7.82952691680261, "grad_norm": 0.1605493575334549, "learning_rate": 6.839448401467599e-06, "loss": 0.0048, "num_input_tokens_seen": 103586304, "step": 47995 }, { "epoch": 7.830342577487765, "grad_norm": 3.597090721130371, "learning_rate": 6.834557312482717e-06, "loss": 0.1724, "num_input_tokens_seen": 103596768, "step": 48000 }, { "epoch": 7.83115823817292, "grad_norm": 11.16240406036377, "learning_rate": 6.8296676960807906e-06, "loss": 0.1301, "num_input_tokens_seen": 103607808, "step": 48005 }, { "epoch": 7.831973898858075, "grad_norm": 0.18940110504627228, "learning_rate": 6.824779552658189e-06, "loss": 0.09, "num_input_tokens_seen": 103618848, "step": 48010 }, { "epoch": 7.8327895595432295, "grad_norm": 0.22899290919303894, "learning_rate": 6.819892882611184e-06, "loss": 0.0242, "num_input_tokens_seen": 103629600, "step": 48015 }, { "epoch": 7.833605220228385, "grad_norm": 0.11677180975675583, "learning_rate": 6.8150076863359054e-06, "loss": 0.0046, "num_input_tokens_seen": 103639744, "step": 48020 }, { "epoch": 7.83442088091354, "grad_norm": 0.09852508455514908, "learning_rate": 6.810123964228374e-06, "loss": 0.1979, "num_input_tokens_seen": 103649472, "step": 48025 }, { "epoch": 7.835236541598695, "grad_norm": 3.817969560623169, "learning_rate": 6.8052417166844905e-06, "loss": 0.0794, "num_input_tokens_seen": 103659968, "step": 48030 }, { "epoch": 7.83605220228385, "grad_norm": 0.9190351366996765, "learning_rate": 6.800360944100031e-06, "loss": 0.0068, "num_input_tokens_seen": 103671584, "step": 48035 }, { "epoch": 7.8368678629690045, "grad_norm": 3.154757499694824, "learning_rate": 6.795481646870658e-06, "loss": 0.1681, "num_input_tokens_seen": 103681984, "step": 48040 }, { "epoch": 7.83768352365416, "grad_norm": 0.13393941521644592, "learning_rate": 6.790603825391912e-06, "loss": 0.2417, "num_input_tokens_seen": 103692544, "step": 48045 }, { "epoch": 7.838499184339315, "grad_norm": 0.05127906799316406, "learning_rate": 6.785727480059212e-06, "loss": 0.0065, "num_input_tokens_seen": 103703488, "step": 48050 }, { "epoch": 7.83931484502447, "grad_norm": 0.05056251958012581, "learning_rate": 6.78085261126786e-06, "loss": 0.008, "num_input_tokens_seen": 103714464, "step": 48055 }, { "epoch": 7.840130505709625, "grad_norm": 0.2773749530315399, "learning_rate": 6.775979219413042e-06, "loss": 0.0077, "num_input_tokens_seen": 103725184, "step": 48060 }, { "epoch": 7.8409461663947795, "grad_norm": 0.4287366569042206, "learning_rate": 6.771107304889807e-06, "loss": 0.0062, "num_input_tokens_seen": 103735520, "step": 48065 }, { "epoch": 7.841761827079935, "grad_norm": 7.799998760223389, "learning_rate": 6.766236868093112e-06, "loss": 0.0993, "num_input_tokens_seen": 103747072, "step": 48070 }, { "epoch": 7.84257748776509, "grad_norm": 0.17426146566867828, "learning_rate": 6.761367909417776e-06, "loss": 0.101, "num_input_tokens_seen": 103757632, "step": 48075 }, { "epoch": 7.843393148450245, "grad_norm": 6.004482269287109, "learning_rate": 6.756500429258497e-06, "loss": 0.1365, "num_input_tokens_seen": 103768576, "step": 48080 }, { "epoch": 7.8442088091354, "grad_norm": 0.04564683511853218, "learning_rate": 6.751634428009862e-06, "loss": 0.1335, "num_input_tokens_seen": 103779840, "step": 48085 }, { "epoch": 7.8450244698205545, "grad_norm": 0.14614471793174744, "learning_rate": 6.7467699060663305e-06, "loss": 0.1128, "num_input_tokens_seen": 103789664, "step": 48090 }, { "epoch": 7.845840130505709, "grad_norm": 0.05062619596719742, "learning_rate": 6.741906863822248e-06, "loss": 0.1937, "num_input_tokens_seen": 103800832, "step": 48095 }, { "epoch": 7.846655791190865, "grad_norm": 1.9862357378005981, "learning_rate": 6.737045301671832e-06, "loss": 0.0629, "num_input_tokens_seen": 103811616, "step": 48100 }, { "epoch": 7.84747145187602, "grad_norm": 0.442433625459671, "learning_rate": 6.7321852200091935e-06, "loss": 0.3016, "num_input_tokens_seen": 103822464, "step": 48105 }, { "epoch": 7.848287112561175, "grad_norm": 0.09706331789493561, "learning_rate": 6.727326619228308e-06, "loss": 0.006, "num_input_tokens_seen": 103833984, "step": 48110 }, { "epoch": 7.849102773246329, "grad_norm": 0.04700395464897156, "learning_rate": 6.722469499723042e-06, "loss": 0.0034, "num_input_tokens_seen": 103845664, "step": 48115 }, { "epoch": 7.849918433931484, "grad_norm": 0.26358553767204285, "learning_rate": 6.717613861887137e-06, "loss": 0.392, "num_input_tokens_seen": 103855936, "step": 48120 }, { "epoch": 7.850734094616639, "grad_norm": 13.11994743347168, "learning_rate": 6.712759706114219e-06, "loss": 0.0195, "num_input_tokens_seen": 103866304, "step": 48125 }, { "epoch": 7.851549755301795, "grad_norm": 0.14919701218605042, "learning_rate": 6.707907032797786e-06, "loss": 0.0119, "num_input_tokens_seen": 103877280, "step": 48130 }, { "epoch": 7.85236541598695, "grad_norm": 4.979850769042969, "learning_rate": 6.703055842331221e-06, "loss": 0.1614, "num_input_tokens_seen": 103888192, "step": 48135 }, { "epoch": 7.853181076672104, "grad_norm": 0.3740044832229614, "learning_rate": 6.698206135107787e-06, "loss": 0.0678, "num_input_tokens_seen": 103897568, "step": 48140 }, { "epoch": 7.853996737357259, "grad_norm": 0.21097080409526825, "learning_rate": 6.6933579115206284e-06, "loss": 0.0069, "num_input_tokens_seen": 103908448, "step": 48145 }, { "epoch": 7.854812398042414, "grad_norm": 22.14005470275879, "learning_rate": 6.6885111719627635e-06, "loss": 0.069, "num_input_tokens_seen": 103917856, "step": 48150 }, { "epoch": 7.85562805872757, "grad_norm": 0.09356860816478729, "learning_rate": 6.683665916827087e-06, "loss": 0.119, "num_input_tokens_seen": 103929792, "step": 48155 }, { "epoch": 7.856443719412725, "grad_norm": 26.533403396606445, "learning_rate": 6.678822146506394e-06, "loss": 0.1676, "num_input_tokens_seen": 103939488, "step": 48160 }, { "epoch": 7.857259380097879, "grad_norm": 0.1259954422712326, "learning_rate": 6.67397986139334e-06, "loss": 0.0077, "num_input_tokens_seen": 103949600, "step": 48165 }, { "epoch": 7.858075040783034, "grad_norm": 0.32674962282180786, "learning_rate": 6.669139061880464e-06, "loss": 0.2732, "num_input_tokens_seen": 103959936, "step": 48170 }, { "epoch": 7.858890701468189, "grad_norm": 0.1663704812526703, "learning_rate": 6.664299748360184e-06, "loss": 0.0041, "num_input_tokens_seen": 103971136, "step": 48175 }, { "epoch": 7.859706362153344, "grad_norm": 0.1198531985282898, "learning_rate": 6.659461921224794e-06, "loss": 0.1066, "num_input_tokens_seen": 103981760, "step": 48180 }, { "epoch": 7.8605220228384995, "grad_norm": 0.5928356051445007, "learning_rate": 6.654625580866486e-06, "loss": 0.0093, "num_input_tokens_seen": 103992480, "step": 48185 }, { "epoch": 7.861337683523654, "grad_norm": 1.333174228668213, "learning_rate": 6.649790727677313e-06, "loss": 0.0108, "num_input_tokens_seen": 104002720, "step": 48190 }, { "epoch": 7.862153344208809, "grad_norm": 3.8798046112060547, "learning_rate": 6.644957362049212e-06, "loss": 0.1092, "num_input_tokens_seen": 104011968, "step": 48195 }, { "epoch": 7.862969004893964, "grad_norm": 7.625709533691406, "learning_rate": 6.640125484373999e-06, "loss": 0.1359, "num_input_tokens_seen": 104023328, "step": 48200 }, { "epoch": 7.863784665579119, "grad_norm": 3.2316322326660156, "learning_rate": 6.635295095043373e-06, "loss": 0.1565, "num_input_tokens_seen": 104032896, "step": 48205 }, { "epoch": 7.864600326264274, "grad_norm": 29.43779182434082, "learning_rate": 6.630466194448906e-06, "loss": 0.0824, "num_input_tokens_seen": 104044128, "step": 48210 }, { "epoch": 7.865415986949429, "grad_norm": 0.30524954199790955, "learning_rate": 6.625638782982058e-06, "loss": 0.0058, "num_input_tokens_seen": 104055808, "step": 48215 }, { "epoch": 7.866231647634584, "grad_norm": 0.0591411255300045, "learning_rate": 6.620812861034159e-06, "loss": 0.126, "num_input_tokens_seen": 104066816, "step": 48220 }, { "epoch": 7.867047308319739, "grad_norm": 4.141035079956055, "learning_rate": 6.615988428996426e-06, "loss": 0.123, "num_input_tokens_seen": 104078624, "step": 48225 }, { "epoch": 7.867862969004894, "grad_norm": 0.016146790236234665, "learning_rate": 6.611165487259946e-06, "loss": 0.0037, "num_input_tokens_seen": 104090592, "step": 48230 }, { "epoch": 7.868678629690049, "grad_norm": 0.14641910791397095, "learning_rate": 6.6063440362157e-06, "loss": 0.2131, "num_input_tokens_seen": 104101376, "step": 48235 }, { "epoch": 7.869494290375204, "grad_norm": 5.0318450927734375, "learning_rate": 6.601524076254534e-06, "loss": 0.1209, "num_input_tokens_seen": 104111968, "step": 48240 }, { "epoch": 7.870309951060359, "grad_norm": 0.09114504605531693, "learning_rate": 6.5967056077671785e-06, "loss": 0.0056, "num_input_tokens_seen": 104123616, "step": 48245 }, { "epoch": 7.871125611745514, "grad_norm": 0.19192178547382355, "learning_rate": 6.591888631144244e-06, "loss": 0.0098, "num_input_tokens_seen": 104134176, "step": 48250 }, { "epoch": 7.871941272430669, "grad_norm": 0.16777630150318146, "learning_rate": 6.587073146776221e-06, "loss": 0.0134, "num_input_tokens_seen": 104144704, "step": 48255 }, { "epoch": 7.872756933115824, "grad_norm": 0.10116895288228989, "learning_rate": 6.582259155053472e-06, "loss": 0.005, "num_input_tokens_seen": 104155648, "step": 48260 }, { "epoch": 7.873572593800979, "grad_norm": 0.3359253406524658, "learning_rate": 6.577446656366248e-06, "loss": 0.0092, "num_input_tokens_seen": 104166080, "step": 48265 }, { "epoch": 7.874388254486134, "grad_norm": 0.10709737986326218, "learning_rate": 6.572635651104672e-06, "loss": 0.1907, "num_input_tokens_seen": 104177152, "step": 48270 }, { "epoch": 7.875203915171289, "grad_norm": 0.183837428689003, "learning_rate": 6.56782613965875e-06, "loss": 0.0903, "num_input_tokens_seen": 104187104, "step": 48275 }, { "epoch": 7.876019575856444, "grad_norm": 0.24258825182914734, "learning_rate": 6.56301812241836e-06, "loss": 0.1851, "num_input_tokens_seen": 104198176, "step": 48280 }, { "epoch": 7.876835236541599, "grad_norm": 3.056492805480957, "learning_rate": 6.558211599773273e-06, "loss": 0.2398, "num_input_tokens_seen": 104208896, "step": 48285 }, { "epoch": 7.877650897226753, "grad_norm": 0.07631750404834747, "learning_rate": 6.55340657211313e-06, "loss": 0.0541, "num_input_tokens_seen": 104220512, "step": 48290 }, { "epoch": 7.878466557911908, "grad_norm": 3.429745674133301, "learning_rate": 6.5486030398274444e-06, "loss": 0.2371, "num_input_tokens_seen": 104230816, "step": 48295 }, { "epoch": 7.879282218597064, "grad_norm": 1.1050548553466797, "learning_rate": 6.543801003305619e-06, "loss": 0.0085, "num_input_tokens_seen": 104242240, "step": 48300 }, { "epoch": 7.880097879282219, "grad_norm": 0.06108357384800911, "learning_rate": 6.539000462936931e-06, "loss": 0.0061, "num_input_tokens_seen": 104253376, "step": 48305 }, { "epoch": 7.8809135399673735, "grad_norm": 0.15179091691970825, "learning_rate": 6.534201419110536e-06, "loss": 0.0255, "num_input_tokens_seen": 104264864, "step": 48310 }, { "epoch": 7.881729200652528, "grad_norm": 2.2326877117156982, "learning_rate": 6.529403872215467e-06, "loss": 0.0546, "num_input_tokens_seen": 104275712, "step": 48315 }, { "epoch": 7.882544861337683, "grad_norm": 0.08912886679172516, "learning_rate": 6.524607822640638e-06, "loss": 0.0042, "num_input_tokens_seen": 104287328, "step": 48320 }, { "epoch": 7.883360522022839, "grad_norm": 0.3407483696937561, "learning_rate": 6.519813270774835e-06, "loss": 0.0048, "num_input_tokens_seen": 104297344, "step": 48325 }, { "epoch": 7.884176182707994, "grad_norm": 0.053344812244176865, "learning_rate": 6.515020217006745e-06, "loss": 0.0052, "num_input_tokens_seen": 104308992, "step": 48330 }, { "epoch": 7.8849918433931485, "grad_norm": 0.1516156941652298, "learning_rate": 6.510228661724907e-06, "loss": 0.1034, "num_input_tokens_seen": 104319616, "step": 48335 }, { "epoch": 7.885807504078303, "grad_norm": 0.0773768424987793, "learning_rate": 6.5054386053177515e-06, "loss": 0.0078, "num_input_tokens_seen": 104331520, "step": 48340 }, { "epoch": 7.886623164763458, "grad_norm": 0.17040883004665375, "learning_rate": 6.500650048173582e-06, "loss": 0.0066, "num_input_tokens_seen": 104341664, "step": 48345 }, { "epoch": 7.887438825448614, "grad_norm": 4.182097434997559, "learning_rate": 6.495862990680585e-06, "loss": 0.1941, "num_input_tokens_seen": 104352640, "step": 48350 }, { "epoch": 7.888254486133769, "grad_norm": 0.11969134956598282, "learning_rate": 6.4910774332268195e-06, "loss": 0.1603, "num_input_tokens_seen": 104362496, "step": 48355 }, { "epoch": 7.8890701468189235, "grad_norm": 0.05024494603276253, "learning_rate": 6.486293376200234e-06, "loss": 0.1065, "num_input_tokens_seen": 104372192, "step": 48360 }, { "epoch": 7.889885807504078, "grad_norm": 0.12330567091703415, "learning_rate": 6.481510819988645e-06, "loss": 0.0069, "num_input_tokens_seen": 104383168, "step": 48365 }, { "epoch": 7.890701468189233, "grad_norm": 0.037438564002513885, "learning_rate": 6.47672976497975e-06, "loss": 0.0037, "num_input_tokens_seen": 104393632, "step": 48370 }, { "epoch": 7.891517128874388, "grad_norm": 0.08976734429597855, "learning_rate": 6.471950211561125e-06, "loss": 0.2618, "num_input_tokens_seen": 104404096, "step": 48375 }, { "epoch": 7.892332789559543, "grad_norm": 0.10741182416677475, "learning_rate": 6.46717216012023e-06, "loss": 0.063, "num_input_tokens_seen": 104414592, "step": 48380 }, { "epoch": 7.8931484502446985, "grad_norm": 0.9168614745140076, "learning_rate": 6.462395611044383e-06, "loss": 0.0085, "num_input_tokens_seen": 104424736, "step": 48385 }, { "epoch": 7.893964110929853, "grad_norm": 0.20880509912967682, "learning_rate": 6.457620564720815e-06, "loss": 0.0054, "num_input_tokens_seen": 104434976, "step": 48390 }, { "epoch": 7.894779771615008, "grad_norm": 0.07101184874773026, "learning_rate": 6.452847021536609e-06, "loss": 0.1205, "num_input_tokens_seen": 104445920, "step": 48395 }, { "epoch": 7.895595432300163, "grad_norm": 0.09273168444633484, "learning_rate": 6.44807498187873e-06, "loss": 0.244, "num_input_tokens_seen": 104455584, "step": 48400 }, { "epoch": 7.896411092985318, "grad_norm": 0.051877886056900024, "learning_rate": 6.443304446134024e-06, "loss": 0.0047, "num_input_tokens_seen": 104466304, "step": 48405 }, { "epoch": 7.897226753670473, "grad_norm": 0.043426841497421265, "learning_rate": 6.438535414689215e-06, "loss": 0.0051, "num_input_tokens_seen": 104476096, "step": 48410 }, { "epoch": 7.898042414355628, "grad_norm": 0.26901328563690186, "learning_rate": 6.4337678879309055e-06, "loss": 0.0567, "num_input_tokens_seen": 104486656, "step": 48415 }, { "epoch": 7.898858075040783, "grad_norm": 4.49504280090332, "learning_rate": 6.4290018662455764e-06, "loss": 0.1719, "num_input_tokens_seen": 104497600, "step": 48420 }, { "epoch": 7.899673735725938, "grad_norm": 0.18512462079524994, "learning_rate": 6.424237350019582e-06, "loss": 0.0659, "num_input_tokens_seen": 104508384, "step": 48425 }, { "epoch": 7.900489396411093, "grad_norm": 0.2118251919746399, "learning_rate": 6.419474339639161e-06, "loss": 0.0066, "num_input_tokens_seen": 104519136, "step": 48430 }, { "epoch": 7.901305057096248, "grad_norm": 0.22650903463363647, "learning_rate": 6.414712835490428e-06, "loss": 0.0041, "num_input_tokens_seen": 104529728, "step": 48435 }, { "epoch": 7.902120717781403, "grad_norm": 0.09416618198156357, "learning_rate": 6.409952837959374e-06, "loss": 0.0758, "num_input_tokens_seen": 104541888, "step": 48440 }, { "epoch": 7.902936378466558, "grad_norm": 0.12530404329299927, "learning_rate": 6.405194347431864e-06, "loss": 0.0034, "num_input_tokens_seen": 104552736, "step": 48445 }, { "epoch": 7.903752039151713, "grad_norm": 3.2568163871765137, "learning_rate": 6.400437364293655e-06, "loss": 0.1064, "num_input_tokens_seen": 104562464, "step": 48450 }, { "epoch": 7.904567699836868, "grad_norm": 0.12182630598545074, "learning_rate": 6.395681888930361e-06, "loss": 0.0293, "num_input_tokens_seen": 104573600, "step": 48455 }, { "epoch": 7.9053833605220225, "grad_norm": 0.12934081256389618, "learning_rate": 6.390927921727494e-06, "loss": 0.0064, "num_input_tokens_seen": 104584800, "step": 48460 }, { "epoch": 7.906199021207177, "grad_norm": 0.8263911604881287, "learning_rate": 6.386175463070429e-06, "loss": 0.1427, "num_input_tokens_seen": 104596192, "step": 48465 }, { "epoch": 7.907014681892333, "grad_norm": 2.735842227935791, "learning_rate": 6.3814245133444196e-06, "loss": 0.1951, "num_input_tokens_seen": 104606976, "step": 48470 }, { "epoch": 7.907830342577488, "grad_norm": 6.71162223815918, "learning_rate": 6.376675072934618e-06, "loss": 0.153, "num_input_tokens_seen": 104618304, "step": 48475 }, { "epoch": 7.908646003262643, "grad_norm": 0.39882490038871765, "learning_rate": 6.371927142226028e-06, "loss": 0.0051, "num_input_tokens_seen": 104627840, "step": 48480 }, { "epoch": 7.9094616639477975, "grad_norm": 0.19895535707473755, "learning_rate": 6.367180721603541e-06, "loss": 0.0083, "num_input_tokens_seen": 104639296, "step": 48485 }, { "epoch": 7.910277324632952, "grad_norm": 24.118324279785156, "learning_rate": 6.3624358114519275e-06, "loss": 0.1069, "num_input_tokens_seen": 104648992, "step": 48490 }, { "epoch": 7.911092985318108, "grad_norm": 0.043225184082984924, "learning_rate": 6.3576924121558246e-06, "loss": 0.0035, "num_input_tokens_seen": 104659680, "step": 48495 }, { "epoch": 7.911908646003263, "grad_norm": 0.02157299593091011, "learning_rate": 6.352950524099774e-06, "loss": 0.1461, "num_input_tokens_seen": 104670688, "step": 48500 }, { "epoch": 7.912724306688418, "grad_norm": 0.05414982885122299, "learning_rate": 6.348210147668165e-06, "loss": 0.1177, "num_input_tokens_seen": 104681600, "step": 48505 }, { "epoch": 7.9135399673735725, "grad_norm": 0.39657047390937805, "learning_rate": 6.343471283245283e-06, "loss": 0.0792, "num_input_tokens_seen": 104692672, "step": 48510 }, { "epoch": 7.914355628058727, "grad_norm": 0.06217885762453079, "learning_rate": 6.33873393121528e-06, "loss": 0.004, "num_input_tokens_seen": 104703296, "step": 48515 }, { "epoch": 7.915171288743883, "grad_norm": 0.033534884452819824, "learning_rate": 6.33399809196219e-06, "loss": 0.0851, "num_input_tokens_seen": 104715648, "step": 48520 }, { "epoch": 7.915986949429038, "grad_norm": 4.821717262268066, "learning_rate": 6.329263765869925e-06, "loss": 0.1039, "num_input_tokens_seen": 104724864, "step": 48525 }, { "epoch": 7.916802610114193, "grad_norm": 3.659557819366455, "learning_rate": 6.324530953322275e-06, "loss": 0.131, "num_input_tokens_seen": 104734592, "step": 48530 }, { "epoch": 7.917618270799347, "grad_norm": 6.6302056312561035, "learning_rate": 6.319799654702904e-06, "loss": 0.0903, "num_input_tokens_seen": 104744992, "step": 48535 }, { "epoch": 7.918433931484502, "grad_norm": 0.024048956111073494, "learning_rate": 6.315069870395354e-06, "loss": 0.1195, "num_input_tokens_seen": 104756512, "step": 48540 }, { "epoch": 7.919249592169657, "grad_norm": 6.27457332611084, "learning_rate": 6.310341600783049e-06, "loss": 0.2202, "num_input_tokens_seen": 104767840, "step": 48545 }, { "epoch": 7.920065252854813, "grad_norm": 0.27394771575927734, "learning_rate": 6.305614846249283e-06, "loss": 0.122, "num_input_tokens_seen": 104778944, "step": 48550 }, { "epoch": 7.920880913539968, "grad_norm": 0.05479596182703972, "learning_rate": 6.300889607177229e-06, "loss": 0.0889, "num_input_tokens_seen": 104789088, "step": 48555 }, { "epoch": 7.921696574225122, "grad_norm": 0.11633160710334778, "learning_rate": 6.296165883949947e-06, "loss": 0.1402, "num_input_tokens_seen": 104800032, "step": 48560 }, { "epoch": 7.922512234910277, "grad_norm": 1.6217732429504395, "learning_rate": 6.291443676950357e-06, "loss": 0.0125, "num_input_tokens_seen": 104810720, "step": 48565 }, { "epoch": 7.923327895595432, "grad_norm": 0.1704300045967102, "learning_rate": 6.286722986561272e-06, "loss": 0.0053, "num_input_tokens_seen": 104820896, "step": 48570 }, { "epoch": 7.924143556280587, "grad_norm": 0.15686701238155365, "learning_rate": 6.282003813165368e-06, "loss": 0.1773, "num_input_tokens_seen": 104831808, "step": 48575 }, { "epoch": 7.924959216965743, "grad_norm": 0.061979733407497406, "learning_rate": 6.2772861571452125e-06, "loss": 0.0268, "num_input_tokens_seen": 104841920, "step": 48580 }, { "epoch": 7.925774877650897, "grad_norm": 0.050359781831502914, "learning_rate": 6.272570018883236e-06, "loss": 0.0933, "num_input_tokens_seen": 104853408, "step": 48585 }, { "epoch": 7.926590538336052, "grad_norm": 0.05893663316965103, "learning_rate": 6.26785539876176e-06, "loss": 0.1893, "num_input_tokens_seen": 104862880, "step": 48590 }, { "epoch": 7.927406199021207, "grad_norm": 0.32345038652420044, "learning_rate": 6.2631422971629605e-06, "loss": 0.2086, "num_input_tokens_seen": 104873952, "step": 48595 }, { "epoch": 7.928221859706362, "grad_norm": 0.36849120259284973, "learning_rate": 6.2584307144689245e-06, "loss": 0.0038, "num_input_tokens_seen": 104884128, "step": 48600 }, { "epoch": 7.9290375203915175, "grad_norm": 0.05867064371705055, "learning_rate": 6.25372065106159e-06, "loss": 0.0709, "num_input_tokens_seen": 104895296, "step": 48605 }, { "epoch": 7.929853181076672, "grad_norm": 0.05543160438537598, "learning_rate": 6.249012107322774e-06, "loss": 0.0085, "num_input_tokens_seen": 104905760, "step": 48610 }, { "epoch": 7.930668841761827, "grad_norm": 0.09976672381162643, "learning_rate": 6.244305083634181e-06, "loss": 0.0156, "num_input_tokens_seen": 104916864, "step": 48615 }, { "epoch": 7.931484502446982, "grad_norm": 0.11526015400886536, "learning_rate": 6.239599580377381e-06, "loss": 0.1241, "num_input_tokens_seen": 104927104, "step": 48620 }, { "epoch": 7.932300163132137, "grad_norm": 17.803361892700195, "learning_rate": 6.234895597933832e-06, "loss": 0.0556, "num_input_tokens_seen": 104937632, "step": 48625 }, { "epoch": 7.933115823817292, "grad_norm": 3.604686737060547, "learning_rate": 6.2301931366848555e-06, "loss": 0.297, "num_input_tokens_seen": 104948448, "step": 48630 }, { "epoch": 7.933931484502447, "grad_norm": 0.1886691451072693, "learning_rate": 6.225492197011654e-06, "loss": 0.0221, "num_input_tokens_seen": 104958688, "step": 48635 }, { "epoch": 7.934747145187602, "grad_norm": 0.12465963512659073, "learning_rate": 6.220792779295326e-06, "loss": 0.0046, "num_input_tokens_seen": 104969408, "step": 48640 }, { "epoch": 7.935562805872757, "grad_norm": 0.08143021911382675, "learning_rate": 6.216094883916815e-06, "loss": 0.0065, "num_input_tokens_seen": 104979968, "step": 48645 }, { "epoch": 7.936378466557912, "grad_norm": 0.1900239884853363, "learning_rate": 6.211398511256966e-06, "loss": 0.1017, "num_input_tokens_seen": 104990688, "step": 48650 }, { "epoch": 7.937194127243067, "grad_norm": 4.787083625793457, "learning_rate": 6.206703661696484e-06, "loss": 0.0626, "num_input_tokens_seen": 105002048, "step": 48655 }, { "epoch": 7.938009787928221, "grad_norm": 0.132158562541008, "learning_rate": 6.20201033561596e-06, "loss": 0.0542, "num_input_tokens_seen": 105013056, "step": 48660 }, { "epoch": 7.938825448613377, "grad_norm": 3.366595983505249, "learning_rate": 6.197318533395858e-06, "loss": 0.2686, "num_input_tokens_seen": 105021856, "step": 48665 }, { "epoch": 7.939641109298532, "grad_norm": 2.141418695449829, "learning_rate": 6.192628255416519e-06, "loss": 0.2362, "num_input_tokens_seen": 105031936, "step": 48670 }, { "epoch": 7.940456769983687, "grad_norm": 5.5656023025512695, "learning_rate": 6.18793950205816e-06, "loss": 0.3258, "num_input_tokens_seen": 105044000, "step": 48675 }, { "epoch": 7.941272430668842, "grad_norm": 3.6275901794433594, "learning_rate": 6.183252273700879e-06, "loss": 0.1159, "num_input_tokens_seen": 105054272, "step": 48680 }, { "epoch": 7.942088091353996, "grad_norm": 0.09550347924232483, "learning_rate": 6.178566570724642e-06, "loss": 0.0055, "num_input_tokens_seen": 105065984, "step": 48685 }, { "epoch": 7.942903752039152, "grad_norm": 4.608447551727295, "learning_rate": 6.1738823935092975e-06, "loss": 0.1095, "num_input_tokens_seen": 105077440, "step": 48690 }, { "epoch": 7.943719412724307, "grad_norm": 1.4750423431396484, "learning_rate": 6.16919974243457e-06, "loss": 0.0965, "num_input_tokens_seen": 105087104, "step": 48695 }, { "epoch": 7.944535073409462, "grad_norm": 0.16015009582042694, "learning_rate": 6.164518617880058e-06, "loss": 0.0268, "num_input_tokens_seen": 105097824, "step": 48700 }, { "epoch": 7.945350734094617, "grad_norm": 0.10820517688989639, "learning_rate": 6.159839020225231e-06, "loss": 0.0042, "num_input_tokens_seen": 105108064, "step": 48705 }, { "epoch": 7.946166394779771, "grad_norm": 0.08304480463266373, "learning_rate": 6.155160949849453e-06, "loss": 0.004, "num_input_tokens_seen": 105118784, "step": 48710 }, { "epoch": 7.946982055464927, "grad_norm": 0.127269446849823, "learning_rate": 6.150484407131945e-06, "loss": 0.1073, "num_input_tokens_seen": 105128608, "step": 48715 }, { "epoch": 7.947797716150082, "grad_norm": 0.13916225731372833, "learning_rate": 6.145809392451815e-06, "loss": 0.0049, "num_input_tokens_seen": 105138720, "step": 48720 }, { "epoch": 7.948613376835237, "grad_norm": 0.12183847278356552, "learning_rate": 6.141135906188039e-06, "loss": 0.0196, "num_input_tokens_seen": 105150368, "step": 48725 }, { "epoch": 7.9494290375203915, "grad_norm": 0.19774994254112244, "learning_rate": 6.136463948719475e-06, "loss": 0.0217, "num_input_tokens_seen": 105160544, "step": 48730 }, { "epoch": 7.950244698205546, "grad_norm": 0.22997406125068665, "learning_rate": 6.131793520424859e-06, "loss": 0.1491, "num_input_tokens_seen": 105170784, "step": 48735 }, { "epoch": 7.951060358890701, "grad_norm": 0.17153584957122803, "learning_rate": 6.1271246216827945e-06, "loss": 0.1337, "num_input_tokens_seen": 105180960, "step": 48740 }, { "epoch": 7.951876019575856, "grad_norm": 0.09357471764087677, "learning_rate": 6.122457252871769e-06, "loss": 0.094, "num_input_tokens_seen": 105191200, "step": 48745 }, { "epoch": 7.952691680261012, "grad_norm": 6.854122161865234, "learning_rate": 6.117791414370141e-06, "loss": 0.0299, "num_input_tokens_seen": 105202176, "step": 48750 }, { "epoch": 7.9535073409461665, "grad_norm": 1.0626060962677002, "learning_rate": 6.113127106556149e-06, "loss": 0.2266, "num_input_tokens_seen": 105212352, "step": 48755 }, { "epoch": 7.954323001631321, "grad_norm": 0.368046373128891, "learning_rate": 6.108464329807903e-06, "loss": 0.1021, "num_input_tokens_seen": 105222464, "step": 48760 }, { "epoch": 7.955138662316476, "grad_norm": 0.19278091192245483, "learning_rate": 6.1038030845033956e-06, "loss": 0.0058, "num_input_tokens_seen": 105233984, "step": 48765 }, { "epoch": 7.955954323001631, "grad_norm": 0.07780694961547852, "learning_rate": 6.0991433710204885e-06, "loss": 0.004, "num_input_tokens_seen": 105244768, "step": 48770 }, { "epoch": 7.956769983686787, "grad_norm": 0.050757069140672684, "learning_rate": 6.0944851897369206e-06, "loss": 0.2335, "num_input_tokens_seen": 105255680, "step": 48775 }, { "epoch": 7.9575856443719415, "grad_norm": 0.022331800311803818, "learning_rate": 6.0898285410303015e-06, "loss": 0.0032, "num_input_tokens_seen": 105268608, "step": 48780 }, { "epoch": 7.958401305057096, "grad_norm": 3.4827897548675537, "learning_rate": 6.085173425278137e-06, "loss": 0.1042, "num_input_tokens_seen": 105278464, "step": 48785 }, { "epoch": 7.959216965742251, "grad_norm": 0.14530052244663239, "learning_rate": 6.080519842857787e-06, "loss": 0.0065, "num_input_tokens_seen": 105289856, "step": 48790 }, { "epoch": 7.960032626427406, "grad_norm": 0.05929364264011383, "learning_rate": 6.075867794146497e-06, "loss": 0.0576, "num_input_tokens_seen": 105300160, "step": 48795 }, { "epoch": 7.960848287112562, "grad_norm": 0.08685368299484253, "learning_rate": 6.071217279521382e-06, "loss": 0.0626, "num_input_tokens_seen": 105312352, "step": 48800 }, { "epoch": 7.9616639477977165, "grad_norm": 0.15442676842212677, "learning_rate": 6.06656829935944e-06, "loss": 0.0215, "num_input_tokens_seen": 105324544, "step": 48805 }, { "epoch": 7.962479608482871, "grad_norm": 0.07632876187562943, "learning_rate": 6.061920854037531e-06, "loss": 0.0954, "num_input_tokens_seen": 105335072, "step": 48810 }, { "epoch": 7.963295269168026, "grad_norm": 0.16926124691963196, "learning_rate": 6.0572749439324146e-06, "loss": 0.0035, "num_input_tokens_seen": 105344960, "step": 48815 }, { "epoch": 7.964110929853181, "grad_norm": 0.22158217430114746, "learning_rate": 6.052630569420706e-06, "loss": 0.004, "num_input_tokens_seen": 105355264, "step": 48820 }, { "epoch": 7.964926590538336, "grad_norm": 0.06808106601238251, "learning_rate": 6.047987730878904e-06, "loss": 0.0749, "num_input_tokens_seen": 105366624, "step": 48825 }, { "epoch": 7.9657422512234906, "grad_norm": 0.026783689856529236, "learning_rate": 6.043346428683375e-06, "loss": 0.1196, "num_input_tokens_seen": 105377888, "step": 48830 }, { "epoch": 7.966557911908646, "grad_norm": 0.04408380389213562, "learning_rate": 6.0387066632103695e-06, "loss": 0.0098, "num_input_tokens_seen": 105388192, "step": 48835 }, { "epoch": 7.967373572593801, "grad_norm": 8.979509353637695, "learning_rate": 6.034068434836013e-06, "loss": 0.3422, "num_input_tokens_seen": 105400256, "step": 48840 }, { "epoch": 7.968189233278956, "grad_norm": 1.1973680257797241, "learning_rate": 6.029431743936298e-06, "loss": 0.1211, "num_input_tokens_seen": 105411328, "step": 48845 }, { "epoch": 7.969004893964111, "grad_norm": 2.606157064437866, "learning_rate": 6.024796590887105e-06, "loss": 0.063, "num_input_tokens_seen": 105421408, "step": 48850 }, { "epoch": 7.9698205546492655, "grad_norm": 0.08284850418567657, "learning_rate": 6.020162976064178e-06, "loss": 0.0047, "num_input_tokens_seen": 105431872, "step": 48855 }, { "epoch": 7.970636215334421, "grad_norm": 0.16866230964660645, "learning_rate": 6.0155308998431415e-06, "loss": 0.1046, "num_input_tokens_seen": 105443776, "step": 48860 }, { "epoch": 7.971451876019576, "grad_norm": 0.1879003942012787, "learning_rate": 6.0109003625994975e-06, "loss": 0.005, "num_input_tokens_seen": 105453088, "step": 48865 }, { "epoch": 7.972267536704731, "grad_norm": 0.08856741338968277, "learning_rate": 6.006271364708621e-06, "loss": 0.0213, "num_input_tokens_seen": 105463744, "step": 48870 }, { "epoch": 7.973083197389886, "grad_norm": 0.10097008943557739, "learning_rate": 6.0016439065457595e-06, "loss": 0.0042, "num_input_tokens_seen": 105474752, "step": 48875 }, { "epoch": 7.9738988580750405, "grad_norm": 0.07519751787185669, "learning_rate": 5.997017988486039e-06, "loss": 0.0062, "num_input_tokens_seen": 105486016, "step": 48880 }, { "epoch": 7.974714518760196, "grad_norm": 0.04896562919020653, "learning_rate": 5.992393610904465e-06, "loss": 0.0071, "num_input_tokens_seen": 105496800, "step": 48885 }, { "epoch": 7.975530179445351, "grad_norm": 0.03369162604212761, "learning_rate": 5.987770774175905e-06, "loss": 0.0125, "num_input_tokens_seen": 105506528, "step": 48890 }, { "epoch": 7.976345840130506, "grad_norm": 3.082150459289551, "learning_rate": 5.983149478675113e-06, "loss": 0.0669, "num_input_tokens_seen": 105517312, "step": 48895 }, { "epoch": 7.977161500815661, "grad_norm": 0.021023431792855263, "learning_rate": 5.978529724776713e-06, "loss": 0.205, "num_input_tokens_seen": 105526976, "step": 48900 }, { "epoch": 7.9779771615008155, "grad_norm": 0.16616760194301605, "learning_rate": 5.97391151285521e-06, "loss": 0.0041, "num_input_tokens_seen": 105537408, "step": 48905 }, { "epoch": 7.97879282218597, "grad_norm": 0.19769129157066345, "learning_rate": 5.969294843284978e-06, "loss": 0.0757, "num_input_tokens_seen": 105547712, "step": 48910 }, { "epoch": 7.979608482871125, "grad_norm": 0.08259648829698563, "learning_rate": 5.964679716440258e-06, "loss": 0.1356, "num_input_tokens_seen": 105557760, "step": 48915 }, { "epoch": 7.980424143556281, "grad_norm": 5.66948938369751, "learning_rate": 5.9600661326951916e-06, "loss": 0.2767, "num_input_tokens_seen": 105568800, "step": 48920 }, { "epoch": 7.981239804241436, "grad_norm": 0.044336091727018356, "learning_rate": 5.955454092423773e-06, "loss": 0.003, "num_input_tokens_seen": 105578944, "step": 48925 }, { "epoch": 7.9820554649265905, "grad_norm": 6.642369747161865, "learning_rate": 5.950843595999877e-06, "loss": 0.0746, "num_input_tokens_seen": 105589664, "step": 48930 }, { "epoch": 7.982871125611745, "grad_norm": 0.09263117611408234, "learning_rate": 5.946234643797252e-06, "loss": 0.0043, "num_input_tokens_seen": 105601056, "step": 48935 }, { "epoch": 7.9836867862969, "grad_norm": 0.8972885012626648, "learning_rate": 5.941627236189526e-06, "loss": 0.0703, "num_input_tokens_seen": 105611808, "step": 48940 }, { "epoch": 7.984502446982056, "grad_norm": 0.0643182098865509, "learning_rate": 5.9370213735501974e-06, "loss": 0.1625, "num_input_tokens_seen": 105622944, "step": 48945 }, { "epoch": 7.985318107667211, "grad_norm": 0.13953441381454468, "learning_rate": 5.9324170562526345e-06, "loss": 0.1258, "num_input_tokens_seen": 105633920, "step": 48950 }, { "epoch": 7.986133768352365, "grad_norm": 0.21682924032211304, "learning_rate": 5.927814284670097e-06, "loss": 0.1442, "num_input_tokens_seen": 105645280, "step": 48955 }, { "epoch": 7.98694942903752, "grad_norm": 0.08131765574216843, "learning_rate": 5.923213059175709e-06, "loss": 0.004, "num_input_tokens_seen": 105656032, "step": 48960 }, { "epoch": 7.987765089722675, "grad_norm": 0.0924699455499649, "learning_rate": 5.918613380142463e-06, "loss": 0.0051, "num_input_tokens_seen": 105666208, "step": 48965 }, { "epoch": 7.988580750407831, "grad_norm": 14.537064552307129, "learning_rate": 5.914015247943233e-06, "loss": 0.3662, "num_input_tokens_seen": 105677600, "step": 48970 }, { "epoch": 7.989396411092986, "grad_norm": 0.18694168329238892, "learning_rate": 5.909418662950769e-06, "loss": 0.0375, "num_input_tokens_seen": 105688256, "step": 48975 }, { "epoch": 7.99021207177814, "grad_norm": 0.08439765870571136, "learning_rate": 5.904823625537695e-06, "loss": 0.0259, "num_input_tokens_seen": 105698560, "step": 48980 }, { "epoch": 7.991027732463295, "grad_norm": 0.05119810253381729, "learning_rate": 5.900230136076504e-06, "loss": 0.112, "num_input_tokens_seen": 105709792, "step": 48985 }, { "epoch": 7.99184339314845, "grad_norm": 0.06547822058200836, "learning_rate": 5.895638194939568e-06, "loss": 0.1206, "num_input_tokens_seen": 105720608, "step": 48990 }, { "epoch": 7.992659053833605, "grad_norm": 0.11921960115432739, "learning_rate": 5.891047802499136e-06, "loss": 0.0024, "num_input_tokens_seen": 105732192, "step": 48995 }, { "epoch": 7.993474714518761, "grad_norm": 0.10682287812232971, "learning_rate": 5.886458959127328e-06, "loss": 0.254, "num_input_tokens_seen": 105742688, "step": 49000 }, { "epoch": 7.994290375203915, "grad_norm": 11.609454154968262, "learning_rate": 5.881871665196137e-06, "loss": 0.1577, "num_input_tokens_seen": 105754208, "step": 49005 }, { "epoch": 7.99510603588907, "grad_norm": 0.07879647612571716, "learning_rate": 5.877285921077433e-06, "loss": 0.0279, "num_input_tokens_seen": 105765920, "step": 49010 }, { "epoch": 7.995921696574225, "grad_norm": 0.12344497442245483, "learning_rate": 5.872701727142963e-06, "loss": 0.1149, "num_input_tokens_seen": 105776992, "step": 49015 }, { "epoch": 7.99673735725938, "grad_norm": 0.05152774602174759, "learning_rate": 5.868119083764337e-06, "loss": 0.1549, "num_input_tokens_seen": 105788576, "step": 49020 }, { "epoch": 7.997553017944535, "grad_norm": 0.06692085415124893, "learning_rate": 5.863537991313047e-06, "loss": 0.0053, "num_input_tokens_seen": 105800384, "step": 49025 }, { "epoch": 7.99836867862969, "grad_norm": 11.177392959594727, "learning_rate": 5.858958450160473e-06, "loss": 0.0913, "num_input_tokens_seen": 105810528, "step": 49030 }, { "epoch": 7.999184339314845, "grad_norm": 0.01515423133969307, "learning_rate": 5.854380460677847e-06, "loss": 0.0335, "num_input_tokens_seen": 105821440, "step": 49035 }, { "epoch": 8.0, "grad_norm": 0.07321982085704803, "learning_rate": 5.849804023236285e-06, "loss": 0.0034, "num_input_tokens_seen": 105830544, "step": 49040 }, { "epoch": 8.0, "eval_loss": 0.18560057878494263, "eval_runtime": 132.9521, "eval_samples_per_second": 20.496, "eval_steps_per_second": 5.13, "num_input_tokens_seen": 105830544, "step": 49040 }, { "epoch": 8.000815660685156, "grad_norm": 0.23461724817752838, "learning_rate": 5.845229138206776e-06, "loss": 0.0165, "num_input_tokens_seen": 105843312, "step": 49045 }, { "epoch": 8.00163132137031, "grad_norm": 0.2039797604084015, "learning_rate": 5.8406558059601825e-06, "loss": 0.005, "num_input_tokens_seen": 105851248, "step": 49050 }, { "epoch": 8.002446982055465, "grad_norm": 16.558528900146484, "learning_rate": 5.836084026867244e-06, "loss": 0.1154, "num_input_tokens_seen": 105862352, "step": 49055 }, { "epoch": 8.00326264274062, "grad_norm": 0.07633313536643982, "learning_rate": 5.831513801298572e-06, "loss": 0.0053, "num_input_tokens_seen": 105873264, "step": 49060 }, { "epoch": 8.004078303425775, "grad_norm": 0.03318179398775101, "learning_rate": 5.82694512962465e-06, "loss": 0.0047, "num_input_tokens_seen": 105884272, "step": 49065 }, { "epoch": 8.00489396411093, "grad_norm": 4.506711006164551, "learning_rate": 5.822378012215837e-06, "loss": 0.2231, "num_input_tokens_seen": 105894832, "step": 49070 }, { "epoch": 8.005709624796085, "grad_norm": 0.055067844688892365, "learning_rate": 5.81781244944237e-06, "loss": 0.0018, "num_input_tokens_seen": 105905520, "step": 49075 }, { "epoch": 8.00652528548124, "grad_norm": 0.1632113754749298, "learning_rate": 5.813248441674357e-06, "loss": 0.09, "num_input_tokens_seen": 105917424, "step": 49080 }, { "epoch": 8.007340946166394, "grad_norm": 3.296776533126831, "learning_rate": 5.8086859892817755e-06, "loss": 0.0764, "num_input_tokens_seen": 105928560, "step": 49085 }, { "epoch": 8.00815660685155, "grad_norm": 0.16078858077526093, "learning_rate": 5.804125092634485e-06, "loss": 0.0067, "num_input_tokens_seen": 105939120, "step": 49090 }, { "epoch": 8.008972267536704, "grad_norm": 0.19128017127513885, "learning_rate": 5.799565752102207e-06, "loss": 0.008, "num_input_tokens_seen": 105949424, "step": 49095 }, { "epoch": 8.00978792822186, "grad_norm": 0.05958623066544533, "learning_rate": 5.795007968054555e-06, "loss": 0.0043, "num_input_tokens_seen": 105959280, "step": 49100 }, { "epoch": 8.010603588907015, "grad_norm": 0.0718739703297615, "learning_rate": 5.790451740861005e-06, "loss": 0.0067, "num_input_tokens_seen": 105968272, "step": 49105 }, { "epoch": 8.01141924959217, "grad_norm": 2.842352867126465, "learning_rate": 5.7858970708909056e-06, "loss": 0.0921, "num_input_tokens_seen": 105978736, "step": 49110 }, { "epoch": 8.012234910277325, "grad_norm": 0.05983918905258179, "learning_rate": 5.78134395851348e-06, "loss": 0.0042, "num_input_tokens_seen": 105989872, "step": 49115 }, { "epoch": 8.013050570962479, "grad_norm": 0.06373894214630127, "learning_rate": 5.7767924040978275e-06, "loss": 0.0103, "num_input_tokens_seen": 105999792, "step": 49120 }, { "epoch": 8.013866231647635, "grad_norm": 0.035480622202157974, "learning_rate": 5.772242408012921e-06, "loss": 0.005, "num_input_tokens_seen": 106010256, "step": 49125 }, { "epoch": 8.01468189233279, "grad_norm": 7.287117004394531, "learning_rate": 5.767693970627597e-06, "loss": 0.2716, "num_input_tokens_seen": 106020496, "step": 49130 }, { "epoch": 8.015497553017944, "grad_norm": 0.08558224886655807, "learning_rate": 5.763147092310592e-06, "loss": 0.0279, "num_input_tokens_seen": 106031248, "step": 49135 }, { "epoch": 8.0163132137031, "grad_norm": 0.05756988376379013, "learning_rate": 5.758601773430489e-06, "loss": 0.0021, "num_input_tokens_seen": 106041232, "step": 49140 }, { "epoch": 8.017128874388254, "grad_norm": 0.13379822671413422, "learning_rate": 5.7540580143557564e-06, "loss": 0.1162, "num_input_tokens_seen": 106051696, "step": 49145 }, { "epoch": 8.01794453507341, "grad_norm": 0.06322108209133148, "learning_rate": 5.7495158154547365e-06, "loss": 0.0056, "num_input_tokens_seen": 106062672, "step": 49150 }, { "epoch": 8.018760195758565, "grad_norm": 0.3233621120452881, "learning_rate": 5.744975177095638e-06, "loss": 0.0046, "num_input_tokens_seen": 106072784, "step": 49155 }, { "epoch": 8.01957585644372, "grad_norm": 0.05716429650783539, "learning_rate": 5.740436099646551e-06, "loss": 0.0045, "num_input_tokens_seen": 106083952, "step": 49160 }, { "epoch": 8.020391517128875, "grad_norm": 0.3119354844093323, "learning_rate": 5.735898583475438e-06, "loss": 0.118, "num_input_tokens_seen": 106095632, "step": 49165 }, { "epoch": 8.021207177814029, "grad_norm": 0.0634540542960167, "learning_rate": 5.731362628950129e-06, "loss": 0.0028, "num_input_tokens_seen": 106106160, "step": 49170 }, { "epoch": 8.022022838499185, "grad_norm": 0.17123360931873322, "learning_rate": 5.726828236438334e-06, "loss": 0.1068, "num_input_tokens_seen": 106116784, "step": 49175 }, { "epoch": 8.022838499184338, "grad_norm": 0.07622867077589035, "learning_rate": 5.722295406307632e-06, "loss": 0.0048, "num_input_tokens_seen": 106127088, "step": 49180 }, { "epoch": 8.023654159869494, "grad_norm": 0.3558162450790405, "learning_rate": 5.71776413892548e-06, "loss": 0.0909, "num_input_tokens_seen": 106136912, "step": 49185 }, { "epoch": 8.02446982055465, "grad_norm": 0.19106322526931763, "learning_rate": 5.713234434659203e-06, "loss": 0.0037, "num_input_tokens_seen": 106148816, "step": 49190 }, { "epoch": 8.025285481239804, "grad_norm": 0.04668969660997391, "learning_rate": 5.708706293876004e-06, "loss": 0.0022, "num_input_tokens_seen": 106158672, "step": 49195 }, { "epoch": 8.02610114192496, "grad_norm": 5.1401238441467285, "learning_rate": 5.7041797169429536e-06, "loss": 0.022, "num_input_tokens_seen": 106169744, "step": 49200 }, { "epoch": 8.026916802610113, "grad_norm": 0.06998219341039658, "learning_rate": 5.699654704227003e-06, "loss": 0.2022, "num_input_tokens_seen": 106179824, "step": 49205 }, { "epoch": 8.02773246329527, "grad_norm": 0.04609766975045204, "learning_rate": 5.695131256094971e-06, "loss": 0.2098, "num_input_tokens_seen": 106190640, "step": 49210 }, { "epoch": 8.028548123980425, "grad_norm": 0.10465165972709656, "learning_rate": 5.6906093729135495e-06, "loss": 0.0036, "num_input_tokens_seen": 106200112, "step": 49215 }, { "epoch": 8.029363784665579, "grad_norm": 0.07307170331478119, "learning_rate": 5.6860890550493095e-06, "loss": 0.0062, "num_input_tokens_seen": 106211440, "step": 49220 }, { "epoch": 8.030179445350734, "grad_norm": 0.617223858833313, "learning_rate": 5.681570302868688e-06, "loss": 0.1075, "num_input_tokens_seen": 106222224, "step": 49225 }, { "epoch": 8.030995106035888, "grad_norm": 0.04321765899658203, "learning_rate": 5.677053116737999e-06, "loss": 0.0055, "num_input_tokens_seen": 106233968, "step": 49230 }, { "epoch": 8.031810766721044, "grad_norm": 0.4132688045501709, "learning_rate": 5.67253749702342e-06, "loss": 0.2008, "num_input_tokens_seen": 106244944, "step": 49235 }, { "epoch": 8.0326264274062, "grad_norm": 4.113018989562988, "learning_rate": 5.668023444091025e-06, "loss": 0.1212, "num_input_tokens_seen": 106255664, "step": 49240 }, { "epoch": 8.033442088091354, "grad_norm": 0.09752390533685684, "learning_rate": 5.663510958306739e-06, "loss": 0.0068, "num_input_tokens_seen": 106266672, "step": 49245 }, { "epoch": 8.03425774877651, "grad_norm": 0.09980696439743042, "learning_rate": 5.659000040036366e-06, "loss": 0.1372, "num_input_tokens_seen": 106278288, "step": 49250 }, { "epoch": 8.035073409461663, "grad_norm": 4.008538722991943, "learning_rate": 5.654490689645589e-06, "loss": 0.0063, "num_input_tokens_seen": 106289744, "step": 49255 }, { "epoch": 8.035889070146819, "grad_norm": 0.23308169841766357, "learning_rate": 5.649982907499951e-06, "loss": 0.0445, "num_input_tokens_seen": 106298480, "step": 49260 }, { "epoch": 8.036704730831975, "grad_norm": 0.031971950083971024, "learning_rate": 5.645476693964874e-06, "loss": 0.0855, "num_input_tokens_seen": 106310800, "step": 49265 }, { "epoch": 8.037520391517129, "grad_norm": 3.505389451980591, "learning_rate": 5.640972049405666e-06, "loss": 0.2035, "num_input_tokens_seen": 106322000, "step": 49270 }, { "epoch": 8.038336052202284, "grad_norm": 0.046675022691488266, "learning_rate": 5.636468974187492e-06, "loss": 0.1079, "num_input_tokens_seen": 106331632, "step": 49275 }, { "epoch": 8.039151712887438, "grad_norm": 0.07738251984119415, "learning_rate": 5.631967468675392e-06, "loss": 0.0051, "num_input_tokens_seen": 106343344, "step": 49280 }, { "epoch": 8.039967373572594, "grad_norm": 0.04473774880170822, "learning_rate": 5.627467533234282e-06, "loss": 0.1477, "num_input_tokens_seen": 106354288, "step": 49285 }, { "epoch": 8.040783034257748, "grad_norm": 0.04963133484125137, "learning_rate": 5.622969168228947e-06, "loss": 0.0051, "num_input_tokens_seen": 106365168, "step": 49290 }, { "epoch": 8.041598694942904, "grad_norm": 0.052206624299287796, "learning_rate": 5.61847237402405e-06, "loss": 0.0038, "num_input_tokens_seen": 106375440, "step": 49295 }, { "epoch": 8.04241435562806, "grad_norm": 4.268579483032227, "learning_rate": 5.613977150984123e-06, "loss": 0.1155, "num_input_tokens_seen": 106386032, "step": 49300 }, { "epoch": 8.043230016313213, "grad_norm": 0.07151418924331665, "learning_rate": 5.609483499473575e-06, "loss": 0.0047, "num_input_tokens_seen": 106398160, "step": 49305 }, { "epoch": 8.044045676998369, "grad_norm": 0.0778755322098732, "learning_rate": 5.604991419856678e-06, "loss": 0.0722, "num_input_tokens_seen": 106408432, "step": 49310 }, { "epoch": 8.044861337683523, "grad_norm": 0.1610373854637146, "learning_rate": 5.600500912497586e-06, "loss": 0.3244, "num_input_tokens_seen": 106419440, "step": 49315 }, { "epoch": 8.045676998368679, "grad_norm": 0.12459628283977509, "learning_rate": 5.596011977760324e-06, "loss": 0.0707, "num_input_tokens_seen": 106430000, "step": 49320 }, { "epoch": 8.046492659053834, "grad_norm": 0.08844204992055893, "learning_rate": 5.591524616008784e-06, "loss": 0.0046, "num_input_tokens_seen": 106441392, "step": 49325 }, { "epoch": 8.047308319738988, "grad_norm": 0.9334814548492432, "learning_rate": 5.587038827606736e-06, "loss": 0.007, "num_input_tokens_seen": 106452688, "step": 49330 }, { "epoch": 8.048123980424144, "grad_norm": 0.9453881978988647, "learning_rate": 5.582554612917823e-06, "loss": 0.0058, "num_input_tokens_seen": 106462352, "step": 49335 }, { "epoch": 8.048939641109298, "grad_norm": 9.040762901306152, "learning_rate": 5.578071972305554e-06, "loss": 0.2573, "num_input_tokens_seen": 106472208, "step": 49340 }, { "epoch": 8.049755301794454, "grad_norm": 3.1483774185180664, "learning_rate": 5.57359090613331e-06, "loss": 0.329, "num_input_tokens_seen": 106481840, "step": 49345 }, { "epoch": 8.05057096247961, "grad_norm": 0.07931210845708847, "learning_rate": 5.569111414764363e-06, "loss": 0.2614, "num_input_tokens_seen": 106492912, "step": 49350 }, { "epoch": 8.051386623164763, "grad_norm": 0.08259209990501404, "learning_rate": 5.564633498561839e-06, "loss": 0.2622, "num_input_tokens_seen": 106503440, "step": 49355 }, { "epoch": 8.052202283849919, "grad_norm": 0.1957506388425827, "learning_rate": 5.560157157888735e-06, "loss": 0.0165, "num_input_tokens_seen": 106512880, "step": 49360 }, { "epoch": 8.053017944535073, "grad_norm": 0.04240275174379349, "learning_rate": 5.555682393107928e-06, "loss": 0.005, "num_input_tokens_seen": 106525296, "step": 49365 }, { "epoch": 8.053833605220229, "grad_norm": 7.280547618865967, "learning_rate": 5.551209204582167e-06, "loss": 0.2064, "num_input_tokens_seen": 106535536, "step": 49370 }, { "epoch": 8.054649265905383, "grad_norm": 0.2381419986486435, "learning_rate": 5.54673759267407e-06, "loss": 0.1138, "num_input_tokens_seen": 106546160, "step": 49375 }, { "epoch": 8.055464926590538, "grad_norm": 0.05607723817229271, "learning_rate": 5.542267557746128e-06, "loss": 0.1253, "num_input_tokens_seen": 106558064, "step": 49380 }, { "epoch": 8.056280587275694, "grad_norm": 0.1966191530227661, "learning_rate": 5.537799100160704e-06, "loss": 0.0864, "num_input_tokens_seen": 106568400, "step": 49385 }, { "epoch": 8.057096247960848, "grad_norm": 0.2882792055606842, "learning_rate": 5.533332220280038e-06, "loss": 0.0096, "num_input_tokens_seen": 106578192, "step": 49390 }, { "epoch": 8.057911908646004, "grad_norm": 0.08140349388122559, "learning_rate": 5.5288669184662325e-06, "loss": 0.0037, "num_input_tokens_seen": 106587312, "step": 49395 }, { "epoch": 8.058727569331158, "grad_norm": 0.12514647841453552, "learning_rate": 5.524403195081271e-06, "loss": 0.1624, "num_input_tokens_seen": 106598672, "step": 49400 }, { "epoch": 8.059543230016313, "grad_norm": 0.07031819224357605, "learning_rate": 5.519941050487007e-06, "loss": 0.0064, "num_input_tokens_seen": 106608208, "step": 49405 }, { "epoch": 8.060358890701469, "grad_norm": 0.0699918270111084, "learning_rate": 5.515480485045152e-06, "loss": 0.0044, "num_input_tokens_seen": 106619984, "step": 49410 }, { "epoch": 8.061174551386623, "grad_norm": 3.1284658908843994, "learning_rate": 5.511021499117322e-06, "loss": 0.1468, "num_input_tokens_seen": 106631792, "step": 49415 }, { "epoch": 8.061990212071779, "grad_norm": 0.015321805141866207, "learning_rate": 5.5065640930649725e-06, "loss": 0.1166, "num_input_tokens_seen": 106643600, "step": 49420 }, { "epoch": 8.062805872756933, "grad_norm": 3.460632801055908, "learning_rate": 5.502108267249448e-06, "loss": 0.0934, "num_input_tokens_seen": 106654672, "step": 49425 }, { "epoch": 8.063621533442088, "grad_norm": 5.855128765106201, "learning_rate": 5.497654022031959e-06, "loss": 0.0693, "num_input_tokens_seen": 106665456, "step": 49430 }, { "epoch": 8.064437194127244, "grad_norm": 0.20783241093158722, "learning_rate": 5.493201357773589e-06, "loss": 0.2219, "num_input_tokens_seen": 106676880, "step": 49435 }, { "epoch": 8.065252854812398, "grad_norm": 0.14572155475616455, "learning_rate": 5.488750274835291e-06, "loss": 0.1397, "num_input_tokens_seen": 106688528, "step": 49440 }, { "epoch": 8.066068515497554, "grad_norm": 0.29012978076934814, "learning_rate": 5.4843007735778996e-06, "loss": 0.0727, "num_input_tokens_seen": 106698992, "step": 49445 }, { "epoch": 8.066884176182707, "grad_norm": 0.12157758325338364, "learning_rate": 5.4798528543620965e-06, "loss": 0.0553, "num_input_tokens_seen": 106709168, "step": 49450 }, { "epoch": 8.067699836867863, "grad_norm": 0.16166500747203827, "learning_rate": 5.475406517548476e-06, "loss": 0.194, "num_input_tokens_seen": 106719920, "step": 49455 }, { "epoch": 8.068515497553017, "grad_norm": 0.06889856606721878, "learning_rate": 5.47096176349747e-06, "loss": 0.0778, "num_input_tokens_seen": 106729872, "step": 49460 }, { "epoch": 8.069331158238173, "grad_norm": 3.6685426235198975, "learning_rate": 5.466518592569391e-06, "loss": 0.1442, "num_input_tokens_seen": 106741168, "step": 49465 }, { "epoch": 8.070146818923329, "grad_norm": 0.1276254802942276, "learning_rate": 5.4620770051244275e-06, "loss": 0.005, "num_input_tokens_seen": 106751376, "step": 49470 }, { "epoch": 8.070962479608482, "grad_norm": 0.37503746151924133, "learning_rate": 5.457637001522636e-06, "loss": 0.1265, "num_input_tokens_seen": 106761904, "step": 49475 }, { "epoch": 8.071778140293638, "grad_norm": 6.915079593658447, "learning_rate": 5.453198582123947e-06, "loss": 0.1625, "num_input_tokens_seen": 106772240, "step": 49480 }, { "epoch": 8.072593800978792, "grad_norm": 0.09333750605583191, "learning_rate": 5.448761747288161e-06, "loss": 0.0042, "num_input_tokens_seen": 106783216, "step": 49485 }, { "epoch": 8.073409461663948, "grad_norm": 0.2951674163341522, "learning_rate": 5.444326497374949e-06, "loss": 0.0985, "num_input_tokens_seen": 106794896, "step": 49490 }, { "epoch": 8.074225122349104, "grad_norm": 1.0948500633239746, "learning_rate": 5.439892832743856e-06, "loss": 0.2702, "num_input_tokens_seen": 106805360, "step": 49495 }, { "epoch": 8.075040783034257, "grad_norm": 5.78016996383667, "learning_rate": 5.435460753754296e-06, "loss": 0.1262, "num_input_tokens_seen": 106816144, "step": 49500 }, { "epoch": 8.075856443719413, "grad_norm": 0.3723219037055969, "learning_rate": 5.431030260765557e-06, "loss": 0.0062, "num_input_tokens_seen": 106827088, "step": 49505 }, { "epoch": 8.076672104404567, "grad_norm": 0.06229551509022713, "learning_rate": 5.426601354136799e-06, "loss": 0.0437, "num_input_tokens_seen": 106837488, "step": 49510 }, { "epoch": 8.077487765089723, "grad_norm": 0.041666120290756226, "learning_rate": 5.4221740342270455e-06, "loss": 0.0052, "num_input_tokens_seen": 106849520, "step": 49515 }, { "epoch": 8.078303425774878, "grad_norm": 0.05304932966828346, "learning_rate": 5.4177483013952065e-06, "loss": 0.0637, "num_input_tokens_seen": 106860528, "step": 49520 }, { "epoch": 8.079119086460032, "grad_norm": 2.4275498390197754, "learning_rate": 5.413324156000046e-06, "loss": 0.1433, "num_input_tokens_seen": 106869936, "step": 49525 }, { "epoch": 8.079934747145188, "grad_norm": 3.568582534790039, "learning_rate": 5.408901598400212e-06, "loss": 0.0892, "num_input_tokens_seen": 106881936, "step": 49530 }, { "epoch": 8.080750407830342, "grad_norm": 0.1210571676492691, "learning_rate": 5.40448062895422e-06, "loss": 0.0042, "num_input_tokens_seen": 106891376, "step": 49535 }, { "epoch": 8.081566068515498, "grad_norm": 0.16298231482505798, "learning_rate": 5.400061248020452e-06, "loss": 0.1194, "num_input_tokens_seen": 106901168, "step": 49540 }, { "epoch": 8.082381729200652, "grad_norm": 0.21729084849357605, "learning_rate": 5.395643455957172e-06, "loss": 0.2148, "num_input_tokens_seen": 106912432, "step": 49545 }, { "epoch": 8.083197389885807, "grad_norm": 15.05525016784668, "learning_rate": 5.391227253122502e-06, "loss": 0.0354, "num_input_tokens_seen": 106924592, "step": 49550 }, { "epoch": 8.084013050570963, "grad_norm": 0.0854477807879448, "learning_rate": 5.386812639874439e-06, "loss": 0.0096, "num_input_tokens_seen": 106934704, "step": 49555 }, { "epoch": 8.084828711256117, "grad_norm": 0.37851154804229736, "learning_rate": 5.382399616570869e-06, "loss": 0.0083, "num_input_tokens_seen": 106945936, "step": 49560 }, { "epoch": 8.085644371941273, "grad_norm": 15.90233325958252, "learning_rate": 5.377988183569521e-06, "loss": 0.0422, "num_input_tokens_seen": 106957264, "step": 49565 }, { "epoch": 8.086460032626427, "grad_norm": 0.1077587828040123, "learning_rate": 5.3735783412280134e-06, "loss": 0.1699, "num_input_tokens_seen": 106968816, "step": 49570 }, { "epoch": 8.087275693311582, "grad_norm": 0.0675574466586113, "learning_rate": 5.36917008990383e-06, "loss": 0.2308, "num_input_tokens_seen": 106981680, "step": 49575 }, { "epoch": 8.088091353996738, "grad_norm": 0.11761905997991562, "learning_rate": 5.364763429954317e-06, "loss": 0.0504, "num_input_tokens_seen": 106992048, "step": 49580 }, { "epoch": 8.088907014681892, "grad_norm": 0.19656305015087128, "learning_rate": 5.360358361736714e-06, "loss": 0.0042, "num_input_tokens_seen": 107004016, "step": 49585 }, { "epoch": 8.089722675367048, "grad_norm": 3.6156342029571533, "learning_rate": 5.3559548856081135e-06, "loss": 0.0982, "num_input_tokens_seen": 107015024, "step": 49590 }, { "epoch": 8.090538336052202, "grad_norm": 0.1276438683271408, "learning_rate": 5.351553001925486e-06, "loss": 0.0041, "num_input_tokens_seen": 107025008, "step": 49595 }, { "epoch": 8.091353996737357, "grad_norm": 17.190284729003906, "learning_rate": 5.347152711045664e-06, "loss": 0.0235, "num_input_tokens_seen": 107035408, "step": 49600 }, { "epoch": 8.092169657422513, "grad_norm": 2.734891414642334, "learning_rate": 5.342754013325363e-06, "loss": 0.0422, "num_input_tokens_seen": 107046960, "step": 49605 }, { "epoch": 8.092985318107667, "grad_norm": 5.170504570007324, "learning_rate": 5.338356909121159e-06, "loss": 0.1617, "num_input_tokens_seen": 107058736, "step": 49610 }, { "epoch": 8.093800978792823, "grad_norm": 0.4141773581504822, "learning_rate": 5.3339613987895084e-06, "loss": 0.1956, "num_input_tokens_seen": 107069872, "step": 49615 }, { "epoch": 8.094616639477977, "grad_norm": 0.05458392947912216, "learning_rate": 5.329567482686729e-06, "loss": 0.0037, "num_input_tokens_seen": 107080400, "step": 49620 }, { "epoch": 8.095432300163132, "grad_norm": 0.1334652304649353, "learning_rate": 5.325175161169019e-06, "loss": 0.0111, "num_input_tokens_seen": 107091696, "step": 49625 }, { "epoch": 8.096247960848286, "grad_norm": 0.24071387946605682, "learning_rate": 5.320784434592438e-06, "loss": 0.1279, "num_input_tokens_seen": 107101168, "step": 49630 }, { "epoch": 8.097063621533442, "grad_norm": 0.18897297978401184, "learning_rate": 5.316395303312921e-06, "loss": 0.0832, "num_input_tokens_seen": 107112464, "step": 49635 }, { "epoch": 8.097879282218598, "grad_norm": 0.2674519419670105, "learning_rate": 5.3120077676862754e-06, "loss": 0.0082, "num_input_tokens_seen": 107123280, "step": 49640 }, { "epoch": 8.098694942903752, "grad_norm": 4.2298102378845215, "learning_rate": 5.307621828068177e-06, "loss": 0.1066, "num_input_tokens_seen": 107134800, "step": 49645 }, { "epoch": 8.099510603588907, "grad_norm": 0.09429102391004562, "learning_rate": 5.303237484814169e-06, "loss": 0.0733, "num_input_tokens_seen": 107145904, "step": 49650 }, { "epoch": 8.100326264274061, "grad_norm": 3.282207489013672, "learning_rate": 5.2988547382796735e-06, "loss": 0.1514, "num_input_tokens_seen": 107156464, "step": 49655 }, { "epoch": 8.101141924959217, "grad_norm": 0.05307849869132042, "learning_rate": 5.294473588819968e-06, "loss": 0.0103, "num_input_tokens_seen": 107167824, "step": 49660 }, { "epoch": 8.101957585644373, "grad_norm": 1.4988816976547241, "learning_rate": 5.2900940367902245e-06, "loss": 0.0117, "num_input_tokens_seen": 107178928, "step": 49665 }, { "epoch": 8.102773246329527, "grad_norm": 0.04693633317947388, "learning_rate": 5.2857160825454635e-06, "loss": 0.0963, "num_input_tokens_seen": 107191120, "step": 49670 }, { "epoch": 8.103588907014682, "grad_norm": 0.039586059749126434, "learning_rate": 5.28133972644059e-06, "loss": 0.0864, "num_input_tokens_seen": 107201872, "step": 49675 }, { "epoch": 8.104404567699836, "grad_norm": 2.518397808074951, "learning_rate": 5.276964968830367e-06, "loss": 0.1285, "num_input_tokens_seen": 107214192, "step": 49680 }, { "epoch": 8.105220228384992, "grad_norm": 0.31964388489723206, "learning_rate": 5.272591810069438e-06, "loss": 0.0078, "num_input_tokens_seen": 107225616, "step": 49685 }, { "epoch": 8.106035889070148, "grad_norm": 0.03488241881132126, "learning_rate": 5.268220250512315e-06, "loss": 0.0101, "num_input_tokens_seen": 107236208, "step": 49690 }, { "epoch": 8.106851549755302, "grad_norm": 0.058267757296562195, "learning_rate": 5.263850290513373e-06, "loss": 0.0039, "num_input_tokens_seen": 107247632, "step": 49695 }, { "epoch": 8.107667210440457, "grad_norm": 0.07162422686815262, "learning_rate": 5.259481930426869e-06, "loss": 0.0017, "num_input_tokens_seen": 107258288, "step": 49700 }, { "epoch": 8.108482871125611, "grad_norm": 0.031090060248970985, "learning_rate": 5.2551151706069225e-06, "loss": 0.0034, "num_input_tokens_seen": 107268944, "step": 49705 }, { "epoch": 8.109298531810767, "grad_norm": 0.12256871908903122, "learning_rate": 5.250750011407521e-06, "loss": 0.1123, "num_input_tokens_seen": 107279472, "step": 49710 }, { "epoch": 8.11011419249592, "grad_norm": 7.657137870788574, "learning_rate": 5.246386453182533e-06, "loss": 0.1078, "num_input_tokens_seen": 107290064, "step": 49715 }, { "epoch": 8.110929853181077, "grad_norm": 0.08682429790496826, "learning_rate": 5.242024496285683e-06, "loss": 0.0024, "num_input_tokens_seen": 107300560, "step": 49720 }, { "epoch": 8.111745513866232, "grad_norm": 0.09137769788503647, "learning_rate": 5.237664141070583e-06, "loss": 0.0047, "num_input_tokens_seen": 107310512, "step": 49725 }, { "epoch": 8.112561174551386, "grad_norm": 0.29633578658103943, "learning_rate": 5.2333053878907e-06, "loss": 0.0041, "num_input_tokens_seen": 107321296, "step": 49730 }, { "epoch": 8.113376835236542, "grad_norm": 0.16949453949928284, "learning_rate": 5.228948237099379e-06, "loss": 0.1126, "num_input_tokens_seen": 107331920, "step": 49735 }, { "epoch": 8.114192495921696, "grad_norm": 0.373892605304718, "learning_rate": 5.224592689049832e-06, "loss": 0.0048, "num_input_tokens_seen": 107342768, "step": 49740 }, { "epoch": 8.115008156606851, "grad_norm": 0.3301897346973419, "learning_rate": 5.220238744095137e-06, "loss": 0.058, "num_input_tokens_seen": 107352304, "step": 49745 }, { "epoch": 8.115823817292007, "grad_norm": 0.01428622379899025, "learning_rate": 5.215886402588255e-06, "loss": 0.0028, "num_input_tokens_seen": 107362448, "step": 49750 }, { "epoch": 8.116639477977161, "grad_norm": 0.029162505641579628, "learning_rate": 5.211535664882003e-06, "loss": 0.0037, "num_input_tokens_seen": 107372368, "step": 49755 }, { "epoch": 8.117455138662317, "grad_norm": 0.17050567269325256, "learning_rate": 5.207186531329075e-06, "loss": 0.0826, "num_input_tokens_seen": 107383120, "step": 49760 }, { "epoch": 8.11827079934747, "grad_norm": 0.08083541691303253, "learning_rate": 5.202839002282037e-06, "loss": 0.0033, "num_input_tokens_seen": 107394576, "step": 49765 }, { "epoch": 8.119086460032626, "grad_norm": 0.043164532631635666, "learning_rate": 5.198493078093311e-06, "loss": 0.169, "num_input_tokens_seen": 107404752, "step": 49770 }, { "epoch": 8.119902120717782, "grad_norm": 0.10148000717163086, "learning_rate": 5.194148759115214e-06, "loss": 0.1128, "num_input_tokens_seen": 107414288, "step": 49775 }, { "epoch": 8.120717781402936, "grad_norm": 0.08053573966026306, "learning_rate": 5.189806045699913e-06, "loss": 0.0664, "num_input_tokens_seen": 107425648, "step": 49780 }, { "epoch": 8.121533442088092, "grad_norm": 0.2354675829410553, "learning_rate": 5.185464938199449e-06, "loss": 0.0611, "num_input_tokens_seen": 107437936, "step": 49785 }, { "epoch": 8.122349102773246, "grad_norm": 0.5793733596801758, "learning_rate": 5.181125436965739e-06, "loss": 0.0058, "num_input_tokens_seen": 107448336, "step": 49790 }, { "epoch": 8.123164763458401, "grad_norm": 0.08733253180980682, "learning_rate": 5.176787542350558e-06, "loss": 0.0689, "num_input_tokens_seen": 107458416, "step": 49795 }, { "epoch": 8.123980424143557, "grad_norm": 0.06490445882081985, "learning_rate": 5.172451254705559e-06, "loss": 0.0483, "num_input_tokens_seen": 107469872, "step": 49800 }, { "epoch": 8.124796084828711, "grad_norm": 0.10985083132982254, "learning_rate": 5.1681165743822676e-06, "loss": 0.1949, "num_input_tokens_seen": 107479760, "step": 49805 }, { "epoch": 8.125611745513867, "grad_norm": 0.035494428128004074, "learning_rate": 5.1637835017320726e-06, "loss": 0.0651, "num_input_tokens_seen": 107490096, "step": 49810 }, { "epoch": 8.12642740619902, "grad_norm": 0.09943348914384842, "learning_rate": 5.159452037106236e-06, "loss": 0.1107, "num_input_tokens_seen": 107500368, "step": 49815 }, { "epoch": 8.127243066884176, "grad_norm": 0.0746140256524086, "learning_rate": 5.155122180855884e-06, "loss": 0.0062, "num_input_tokens_seen": 107511600, "step": 49820 }, { "epoch": 8.12805872756933, "grad_norm": 0.059252284467220306, "learning_rate": 5.150793933332024e-06, "loss": 0.1505, "num_input_tokens_seen": 107522672, "step": 49825 }, { "epoch": 8.128874388254486, "grad_norm": 0.12755051255226135, "learning_rate": 5.146467294885518e-06, "loss": 0.0026, "num_input_tokens_seen": 107534096, "step": 49830 }, { "epoch": 8.129690048939642, "grad_norm": 3.337573766708374, "learning_rate": 5.142142265867112e-06, "loss": 0.1877, "num_input_tokens_seen": 107543472, "step": 49835 }, { "epoch": 8.130505709624796, "grad_norm": 3.7103707790374756, "learning_rate": 5.137818846627409e-06, "loss": 0.2132, "num_input_tokens_seen": 107554224, "step": 49840 }, { "epoch": 8.131321370309951, "grad_norm": 0.11919867992401123, "learning_rate": 5.13349703751689e-06, "loss": 0.125, "num_input_tokens_seen": 107564240, "step": 49845 }, { "epoch": 8.132137030995105, "grad_norm": 0.4787959158420563, "learning_rate": 5.129176838885905e-06, "loss": 0.0061, "num_input_tokens_seen": 107574800, "step": 49850 }, { "epoch": 8.132952691680261, "grad_norm": 0.0798761248588562, "learning_rate": 5.124858251084666e-06, "loss": 0.1976, "num_input_tokens_seen": 107585520, "step": 49855 }, { "epoch": 8.133768352365417, "grad_norm": 0.036203090101480484, "learning_rate": 5.120541274463264e-06, "loss": 0.1533, "num_input_tokens_seen": 107596112, "step": 49860 }, { "epoch": 8.13458401305057, "grad_norm": 13.261408805847168, "learning_rate": 5.116225909371649e-06, "loss": 0.0112, "num_input_tokens_seen": 107607120, "step": 49865 }, { "epoch": 8.135399673735726, "grad_norm": 0.05671172961592674, "learning_rate": 5.111912156159657e-06, "loss": 0.18, "num_input_tokens_seen": 107618000, "step": 49870 }, { "epoch": 8.13621533442088, "grad_norm": 16.111249923706055, "learning_rate": 5.107600015176975e-06, "loss": 0.122, "num_input_tokens_seen": 107629232, "step": 49875 }, { "epoch": 8.137030995106036, "grad_norm": 0.17345628142356873, "learning_rate": 5.103289486773169e-06, "loss": 0.0071, "num_input_tokens_seen": 107640784, "step": 49880 }, { "epoch": 8.137846655791192, "grad_norm": 3.813416004180908, "learning_rate": 5.098980571297673e-06, "loss": 0.1718, "num_input_tokens_seen": 107650576, "step": 49885 }, { "epoch": 8.138662316476346, "grad_norm": 0.14174208045005798, "learning_rate": 5.094673269099781e-06, "loss": 0.0039, "num_input_tokens_seen": 107660368, "step": 49890 }, { "epoch": 8.139477977161501, "grad_norm": 0.0553726889193058, "learning_rate": 5.090367580528679e-06, "loss": 0.0077, "num_input_tokens_seen": 107670096, "step": 49895 }, { "epoch": 8.140293637846655, "grad_norm": 13.390634536743164, "learning_rate": 5.086063505933403e-06, "loss": 0.1784, "num_input_tokens_seen": 107680240, "step": 49900 }, { "epoch": 8.141109298531811, "grad_norm": 0.0556836873292923, "learning_rate": 5.081761045662861e-06, "loss": 0.0079, "num_input_tokens_seen": 107691504, "step": 49905 }, { "epoch": 8.141924959216965, "grad_norm": 3.9966859817504883, "learning_rate": 5.077460200065834e-06, "loss": 0.2348, "num_input_tokens_seen": 107701968, "step": 49910 }, { "epoch": 8.14274061990212, "grad_norm": 3.627897262573242, "learning_rate": 5.073160969490967e-06, "loss": 0.232, "num_input_tokens_seen": 107713136, "step": 49915 }, { "epoch": 8.143556280587276, "grad_norm": 9.396637916564941, "learning_rate": 5.068863354286779e-06, "loss": 0.0747, "num_input_tokens_seen": 107724080, "step": 49920 }, { "epoch": 8.14437194127243, "grad_norm": 3.8547706604003906, "learning_rate": 5.064567354801658e-06, "loss": 0.0055, "num_input_tokens_seen": 107735792, "step": 49925 }, { "epoch": 8.145187601957586, "grad_norm": 0.5412003993988037, "learning_rate": 5.060272971383862e-06, "loss": 0.1378, "num_input_tokens_seen": 107747280, "step": 49930 }, { "epoch": 8.14600326264274, "grad_norm": 1.3821433782577515, "learning_rate": 5.055980204381508e-06, "loss": 0.1476, "num_input_tokens_seen": 107758448, "step": 49935 }, { "epoch": 8.146818923327896, "grad_norm": 0.20223121345043182, "learning_rate": 5.051689054142594e-06, "loss": 0.0051, "num_input_tokens_seen": 107769712, "step": 49940 }, { "epoch": 8.147634584013051, "grad_norm": 0.023549934849143028, "learning_rate": 5.047399521014984e-06, "loss": 0.0973, "num_input_tokens_seen": 107780624, "step": 49945 }, { "epoch": 8.148450244698205, "grad_norm": 0.10849035531282425, "learning_rate": 5.043111605346404e-06, "loss": 0.0709, "num_input_tokens_seen": 107790800, "step": 49950 }, { "epoch": 8.149265905383361, "grad_norm": 0.05483119934797287, "learning_rate": 5.03882530748446e-06, "loss": 0.0826, "num_input_tokens_seen": 107802736, "step": 49955 }, { "epoch": 8.150081566068515, "grad_norm": 0.07685652375221252, "learning_rate": 5.034540627776618e-06, "loss": 0.0065, "num_input_tokens_seen": 107813552, "step": 49960 }, { "epoch": 8.15089722675367, "grad_norm": 0.048805173486471176, "learning_rate": 5.030257566570215e-06, "loss": 0.1742, "num_input_tokens_seen": 107825264, "step": 49965 }, { "epoch": 8.151712887438826, "grad_norm": 0.058466944843530655, "learning_rate": 5.025976124212461e-06, "loss": 0.0048, "num_input_tokens_seen": 107836880, "step": 49970 }, { "epoch": 8.15252854812398, "grad_norm": 0.09701434522867203, "learning_rate": 5.0216963010504295e-06, "loss": 0.0044, "num_input_tokens_seen": 107849392, "step": 49975 }, { "epoch": 8.153344208809136, "grad_norm": 3.819746732711792, "learning_rate": 5.017418097431059e-06, "loss": 0.0969, "num_input_tokens_seen": 107860144, "step": 49980 }, { "epoch": 8.15415986949429, "grad_norm": 0.054632510989904404, "learning_rate": 5.013141513701173e-06, "loss": 0.0978, "num_input_tokens_seen": 107869520, "step": 49985 }, { "epoch": 8.154975530179446, "grad_norm": 0.19069510698318481, "learning_rate": 5.008866550207447e-06, "loss": 0.1492, "num_input_tokens_seen": 107879792, "step": 49990 }, { "epoch": 8.1557911908646, "grad_norm": 0.7505430579185486, "learning_rate": 5.004593207296434e-06, "loss": 0.0064, "num_input_tokens_seen": 107890672, "step": 49995 }, { "epoch": 8.156606851549755, "grad_norm": 0.06662391871213913, "learning_rate": 5.000321485314552e-06, "loss": 0.105, "num_input_tokens_seen": 107900528, "step": 50000 }, { "epoch": 8.15742251223491, "grad_norm": 0.20353369414806366, "learning_rate": 4.9960513846080885e-06, "loss": 0.2116, "num_input_tokens_seen": 107911248, "step": 50005 }, { "epoch": 8.158238172920065, "grad_norm": 0.11364521086215973, "learning_rate": 4.991782905523196e-06, "loss": 0.003, "num_input_tokens_seen": 107922800, "step": 50010 }, { "epoch": 8.15905383360522, "grad_norm": 8.014961242675781, "learning_rate": 4.987516048405905e-06, "loss": 0.0122, "num_input_tokens_seen": 107934032, "step": 50015 }, { "epoch": 8.159869494290374, "grad_norm": 0.06340456008911133, "learning_rate": 4.983250813602103e-06, "loss": 0.0084, "num_input_tokens_seen": 107946128, "step": 50020 }, { "epoch": 8.16068515497553, "grad_norm": 0.0635066032409668, "learning_rate": 4.978987201457555e-06, "loss": 0.0737, "num_input_tokens_seen": 107956912, "step": 50025 }, { "epoch": 8.161500815660686, "grad_norm": 13.93176555633545, "learning_rate": 4.97472521231789e-06, "loss": 0.0521, "num_input_tokens_seen": 107968112, "step": 50030 }, { "epoch": 8.16231647634584, "grad_norm": 0.04283421114087105, "learning_rate": 4.9704648465286e-06, "loss": 0.0223, "num_input_tokens_seen": 107978352, "step": 50035 }, { "epoch": 8.163132137030995, "grad_norm": 0.055563513189554214, "learning_rate": 4.966206104435064e-06, "loss": 0.0045, "num_input_tokens_seen": 107989264, "step": 50040 }, { "epoch": 8.16394779771615, "grad_norm": 17.561492919921875, "learning_rate": 4.961948986382511e-06, "loss": 0.0892, "num_input_tokens_seen": 107999888, "step": 50045 }, { "epoch": 8.164763458401305, "grad_norm": 0.06243748217821121, "learning_rate": 4.957693492716048e-06, "loss": 0.0994, "num_input_tokens_seen": 108009680, "step": 50050 }, { "epoch": 8.16557911908646, "grad_norm": 0.10302278399467468, "learning_rate": 4.953439623780643e-06, "loss": 0.0101, "num_input_tokens_seen": 108020368, "step": 50055 }, { "epoch": 8.166394779771615, "grad_norm": 0.44655972719192505, "learning_rate": 4.949187379921136e-06, "loss": 0.0458, "num_input_tokens_seen": 108030736, "step": 50060 }, { "epoch": 8.16721044045677, "grad_norm": 0.038387883454561234, "learning_rate": 4.9449367614822384e-06, "loss": 0.0027, "num_input_tokens_seen": 108041232, "step": 50065 }, { "epoch": 8.168026101141924, "grad_norm": 0.08896858245134354, "learning_rate": 4.940687768808525e-06, "loss": 0.0988, "num_input_tokens_seen": 108053264, "step": 50070 }, { "epoch": 8.16884176182708, "grad_norm": 2.4326529502868652, "learning_rate": 4.936440402244441e-06, "loss": 0.005, "num_input_tokens_seen": 108063088, "step": 50075 }, { "epoch": 8.169657422512234, "grad_norm": 0.07476741820573807, "learning_rate": 4.932194662134298e-06, "loss": 0.0056, "num_input_tokens_seen": 108072688, "step": 50080 }, { "epoch": 8.17047308319739, "grad_norm": 2.7072935104370117, "learning_rate": 4.92795054882228e-06, "loss": 0.0825, "num_input_tokens_seen": 108085296, "step": 50085 }, { "epoch": 8.171288743882545, "grad_norm": 0.5258146524429321, "learning_rate": 4.9237080626524294e-06, "loss": 0.0035, "num_input_tokens_seen": 108096816, "step": 50090 }, { "epoch": 8.1721044045677, "grad_norm": 0.061414800584316254, "learning_rate": 4.919467203968675e-06, "loss": 0.1125, "num_input_tokens_seen": 108107280, "step": 50095 }, { "epoch": 8.172920065252855, "grad_norm": 0.07064393162727356, "learning_rate": 4.915227973114797e-06, "loss": 0.0509, "num_input_tokens_seen": 108117552, "step": 50100 }, { "epoch": 8.173735725938009, "grad_norm": 0.07800977677106857, "learning_rate": 4.910990370434449e-06, "loss": 0.0036, "num_input_tokens_seen": 108129264, "step": 50105 }, { "epoch": 8.174551386623165, "grad_norm": 0.19258436560630798, "learning_rate": 4.906754396271152e-06, "loss": 0.01, "num_input_tokens_seen": 108139472, "step": 50110 }, { "epoch": 8.17536704730832, "grad_norm": 0.9658310413360596, "learning_rate": 4.902520050968293e-06, "loss": 0.025, "num_input_tokens_seen": 108149712, "step": 50115 }, { "epoch": 8.176182707993474, "grad_norm": 0.07832033932209015, "learning_rate": 4.898287334869134e-06, "loss": 0.0234, "num_input_tokens_seen": 108160464, "step": 50120 }, { "epoch": 8.17699836867863, "grad_norm": 3.9769275188446045, "learning_rate": 4.8940562483168005e-06, "loss": 0.1665, "num_input_tokens_seen": 108171312, "step": 50125 }, { "epoch": 8.177814029363784, "grad_norm": 0.09289734810590744, "learning_rate": 4.889826791654281e-06, "loss": 0.1265, "num_input_tokens_seen": 108181616, "step": 50130 }, { "epoch": 8.17862969004894, "grad_norm": 0.02215544320642948, "learning_rate": 4.8855989652244415e-06, "loss": 0.2234, "num_input_tokens_seen": 108191792, "step": 50135 }, { "epoch": 8.179445350734095, "grad_norm": 0.12375655770301819, "learning_rate": 4.8813727693700104e-06, "loss": 0.1492, "num_input_tokens_seen": 108203216, "step": 50140 }, { "epoch": 8.18026101141925, "grad_norm": 0.08403193950653076, "learning_rate": 4.877148204433582e-06, "loss": 0.0063, "num_input_tokens_seen": 108214000, "step": 50145 }, { "epoch": 8.181076672104405, "grad_norm": 0.08748190850019455, "learning_rate": 4.872925270757623e-06, "loss": 0.1165, "num_input_tokens_seen": 108224528, "step": 50150 }, { "epoch": 8.181892332789559, "grad_norm": 0.26224926114082336, "learning_rate": 4.868703968684466e-06, "loss": 0.003, "num_input_tokens_seen": 108235088, "step": 50155 }, { "epoch": 8.182707993474715, "grad_norm": 0.2896566092967987, "learning_rate": 4.86448429855631e-06, "loss": 0.0033, "num_input_tokens_seen": 108245616, "step": 50160 }, { "epoch": 8.18352365415987, "grad_norm": 3.337801694869995, "learning_rate": 4.860266260715221e-06, "loss": 0.1663, "num_input_tokens_seen": 108256144, "step": 50165 }, { "epoch": 8.184339314845024, "grad_norm": 0.033951111137866974, "learning_rate": 4.856049855503139e-06, "loss": 0.126, "num_input_tokens_seen": 108267248, "step": 50170 }, { "epoch": 8.18515497553018, "grad_norm": 0.1714457869529724, "learning_rate": 4.8518350832618655e-06, "loss": 0.1017, "num_input_tokens_seen": 108278128, "step": 50175 }, { "epoch": 8.185970636215334, "grad_norm": 0.08282965421676636, "learning_rate": 4.847621944333064e-06, "loss": 0.1976, "num_input_tokens_seen": 108290352, "step": 50180 }, { "epoch": 8.18678629690049, "grad_norm": 0.6131508350372314, "learning_rate": 4.8434104390582855e-06, "loss": 0.0069, "num_input_tokens_seen": 108301776, "step": 50185 }, { "epoch": 8.187601957585644, "grad_norm": 8.647669792175293, "learning_rate": 4.839200567778932e-06, "loss": 0.0964, "num_input_tokens_seen": 108312208, "step": 50190 }, { "epoch": 8.1884176182708, "grad_norm": 4.240434169769287, "learning_rate": 4.834992330836274e-06, "loss": 0.1322, "num_input_tokens_seen": 108322480, "step": 50195 }, { "epoch": 8.189233278955955, "grad_norm": 0.07553743571043015, "learning_rate": 4.8307857285714545e-06, "loss": 0.0039, "num_input_tokens_seen": 108333392, "step": 50200 }, { "epoch": 8.190048939641109, "grad_norm": 0.057867687195539474, "learning_rate": 4.826580761325475e-06, "loss": 0.0041, "num_input_tokens_seen": 108343856, "step": 50205 }, { "epoch": 8.190864600326265, "grad_norm": 0.07736869156360626, "learning_rate": 4.822377429439223e-06, "loss": 0.1104, "num_input_tokens_seen": 108355728, "step": 50210 }, { "epoch": 8.191680261011419, "grad_norm": 0.054170429706573486, "learning_rate": 4.818175733253438e-06, "loss": 0.2302, "num_input_tokens_seen": 108366736, "step": 50215 }, { "epoch": 8.192495921696574, "grad_norm": 6.451676368713379, "learning_rate": 4.813975673108731e-06, "loss": 0.0176, "num_input_tokens_seen": 108376912, "step": 50220 }, { "epoch": 8.19331158238173, "grad_norm": 0.06798990815877914, "learning_rate": 4.809777249345576e-06, "loss": 0.1984, "num_input_tokens_seen": 108387440, "step": 50225 }, { "epoch": 8.194127243066884, "grad_norm": 0.13363580405712128, "learning_rate": 4.8055804623043235e-06, "loss": 0.3014, "num_input_tokens_seen": 108397232, "step": 50230 }, { "epoch": 8.19494290375204, "grad_norm": 0.15777772665023804, "learning_rate": 4.801385312325182e-06, "loss": 0.0044, "num_input_tokens_seen": 108408016, "step": 50235 }, { "epoch": 8.195758564437194, "grad_norm": 6.402616500854492, "learning_rate": 4.7971917997482376e-06, "loss": 0.0928, "num_input_tokens_seen": 108419024, "step": 50240 }, { "epoch": 8.19657422512235, "grad_norm": 0.10083062946796417, "learning_rate": 4.79299992491343e-06, "loss": 0.0031, "num_input_tokens_seen": 108430320, "step": 50245 }, { "epoch": 8.197389885807505, "grad_norm": 0.05472959205508232, "learning_rate": 4.788809688160581e-06, "loss": 0.0046, "num_input_tokens_seen": 108441104, "step": 50250 }, { "epoch": 8.198205546492659, "grad_norm": 0.05518600344657898, "learning_rate": 4.784621089829366e-06, "loss": 0.1861, "num_input_tokens_seen": 108452656, "step": 50255 }, { "epoch": 8.199021207177815, "grad_norm": 3.555570602416992, "learning_rate": 4.780434130259339e-06, "loss": 0.1197, "num_input_tokens_seen": 108462992, "step": 50260 }, { "epoch": 8.199836867862969, "grad_norm": 0.02412850223481655, "learning_rate": 4.7762488097899154e-06, "loss": 0.0082, "num_input_tokens_seen": 108473744, "step": 50265 }, { "epoch": 8.200652528548124, "grad_norm": 0.2734960913658142, "learning_rate": 4.772065128760375e-06, "loss": 0.0066, "num_input_tokens_seen": 108485136, "step": 50270 }, { "epoch": 8.201468189233278, "grad_norm": 0.1171422004699707, "learning_rate": 4.767883087509872e-06, "loss": 0.0686, "num_input_tokens_seen": 108496720, "step": 50275 }, { "epoch": 8.202283849918434, "grad_norm": 4.667135238647461, "learning_rate": 4.763702686377425e-06, "loss": 0.0875, "num_input_tokens_seen": 108508016, "step": 50280 }, { "epoch": 8.20309951060359, "grad_norm": 3.5564005374908447, "learning_rate": 4.759523925701914e-06, "loss": 0.1315, "num_input_tokens_seen": 108517872, "step": 50285 }, { "epoch": 8.203915171288743, "grad_norm": 0.11621157079935074, "learning_rate": 4.7553468058220915e-06, "loss": 0.1008, "num_input_tokens_seen": 108527280, "step": 50290 }, { "epoch": 8.2047308319739, "grad_norm": 0.081459179520607, "learning_rate": 4.751171327076579e-06, "loss": 0.136, "num_input_tokens_seen": 108538192, "step": 50295 }, { "epoch": 8.205546492659053, "grad_norm": 4.702872276306152, "learning_rate": 4.746997489803853e-06, "loss": 0.1771, "num_input_tokens_seen": 108546960, "step": 50300 }, { "epoch": 8.206362153344209, "grad_norm": 0.1000506579875946, "learning_rate": 4.7428252943422794e-06, "loss": 0.0037, "num_input_tokens_seen": 108556912, "step": 50305 }, { "epoch": 8.207177814029365, "grad_norm": 0.04609856382012367, "learning_rate": 4.738654741030074e-06, "loss": 0.006, "num_input_tokens_seen": 108568144, "step": 50310 }, { "epoch": 8.207993474714518, "grad_norm": 0.059388112276792526, "learning_rate": 4.734485830205318e-06, "loss": 0.0026, "num_input_tokens_seen": 108578160, "step": 50315 }, { "epoch": 8.208809135399674, "grad_norm": 0.03651902452111244, "learning_rate": 4.730318562205965e-06, "loss": 0.0806, "num_input_tokens_seen": 108589296, "step": 50320 }, { "epoch": 8.209624796084828, "grad_norm": 0.17606563866138458, "learning_rate": 4.7261529373698404e-06, "loss": 0.0047, "num_input_tokens_seen": 108600240, "step": 50325 }, { "epoch": 8.210440456769984, "grad_norm": 0.2413724958896637, "learning_rate": 4.721988956034626e-06, "loss": 0.0703, "num_input_tokens_seen": 108611056, "step": 50330 }, { "epoch": 8.21125611745514, "grad_norm": 0.13980406522750854, "learning_rate": 4.717826618537874e-06, "loss": 0.161, "num_input_tokens_seen": 108620400, "step": 50335 }, { "epoch": 8.212071778140293, "grad_norm": 0.16237755119800568, "learning_rate": 4.713665925217009e-06, "loss": 0.0288, "num_input_tokens_seen": 108632432, "step": 50340 }, { "epoch": 8.21288743882545, "grad_norm": 5.084513187408447, "learning_rate": 4.709506876409317e-06, "loss": 0.283, "num_input_tokens_seen": 108642896, "step": 50345 }, { "epoch": 8.213703099510603, "grad_norm": 9.8268461227417, "learning_rate": 4.705349472451942e-06, "loss": 0.132, "num_input_tokens_seen": 108652816, "step": 50350 }, { "epoch": 8.214518760195759, "grad_norm": 0.09373872727155685, "learning_rate": 4.70119371368192e-06, "loss": 0.079, "num_input_tokens_seen": 108663792, "step": 50355 }, { "epoch": 8.215334420880913, "grad_norm": 0.08155173808336258, "learning_rate": 4.697039600436132e-06, "loss": 0.0047, "num_input_tokens_seen": 108674896, "step": 50360 }, { "epoch": 8.216150081566068, "grad_norm": 0.20419591665267944, "learning_rate": 4.6928871330513296e-06, "loss": 0.2251, "num_input_tokens_seen": 108685584, "step": 50365 }, { "epoch": 8.216965742251224, "grad_norm": 0.18128938972949982, "learning_rate": 4.6887363118641335e-06, "loss": 0.1375, "num_input_tokens_seen": 108696560, "step": 50370 }, { "epoch": 8.217781402936378, "grad_norm": 0.2932167947292328, "learning_rate": 4.68458713721103e-06, "loss": 0.0053, "num_input_tokens_seen": 108706512, "step": 50375 }, { "epoch": 8.218597063621534, "grad_norm": 4.024229049682617, "learning_rate": 4.680439609428372e-06, "loss": 0.1881, "num_input_tokens_seen": 108717808, "step": 50380 }, { "epoch": 8.219412724306688, "grad_norm": 0.6397793292999268, "learning_rate": 4.676293728852379e-06, "loss": 0.006, "num_input_tokens_seen": 108729040, "step": 50385 }, { "epoch": 8.220228384991843, "grad_norm": 0.0787268728017807, "learning_rate": 4.6721494958191395e-06, "loss": 0.115, "num_input_tokens_seen": 108739536, "step": 50390 }, { "epoch": 8.221044045676999, "grad_norm": 13.2654447555542, "learning_rate": 4.6680069106646014e-06, "loss": 0.0751, "num_input_tokens_seen": 108750640, "step": 50395 }, { "epoch": 8.221859706362153, "grad_norm": 0.34706342220306396, "learning_rate": 4.663865973724591e-06, "loss": 0.0423, "num_input_tokens_seen": 108760368, "step": 50400 }, { "epoch": 8.222675367047309, "grad_norm": 0.12569265067577362, "learning_rate": 4.659726685334786e-06, "loss": 0.0051, "num_input_tokens_seen": 108772336, "step": 50405 }, { "epoch": 8.223491027732463, "grad_norm": 4.785270690917969, "learning_rate": 4.655589045830735e-06, "loss": 0.1103, "num_input_tokens_seen": 108782800, "step": 50410 }, { "epoch": 8.224306688417618, "grad_norm": 0.02386106364428997, "learning_rate": 4.651453055547872e-06, "loss": 0.1051, "num_input_tokens_seen": 108794000, "step": 50415 }, { "epoch": 8.225122349102774, "grad_norm": 0.12528622150421143, "learning_rate": 4.647318714821469e-06, "loss": 0.0025, "num_input_tokens_seen": 108805392, "step": 50420 }, { "epoch": 8.225938009787928, "grad_norm": 1.0077322721481323, "learning_rate": 4.643186023986681e-06, "loss": 0.0081, "num_input_tokens_seen": 108817616, "step": 50425 }, { "epoch": 8.226753670473084, "grad_norm": 0.08704919368028641, "learning_rate": 4.639054983378521e-06, "loss": 0.0055, "num_input_tokens_seen": 108829360, "step": 50430 }, { "epoch": 8.227569331158238, "grad_norm": 2.131197929382324, "learning_rate": 4.634925593331876e-06, "loss": 0.1934, "num_input_tokens_seen": 108839696, "step": 50435 }, { "epoch": 8.228384991843393, "grad_norm": 0.19904682040214539, "learning_rate": 4.630797854181495e-06, "loss": 0.0344, "num_input_tokens_seen": 108851184, "step": 50440 }, { "epoch": 8.229200652528547, "grad_norm": 0.16750061511993408, "learning_rate": 4.626671766261992e-06, "loss": 0.2497, "num_input_tokens_seen": 108861744, "step": 50445 }, { "epoch": 8.230016313213703, "grad_norm": 0.1557835340499878, "learning_rate": 4.622547329907848e-06, "loss": 0.0043, "num_input_tokens_seen": 108872816, "step": 50450 }, { "epoch": 8.230831973898859, "grad_norm": 0.040223173797130585, "learning_rate": 4.618424545453409e-06, "loss": 0.004, "num_input_tokens_seen": 108884688, "step": 50455 }, { "epoch": 8.231647634584013, "grad_norm": 14.621254920959473, "learning_rate": 4.6143034132328955e-06, "loss": 0.0825, "num_input_tokens_seen": 108896208, "step": 50460 }, { "epoch": 8.232463295269168, "grad_norm": 0.15322542190551758, "learning_rate": 4.610183933580381e-06, "loss": 0.1539, "num_input_tokens_seen": 108907536, "step": 50465 }, { "epoch": 8.233278955954322, "grad_norm": 0.017540767788887024, "learning_rate": 4.606066106829815e-06, "loss": 0.0825, "num_input_tokens_seen": 108917360, "step": 50470 }, { "epoch": 8.234094616639478, "grad_norm": 0.05903009697794914, "learning_rate": 4.601949933315009e-06, "loss": 0.2029, "num_input_tokens_seen": 108927952, "step": 50475 }, { "epoch": 8.234910277324634, "grad_norm": 0.2780179977416992, "learning_rate": 4.597835413369639e-06, "loss": 0.0039, "num_input_tokens_seen": 108939248, "step": 50480 }, { "epoch": 8.235725938009788, "grad_norm": 0.2921516001224518, "learning_rate": 4.593722547327248e-06, "loss": 0.1172, "num_input_tokens_seen": 108950832, "step": 50485 }, { "epoch": 8.236541598694943, "grad_norm": 0.1570664346218109, "learning_rate": 4.589611335521249e-06, "loss": 0.0138, "num_input_tokens_seen": 108961488, "step": 50490 }, { "epoch": 8.237357259380097, "grad_norm": 0.3389540910720825, "learning_rate": 4.585501778284912e-06, "loss": 0.1036, "num_input_tokens_seen": 108971888, "step": 50495 }, { "epoch": 8.238172920065253, "grad_norm": 0.07394928485155106, "learning_rate": 4.581393875951387e-06, "loss": 0.0891, "num_input_tokens_seen": 108983856, "step": 50500 }, { "epoch": 8.238988580750409, "grad_norm": 0.04614948108792305, "learning_rate": 4.577287628853677e-06, "loss": 0.3118, "num_input_tokens_seen": 108995408, "step": 50505 }, { "epoch": 8.239804241435563, "grad_norm": 0.04369710758328438, "learning_rate": 4.5731830373246574e-06, "loss": 0.2083, "num_input_tokens_seen": 109006736, "step": 50510 }, { "epoch": 8.240619902120718, "grad_norm": 0.19078224897384644, "learning_rate": 4.5690801016970655e-06, "loss": 0.0039, "num_input_tokens_seen": 109016528, "step": 50515 }, { "epoch": 8.241435562805872, "grad_norm": 0.15958210825920105, "learning_rate": 4.564978822303498e-06, "loss": 0.0959, "num_input_tokens_seen": 109028688, "step": 50520 }, { "epoch": 8.242251223491028, "grad_norm": 0.040741484612226486, "learning_rate": 4.560879199476442e-06, "loss": 0.1921, "num_input_tokens_seen": 109039792, "step": 50525 }, { "epoch": 8.243066884176184, "grad_norm": 0.07980673760175705, "learning_rate": 4.5567812335482244e-06, "loss": 0.004, "num_input_tokens_seen": 109050416, "step": 50530 }, { "epoch": 8.243882544861338, "grad_norm": 0.1288427859544754, "learning_rate": 4.5526849248510475e-06, "loss": 0.0647, "num_input_tokens_seen": 109060816, "step": 50535 }, { "epoch": 8.244698205546493, "grad_norm": 0.0687427669763565, "learning_rate": 4.548590273716979e-06, "loss": 0.0029, "num_input_tokens_seen": 109071920, "step": 50540 }, { "epoch": 8.245513866231647, "grad_norm": 0.36271336674690247, "learning_rate": 4.5444972804779525e-06, "loss": 0.2503, "num_input_tokens_seen": 109082512, "step": 50545 }, { "epoch": 8.246329526916803, "grad_norm": 0.02701878547668457, "learning_rate": 4.540405945465767e-06, "loss": 0.2103, "num_input_tokens_seen": 109093744, "step": 50550 }, { "epoch": 8.247145187601957, "grad_norm": 0.27419909834861755, "learning_rate": 4.536316269012086e-06, "loss": 0.0074, "num_input_tokens_seen": 109104368, "step": 50555 }, { "epoch": 8.247960848287113, "grad_norm": 3.8257272243499756, "learning_rate": 4.532228251448439e-06, "loss": 0.2266, "num_input_tokens_seen": 109115920, "step": 50560 }, { "epoch": 8.248776508972268, "grad_norm": 0.47641947865486145, "learning_rate": 4.528141893106225e-06, "loss": 0.0073, "num_input_tokens_seen": 109127088, "step": 50565 }, { "epoch": 8.249592169657422, "grad_norm": 0.05158061906695366, "learning_rate": 4.5240571943167e-06, "loss": 0.065, "num_input_tokens_seen": 109138736, "step": 50570 }, { "epoch": 8.250407830342578, "grad_norm": 0.043185245245695114, "learning_rate": 4.519974155410992e-06, "loss": 0.1124, "num_input_tokens_seen": 109149488, "step": 50575 }, { "epoch": 8.251223491027732, "grad_norm": 0.44439175724983215, "learning_rate": 4.515892776720096e-06, "loss": 0.0046, "num_input_tokens_seen": 109160656, "step": 50580 }, { "epoch": 8.252039151712887, "grad_norm": 0.0924413651227951, "learning_rate": 4.5118130585748655e-06, "loss": 0.0031, "num_input_tokens_seen": 109171312, "step": 50585 }, { "epoch": 8.252854812398043, "grad_norm": 0.20112235844135284, "learning_rate": 4.507735001306024e-06, "loss": 0.0046, "num_input_tokens_seen": 109182384, "step": 50590 }, { "epoch": 8.253670473083197, "grad_norm": 0.03435681015253067, "learning_rate": 4.503658605244163e-06, "loss": 0.0038, "num_input_tokens_seen": 109191792, "step": 50595 }, { "epoch": 8.254486133768353, "grad_norm": 0.17052924633026123, "learning_rate": 4.499583870719728e-06, "loss": 0.12, "num_input_tokens_seen": 109202960, "step": 50600 }, { "epoch": 8.255301794453507, "grad_norm": 0.034675903618335724, "learning_rate": 4.495510798063046e-06, "loss": 0.0059, "num_input_tokens_seen": 109214288, "step": 50605 }, { "epoch": 8.256117455138662, "grad_norm": 0.029451005160808563, "learning_rate": 4.4914393876042984e-06, "loss": 0.092, "num_input_tokens_seen": 109225328, "step": 50610 }, { "epoch": 8.256933115823816, "grad_norm": 0.1021382138133049, "learning_rate": 4.48736963967353e-06, "loss": 0.1346, "num_input_tokens_seen": 109236016, "step": 50615 }, { "epoch": 8.257748776508972, "grad_norm": 0.2410012185573578, "learning_rate": 4.483301554600655e-06, "loss": 0.2154, "num_input_tokens_seen": 109247344, "step": 50620 }, { "epoch": 8.258564437194128, "grad_norm": 0.19051925837993622, "learning_rate": 4.479235132715462e-06, "loss": 0.0198, "num_input_tokens_seen": 109258864, "step": 50625 }, { "epoch": 8.259380097879282, "grad_norm": 0.040067970752716064, "learning_rate": 4.4751703743475895e-06, "loss": 0.0454, "num_input_tokens_seen": 109270192, "step": 50630 }, { "epoch": 8.260195758564437, "grad_norm": 0.09321948140859604, "learning_rate": 4.47110727982655e-06, "loss": 0.0029, "num_input_tokens_seen": 109280016, "step": 50635 }, { "epoch": 8.261011419249591, "grad_norm": 0.054231688380241394, "learning_rate": 4.467045849481716e-06, "loss": 0.0036, "num_input_tokens_seen": 109290256, "step": 50640 }, { "epoch": 8.261827079934747, "grad_norm": 0.44992342591285706, "learning_rate": 4.462986083642329e-06, "loss": 0.0834, "num_input_tokens_seen": 109300208, "step": 50645 }, { "epoch": 8.262642740619903, "grad_norm": 0.11483272165060043, "learning_rate": 4.4589279826374955e-06, "loss": 0.1073, "num_input_tokens_seen": 109310960, "step": 50650 }, { "epoch": 8.263458401305057, "grad_norm": 0.13658924400806427, "learning_rate": 4.454871546796182e-06, "loss": 0.0845, "num_input_tokens_seen": 109321040, "step": 50655 }, { "epoch": 8.264274061990212, "grad_norm": 0.11663892865180969, "learning_rate": 4.4508167764472254e-06, "loss": 0.1995, "num_input_tokens_seen": 109331056, "step": 50660 }, { "epoch": 8.265089722675366, "grad_norm": 0.16184276342391968, "learning_rate": 4.446763671919321e-06, "loss": 0.1159, "num_input_tokens_seen": 109340816, "step": 50665 }, { "epoch": 8.265905383360522, "grad_norm": 0.14844438433647156, "learning_rate": 4.442712233541046e-06, "loss": 0.0055, "num_input_tokens_seen": 109351984, "step": 50670 }, { "epoch": 8.266721044045678, "grad_norm": 0.35642722249031067, "learning_rate": 4.438662461640825e-06, "loss": 0.0729, "num_input_tokens_seen": 109361616, "step": 50675 }, { "epoch": 8.267536704730832, "grad_norm": 0.12397274374961853, "learning_rate": 4.4346143565469485e-06, "loss": 0.0027, "num_input_tokens_seen": 109372240, "step": 50680 }, { "epoch": 8.268352365415987, "grad_norm": 0.10330452769994736, "learning_rate": 4.430567918587583e-06, "loss": 0.0067, "num_input_tokens_seen": 109383120, "step": 50685 }, { "epoch": 8.269168026101141, "grad_norm": 0.07007979601621628, "learning_rate": 4.42652314809075e-06, "loss": 0.0456, "num_input_tokens_seen": 109391952, "step": 50690 }, { "epoch": 8.269983686786297, "grad_norm": 0.09421925991773605, "learning_rate": 4.4224800453843394e-06, "loss": 0.041, "num_input_tokens_seen": 109402576, "step": 50695 }, { "epoch": 8.270799347471453, "grad_norm": 5.5964508056640625, "learning_rate": 4.418438610796105e-06, "loss": 0.1264, "num_input_tokens_seen": 109412976, "step": 50700 }, { "epoch": 8.271615008156607, "grad_norm": 5.199951171875, "learning_rate": 4.414398844653666e-06, "loss": 0.0822, "num_input_tokens_seen": 109425040, "step": 50705 }, { "epoch": 8.272430668841762, "grad_norm": 0.1465538889169693, "learning_rate": 4.410360747284508e-06, "loss": 0.0053, "num_input_tokens_seen": 109436592, "step": 50710 }, { "epoch": 8.273246329526916, "grad_norm": 3.4535036087036133, "learning_rate": 4.406324319015978e-06, "loss": 0.1077, "num_input_tokens_seen": 109447184, "step": 50715 }, { "epoch": 8.274061990212072, "grad_norm": 0.17683878540992737, "learning_rate": 4.4022895601752905e-06, "loss": 0.0065, "num_input_tokens_seen": 109457616, "step": 50720 }, { "epoch": 8.274877650897226, "grad_norm": 0.07014454901218414, "learning_rate": 4.398256471089518e-06, "loss": 0.0019, "num_input_tokens_seen": 109466896, "step": 50725 }, { "epoch": 8.275693311582382, "grad_norm": 22.82807159423828, "learning_rate": 4.394225052085613e-06, "loss": 0.0455, "num_input_tokens_seen": 109478800, "step": 50730 }, { "epoch": 8.276508972267537, "grad_norm": 0.24154672026634216, "learning_rate": 4.390195303490377e-06, "loss": 0.0285, "num_input_tokens_seen": 109488272, "step": 50735 }, { "epoch": 8.277324632952691, "grad_norm": 0.15540587902069092, "learning_rate": 4.3861672256304835e-06, "loss": 0.0035, "num_input_tokens_seen": 109499248, "step": 50740 }, { "epoch": 8.278140293637847, "grad_norm": 0.19240988790988922, "learning_rate": 4.382140818832467e-06, "loss": 0.1734, "num_input_tokens_seen": 109509584, "step": 50745 }, { "epoch": 8.278955954323001, "grad_norm": 5.04918909072876, "learning_rate": 4.378116083422732e-06, "loss": 0.2013, "num_input_tokens_seen": 109520656, "step": 50750 }, { "epoch": 8.279771615008157, "grad_norm": 0.4211946427822113, "learning_rate": 4.374093019727541e-06, "loss": 0.0063, "num_input_tokens_seen": 109531600, "step": 50755 }, { "epoch": 8.280587275693312, "grad_norm": 0.3730059564113617, "learning_rate": 4.370071628073025e-06, "loss": 0.1024, "num_input_tokens_seen": 109541936, "step": 50760 }, { "epoch": 8.281402936378466, "grad_norm": 0.16246718168258667, "learning_rate": 4.366051908785177e-06, "loss": 0.0036, "num_input_tokens_seen": 109552336, "step": 50765 }, { "epoch": 8.282218597063622, "grad_norm": 0.14463460445404053, "learning_rate": 4.3620338621898575e-06, "loss": 0.2052, "num_input_tokens_seen": 109563376, "step": 50770 }, { "epoch": 8.283034257748776, "grad_norm": 8.738451957702637, "learning_rate": 4.35801748861279e-06, "loss": 0.1017, "num_input_tokens_seen": 109575216, "step": 50775 }, { "epoch": 8.283849918433932, "grad_norm": 0.09802068769931793, "learning_rate": 4.354002788379558e-06, "loss": 0.0492, "num_input_tokens_seen": 109585904, "step": 50780 }, { "epoch": 8.284665579119087, "grad_norm": 0.14776861667633057, "learning_rate": 4.34998976181562e-06, "loss": 0.2586, "num_input_tokens_seen": 109597936, "step": 50785 }, { "epoch": 8.285481239804241, "grad_norm": 0.05964956060051918, "learning_rate": 4.345978409246287e-06, "loss": 0.081, "num_input_tokens_seen": 109609424, "step": 50790 }, { "epoch": 8.286296900489397, "grad_norm": 0.0592818446457386, "learning_rate": 4.341968730996743e-06, "loss": 0.0111, "num_input_tokens_seen": 109621488, "step": 50795 }, { "epoch": 8.28711256117455, "grad_norm": 1.5230180025100708, "learning_rate": 4.337960727392032e-06, "loss": 0.0064, "num_input_tokens_seen": 109631696, "step": 50800 }, { "epoch": 8.287928221859707, "grad_norm": 1.5851444005966187, "learning_rate": 4.333954398757054e-06, "loss": 0.272, "num_input_tokens_seen": 109642544, "step": 50805 }, { "epoch": 8.28874388254486, "grad_norm": 0.3063996434211731, "learning_rate": 4.329949745416598e-06, "loss": 0.0068, "num_input_tokens_seen": 109652752, "step": 50810 }, { "epoch": 8.289559543230016, "grad_norm": 2.3268768787384033, "learning_rate": 4.325946767695297e-06, "loss": 0.0708, "num_input_tokens_seen": 109664496, "step": 50815 }, { "epoch": 8.290375203915172, "grad_norm": 0.16275164484977722, "learning_rate": 4.321945465917646e-06, "loss": 0.1863, "num_input_tokens_seen": 109676752, "step": 50820 }, { "epoch": 8.291190864600326, "grad_norm": 0.11027777194976807, "learning_rate": 4.317945840408019e-06, "loss": 0.0046, "num_input_tokens_seen": 109687792, "step": 50825 }, { "epoch": 8.292006525285482, "grad_norm": 0.08528872579336166, "learning_rate": 4.313947891490638e-06, "loss": 0.0579, "num_input_tokens_seen": 109698416, "step": 50830 }, { "epoch": 8.292822185970635, "grad_norm": 0.08500778675079346, "learning_rate": 4.309951619489597e-06, "loss": 0.1109, "num_input_tokens_seen": 109709488, "step": 50835 }, { "epoch": 8.293637846655791, "grad_norm": 0.14026764035224915, "learning_rate": 4.3059570247288624e-06, "loss": 0.1589, "num_input_tokens_seen": 109720912, "step": 50840 }, { "epoch": 8.294453507340947, "grad_norm": 0.07855894416570663, "learning_rate": 4.301964107532255e-06, "loss": 0.0046, "num_input_tokens_seen": 109731600, "step": 50845 }, { "epoch": 8.2952691680261, "grad_norm": 0.10529959201812744, "learning_rate": 4.297972868223457e-06, "loss": 0.2531, "num_input_tokens_seen": 109742064, "step": 50850 }, { "epoch": 8.296084828711257, "grad_norm": 0.0845145657658577, "learning_rate": 4.293983307126018e-06, "loss": 0.0251, "num_input_tokens_seen": 109752848, "step": 50855 }, { "epoch": 8.29690048939641, "grad_norm": 0.08708660304546356, "learning_rate": 4.289995424563353e-06, "loss": 0.0059, "num_input_tokens_seen": 109763728, "step": 50860 }, { "epoch": 8.297716150081566, "grad_norm": 0.2024458646774292, "learning_rate": 4.286009220858742e-06, "loss": 0.288, "num_input_tokens_seen": 109774864, "step": 50865 }, { "epoch": 8.298531810766722, "grad_norm": 0.178129643201828, "learning_rate": 4.282024696335324e-06, "loss": 0.0056, "num_input_tokens_seen": 109785296, "step": 50870 }, { "epoch": 8.299347471451876, "grad_norm": 3.1330602169036865, "learning_rate": 4.278041851316106e-06, "loss": 0.0069, "num_input_tokens_seen": 109795856, "step": 50875 }, { "epoch": 8.300163132137031, "grad_norm": 3.9104366302490234, "learning_rate": 4.274060686123959e-06, "loss": 0.1223, "num_input_tokens_seen": 109806480, "step": 50880 }, { "epoch": 8.300978792822185, "grad_norm": 0.0885840579867363, "learning_rate": 4.270081201081613e-06, "loss": 0.0029, "num_input_tokens_seen": 109817680, "step": 50885 }, { "epoch": 8.301794453507341, "grad_norm": 0.03894002363085747, "learning_rate": 4.2661033965116695e-06, "loss": 0.0032, "num_input_tokens_seen": 109829616, "step": 50890 }, { "epoch": 8.302610114192497, "grad_norm": 0.11713149398565292, "learning_rate": 4.2621272727365875e-06, "loss": 0.0605, "num_input_tokens_seen": 109839632, "step": 50895 }, { "epoch": 8.30342577487765, "grad_norm": 0.26046085357666016, "learning_rate": 4.2581528300786906e-06, "loss": 0.0042, "num_input_tokens_seen": 109851344, "step": 50900 }, { "epoch": 8.304241435562806, "grad_norm": 0.04204858839511871, "learning_rate": 4.2541800688601696e-06, "loss": 0.1063, "num_input_tokens_seen": 109861072, "step": 50905 }, { "epoch": 8.30505709624796, "grad_norm": 0.10443831980228424, "learning_rate": 4.250208989403073e-06, "loss": 0.1402, "num_input_tokens_seen": 109872144, "step": 50910 }, { "epoch": 8.305872756933116, "grad_norm": 0.14044144749641418, "learning_rate": 4.2462395920293215e-06, "loss": 0.0688, "num_input_tokens_seen": 109883152, "step": 50915 }, { "epoch": 8.30668841761827, "grad_norm": 0.03933459892868996, "learning_rate": 4.242271877060691e-06, "loss": 0.0094, "num_input_tokens_seen": 109894672, "step": 50920 }, { "epoch": 8.307504078303426, "grad_norm": 0.3973071873188019, "learning_rate": 4.238305844818827e-06, "loss": 0.1753, "num_input_tokens_seen": 109905456, "step": 50925 }, { "epoch": 8.308319738988581, "grad_norm": 4.436182022094727, "learning_rate": 4.234341495625233e-06, "loss": 0.3647, "num_input_tokens_seen": 109915024, "step": 50930 }, { "epoch": 8.309135399673735, "grad_norm": 0.1660321056842804, "learning_rate": 4.230378829801282e-06, "loss": 0.1378, "num_input_tokens_seen": 109923728, "step": 50935 }, { "epoch": 8.309951060358891, "grad_norm": 0.21303358674049377, "learning_rate": 4.226417847668201e-06, "loss": 0.0951, "num_input_tokens_seen": 109934128, "step": 50940 }, { "epoch": 8.310766721044045, "grad_norm": 0.03064492531120777, "learning_rate": 4.222458549547101e-06, "loss": 0.0054, "num_input_tokens_seen": 109945072, "step": 50945 }, { "epoch": 8.3115823817292, "grad_norm": 0.3193550705909729, "learning_rate": 4.218500935758935e-06, "loss": 0.1199, "num_input_tokens_seen": 109955184, "step": 50950 }, { "epoch": 8.312398042414356, "grad_norm": 0.0818767175078392, "learning_rate": 4.214545006624526e-06, "loss": 0.1143, "num_input_tokens_seen": 109965232, "step": 50955 }, { "epoch": 8.31321370309951, "grad_norm": 0.04161890223622322, "learning_rate": 4.210590762464564e-06, "loss": 0.0046, "num_input_tokens_seen": 109976944, "step": 50960 }, { "epoch": 8.314029363784666, "grad_norm": 0.04804445430636406, "learning_rate": 4.206638203599597e-06, "loss": 0.1408, "num_input_tokens_seen": 109987504, "step": 50965 }, { "epoch": 8.31484502446982, "grad_norm": 0.11469604074954987, "learning_rate": 4.202687330350044e-06, "loss": 0.0958, "num_input_tokens_seen": 109998160, "step": 50970 }, { "epoch": 8.315660685154976, "grad_norm": 0.058986835181713104, "learning_rate": 4.1987381430361735e-06, "loss": 0.0617, "num_input_tokens_seen": 110008464, "step": 50975 }, { "epoch": 8.31647634584013, "grad_norm": 5.525466442108154, "learning_rate": 4.194790641978141e-06, "loss": 0.1832, "num_input_tokens_seen": 110020080, "step": 50980 }, { "epoch": 8.317292006525285, "grad_norm": 0.0891265943646431, "learning_rate": 4.1908448274959436e-06, "loss": 0.0031, "num_input_tokens_seen": 110029104, "step": 50985 }, { "epoch": 8.318107667210441, "grad_norm": 0.3449834883213043, "learning_rate": 4.186900699909446e-06, "loss": 0.0549, "num_input_tokens_seen": 110040528, "step": 50990 }, { "epoch": 8.318923327895595, "grad_norm": 0.04940696805715561, "learning_rate": 4.182958259538386e-06, "loss": 0.0297, "num_input_tokens_seen": 110049744, "step": 50995 }, { "epoch": 8.31973898858075, "grad_norm": 0.1019030436873436, "learning_rate": 4.179017506702351e-06, "loss": 0.0028, "num_input_tokens_seen": 110061680, "step": 51000 }, { "epoch": 8.320554649265905, "grad_norm": 0.11362658441066742, "learning_rate": 4.1750784417208065e-06, "loss": 0.0953, "num_input_tokens_seen": 110072880, "step": 51005 }, { "epoch": 8.32137030995106, "grad_norm": 0.036861419677734375, "learning_rate": 4.171141064913061e-06, "loss": 0.1668, "num_input_tokens_seen": 110084464, "step": 51010 }, { "epoch": 8.322185970636216, "grad_norm": 0.1589759886264801, "learning_rate": 4.16720537659831e-06, "loss": 0.0877, "num_input_tokens_seen": 110095536, "step": 51015 }, { "epoch": 8.32300163132137, "grad_norm": 0.09122513979673386, "learning_rate": 4.1632713770955956e-06, "loss": 0.149, "num_input_tokens_seen": 110104496, "step": 51020 }, { "epoch": 8.323817292006526, "grad_norm": 0.020270230248570442, "learning_rate": 4.159339066723827e-06, "loss": 0.0373, "num_input_tokens_seen": 110115888, "step": 51025 }, { "epoch": 8.32463295269168, "grad_norm": 0.06882867217063904, "learning_rate": 4.155408445801779e-06, "loss": 0.0067, "num_input_tokens_seen": 110125104, "step": 51030 }, { "epoch": 8.325448613376835, "grad_norm": 0.05521862953901291, "learning_rate": 4.151479514648085e-06, "loss": 0.0051, "num_input_tokens_seen": 110135728, "step": 51035 }, { "epoch": 8.326264274061991, "grad_norm": 0.1626533567905426, "learning_rate": 4.147552273581248e-06, "loss": 0.0091, "num_input_tokens_seen": 110146832, "step": 51040 }, { "epoch": 8.327079934747145, "grad_norm": 0.17563822865486145, "learning_rate": 4.143626722919619e-06, "loss": 0.1437, "num_input_tokens_seen": 110157520, "step": 51045 }, { "epoch": 8.3278955954323, "grad_norm": 0.06099024415016174, "learning_rate": 4.139702862981443e-06, "loss": 0.1121, "num_input_tokens_seen": 110168688, "step": 51050 }, { "epoch": 8.328711256117455, "grad_norm": 0.0700126588344574, "learning_rate": 4.135780694084793e-06, "loss": 0.033, "num_input_tokens_seen": 110179056, "step": 51055 }, { "epoch": 8.32952691680261, "grad_norm": 0.08973924815654755, "learning_rate": 4.131860216547623e-06, "loss": 0.0089, "num_input_tokens_seen": 110190160, "step": 51060 }, { "epoch": 8.330342577487766, "grad_norm": 0.04193677008152008, "learning_rate": 4.127941430687751e-06, "loss": 0.0057, "num_input_tokens_seen": 110201424, "step": 51065 }, { "epoch": 8.33115823817292, "grad_norm": 0.06412708014249802, "learning_rate": 4.1240243368228485e-06, "loss": 0.0054, "num_input_tokens_seen": 110212304, "step": 51070 }, { "epoch": 8.331973898858076, "grad_norm": 0.16163991391658783, "learning_rate": 4.120108935270459e-06, "loss": 0.0066, "num_input_tokens_seen": 110223216, "step": 51075 }, { "epoch": 8.33278955954323, "grad_norm": 0.03767704963684082, "learning_rate": 4.11619522634798e-06, "loss": 0.3622, "num_input_tokens_seen": 110234160, "step": 51080 }, { "epoch": 8.333605220228385, "grad_norm": 0.2308705896139145, "learning_rate": 4.11228321037268e-06, "loss": 0.1111, "num_input_tokens_seen": 110245264, "step": 51085 }, { "epoch": 8.33442088091354, "grad_norm": 0.10310493409633636, "learning_rate": 4.108372887661688e-06, "loss": 0.0036, "num_input_tokens_seen": 110256560, "step": 51090 }, { "epoch": 8.335236541598695, "grad_norm": 0.3172813653945923, "learning_rate": 4.10446425853199e-06, "loss": 0.2201, "num_input_tokens_seen": 110267920, "step": 51095 }, { "epoch": 8.33605220228385, "grad_norm": 0.053970981389284134, "learning_rate": 4.100557323300444e-06, "loss": 0.294, "num_input_tokens_seen": 110278032, "step": 51100 }, { "epoch": 8.336867862969005, "grad_norm": 0.13745872676372528, "learning_rate": 4.096652082283764e-06, "loss": 0.0041, "num_input_tokens_seen": 110289232, "step": 51105 }, { "epoch": 8.33768352365416, "grad_norm": 0.6048405766487122, "learning_rate": 4.092748535798527e-06, "loss": 0.0346, "num_input_tokens_seen": 110299664, "step": 51110 }, { "epoch": 8.338499184339314, "grad_norm": 0.15191605687141418, "learning_rate": 4.088846684161177e-06, "loss": 0.1041, "num_input_tokens_seen": 110310480, "step": 51115 }, { "epoch": 8.33931484502447, "grad_norm": 0.08844520896673203, "learning_rate": 4.0849465276880105e-06, "loss": 0.0044, "num_input_tokens_seen": 110322640, "step": 51120 }, { "epoch": 8.340130505709626, "grad_norm": 0.06009729579091072, "learning_rate": 4.081048066695209e-06, "loss": 0.0032, "num_input_tokens_seen": 110334992, "step": 51125 }, { "epoch": 8.34094616639478, "grad_norm": 0.10387653857469559, "learning_rate": 4.077151301498791e-06, "loss": 0.0048, "num_input_tokens_seen": 110345616, "step": 51130 }, { "epoch": 8.341761827079935, "grad_norm": 0.2893862724304199, "learning_rate": 4.073256232414649e-06, "loss": 0.0067, "num_input_tokens_seen": 110356752, "step": 51135 }, { "epoch": 8.34257748776509, "grad_norm": 0.06195401772856712, "learning_rate": 4.069362859758541e-06, "loss": 0.0081, "num_input_tokens_seen": 110367568, "step": 51140 }, { "epoch": 8.343393148450245, "grad_norm": 0.12342137843370438, "learning_rate": 4.065471183846079e-06, "loss": 0.0098, "num_input_tokens_seen": 110378576, "step": 51145 }, { "epoch": 8.3442088091354, "grad_norm": 0.15264281630516052, "learning_rate": 4.061581204992742e-06, "loss": 0.0053, "num_input_tokens_seen": 110389072, "step": 51150 }, { "epoch": 8.345024469820554, "grad_norm": 0.1557842642068863, "learning_rate": 4.057692923513867e-06, "loss": 0.1976, "num_input_tokens_seen": 110400464, "step": 51155 }, { "epoch": 8.34584013050571, "grad_norm": 0.06787768751382828, "learning_rate": 4.0538063397246725e-06, "loss": 0.1965, "num_input_tokens_seen": 110411728, "step": 51160 }, { "epoch": 8.346655791190864, "grad_norm": 0.08586647361516953, "learning_rate": 4.049921453940214e-06, "loss": 0.0123, "num_input_tokens_seen": 110422480, "step": 51165 }, { "epoch": 8.34747145187602, "grad_norm": 0.09865080565214157, "learning_rate": 4.046038266475421e-06, "loss": 0.0047, "num_input_tokens_seen": 110433680, "step": 51170 }, { "epoch": 8.348287112561174, "grad_norm": 0.14197717607021332, "learning_rate": 4.0421567776450895e-06, "loss": 0.0137, "num_input_tokens_seen": 110442512, "step": 51175 }, { "epoch": 8.34910277324633, "grad_norm": 0.08103757351636887, "learning_rate": 4.038276987763864e-06, "loss": 0.0036, "num_input_tokens_seen": 110454032, "step": 51180 }, { "epoch": 8.349918433931485, "grad_norm": 4.606961727142334, "learning_rate": 4.034398897146269e-06, "loss": 0.1384, "num_input_tokens_seen": 110464432, "step": 51185 }, { "epoch": 8.350734094616639, "grad_norm": 0.07641594856977463, "learning_rate": 4.0305225061066735e-06, "loss": 0.1245, "num_input_tokens_seen": 110476176, "step": 51190 }, { "epoch": 8.351549755301795, "grad_norm": 0.10787976533174515, "learning_rate": 4.026647814959325e-06, "loss": 0.1265, "num_input_tokens_seen": 110486800, "step": 51195 }, { "epoch": 8.352365415986949, "grad_norm": 0.13637679815292358, "learning_rate": 4.022774824018321e-06, "loss": 0.2487, "num_input_tokens_seen": 110498448, "step": 51200 }, { "epoch": 8.353181076672104, "grad_norm": 0.05383555591106415, "learning_rate": 4.018903533597629e-06, "loss": 0.0043, "num_input_tokens_seen": 110510064, "step": 51205 }, { "epoch": 8.35399673735726, "grad_norm": 26.989179611206055, "learning_rate": 4.015033944011071e-06, "loss": 0.0466, "num_input_tokens_seen": 110522352, "step": 51210 }, { "epoch": 8.354812398042414, "grad_norm": 0.055170685052871704, "learning_rate": 4.011166055572338e-06, "loss": 0.0034, "num_input_tokens_seen": 110534640, "step": 51215 }, { "epoch": 8.35562805872757, "grad_norm": 0.13349996507167816, "learning_rate": 4.007299868594983e-06, "loss": 0.0754, "num_input_tokens_seen": 110546608, "step": 51220 }, { "epoch": 8.356443719412724, "grad_norm": 0.11645912379026413, "learning_rate": 4.003435383392415e-06, "loss": 0.0661, "num_input_tokens_seen": 110556656, "step": 51225 }, { "epoch": 8.35725938009788, "grad_norm": 0.5707449316978455, "learning_rate": 3.999572600277912e-06, "loss": 0.1064, "num_input_tokens_seen": 110567184, "step": 51230 }, { "epoch": 8.358075040783035, "grad_norm": 0.15371613204479218, "learning_rate": 3.995711519564607e-06, "loss": 0.0038, "num_input_tokens_seen": 110576848, "step": 51235 }, { "epoch": 8.358890701468189, "grad_norm": 0.08689006417989731, "learning_rate": 3.991852141565503e-06, "loss": 0.0489, "num_input_tokens_seen": 110587920, "step": 51240 }, { "epoch": 8.359706362153345, "grad_norm": 0.08142328262329102, "learning_rate": 3.987994466593456e-06, "loss": 0.0912, "num_input_tokens_seen": 110599184, "step": 51245 }, { "epoch": 8.360522022838499, "grad_norm": 0.19214679300785065, "learning_rate": 3.9841384949611924e-06, "loss": 0.0047, "num_input_tokens_seen": 110610768, "step": 51250 }, { "epoch": 8.361337683523654, "grad_norm": 0.4272608160972595, "learning_rate": 3.980284226981299e-06, "loss": 0.251, "num_input_tokens_seen": 110621136, "step": 51255 }, { "epoch": 8.362153344208808, "grad_norm": 0.11546285450458527, "learning_rate": 3.976431662966209e-06, "loss": 0.0334, "num_input_tokens_seen": 110631088, "step": 51260 }, { "epoch": 8.362969004893964, "grad_norm": 0.2972230613231659, "learning_rate": 3.972580803228249e-06, "loss": 0.0172, "num_input_tokens_seen": 110643184, "step": 51265 }, { "epoch": 8.36378466557912, "grad_norm": 3.7936272621154785, "learning_rate": 3.96873164807958e-06, "loss": 0.1162, "num_input_tokens_seen": 110653712, "step": 51270 }, { "epoch": 8.364600326264274, "grad_norm": 0.05377197265625, "learning_rate": 3.964884197832236e-06, "loss": 0.1022, "num_input_tokens_seen": 110663600, "step": 51275 }, { "epoch": 8.36541598694943, "grad_norm": 3.144099235534668, "learning_rate": 3.96103845279811e-06, "loss": 0.0973, "num_input_tokens_seen": 110674192, "step": 51280 }, { "epoch": 8.366231647634583, "grad_norm": 0.0675184354186058, "learning_rate": 3.957194413288956e-06, "loss": 0.1611, "num_input_tokens_seen": 110684880, "step": 51285 }, { "epoch": 8.367047308319739, "grad_norm": 0.0853884220123291, "learning_rate": 3.953352079616387e-06, "loss": 0.0214, "num_input_tokens_seen": 110695120, "step": 51290 }, { "epoch": 8.367862969004895, "grad_norm": 0.12090042978525162, "learning_rate": 3.949511452091898e-06, "loss": 0.0043, "num_input_tokens_seen": 110704912, "step": 51295 }, { "epoch": 8.368678629690049, "grad_norm": 0.08174553513526917, "learning_rate": 3.945672531026817e-06, "loss": 0.0104, "num_input_tokens_seen": 110715952, "step": 51300 }, { "epoch": 8.369494290375204, "grad_norm": 0.08900509774684906, "learning_rate": 3.941835316732348e-06, "loss": 0.1461, "num_input_tokens_seen": 110727312, "step": 51305 }, { "epoch": 8.370309951060358, "grad_norm": 0.037323251366615295, "learning_rate": 3.9379998095195606e-06, "loss": 0.0391, "num_input_tokens_seen": 110738448, "step": 51310 }, { "epoch": 8.371125611745514, "grad_norm": 3.7181408405303955, "learning_rate": 3.9341660096993725e-06, "loss": 0.1128, "num_input_tokens_seen": 110750960, "step": 51315 }, { "epoch": 8.37194127243067, "grad_norm": 0.10436207801103592, "learning_rate": 3.9303339175825736e-06, "loss": 0.0033, "num_input_tokens_seen": 110761584, "step": 51320 }, { "epoch": 8.372756933115824, "grad_norm": 0.0793977677822113, "learning_rate": 3.926503533479817e-06, "loss": 0.002, "num_input_tokens_seen": 110771792, "step": 51325 }, { "epoch": 8.37357259380098, "grad_norm": 3.5391643047332764, "learning_rate": 3.922674857701608e-06, "loss": 0.1114, "num_input_tokens_seen": 110783472, "step": 51330 }, { "epoch": 8.374388254486133, "grad_norm": 0.13794291019439697, "learning_rate": 3.918847890558322e-06, "loss": 0.0848, "num_input_tokens_seen": 110793584, "step": 51335 }, { "epoch": 8.375203915171289, "grad_norm": 0.11728204041719437, "learning_rate": 3.915022632360188e-06, "loss": 0.0074, "num_input_tokens_seen": 110803440, "step": 51340 }, { "epoch": 8.376019575856443, "grad_norm": 0.1369055062532425, "learning_rate": 3.911199083417305e-06, "loss": 0.0033, "num_input_tokens_seen": 110813808, "step": 51345 }, { "epoch": 8.376835236541599, "grad_norm": 1.1726471185684204, "learning_rate": 3.9073772440396285e-06, "loss": 0.0797, "num_input_tokens_seen": 110824016, "step": 51350 }, { "epoch": 8.377650897226754, "grad_norm": 0.16108562052249908, "learning_rate": 3.903557114536973e-06, "loss": 0.0057, "num_input_tokens_seen": 110834672, "step": 51355 }, { "epoch": 8.378466557911908, "grad_norm": 0.060831084847450256, "learning_rate": 3.899738695219024e-06, "loss": 0.0019, "num_input_tokens_seen": 110845456, "step": 51360 }, { "epoch": 8.379282218597064, "grad_norm": 0.15530067682266235, "learning_rate": 3.89592198639531e-06, "loss": 0.1605, "num_input_tokens_seen": 110856080, "step": 51365 }, { "epoch": 8.380097879282218, "grad_norm": 0.2525686025619507, "learning_rate": 3.8921069883752465e-06, "loss": 0.1055, "num_input_tokens_seen": 110867056, "step": 51370 }, { "epoch": 8.380913539967374, "grad_norm": 23.300146102905273, "learning_rate": 3.88829370146809e-06, "loss": 0.04, "num_input_tokens_seen": 110878352, "step": 51375 }, { "epoch": 8.38172920065253, "grad_norm": 0.14016051590442657, "learning_rate": 3.884482125982969e-06, "loss": 0.0037, "num_input_tokens_seen": 110889584, "step": 51380 }, { "epoch": 8.382544861337683, "grad_norm": 0.37360477447509766, "learning_rate": 3.880672262228863e-06, "loss": 0.0068, "num_input_tokens_seen": 110900464, "step": 51385 }, { "epoch": 8.383360522022839, "grad_norm": 0.07993663847446442, "learning_rate": 3.876864110514622e-06, "loss": 0.1006, "num_input_tokens_seen": 110911568, "step": 51390 }, { "epoch": 8.384176182707993, "grad_norm": 7.038994312286377, "learning_rate": 3.8730576711489555e-06, "loss": 0.0923, "num_input_tokens_seen": 110922640, "step": 51395 }, { "epoch": 8.384991843393149, "grad_norm": 0.09898694604635239, "learning_rate": 3.86925294444043e-06, "loss": 0.1509, "num_input_tokens_seen": 110933872, "step": 51400 }, { "epoch": 8.385807504078304, "grad_norm": 0.07565029710531235, "learning_rate": 3.8654499306974765e-06, "loss": 0.0961, "num_input_tokens_seen": 110945648, "step": 51405 }, { "epoch": 8.386623164763458, "grad_norm": 0.04033258929848671, "learning_rate": 3.86164863022839e-06, "loss": 0.004, "num_input_tokens_seen": 110956528, "step": 51410 }, { "epoch": 8.387438825448614, "grad_norm": 4.092426300048828, "learning_rate": 3.857849043341316e-06, "loss": 0.0125, "num_input_tokens_seen": 110966224, "step": 51415 }, { "epoch": 8.388254486133768, "grad_norm": 0.13605235517024994, "learning_rate": 3.854051170344278e-06, "loss": 0.0566, "num_input_tokens_seen": 110976912, "step": 51420 }, { "epoch": 8.389070146818923, "grad_norm": 0.10848422348499298, "learning_rate": 3.8502550115451425e-06, "loss": 0.0046, "num_input_tokens_seen": 110988080, "step": 51425 }, { "epoch": 8.38988580750408, "grad_norm": 0.08110389113426208, "learning_rate": 3.846460567251648e-06, "loss": 0.0045, "num_input_tokens_seen": 110999472, "step": 51430 }, { "epoch": 8.390701468189233, "grad_norm": 0.13696368038654327, "learning_rate": 3.8426678377713884e-06, "loss": 0.0685, "num_input_tokens_seen": 111011184, "step": 51435 }, { "epoch": 8.391517128874389, "grad_norm": 0.05284971371293068, "learning_rate": 3.8388768234118275e-06, "loss": 0.0027, "num_input_tokens_seen": 111022480, "step": 51440 }, { "epoch": 8.392332789559543, "grad_norm": 0.08694755285978317, "learning_rate": 3.8350875244802855e-06, "loss": 0.1245, "num_input_tokens_seen": 111032912, "step": 51445 }, { "epoch": 8.393148450244698, "grad_norm": 4.483850955963135, "learning_rate": 3.831299941283936e-06, "loss": 0.0805, "num_input_tokens_seen": 111044272, "step": 51450 }, { "epoch": 8.393964110929852, "grad_norm": 14.72894287109375, "learning_rate": 3.827514074129823e-06, "loss": 0.0379, "num_input_tokens_seen": 111054704, "step": 51455 }, { "epoch": 8.394779771615008, "grad_norm": 0.12237244099378586, "learning_rate": 3.823729923324848e-06, "loss": 0.0032, "num_input_tokens_seen": 111065040, "step": 51460 }, { "epoch": 8.395595432300164, "grad_norm": 0.025830551981925964, "learning_rate": 3.819947489175771e-06, "loss": 0.004, "num_input_tokens_seen": 111077200, "step": 51465 }, { "epoch": 8.396411092985318, "grad_norm": 0.20611804723739624, "learning_rate": 3.816166771989218e-06, "loss": 0.1059, "num_input_tokens_seen": 111088016, "step": 51470 }, { "epoch": 8.397226753670473, "grad_norm": 0.1460016667842865, "learning_rate": 3.812387772071668e-06, "loss": 0.0431, "num_input_tokens_seen": 111098320, "step": 51475 }, { "epoch": 8.398042414355627, "grad_norm": 0.05462673678994179, "learning_rate": 3.808610489729472e-06, "loss": 0.008, "num_input_tokens_seen": 111108528, "step": 51480 }, { "epoch": 8.398858075040783, "grad_norm": 0.06848473101854324, "learning_rate": 3.804834925268838e-06, "loss": 0.1276, "num_input_tokens_seen": 111120080, "step": 51485 }, { "epoch": 8.399673735725939, "grad_norm": 0.0810776874423027, "learning_rate": 3.801061078995827e-06, "loss": 0.0591, "num_input_tokens_seen": 111130512, "step": 51490 }, { "epoch": 8.400489396411093, "grad_norm": 0.10097594559192657, "learning_rate": 3.7972889512163656e-06, "loss": 0.202, "num_input_tokens_seen": 111141520, "step": 51495 }, { "epoch": 8.401305057096248, "grad_norm": 0.04713840037584305, "learning_rate": 3.7935185422362433e-06, "loss": 0.0203, "num_input_tokens_seen": 111152112, "step": 51500 }, { "epoch": 8.402120717781402, "grad_norm": 0.49332499504089355, "learning_rate": 3.7897498523611104e-06, "loss": 0.0081, "num_input_tokens_seen": 111162896, "step": 51505 }, { "epoch": 8.402936378466558, "grad_norm": 0.20173262059688568, "learning_rate": 3.7859828818964716e-06, "loss": 0.222, "num_input_tokens_seen": 111173616, "step": 51510 }, { "epoch": 8.403752039151712, "grad_norm": 0.02554556354880333, "learning_rate": 3.7822176311477027e-06, "loss": 0.0211, "num_input_tokens_seen": 111183856, "step": 51515 }, { "epoch": 8.404567699836868, "grad_norm": 0.07756847888231277, "learning_rate": 3.7784541004200287e-06, "loss": 0.1304, "num_input_tokens_seen": 111193936, "step": 51520 }, { "epoch": 8.405383360522023, "grad_norm": 0.04575493559241295, "learning_rate": 3.774692290018542e-06, "loss": 0.0981, "num_input_tokens_seen": 111204912, "step": 51525 }, { "epoch": 8.406199021207177, "grad_norm": 0.09417518228292465, "learning_rate": 3.770932200248195e-06, "loss": 0.0824, "num_input_tokens_seen": 111215408, "step": 51530 }, { "epoch": 8.407014681892333, "grad_norm": 0.03317214548587799, "learning_rate": 3.7671738314137978e-06, "loss": 0.1031, "num_input_tokens_seen": 111227056, "step": 51535 }, { "epoch": 8.407830342577487, "grad_norm": 4.939886569976807, "learning_rate": 3.7634171838200253e-06, "loss": 0.0707, "num_input_tokens_seen": 111238512, "step": 51540 }, { "epoch": 8.408646003262643, "grad_norm": 0.10969651490449905, "learning_rate": 3.75966225777141e-06, "loss": 0.012, "num_input_tokens_seen": 111249392, "step": 51545 }, { "epoch": 8.409461663947798, "grad_norm": 4.015342712402344, "learning_rate": 3.7559090535723427e-06, "loss": 0.2732, "num_input_tokens_seen": 111259600, "step": 51550 }, { "epoch": 8.410277324632952, "grad_norm": 0.07740370184183121, "learning_rate": 3.7521575715270817e-06, "loss": 0.0048, "num_input_tokens_seen": 111270032, "step": 51555 }, { "epoch": 8.411092985318108, "grad_norm": 0.08327538520097733, "learning_rate": 3.748407811939736e-06, "loss": 0.2263, "num_input_tokens_seen": 111279504, "step": 51560 }, { "epoch": 8.411908646003262, "grad_norm": 3.396444320678711, "learning_rate": 3.7446597751142844e-06, "loss": 0.1199, "num_input_tokens_seen": 111291504, "step": 51565 }, { "epoch": 8.412724306688418, "grad_norm": 0.05676591768860817, "learning_rate": 3.7409134613545587e-06, "loss": 0.1198, "num_input_tokens_seen": 111303152, "step": 51570 }, { "epoch": 8.413539967373573, "grad_norm": 6.9310150146484375, "learning_rate": 3.7371688709642555e-06, "loss": 0.0769, "num_input_tokens_seen": 111313232, "step": 51575 }, { "epoch": 8.414355628058727, "grad_norm": 0.03313552960753441, "learning_rate": 3.7334260042469232e-06, "loss": 0.0071, "num_input_tokens_seen": 111324240, "step": 51580 }, { "epoch": 8.415171288743883, "grad_norm": 0.03135775774717331, "learning_rate": 3.7296848615059913e-06, "loss": 0.1245, "num_input_tokens_seen": 111335056, "step": 51585 }, { "epoch": 8.415986949429037, "grad_norm": 7.66591739654541, "learning_rate": 3.725945443044729e-06, "loss": 0.0955, "num_input_tokens_seen": 111345200, "step": 51590 }, { "epoch": 8.416802610114193, "grad_norm": 0.11677058041095734, "learning_rate": 3.722207749166273e-06, "loss": 0.02, "num_input_tokens_seen": 111354640, "step": 51595 }, { "epoch": 8.417618270799348, "grad_norm": 0.06249038875102997, "learning_rate": 3.7184717801736186e-06, "loss": 0.0382, "num_input_tokens_seen": 111365008, "step": 51600 }, { "epoch": 8.418433931484502, "grad_norm": 0.1685234159231186, "learning_rate": 3.7147375363696168e-06, "loss": 0.0441, "num_input_tokens_seen": 111376080, "step": 51605 }, { "epoch": 8.419249592169658, "grad_norm": 0.15778842568397522, "learning_rate": 3.7110050180569985e-06, "loss": 0.1111, "num_input_tokens_seen": 111387376, "step": 51610 }, { "epoch": 8.420065252854812, "grad_norm": 0.035594161599874496, "learning_rate": 3.707274225538332e-06, "loss": 0.0121, "num_input_tokens_seen": 111398224, "step": 51615 }, { "epoch": 8.420880913539968, "grad_norm": 3.7639381885528564, "learning_rate": 3.7035451591160535e-06, "loss": 0.2597, "num_input_tokens_seen": 111408624, "step": 51620 }, { "epoch": 8.421696574225122, "grad_norm": 0.0966758206486702, "learning_rate": 3.699817819092463e-06, "loss": 0.004, "num_input_tokens_seen": 111419664, "step": 51625 }, { "epoch": 8.422512234910277, "grad_norm": 0.11365343630313873, "learning_rate": 3.6960922057697163e-06, "loss": 0.0726, "num_input_tokens_seen": 111430480, "step": 51630 }, { "epoch": 8.423327895595433, "grad_norm": 0.019891297444701195, "learning_rate": 3.6923683194498295e-06, "loss": 0.207, "num_input_tokens_seen": 111439984, "step": 51635 }, { "epoch": 8.424143556280587, "grad_norm": 0.10552123188972473, "learning_rate": 3.6886461604346807e-06, "loss": 0.0046, "num_input_tokens_seen": 111451472, "step": 51640 }, { "epoch": 8.424959216965743, "grad_norm": 0.32801109552383423, "learning_rate": 3.6849257290260066e-06, "loss": 0.0199, "num_input_tokens_seen": 111462544, "step": 51645 }, { "epoch": 8.425774877650896, "grad_norm": 0.11163122206926346, "learning_rate": 3.6812070255254043e-06, "loss": 0.0035, "num_input_tokens_seen": 111474800, "step": 51650 }, { "epoch": 8.426590538336052, "grad_norm": 0.1100960224866867, "learning_rate": 3.677490050234331e-06, "loss": 0.0047, "num_input_tokens_seen": 111485776, "step": 51655 }, { "epoch": 8.427406199021208, "grad_norm": 0.057948071509599686, "learning_rate": 3.6737748034541054e-06, "loss": 0.2664, "num_input_tokens_seen": 111497616, "step": 51660 }, { "epoch": 8.428221859706362, "grad_norm": 0.2203397899866104, "learning_rate": 3.670061285485901e-06, "loss": 0.156, "num_input_tokens_seen": 111508848, "step": 51665 }, { "epoch": 8.429037520391518, "grad_norm": 0.12745971977710724, "learning_rate": 3.6663494966307553e-06, "loss": 0.2073, "num_input_tokens_seen": 111520816, "step": 51670 }, { "epoch": 8.429853181076671, "grad_norm": 5.879717826843262, "learning_rate": 3.662639437189566e-06, "loss": 0.1333, "num_input_tokens_seen": 111531984, "step": 51675 }, { "epoch": 8.430668841761827, "grad_norm": 12.864153861999512, "learning_rate": 3.65893110746309e-06, "loss": 0.1447, "num_input_tokens_seen": 111542800, "step": 51680 }, { "epoch": 8.431484502446983, "grad_norm": 0.07838033139705658, "learning_rate": 3.655224507751934e-06, "loss": 0.0621, "num_input_tokens_seen": 111553040, "step": 51685 }, { "epoch": 8.432300163132137, "grad_norm": 0.08984548598527908, "learning_rate": 3.6515196383565873e-06, "loss": 0.0036, "num_input_tokens_seen": 111563120, "step": 51690 }, { "epoch": 8.433115823817293, "grad_norm": 0.0729716420173645, "learning_rate": 3.6478164995773807e-06, "loss": 0.0044, "num_input_tokens_seen": 111573776, "step": 51695 }, { "epoch": 8.433931484502446, "grad_norm": 0.05781647562980652, "learning_rate": 3.644115091714509e-06, "loss": 0.2378, "num_input_tokens_seen": 111584080, "step": 51700 }, { "epoch": 8.434747145187602, "grad_norm": 7.219149589538574, "learning_rate": 3.640415415068027e-06, "loss": 0.0125, "num_input_tokens_seen": 111594480, "step": 51705 }, { "epoch": 8.435562805872756, "grad_norm": 5.097474098205566, "learning_rate": 3.6367174699378476e-06, "loss": 0.1759, "num_input_tokens_seen": 111604656, "step": 51710 }, { "epoch": 8.436378466557912, "grad_norm": 0.09925854206085205, "learning_rate": 3.6330212566237477e-06, "loss": 0.0974, "num_input_tokens_seen": 111616304, "step": 51715 }, { "epoch": 8.437194127243067, "grad_norm": 48.08366394042969, "learning_rate": 3.6293267754253566e-06, "loss": 0.0383, "num_input_tokens_seen": 111627920, "step": 51720 }, { "epoch": 8.438009787928221, "grad_norm": 0.07857240736484528, "learning_rate": 3.6256340266421747e-06, "loss": 0.0059, "num_input_tokens_seen": 111638864, "step": 51725 }, { "epoch": 8.438825448613377, "grad_norm": 0.15507225692272186, "learning_rate": 3.6219430105735476e-06, "loss": 0.0081, "num_input_tokens_seen": 111649296, "step": 51730 }, { "epoch": 8.439641109298531, "grad_norm": 0.4046974182128906, "learning_rate": 3.6182537275186947e-06, "loss": 0.0351, "num_input_tokens_seen": 111659152, "step": 51735 }, { "epoch": 8.440456769983687, "grad_norm": 0.6124518513679504, "learning_rate": 3.614566177776682e-06, "loss": 0.0066, "num_input_tokens_seen": 111669744, "step": 51740 }, { "epoch": 8.441272430668842, "grad_norm": 0.16115950047969818, "learning_rate": 3.6108803616464376e-06, "loss": 0.0998, "num_input_tokens_seen": 111680304, "step": 51745 }, { "epoch": 8.442088091353996, "grad_norm": 5.681840896606445, "learning_rate": 3.6071962794267667e-06, "loss": 0.2223, "num_input_tokens_seen": 111692400, "step": 51750 }, { "epoch": 8.442903752039152, "grad_norm": 0.052736107259988785, "learning_rate": 3.603513931416311e-06, "loss": 0.0227, "num_input_tokens_seen": 111702160, "step": 51755 }, { "epoch": 8.443719412724306, "grad_norm": 0.05737826228141785, "learning_rate": 3.5998333179135783e-06, "loss": 0.107, "num_input_tokens_seen": 111713712, "step": 51760 }, { "epoch": 8.444535073409462, "grad_norm": 0.02551724575459957, "learning_rate": 3.596154439216942e-06, "loss": 0.0286, "num_input_tokens_seen": 111724112, "step": 51765 }, { "epoch": 8.445350734094617, "grad_norm": 0.07931575924158096, "learning_rate": 3.5924772956246273e-06, "loss": 0.0071, "num_input_tokens_seen": 111735504, "step": 51770 }, { "epoch": 8.446166394779771, "grad_norm": 0.04812389984726906, "learning_rate": 3.5888018874347257e-06, "loss": 0.0041, "num_input_tokens_seen": 111744912, "step": 51775 }, { "epoch": 8.446982055464927, "grad_norm": 0.2566978931427002, "learning_rate": 3.5851282149451798e-06, "loss": 0.0049, "num_input_tokens_seen": 111756080, "step": 51780 }, { "epoch": 8.447797716150081, "grad_norm": 0.056581467390060425, "learning_rate": 3.5814562784538012e-06, "loss": 0.0023, "num_input_tokens_seen": 111766928, "step": 51785 }, { "epoch": 8.448613376835237, "grad_norm": 0.06746412813663483, "learning_rate": 3.5777860782582523e-06, "loss": 0.2365, "num_input_tokens_seen": 111775760, "step": 51790 }, { "epoch": 8.449429037520392, "grad_norm": 0.0598878487944603, "learning_rate": 3.5741176146560558e-06, "loss": 0.099, "num_input_tokens_seen": 111787184, "step": 51795 }, { "epoch": 8.450244698205546, "grad_norm": 0.08283038437366486, "learning_rate": 3.570450887944601e-06, "loss": 0.1214, "num_input_tokens_seen": 111798064, "step": 51800 }, { "epoch": 8.451060358890702, "grad_norm": 0.4032798409461975, "learning_rate": 3.5667858984211323e-06, "loss": 0.1477, "num_input_tokens_seen": 111808016, "step": 51805 }, { "epoch": 8.451876019575856, "grad_norm": 0.17475727200508118, "learning_rate": 3.5631226463827492e-06, "loss": 0.0038, "num_input_tokens_seen": 111819568, "step": 51810 }, { "epoch": 8.452691680261012, "grad_norm": 0.6154772639274597, "learning_rate": 3.5594611321264125e-06, "loss": 0.0032, "num_input_tokens_seen": 111829840, "step": 51815 }, { "epoch": 8.453507340946166, "grad_norm": 0.05333033576607704, "learning_rate": 3.5558013559489457e-06, "loss": 0.2209, "num_input_tokens_seen": 111841136, "step": 51820 }, { "epoch": 8.454323001631321, "grad_norm": 0.05515044555068016, "learning_rate": 3.5521433181470306e-06, "loss": 0.1007, "num_input_tokens_seen": 111852208, "step": 51825 }, { "epoch": 8.455138662316477, "grad_norm": 0.03507401421666145, "learning_rate": 3.5484870190171994e-06, "loss": 0.0048, "num_input_tokens_seen": 111862608, "step": 51830 }, { "epoch": 8.455954323001631, "grad_norm": 0.025188079103827477, "learning_rate": 3.5448324588558566e-06, "loss": 0.096, "num_input_tokens_seen": 111873424, "step": 51835 }, { "epoch": 8.456769983686787, "grad_norm": 4.245416641235352, "learning_rate": 3.541179637959255e-06, "loss": 0.2786, "num_input_tokens_seen": 111884016, "step": 51840 }, { "epoch": 8.45758564437194, "grad_norm": 0.22885818779468536, "learning_rate": 3.537528556623515e-06, "loss": 0.0041, "num_input_tokens_seen": 111895440, "step": 51845 }, { "epoch": 8.458401305057096, "grad_norm": 0.07951080054044724, "learning_rate": 3.5338792151446087e-06, "loss": 0.0023, "num_input_tokens_seen": 111907120, "step": 51850 }, { "epoch": 8.459216965742252, "grad_norm": 0.08638417720794678, "learning_rate": 3.530231613818372e-06, "loss": 0.1129, "num_input_tokens_seen": 111916464, "step": 51855 }, { "epoch": 8.460032626427406, "grad_norm": 0.118630051612854, "learning_rate": 3.526585752940495e-06, "loss": 0.0043, "num_input_tokens_seen": 111927376, "step": 51860 }, { "epoch": 8.460848287112562, "grad_norm": 4.591496467590332, "learning_rate": 3.522941632806534e-06, "loss": 0.1518, "num_input_tokens_seen": 111937392, "step": 51865 }, { "epoch": 8.461663947797716, "grad_norm": 0.19774377346038818, "learning_rate": 3.519299253711897e-06, "loss": 0.0566, "num_input_tokens_seen": 111948848, "step": 51870 }, { "epoch": 8.462479608482871, "grad_norm": 0.04803958162665367, "learning_rate": 3.515658615951856e-06, "loss": 0.0023, "num_input_tokens_seen": 111959664, "step": 51875 }, { "epoch": 8.463295269168025, "grad_norm": 0.06117786839604378, "learning_rate": 3.5120197198215356e-06, "loss": 0.1215, "num_input_tokens_seen": 111969712, "step": 51880 }, { "epoch": 8.464110929853181, "grad_norm": 0.2503572404384613, "learning_rate": 3.508382565615928e-06, "loss": 0.0076, "num_input_tokens_seen": 111981168, "step": 51885 }, { "epoch": 8.464926590538337, "grad_norm": 26.263456344604492, "learning_rate": 3.5047471536298697e-06, "loss": 0.0785, "num_input_tokens_seen": 111991952, "step": 51890 }, { "epoch": 8.46574225122349, "grad_norm": 0.07542380690574646, "learning_rate": 3.5011134841580805e-06, "loss": 0.0061, "num_input_tokens_seen": 112003024, "step": 51895 }, { "epoch": 8.466557911908646, "grad_norm": 3.779695749282837, "learning_rate": 3.4974815574951135e-06, "loss": 0.2171, "num_input_tokens_seen": 112013360, "step": 51900 }, { "epoch": 8.4673735725938, "grad_norm": 2.3623011112213135, "learning_rate": 3.4938513739353973e-06, "loss": 0.1671, "num_input_tokens_seen": 112023664, "step": 51905 }, { "epoch": 8.468189233278956, "grad_norm": 0.3942054808139801, "learning_rate": 3.4902229337732074e-06, "loss": 0.0985, "num_input_tokens_seen": 112034928, "step": 51910 }, { "epoch": 8.469004893964112, "grad_norm": 13.913886070251465, "learning_rate": 3.4865962373026805e-06, "loss": 0.2255, "num_input_tokens_seen": 112046128, "step": 51915 }, { "epoch": 8.469820554649266, "grad_norm": 0.012509414926171303, "learning_rate": 3.4829712848178293e-06, "loss": 0.0021, "num_input_tokens_seen": 112056944, "step": 51920 }, { "epoch": 8.470636215334421, "grad_norm": 0.037642624229192734, "learning_rate": 3.4793480766124986e-06, "loss": 0.3158, "num_input_tokens_seen": 112067472, "step": 51925 }, { "epoch": 8.471451876019575, "grad_norm": 0.05478547513484955, "learning_rate": 3.4757266129804093e-06, "loss": 0.0028, "num_input_tokens_seen": 112076400, "step": 51930 }, { "epoch": 8.47226753670473, "grad_norm": 2.140247106552124, "learning_rate": 3.4721068942151324e-06, "loss": 0.1839, "num_input_tokens_seen": 112087312, "step": 51935 }, { "epoch": 8.473083197389887, "grad_norm": 7.122751712799072, "learning_rate": 3.4684889206101025e-06, "loss": 0.1102, "num_input_tokens_seen": 112098544, "step": 51940 }, { "epoch": 8.47389885807504, "grad_norm": 0.10864073783159256, "learning_rate": 3.464872692458612e-06, "loss": 0.0878, "num_input_tokens_seen": 112108304, "step": 51945 }, { "epoch": 8.474714518760196, "grad_norm": 0.05819137021899223, "learning_rate": 3.4612582100538082e-06, "loss": 0.0015, "num_input_tokens_seen": 112119600, "step": 51950 }, { "epoch": 8.47553017944535, "grad_norm": 0.07787571847438812, "learning_rate": 3.4576454736887003e-06, "loss": 0.0031, "num_input_tokens_seen": 112130192, "step": 51955 }, { "epoch": 8.476345840130506, "grad_norm": 0.10811415314674377, "learning_rate": 3.4540344836561546e-06, "loss": 0.1739, "num_input_tokens_seen": 112142288, "step": 51960 }, { "epoch": 8.477161500815662, "grad_norm": 0.08363020420074463, "learning_rate": 3.4504252402488974e-06, "loss": 0.043, "num_input_tokens_seen": 112154032, "step": 51965 }, { "epoch": 8.477977161500815, "grad_norm": 0.09594326466321945, "learning_rate": 3.446817743759512e-06, "loss": 0.0021, "num_input_tokens_seen": 112165520, "step": 51970 }, { "epoch": 8.478792822185971, "grad_norm": 4.208395481109619, "learning_rate": 3.443211994480439e-06, "loss": 0.1565, "num_input_tokens_seen": 112176560, "step": 51975 }, { "epoch": 8.479608482871125, "grad_norm": 0.13062646985054016, "learning_rate": 3.4396079927039804e-06, "loss": 0.0023, "num_input_tokens_seen": 112187248, "step": 51980 }, { "epoch": 8.48042414355628, "grad_norm": 0.08347542583942413, "learning_rate": 3.436005738722292e-06, "loss": 0.0792, "num_input_tokens_seen": 112198128, "step": 51985 }, { "epoch": 8.481239804241435, "grad_norm": 3.741190195083618, "learning_rate": 3.432405232827396e-06, "loss": 0.1506, "num_input_tokens_seen": 112209680, "step": 51990 }, { "epoch": 8.48205546492659, "grad_norm": 0.0963752344250679, "learning_rate": 3.428806475311164e-06, "loss": 0.0102, "num_input_tokens_seen": 112220688, "step": 51995 }, { "epoch": 8.482871125611746, "grad_norm": 26.20519256591797, "learning_rate": 3.4252094664653316e-06, "loss": 0.1882, "num_input_tokens_seen": 112231632, "step": 52000 }, { "epoch": 8.4836867862969, "grad_norm": 0.08957038819789886, "learning_rate": 3.4216142065814806e-06, "loss": 0.228, "num_input_tokens_seen": 112240592, "step": 52005 }, { "epoch": 8.484502446982056, "grad_norm": 0.0813271775841713, "learning_rate": 3.418020695951077e-06, "loss": 0.0098, "num_input_tokens_seen": 112251440, "step": 52010 }, { "epoch": 8.48531810766721, "grad_norm": 1.235237717628479, "learning_rate": 3.414428934865421e-06, "loss": 0.1309, "num_input_tokens_seen": 112262192, "step": 52015 }, { "epoch": 8.486133768352365, "grad_norm": 0.02408054657280445, "learning_rate": 3.4108389236156806e-06, "loss": 0.0021, "num_input_tokens_seen": 112273808, "step": 52020 }, { "epoch": 8.486949429037521, "grad_norm": 27.396228790283203, "learning_rate": 3.4072506624928808e-06, "loss": 0.0425, "num_input_tokens_seen": 112284304, "step": 52025 }, { "epoch": 8.487765089722675, "grad_norm": 4.579169750213623, "learning_rate": 3.4036641517878997e-06, "loss": 0.2476, "num_input_tokens_seen": 112295152, "step": 52030 }, { "epoch": 8.48858075040783, "grad_norm": 0.04967943951487541, "learning_rate": 3.400079391791483e-06, "loss": 0.11, "num_input_tokens_seen": 112305648, "step": 52035 }, { "epoch": 8.489396411092985, "grad_norm": 0.20065897703170776, "learning_rate": 3.3964963827942257e-06, "loss": 0.0088, "num_input_tokens_seen": 112316144, "step": 52040 }, { "epoch": 8.49021207177814, "grad_norm": 0.1346825212240219, "learning_rate": 3.3929151250865903e-06, "loss": 0.082, "num_input_tokens_seen": 112328560, "step": 52045 }, { "epoch": 8.491027732463296, "grad_norm": 0.754371166229248, "learning_rate": 3.389335618958886e-06, "loss": 0.1196, "num_input_tokens_seen": 112338640, "step": 52050 }, { "epoch": 8.49184339314845, "grad_norm": 0.17561429738998413, "learning_rate": 3.385757864701286e-06, "loss": 0.0105, "num_input_tokens_seen": 112349232, "step": 52055 }, { "epoch": 8.492659053833606, "grad_norm": 0.09084920585155487, "learning_rate": 3.3821818626038198e-06, "loss": 0.0706, "num_input_tokens_seen": 112360400, "step": 52060 }, { "epoch": 8.49347471451876, "grad_norm": 0.05691864714026451, "learning_rate": 3.378607612956386e-06, "loss": 0.075, "num_input_tokens_seen": 112371408, "step": 52065 }, { "epoch": 8.494290375203915, "grad_norm": 0.056467730551958084, "learning_rate": 3.375035116048722e-06, "loss": 0.0652, "num_input_tokens_seen": 112382544, "step": 52070 }, { "epoch": 8.49510603588907, "grad_norm": 0.7548681497573853, "learning_rate": 3.371464372170438e-06, "loss": 0.085, "num_input_tokens_seen": 112392688, "step": 52075 }, { "epoch": 8.495921696574225, "grad_norm": 0.08632798492908478, "learning_rate": 3.3678953816109916e-06, "loss": 0.1076, "num_input_tokens_seen": 112404240, "step": 52080 }, { "epoch": 8.49673735725938, "grad_norm": 0.25324541330337524, "learning_rate": 3.3643281446597092e-06, "loss": 0.131, "num_input_tokens_seen": 112414768, "step": 52085 }, { "epoch": 8.497553017944535, "grad_norm": 0.031284745782613754, "learning_rate": 3.3607626616057624e-06, "loss": 0.1112, "num_input_tokens_seen": 112425424, "step": 52090 }, { "epoch": 8.49836867862969, "grad_norm": 0.08141515403985977, "learning_rate": 3.3571989327381923e-06, "loss": 0.0529, "num_input_tokens_seen": 112436720, "step": 52095 }, { "epoch": 8.499184339314844, "grad_norm": 0.0655307024717331, "learning_rate": 3.3536369583458905e-06, "loss": 0.2946, "num_input_tokens_seen": 112446896, "step": 52100 }, { "epoch": 8.5, "grad_norm": 0.06876910477876663, "learning_rate": 3.3500767387176114e-06, "loss": 0.0042, "num_input_tokens_seen": 112458064, "step": 52105 }, { "epoch": 8.5, "eval_loss": 0.19772686064243317, "eval_runtime": 568.5108, "eval_samples_per_second": 4.793, "eval_steps_per_second": 1.2, "num_input_tokens_seen": 112458064, "step": 52105 }, { "epoch": 8.500815660685156, "grad_norm": 0.2995106875896454, "learning_rate": 3.3465182741419547e-06, "loss": 0.0078, "num_input_tokens_seen": 112469200, "step": 52110 }, { "epoch": 8.50163132137031, "grad_norm": 6.758239269256592, "learning_rate": 3.3429615649074013e-06, "loss": 0.3036, "num_input_tokens_seen": 112479888, "step": 52115 }, { "epoch": 8.502446982055465, "grad_norm": 0.08517052233219147, "learning_rate": 3.3394066113022706e-06, "loss": 0.4295, "num_input_tokens_seen": 112490896, "step": 52120 }, { "epoch": 8.50326264274062, "grad_norm": 0.1996101438999176, "learning_rate": 3.335853413614745e-06, "loss": 0.1195, "num_input_tokens_seen": 112503152, "step": 52125 }, { "epoch": 8.504078303425775, "grad_norm": 0.113855741918087, "learning_rate": 3.332301972132862e-06, "loss": 0.1288, "num_input_tokens_seen": 112513936, "step": 52130 }, { "epoch": 8.50489396411093, "grad_norm": 0.1581876277923584, "learning_rate": 3.3287522871445263e-06, "loss": 0.0837, "num_input_tokens_seen": 112523536, "step": 52135 }, { "epoch": 8.505709624796085, "grad_norm": 0.044547755271196365, "learning_rate": 3.3252043589374866e-06, "loss": 0.1824, "num_input_tokens_seen": 112534512, "step": 52140 }, { "epoch": 8.50652528548124, "grad_norm": 0.07674051076173782, "learning_rate": 3.3216581877993564e-06, "loss": 0.2144, "num_input_tokens_seen": 112546032, "step": 52145 }, { "epoch": 8.507340946166394, "grad_norm": 4.138210296630859, "learning_rate": 3.3181137740176118e-06, "loss": 0.252, "num_input_tokens_seen": 112556016, "step": 52150 }, { "epoch": 8.50815660685155, "grad_norm": 0.3108614385128021, "learning_rate": 3.3145711178795753e-06, "loss": 0.0322, "num_input_tokens_seen": 112565520, "step": 52155 }, { "epoch": 8.508972267536706, "grad_norm": 0.062382735311985016, "learning_rate": 3.3110302196724368e-06, "loss": 0.0231, "num_input_tokens_seen": 112575824, "step": 52160 }, { "epoch": 8.50978792822186, "grad_norm": 0.420159250497818, "learning_rate": 3.3074910796832363e-06, "loss": 0.0048, "num_input_tokens_seen": 112587056, "step": 52165 }, { "epoch": 8.510603588907015, "grad_norm": 0.11795055866241455, "learning_rate": 3.303953698198875e-06, "loss": 0.0024, "num_input_tokens_seen": 112598192, "step": 52170 }, { "epoch": 8.51141924959217, "grad_norm": 0.1338420957326889, "learning_rate": 3.300418075506112e-06, "loss": 0.0044, "num_input_tokens_seen": 112609584, "step": 52175 }, { "epoch": 8.512234910277325, "grad_norm": 8.539312362670898, "learning_rate": 3.296884211891563e-06, "loss": 0.1122, "num_input_tokens_seen": 112620720, "step": 52180 }, { "epoch": 8.513050570962479, "grad_norm": 0.13929800689220428, "learning_rate": 3.293352107641698e-06, "loss": 0.0057, "num_input_tokens_seen": 112631696, "step": 52185 }, { "epoch": 8.513866231647635, "grad_norm": 0.12888842821121216, "learning_rate": 3.2898217630428523e-06, "loss": 0.1069, "num_input_tokens_seen": 112641872, "step": 52190 }, { "epoch": 8.51468189233279, "grad_norm": 0.21990133821964264, "learning_rate": 3.2862931783812083e-06, "loss": 0.0049, "num_input_tokens_seen": 112652240, "step": 52195 }, { "epoch": 8.515497553017944, "grad_norm": 0.10042712092399597, "learning_rate": 3.282766353942815e-06, "loss": 0.0082, "num_input_tokens_seen": 112663824, "step": 52200 }, { "epoch": 8.5163132137031, "grad_norm": 1.5217121839523315, "learning_rate": 3.279241290013568e-06, "loss": 0.0073, "num_input_tokens_seen": 112675216, "step": 52205 }, { "epoch": 8.517128874388254, "grad_norm": 0.08880387246608734, "learning_rate": 3.275717986879237e-06, "loss": 0.0031, "num_input_tokens_seen": 112684464, "step": 52210 }, { "epoch": 8.51794453507341, "grad_norm": 0.09611815214157104, "learning_rate": 3.2721964448254345e-06, "loss": 0.0057, "num_input_tokens_seen": 112695120, "step": 52215 }, { "epoch": 8.518760195758565, "grad_norm": 0.02544640563428402, "learning_rate": 3.268676664137635e-06, "loss": 0.003, "num_input_tokens_seen": 112705776, "step": 52220 }, { "epoch": 8.51957585644372, "grad_norm": 0.04137982428073883, "learning_rate": 3.2651586451011657e-06, "loss": 0.2269, "num_input_tokens_seen": 112716368, "step": 52225 }, { "epoch": 8.520391517128875, "grad_norm": 0.1521427482366562, "learning_rate": 3.2616423880012153e-06, "loss": 0.0041, "num_input_tokens_seen": 112727440, "step": 52230 }, { "epoch": 8.521207177814029, "grad_norm": 3.602827548980713, "learning_rate": 3.2581278931228363e-06, "loss": 0.2515, "num_input_tokens_seen": 112736336, "step": 52235 }, { "epoch": 8.522022838499185, "grad_norm": 0.1897648721933365, "learning_rate": 3.254615160750926e-06, "loss": 0.1008, "num_input_tokens_seen": 112747312, "step": 52240 }, { "epoch": 8.522838499184338, "grad_norm": 11.034748077392578, "learning_rate": 3.2511041911702483e-06, "loss": 0.2812, "num_input_tokens_seen": 112758160, "step": 52245 }, { "epoch": 8.523654159869494, "grad_norm": 0.34525004029273987, "learning_rate": 3.247594984665417e-06, "loss": 0.0077, "num_input_tokens_seen": 112768080, "step": 52250 }, { "epoch": 8.52446982055465, "grad_norm": 0.06467463821172714, "learning_rate": 3.244087541520907e-06, "loss": 0.0746, "num_input_tokens_seen": 112778416, "step": 52255 }, { "epoch": 8.525285481239804, "grad_norm": 3.4724481105804443, "learning_rate": 3.24058186202105e-06, "loss": 0.1234, "num_input_tokens_seen": 112788944, "step": 52260 }, { "epoch": 8.52610114192496, "grad_norm": 0.06703388690948486, "learning_rate": 3.2370779464500317e-06, "loss": 0.0018, "num_input_tokens_seen": 112799664, "step": 52265 }, { "epoch": 8.526916802610113, "grad_norm": 0.11705297976732254, "learning_rate": 3.2335757950919003e-06, "loss": 0.2958, "num_input_tokens_seen": 112809840, "step": 52270 }, { "epoch": 8.52773246329527, "grad_norm": 0.18924038112163544, "learning_rate": 3.230075408230557e-06, "loss": 0.096, "num_input_tokens_seen": 112820240, "step": 52275 }, { "epoch": 8.528548123980425, "grad_norm": 0.06967607140541077, "learning_rate": 3.2265767861497597e-06, "loss": 0.0033, "num_input_tokens_seen": 112830704, "step": 52280 }, { "epoch": 8.529363784665579, "grad_norm": 2.9614319801330566, "learning_rate": 3.2230799291331244e-06, "loss": 0.074, "num_input_tokens_seen": 112842352, "step": 52285 }, { "epoch": 8.530179445350734, "grad_norm": 0.10939770191907883, "learning_rate": 3.219584837464126e-06, "loss": 0.1415, "num_input_tokens_seen": 112853904, "step": 52290 }, { "epoch": 8.530995106035888, "grad_norm": 0.18360036611557007, "learning_rate": 3.2160915114260947e-06, "loss": 0.0145, "num_input_tokens_seen": 112863504, "step": 52295 }, { "epoch": 8.531810766721044, "grad_norm": 0.12482015788555145, "learning_rate": 3.212599951302214e-06, "loss": 0.0071, "num_input_tokens_seen": 112873808, "step": 52300 }, { "epoch": 8.5326264274062, "grad_norm": 3.226783037185669, "learning_rate": 3.2091101573755306e-06, "loss": 0.1789, "num_input_tokens_seen": 112883696, "step": 52305 }, { "epoch": 8.533442088091354, "grad_norm": 0.1161847934126854, "learning_rate": 3.2056221299289423e-06, "loss": 0.0057, "num_input_tokens_seen": 112893424, "step": 52310 }, { "epoch": 8.53425774877651, "grad_norm": 0.06892212480306625, "learning_rate": 3.20213586924521e-06, "loss": 0.0437, "num_input_tokens_seen": 112904016, "step": 52315 }, { "epoch": 8.535073409461663, "grad_norm": 1.1102094650268555, "learning_rate": 3.1986513756069426e-06, "loss": 0.0104, "num_input_tokens_seen": 112915376, "step": 52320 }, { "epoch": 8.535889070146819, "grad_norm": 0.03022949770092964, "learning_rate": 3.1951686492966094e-06, "loss": 0.1211, "num_input_tokens_seen": 112925424, "step": 52325 }, { "epoch": 8.536704730831975, "grad_norm": 0.09329638630151749, "learning_rate": 3.1916876905965483e-06, "loss": 0.1195, "num_input_tokens_seen": 112936656, "step": 52330 }, { "epoch": 8.537520391517129, "grad_norm": 0.07435731589794159, "learning_rate": 3.188208499788936e-06, "loss": 0.0042, "num_input_tokens_seen": 112947600, "step": 52335 }, { "epoch": 8.538336052202284, "grad_norm": 0.038756757974624634, "learning_rate": 3.184731077155817e-06, "loss": 0.005, "num_input_tokens_seen": 112957840, "step": 52340 }, { "epoch": 8.539151712887438, "grad_norm": 0.11953333765268326, "learning_rate": 3.1812554229790848e-06, "loss": 0.0206, "num_input_tokens_seen": 112969488, "step": 52345 }, { "epoch": 8.539967373572594, "grad_norm": 0.11215139180421829, "learning_rate": 3.1777815375404944e-06, "loss": 0.0045, "num_input_tokens_seen": 112979216, "step": 52350 }, { "epoch": 8.540783034257748, "grad_norm": 13.117161750793457, "learning_rate": 3.17430942112166e-06, "loss": 0.0316, "num_input_tokens_seen": 112990736, "step": 52355 }, { "epoch": 8.541598694942904, "grad_norm": 6.0222249031066895, "learning_rate": 3.170839074004045e-06, "loss": 0.2536, "num_input_tokens_seen": 113001296, "step": 52360 }, { "epoch": 8.54241435562806, "grad_norm": 0.06390171498060226, "learning_rate": 3.1673704964689743e-06, "loss": 0.0097, "num_input_tokens_seen": 113011312, "step": 52365 }, { "epoch": 8.543230016313213, "grad_norm": 1.7926284074783325, "learning_rate": 3.1639036887976286e-06, "loss": 0.0108, "num_input_tokens_seen": 113020688, "step": 52370 }, { "epoch": 8.544045676998369, "grad_norm": 0.10173257440328598, "learning_rate": 3.1604386512710387e-06, "loss": 0.0845, "num_input_tokens_seen": 113031888, "step": 52375 }, { "epoch": 8.544861337683523, "grad_norm": 0.18404307961463928, "learning_rate": 3.1569753841701106e-06, "loss": 0.0089, "num_input_tokens_seen": 113042800, "step": 52380 }, { "epoch": 8.545676998368679, "grad_norm": 0.0896589383482933, "learning_rate": 3.1535138877755887e-06, "loss": 0.0043, "num_input_tokens_seen": 113052176, "step": 52385 }, { "epoch": 8.546492659053834, "grad_norm": 0.023683076724410057, "learning_rate": 3.1500541623680795e-06, "loss": 0.1377, "num_input_tokens_seen": 113062608, "step": 52390 }, { "epoch": 8.547308319738988, "grad_norm": 0.09791173785924911, "learning_rate": 3.1465962082280474e-06, "loss": 0.0041, "num_input_tokens_seen": 113075248, "step": 52395 }, { "epoch": 8.548123980424144, "grad_norm": 0.06262429058551788, "learning_rate": 3.1431400256358073e-06, "loss": 0.0331, "num_input_tokens_seen": 113086256, "step": 52400 }, { "epoch": 8.548939641109298, "grad_norm": 0.1712462455034256, "learning_rate": 3.1396856148715375e-06, "loss": 0.0044, "num_input_tokens_seen": 113096752, "step": 52405 }, { "epoch": 8.549755301794454, "grad_norm": 0.021678127348423004, "learning_rate": 3.13623297621527e-06, "loss": 0.0041, "num_input_tokens_seen": 113106480, "step": 52410 }, { "epoch": 8.550570962479608, "grad_norm": 0.05082330107688904, "learning_rate": 3.1327821099468915e-06, "loss": 0.1551, "num_input_tokens_seen": 113117424, "step": 52415 }, { "epoch": 8.551386623164763, "grad_norm": 4.563812732696533, "learning_rate": 3.1293330163461503e-06, "loss": 0.1964, "num_input_tokens_seen": 113127728, "step": 52420 }, { "epoch": 8.552202283849919, "grad_norm": 4.3056321144104, "learning_rate": 3.125885695692646e-06, "loss": 0.1261, "num_input_tokens_seen": 113138000, "step": 52425 }, { "epoch": 8.553017944535073, "grad_norm": 0.13502757251262665, "learning_rate": 3.122440148265829e-06, "loss": 0.2172, "num_input_tokens_seen": 113148752, "step": 52430 }, { "epoch": 8.553833605220229, "grad_norm": 0.0628582164645195, "learning_rate": 3.1189963743450235e-06, "loss": 0.0363, "num_input_tokens_seen": 113159600, "step": 52435 }, { "epoch": 8.554649265905383, "grad_norm": 0.2686457931995392, "learning_rate": 3.115554374209395e-06, "loss": 0.0767, "num_input_tokens_seen": 113170416, "step": 52440 }, { "epoch": 8.555464926590538, "grad_norm": 0.1591261923313141, "learning_rate": 3.1121141481379735e-06, "loss": 0.0202, "num_input_tokens_seen": 113179984, "step": 52445 }, { "epoch": 8.556280587275694, "grad_norm": 0.36520224809646606, "learning_rate": 3.1086756964096327e-06, "loss": 0.0954, "num_input_tokens_seen": 113189680, "step": 52450 }, { "epoch": 8.557096247960848, "grad_norm": 0.2917357087135315, "learning_rate": 3.105239019303116e-06, "loss": 0.0105, "num_input_tokens_seen": 113200112, "step": 52455 }, { "epoch": 8.557911908646004, "grad_norm": 0.06772247701883316, "learning_rate": 3.101804117097018e-06, "loss": 0.0078, "num_input_tokens_seen": 113210416, "step": 52460 }, { "epoch": 8.558727569331158, "grad_norm": 0.03786826878786087, "learning_rate": 3.0983709900697903e-06, "loss": 0.0021, "num_input_tokens_seen": 113221392, "step": 52465 }, { "epoch": 8.559543230016313, "grad_norm": 0.09939499199390411, "learning_rate": 3.0949396384997357e-06, "loss": 0.3358, "num_input_tokens_seen": 113232528, "step": 52470 }, { "epoch": 8.560358890701469, "grad_norm": 0.04357518255710602, "learning_rate": 3.0915100626650206e-06, "loss": 0.1614, "num_input_tokens_seen": 113243056, "step": 52475 }, { "epoch": 8.561174551386623, "grad_norm": 37.47199249267578, "learning_rate": 3.0880822628436613e-06, "loss": 0.2143, "num_input_tokens_seen": 113254576, "step": 52480 }, { "epoch": 8.561990212071779, "grad_norm": 0.06570349633693695, "learning_rate": 3.0846562393135352e-06, "loss": 0.0669, "num_input_tokens_seen": 113263472, "step": 52485 }, { "epoch": 8.562805872756933, "grad_norm": 0.3499777019023895, "learning_rate": 3.0812319923523706e-06, "loss": 0.0048, "num_input_tokens_seen": 113276112, "step": 52490 }, { "epoch": 8.563621533442088, "grad_norm": 0.045446496456861496, "learning_rate": 3.077809522237754e-06, "loss": 0.0048, "num_input_tokens_seen": 113285520, "step": 52495 }, { "epoch": 8.564437194127244, "grad_norm": 0.18569421768188477, "learning_rate": 3.0743888292471322e-06, "loss": 0.008, "num_input_tokens_seen": 113296304, "step": 52500 }, { "epoch": 8.565252854812398, "grad_norm": 0.02989482879638672, "learning_rate": 3.0709699136578006e-06, "loss": 0.0936, "num_input_tokens_seen": 113307632, "step": 52505 }, { "epoch": 8.566068515497554, "grad_norm": 0.16343659162521362, "learning_rate": 3.0675527757469124e-06, "loss": 0.0038, "num_input_tokens_seen": 113318352, "step": 52510 }, { "epoch": 8.566884176182707, "grad_norm": 0.049167994409799576, "learning_rate": 3.064137415791485e-06, "loss": 0.2914, "num_input_tokens_seen": 113329552, "step": 52515 }, { "epoch": 8.567699836867863, "grad_norm": 0.2270321398973465, "learning_rate": 3.0607238340683713e-06, "loss": 0.0071, "num_input_tokens_seen": 113339216, "step": 52520 }, { "epoch": 8.568515497553017, "grad_norm": 1.0442709922790527, "learning_rate": 3.057312030854306e-06, "loss": 0.1237, "num_input_tokens_seen": 113349904, "step": 52525 }, { "epoch": 8.569331158238173, "grad_norm": 6.7521162033081055, "learning_rate": 3.0539020064258682e-06, "loss": 0.0847, "num_input_tokens_seen": 113358832, "step": 52530 }, { "epoch": 8.570146818923329, "grad_norm": 0.033351555466651917, "learning_rate": 3.0504937610594837e-06, "loss": 0.1551, "num_input_tokens_seen": 113370480, "step": 52535 }, { "epoch": 8.570962479608482, "grad_norm": 0.11367560178041458, "learning_rate": 3.0470872950314476e-06, "loss": 0.0043, "num_input_tokens_seen": 113380592, "step": 52540 }, { "epoch": 8.571778140293638, "grad_norm": 4.26261568069458, "learning_rate": 3.043682608617898e-06, "loss": 0.1709, "num_input_tokens_seen": 113392112, "step": 52545 }, { "epoch": 8.572593800978792, "grad_norm": 0.1217903420329094, "learning_rate": 3.0402797020948446e-06, "loss": 0.0707, "num_input_tokens_seen": 113403152, "step": 52550 }, { "epoch": 8.573409461663948, "grad_norm": 0.13792067766189575, "learning_rate": 3.0368785757381418e-06, "loss": 0.0036, "num_input_tokens_seen": 113413424, "step": 52555 }, { "epoch": 8.574225122349104, "grad_norm": 0.20448121428489685, "learning_rate": 3.033479229823502e-06, "loss": 0.1303, "num_input_tokens_seen": 113424848, "step": 52560 }, { "epoch": 8.575040783034257, "grad_norm": 3.6465656757354736, "learning_rate": 3.030081664626494e-06, "loss": 0.3299, "num_input_tokens_seen": 113434608, "step": 52565 }, { "epoch": 8.575856443719413, "grad_norm": 0.014260239899158478, "learning_rate": 3.0266858804225388e-06, "loss": 0.1935, "num_input_tokens_seen": 113446224, "step": 52570 }, { "epoch": 8.576672104404567, "grad_norm": 0.09066756814718246, "learning_rate": 3.0232918774869194e-06, "loss": 0.2235, "num_input_tokens_seen": 113457296, "step": 52575 }, { "epoch": 8.577487765089723, "grad_norm": 0.020743852481245995, "learning_rate": 3.0198996560947657e-06, "loss": 0.0054, "num_input_tokens_seen": 113468720, "step": 52580 }, { "epoch": 8.578303425774878, "grad_norm": 5.239649295806885, "learning_rate": 3.016509216521074e-06, "loss": 0.1828, "num_input_tokens_seen": 113479152, "step": 52585 }, { "epoch": 8.579119086460032, "grad_norm": 3.68194317817688, "learning_rate": 3.0131205590406886e-06, "loss": 0.1205, "num_input_tokens_seen": 113490608, "step": 52590 }, { "epoch": 8.579934747145188, "grad_norm": 0.0550539493560791, "learning_rate": 3.0097336839283118e-06, "loss": 0.0051, "num_input_tokens_seen": 113501968, "step": 52595 }, { "epoch": 8.580750407830342, "grad_norm": 0.18661192059516907, "learning_rate": 3.0063485914584995e-06, "loss": 0.005, "num_input_tokens_seen": 113512944, "step": 52600 }, { "epoch": 8.581566068515498, "grad_norm": 0.050649698823690414, "learning_rate": 3.0029652819056646e-06, "loss": 0.0035, "num_input_tokens_seen": 113524304, "step": 52605 }, { "epoch": 8.582381729200652, "grad_norm": 5.219583988189697, "learning_rate": 2.9995837555440748e-06, "loss": 0.1349, "num_input_tokens_seen": 113536400, "step": 52610 }, { "epoch": 8.583197389885807, "grad_norm": 3.2164628505706787, "learning_rate": 2.9962040126478548e-06, "loss": 0.2181, "num_input_tokens_seen": 113546992, "step": 52615 }, { "epoch": 8.584013050570963, "grad_norm": 0.12123799324035645, "learning_rate": 2.992826053490985e-06, "loss": 0.0788, "num_input_tokens_seen": 113557808, "step": 52620 }, { "epoch": 8.584828711256117, "grad_norm": 0.0507858544588089, "learning_rate": 2.9894498783473e-06, "loss": 0.0652, "num_input_tokens_seen": 113568336, "step": 52625 }, { "epoch": 8.585644371941273, "grad_norm": 0.19948065280914307, "learning_rate": 2.986075487490486e-06, "loss": 0.0044, "num_input_tokens_seen": 113579280, "step": 52630 }, { "epoch": 8.586460032626427, "grad_norm": 0.20050232112407684, "learning_rate": 2.982702881194091e-06, "loss": 0.0047, "num_input_tokens_seen": 113591024, "step": 52635 }, { "epoch": 8.587275693311582, "grad_norm": 0.148842453956604, "learning_rate": 2.9793320597315154e-06, "loss": 0.0901, "num_input_tokens_seen": 113602032, "step": 52640 }, { "epoch": 8.588091353996738, "grad_norm": 0.08197193592786789, "learning_rate": 2.975963023376008e-06, "loss": 0.0032, "num_input_tokens_seen": 113612688, "step": 52645 }, { "epoch": 8.588907014681892, "grad_norm": 0.06971848756074905, "learning_rate": 2.9725957724006936e-06, "loss": 0.005, "num_input_tokens_seen": 113622224, "step": 52650 }, { "epoch": 8.589722675367048, "grad_norm": 3.8267710208892822, "learning_rate": 2.9692303070785325e-06, "loss": 0.106, "num_input_tokens_seen": 113631920, "step": 52655 }, { "epoch": 8.590538336052202, "grad_norm": 0.08621606975793839, "learning_rate": 2.9658666276823427e-06, "loss": 0.004, "num_input_tokens_seen": 113642672, "step": 52660 }, { "epoch": 8.591353996737357, "grad_norm": 0.09965116530656815, "learning_rate": 2.9625047344848082e-06, "loss": 0.0872, "num_input_tokens_seen": 113653456, "step": 52665 }, { "epoch": 8.592169657422513, "grad_norm": 0.7939848899841309, "learning_rate": 2.959144627758453e-06, "loss": 0.1169, "num_input_tokens_seen": 113664304, "step": 52670 }, { "epoch": 8.592985318107667, "grad_norm": 3.727461099624634, "learning_rate": 2.955786307775671e-06, "loss": 0.1173, "num_input_tokens_seen": 113675024, "step": 52675 }, { "epoch": 8.593800978792823, "grad_norm": 0.09197498857975006, "learning_rate": 2.9524297748087014e-06, "loss": 0.23, "num_input_tokens_seen": 113685200, "step": 52680 }, { "epoch": 8.594616639477977, "grad_norm": 0.5910281538963318, "learning_rate": 2.949075029129644e-06, "loss": 0.006, "num_input_tokens_seen": 113696528, "step": 52685 }, { "epoch": 8.595432300163132, "grad_norm": 0.039027437567710876, "learning_rate": 2.945722071010443e-06, "loss": 0.0739, "num_input_tokens_seen": 113707568, "step": 52690 }, { "epoch": 8.596247960848288, "grad_norm": 0.16804245114326477, "learning_rate": 2.9423709007229184e-06, "loss": 0.0024, "num_input_tokens_seen": 113719280, "step": 52695 }, { "epoch": 8.597063621533442, "grad_norm": 3.382453203201294, "learning_rate": 2.9390215185387287e-06, "loss": 0.235, "num_input_tokens_seen": 113729072, "step": 52700 }, { "epoch": 8.597879282218598, "grad_norm": 29.68398094177246, "learning_rate": 2.93567392472939e-06, "loss": 0.032, "num_input_tokens_seen": 113740784, "step": 52705 }, { "epoch": 8.598694942903752, "grad_norm": 0.04309482127428055, "learning_rate": 2.932328119566277e-06, "loss": 0.077, "num_input_tokens_seen": 113751088, "step": 52710 }, { "epoch": 8.599510603588907, "grad_norm": 4.468674659729004, "learning_rate": 2.928984103320617e-06, "loss": 0.3409, "num_input_tokens_seen": 113762672, "step": 52715 }, { "epoch": 8.600326264274061, "grad_norm": 0.09823358058929443, "learning_rate": 2.9256418762634936e-06, "loss": 0.0076, "num_input_tokens_seen": 113773072, "step": 52720 }, { "epoch": 8.601141924959217, "grad_norm": 0.04677252098917961, "learning_rate": 2.922301438665842e-06, "loss": 0.0036, "num_input_tokens_seen": 113784080, "step": 52725 }, { "epoch": 8.601957585644373, "grad_norm": 0.4189244210720062, "learning_rate": 2.9189627907984576e-06, "loss": 0.0166, "num_input_tokens_seen": 113794704, "step": 52730 }, { "epoch": 8.602773246329527, "grad_norm": 0.14216023683547974, "learning_rate": 2.9156259329319867e-06, "loss": 0.0046, "num_input_tokens_seen": 113806512, "step": 52735 }, { "epoch": 8.603588907014682, "grad_norm": 0.46621978282928467, "learning_rate": 2.9122908653369335e-06, "loss": 0.1245, "num_input_tokens_seen": 113817872, "step": 52740 }, { "epoch": 8.604404567699836, "grad_norm": 0.5484855771064758, "learning_rate": 2.908957588283656e-06, "loss": 0.0204, "num_input_tokens_seen": 113829808, "step": 52745 }, { "epoch": 8.605220228384992, "grad_norm": 0.037593141198158264, "learning_rate": 2.9056261020423582e-06, "loss": 0.0094, "num_input_tokens_seen": 113840656, "step": 52750 }, { "epoch": 8.606035889070148, "grad_norm": 0.0696600005030632, "learning_rate": 2.9022964068831204e-06, "loss": 0.0044, "num_input_tokens_seen": 113851664, "step": 52755 }, { "epoch": 8.606851549755302, "grad_norm": 0.1309024840593338, "learning_rate": 2.898968503075858e-06, "loss": 0.0082, "num_input_tokens_seen": 113862288, "step": 52760 }, { "epoch": 8.607667210440457, "grad_norm": 0.06248234957456589, "learning_rate": 2.895642390890349e-06, "loss": 0.0045, "num_input_tokens_seen": 113873456, "step": 52765 }, { "epoch": 8.608482871125611, "grad_norm": 0.036878447979688644, "learning_rate": 2.8923180705962226e-06, "loss": 0.2432, "num_input_tokens_seen": 113883536, "step": 52770 }, { "epoch": 8.609298531810767, "grad_norm": 0.103336863219738, "learning_rate": 2.888995542462969e-06, "loss": 0.0037, "num_input_tokens_seen": 113893520, "step": 52775 }, { "epoch": 8.61011419249592, "grad_norm": 0.05401739478111267, "learning_rate": 2.885674806759925e-06, "loss": 0.2536, "num_input_tokens_seen": 113904592, "step": 52780 }, { "epoch": 8.610929853181077, "grad_norm": 0.07610581070184708, "learning_rate": 2.8823558637562893e-06, "loss": 0.0049, "num_input_tokens_seen": 113916272, "step": 52785 }, { "epoch": 8.611745513866232, "grad_norm": 0.04994501173496246, "learning_rate": 2.8790387137211105e-06, "loss": 0.0031, "num_input_tokens_seen": 113928336, "step": 52790 }, { "epoch": 8.612561174551386, "grad_norm": 0.11280269175767899, "learning_rate": 2.8757233569232933e-06, "loss": 0.1173, "num_input_tokens_seen": 113939920, "step": 52795 }, { "epoch": 8.613376835236542, "grad_norm": 0.12876686453819275, "learning_rate": 2.8724097936316004e-06, "loss": 0.08, "num_input_tokens_seen": 113949616, "step": 52800 }, { "epoch": 8.614192495921696, "grad_norm": 0.04100547358393669, "learning_rate": 2.8690980241146415e-06, "loss": 0.0042, "num_input_tokens_seen": 113959408, "step": 52805 }, { "epoch": 8.615008156606851, "grad_norm": 0.05942099168896675, "learning_rate": 2.8657880486408884e-06, "loss": 0.2803, "num_input_tokens_seen": 113970736, "step": 52810 }, { "epoch": 8.615823817292007, "grad_norm": 0.2773951590061188, "learning_rate": 2.862479867478665e-06, "loss": 0.097, "num_input_tokens_seen": 113981008, "step": 52815 }, { "epoch": 8.616639477977161, "grad_norm": 0.12048663944005966, "learning_rate": 2.859173480896149e-06, "loss": 0.0705, "num_input_tokens_seen": 113992816, "step": 52820 }, { "epoch": 8.617455138662317, "grad_norm": 0.11280005425214767, "learning_rate": 2.85586888916137e-06, "loss": 0.005, "num_input_tokens_seen": 114005392, "step": 52825 }, { "epoch": 8.61827079934747, "grad_norm": 0.09502550214529037, "learning_rate": 2.852566092542211e-06, "loss": 0.007, "num_input_tokens_seen": 114015600, "step": 52830 }, { "epoch": 8.619086460032626, "grad_norm": 4.35811710357666, "learning_rate": 2.8492650913064274e-06, "loss": 0.2785, "num_input_tokens_seen": 114026448, "step": 52835 }, { "epoch": 8.619902120717782, "grad_norm": 21.104766845703125, "learning_rate": 2.8459658857216074e-06, "loss": 0.0709, "num_input_tokens_seen": 114037552, "step": 52840 }, { "epoch": 8.620717781402936, "grad_norm": 0.09337308257818222, "learning_rate": 2.8426684760551993e-06, "loss": 0.1148, "num_input_tokens_seen": 114049072, "step": 52845 }, { "epoch": 8.621533442088092, "grad_norm": 0.05295537784695625, "learning_rate": 2.83937286257451e-06, "loss": 0.1148, "num_input_tokens_seen": 114059824, "step": 52850 }, { "epoch": 8.622349102773246, "grad_norm": 0.098955437541008, "learning_rate": 2.8360790455466996e-06, "loss": 0.1785, "num_input_tokens_seen": 114070960, "step": 52855 }, { "epoch": 8.623164763458401, "grad_norm": 0.09193826466798782, "learning_rate": 2.8327870252387727e-06, "loss": 0.0114, "num_input_tokens_seen": 114082224, "step": 52860 }, { "epoch": 8.623980424143557, "grad_norm": 0.0787716954946518, "learning_rate": 2.829496801917611e-06, "loss": 0.1159, "num_input_tokens_seen": 114091760, "step": 52865 }, { "epoch": 8.624796084828711, "grad_norm": 2.387537956237793, "learning_rate": 2.826208375849931e-06, "loss": 0.0061, "num_input_tokens_seen": 114100912, "step": 52870 }, { "epoch": 8.625611745513867, "grad_norm": 0.061954595148563385, "learning_rate": 2.8229217473023094e-06, "loss": 0.1305, "num_input_tokens_seen": 114111856, "step": 52875 }, { "epoch": 8.62642740619902, "grad_norm": 0.1902577131986618, "learning_rate": 2.8196369165411767e-06, "loss": 0.1039, "num_input_tokens_seen": 114121872, "step": 52880 }, { "epoch": 8.627243066884176, "grad_norm": 0.38934525847435, "learning_rate": 2.8163538838328176e-06, "loss": 0.1306, "num_input_tokens_seen": 114132304, "step": 52885 }, { "epoch": 8.62805872756933, "grad_norm": 0.4867192804813385, "learning_rate": 2.8130726494433684e-06, "loss": 0.1491, "num_input_tokens_seen": 114142160, "step": 52890 }, { "epoch": 8.628874388254486, "grad_norm": 0.07474728673696518, "learning_rate": 2.8097932136388285e-06, "loss": 0.1089, "num_input_tokens_seen": 114153392, "step": 52895 }, { "epoch": 8.629690048939642, "grad_norm": 0.16183558106422424, "learning_rate": 2.8065155766850425e-06, "loss": 0.0882, "num_input_tokens_seen": 114165648, "step": 52900 }, { "epoch": 8.630505709624796, "grad_norm": 0.5834521651268005, "learning_rate": 2.8032397388477098e-06, "loss": 0.0047, "num_input_tokens_seen": 114175248, "step": 52905 }, { "epoch": 8.631321370309951, "grad_norm": 3.95804500579834, "learning_rate": 2.799965700392393e-06, "loss": 0.3783, "num_input_tokens_seen": 114186256, "step": 52910 }, { "epoch": 8.632137030995105, "grad_norm": 0.2856743037700653, "learning_rate": 2.7966934615844957e-06, "loss": 0.0043, "num_input_tokens_seen": 114197264, "step": 52915 }, { "epoch": 8.632952691680261, "grad_norm": 0.13744287192821503, "learning_rate": 2.793423022689284e-06, "loss": 0.0036, "num_input_tokens_seen": 114208944, "step": 52920 }, { "epoch": 8.633768352365417, "grad_norm": 0.08259768038988113, "learning_rate": 2.7901543839718795e-06, "loss": 0.0328, "num_input_tokens_seen": 114219504, "step": 52925 }, { "epoch": 8.63458401305057, "grad_norm": 0.20659682154655457, "learning_rate": 2.7868875456972534e-06, "loss": 0.0045, "num_input_tokens_seen": 114229072, "step": 52930 }, { "epoch": 8.635399673735726, "grad_norm": 0.15795865654945374, "learning_rate": 2.783622508130229e-06, "loss": 0.0446, "num_input_tokens_seen": 114238768, "step": 52935 }, { "epoch": 8.63621533442088, "grad_norm": 0.023627901449799538, "learning_rate": 2.7803592715354877e-06, "loss": 0.0017, "num_input_tokens_seen": 114250256, "step": 52940 }, { "epoch": 8.637030995106036, "grad_norm": 5.866702556610107, "learning_rate": 2.7770978361775667e-06, "loss": 0.16, "num_input_tokens_seen": 114259696, "step": 52945 }, { "epoch": 8.63784665579119, "grad_norm": 0.08152702450752258, "learning_rate": 2.7738382023208526e-06, "loss": 0.1679, "num_input_tokens_seen": 114270288, "step": 52950 }, { "epoch": 8.638662316476346, "grad_norm": 0.060370657593011856, "learning_rate": 2.770580370229589e-06, "loss": 0.0064, "num_input_tokens_seen": 114280784, "step": 52955 }, { "epoch": 8.639477977161501, "grad_norm": 8.027362823486328, "learning_rate": 2.7673243401678704e-06, "loss": 0.0089, "num_input_tokens_seen": 114290832, "step": 52960 }, { "epoch": 8.640293637846655, "grad_norm": 0.32207539677619934, "learning_rate": 2.7640701123996445e-06, "loss": 0.0645, "num_input_tokens_seen": 114301648, "step": 52965 }, { "epoch": 8.641109298531811, "grad_norm": 0.05956178158521652, "learning_rate": 2.7608176871887242e-06, "loss": 0.0966, "num_input_tokens_seen": 114311728, "step": 52970 }, { "epoch": 8.641924959216965, "grad_norm": 0.08997718244791031, "learning_rate": 2.7575670647987606e-06, "loss": 0.2073, "num_input_tokens_seen": 114322512, "step": 52975 }, { "epoch": 8.64274061990212, "grad_norm": 0.316913366317749, "learning_rate": 2.7543182454932705e-06, "loss": 0.1184, "num_input_tokens_seen": 114332592, "step": 52980 }, { "epoch": 8.643556280587276, "grad_norm": 3.2485742568969727, "learning_rate": 2.751071229535615e-06, "loss": 0.241, "num_input_tokens_seen": 114343696, "step": 52985 }, { "epoch": 8.64437194127243, "grad_norm": 0.04890444874763489, "learning_rate": 2.7478260171890175e-06, "loss": 0.0979, "num_input_tokens_seen": 114354928, "step": 52990 }, { "epoch": 8.645187601957586, "grad_norm": 0.46548447012901306, "learning_rate": 2.744582608716548e-06, "loss": 0.0733, "num_input_tokens_seen": 114365392, "step": 52995 }, { "epoch": 8.64600326264274, "grad_norm": 0.14548994600772858, "learning_rate": 2.741341004381129e-06, "loss": 0.0863, "num_input_tokens_seen": 114374992, "step": 53000 }, { "epoch": 8.646818923327896, "grad_norm": 0.029076898470520973, "learning_rate": 2.7381012044455535e-06, "loss": 0.0026, "num_input_tokens_seen": 114385808, "step": 53005 }, { "epoch": 8.647634584013051, "grad_norm": 0.27490729093551636, "learning_rate": 2.73486320917245e-06, "loss": 0.0554, "num_input_tokens_seen": 114397072, "step": 53010 }, { "epoch": 8.648450244698205, "grad_norm": 0.08811581134796143, "learning_rate": 2.7316270188243064e-06, "loss": 0.0034, "num_input_tokens_seen": 114408624, "step": 53015 }, { "epoch": 8.649265905383361, "grad_norm": 0.056794021278619766, "learning_rate": 2.728392633663468e-06, "loss": 0.191, "num_input_tokens_seen": 114418512, "step": 53020 }, { "epoch": 8.650081566068515, "grad_norm": 0.9381062984466553, "learning_rate": 2.7251600539521248e-06, "loss": 0.0057, "num_input_tokens_seen": 114427792, "step": 53025 }, { "epoch": 8.65089722675367, "grad_norm": 0.1318144053220749, "learning_rate": 2.7219292799523316e-06, "loss": 0.3686, "num_input_tokens_seen": 114439888, "step": 53030 }, { "epoch": 8.651712887438826, "grad_norm": 0.10454489290714264, "learning_rate": 2.718700311925987e-06, "loss": 0.0994, "num_input_tokens_seen": 114451280, "step": 53035 }, { "epoch": 8.65252854812398, "grad_norm": 0.026726340875029564, "learning_rate": 2.715473150134848e-06, "loss": 0.0681, "num_input_tokens_seen": 114463376, "step": 53040 }, { "epoch": 8.653344208809136, "grad_norm": 0.05666026845574379, "learning_rate": 2.7122477948405277e-06, "loss": 0.0106, "num_input_tokens_seen": 114474896, "step": 53045 }, { "epoch": 8.65415986949429, "grad_norm": 0.07485075294971466, "learning_rate": 2.7090242463044896e-06, "loss": 0.0167, "num_input_tokens_seen": 114486000, "step": 53050 }, { "epoch": 8.654975530179446, "grad_norm": 0.038886334747076035, "learning_rate": 2.7058025047880466e-06, "loss": 0.0082, "num_input_tokens_seen": 114497456, "step": 53055 }, { "epoch": 8.655791190864601, "grad_norm": 0.10611975938081741, "learning_rate": 2.702582570552373e-06, "loss": 0.0044, "num_input_tokens_seen": 114508368, "step": 53060 }, { "epoch": 8.656606851549755, "grad_norm": 0.4545610547065735, "learning_rate": 2.699364443858493e-06, "loss": 0.0965, "num_input_tokens_seen": 114519888, "step": 53065 }, { "epoch": 8.65742251223491, "grad_norm": 0.1703397035598755, "learning_rate": 2.6961481249672765e-06, "loss": 0.1311, "num_input_tokens_seen": 114529712, "step": 53070 }, { "epoch": 8.658238172920065, "grad_norm": 0.07904315739870071, "learning_rate": 2.69293361413947e-06, "loss": 0.0991, "num_input_tokens_seen": 114540528, "step": 53075 }, { "epoch": 8.65905383360522, "grad_norm": 0.15389762818813324, "learning_rate": 2.6897209116356457e-06, "loss": 0.1363, "num_input_tokens_seen": 114552016, "step": 53080 }, { "epoch": 8.659869494290374, "grad_norm": 0.07232028245925903, "learning_rate": 2.6865100177162484e-06, "loss": 0.003, "num_input_tokens_seen": 114562672, "step": 53085 }, { "epoch": 8.66068515497553, "grad_norm": 0.04487145319581032, "learning_rate": 2.6833009326415663e-06, "loss": 0.0042, "num_input_tokens_seen": 114574320, "step": 53090 }, { "epoch": 8.661500815660686, "grad_norm": 0.04890063777565956, "learning_rate": 2.680093656671745e-06, "loss": 0.1374, "num_input_tokens_seen": 114585456, "step": 53095 }, { "epoch": 8.66231647634584, "grad_norm": 0.0658997967839241, "learning_rate": 2.6768881900667787e-06, "loss": 0.1267, "num_input_tokens_seen": 114596400, "step": 53100 }, { "epoch": 8.663132137030995, "grad_norm": 0.5864744782447815, "learning_rate": 2.673684533086526e-06, "loss": 0.0057, "num_input_tokens_seen": 114607120, "step": 53105 }, { "epoch": 8.66394779771615, "grad_norm": 0.7246565222740173, "learning_rate": 2.6704826859906858e-06, "loss": 0.0077, "num_input_tokens_seen": 114617808, "step": 53110 }, { "epoch": 8.664763458401305, "grad_norm": 0.18121881783008575, "learning_rate": 2.667282649038816e-06, "loss": 0.0065, "num_input_tokens_seen": 114627664, "step": 53115 }, { "epoch": 8.66557911908646, "grad_norm": 0.06088462844491005, "learning_rate": 2.6640844224903318e-06, "loss": 0.1073, "num_input_tokens_seen": 114638064, "step": 53120 }, { "epoch": 8.666394779771615, "grad_norm": 0.026404354721307755, "learning_rate": 2.660888006604498e-06, "loss": 0.1699, "num_input_tokens_seen": 114647760, "step": 53125 }, { "epoch": 8.66721044045677, "grad_norm": 7.807608127593994, "learning_rate": 2.6576934016404264e-06, "loss": 0.1214, "num_input_tokens_seen": 114658192, "step": 53130 }, { "epoch": 8.668026101141924, "grad_norm": 0.01555652916431427, "learning_rate": 2.654500607857091e-06, "loss": 0.0022, "num_input_tokens_seen": 114669392, "step": 53135 }, { "epoch": 8.66884176182708, "grad_norm": 0.2186608761548996, "learning_rate": 2.651309625513318e-06, "loss": 0.0641, "num_input_tokens_seen": 114681168, "step": 53140 }, { "epoch": 8.669657422512234, "grad_norm": 0.9420276284217834, "learning_rate": 2.648120454867778e-06, "loss": 0.0112, "num_input_tokens_seen": 114692688, "step": 53145 }, { "epoch": 8.67047308319739, "grad_norm": 1.198050856590271, "learning_rate": 2.6449330961790116e-06, "loss": 0.0428, "num_input_tokens_seen": 114703472, "step": 53150 }, { "epoch": 8.671288743882545, "grad_norm": 0.06262125819921494, "learning_rate": 2.641747549705395e-06, "loss": 0.0048, "num_input_tokens_seen": 114714704, "step": 53155 }, { "epoch": 8.6721044045677, "grad_norm": 0.08199375122785568, "learning_rate": 2.638563815705167e-06, "loss": 0.0028, "num_input_tokens_seen": 114725200, "step": 53160 }, { "epoch": 8.672920065252855, "grad_norm": 0.22733455896377563, "learning_rate": 2.635381894436417e-06, "loss": 0.0051, "num_input_tokens_seen": 114735568, "step": 53165 }, { "epoch": 8.673735725938009, "grad_norm": 0.05899897217750549, "learning_rate": 2.63220178615709e-06, "loss": 0.0038, "num_input_tokens_seen": 114746448, "step": 53170 }, { "epoch": 8.674551386623165, "grad_norm": 0.08533771336078644, "learning_rate": 2.629023491124971e-06, "loss": 0.1314, "num_input_tokens_seen": 114758704, "step": 53175 }, { "epoch": 8.67536704730832, "grad_norm": 0.09958028048276901, "learning_rate": 2.6258470095977262e-06, "loss": 0.1571, "num_input_tokens_seen": 114769744, "step": 53180 }, { "epoch": 8.676182707993474, "grad_norm": 2.5879385471343994, "learning_rate": 2.6226723418328437e-06, "loss": 0.0075, "num_input_tokens_seen": 114780112, "step": 53185 }, { "epoch": 8.67699836867863, "grad_norm": 3.7749695777893066, "learning_rate": 2.6194994880876843e-06, "loss": 0.1072, "num_input_tokens_seen": 114790384, "step": 53190 }, { "epoch": 8.677814029363784, "grad_norm": 0.06751300394535065, "learning_rate": 2.616328448619454e-06, "loss": 0.0034, "num_input_tokens_seen": 114801168, "step": 53195 }, { "epoch": 8.67862969004894, "grad_norm": 3.2267613410949707, "learning_rate": 2.613159223685213e-06, "loss": 0.1381, "num_input_tokens_seen": 114812560, "step": 53200 }, { "epoch": 8.679445350734095, "grad_norm": 0.16413560509681702, "learning_rate": 2.609991813541876e-06, "loss": 0.0047, "num_input_tokens_seen": 114823664, "step": 53205 }, { "epoch": 8.68026101141925, "grad_norm": 34.654502868652344, "learning_rate": 2.6068262184462066e-06, "loss": 0.0904, "num_input_tokens_seen": 114835056, "step": 53210 }, { "epoch": 8.681076672104405, "grad_norm": 0.064839206635952, "learning_rate": 2.6036624386548277e-06, "loss": 0.2715, "num_input_tokens_seen": 114845744, "step": 53215 }, { "epoch": 8.681892332789559, "grad_norm": 0.3497132956981659, "learning_rate": 2.6005004744242082e-06, "loss": 0.0927, "num_input_tokens_seen": 114857072, "step": 53220 }, { "epoch": 8.682707993474715, "grad_norm": 0.24611487984657288, "learning_rate": 2.597340326010675e-06, "loss": 0.0071, "num_input_tokens_seen": 114868304, "step": 53225 }, { "epoch": 8.68352365415987, "grad_norm": 0.45386627316474915, "learning_rate": 2.5941819936704053e-06, "loss": 0.0986, "num_input_tokens_seen": 114877360, "step": 53230 }, { "epoch": 8.684339314845024, "grad_norm": 0.04542342573404312, "learning_rate": 2.5910254776594256e-06, "loss": 0.0318, "num_input_tokens_seen": 114888496, "step": 53235 }, { "epoch": 8.68515497553018, "grad_norm": 0.3080884516239166, "learning_rate": 2.587870778233625e-06, "loss": 0.1251, "num_input_tokens_seen": 114900656, "step": 53240 }, { "epoch": 8.685970636215334, "grad_norm": 0.07289087027311325, "learning_rate": 2.584717895648739e-06, "loss": 0.0035, "num_input_tokens_seen": 114912208, "step": 53245 }, { "epoch": 8.68678629690049, "grad_norm": 5.651989459991455, "learning_rate": 2.5815668301603537e-06, "loss": 0.1794, "num_input_tokens_seen": 114924144, "step": 53250 }, { "epoch": 8.687601957585644, "grad_norm": 10.517882347106934, "learning_rate": 2.5784175820239094e-06, "loss": 0.1098, "num_input_tokens_seen": 114936496, "step": 53255 }, { "epoch": 8.6884176182708, "grad_norm": 0.11167328804731369, "learning_rate": 2.575270151494702e-06, "loss": 0.257, "num_input_tokens_seen": 114946928, "step": 53260 }, { "epoch": 8.689233278955955, "grad_norm": 0.11391818523406982, "learning_rate": 2.5721245388278805e-06, "loss": 0.1043, "num_input_tokens_seen": 114957392, "step": 53265 }, { "epoch": 8.690048939641109, "grad_norm": 0.03107847273349762, "learning_rate": 2.5689807442784404e-06, "loss": 0.0043, "num_input_tokens_seen": 114968816, "step": 53270 }, { "epoch": 8.690864600326265, "grad_norm": 0.26515620946884155, "learning_rate": 2.5658387681012337e-06, "loss": 0.0102, "num_input_tokens_seen": 114979408, "step": 53275 }, { "epoch": 8.691680261011419, "grad_norm": 2.514280080795288, "learning_rate": 2.5626986105509677e-06, "loss": 0.0753, "num_input_tokens_seen": 114989776, "step": 53280 }, { "epoch": 8.692495921696574, "grad_norm": 0.1590150147676468, "learning_rate": 2.5595602718821916e-06, "loss": 0.1038, "num_input_tokens_seen": 115000976, "step": 53285 }, { "epoch": 8.69331158238173, "grad_norm": 0.30125266313552856, "learning_rate": 2.5564237523493295e-06, "loss": 0.1155, "num_input_tokens_seen": 115010608, "step": 53290 }, { "epoch": 8.694127243066884, "grad_norm": 0.1378786861896515, "learning_rate": 2.553289052206634e-06, "loss": 0.0111, "num_input_tokens_seen": 115021904, "step": 53295 }, { "epoch": 8.69494290375204, "grad_norm": 4.505702972412109, "learning_rate": 2.5501561717082204e-06, "loss": 0.0923, "num_input_tokens_seen": 115032240, "step": 53300 }, { "epoch": 8.695758564437194, "grad_norm": 0.0683271661400795, "learning_rate": 2.547025111108056e-06, "loss": 0.0062, "num_input_tokens_seen": 115043024, "step": 53305 }, { "epoch": 8.69657422512235, "grad_norm": 3.7424323558807373, "learning_rate": 2.5438958706599623e-06, "loss": 0.1452, "num_input_tokens_seen": 115053584, "step": 53310 }, { "epoch": 8.697389885807503, "grad_norm": 0.07999517768621445, "learning_rate": 2.540768450617609e-06, "loss": 0.2054, "num_input_tokens_seen": 115064368, "step": 53315 }, { "epoch": 8.698205546492659, "grad_norm": 0.830254077911377, "learning_rate": 2.537642851234523e-06, "loss": 0.0569, "num_input_tokens_seen": 115073712, "step": 53320 }, { "epoch": 8.699021207177815, "grad_norm": 9.407503128051758, "learning_rate": 2.5345190727640828e-06, "loss": 0.0531, "num_input_tokens_seen": 115085616, "step": 53325 }, { "epoch": 8.699836867862969, "grad_norm": 3.5490689277648926, "learning_rate": 2.5313971154595135e-06, "loss": 0.1871, "num_input_tokens_seen": 115096080, "step": 53330 }, { "epoch": 8.700652528548124, "grad_norm": 0.08478696644306183, "learning_rate": 2.5282769795738987e-06, "loss": 0.0043, "num_input_tokens_seen": 115107344, "step": 53335 }, { "epoch": 8.701468189233278, "grad_norm": 62.311641693115234, "learning_rate": 2.5251586653601722e-06, "loss": 0.1402, "num_input_tokens_seen": 115117712, "step": 53340 }, { "epoch": 8.702283849918434, "grad_norm": 3.5588507652282715, "learning_rate": 2.522042173071121e-06, "loss": 0.2631, "num_input_tokens_seen": 115128048, "step": 53345 }, { "epoch": 8.70309951060359, "grad_norm": 0.06941116601228714, "learning_rate": 2.518927502959384e-06, "loss": 0.0048, "num_input_tokens_seen": 115139152, "step": 53350 }, { "epoch": 8.703915171288743, "grad_norm": 6.748194694519043, "learning_rate": 2.5158146552774486e-06, "loss": 0.2439, "num_input_tokens_seen": 115150032, "step": 53355 }, { "epoch": 8.7047308319739, "grad_norm": 0.0572577528655529, "learning_rate": 2.512703630277663e-06, "loss": 0.0038, "num_input_tokens_seen": 115159536, "step": 53360 }, { "epoch": 8.705546492659053, "grad_norm": 0.04039865359663963, "learning_rate": 2.5095944282122226e-06, "loss": 0.1804, "num_input_tokens_seen": 115170576, "step": 53365 }, { "epoch": 8.706362153344209, "grad_norm": 0.018278568983078003, "learning_rate": 2.50648704933317e-06, "loss": 0.1922, "num_input_tokens_seen": 115182160, "step": 53370 }, { "epoch": 8.707177814029365, "grad_norm": 0.0602726973593235, "learning_rate": 2.5033814938924095e-06, "loss": 0.1083, "num_input_tokens_seen": 115190960, "step": 53375 }, { "epoch": 8.707993474714518, "grad_norm": 0.10237548500299454, "learning_rate": 2.500277762141692e-06, "loss": 0.0046, "num_input_tokens_seen": 115201680, "step": 53380 }, { "epoch": 8.708809135399674, "grad_norm": 9.355179786682129, "learning_rate": 2.49717585433262e-06, "loss": 0.0525, "num_input_tokens_seen": 115212496, "step": 53385 }, { "epoch": 8.709624796084828, "grad_norm": 2.9365639686584473, "learning_rate": 2.4940757707166474e-06, "loss": 0.1061, "num_input_tokens_seen": 115223888, "step": 53390 }, { "epoch": 8.710440456769984, "grad_norm": 0.06928296387195587, "learning_rate": 2.490977511545092e-06, "loss": 0.1105, "num_input_tokens_seen": 115233616, "step": 53395 }, { "epoch": 8.71125611745514, "grad_norm": 0.10657060146331787, "learning_rate": 2.4878810770691096e-06, "loss": 0.0438, "num_input_tokens_seen": 115244496, "step": 53400 }, { "epoch": 8.712071778140293, "grad_norm": 1.0137279033660889, "learning_rate": 2.48478646753971e-06, "loss": 0.1113, "num_input_tokens_seen": 115255216, "step": 53405 }, { "epoch": 8.71288743882545, "grad_norm": 0.08584178239107132, "learning_rate": 2.4816936832077615e-06, "loss": 0.0038, "num_input_tokens_seen": 115264720, "step": 53410 }, { "epoch": 8.713703099510603, "grad_norm": 0.23638053238391876, "learning_rate": 2.478602724323981e-06, "loss": 0.2156, "num_input_tokens_seen": 115276080, "step": 53415 }, { "epoch": 8.714518760195759, "grad_norm": 0.1255992203950882, "learning_rate": 2.4755135911389364e-06, "loss": 0.016, "num_input_tokens_seen": 115286576, "step": 53420 }, { "epoch": 8.715334420880914, "grad_norm": 0.11487893015146255, "learning_rate": 2.472426283903048e-06, "loss": 0.0873, "num_input_tokens_seen": 115297200, "step": 53425 }, { "epoch": 8.716150081566068, "grad_norm": 0.05824054032564163, "learning_rate": 2.4693408028665878e-06, "loss": 0.0044, "num_input_tokens_seen": 115308240, "step": 53430 }, { "epoch": 8.716965742251224, "grad_norm": 0.1160576343536377, "learning_rate": 2.4662571482796797e-06, "loss": 0.004, "num_input_tokens_seen": 115319600, "step": 53435 }, { "epoch": 8.717781402936378, "grad_norm": 0.394844651222229, "learning_rate": 2.4631753203923052e-06, "loss": 0.0049, "num_input_tokens_seen": 115330736, "step": 53440 }, { "epoch": 8.718597063621534, "grad_norm": 0.030199168249964714, "learning_rate": 2.460095319454289e-06, "loss": 0.2393, "num_input_tokens_seen": 115342672, "step": 53445 }, { "epoch": 8.719412724306688, "grad_norm": 0.010499256663024426, "learning_rate": 2.4570171457153123e-06, "loss": 0.2822, "num_input_tokens_seen": 115354320, "step": 53450 }, { "epoch": 8.720228384991843, "grad_norm": 3.4621529579162598, "learning_rate": 2.4539407994249088e-06, "loss": 0.3082, "num_input_tokens_seen": 115364880, "step": 53455 }, { "epoch": 8.721044045676999, "grad_norm": 0.05974346771836281, "learning_rate": 2.450866280832456e-06, "loss": 0.0054, "num_input_tokens_seen": 115375824, "step": 53460 }, { "epoch": 8.721859706362153, "grad_norm": 0.049211133271455765, "learning_rate": 2.4477935901872e-06, "loss": 0.0997, "num_input_tokens_seen": 115386992, "step": 53465 }, { "epoch": 8.722675367047309, "grad_norm": 0.07548020035028458, "learning_rate": 2.4447227277382244e-06, "loss": 0.1105, "num_input_tokens_seen": 115397008, "step": 53470 }, { "epoch": 8.723491027732463, "grad_norm": 0.15559165179729462, "learning_rate": 2.441653693734472e-06, "loss": 0.1576, "num_input_tokens_seen": 115407888, "step": 53475 }, { "epoch": 8.724306688417618, "grad_norm": 0.1762966513633728, "learning_rate": 2.438586488424727e-06, "loss": 0.1806, "num_input_tokens_seen": 115417936, "step": 53480 }, { "epoch": 8.725122349102774, "grad_norm": 0.4542035162448883, "learning_rate": 2.435521112057637e-06, "loss": 0.071, "num_input_tokens_seen": 115429200, "step": 53485 }, { "epoch": 8.725938009787928, "grad_norm": 0.06879319995641708, "learning_rate": 2.432457564881699e-06, "loss": 0.1011, "num_input_tokens_seen": 115439920, "step": 53490 }, { "epoch": 8.726753670473084, "grad_norm": 1.1868400573730469, "learning_rate": 2.429395847145252e-06, "loss": 0.0097, "num_input_tokens_seen": 115450608, "step": 53495 }, { "epoch": 8.727569331158238, "grad_norm": 0.3320519030094147, "learning_rate": 2.4263359590965042e-06, "loss": 0.0099, "num_input_tokens_seen": 115461680, "step": 53500 }, { "epoch": 8.728384991843393, "grad_norm": 0.23768679797649384, "learning_rate": 2.4232779009835006e-06, "loss": 0.0041, "num_input_tokens_seen": 115472752, "step": 53505 }, { "epoch": 8.729200652528547, "grad_norm": 0.17958806455135345, "learning_rate": 2.420221673054143e-06, "loss": 0.1738, "num_input_tokens_seen": 115483536, "step": 53510 }, { "epoch": 8.730016313213703, "grad_norm": 0.049967117607593536, "learning_rate": 2.417167275556187e-06, "loss": 0.0041, "num_input_tokens_seen": 115494736, "step": 53515 }, { "epoch": 8.730831973898859, "grad_norm": 0.06931749731302261, "learning_rate": 2.4141147087372336e-06, "loss": 0.125, "num_input_tokens_seen": 115505552, "step": 53520 }, { "epoch": 8.731647634584013, "grad_norm": 0.07650643587112427, "learning_rate": 2.4110639728447433e-06, "loss": 0.1952, "num_input_tokens_seen": 115515920, "step": 53525 }, { "epoch": 8.732463295269168, "grad_norm": 3.825303792953491, "learning_rate": 2.4080150681260212e-06, "loss": 0.2009, "num_input_tokens_seen": 115526896, "step": 53530 }, { "epoch": 8.733278955954322, "grad_norm": 2.012338161468506, "learning_rate": 2.4049679948282305e-06, "loss": 0.1668, "num_input_tokens_seen": 115536912, "step": 53535 }, { "epoch": 8.734094616639478, "grad_norm": 0.3810151517391205, "learning_rate": 2.401922753198377e-06, "loss": 0.0103, "num_input_tokens_seen": 115547728, "step": 53540 }, { "epoch": 8.734910277324634, "grad_norm": 0.1658923178911209, "learning_rate": 2.398879343483329e-06, "loss": 0.0388, "num_input_tokens_seen": 115559184, "step": 53545 }, { "epoch": 8.735725938009788, "grad_norm": 0.03131018951535225, "learning_rate": 2.3958377659297983e-06, "loss": 0.1574, "num_input_tokens_seen": 115569552, "step": 53550 }, { "epoch": 8.736541598694943, "grad_norm": 5.776989936828613, "learning_rate": 2.392798020784348e-06, "loss": 0.1313, "num_input_tokens_seen": 115578928, "step": 53555 }, { "epoch": 8.737357259380097, "grad_norm": 0.09969309717416763, "learning_rate": 2.3897601082934013e-06, "loss": 0.017, "num_input_tokens_seen": 115589904, "step": 53560 }, { "epoch": 8.738172920065253, "grad_norm": 0.2182675451040268, "learning_rate": 2.3867240287032214e-06, "loss": 0.006, "num_input_tokens_seen": 115599440, "step": 53565 }, { "epoch": 8.738988580750409, "grad_norm": 0.33458808064460754, "learning_rate": 2.3836897822599317e-06, "loss": 0.1146, "num_input_tokens_seen": 115610064, "step": 53570 }, { "epoch": 8.739804241435563, "grad_norm": 3.599860191345215, "learning_rate": 2.380657369209502e-06, "loss": 0.2455, "num_input_tokens_seen": 115621360, "step": 53575 }, { "epoch": 8.740619902120718, "grad_norm": 0.0940905213356018, "learning_rate": 2.3776267897977543e-06, "loss": 0.1025, "num_input_tokens_seen": 115632272, "step": 53580 }, { "epoch": 8.741435562805872, "grad_norm": 0.0359291210770607, "learning_rate": 2.374598044270365e-06, "loss": 0.0078, "num_input_tokens_seen": 115643280, "step": 53585 }, { "epoch": 8.742251223491028, "grad_norm": 0.03600488603115082, "learning_rate": 2.3715711328728575e-06, "loss": 0.0069, "num_input_tokens_seen": 115654128, "step": 53590 }, { "epoch": 8.743066884176184, "grad_norm": 0.08532227575778961, "learning_rate": 2.3685460558506097e-06, "loss": 0.0088, "num_input_tokens_seen": 115664528, "step": 53595 }, { "epoch": 8.743882544861338, "grad_norm": 5.841217517852783, "learning_rate": 2.3655228134488505e-06, "loss": 0.0946, "num_input_tokens_seen": 115675024, "step": 53600 }, { "epoch": 8.744698205546493, "grad_norm": 0.0855046808719635, "learning_rate": 2.362501405912651e-06, "loss": 0.0051, "num_input_tokens_seen": 115687056, "step": 53605 }, { "epoch": 8.745513866231647, "grad_norm": 0.10552932322025299, "learning_rate": 2.3594818334869568e-06, "loss": 0.1236, "num_input_tokens_seen": 115697840, "step": 53610 }, { "epoch": 8.746329526916803, "grad_norm": 0.3092189133167267, "learning_rate": 2.3564640964165386e-06, "loss": 0.004, "num_input_tokens_seen": 115708848, "step": 53615 }, { "epoch": 8.747145187601957, "grad_norm": 0.2367440164089203, "learning_rate": 2.353448194946037e-06, "loss": 0.3288, "num_input_tokens_seen": 115719568, "step": 53620 }, { "epoch": 8.747960848287113, "grad_norm": 0.07751596719026566, "learning_rate": 2.350434129319928e-06, "loss": 0.0903, "num_input_tokens_seen": 115730768, "step": 53625 }, { "epoch": 8.748776508972268, "grad_norm": 0.11400056630373001, "learning_rate": 2.347421899782551e-06, "loss": 0.0045, "num_input_tokens_seen": 115741840, "step": 53630 }, { "epoch": 8.749592169657422, "grad_norm": 0.02843559719622135, "learning_rate": 2.3444115065780953e-06, "loss": 0.0128, "num_input_tokens_seen": 115752208, "step": 53635 }, { "epoch": 8.750407830342578, "grad_norm": 0.24501417577266693, "learning_rate": 2.341402949950594e-06, "loss": 0.004, "num_input_tokens_seen": 115762992, "step": 53640 }, { "epoch": 8.751223491027732, "grad_norm": 11.36231803894043, "learning_rate": 2.338396230143941e-06, "loss": 0.2064, "num_input_tokens_seen": 115774224, "step": 53645 }, { "epoch": 8.752039151712887, "grad_norm": 2.922313690185547, "learning_rate": 2.335391347401872e-06, "loss": 0.0064, "num_input_tokens_seen": 115785776, "step": 53650 }, { "epoch": 8.752854812398043, "grad_norm": 6.933698654174805, "learning_rate": 2.3323883019679805e-06, "loss": 0.2152, "num_input_tokens_seen": 115796656, "step": 53655 }, { "epoch": 8.753670473083197, "grad_norm": 6.864072799682617, "learning_rate": 2.3293870940857084e-06, "loss": 0.093, "num_input_tokens_seen": 115808272, "step": 53660 }, { "epoch": 8.754486133768353, "grad_norm": 0.2145785391330719, "learning_rate": 2.326387723998347e-06, "loss": 0.0061, "num_input_tokens_seen": 115820144, "step": 53665 }, { "epoch": 8.755301794453507, "grad_norm": 0.023491421714425087, "learning_rate": 2.3233901919490404e-06, "loss": 0.3007, "num_input_tokens_seen": 115831888, "step": 53670 }, { "epoch": 8.756117455138662, "grad_norm": 0.1304338425397873, "learning_rate": 2.3203944981807835e-06, "loss": 0.0839, "num_input_tokens_seen": 115843024, "step": 53675 }, { "epoch": 8.756933115823816, "grad_norm": 3.298250675201416, "learning_rate": 2.3174006429364263e-06, "loss": 0.118, "num_input_tokens_seen": 115854288, "step": 53680 }, { "epoch": 8.757748776508972, "grad_norm": 0.06621116399765015, "learning_rate": 2.314408626458664e-06, "loss": 0.1373, "num_input_tokens_seen": 115865296, "step": 53685 }, { "epoch": 8.758564437194128, "grad_norm": 0.6509382724761963, "learning_rate": 2.311418448990041e-06, "loss": 0.0078, "num_input_tokens_seen": 115876304, "step": 53690 }, { "epoch": 8.759380097879282, "grad_norm": 22.45487403869629, "learning_rate": 2.3084301107729633e-06, "loss": 0.0732, "num_input_tokens_seen": 115887632, "step": 53695 }, { "epoch": 8.760195758564437, "grad_norm": 0.30147695541381836, "learning_rate": 2.3054436120496736e-06, "loss": 0.059, "num_input_tokens_seen": 115898640, "step": 53700 }, { "epoch": 8.761011419249591, "grad_norm": 0.5400973558425903, "learning_rate": 2.302458953062275e-06, "loss": 0.0063, "num_input_tokens_seen": 115909392, "step": 53705 }, { "epoch": 8.761827079934747, "grad_norm": 0.04615411162376404, "learning_rate": 2.2994761340527195e-06, "loss": 0.0599, "num_input_tokens_seen": 115920080, "step": 53710 }, { "epoch": 8.762642740619903, "grad_norm": 0.14032778143882751, "learning_rate": 2.2964951552628096e-06, "loss": 0.0025, "num_input_tokens_seen": 115930640, "step": 53715 }, { "epoch": 8.763458401305057, "grad_norm": 0.09734567999839783, "learning_rate": 2.293516016934202e-06, "loss": 0.0036, "num_input_tokens_seen": 115941392, "step": 53720 }, { "epoch": 8.764274061990212, "grad_norm": 1.4439170360565186, "learning_rate": 2.2905387193083965e-06, "loss": 0.088, "num_input_tokens_seen": 115952176, "step": 53725 }, { "epoch": 8.765089722675366, "grad_norm": 15.759501457214355, "learning_rate": 2.287563262626749e-06, "loss": 0.0239, "num_input_tokens_seen": 115962896, "step": 53730 }, { "epoch": 8.765905383360522, "grad_norm": 0.1682003140449524, "learning_rate": 2.2845896471304667e-06, "loss": 0.0061, "num_input_tokens_seen": 115972944, "step": 53735 }, { "epoch": 8.766721044045678, "grad_norm": 0.09591232240200043, "learning_rate": 2.2816178730606012e-06, "loss": 0.1179, "num_input_tokens_seen": 115983600, "step": 53740 }, { "epoch": 8.767536704730832, "grad_norm": 0.06589756160974503, "learning_rate": 2.2786479406580658e-06, "loss": 0.1541, "num_input_tokens_seen": 115994832, "step": 53745 }, { "epoch": 8.768352365415987, "grad_norm": 0.05247294902801514, "learning_rate": 2.2756798501636146e-06, "loss": 0.0033, "num_input_tokens_seen": 116004848, "step": 53750 }, { "epoch": 8.769168026101141, "grad_norm": 7.89129638671875, "learning_rate": 2.272713601817855e-06, "loss": 0.0208, "num_input_tokens_seen": 116016016, "step": 53755 }, { "epoch": 8.769983686786297, "grad_norm": 0.05102195218205452, "learning_rate": 2.269749195861251e-06, "loss": 0.0758, "num_input_tokens_seen": 116027088, "step": 53760 }, { "epoch": 8.770799347471453, "grad_norm": 0.08562790602445602, "learning_rate": 2.266786632534107e-06, "loss": 0.1945, "num_input_tokens_seen": 116038192, "step": 53765 }, { "epoch": 8.771615008156607, "grad_norm": 0.23722116649150848, "learning_rate": 2.2638259120765864e-06, "loss": 0.1114, "num_input_tokens_seen": 116048272, "step": 53770 }, { "epoch": 8.772430668841762, "grad_norm": 0.18164660036563873, "learning_rate": 2.2608670347286947e-06, "loss": 0.0102, "num_input_tokens_seen": 116060112, "step": 53775 }, { "epoch": 8.773246329526916, "grad_norm": 0.03893053159117699, "learning_rate": 2.257910000730304e-06, "loss": 0.0054, "num_input_tokens_seen": 116071216, "step": 53780 }, { "epoch": 8.774061990212072, "grad_norm": 0.2040170133113861, "learning_rate": 2.2549548103211222e-06, "loss": 0.0421, "num_input_tokens_seen": 116081360, "step": 53785 }, { "epoch": 8.774877650897226, "grad_norm": 0.6495329737663269, "learning_rate": 2.2520014637407076e-06, "loss": 0.0762, "num_input_tokens_seen": 116091216, "step": 53790 }, { "epoch": 8.775693311582382, "grad_norm": 0.07339529693126678, "learning_rate": 2.2490499612284777e-06, "loss": 0.0038, "num_input_tokens_seen": 116101168, "step": 53795 }, { "epoch": 8.776508972267537, "grad_norm": 0.22962583601474762, "learning_rate": 2.246100303023696e-06, "loss": 0.0055, "num_input_tokens_seen": 116112272, "step": 53800 }, { "epoch": 8.777324632952691, "grad_norm": 3.569058895111084, "learning_rate": 2.2431524893654743e-06, "loss": 0.2096, "num_input_tokens_seen": 116123184, "step": 53805 }, { "epoch": 8.778140293637847, "grad_norm": 0.17319542169570923, "learning_rate": 2.2402065204927797e-06, "loss": 0.005, "num_input_tokens_seen": 116133776, "step": 53810 }, { "epoch": 8.778955954323001, "grad_norm": 2.4589014053344727, "learning_rate": 2.237262396644421e-06, "loss": 0.1882, "num_input_tokens_seen": 116144688, "step": 53815 }, { "epoch": 8.779771615008157, "grad_norm": 0.08419797569513321, "learning_rate": 2.2343201180590745e-06, "loss": 0.0986, "num_input_tokens_seen": 116154288, "step": 53820 }, { "epoch": 8.780587275693312, "grad_norm": 0.09047245234251022, "learning_rate": 2.2313796849752516e-06, "loss": 0.057, "num_input_tokens_seen": 116165584, "step": 53825 }, { "epoch": 8.781402936378466, "grad_norm": 0.1527445912361145, "learning_rate": 2.2284410976313174e-06, "loss": 0.0334, "num_input_tokens_seen": 116176848, "step": 53830 }, { "epoch": 8.782218597063622, "grad_norm": 0.05455156788229942, "learning_rate": 2.2255043562654926e-06, "loss": 0.0829, "num_input_tokens_seen": 116186800, "step": 53835 }, { "epoch": 8.783034257748776, "grad_norm": 0.07151231914758682, "learning_rate": 2.2225694611158366e-06, "loss": 0.0045, "num_input_tokens_seen": 116196656, "step": 53840 }, { "epoch": 8.783849918433932, "grad_norm": 0.3095216453075409, "learning_rate": 2.2196364124202756e-06, "loss": 0.0048, "num_input_tokens_seen": 116207760, "step": 53845 }, { "epoch": 8.784665579119086, "grad_norm": 2.7524521350860596, "learning_rate": 2.2167052104165724e-06, "loss": 0.092, "num_input_tokens_seen": 116216752, "step": 53850 }, { "epoch": 8.785481239804241, "grad_norm": 6.77707576751709, "learning_rate": 2.2137758553423454e-06, "loss": 0.112, "num_input_tokens_seen": 116227568, "step": 53855 }, { "epoch": 8.786296900489397, "grad_norm": 0.18915368616580963, "learning_rate": 2.2108483474350626e-06, "loss": 0.0053, "num_input_tokens_seen": 116238736, "step": 53860 }, { "epoch": 8.78711256117455, "grad_norm": 3.433610200881958, "learning_rate": 2.207922686932046e-06, "loss": 0.232, "num_input_tokens_seen": 116249584, "step": 53865 }, { "epoch": 8.787928221859707, "grad_norm": 0.03905387595295906, "learning_rate": 2.2049988740704604e-06, "loss": 0.0045, "num_input_tokens_seen": 116259664, "step": 53870 }, { "epoch": 8.78874388254486, "grad_norm": 4.9666547775268555, "learning_rate": 2.202076909087328e-06, "loss": 0.1455, "num_input_tokens_seen": 116270352, "step": 53875 }, { "epoch": 8.789559543230016, "grad_norm": 0.8719344139099121, "learning_rate": 2.199156792219517e-06, "loss": 0.0247, "num_input_tokens_seen": 116281232, "step": 53880 }, { "epoch": 8.790375203915172, "grad_norm": 0.11355309933423996, "learning_rate": 2.1962385237037445e-06, "loss": 0.0055, "num_input_tokens_seen": 116291728, "step": 53885 }, { "epoch": 8.791190864600326, "grad_norm": 3.836967706680298, "learning_rate": 2.193322103776585e-06, "loss": 0.1835, "num_input_tokens_seen": 116302480, "step": 53890 }, { "epoch": 8.792006525285482, "grad_norm": 3.569146156311035, "learning_rate": 2.1904075326744543e-06, "loss": 0.1813, "num_input_tokens_seen": 116314192, "step": 53895 }, { "epoch": 8.792822185970635, "grad_norm": 0.04698646813631058, "learning_rate": 2.1874948106336254e-06, "loss": 0.1143, "num_input_tokens_seen": 116325616, "step": 53900 }, { "epoch": 8.793637846655791, "grad_norm": 0.033510949462652206, "learning_rate": 2.1845839378902167e-06, "loss": 0.0024, "num_input_tokens_seen": 116335344, "step": 53905 }, { "epoch": 8.794453507340947, "grad_norm": 0.059157855808734894, "learning_rate": 2.181674914680196e-06, "loss": 0.005, "num_input_tokens_seen": 116346480, "step": 53910 }, { "epoch": 8.7952691680261, "grad_norm": 0.3026077449321747, "learning_rate": 2.178767741239382e-06, "loss": 0.2606, "num_input_tokens_seen": 116358576, "step": 53915 }, { "epoch": 8.796084828711257, "grad_norm": 0.14737290143966675, "learning_rate": 2.1758624178034537e-06, "loss": 0.1134, "num_input_tokens_seen": 116369872, "step": 53920 }, { "epoch": 8.79690048939641, "grad_norm": 0.30312269926071167, "learning_rate": 2.1729589446079252e-06, "loss": 0.0049, "num_input_tokens_seen": 116380432, "step": 53925 }, { "epoch": 8.797716150081566, "grad_norm": 0.05973745882511139, "learning_rate": 2.1700573218881694e-06, "loss": 0.0044, "num_input_tokens_seen": 116390544, "step": 53930 }, { "epoch": 8.798531810766722, "grad_norm": 0.0973372831940651, "learning_rate": 2.167157549879406e-06, "loss": 0.0053, "num_input_tokens_seen": 116402512, "step": 53935 }, { "epoch": 8.799347471451876, "grad_norm": 0.11348201334476471, "learning_rate": 2.1642596288166976e-06, "loss": 0.1278, "num_input_tokens_seen": 116413488, "step": 53940 }, { "epoch": 8.800163132137031, "grad_norm": 0.046488139778375626, "learning_rate": 2.1613635589349756e-06, "loss": 0.0053, "num_input_tokens_seen": 116424976, "step": 53945 }, { "epoch": 8.800978792822185, "grad_norm": 0.1353694200515747, "learning_rate": 2.1584693404690076e-06, "loss": 0.105, "num_input_tokens_seen": 116435856, "step": 53950 }, { "epoch": 8.801794453507341, "grad_norm": 0.15112315118312836, "learning_rate": 2.155576973653409e-06, "loss": 0.3163, "num_input_tokens_seen": 116447472, "step": 53955 }, { "epoch": 8.802610114192497, "grad_norm": 0.11449641734361649, "learning_rate": 2.1526864587226525e-06, "loss": 0.1872, "num_input_tokens_seen": 116458064, "step": 53960 }, { "epoch": 8.80342577487765, "grad_norm": 0.1149986982345581, "learning_rate": 2.14979779591106e-06, "loss": 0.0391, "num_input_tokens_seen": 116468464, "step": 53965 }, { "epoch": 8.804241435562806, "grad_norm": 0.020535923540592194, "learning_rate": 2.1469109854527993e-06, "loss": 0.1241, "num_input_tokens_seen": 116477488, "step": 53970 }, { "epoch": 8.80505709624796, "grad_norm": 0.09510662406682968, "learning_rate": 2.1440260275818856e-06, "loss": 0.102, "num_input_tokens_seen": 116488368, "step": 53975 }, { "epoch": 8.805872756933116, "grad_norm": 0.1702425479888916, "learning_rate": 2.1411429225321965e-06, "loss": 0.1228, "num_input_tokens_seen": 116498704, "step": 53980 }, { "epoch": 8.80668841761827, "grad_norm": 0.1370113044977188, "learning_rate": 2.138261670537445e-06, "loss": 0.0079, "num_input_tokens_seen": 116510000, "step": 53985 }, { "epoch": 8.807504078303426, "grad_norm": 0.06765083223581314, "learning_rate": 2.1353822718312016e-06, "loss": 0.1072, "num_input_tokens_seen": 116521552, "step": 53990 }, { "epoch": 8.808319738988581, "grad_norm": 0.1415599137544632, "learning_rate": 2.132504726646883e-06, "loss": 0.006, "num_input_tokens_seen": 116532208, "step": 53995 }, { "epoch": 8.809135399673735, "grad_norm": 0.2205033153295517, "learning_rate": 2.1296290352177644e-06, "loss": 0.0219, "num_input_tokens_seen": 116542608, "step": 54000 }, { "epoch": 8.809951060358891, "grad_norm": 0.10757742077112198, "learning_rate": 2.1267551977769555e-06, "loss": 0.2081, "num_input_tokens_seen": 116552656, "step": 54005 }, { "epoch": 8.810766721044045, "grad_norm": 0.1324695497751236, "learning_rate": 2.123883214557429e-06, "loss": 0.0659, "num_input_tokens_seen": 116565008, "step": 54010 }, { "epoch": 8.8115823817292, "grad_norm": 0.07865919172763824, "learning_rate": 2.1210130857920034e-06, "loss": 0.0053, "num_input_tokens_seen": 116574576, "step": 54015 }, { "epoch": 8.812398042414356, "grad_norm": 0.02997676655650139, "learning_rate": 2.1181448117133408e-06, "loss": 0.0074, "num_input_tokens_seen": 116584944, "step": 54020 }, { "epoch": 8.81321370309951, "grad_norm": 0.05799383670091629, "learning_rate": 2.115278392553963e-06, "loss": 0.0945, "num_input_tokens_seen": 116595920, "step": 54025 }, { "epoch": 8.814029363784666, "grad_norm": 0.10029251128435135, "learning_rate": 2.112413828546231e-06, "loss": 0.0311, "num_input_tokens_seen": 116606704, "step": 54030 }, { "epoch": 8.81484502446982, "grad_norm": 0.1537550836801529, "learning_rate": 2.1095511199223676e-06, "loss": 0.0055, "num_input_tokens_seen": 116618032, "step": 54035 }, { "epoch": 8.815660685154976, "grad_norm": 0.07866183668375015, "learning_rate": 2.106690266914435e-06, "loss": 0.1524, "num_input_tokens_seen": 116628464, "step": 54040 }, { "epoch": 8.81647634584013, "grad_norm": 0.18933890759944916, "learning_rate": 2.10383126975435e-06, "loss": 0.005, "num_input_tokens_seen": 116639408, "step": 54045 }, { "epoch": 8.817292006525285, "grad_norm": 0.031053628772497177, "learning_rate": 2.1009741286738745e-06, "loss": 0.35, "num_input_tokens_seen": 116649808, "step": 54050 }, { "epoch": 8.818107667210441, "grad_norm": 0.07190708070993423, "learning_rate": 2.098118843904626e-06, "loss": 0.1437, "num_input_tokens_seen": 116661584, "step": 54055 }, { "epoch": 8.818923327895595, "grad_norm": 0.044518325477838516, "learning_rate": 2.0952654156780686e-06, "loss": 0.0031, "num_input_tokens_seen": 116671664, "step": 54060 }, { "epoch": 8.81973898858075, "grad_norm": 0.0714629516005516, "learning_rate": 2.09241384422551e-06, "loss": 0.1103, "num_input_tokens_seen": 116683248, "step": 54065 }, { "epoch": 8.820554649265905, "grad_norm": 2.834881067276001, "learning_rate": 2.089564129778121e-06, "loss": 0.0947, "num_input_tokens_seen": 116693936, "step": 54070 }, { "epoch": 8.82137030995106, "grad_norm": 0.06168261170387268, "learning_rate": 2.0867162725669077e-06, "loss": 0.0039, "num_input_tokens_seen": 116703856, "step": 54075 }, { "epoch": 8.822185970636216, "grad_norm": 0.03860916569828987, "learning_rate": 2.0838702728227356e-06, "loss": 0.003, "num_input_tokens_seen": 116713136, "step": 54080 }, { "epoch": 8.82300163132137, "grad_norm": 0.07390040904283524, "learning_rate": 2.081026130776309e-06, "loss": 0.0038, "num_input_tokens_seen": 116723696, "step": 54085 }, { "epoch": 8.823817292006526, "grad_norm": 2.8101046085357666, "learning_rate": 2.078183846658199e-06, "loss": 0.0877, "num_input_tokens_seen": 116735280, "step": 54090 }, { "epoch": 8.82463295269168, "grad_norm": 4.103845119476318, "learning_rate": 2.075343420698811e-06, "loss": 0.1353, "num_input_tokens_seen": 116746416, "step": 54095 }, { "epoch": 8.825448613376835, "grad_norm": 0.0914025604724884, "learning_rate": 2.0725048531284015e-06, "loss": 0.1435, "num_input_tokens_seen": 116757232, "step": 54100 }, { "epoch": 8.826264274061991, "grad_norm": 0.024981753900647163, "learning_rate": 2.0696681441770836e-06, "loss": 0.0075, "num_input_tokens_seen": 116768304, "step": 54105 }, { "epoch": 8.827079934747145, "grad_norm": 9.234810829162598, "learning_rate": 2.066833294074813e-06, "loss": 0.0957, "num_input_tokens_seen": 116779216, "step": 54110 }, { "epoch": 8.8278955954323, "grad_norm": 0.2372504025697708, "learning_rate": 2.064000303051397e-06, "loss": 0.0026, "num_input_tokens_seen": 116790160, "step": 54115 }, { "epoch": 8.828711256117455, "grad_norm": 0.20221616327762604, "learning_rate": 2.061169171336494e-06, "loss": 0.0067, "num_input_tokens_seen": 116801936, "step": 54120 }, { "epoch": 8.82952691680261, "grad_norm": 0.20247258245944977, "learning_rate": 2.058339899159606e-06, "loss": 0.1894, "num_input_tokens_seen": 116812816, "step": 54125 }, { "epoch": 8.830342577487766, "grad_norm": 0.10631213337182999, "learning_rate": 2.0555124867500915e-06, "loss": 0.1848, "num_input_tokens_seen": 116821840, "step": 54130 }, { "epoch": 8.83115823817292, "grad_norm": 0.06082427129149437, "learning_rate": 2.05268693433715e-06, "loss": 0.0045, "num_input_tokens_seen": 116832912, "step": 54135 }, { "epoch": 8.831973898858076, "grad_norm": 0.13098251819610596, "learning_rate": 2.049863242149844e-06, "loss": 0.0039, "num_input_tokens_seen": 116842896, "step": 54140 }, { "epoch": 8.83278955954323, "grad_norm": 7.468482971191406, "learning_rate": 2.0470414104170694e-06, "loss": 0.0903, "num_input_tokens_seen": 116853744, "step": 54145 }, { "epoch": 8.833605220228385, "grad_norm": 0.12263431400060654, "learning_rate": 2.044221439367583e-06, "loss": 0.11, "num_input_tokens_seen": 116863824, "step": 54150 }, { "epoch": 8.83442088091354, "grad_norm": 0.03963104262948036, "learning_rate": 2.0414033292299823e-06, "loss": 0.0065, "num_input_tokens_seen": 116875056, "step": 54155 }, { "epoch": 8.835236541598695, "grad_norm": 0.06285598129034042, "learning_rate": 2.0385870802327176e-06, "loss": 0.1412, "num_input_tokens_seen": 116886768, "step": 54160 }, { "epoch": 8.83605220228385, "grad_norm": 3.2838828563690186, "learning_rate": 2.0357726926040875e-06, "loss": 0.0909, "num_input_tokens_seen": 116898032, "step": 54165 }, { "epoch": 8.836867862969005, "grad_norm": 0.1277763992547989, "learning_rate": 2.0329601665722453e-06, "loss": 0.0039, "num_input_tokens_seen": 116909008, "step": 54170 }, { "epoch": 8.83768352365416, "grad_norm": 0.15885736048221588, "learning_rate": 2.030149502365186e-06, "loss": 0.0051, "num_input_tokens_seen": 116920464, "step": 54175 }, { "epoch": 8.838499184339314, "grad_norm": 0.05947447568178177, "learning_rate": 2.027340700210753e-06, "loss": 0.0061, "num_input_tokens_seen": 116931568, "step": 54180 }, { "epoch": 8.83931484502447, "grad_norm": 3.056044578552246, "learning_rate": 2.0245337603366472e-06, "loss": 0.0993, "num_input_tokens_seen": 116943312, "step": 54185 }, { "epoch": 8.840130505709626, "grad_norm": 0.056010182946920395, "learning_rate": 2.0217286829704115e-06, "loss": 0.0048, "num_input_tokens_seen": 116954480, "step": 54190 }, { "epoch": 8.84094616639478, "grad_norm": 3.2493302822113037, "learning_rate": 2.018925468339436e-06, "loss": 0.0349, "num_input_tokens_seen": 116964880, "step": 54195 }, { "epoch": 8.841761827079935, "grad_norm": 0.08195551484823227, "learning_rate": 2.01612411667097e-06, "loss": 0.1324, "num_input_tokens_seen": 116974896, "step": 54200 }, { "epoch": 8.84257748776509, "grad_norm": 0.3906518816947937, "learning_rate": 2.013324628192101e-06, "loss": 0.0057, "num_input_tokens_seen": 116985072, "step": 54205 }, { "epoch": 8.843393148450245, "grad_norm": 0.08634142577648163, "learning_rate": 2.0105270031297725e-06, "loss": 0.0032, "num_input_tokens_seen": 116995856, "step": 54210 }, { "epoch": 8.844208809135399, "grad_norm": 3.831986904144287, "learning_rate": 2.0077312417107695e-06, "loss": 0.3087, "num_input_tokens_seen": 117007152, "step": 54215 }, { "epoch": 8.845024469820554, "grad_norm": 2.8840560913085938, "learning_rate": 2.0049373441617363e-06, "loss": 0.1635, "num_input_tokens_seen": 117017040, "step": 54220 }, { "epoch": 8.84584013050571, "grad_norm": 0.3361341059207916, "learning_rate": 2.002145310709155e-06, "loss": 0.0071, "num_input_tokens_seen": 117028304, "step": 54225 }, { "epoch": 8.846655791190864, "grad_norm": 0.07720833271741867, "learning_rate": 1.9993551415793647e-06, "loss": 0.1723, "num_input_tokens_seen": 117039120, "step": 54230 }, { "epoch": 8.84747145187602, "grad_norm": 0.06056373566389084, "learning_rate": 1.9965668369985507e-06, "loss": 0.0055, "num_input_tokens_seen": 117050032, "step": 54235 }, { "epoch": 8.848287112561174, "grad_norm": 0.3385069966316223, "learning_rate": 1.993780397192749e-06, "loss": 0.1919, "num_input_tokens_seen": 117058448, "step": 54240 }, { "epoch": 8.84910277324633, "grad_norm": 0.12695370614528656, "learning_rate": 1.9909958223878424e-06, "loss": 0.0036, "num_input_tokens_seen": 117068880, "step": 54245 }, { "epoch": 8.849918433931485, "grad_norm": 4.353246688842773, "learning_rate": 1.988213112809559e-06, "loss": 0.2479, "num_input_tokens_seen": 117079312, "step": 54250 }, { "epoch": 8.850734094616639, "grad_norm": 14.258933067321777, "learning_rate": 1.9854322686834766e-06, "loss": 0.2642, "num_input_tokens_seen": 117089776, "step": 54255 }, { "epoch": 8.851549755301795, "grad_norm": 0.03966104984283447, "learning_rate": 1.982653290235034e-06, "loss": 0.263, "num_input_tokens_seen": 117100304, "step": 54260 }, { "epoch": 8.852365415986949, "grad_norm": 0.0750674456357956, "learning_rate": 1.979876177689505e-06, "loss": 0.1308, "num_input_tokens_seen": 117110064, "step": 54265 }, { "epoch": 8.853181076672104, "grad_norm": 28.39978790283203, "learning_rate": 1.9771009312720164e-06, "loss": 0.1608, "num_input_tokens_seen": 117121872, "step": 54270 }, { "epoch": 8.85399673735726, "grad_norm": 0.1714479923248291, "learning_rate": 1.9743275512075417e-06, "loss": 0.0055, "num_input_tokens_seen": 117133040, "step": 54275 }, { "epoch": 8.854812398042414, "grad_norm": 2.3563148975372314, "learning_rate": 1.9715560377209093e-06, "loss": 0.1389, "num_input_tokens_seen": 117145488, "step": 54280 }, { "epoch": 8.85562805872757, "grad_norm": 0.08297800272703171, "learning_rate": 1.9687863910367866e-06, "loss": 0.0536, "num_input_tokens_seen": 117156464, "step": 54285 }, { "epoch": 8.856443719412724, "grad_norm": 0.19360771775245667, "learning_rate": 1.9660186113796996e-06, "loss": 0.0049, "num_input_tokens_seen": 117166128, "step": 54290 }, { "epoch": 8.85725938009788, "grad_norm": 0.3158252239227295, "learning_rate": 1.963252698974019e-06, "loss": 0.0311, "num_input_tokens_seen": 117176272, "step": 54295 }, { "epoch": 8.858075040783035, "grad_norm": 0.14304521679878235, "learning_rate": 1.96048865404396e-06, "loss": 0.2595, "num_input_tokens_seen": 117187440, "step": 54300 }, { "epoch": 8.858890701468189, "grad_norm": 0.05150622874498367, "learning_rate": 1.9577264768135927e-06, "loss": 0.0046, "num_input_tokens_seen": 117198416, "step": 54305 }, { "epoch": 8.859706362153345, "grad_norm": 0.029016217216849327, "learning_rate": 1.9549661675068303e-06, "loss": 0.1438, "num_input_tokens_seen": 117209104, "step": 54310 }, { "epoch": 8.860522022838499, "grad_norm": 0.06012250483036041, "learning_rate": 1.9522077263474436e-06, "loss": 0.003, "num_input_tokens_seen": 117220592, "step": 54315 }, { "epoch": 8.861337683523654, "grad_norm": 0.0844341441988945, "learning_rate": 1.94945115355904e-06, "loss": 0.0067, "num_input_tokens_seen": 117231280, "step": 54320 }, { "epoch": 8.86215334420881, "grad_norm": 3.6612021923065186, "learning_rate": 1.946696449365082e-06, "loss": 0.0855, "num_input_tokens_seen": 117243216, "step": 54325 }, { "epoch": 8.862969004893964, "grad_norm": 0.191433846950531, "learning_rate": 1.943943613988883e-06, "loss": 0.0073, "num_input_tokens_seen": 117252976, "step": 54330 }, { "epoch": 8.86378466557912, "grad_norm": 0.10980988293886185, "learning_rate": 1.9411926476535976e-06, "loss": 0.0156, "num_input_tokens_seen": 117264016, "step": 54335 }, { "epoch": 8.864600326264274, "grad_norm": 3.5337917804718018, "learning_rate": 1.938443550582239e-06, "loss": 0.2485, "num_input_tokens_seen": 117274864, "step": 54340 }, { "epoch": 8.86541598694943, "grad_norm": 0.14542673528194427, "learning_rate": 1.935696322997657e-06, "loss": 0.0046, "num_input_tokens_seen": 117285552, "step": 54345 }, { "epoch": 8.866231647634583, "grad_norm": 6.901078224182129, "learning_rate": 1.932950965122554e-06, "loss": 0.0518, "num_input_tokens_seen": 117295728, "step": 54350 }, { "epoch": 8.867047308319739, "grad_norm": 0.24575941264629364, "learning_rate": 1.930207477179491e-06, "loss": 0.1869, "num_input_tokens_seen": 117307408, "step": 54355 }, { "epoch": 8.867862969004895, "grad_norm": 4.659783840179443, "learning_rate": 1.9274658593908647e-06, "loss": 0.1681, "num_input_tokens_seen": 117318832, "step": 54360 }, { "epoch": 8.868678629690049, "grad_norm": 1.5721524953842163, "learning_rate": 1.9247261119789252e-06, "loss": 0.1441, "num_input_tokens_seen": 117329712, "step": 54365 }, { "epoch": 8.869494290375204, "grad_norm": 4.646288871765137, "learning_rate": 1.9219882351657696e-06, "loss": 0.1658, "num_input_tokens_seen": 117339728, "step": 54370 }, { "epoch": 8.870309951060358, "grad_norm": 0.1199723333120346, "learning_rate": 1.9192522291733434e-06, "loss": 0.0715, "num_input_tokens_seen": 117351408, "step": 54375 }, { "epoch": 8.871125611745514, "grad_norm": 6.661080837249756, "learning_rate": 1.9165180942234435e-06, "loss": 0.4362, "num_input_tokens_seen": 117361904, "step": 54380 }, { "epoch": 8.87194127243067, "grad_norm": 0.06170143187046051, "learning_rate": 1.913785830537712e-06, "loss": 0.0782, "num_input_tokens_seen": 117371536, "step": 54385 }, { "epoch": 8.872756933115824, "grad_norm": 0.0556231290102005, "learning_rate": 1.911055438337639e-06, "loss": 0.1225, "num_input_tokens_seen": 117379984, "step": 54390 }, { "epoch": 8.87357259380098, "grad_norm": 0.03875993564724922, "learning_rate": 1.9083269178445636e-06, "loss": 0.0049, "num_input_tokens_seen": 117390416, "step": 54395 }, { "epoch": 8.874388254486133, "grad_norm": 4.18616247177124, "learning_rate": 1.9056002692796698e-06, "loss": 0.0878, "num_input_tokens_seen": 117401136, "step": 54400 }, { "epoch": 8.875203915171289, "grad_norm": 0.110075943171978, "learning_rate": 1.9028754928640008e-06, "loss": 0.1132, "num_input_tokens_seen": 117411344, "step": 54405 }, { "epoch": 8.876019575856443, "grad_norm": 0.05919650197029114, "learning_rate": 1.9001525888184407e-06, "loss": 0.0058, "num_input_tokens_seen": 117421680, "step": 54410 }, { "epoch": 8.876835236541599, "grad_norm": 0.12138954550027847, "learning_rate": 1.8974315573637185e-06, "loss": 0.0046, "num_input_tokens_seen": 117432048, "step": 54415 }, { "epoch": 8.877650897226754, "grad_norm": 0.07046827673912048, "learning_rate": 1.8947123987204135e-06, "loss": 0.0811, "num_input_tokens_seen": 117444080, "step": 54420 }, { "epoch": 8.878466557911908, "grad_norm": 0.16570985317230225, "learning_rate": 1.8919951131089602e-06, "loss": 0.1182, "num_input_tokens_seen": 117454864, "step": 54425 }, { "epoch": 8.879282218597064, "grad_norm": 0.04637861251831055, "learning_rate": 1.88927970074963e-06, "loss": 0.1158, "num_input_tokens_seen": 117464944, "step": 54430 }, { "epoch": 8.880097879282218, "grad_norm": 0.07738249003887177, "learning_rate": 1.8865661618625491e-06, "loss": 0.005, "num_input_tokens_seen": 117475344, "step": 54435 }, { "epoch": 8.880913539967374, "grad_norm": 6.084794044494629, "learning_rate": 1.883854496667692e-06, "loss": 0.2447, "num_input_tokens_seen": 117486288, "step": 54440 }, { "epoch": 8.88172920065253, "grad_norm": 3.370311975479126, "learning_rate": 1.8811447053848796e-06, "loss": 0.0346, "num_input_tokens_seen": 117496432, "step": 54445 }, { "epoch": 8.882544861337683, "grad_norm": 0.1351543515920639, "learning_rate": 1.878436788233781e-06, "loss": 0.0549, "num_input_tokens_seen": 117506960, "step": 54450 }, { "epoch": 8.883360522022839, "grad_norm": 0.1686161607503891, "learning_rate": 1.8757307454339095e-06, "loss": 0.209, "num_input_tokens_seen": 117517360, "step": 54455 }, { "epoch": 8.884176182707993, "grad_norm": 0.39949020743370056, "learning_rate": 1.8730265772046396e-06, "loss": 0.145, "num_input_tokens_seen": 117527376, "step": 54460 }, { "epoch": 8.884991843393149, "grad_norm": 0.07697786390781403, "learning_rate": 1.8703242837651818e-06, "loss": 0.0056, "num_input_tokens_seen": 117537104, "step": 54465 }, { "epoch": 8.885807504078304, "grad_norm": 0.2855949401855469, "learning_rate": 1.8676238653345945e-06, "loss": 0.1307, "num_input_tokens_seen": 117546896, "step": 54470 }, { "epoch": 8.886623164763458, "grad_norm": 0.10021426528692245, "learning_rate": 1.8649253221317914e-06, "loss": 0.0806, "num_input_tokens_seen": 117557328, "step": 54475 }, { "epoch": 8.887438825448614, "grad_norm": 0.127748504281044, "learning_rate": 1.8622286543755252e-06, "loss": 0.0226, "num_input_tokens_seen": 117567568, "step": 54480 }, { "epoch": 8.888254486133768, "grad_norm": 0.22251491248607635, "learning_rate": 1.8595338622844072e-06, "loss": 0.0055, "num_input_tokens_seen": 117578032, "step": 54485 }, { "epoch": 8.889070146818923, "grad_norm": 0.10934317857027054, "learning_rate": 1.8568409460768848e-06, "loss": 0.0878, "num_input_tokens_seen": 117589456, "step": 54490 }, { "epoch": 8.88988580750408, "grad_norm": 4.995783805847168, "learning_rate": 1.8541499059712641e-06, "loss": 0.0089, "num_input_tokens_seen": 117599952, "step": 54495 }, { "epoch": 8.890701468189233, "grad_norm": 0.062409330159425735, "learning_rate": 1.8514607421856928e-06, "loss": 0.1433, "num_input_tokens_seen": 117610288, "step": 54500 }, { "epoch": 8.891517128874389, "grad_norm": 0.21258576214313507, "learning_rate": 1.8487734549381686e-06, "loss": 0.0043, "num_input_tokens_seen": 117620528, "step": 54505 }, { "epoch": 8.892332789559543, "grad_norm": 0.07005354762077332, "learning_rate": 1.8460880444465367e-06, "loss": 0.0048, "num_input_tokens_seen": 117631984, "step": 54510 }, { "epoch": 8.893148450244698, "grad_norm": 0.034182339906692505, "learning_rate": 1.843404510928487e-06, "loss": 0.0038, "num_input_tokens_seen": 117642800, "step": 54515 }, { "epoch": 8.893964110929852, "grad_norm": 0.1602908968925476, "learning_rate": 1.8407228546015648e-06, "loss": 0.0039, "num_input_tokens_seen": 117654544, "step": 54520 }, { "epoch": 8.894779771615008, "grad_norm": 0.08445174992084503, "learning_rate": 1.8380430756831574e-06, "loss": 0.0041, "num_input_tokens_seen": 117664688, "step": 54525 }, { "epoch": 8.895595432300164, "grad_norm": 0.027813170105218887, "learning_rate": 1.835365174390502e-06, "loss": 0.0033, "num_input_tokens_seen": 117675664, "step": 54530 }, { "epoch": 8.896411092985318, "grad_norm": 0.15260443091392517, "learning_rate": 1.8326891509406808e-06, "loss": 0.0044, "num_input_tokens_seen": 117685744, "step": 54535 }, { "epoch": 8.897226753670473, "grad_norm": 0.1360047608613968, "learning_rate": 1.8300150055506254e-06, "loss": 0.0072, "num_input_tokens_seen": 117697104, "step": 54540 }, { "epoch": 8.898042414355627, "grad_norm": 3.5753955841064453, "learning_rate": 1.8273427384371127e-06, "loss": 0.0972, "num_input_tokens_seen": 117708272, "step": 54545 }, { "epoch": 8.898858075040783, "grad_norm": 2.59792423248291, "learning_rate": 1.8246723498167806e-06, "loss": 0.0975, "num_input_tokens_seen": 117718832, "step": 54550 }, { "epoch": 8.899673735725939, "grad_norm": 0.17426784336566925, "learning_rate": 1.8220038399060973e-06, "loss": 0.1208, "num_input_tokens_seen": 117729744, "step": 54555 }, { "epoch": 8.900489396411093, "grad_norm": 11.262162208557129, "learning_rate": 1.8193372089213872e-06, "loss": 0.1129, "num_input_tokens_seen": 117741104, "step": 54560 }, { "epoch": 8.901305057096248, "grad_norm": 0.047902703285217285, "learning_rate": 1.816672457078819e-06, "loss": 0.0027, "num_input_tokens_seen": 117751280, "step": 54565 }, { "epoch": 8.902120717781402, "grad_norm": 0.10405686497688293, "learning_rate": 1.8140095845944117e-06, "loss": 0.004, "num_input_tokens_seen": 117761200, "step": 54570 }, { "epoch": 8.902936378466558, "grad_norm": 0.1675781011581421, "learning_rate": 1.811348591684031e-06, "loss": 0.098, "num_input_tokens_seen": 117771536, "step": 54575 }, { "epoch": 8.903752039151712, "grad_norm": 0.1404532641172409, "learning_rate": 1.8086894785633969e-06, "loss": 0.0085, "num_input_tokens_seen": 117781808, "step": 54580 }, { "epoch": 8.904567699836868, "grad_norm": 0.041269417852163315, "learning_rate": 1.806032245448061e-06, "loss": 0.1004, "num_input_tokens_seen": 117793296, "step": 54585 }, { "epoch": 8.905383360522023, "grad_norm": 0.03334139287471771, "learning_rate": 1.8033768925534378e-06, "loss": 0.2275, "num_input_tokens_seen": 117803568, "step": 54590 }, { "epoch": 8.906199021207177, "grad_norm": 0.09672871977090836, "learning_rate": 1.8007234200947826e-06, "loss": 0.1079, "num_input_tokens_seen": 117813648, "step": 54595 }, { "epoch": 8.907014681892333, "grad_norm": 0.1703348308801651, "learning_rate": 1.7980718282871982e-06, "loss": 0.0073, "num_input_tokens_seen": 117824816, "step": 54600 }, { "epoch": 8.907830342577487, "grad_norm": 0.07169262319803238, "learning_rate": 1.7954221173456382e-06, "loss": 0.2201, "num_input_tokens_seen": 117835312, "step": 54605 }, { "epoch": 8.908646003262643, "grad_norm": 0.05825048312544823, "learning_rate": 1.7927742874848997e-06, "loss": 0.0052, "num_input_tokens_seen": 117845616, "step": 54610 }, { "epoch": 8.909461663947798, "grad_norm": 0.1675950139760971, "learning_rate": 1.7901283389196278e-06, "loss": 0.0888, "num_input_tokens_seen": 117856656, "step": 54615 }, { "epoch": 8.910277324632952, "grad_norm": 6.598090648651123, "learning_rate": 1.7874842718643203e-06, "loss": 0.4248, "num_input_tokens_seen": 117867056, "step": 54620 }, { "epoch": 8.911092985318108, "grad_norm": 4.078062534332275, "learning_rate": 1.7848420865333172e-06, "loss": 0.3353, "num_input_tokens_seen": 117877552, "step": 54625 }, { "epoch": 8.911908646003262, "grad_norm": 0.1207270622253418, "learning_rate": 1.782201783140805e-06, "loss": 0.087, "num_input_tokens_seen": 117888624, "step": 54630 }, { "epoch": 8.912724306688418, "grad_norm": 4.053979873657227, "learning_rate": 1.779563361900824e-06, "loss": 0.1236, "num_input_tokens_seen": 117897520, "step": 54635 }, { "epoch": 8.913539967373573, "grad_norm": 0.5061556696891785, "learning_rate": 1.7769268230272557e-06, "loss": 0.007, "num_input_tokens_seen": 117907920, "step": 54640 }, { "epoch": 8.914355628058727, "grad_norm": 0.7372855544090271, "learning_rate": 1.774292166733832e-06, "loss": 0.0862, "num_input_tokens_seen": 117919088, "step": 54645 }, { "epoch": 8.915171288743883, "grad_norm": 4.361306667327881, "learning_rate": 1.7716593932341319e-06, "loss": 0.1022, "num_input_tokens_seen": 117928848, "step": 54650 }, { "epoch": 8.915986949429037, "grad_norm": 0.05198827013373375, "learning_rate": 1.7690285027415792e-06, "loss": 0.0897, "num_input_tokens_seen": 117939984, "step": 54655 }, { "epoch": 8.916802610114193, "grad_norm": 0.040379416197538376, "learning_rate": 1.7663994954694508e-06, "loss": 0.0054, "num_input_tokens_seen": 117952784, "step": 54660 }, { "epoch": 8.917618270799348, "grad_norm": 0.07739093899726868, "learning_rate": 1.7637723716308646e-06, "loss": 0.2355, "num_input_tokens_seen": 117963504, "step": 54665 }, { "epoch": 8.918433931484502, "grad_norm": 0.051868923008441925, "learning_rate": 1.7611471314387867e-06, "loss": 0.0803, "num_input_tokens_seen": 117973808, "step": 54670 }, { "epoch": 8.919249592169658, "grad_norm": 0.055537253618240356, "learning_rate": 1.7585237751060357e-06, "loss": 0.0777, "num_input_tokens_seen": 117984464, "step": 54675 }, { "epoch": 8.920065252854812, "grad_norm": 0.21607373654842377, "learning_rate": 1.7559023028452748e-06, "loss": 0.0062, "num_input_tokens_seen": 117994928, "step": 54680 }, { "epoch": 8.920880913539968, "grad_norm": 0.0954468846321106, "learning_rate": 1.7532827148690145e-06, "loss": 0.0046, "num_input_tokens_seen": 118004304, "step": 54685 }, { "epoch": 8.921696574225122, "grad_norm": 0.3734135329723358, "learning_rate": 1.75066501138961e-06, "loss": 0.0083, "num_input_tokens_seen": 118015152, "step": 54690 }, { "epoch": 8.922512234910277, "grad_norm": 0.23452778160572052, "learning_rate": 1.7480491926192638e-06, "loss": 0.0132, "num_input_tokens_seen": 118027344, "step": 54695 }, { "epoch": 8.923327895595433, "grad_norm": 3.485624313354492, "learning_rate": 1.7454352587700284e-06, "loss": 0.1744, "num_input_tokens_seen": 118038672, "step": 54700 }, { "epoch": 8.924143556280587, "grad_norm": 4.423807621002197, "learning_rate": 1.7428232100538067e-06, "loss": 0.0318, "num_input_tokens_seen": 118050576, "step": 54705 }, { "epoch": 8.924959216965743, "grad_norm": 0.05518285557627678, "learning_rate": 1.7402130466823373e-06, "loss": 0.0565, "num_input_tokens_seen": 118061520, "step": 54710 }, { "epoch": 8.925774877650896, "grad_norm": 0.1752781867980957, "learning_rate": 1.7376047688672182e-06, "loss": 0.096, "num_input_tokens_seen": 118072080, "step": 54715 }, { "epoch": 8.926590538336052, "grad_norm": 0.08497949689626694, "learning_rate": 1.734998376819888e-06, "loss": 0.0049, "num_input_tokens_seen": 118082416, "step": 54720 }, { "epoch": 8.927406199021208, "grad_norm": 0.1496218591928482, "learning_rate": 1.732393870751639e-06, "loss": 0.1595, "num_input_tokens_seen": 118092880, "step": 54725 }, { "epoch": 8.928221859706362, "grad_norm": 0.9150623679161072, "learning_rate": 1.729791250873597e-06, "loss": 0.0094, "num_input_tokens_seen": 118105136, "step": 54730 }, { "epoch": 8.929037520391518, "grad_norm": 0.03522399812936783, "learning_rate": 1.7271905173967513e-06, "loss": 0.0405, "num_input_tokens_seen": 118114736, "step": 54735 }, { "epoch": 8.929853181076671, "grad_norm": 0.5155400633811951, "learning_rate": 1.7245916705319276e-06, "loss": 0.0077, "num_input_tokens_seen": 118126064, "step": 54740 }, { "epoch": 8.930668841761827, "grad_norm": 0.08923421800136566, "learning_rate": 1.7219947104897994e-06, "loss": 0.0978, "num_input_tokens_seen": 118137456, "step": 54745 }, { "epoch": 8.931484502446983, "grad_norm": 0.12209942936897278, "learning_rate": 1.7193996374808924e-06, "loss": 0.1175, "num_input_tokens_seen": 118147376, "step": 54750 }, { "epoch": 8.932300163132137, "grad_norm": 0.051897816359996796, "learning_rate": 1.7168064517155747e-06, "loss": 0.0046, "num_input_tokens_seen": 118158096, "step": 54755 }, { "epoch": 8.933115823817293, "grad_norm": 0.07628517597913742, "learning_rate": 1.714215153404064e-06, "loss": 0.0791, "num_input_tokens_seen": 118168688, "step": 54760 }, { "epoch": 8.933931484502446, "grad_norm": 0.0787888765335083, "learning_rate": 1.7116257427564259e-06, "loss": 0.2482, "num_input_tokens_seen": 118179792, "step": 54765 }, { "epoch": 8.934747145187602, "grad_norm": 0.2817718982696533, "learning_rate": 1.7090382199825672e-06, "loss": 0.0041, "num_input_tokens_seen": 118190736, "step": 54770 }, { "epoch": 8.935562805872756, "grad_norm": 0.03979673609137535, "learning_rate": 1.7064525852922424e-06, "loss": 0.0974, "num_input_tokens_seen": 118201584, "step": 54775 }, { "epoch": 8.936378466557912, "grad_norm": 0.12454798817634583, "learning_rate": 1.7038688388950675e-06, "loss": 0.0042, "num_input_tokens_seen": 118212624, "step": 54780 }, { "epoch": 8.937194127243067, "grad_norm": 0.12224704772233963, "learning_rate": 1.7012869810004856e-06, "loss": 0.1946, "num_input_tokens_seen": 118222800, "step": 54785 }, { "epoch": 8.938009787928221, "grad_norm": 0.11173922568559647, "learning_rate": 1.698707011817799e-06, "loss": 0.0836, "num_input_tokens_seen": 118232592, "step": 54790 }, { "epoch": 8.938825448613377, "grad_norm": 0.2910785377025604, "learning_rate": 1.6961289315561512e-06, "loss": 0.0052, "num_input_tokens_seen": 118244464, "step": 54795 }, { "epoch": 8.939641109298531, "grad_norm": 0.5948049426078796, "learning_rate": 1.6935527404245366e-06, "loss": 0.0901, "num_input_tokens_seen": 118253552, "step": 54800 }, { "epoch": 8.940456769983687, "grad_norm": 0.06635866314172745, "learning_rate": 1.6909784386317906e-06, "loss": 0.1438, "num_input_tokens_seen": 118263824, "step": 54805 }, { "epoch": 8.941272430668842, "grad_norm": 4.436750411987305, "learning_rate": 1.688406026386602e-06, "loss": 0.1401, "num_input_tokens_seen": 118275216, "step": 54810 }, { "epoch": 8.942088091353996, "grad_norm": 0.1608649641275406, "learning_rate": 1.6858355038975038e-06, "loss": 0.0058, "num_input_tokens_seen": 118286224, "step": 54815 }, { "epoch": 8.942903752039152, "grad_norm": 0.09187374264001846, "learning_rate": 1.6832668713728711e-06, "loss": 0.1731, "num_input_tokens_seen": 118296240, "step": 54820 }, { "epoch": 8.943719412724306, "grad_norm": 0.043479304760694504, "learning_rate": 1.6807001290209374e-06, "loss": 0.0103, "num_input_tokens_seen": 118305680, "step": 54825 }, { "epoch": 8.944535073409462, "grad_norm": 0.04699716717004776, "learning_rate": 1.6781352770497694e-06, "loss": 0.143, "num_input_tokens_seen": 118316880, "step": 54830 }, { "epoch": 8.945350734094617, "grad_norm": 0.14820754528045654, "learning_rate": 1.6755723156672925e-06, "loss": 0.0888, "num_input_tokens_seen": 118328304, "step": 54835 }, { "epoch": 8.946166394779771, "grad_norm": 0.0861411988735199, "learning_rate": 1.6730112450812685e-06, "loss": 0.0244, "num_input_tokens_seen": 118339728, "step": 54840 }, { "epoch": 8.946982055464927, "grad_norm": 0.14119026064872742, "learning_rate": 1.6704520654993145e-06, "loss": 0.0925, "num_input_tokens_seen": 118349872, "step": 54845 }, { "epoch": 8.947797716150081, "grad_norm": 0.07659178227186203, "learning_rate": 1.6678947771288866e-06, "loss": 0.2229, "num_input_tokens_seen": 118359888, "step": 54850 }, { "epoch": 8.948613376835237, "grad_norm": 0.03260839357972145, "learning_rate": 1.665339380177297e-06, "loss": 0.0052, "num_input_tokens_seen": 118370544, "step": 54855 }, { "epoch": 8.949429037520392, "grad_norm": 0.09648166596889496, "learning_rate": 1.6627858748516912e-06, "loss": 0.0039, "num_input_tokens_seen": 118381584, "step": 54860 }, { "epoch": 8.950244698205546, "grad_norm": 0.11531396955251694, "learning_rate": 1.6602342613590754e-06, "loss": 0.0707, "num_input_tokens_seen": 118392624, "step": 54865 }, { "epoch": 8.951060358890702, "grad_norm": 0.04172559082508087, "learning_rate": 1.6576845399062985e-06, "loss": 0.025, "num_input_tokens_seen": 118402768, "step": 54870 }, { "epoch": 8.951876019575856, "grad_norm": 16.108680725097656, "learning_rate": 1.6551367107000503e-06, "loss": 0.0799, "num_input_tokens_seen": 118413584, "step": 54875 }, { "epoch": 8.952691680261012, "grad_norm": 0.14586836099624634, "learning_rate": 1.6525907739468689e-06, "loss": 0.1229, "num_input_tokens_seen": 118424400, "step": 54880 }, { "epoch": 8.953507340946166, "grad_norm": 0.0429881252348423, "learning_rate": 1.6500467298531414e-06, "loss": 0.0031, "num_input_tokens_seen": 118435696, "step": 54885 }, { "epoch": 8.954323001631321, "grad_norm": 0.16560575366020203, "learning_rate": 1.6475045786251059e-06, "loss": 0.2786, "num_input_tokens_seen": 118446544, "step": 54890 }, { "epoch": 8.955138662316477, "grad_norm": 3.961792469024658, "learning_rate": 1.6449643204688364e-06, "loss": 0.1506, "num_input_tokens_seen": 118457616, "step": 54895 }, { "epoch": 8.955954323001631, "grad_norm": 0.10775754600763321, "learning_rate": 1.6424259555902627e-06, "loss": 0.1212, "num_input_tokens_seen": 118468816, "step": 54900 }, { "epoch": 8.956769983686787, "grad_norm": 0.05108390003442764, "learning_rate": 1.6398894841951561e-06, "loss": 0.1589, "num_input_tokens_seen": 118480176, "step": 54905 }, { "epoch": 8.95758564437194, "grad_norm": 0.1063050851225853, "learning_rate": 1.6373549064891358e-06, "loss": 0.0035, "num_input_tokens_seen": 118491024, "step": 54910 }, { "epoch": 8.958401305057096, "grad_norm": 0.10395338386297226, "learning_rate": 1.6348222226776705e-06, "loss": 0.0056, "num_input_tokens_seen": 118502096, "step": 54915 }, { "epoch": 8.959216965742252, "grad_norm": 0.2134932279586792, "learning_rate": 1.6322914329660655e-06, "loss": 0.0057, "num_input_tokens_seen": 118512464, "step": 54920 }, { "epoch": 8.960032626427406, "grad_norm": 0.059857435524463654, "learning_rate": 1.629762537559487e-06, "loss": 0.1359, "num_input_tokens_seen": 118523728, "step": 54925 }, { "epoch": 8.960848287112562, "grad_norm": 0.10444968938827515, "learning_rate": 1.6272355366629327e-06, "loss": 0.1108, "num_input_tokens_seen": 118533680, "step": 54930 }, { "epoch": 8.961663947797716, "grad_norm": 0.09019356220960617, "learning_rate": 1.6247104304812604e-06, "loss": 0.1299, "num_input_tokens_seen": 118544176, "step": 54935 }, { "epoch": 8.962479608482871, "grad_norm": 0.06750141829252243, "learning_rate": 1.6221872192191651e-06, "loss": 0.0037, "num_input_tokens_seen": 118554384, "step": 54940 }, { "epoch": 8.963295269168025, "grad_norm": 4.921962261199951, "learning_rate": 1.6196659030811938e-06, "loss": 0.133, "num_input_tokens_seen": 118566064, "step": 54945 }, { "epoch": 8.964110929853181, "grad_norm": 0.10307639092206955, "learning_rate": 1.6171464822717337e-06, "loss": 0.1893, "num_input_tokens_seen": 118575952, "step": 54950 }, { "epoch": 8.964926590538337, "grad_norm": 0.07564139366149902, "learning_rate": 1.6146289569950208e-06, "loss": 0.0843, "num_input_tokens_seen": 118587408, "step": 54955 }, { "epoch": 8.96574225122349, "grad_norm": 0.030069146305322647, "learning_rate": 1.612113327455142e-06, "loss": 0.2027, "num_input_tokens_seen": 118598320, "step": 54960 }, { "epoch": 8.966557911908646, "grad_norm": 0.837108314037323, "learning_rate": 1.6095995938560288e-06, "loss": 0.1128, "num_input_tokens_seen": 118609488, "step": 54965 }, { "epoch": 8.9673735725938, "grad_norm": 0.14493216574192047, "learning_rate": 1.6070877564014514e-06, "loss": 0.3061, "num_input_tokens_seen": 118621040, "step": 54970 }, { "epoch": 8.968189233278956, "grad_norm": 0.35746636986732483, "learning_rate": 1.6045778152950357e-06, "loss": 0.0958, "num_input_tokens_seen": 118632720, "step": 54975 }, { "epoch": 8.969004893964112, "grad_norm": 0.06156148388981819, "learning_rate": 1.6020697707402472e-06, "loss": 0.0043, "num_input_tokens_seen": 118643024, "step": 54980 }, { "epoch": 8.969820554649266, "grad_norm": 8.342617988586426, "learning_rate": 1.599563622940406e-06, "loss": 0.2614, "num_input_tokens_seen": 118653712, "step": 54985 }, { "epoch": 8.970636215334421, "grad_norm": 0.11663028597831726, "learning_rate": 1.5970593720986666e-06, "loss": 0.3216, "num_input_tokens_seen": 118664400, "step": 54990 }, { "epoch": 8.971451876019575, "grad_norm": 0.11002062261104584, "learning_rate": 1.5945570184180413e-06, "loss": 0.1151, "num_input_tokens_seen": 118675504, "step": 54995 }, { "epoch": 8.97226753670473, "grad_norm": 0.08915618062019348, "learning_rate": 1.5920565621013822e-06, "loss": 0.0066, "num_input_tokens_seen": 118686192, "step": 55000 }, { "epoch": 8.973083197389887, "grad_norm": 0.06287045776844025, "learning_rate": 1.5895580033513908e-06, "loss": 0.0047, "num_input_tokens_seen": 118697776, "step": 55005 }, { "epoch": 8.97389885807504, "grad_norm": 0.08433584868907928, "learning_rate": 1.5870613423706103e-06, "loss": 0.0036, "num_input_tokens_seen": 118708720, "step": 55010 }, { "epoch": 8.974714518760196, "grad_norm": 0.4887405037879944, "learning_rate": 1.5845665793614322e-06, "loss": 0.0047, "num_input_tokens_seen": 118720112, "step": 55015 }, { "epoch": 8.97553017944535, "grad_norm": 3.545954465866089, "learning_rate": 1.5820737145260972e-06, "loss": 0.3707, "num_input_tokens_seen": 118731184, "step": 55020 }, { "epoch": 8.976345840130506, "grad_norm": 0.12726637721061707, "learning_rate": 1.5795827480666852e-06, "loss": 0.1355, "num_input_tokens_seen": 118741392, "step": 55025 }, { "epoch": 8.977161500815662, "grad_norm": 5.345801830291748, "learning_rate": 1.5770936801851322e-06, "loss": 0.0954, "num_input_tokens_seen": 118751984, "step": 55030 }, { "epoch": 8.977977161500815, "grad_norm": 0.9549118280410767, "learning_rate": 1.5746065110832124e-06, "loss": 0.0093, "num_input_tokens_seen": 118763600, "step": 55035 }, { "epoch": 8.978792822185971, "grad_norm": 0.08882303535938263, "learning_rate": 1.5721212409625485e-06, "loss": 0.0072, "num_input_tokens_seen": 118774512, "step": 55040 }, { "epoch": 8.979608482871125, "grad_norm": 0.07915233075618744, "learning_rate": 1.5696378700246094e-06, "loss": 0.0044, "num_input_tokens_seen": 118785072, "step": 55045 }, { "epoch": 8.98042414355628, "grad_norm": 0.07184240221977234, "learning_rate": 1.5671563984707066e-06, "loss": 0.0048, "num_input_tokens_seen": 118795984, "step": 55050 }, { "epoch": 8.981239804241435, "grad_norm": 0.030804220587015152, "learning_rate": 1.564676826502004e-06, "loss": 0.0051, "num_input_tokens_seen": 118804656, "step": 55055 }, { "epoch": 8.98205546492659, "grad_norm": 0.8717603087425232, "learning_rate": 1.5621991543195103e-06, "loss": 0.0045, "num_input_tokens_seen": 118814224, "step": 55060 }, { "epoch": 8.982871125611746, "grad_norm": 0.03816085681319237, "learning_rate": 1.5597233821240732e-06, "loss": 0.0068, "num_input_tokens_seen": 118825552, "step": 55065 }, { "epoch": 8.9836867862969, "grad_norm": 0.06686482578516006, "learning_rate": 1.557249510116393e-06, "loss": 0.1324, "num_input_tokens_seen": 118836752, "step": 55070 }, { "epoch": 8.984502446982056, "grad_norm": 0.04887497425079346, "learning_rate": 1.5547775384970154e-06, "loss": 0.0054, "num_input_tokens_seen": 118846160, "step": 55075 }, { "epoch": 8.98531810766721, "grad_norm": 0.04898509755730629, "learning_rate": 1.5523074674663296e-06, "loss": 0.0032, "num_input_tokens_seen": 118857168, "step": 55080 }, { "epoch": 8.986133768352365, "grad_norm": 0.08052734285593033, "learning_rate": 1.549839297224573e-06, "loss": 0.0037, "num_input_tokens_seen": 118868432, "step": 55085 }, { "epoch": 8.986949429037521, "grad_norm": 0.03552599251270294, "learning_rate": 1.5473730279718296e-06, "loss": 0.014, "num_input_tokens_seen": 118879120, "step": 55090 }, { "epoch": 8.987765089722675, "grad_norm": 0.027884574607014656, "learning_rate": 1.5449086599080204e-06, "loss": 0.0991, "num_input_tokens_seen": 118889040, "step": 55095 }, { "epoch": 8.98858075040783, "grad_norm": 0.027779938653111458, "learning_rate": 1.5424461932329298e-06, "loss": 0.0041, "num_input_tokens_seen": 118898608, "step": 55100 }, { "epoch": 8.989396411092985, "grad_norm": 0.09449734538793564, "learning_rate": 1.5399856281461734e-06, "loss": 0.003, "num_input_tokens_seen": 118909264, "step": 55105 }, { "epoch": 8.99021207177814, "grad_norm": 0.1730055809020996, "learning_rate": 1.5375269648472162e-06, "loss": 0.1187, "num_input_tokens_seen": 118920048, "step": 55110 }, { "epoch": 8.991027732463294, "grad_norm": 0.19182473421096802, "learning_rate": 1.5350702035353716e-06, "loss": 0.0072, "num_input_tokens_seen": 118931120, "step": 55115 }, { "epoch": 8.99184339314845, "grad_norm": 0.4815743565559387, "learning_rate": 1.5326153444097934e-06, "loss": 0.0793, "num_input_tokens_seen": 118940464, "step": 55120 }, { "epoch": 8.992659053833606, "grad_norm": 0.07847863435745239, "learning_rate": 1.5301623876694898e-06, "loss": 0.1133, "num_input_tokens_seen": 118951120, "step": 55125 }, { "epoch": 8.99347471451876, "grad_norm": 2.0929036140441895, "learning_rate": 1.5277113335133097e-06, "loss": 0.1297, "num_input_tokens_seen": 118961616, "step": 55130 }, { "epoch": 8.994290375203915, "grad_norm": 0.070279061794281, "learning_rate": 1.5252621821399443e-06, "loss": 0.1001, "num_input_tokens_seen": 118973296, "step": 55135 }, { "epoch": 8.99510603588907, "grad_norm": 38.61929702758789, "learning_rate": 1.5228149337479347e-06, "loss": 0.0552, "num_input_tokens_seen": 118983344, "step": 55140 }, { "epoch": 8.995921696574225, "grad_norm": 0.033813487738370895, "learning_rate": 1.5203695885356694e-06, "loss": 0.0651, "num_input_tokens_seen": 118994544, "step": 55145 }, { "epoch": 8.99673735725938, "grad_norm": 0.29596346616744995, "learning_rate": 1.5179261467013817e-06, "loss": 0.0592, "num_input_tokens_seen": 119005264, "step": 55150 }, { "epoch": 8.997553017944535, "grad_norm": 4.728759765625, "learning_rate": 1.5154846084431463e-06, "loss": 0.2518, "num_input_tokens_seen": 119016688, "step": 55155 }, { "epoch": 8.99836867862969, "grad_norm": 5.094381809234619, "learning_rate": 1.513044973958888e-06, "loss": 0.15, "num_input_tokens_seen": 119027856, "step": 55160 }, { "epoch": 8.999184339314844, "grad_norm": 4.91703987121582, "learning_rate": 1.5106072434463742e-06, "loss": 0.1511, "num_input_tokens_seen": 119038000, "step": 55165 }, { "epoch": 9.0, "grad_norm": 0.021419517695903778, "learning_rate": 1.5081714171032186e-06, "loss": 0.0036, "num_input_tokens_seen": 119047920, "step": 55170 }, { "epoch": 9.0, "eval_loss": 0.19297294318675995, "eval_runtime": 568.4632, "eval_samples_per_second": 4.794, "eval_steps_per_second": 1.2, "num_input_tokens_seen": 119047920, "step": 55170 }, { "epoch": 9.000815660685156, "grad_norm": 7.182825088500977, "learning_rate": 1.5057374951268883e-06, "loss": 0.1803, "num_input_tokens_seen": 119058096, "step": 55175 }, { "epoch": 9.00163132137031, "grad_norm": 3.8748457431793213, "learning_rate": 1.5033054777146838e-06, "loss": 0.128, "num_input_tokens_seen": 119068080, "step": 55180 }, { "epoch": 9.002446982055465, "grad_norm": 4.433473587036133, "learning_rate": 1.5008753650637585e-06, "loss": 0.2174, "num_input_tokens_seen": 119078576, "step": 55185 }, { "epoch": 9.00326264274062, "grad_norm": 0.3918544352054596, "learning_rate": 1.4984471573711105e-06, "loss": 0.1319, "num_input_tokens_seen": 119089808, "step": 55190 }, { "epoch": 9.004078303425775, "grad_norm": 0.045883674174547195, "learning_rate": 1.4960208548335825e-06, "loss": 0.203, "num_input_tokens_seen": 119100208, "step": 55195 }, { "epoch": 9.00489396411093, "grad_norm": 0.216802179813385, "learning_rate": 1.4935964576478584e-06, "loss": 0.0988, "num_input_tokens_seen": 119111632, "step": 55200 }, { "epoch": 9.005709624796085, "grad_norm": 0.16552449762821198, "learning_rate": 1.4911739660104785e-06, "loss": 0.0147, "num_input_tokens_seen": 119122608, "step": 55205 }, { "epoch": 9.00652528548124, "grad_norm": 0.10056468844413757, "learning_rate": 1.4887533801178188e-06, "loss": 0.0041, "num_input_tokens_seen": 119132976, "step": 55210 }, { "epoch": 9.007340946166394, "grad_norm": 2.736128568649292, "learning_rate": 1.486334700166106e-06, "loss": 0.1052, "num_input_tokens_seen": 119143984, "step": 55215 }, { "epoch": 9.00815660685155, "grad_norm": 0.6527325510978699, "learning_rate": 1.483917926351408e-06, "loss": 0.0064, "num_input_tokens_seen": 119154256, "step": 55220 }, { "epoch": 9.008972267536704, "grad_norm": 0.30277392268180847, "learning_rate": 1.4815030588696432e-06, "loss": 0.0959, "num_input_tokens_seen": 119165712, "step": 55225 }, { "epoch": 9.00978792822186, "grad_norm": 0.04763905704021454, "learning_rate": 1.4790900979165717e-06, "loss": 0.0039, "num_input_tokens_seen": 119176496, "step": 55230 }, { "epoch": 9.010603588907015, "grad_norm": 0.08349156379699707, "learning_rate": 1.4766790436878035e-06, "loss": 0.1009, "num_input_tokens_seen": 119186640, "step": 55235 }, { "epoch": 9.01141924959217, "grad_norm": 5.5012922286987305, "learning_rate": 1.4742698963787854e-06, "loss": 0.2143, "num_input_tokens_seen": 119197680, "step": 55240 }, { "epoch": 9.012234910277325, "grad_norm": 0.1282111555337906, "learning_rate": 1.4718626561848193e-06, "loss": 0.0064, "num_input_tokens_seen": 119208080, "step": 55245 }, { "epoch": 9.013050570962479, "grad_norm": 4.5648908615112305, "learning_rate": 1.469457323301046e-06, "loss": 0.134, "num_input_tokens_seen": 119218896, "step": 55250 }, { "epoch": 9.013866231647635, "grad_norm": 4.804664134979248, "learning_rate": 1.4670538979224547e-06, "loss": 0.4, "num_input_tokens_seen": 119230032, "step": 55255 }, { "epoch": 9.01468189233279, "grad_norm": 0.05063999816775322, "learning_rate": 1.4646523802438805e-06, "loss": 0.011, "num_input_tokens_seen": 119240112, "step": 55260 }, { "epoch": 9.015497553017944, "grad_norm": 0.05710785835981369, "learning_rate": 1.4622527704599986e-06, "loss": 0.3593, "num_input_tokens_seen": 119251184, "step": 55265 }, { "epoch": 9.0163132137031, "grad_norm": 0.09515246003866196, "learning_rate": 1.4598550687653394e-06, "loss": 0.0042, "num_input_tokens_seen": 119263216, "step": 55270 }, { "epoch": 9.017128874388254, "grad_norm": 0.18920369446277618, "learning_rate": 1.4574592753542698e-06, "loss": 0.0086, "num_input_tokens_seen": 119272112, "step": 55275 }, { "epoch": 9.01794453507341, "grad_norm": 0.7133920192718506, "learning_rate": 1.4550653904210038e-06, "loss": 0.1259, "num_input_tokens_seen": 119282352, "step": 55280 }, { "epoch": 9.018760195758565, "grad_norm": 0.07079581916332245, "learning_rate": 1.4526734141596026e-06, "loss": 0.1497, "num_input_tokens_seen": 119293488, "step": 55285 }, { "epoch": 9.01957585644372, "grad_norm": 10.122812271118164, "learning_rate": 1.4502833467639725e-06, "loss": 0.1082, "num_input_tokens_seen": 119304592, "step": 55290 }, { "epoch": 9.020391517128875, "grad_norm": 8.083893775939941, "learning_rate": 1.4478951884278669e-06, "loss": 0.1842, "num_input_tokens_seen": 119315088, "step": 55295 }, { "epoch": 9.021207177814029, "grad_norm": 3.10662579536438, "learning_rate": 1.4455089393448778e-06, "loss": 0.1553, "num_input_tokens_seen": 119326416, "step": 55300 }, { "epoch": 9.022022838499185, "grad_norm": 0.252075731754303, "learning_rate": 1.4431245997084425e-06, "loss": 0.006, "num_input_tokens_seen": 119337296, "step": 55305 }, { "epoch": 9.022838499184338, "grad_norm": 0.2267480194568634, "learning_rate": 1.4407421697118617e-06, "loss": 0.0845, "num_input_tokens_seen": 119347824, "step": 55310 }, { "epoch": 9.023654159869494, "grad_norm": 0.1325957477092743, "learning_rate": 1.438361649548256e-06, "loss": 0.0036, "num_input_tokens_seen": 119359216, "step": 55315 }, { "epoch": 9.02446982055465, "grad_norm": 0.06241501495242119, "learning_rate": 1.4359830394106071e-06, "loss": 0.1319, "num_input_tokens_seen": 119370800, "step": 55320 }, { "epoch": 9.025285481239804, "grad_norm": 3.341966152191162, "learning_rate": 1.4336063394917333e-06, "loss": 0.2197, "num_input_tokens_seen": 119380944, "step": 55325 }, { "epoch": 9.02610114192496, "grad_norm": 0.05169983580708504, "learning_rate": 1.4312315499843077e-06, "loss": 0.101, "num_input_tokens_seen": 119390608, "step": 55330 }, { "epoch": 9.026916802610113, "grad_norm": 0.06638845056295395, "learning_rate": 1.428858671080835e-06, "loss": 0.007, "num_input_tokens_seen": 119401328, "step": 55335 }, { "epoch": 9.02773246329527, "grad_norm": 0.13590192794799805, "learning_rate": 1.4264877029736778e-06, "loss": 0.0052, "num_input_tokens_seen": 119411472, "step": 55340 }, { "epoch": 9.028548123980425, "grad_norm": 0.03334615379571915, "learning_rate": 1.424118645855041e-06, "loss": 0.0096, "num_input_tokens_seen": 119423152, "step": 55345 }, { "epoch": 9.029363784665579, "grad_norm": 0.11990103870630264, "learning_rate": 1.4217514999169678e-06, "loss": 0.0053, "num_input_tokens_seen": 119434992, "step": 55350 }, { "epoch": 9.030179445350734, "grad_norm": 0.11351681500673294, "learning_rate": 1.4193862653513524e-06, "loss": 0.2898, "num_input_tokens_seen": 119445520, "step": 55355 }, { "epoch": 9.030995106035888, "grad_norm": 0.11845037341117859, "learning_rate": 1.4170229423499353e-06, "loss": 0.0168, "num_input_tokens_seen": 119456080, "step": 55360 }, { "epoch": 9.031810766721044, "grad_norm": 0.11720054596662521, "learning_rate": 1.4146615311042972e-06, "loss": 0.0038, "num_input_tokens_seen": 119467504, "step": 55365 }, { "epoch": 9.0326264274062, "grad_norm": 2.3431005477905273, "learning_rate": 1.4123020318058649e-06, "loss": 0.2547, "num_input_tokens_seen": 119478896, "step": 55370 }, { "epoch": 9.033442088091354, "grad_norm": 0.11629412323236465, "learning_rate": 1.4099444446459138e-06, "loss": 0.1221, "num_input_tokens_seen": 119489840, "step": 55375 }, { "epoch": 9.03425774877651, "grad_norm": 0.06350281089544296, "learning_rate": 1.4075887698155599e-06, "loss": 0.0078, "num_input_tokens_seen": 119499472, "step": 55380 }, { "epoch": 9.035073409461663, "grad_norm": 0.09608178585767746, "learning_rate": 1.4052350075057673e-06, "loss": 0.1319, "num_input_tokens_seen": 119510768, "step": 55385 }, { "epoch": 9.035889070146819, "grad_norm": 0.07565252482891083, "learning_rate": 1.4028831579073448e-06, "loss": 0.1231, "num_input_tokens_seen": 119521968, "step": 55390 }, { "epoch": 9.036704730831975, "grad_norm": 0.009342601522803307, "learning_rate": 1.4005332212109424e-06, "loss": 0.1157, "num_input_tokens_seen": 119532912, "step": 55395 }, { "epoch": 9.037520391517129, "grad_norm": 0.1403772234916687, "learning_rate": 1.3981851976070603e-06, "loss": 0.004, "num_input_tokens_seen": 119544208, "step": 55400 }, { "epoch": 9.038336052202284, "grad_norm": 0.0709967240691185, "learning_rate": 1.395839087286041e-06, "loss": 0.0075, "num_input_tokens_seen": 119554960, "step": 55405 }, { "epoch": 9.039151712887438, "grad_norm": 0.05165521055459976, "learning_rate": 1.3934948904380712e-06, "loss": 0.1353, "num_input_tokens_seen": 119566224, "step": 55410 }, { "epoch": 9.039967373572594, "grad_norm": 0.09342238306999207, "learning_rate": 1.3911526072531795e-06, "loss": 0.0979, "num_input_tokens_seen": 119577424, "step": 55415 }, { "epoch": 9.040783034257748, "grad_norm": 0.1905154436826706, "learning_rate": 1.3888122379212527e-06, "loss": 0.1214, "num_input_tokens_seen": 119588752, "step": 55420 }, { "epoch": 9.041598694942904, "grad_norm": 0.0606345497071743, "learning_rate": 1.3864737826320058e-06, "loss": 0.0054, "num_input_tokens_seen": 119600048, "step": 55425 }, { "epoch": 9.04241435562806, "grad_norm": 18.631378173828125, "learning_rate": 1.3841372415750093e-06, "loss": 0.2015, "num_input_tokens_seen": 119608944, "step": 55430 }, { "epoch": 9.043230016313213, "grad_norm": 0.08189153671264648, "learning_rate": 1.381802614939673e-06, "loss": 0.0864, "num_input_tokens_seen": 119618512, "step": 55435 }, { "epoch": 9.044045676998369, "grad_norm": 0.09821552783250809, "learning_rate": 1.3794699029152563e-06, "loss": 0.0163, "num_input_tokens_seen": 119629040, "step": 55440 }, { "epoch": 9.044861337683523, "grad_norm": 0.04215510934591293, "learning_rate": 1.3771391056908555e-06, "loss": 0.0062, "num_input_tokens_seen": 119639248, "step": 55445 }, { "epoch": 9.045676998368679, "grad_norm": 0.07656820118427277, "learning_rate": 1.3748102234554222e-06, "loss": 0.0091, "num_input_tokens_seen": 119648848, "step": 55450 }, { "epoch": 9.046492659053834, "grad_norm": 1.7920234203338623, "learning_rate": 1.372483256397744e-06, "loss": 0.0094, "num_input_tokens_seen": 119660624, "step": 55455 }, { "epoch": 9.047308319738988, "grad_norm": 2.013300895690918, "learning_rate": 1.3701582047064592e-06, "loss": 0.0095, "num_input_tokens_seen": 119671120, "step": 55460 }, { "epoch": 9.048123980424144, "grad_norm": 0.06278388947248459, "learning_rate": 1.3678350685700447e-06, "loss": 0.2559, "num_input_tokens_seen": 119680720, "step": 55465 }, { "epoch": 9.048939641109298, "grad_norm": 0.18598142266273499, "learning_rate": 1.3655138481768303e-06, "loss": 0.079, "num_input_tokens_seen": 119690896, "step": 55470 }, { "epoch": 9.049755301794454, "grad_norm": 0.1875404268503189, "learning_rate": 1.3631945437149823e-06, "loss": 0.0065, "num_input_tokens_seen": 119699824, "step": 55475 }, { "epoch": 9.05057096247961, "grad_norm": 0.9875158667564392, "learning_rate": 1.3608771553725168e-06, "loss": 0.1063, "num_input_tokens_seen": 119709552, "step": 55480 }, { "epoch": 9.051386623164763, "grad_norm": 0.09905755519866943, "learning_rate": 1.3585616833372894e-06, "loss": 0.0125, "num_input_tokens_seen": 119721136, "step": 55485 }, { "epoch": 9.052202283849919, "grad_norm": 0.09456333518028259, "learning_rate": 1.3562481277970108e-06, "loss": 0.1033, "num_input_tokens_seen": 119731408, "step": 55490 }, { "epoch": 9.053017944535073, "grad_norm": 0.03764314204454422, "learning_rate": 1.3539364889392281e-06, "loss": 0.2635, "num_input_tokens_seen": 119743728, "step": 55495 }, { "epoch": 9.053833605220229, "grad_norm": 0.17601527273654938, "learning_rate": 1.3516267669513305e-06, "loss": 0.0888, "num_input_tokens_seen": 119754992, "step": 55500 }, { "epoch": 9.054649265905383, "grad_norm": 0.10600312799215317, "learning_rate": 1.3493189620205572e-06, "loss": 0.0077, "num_input_tokens_seen": 119765936, "step": 55505 }, { "epoch": 9.055464926590538, "grad_norm": 0.029680950567126274, "learning_rate": 1.3470130743339914e-06, "loss": 0.0018, "num_input_tokens_seen": 119776688, "step": 55510 }, { "epoch": 9.056280587275694, "grad_norm": 2.44942569732666, "learning_rate": 1.3447091040785619e-06, "loss": 0.0859, "num_input_tokens_seen": 119787088, "step": 55515 }, { "epoch": 9.057096247960848, "grad_norm": 0.09405805170536041, "learning_rate": 1.342407051441033e-06, "loss": 0.0126, "num_input_tokens_seen": 119798576, "step": 55520 }, { "epoch": 9.057911908646004, "grad_norm": 3.5197908878326416, "learning_rate": 1.3401069166080278e-06, "loss": 0.082, "num_input_tokens_seen": 119809424, "step": 55525 }, { "epoch": 9.058727569331158, "grad_norm": 0.5775467753410339, "learning_rate": 1.3378086997660077e-06, "loss": 0.0104, "num_input_tokens_seen": 119821232, "step": 55530 }, { "epoch": 9.059543230016313, "grad_norm": 0.0336451530456543, "learning_rate": 1.3355124011012744e-06, "loss": 0.1223, "num_input_tokens_seen": 119832688, "step": 55535 }, { "epoch": 9.060358890701469, "grad_norm": 23.774309158325195, "learning_rate": 1.3332180207999783e-06, "loss": 0.2499, "num_input_tokens_seen": 119842736, "step": 55540 }, { "epoch": 9.061174551386623, "grad_norm": 0.11912717670202255, "learning_rate": 1.3309255590481129e-06, "loss": 0.011, "num_input_tokens_seen": 119853808, "step": 55545 }, { "epoch": 9.061990212071779, "grad_norm": 0.08176933974027634, "learning_rate": 1.3286350160315181e-06, "loss": 0.2745, "num_input_tokens_seen": 119864976, "step": 55550 }, { "epoch": 9.062805872756933, "grad_norm": 0.09938140958547592, "learning_rate": 1.3263463919358759e-06, "loss": 0.0142, "num_input_tokens_seen": 119875312, "step": 55555 }, { "epoch": 9.063621533442088, "grad_norm": 0.1812557876110077, "learning_rate": 1.3240596869467158e-06, "loss": 0.08, "num_input_tokens_seen": 119886704, "step": 55560 }, { "epoch": 9.064437194127244, "grad_norm": 0.31800466775894165, "learning_rate": 1.3217749012494062e-06, "loss": 0.0048, "num_input_tokens_seen": 119897424, "step": 55565 }, { "epoch": 9.065252854812398, "grad_norm": 18.320484161376953, "learning_rate": 1.3194920350291657e-06, "loss": 0.0201, "num_input_tokens_seen": 119909520, "step": 55570 }, { "epoch": 9.066068515497554, "grad_norm": 0.03986978158354759, "learning_rate": 1.3172110884710541e-06, "loss": 0.1012, "num_input_tokens_seen": 119919056, "step": 55575 }, { "epoch": 9.066884176182707, "grad_norm": 0.23460790514945984, "learning_rate": 1.314932061759977e-06, "loss": 0.1249, "num_input_tokens_seen": 119930448, "step": 55580 }, { "epoch": 9.067699836867863, "grad_norm": 0.07698375731706619, "learning_rate": 1.3126549550806832e-06, "loss": 0.0044, "num_input_tokens_seen": 119939824, "step": 55585 }, { "epoch": 9.068515497553017, "grad_norm": 0.6116136908531189, "learning_rate": 1.310379768617767e-06, "loss": 0.0445, "num_input_tokens_seen": 119951152, "step": 55590 }, { "epoch": 9.069331158238173, "grad_norm": 8.246176719665527, "learning_rate": 1.308106502555667e-06, "loss": 0.0944, "num_input_tokens_seen": 119961104, "step": 55595 }, { "epoch": 9.070146818923329, "grad_norm": 3.7423102855682373, "learning_rate": 1.3058351570786665e-06, "loss": 0.0187, "num_input_tokens_seen": 119971920, "step": 55600 }, { "epoch": 9.070962479608482, "grad_norm": 0.016244083642959595, "learning_rate": 1.3035657323708927e-06, "loss": 0.1457, "num_input_tokens_seen": 119980944, "step": 55605 }, { "epoch": 9.071778140293638, "grad_norm": 0.15408752858638763, "learning_rate": 1.3012982286163129e-06, "loss": 0.1199, "num_input_tokens_seen": 119991792, "step": 55610 }, { "epoch": 9.072593800978792, "grad_norm": 0.6007954478263855, "learning_rate": 1.2990326459987434e-06, "loss": 0.0087, "num_input_tokens_seen": 120002064, "step": 55615 }, { "epoch": 9.073409461663948, "grad_norm": 0.48222196102142334, "learning_rate": 1.296768984701846e-06, "loss": 0.1087, "num_input_tokens_seen": 120012560, "step": 55620 }, { "epoch": 9.074225122349104, "grad_norm": 0.0767175629734993, "learning_rate": 1.2945072449091212e-06, "loss": 0.0098, "num_input_tokens_seen": 120023568, "step": 55625 }, { "epoch": 9.075040783034257, "grad_norm": 0.09177295118570328, "learning_rate": 1.292247426803922e-06, "loss": 0.2351, "num_input_tokens_seen": 120034448, "step": 55630 }, { "epoch": 9.075856443719413, "grad_norm": 0.23538443446159363, "learning_rate": 1.2899895305694408e-06, "loss": 0.1497, "num_input_tokens_seen": 120046096, "step": 55635 }, { "epoch": 9.076672104404567, "grad_norm": 0.14989183843135834, "learning_rate": 1.2877335563887095e-06, "loss": 0.0033, "num_input_tokens_seen": 120055344, "step": 55640 }, { "epoch": 9.077487765089723, "grad_norm": 0.11194797605276108, "learning_rate": 1.2854795044446116e-06, "loss": 0.0068, "num_input_tokens_seen": 120065808, "step": 55645 }, { "epoch": 9.078303425774878, "grad_norm": 0.3517356514930725, "learning_rate": 1.2832273749198708e-06, "loss": 0.0977, "num_input_tokens_seen": 120075472, "step": 55650 }, { "epoch": 9.079119086460032, "grad_norm": 0.037554457783699036, "learning_rate": 1.2809771679970522e-06, "loss": 0.0675, "num_input_tokens_seen": 120085072, "step": 55655 }, { "epoch": 9.079934747145188, "grad_norm": 0.11872192472219467, "learning_rate": 1.2787288838585793e-06, "loss": 0.1116, "num_input_tokens_seen": 120096304, "step": 55660 }, { "epoch": 9.080750407830342, "grad_norm": 0.09016989171504974, "learning_rate": 1.2764825226867005e-06, "loss": 0.0033, "num_input_tokens_seen": 120105968, "step": 55665 }, { "epoch": 9.081566068515498, "grad_norm": 0.07086058706045151, "learning_rate": 1.2742380846635231e-06, "loss": 0.1139, "num_input_tokens_seen": 120117360, "step": 55670 }, { "epoch": 9.082381729200652, "grad_norm": 0.3153351843357086, "learning_rate": 1.2719955699709907e-06, "loss": 0.1097, "num_input_tokens_seen": 120128464, "step": 55675 }, { "epoch": 9.083197389885807, "grad_norm": 0.13674883544445038, "learning_rate": 1.2697549787908908e-06, "loss": 0.0048, "num_input_tokens_seen": 120139440, "step": 55680 }, { "epoch": 9.084013050570963, "grad_norm": 0.11322315782308578, "learning_rate": 1.267516311304856e-06, "loss": 0.0101, "num_input_tokens_seen": 120149392, "step": 55685 }, { "epoch": 9.084828711256117, "grad_norm": 4.931153774261475, "learning_rate": 1.265279567694369e-06, "loss": 0.359, "num_input_tokens_seen": 120160848, "step": 55690 }, { "epoch": 9.085644371941273, "grad_norm": 0.08744145929813385, "learning_rate": 1.2630447481407486e-06, "loss": 0.0031, "num_input_tokens_seen": 120172368, "step": 55695 }, { "epoch": 9.086460032626427, "grad_norm": 0.0977356880903244, "learning_rate": 1.2608118528251611e-06, "loss": 0.0542, "num_input_tokens_seen": 120183184, "step": 55700 }, { "epoch": 9.087275693311582, "grad_norm": 0.06535228341817856, "learning_rate": 1.2585808819286172e-06, "loss": 0.0083, "num_input_tokens_seen": 120194480, "step": 55705 }, { "epoch": 9.088091353996738, "grad_norm": 0.07237150520086288, "learning_rate": 1.2563518356319664e-06, "loss": 0.0734, "num_input_tokens_seen": 120205616, "step": 55710 }, { "epoch": 9.088907014681892, "grad_norm": 0.06289654970169067, "learning_rate": 1.2541247141159119e-06, "loss": 0.189, "num_input_tokens_seen": 120216496, "step": 55715 }, { "epoch": 9.089722675367048, "grad_norm": 3.9390320777893066, "learning_rate": 1.2518995175609949e-06, "loss": 0.3441, "num_input_tokens_seen": 120226896, "step": 55720 }, { "epoch": 9.090538336052202, "grad_norm": 0.6136237382888794, "learning_rate": 1.2496762461475992e-06, "loss": 0.006, "num_input_tokens_seen": 120236400, "step": 55725 }, { "epoch": 9.091353996737357, "grad_norm": 0.3308112919330597, "learning_rate": 1.2474549000559527e-06, "loss": 0.0056, "num_input_tokens_seen": 120247920, "step": 55730 }, { "epoch": 9.092169657422513, "grad_norm": 0.07058186829090118, "learning_rate": 1.245235479466131e-06, "loss": 0.0082, "num_input_tokens_seen": 120258512, "step": 55735 }, { "epoch": 9.092985318107667, "grad_norm": 21.97412872314453, "learning_rate": 1.2430179845580537e-06, "loss": 0.0963, "num_input_tokens_seen": 120270032, "step": 55740 }, { "epoch": 9.093800978792823, "grad_norm": 0.10977151989936829, "learning_rate": 1.24080241551148e-06, "loss": 0.0042, "num_input_tokens_seen": 120280816, "step": 55745 }, { "epoch": 9.094616639477977, "grad_norm": 0.09765518456697464, "learning_rate": 1.2385887725060135e-06, "loss": 0.1115, "num_input_tokens_seen": 120291664, "step": 55750 }, { "epoch": 9.095432300163132, "grad_norm": 0.8600424528121948, "learning_rate": 1.236377055721108e-06, "loss": 0.0672, "num_input_tokens_seen": 120302320, "step": 55755 }, { "epoch": 9.096247960848286, "grad_norm": 0.07710446417331696, "learning_rate": 1.234167265336053e-06, "loss": 0.0632, "num_input_tokens_seen": 120312688, "step": 55760 }, { "epoch": 9.097063621533442, "grad_norm": 0.11269769817590714, "learning_rate": 1.2319594015299862e-06, "loss": 0.0187, "num_input_tokens_seen": 120322928, "step": 55765 }, { "epoch": 9.097879282218598, "grad_norm": 0.1080789640545845, "learning_rate": 1.2297534644818891e-06, "loss": 0.0774, "num_input_tokens_seen": 120332080, "step": 55770 }, { "epoch": 9.098694942903752, "grad_norm": 0.05265503376722336, "learning_rate": 1.227549454370583e-06, "loss": 0.1184, "num_input_tokens_seen": 120344400, "step": 55775 }, { "epoch": 9.099510603588907, "grad_norm": 0.17311835289001465, "learning_rate": 1.225347371374741e-06, "loss": 0.0172, "num_input_tokens_seen": 120355344, "step": 55780 }, { "epoch": 9.100326264274061, "grad_norm": 0.08054272830486298, "learning_rate": 1.2231472156728707e-06, "loss": 0.007, "num_input_tokens_seen": 120366640, "step": 55785 }, { "epoch": 9.101141924959217, "grad_norm": 0.02693832479417324, "learning_rate": 1.2209489874433294e-06, "loss": 0.0046, "num_input_tokens_seen": 120377584, "step": 55790 }, { "epoch": 9.101957585644373, "grad_norm": 0.06729228049516678, "learning_rate": 1.2187526868643162e-06, "loss": 0.0037, "num_input_tokens_seen": 120388368, "step": 55795 }, { "epoch": 9.102773246329527, "grad_norm": 0.01973656751215458, "learning_rate": 1.2165583141138748e-06, "loss": 0.0045, "num_input_tokens_seen": 120399312, "step": 55800 }, { "epoch": 9.103588907014682, "grad_norm": 3.6598472595214844, "learning_rate": 1.2143658693698933e-06, "loss": 0.2122, "num_input_tokens_seen": 120409616, "step": 55805 }, { "epoch": 9.104404567699836, "grad_norm": 0.15364094078540802, "learning_rate": 1.212175352810102e-06, "loss": 0.1852, "num_input_tokens_seen": 120421424, "step": 55810 }, { "epoch": 9.105220228384992, "grad_norm": 0.4004209041595459, "learning_rate": 1.2099867646120754e-06, "loss": 0.0997, "num_input_tokens_seen": 120431088, "step": 55815 }, { "epoch": 9.106035889070148, "grad_norm": 0.09255687147378922, "learning_rate": 1.20780010495323e-06, "loss": 0.0032, "num_input_tokens_seen": 120442320, "step": 55820 }, { "epoch": 9.106851549755302, "grad_norm": 2.7680530548095703, "learning_rate": 1.2056153740108295e-06, "loss": 0.1717, "num_input_tokens_seen": 120453104, "step": 55825 }, { "epoch": 9.107667210440457, "grad_norm": 0.04503974691033363, "learning_rate": 1.2034325719619794e-06, "loss": 0.0051, "num_input_tokens_seen": 120463536, "step": 55830 }, { "epoch": 9.108482871125611, "grad_norm": 0.22261156141757965, "learning_rate": 1.2012516989836242e-06, "loss": 0.0845, "num_input_tokens_seen": 120474672, "step": 55835 }, { "epoch": 9.109298531810767, "grad_norm": 1.0107585191726685, "learning_rate": 1.1990727552525588e-06, "loss": 0.0063, "num_input_tokens_seen": 120484208, "step": 55840 }, { "epoch": 9.11011419249592, "grad_norm": 6.194400310516357, "learning_rate": 1.196895740945425e-06, "loss": 0.1166, "num_input_tokens_seen": 120494896, "step": 55845 }, { "epoch": 9.110929853181077, "grad_norm": 7.5901947021484375, "learning_rate": 1.194720656238696e-06, "loss": 0.0399, "num_input_tokens_seen": 120505936, "step": 55850 }, { "epoch": 9.111745513866232, "grad_norm": 0.11343776434659958, "learning_rate": 1.1925475013086968e-06, "loss": 0.0034, "num_input_tokens_seen": 120516944, "step": 55855 }, { "epoch": 9.112561174551386, "grad_norm": 0.29412412643432617, "learning_rate": 1.190376276331598e-06, "loss": 0.0055, "num_input_tokens_seen": 120527344, "step": 55860 }, { "epoch": 9.113376835236542, "grad_norm": 0.13387349247932434, "learning_rate": 1.1882069814834057e-06, "loss": 0.0037, "num_input_tokens_seen": 120537520, "step": 55865 }, { "epoch": 9.114192495921696, "grad_norm": 0.1723552793264389, "learning_rate": 1.186039616939974e-06, "loss": 0.1019, "num_input_tokens_seen": 120549072, "step": 55870 }, { "epoch": 9.115008156606851, "grad_norm": 21.81083106994629, "learning_rate": 1.1838741828770039e-06, "loss": 0.055, "num_input_tokens_seen": 120559888, "step": 55875 }, { "epoch": 9.115823817292007, "grad_norm": 0.06051301956176758, "learning_rate": 1.1817106794700327e-06, "loss": 0.1739, "num_input_tokens_seen": 120571504, "step": 55880 }, { "epoch": 9.116639477977161, "grad_norm": 15.309614181518555, "learning_rate": 1.1795491068944453e-06, "loss": 0.0321, "num_input_tokens_seen": 120581456, "step": 55885 }, { "epoch": 9.117455138662317, "grad_norm": 0.07540477812290192, "learning_rate": 1.1773894653254736e-06, "loss": 0.1269, "num_input_tokens_seen": 120592080, "step": 55890 }, { "epoch": 9.11827079934747, "grad_norm": 0.14149914681911469, "learning_rate": 1.1752317549381857e-06, "loss": 0.0064, "num_input_tokens_seen": 120602128, "step": 55895 }, { "epoch": 9.119086460032626, "grad_norm": 2.3565332889556885, "learning_rate": 1.1730759759074978e-06, "loss": 0.0081, "num_input_tokens_seen": 120614352, "step": 55900 }, { "epoch": 9.119902120717782, "grad_norm": 0.06479676812887192, "learning_rate": 1.1709221284081666e-06, "loss": 0.0039, "num_input_tokens_seen": 120625424, "step": 55905 }, { "epoch": 9.120717781402936, "grad_norm": 4.313600063323975, "learning_rate": 1.1687702126147976e-06, "loss": 0.0142, "num_input_tokens_seen": 120635536, "step": 55910 }, { "epoch": 9.121533442088092, "grad_norm": 0.020870855078101158, "learning_rate": 1.1666202287018313e-06, "loss": 0.1353, "num_input_tokens_seen": 120646064, "step": 55915 }, { "epoch": 9.122349102773246, "grad_norm": 0.04624507948756218, "learning_rate": 1.1644721768435617e-06, "loss": 0.0036, "num_input_tokens_seen": 120657072, "step": 55920 }, { "epoch": 9.123164763458401, "grad_norm": 0.15288259088993073, "learning_rate": 1.1623260572141137e-06, "loss": 0.101, "num_input_tokens_seen": 120668816, "step": 55925 }, { "epoch": 9.123980424143557, "grad_norm": 0.2106817364692688, "learning_rate": 1.16018186998747e-06, "loss": 0.0071, "num_input_tokens_seen": 120679248, "step": 55930 }, { "epoch": 9.124796084828711, "grad_norm": 0.14101427793502808, "learning_rate": 1.1580396153374446e-06, "loss": 0.0061, "num_input_tokens_seen": 120690096, "step": 55935 }, { "epoch": 9.125611745513867, "grad_norm": 0.4781815707683563, "learning_rate": 1.1558992934376982e-06, "loss": 0.0057, "num_input_tokens_seen": 120702352, "step": 55940 }, { "epoch": 9.12642740619902, "grad_norm": 0.09064584970474243, "learning_rate": 1.1537609044617398e-06, "loss": 0.1309, "num_input_tokens_seen": 120713072, "step": 55945 }, { "epoch": 9.127243066884176, "grad_norm": 0.033964842557907104, "learning_rate": 1.1516244485829193e-06, "loss": 0.0884, "num_input_tokens_seen": 120725200, "step": 55950 }, { "epoch": 9.12805872756933, "grad_norm": 4.713009834289551, "learning_rate": 1.1494899259744258e-06, "loss": 0.0967, "num_input_tokens_seen": 120735696, "step": 55955 }, { "epoch": 9.128874388254486, "grad_norm": 3.6822562217712402, "learning_rate": 1.147357336809296e-06, "loss": 0.1994, "num_input_tokens_seen": 120747344, "step": 55960 }, { "epoch": 9.129690048939642, "grad_norm": 0.591336190700531, "learning_rate": 1.1452266812604056e-06, "loss": 0.1115, "num_input_tokens_seen": 120758416, "step": 55965 }, { "epoch": 9.130505709624796, "grad_norm": 6.977701187133789, "learning_rate": 1.1430979595004777e-06, "loss": 0.2717, "num_input_tokens_seen": 120769552, "step": 55970 }, { "epoch": 9.131321370309951, "grad_norm": 0.14338502287864685, "learning_rate": 1.1409711717020794e-06, "loss": 0.1445, "num_input_tokens_seen": 120780656, "step": 55975 }, { "epoch": 9.132137030995105, "grad_norm": 0.12120555341243744, "learning_rate": 1.1388463180376175e-06, "loss": 0.0778, "num_input_tokens_seen": 120792048, "step": 55980 }, { "epoch": 9.132952691680261, "grad_norm": 0.11915405839681625, "learning_rate": 1.1367233986793429e-06, "loss": 0.1098, "num_input_tokens_seen": 120802224, "step": 55985 }, { "epoch": 9.133768352365417, "grad_norm": 0.12654858827590942, "learning_rate": 1.1346024137993516e-06, "loss": 0.0612, "num_input_tokens_seen": 120812720, "step": 55990 }, { "epoch": 9.13458401305057, "grad_norm": 10.58918571472168, "learning_rate": 1.1324833635695808e-06, "loss": 0.157, "num_input_tokens_seen": 120824880, "step": 55995 }, { "epoch": 9.135399673735726, "grad_norm": 0.07049047201871872, "learning_rate": 1.13036624816181e-06, "loss": 0.105, "num_input_tokens_seen": 120835632, "step": 56000 }, { "epoch": 9.13621533442088, "grad_norm": 0.06958387047052383, "learning_rate": 1.1282510677476655e-06, "loss": 0.0038, "num_input_tokens_seen": 120847984, "step": 56005 }, { "epoch": 9.137030995106036, "grad_norm": 0.36804723739624023, "learning_rate": 1.126137822498613e-06, "loss": 0.152, "num_input_tokens_seen": 120858832, "step": 56010 }, { "epoch": 9.137846655791192, "grad_norm": 0.10517661273479462, "learning_rate": 1.1240265125859628e-06, "loss": 0.0054, "num_input_tokens_seen": 120869872, "step": 56015 }, { "epoch": 9.138662316476346, "grad_norm": 0.3555206060409546, "learning_rate": 1.1219171381808696e-06, "loss": 0.1546, "num_input_tokens_seen": 120879920, "step": 56020 }, { "epoch": 9.139477977161501, "grad_norm": 0.030262192711234093, "learning_rate": 1.11980969945433e-06, "loss": 0.2091, "num_input_tokens_seen": 120891088, "step": 56025 }, { "epoch": 9.140293637846655, "grad_norm": 3.5948376655578613, "learning_rate": 1.1177041965771823e-06, "loss": 0.0912, "num_input_tokens_seen": 120901968, "step": 56030 }, { "epoch": 9.141109298531811, "grad_norm": 1.0993356704711914, "learning_rate": 1.1156006297201093e-06, "loss": 0.0855, "num_input_tokens_seen": 120912176, "step": 56035 }, { "epoch": 9.141924959216965, "grad_norm": 0.12425486743450165, "learning_rate": 1.1134989990536387e-06, "loss": 0.0052, "num_input_tokens_seen": 120923216, "step": 56040 }, { "epoch": 9.14274061990212, "grad_norm": 0.30227938294410706, "learning_rate": 1.1113993047481369e-06, "loss": 0.1335, "num_input_tokens_seen": 120934448, "step": 56045 }, { "epoch": 9.143556280587276, "grad_norm": 0.09625578671693802, "learning_rate": 1.1093015469738177e-06, "loss": 0.0106, "num_input_tokens_seen": 120944880, "step": 56050 }, { "epoch": 9.14437194127243, "grad_norm": 3.6121058464050293, "learning_rate": 1.107205725900734e-06, "loss": 0.1135, "num_input_tokens_seen": 120956304, "step": 56055 }, { "epoch": 9.145187601957586, "grad_norm": 5.352141857147217, "learning_rate": 1.105111841698786e-06, "loss": 0.073, "num_input_tokens_seen": 120966320, "step": 56060 }, { "epoch": 9.14600326264274, "grad_norm": 2.422102212905884, "learning_rate": 1.1030198945377128e-06, "loss": 0.0069, "num_input_tokens_seen": 120977648, "step": 56065 }, { "epoch": 9.146818923327896, "grad_norm": 0.14568936824798584, "learning_rate": 1.1009298845871013e-06, "loss": 0.015, "num_input_tokens_seen": 120987728, "step": 56070 }, { "epoch": 9.147634584013051, "grad_norm": 0.10506874322891235, "learning_rate": 1.098841812016374e-06, "loss": 0.0639, "num_input_tokens_seen": 120998640, "step": 56075 }, { "epoch": 9.148450244698205, "grad_norm": 0.078071728348732, "learning_rate": 1.096755676994804e-06, "loss": 0.1179, "num_input_tokens_seen": 121008912, "step": 56080 }, { "epoch": 9.149265905383361, "grad_norm": 0.15917930006980896, "learning_rate": 1.0946714796915032e-06, "loss": 0.0067, "num_input_tokens_seen": 121019184, "step": 56085 }, { "epoch": 9.150081566068515, "grad_norm": 2.8171603679656982, "learning_rate": 1.092589220275425e-06, "loss": 0.133, "num_input_tokens_seen": 121031184, "step": 56090 }, { "epoch": 9.15089722675367, "grad_norm": 17.829561233520508, "learning_rate": 1.0905088989153712e-06, "loss": 0.2337, "num_input_tokens_seen": 121042672, "step": 56095 }, { "epoch": 9.151712887438826, "grad_norm": 1.4083698987960815, "learning_rate": 1.0884305157799785e-06, "loss": 0.2532, "num_input_tokens_seen": 121053520, "step": 56100 }, { "epoch": 9.15252854812398, "grad_norm": 0.061346959322690964, "learning_rate": 1.0863540710377373e-06, "loss": 0.0061, "num_input_tokens_seen": 121064688, "step": 56105 }, { "epoch": 9.153344208809136, "grad_norm": 0.05480477958917618, "learning_rate": 1.0842795648569688e-06, "loss": 0.2609, "num_input_tokens_seen": 121075088, "step": 56110 }, { "epoch": 9.15415986949429, "grad_norm": 0.09183188527822495, "learning_rate": 1.0822069974058464e-06, "loss": 0.1282, "num_input_tokens_seen": 121085424, "step": 56115 }, { "epoch": 9.154975530179446, "grad_norm": 0.22845900058746338, "learning_rate": 1.0801363688523858e-06, "loss": 0.0065, "num_input_tokens_seen": 121097072, "step": 56120 }, { "epoch": 9.1557911908646, "grad_norm": 0.09074364602565765, "learning_rate": 1.0780676793644362e-06, "loss": 0.005, "num_input_tokens_seen": 121108880, "step": 56125 }, { "epoch": 9.156606851549755, "grad_norm": 0.3043750822544098, "learning_rate": 1.0760009291097022e-06, "loss": 0.0171, "num_input_tokens_seen": 121120432, "step": 56130 }, { "epoch": 9.15742251223491, "grad_norm": 0.32757917046546936, "learning_rate": 1.0739361182557194e-06, "loss": 0.3645, "num_input_tokens_seen": 121131952, "step": 56135 }, { "epoch": 9.158238172920065, "grad_norm": 0.04966404289007187, "learning_rate": 1.071873246969876e-06, "loss": 0.0075, "num_input_tokens_seen": 121140304, "step": 56140 }, { "epoch": 9.15905383360522, "grad_norm": 0.04576646909117699, "learning_rate": 1.0698123154193967e-06, "loss": 0.1015, "num_input_tokens_seen": 121150608, "step": 56145 }, { "epoch": 9.159869494290374, "grad_norm": 0.050863754004240036, "learning_rate": 1.0677533237713533e-06, "loss": 0.0795, "num_input_tokens_seen": 121161296, "step": 56150 }, { "epoch": 9.16068515497553, "grad_norm": 0.09488074481487274, "learning_rate": 1.0656962721926539e-06, "loss": 0.1087, "num_input_tokens_seen": 121172976, "step": 56155 }, { "epoch": 9.161500815660686, "grad_norm": 0.039611224085092545, "learning_rate": 1.063641160850054e-06, "loss": 0.0031, "num_input_tokens_seen": 121183024, "step": 56160 }, { "epoch": 9.16231647634584, "grad_norm": 3.2668161392211914, "learning_rate": 1.0615879899101567e-06, "loss": 0.1267, "num_input_tokens_seen": 121194032, "step": 56165 }, { "epoch": 9.163132137030995, "grad_norm": 0.11332713067531586, "learning_rate": 1.0595367595393978e-06, "loss": 0.0045, "num_input_tokens_seen": 121205296, "step": 56170 }, { "epoch": 9.16394779771615, "grad_norm": 0.09239339828491211, "learning_rate": 1.0574874699040643e-06, "loss": 0.0025, "num_input_tokens_seen": 121217168, "step": 56175 }, { "epoch": 9.164763458401305, "grad_norm": 4.365166664123535, "learning_rate": 1.0554401211702787e-06, "loss": 0.4424, "num_input_tokens_seen": 121228176, "step": 56180 }, { "epoch": 9.16557911908646, "grad_norm": 0.03770161792635918, "learning_rate": 1.0533947135040106e-06, "loss": 0.0038, "num_input_tokens_seen": 121237968, "step": 56185 }, { "epoch": 9.166394779771615, "grad_norm": 0.14600355923175812, "learning_rate": 1.0513512470710695e-06, "loss": 0.01, "num_input_tokens_seen": 121248048, "step": 56190 }, { "epoch": 9.16721044045677, "grad_norm": 0.13089050352573395, "learning_rate": 1.0493097220371117e-06, "loss": 0.072, "num_input_tokens_seen": 121259088, "step": 56195 }, { "epoch": 9.168026101141924, "grad_norm": 0.10141665488481522, "learning_rate": 1.0472701385676326e-06, "loss": 0.2774, "num_input_tokens_seen": 121270416, "step": 56200 }, { "epoch": 9.16884176182708, "grad_norm": 0.10410164296627045, "learning_rate": 1.045232496827972e-06, "loss": 0.097, "num_input_tokens_seen": 121281424, "step": 56205 }, { "epoch": 9.169657422512234, "grad_norm": 0.17221102118492126, "learning_rate": 1.043196796983309e-06, "loss": 0.0065, "num_input_tokens_seen": 121292496, "step": 56210 }, { "epoch": 9.17047308319739, "grad_norm": 0.06364672631025314, "learning_rate": 1.0411630391986698e-06, "loss": 0.0023, "num_input_tokens_seen": 121303280, "step": 56215 }, { "epoch": 9.171288743882545, "grad_norm": 0.3799787163734436, "learning_rate": 1.03913122363892e-06, "loss": 0.0966, "num_input_tokens_seen": 121315248, "step": 56220 }, { "epoch": 9.1721044045677, "grad_norm": 0.17793439328670502, "learning_rate": 1.0371013504687692e-06, "loss": 0.0834, "num_input_tokens_seen": 121326256, "step": 56225 }, { "epoch": 9.172920065252855, "grad_norm": 0.18157732486724854, "learning_rate": 1.0350734198527696e-06, "loss": 0.2261, "num_input_tokens_seen": 121337264, "step": 56230 }, { "epoch": 9.173735725938009, "grad_norm": 0.2199692577123642, "learning_rate": 1.033047431955317e-06, "loss": 0.1282, "num_input_tokens_seen": 121347600, "step": 56235 }, { "epoch": 9.174551386623165, "grad_norm": 0.1965482532978058, "learning_rate": 1.0310233869406437e-06, "loss": 0.161, "num_input_tokens_seen": 121358800, "step": 56240 }, { "epoch": 9.17536704730832, "grad_norm": 0.4902295470237732, "learning_rate": 1.0290012849728358e-06, "loss": 0.0069, "num_input_tokens_seen": 121369712, "step": 56245 }, { "epoch": 9.176182707993474, "grad_norm": 0.10036962479352951, "learning_rate": 1.0269811262158092e-06, "loss": 0.0804, "num_input_tokens_seen": 121380784, "step": 56250 }, { "epoch": 9.17699836867863, "grad_norm": 0.0895148441195488, "learning_rate": 1.024962910833327e-06, "loss": 0.0062, "num_input_tokens_seen": 121392752, "step": 56255 }, { "epoch": 9.177814029363784, "grad_norm": 0.0164335947483778, "learning_rate": 1.022946638989003e-06, "loss": 0.0784, "num_input_tokens_seen": 121402928, "step": 56260 }, { "epoch": 9.17862969004894, "grad_norm": 0.08182208240032196, "learning_rate": 1.0209323108462816e-06, "loss": 0.004, "num_input_tokens_seen": 121413008, "step": 56265 }, { "epoch": 9.179445350734095, "grad_norm": 0.0456121489405632, "learning_rate": 1.018919926568457e-06, "loss": 0.0522, "num_input_tokens_seen": 121422896, "step": 56270 }, { "epoch": 9.18026101141925, "grad_norm": 0.07641737908124924, "learning_rate": 1.0169094863186623e-06, "loss": 0.2069, "num_input_tokens_seen": 121434512, "step": 56275 }, { "epoch": 9.181076672104405, "grad_norm": 0.0399506501853466, "learning_rate": 1.0149009902598706e-06, "loss": 0.2618, "num_input_tokens_seen": 121445840, "step": 56280 }, { "epoch": 9.181892332789559, "grad_norm": 0.06404338777065277, "learning_rate": 1.0128944385549038e-06, "loss": 0.0735, "num_input_tokens_seen": 121457264, "step": 56285 }, { "epoch": 9.182707993474715, "grad_norm": 0.10424448549747467, "learning_rate": 1.0108898313664267e-06, "loss": 0.0026, "num_input_tokens_seen": 121467728, "step": 56290 }, { "epoch": 9.18352365415987, "grad_norm": 0.0442383848130703, "learning_rate": 1.0088871688569397e-06, "loss": 0.2079, "num_input_tokens_seen": 121478608, "step": 56295 }, { "epoch": 9.184339314845024, "grad_norm": 0.13860155642032623, "learning_rate": 1.006886451188785e-06, "loss": 0.0058, "num_input_tokens_seen": 121489488, "step": 56300 }, { "epoch": 9.18515497553018, "grad_norm": 0.09070900827646255, "learning_rate": 1.0048876785241578e-06, "loss": 0.1313, "num_input_tokens_seen": 121500400, "step": 56305 }, { "epoch": 9.185970636215334, "grad_norm": 0.12147332727909088, "learning_rate": 1.0028908510250846e-06, "loss": 0.1195, "num_input_tokens_seen": 121511088, "step": 56310 }, { "epoch": 9.18678629690049, "grad_norm": 0.374837189912796, "learning_rate": 1.000895968853438e-06, "loss": 0.118, "num_input_tokens_seen": 121523088, "step": 56315 }, { "epoch": 9.187601957585644, "grad_norm": 0.039751630276441574, "learning_rate": 9.989030321709336e-07, "loss": 0.0051, "num_input_tokens_seen": 121533808, "step": 56320 }, { "epoch": 9.1884176182708, "grad_norm": 19.650569915771484, "learning_rate": 9.969120411391308e-07, "loss": 0.1255, "num_input_tokens_seen": 121544240, "step": 56325 }, { "epoch": 9.189233278955955, "grad_norm": 5.189046382904053, "learning_rate": 9.949229959194313e-07, "loss": 0.2986, "num_input_tokens_seen": 121554992, "step": 56330 }, { "epoch": 9.190048939641109, "grad_norm": 0.09287931025028229, "learning_rate": 9.929358966730696e-07, "loss": 0.16, "num_input_tokens_seen": 121565424, "step": 56335 }, { "epoch": 9.190864600326265, "grad_norm": 0.11300577968358994, "learning_rate": 9.909507435611365e-07, "loss": 0.1685, "num_input_tokens_seen": 121575504, "step": 56340 }, { "epoch": 9.191680261011419, "grad_norm": 0.10862802714109421, "learning_rate": 9.889675367445589e-07, "loss": 0.0042, "num_input_tokens_seen": 121586320, "step": 56345 }, { "epoch": 9.192495921696574, "grad_norm": 0.10327637195587158, "learning_rate": 9.869862763841026e-07, "loss": 0.2934, "num_input_tokens_seen": 121596976, "step": 56350 }, { "epoch": 9.19331158238173, "grad_norm": 0.07766059041023254, "learning_rate": 9.85006962640378e-07, "loss": 0.0132, "num_input_tokens_seen": 121607344, "step": 56355 }, { "epoch": 9.194127243066884, "grad_norm": 0.06651419401168823, "learning_rate": 9.8302959567384e-07, "loss": 0.0667, "num_input_tokens_seen": 121617424, "step": 56360 }, { "epoch": 9.19494290375204, "grad_norm": 7.505390644073486, "learning_rate": 9.810541756447855e-07, "loss": 0.0799, "num_input_tokens_seen": 121629744, "step": 56365 }, { "epoch": 9.195758564437194, "grad_norm": 3.8123583793640137, "learning_rate": 9.790807027133446e-07, "loss": 0.2384, "num_input_tokens_seen": 121641008, "step": 56370 }, { "epoch": 9.19657422512235, "grad_norm": 0.07194401323795319, "learning_rate": 9.77109177039509e-07, "loss": 0.0053, "num_input_tokens_seen": 121652528, "step": 56375 }, { "epoch": 9.197389885807505, "grad_norm": 0.610551655292511, "learning_rate": 9.751395987830924e-07, "loss": 0.0187, "num_input_tokens_seen": 121663856, "step": 56380 }, { "epoch": 9.198205546492659, "grad_norm": 8.918259620666504, "learning_rate": 9.731719681037616e-07, "loss": 0.1054, "num_input_tokens_seen": 121676112, "step": 56385 }, { "epoch": 9.199021207177815, "grad_norm": 0.08791600912809372, "learning_rate": 9.712062851610222e-07, "loss": 0.098, "num_input_tokens_seen": 121688400, "step": 56390 }, { "epoch": 9.199836867862969, "grad_norm": 4.068906307220459, "learning_rate": 9.692425501142217e-07, "loss": 0.0957, "num_input_tokens_seen": 121698928, "step": 56395 }, { "epoch": 9.200652528548124, "grad_norm": 8.034584045410156, "learning_rate": 9.672807631225521e-07, "loss": 0.2178, "num_input_tokens_seen": 121708496, "step": 56400 }, { "epoch": 9.201468189233278, "grad_norm": 41.141563415527344, "learning_rate": 9.65320924345045e-07, "loss": 0.0419, "num_input_tokens_seen": 121720240, "step": 56405 }, { "epoch": 9.202283849918434, "grad_norm": 0.07082147151231766, "learning_rate": 9.633630339405731e-07, "loss": 0.099, "num_input_tokens_seen": 121732080, "step": 56410 }, { "epoch": 9.20309951060359, "grad_norm": 0.12724247574806213, "learning_rate": 9.614070920678536e-07, "loss": 0.0115, "num_input_tokens_seen": 121742096, "step": 56415 }, { "epoch": 9.203915171288743, "grad_norm": 0.16074572503566742, "learning_rate": 9.59453098885446e-07, "loss": 0.0046, "num_input_tokens_seen": 121752496, "step": 56420 }, { "epoch": 9.2047308319739, "grad_norm": 0.1263059824705124, "learning_rate": 9.575010545517487e-07, "loss": 0.1537, "num_input_tokens_seen": 121763024, "step": 56425 }, { "epoch": 9.205546492659053, "grad_norm": 0.044610343873500824, "learning_rate": 9.5555095922501e-07, "loss": 0.1087, "num_input_tokens_seen": 121774352, "step": 56430 }, { "epoch": 9.206362153344209, "grad_norm": 0.02099367417395115, "learning_rate": 9.53602813063309e-07, "loss": 0.0054, "num_input_tokens_seen": 121785296, "step": 56435 }, { "epoch": 9.207177814029365, "grad_norm": 0.11674931645393372, "learning_rate": 9.516566162245749e-07, "loss": 0.1457, "num_input_tokens_seen": 121796752, "step": 56440 }, { "epoch": 9.207993474714518, "grad_norm": 13.21556282043457, "learning_rate": 9.49712368866576e-07, "loss": 0.0238, "num_input_tokens_seen": 121807440, "step": 56445 }, { "epoch": 9.208809135399674, "grad_norm": 0.03066655993461609, "learning_rate": 9.477700711469223e-07, "loss": 0.0045, "num_input_tokens_seen": 121818640, "step": 56450 }, { "epoch": 9.209624796084828, "grad_norm": 0.02105654403567314, "learning_rate": 9.458297232230684e-07, "loss": 0.0798, "num_input_tokens_seen": 121828560, "step": 56455 }, { "epoch": 9.210440456769984, "grad_norm": 5.688864707946777, "learning_rate": 9.438913252523024e-07, "loss": 0.0776, "num_input_tokens_seen": 121839088, "step": 56460 }, { "epoch": 9.21125611745514, "grad_norm": 0.07022027671337128, "learning_rate": 9.41954877391768e-07, "loss": 0.0042, "num_input_tokens_seen": 121849744, "step": 56465 }, { "epoch": 9.212071778140293, "grad_norm": 0.03331270068883896, "learning_rate": 9.400203797984397e-07, "loss": 0.1969, "num_input_tokens_seen": 121859696, "step": 56470 }, { "epoch": 9.21288743882545, "grad_norm": 0.18326084315776825, "learning_rate": 9.380878326291392e-07, "loss": 0.003, "num_input_tokens_seen": 121869840, "step": 56475 }, { "epoch": 9.213703099510603, "grad_norm": 0.12050646543502808, "learning_rate": 9.361572360405246e-07, "loss": 0.0027, "num_input_tokens_seen": 121880432, "step": 56480 }, { "epoch": 9.214518760195759, "grad_norm": 0.13081873953342438, "learning_rate": 9.342285901891068e-07, "loss": 0.0263, "num_input_tokens_seen": 121889264, "step": 56485 }, { "epoch": 9.215334420880913, "grad_norm": 0.3042246401309967, "learning_rate": 9.323018952312273e-07, "loss": 0.0051, "num_input_tokens_seen": 121900016, "step": 56490 }, { "epoch": 9.216150081566068, "grad_norm": 0.041951294988393784, "learning_rate": 9.303771513230752e-07, "loss": 0.0053, "num_input_tokens_seen": 121911312, "step": 56495 }, { "epoch": 9.216965742251224, "grad_norm": 0.16849592328071594, "learning_rate": 9.284543586206784e-07, "loss": 0.0035, "num_input_tokens_seen": 121923088, "step": 56500 }, { "epoch": 9.217781402936378, "grad_norm": 2.54909348487854, "learning_rate": 9.265335172799094e-07, "loss": 0.1501, "num_input_tokens_seen": 121932720, "step": 56505 }, { "epoch": 9.218597063621534, "grad_norm": 0.19697806239128113, "learning_rate": 9.246146274564798e-07, "loss": 0.0434, "num_input_tokens_seen": 121943248, "step": 56510 }, { "epoch": 9.219412724306688, "grad_norm": 2.1499359607696533, "learning_rate": 9.226976893059458e-07, "loss": 0.0835, "num_input_tokens_seen": 121953712, "step": 56515 }, { "epoch": 9.220228384991843, "grad_norm": 0.06755515933036804, "learning_rate": 9.207827029837052e-07, "loss": 0.0033, "num_input_tokens_seen": 121965456, "step": 56520 }, { "epoch": 9.221044045676999, "grad_norm": 0.14053942263126373, "learning_rate": 9.188696686449949e-07, "loss": 0.0049, "num_input_tokens_seen": 121975120, "step": 56525 }, { "epoch": 9.221859706362153, "grad_norm": 11.012677192687988, "learning_rate": 9.169585864448965e-07, "loss": 0.2189, "num_input_tokens_seen": 121987344, "step": 56530 }, { "epoch": 9.222675367047309, "grad_norm": 0.3366019129753113, "learning_rate": 9.150494565383305e-07, "loss": 0.005, "num_input_tokens_seen": 121998448, "step": 56535 }, { "epoch": 9.223491027732463, "grad_norm": 0.04674627259373665, "learning_rate": 9.13142279080062e-07, "loss": 0.3208, "num_input_tokens_seen": 122009104, "step": 56540 }, { "epoch": 9.224306688417618, "grad_norm": 20.346275329589844, "learning_rate": 9.112370542246978e-07, "loss": 0.1621, "num_input_tokens_seen": 122019952, "step": 56545 }, { "epoch": 9.225122349102774, "grad_norm": 0.092558853328228, "learning_rate": 9.093337821266784e-07, "loss": 0.0986, "num_input_tokens_seen": 122031280, "step": 56550 }, { "epoch": 9.225938009787928, "grad_norm": 0.23202751576900482, "learning_rate": 9.074324629403025e-07, "loss": 0.0039, "num_input_tokens_seen": 122041584, "step": 56555 }, { "epoch": 9.226753670473084, "grad_norm": 5.594674110412598, "learning_rate": 9.055330968196912e-07, "loss": 0.0708, "num_input_tokens_seen": 122051312, "step": 56560 }, { "epoch": 9.227569331158238, "grad_norm": 0.14860931038856506, "learning_rate": 9.036356839188243e-07, "loss": 0.01, "num_input_tokens_seen": 122061552, "step": 56565 }, { "epoch": 9.228384991843393, "grad_norm": 0.10265801846981049, "learning_rate": 9.017402243915091e-07, "loss": 0.1881, "num_input_tokens_seen": 122071184, "step": 56570 }, { "epoch": 9.229200652528547, "grad_norm": 3.455475330352783, "learning_rate": 8.998467183914061e-07, "loss": 0.0723, "num_input_tokens_seen": 122083344, "step": 56575 }, { "epoch": 9.230016313213703, "grad_norm": 0.05952727794647217, "learning_rate": 8.97955166072012e-07, "loss": 0.0022, "num_input_tokens_seen": 122094832, "step": 56580 }, { "epoch": 9.230831973898859, "grad_norm": 0.07937181740999222, "learning_rate": 8.960655675866653e-07, "loss": 0.1006, "num_input_tokens_seen": 122106032, "step": 56585 }, { "epoch": 9.231647634584013, "grad_norm": 3.776068687438965, "learning_rate": 8.941779230885433e-07, "loss": 0.1298, "num_input_tokens_seen": 122116304, "step": 56590 }, { "epoch": 9.232463295269168, "grad_norm": 0.0652937963604927, "learning_rate": 8.92292232730671e-07, "loss": 0.0735, "num_input_tokens_seen": 122128176, "step": 56595 }, { "epoch": 9.233278955954322, "grad_norm": 0.027470342814922333, "learning_rate": 8.904084966659121e-07, "loss": 0.1113, "num_input_tokens_seen": 122139696, "step": 56600 }, { "epoch": 9.234094616639478, "grad_norm": 0.22010153532028198, "learning_rate": 8.885267150469723e-07, "loss": 0.2322, "num_input_tokens_seen": 122151024, "step": 56605 }, { "epoch": 9.234910277324634, "grad_norm": 0.056948062032461166, "learning_rate": 8.866468880263961e-07, "loss": 0.0115, "num_input_tokens_seen": 122161648, "step": 56610 }, { "epoch": 9.235725938009788, "grad_norm": 0.05341499298810959, "learning_rate": 8.847690157565758e-07, "loss": 0.1356, "num_input_tokens_seen": 122172656, "step": 56615 }, { "epoch": 9.236541598694943, "grad_norm": 0.05375171825289726, "learning_rate": 8.828930983897366e-07, "loss": 0.1139, "num_input_tokens_seen": 122183056, "step": 56620 }, { "epoch": 9.237357259380097, "grad_norm": 0.24702052772045135, "learning_rate": 8.810191360779513e-07, "loss": 0.0217, "num_input_tokens_seen": 122194224, "step": 56625 }, { "epoch": 9.238172920065253, "grad_norm": 0.04193101450800896, "learning_rate": 8.791471289731346e-07, "loss": 0.0308, "num_input_tokens_seen": 122204784, "step": 56630 }, { "epoch": 9.238988580750409, "grad_norm": 0.053464144468307495, "learning_rate": 8.7727707722704e-07, "loss": 0.0023, "num_input_tokens_seen": 122215280, "step": 56635 }, { "epoch": 9.239804241435563, "grad_norm": 0.9686161875724792, "learning_rate": 8.75408980991263e-07, "loss": 0.0977, "num_input_tokens_seen": 122226128, "step": 56640 }, { "epoch": 9.240619902120718, "grad_norm": 4.109722137451172, "learning_rate": 8.735428404172408e-07, "loss": 0.1423, "num_input_tokens_seen": 122237552, "step": 56645 }, { "epoch": 9.241435562805872, "grad_norm": 0.08541485667228699, "learning_rate": 8.716786556562495e-07, "loss": 0.2074, "num_input_tokens_seen": 122248880, "step": 56650 }, { "epoch": 9.242251223491028, "grad_norm": 0.3115135133266449, "learning_rate": 8.698164268594155e-07, "loss": 0.1094, "num_input_tokens_seen": 122259760, "step": 56655 }, { "epoch": 9.243066884176184, "grad_norm": 5.633404731750488, "learning_rate": 8.679561541776959e-07, "loss": 0.0973, "num_input_tokens_seen": 122271120, "step": 56660 }, { "epoch": 9.243882544861338, "grad_norm": 0.1086176410317421, "learning_rate": 8.660978377618951e-07, "loss": 0.0678, "num_input_tokens_seen": 122281424, "step": 56665 }, { "epoch": 9.244698205546493, "grad_norm": 0.23381710052490234, "learning_rate": 8.64241477762659e-07, "loss": 0.0081, "num_input_tokens_seen": 122292784, "step": 56670 }, { "epoch": 9.245513866231647, "grad_norm": 0.07667340338230133, "learning_rate": 8.62387074330473e-07, "loss": 0.0459, "num_input_tokens_seen": 122302992, "step": 56675 }, { "epoch": 9.246329526916803, "grad_norm": 3.798373222351074, "learning_rate": 8.605346276156611e-07, "loss": 0.0729, "num_input_tokens_seen": 122314320, "step": 56680 }, { "epoch": 9.247145187601957, "grad_norm": 0.08883180469274521, "learning_rate": 8.586841377683951e-07, "loss": 0.0029, "num_input_tokens_seen": 122324848, "step": 56685 }, { "epoch": 9.247960848287113, "grad_norm": 0.037743981927633286, "learning_rate": 8.568356049386827e-07, "loss": 0.0028, "num_input_tokens_seen": 122335760, "step": 56690 }, { "epoch": 9.248776508972268, "grad_norm": 0.11212898045778275, "learning_rate": 8.549890292763819e-07, "loss": 0.0862, "num_input_tokens_seen": 122346416, "step": 56695 }, { "epoch": 9.249592169657422, "grad_norm": 0.07642631232738495, "learning_rate": 8.531444109311781e-07, "loss": 0.094, "num_input_tokens_seen": 122358608, "step": 56700 }, { "epoch": 9.250407830342578, "grad_norm": 0.07463725656270981, "learning_rate": 8.513017500526105e-07, "loss": 0.0045, "num_input_tokens_seen": 122369264, "step": 56705 }, { "epoch": 9.251223491027732, "grad_norm": 0.4207376539707184, "learning_rate": 8.49461046790051e-07, "loss": 0.3118, "num_input_tokens_seen": 122380528, "step": 56710 }, { "epoch": 9.252039151712887, "grad_norm": 0.08353845775127411, "learning_rate": 8.476223012927193e-07, "loss": 0.0027, "num_input_tokens_seen": 122391856, "step": 56715 }, { "epoch": 9.252854812398043, "grad_norm": 0.053139425814151764, "learning_rate": 8.457855137096682e-07, "loss": 0.0053, "num_input_tokens_seen": 122402672, "step": 56720 }, { "epoch": 9.253670473083197, "grad_norm": 0.13485592603683472, "learning_rate": 8.439506841898037e-07, "loss": 0.0875, "num_input_tokens_seen": 122412368, "step": 56725 }, { "epoch": 9.254486133768353, "grad_norm": 24.39397430419922, "learning_rate": 8.421178128818624e-07, "loss": 0.0173, "num_input_tokens_seen": 122423248, "step": 56730 }, { "epoch": 9.255301794453507, "grad_norm": 0.08826201409101486, "learning_rate": 8.402868999344283e-07, "loss": 0.0068, "num_input_tokens_seen": 122433040, "step": 56735 }, { "epoch": 9.256117455138662, "grad_norm": 0.04894102364778519, "learning_rate": 8.384579454959185e-07, "loss": 0.0032, "num_input_tokens_seen": 122442928, "step": 56740 }, { "epoch": 9.256933115823816, "grad_norm": 0.3541920483112335, "learning_rate": 8.366309497146063e-07, "loss": 0.0129, "num_input_tokens_seen": 122453488, "step": 56745 }, { "epoch": 9.257748776508972, "grad_norm": 0.07416754215955734, "learning_rate": 8.348059127385926e-07, "loss": 0.0055, "num_input_tokens_seen": 122463632, "step": 56750 }, { "epoch": 9.258564437194128, "grad_norm": 0.09072282910346985, "learning_rate": 8.329828347158231e-07, "loss": 0.0029, "num_input_tokens_seen": 122474736, "step": 56755 }, { "epoch": 9.259380097879282, "grad_norm": 0.07462501525878906, "learning_rate": 8.311617157940904e-07, "loss": 0.0051, "num_input_tokens_seen": 122485776, "step": 56760 }, { "epoch": 9.260195758564437, "grad_norm": 0.04590676724910736, "learning_rate": 8.293425561210183e-07, "loss": 0.0034, "num_input_tokens_seen": 122496176, "step": 56765 }, { "epoch": 9.261011419249591, "grad_norm": 0.03457889333367348, "learning_rate": 8.275253558440776e-07, "loss": 0.0037, "num_input_tokens_seen": 122507760, "step": 56770 }, { "epoch": 9.261827079934747, "grad_norm": 19.248991012573242, "learning_rate": 8.257101151105839e-07, "loss": 0.0902, "num_input_tokens_seen": 122518768, "step": 56775 }, { "epoch": 9.262642740619903, "grad_norm": 11.731621742248535, "learning_rate": 8.23896834067689e-07, "loss": 0.0843, "num_input_tokens_seen": 122531216, "step": 56780 }, { "epoch": 9.263458401305057, "grad_norm": 17.964866638183594, "learning_rate": 8.220855128623805e-07, "loss": 0.0988, "num_input_tokens_seen": 122541552, "step": 56785 }, { "epoch": 9.264274061990212, "grad_norm": 0.7958881258964539, "learning_rate": 8.202761516415025e-07, "loss": 0.0886, "num_input_tokens_seen": 122553712, "step": 56790 }, { "epoch": 9.265089722675366, "grad_norm": 0.15371784567832947, "learning_rate": 8.184687505517236e-07, "loss": 0.1292, "num_input_tokens_seen": 122564976, "step": 56795 }, { "epoch": 9.265905383360522, "grad_norm": 0.05780799314379692, "learning_rate": 8.166633097395626e-07, "loss": 0.1017, "num_input_tokens_seen": 122576272, "step": 56800 }, { "epoch": 9.266721044045678, "grad_norm": 2.097024917602539, "learning_rate": 8.148598293513804e-07, "loss": 0.097, "num_input_tokens_seen": 122586576, "step": 56805 }, { "epoch": 9.267536704730832, "grad_norm": 7.795865058898926, "learning_rate": 8.130583095333739e-07, "loss": 0.0138, "num_input_tokens_seen": 122598000, "step": 56810 }, { "epoch": 9.268352365415987, "grad_norm": 0.16377630829811096, "learning_rate": 8.112587504315844e-07, "loss": 0.1037, "num_input_tokens_seen": 122606896, "step": 56815 }, { "epoch": 9.269168026101141, "grad_norm": 4.764657974243164, "learning_rate": 8.094611521918927e-07, "loss": 0.0071, "num_input_tokens_seen": 122617200, "step": 56820 }, { "epoch": 9.269983686786297, "grad_norm": 0.371650367975235, "learning_rate": 8.076655149600237e-07, "loss": 0.2225, "num_input_tokens_seen": 122626512, "step": 56825 }, { "epoch": 9.270799347471453, "grad_norm": 0.23115122318267822, "learning_rate": 8.058718388815362e-07, "loss": 0.1342, "num_input_tokens_seen": 122636848, "step": 56830 }, { "epoch": 9.271615008156607, "grad_norm": 2.2169458866119385, "learning_rate": 8.040801241018386e-07, "loss": 0.099, "num_input_tokens_seen": 122646928, "step": 56835 }, { "epoch": 9.272430668841762, "grad_norm": 1.1989107131958008, "learning_rate": 8.022903707661761e-07, "loss": 0.1356, "num_input_tokens_seen": 122656752, "step": 56840 }, { "epoch": 9.273246329526916, "grad_norm": 0.11020349711179733, "learning_rate": 8.005025790196325e-07, "loss": 0.0815, "num_input_tokens_seen": 122668656, "step": 56845 }, { "epoch": 9.274061990212072, "grad_norm": 0.1726074516773224, "learning_rate": 7.987167490071362e-07, "loss": 0.0086, "num_input_tokens_seen": 122679728, "step": 56850 }, { "epoch": 9.274877650897226, "grad_norm": 8.0415678024292, "learning_rate": 7.969328808734577e-07, "loss": 0.1705, "num_input_tokens_seen": 122691536, "step": 56855 }, { "epoch": 9.275693311582382, "grad_norm": 0.1015799269080162, "learning_rate": 7.951509747632063e-07, "loss": 0.0056, "num_input_tokens_seen": 122701488, "step": 56860 }, { "epoch": 9.276508972267537, "grad_norm": 0.15888768434524536, "learning_rate": 7.933710308208275e-07, "loss": 0.0042, "num_input_tokens_seen": 122712784, "step": 56865 }, { "epoch": 9.277324632952691, "grad_norm": 0.11911559104919434, "learning_rate": 7.915930491906198e-07, "loss": 0.1025, "num_input_tokens_seen": 122723184, "step": 56870 }, { "epoch": 9.278140293637847, "grad_norm": 0.11030533164739609, "learning_rate": 7.898170300167096e-07, "loss": 0.111, "num_input_tokens_seen": 122733808, "step": 56875 }, { "epoch": 9.278955954323001, "grad_norm": 0.9021227955818176, "learning_rate": 7.880429734430706e-07, "loss": 0.0076, "num_input_tokens_seen": 122745552, "step": 56880 }, { "epoch": 9.279771615008157, "grad_norm": 0.10199274867773056, "learning_rate": 7.862708796135182e-07, "loss": 0.0093, "num_input_tokens_seen": 122755632, "step": 56885 }, { "epoch": 9.280587275693312, "grad_norm": 0.09611979871988297, "learning_rate": 7.845007486717099e-07, "loss": 0.0684, "num_input_tokens_seen": 122767696, "step": 56890 }, { "epoch": 9.281402936378466, "grad_norm": 0.17125804722309113, "learning_rate": 7.827325807611391e-07, "loss": 0.1048, "num_input_tokens_seen": 122779376, "step": 56895 }, { "epoch": 9.282218597063622, "grad_norm": 0.1180923730134964, "learning_rate": 7.80966376025144e-07, "loss": 0.0117, "num_input_tokens_seen": 122790832, "step": 56900 }, { "epoch": 9.283034257748776, "grad_norm": 0.10925772786140442, "learning_rate": 7.792021346068989e-07, "loss": 0.1719, "num_input_tokens_seen": 122801456, "step": 56905 }, { "epoch": 9.283849918433932, "grad_norm": 0.050430506467819214, "learning_rate": 7.774398566494201e-07, "loss": 0.0705, "num_input_tokens_seen": 122813520, "step": 56910 }, { "epoch": 9.284665579119087, "grad_norm": 0.15304403007030487, "learning_rate": 7.756795422955737e-07, "loss": 0.0806, "num_input_tokens_seen": 122823344, "step": 56915 }, { "epoch": 9.285481239804241, "grad_norm": 0.0694902166724205, "learning_rate": 7.739211916880595e-07, "loss": 0.2018, "num_input_tokens_seen": 122834128, "step": 56920 }, { "epoch": 9.286296900489397, "grad_norm": 0.06205080822110176, "learning_rate": 7.721648049694108e-07, "loss": 0.0031, "num_input_tokens_seen": 122845456, "step": 56925 }, { "epoch": 9.28711256117455, "grad_norm": 0.03544263914227486, "learning_rate": 7.704103822820164e-07, "loss": 0.0061, "num_input_tokens_seen": 122856272, "step": 56930 }, { "epoch": 9.287928221859707, "grad_norm": 0.021660730242729187, "learning_rate": 7.686579237680957e-07, "loss": 0.1363, "num_input_tokens_seen": 122866544, "step": 56935 }, { "epoch": 9.28874388254486, "grad_norm": 8.081960678100586, "learning_rate": 7.669074295697132e-07, "loss": 0.1119, "num_input_tokens_seen": 122876432, "step": 56940 }, { "epoch": 9.289559543230016, "grad_norm": 0.17294886708259583, "learning_rate": 7.651588998287717e-07, "loss": 0.0653, "num_input_tokens_seen": 122888432, "step": 56945 }, { "epoch": 9.290375203915172, "grad_norm": 0.13838863372802734, "learning_rate": 7.634123346870165e-07, "loss": 0.1269, "num_input_tokens_seen": 122900016, "step": 56950 }, { "epoch": 9.291190864600326, "grad_norm": 0.11037831753492355, "learning_rate": 7.616677342860312e-07, "loss": 0.0072, "num_input_tokens_seen": 122910096, "step": 56955 }, { "epoch": 9.292006525285482, "grad_norm": 0.08414048701524734, "learning_rate": 7.599250987672446e-07, "loss": 0.0041, "num_input_tokens_seen": 122920336, "step": 56960 }, { "epoch": 9.292822185970635, "grad_norm": 0.7040512561798096, "learning_rate": 7.581844282719213e-07, "loss": 0.2497, "num_input_tokens_seen": 122931632, "step": 56965 }, { "epoch": 9.293637846655791, "grad_norm": 0.02372356317937374, "learning_rate": 7.564457229411709e-07, "loss": 0.1279, "num_input_tokens_seen": 122943728, "step": 56970 }, { "epoch": 9.294453507340947, "grad_norm": 0.2222600281238556, "learning_rate": 7.547089829159415e-07, "loss": 0.0056, "num_input_tokens_seen": 122953264, "step": 56975 }, { "epoch": 9.2952691680261, "grad_norm": 0.4200578033924103, "learning_rate": 7.529742083370206e-07, "loss": 0.0649, "num_input_tokens_seen": 122965296, "step": 56980 }, { "epoch": 9.296084828711257, "grad_norm": 7.137514591217041, "learning_rate": 7.512413993450373e-07, "loss": 0.2838, "num_input_tokens_seen": 122974384, "step": 56985 }, { "epoch": 9.29690048939641, "grad_norm": 2.8665337562561035, "learning_rate": 7.495105560804627e-07, "loss": 0.0794, "num_input_tokens_seen": 122985296, "step": 56990 }, { "epoch": 9.297716150081566, "grad_norm": 0.11218998581171036, "learning_rate": 7.477816786836122e-07, "loss": 0.004, "num_input_tokens_seen": 122996272, "step": 56995 }, { "epoch": 9.298531810766722, "grad_norm": 0.11332866549491882, "learning_rate": 7.460547672946294e-07, "loss": 0.0908, "num_input_tokens_seen": 123007440, "step": 57000 }, { "epoch": 9.299347471451876, "grad_norm": 3.5079095363616943, "learning_rate": 7.443298220535106e-07, "loss": 0.3637, "num_input_tokens_seen": 123018096, "step": 57005 }, { "epoch": 9.300163132137031, "grad_norm": 0.09769842028617859, "learning_rate": 7.426068431000882e-07, "loss": 0.2011, "num_input_tokens_seen": 123029072, "step": 57010 }, { "epoch": 9.300978792822185, "grad_norm": 1.6165767908096313, "learning_rate": 7.408858305740368e-07, "loss": 0.1306, "num_input_tokens_seen": 123040816, "step": 57015 }, { "epoch": 9.301794453507341, "grad_norm": 0.28688180446624756, "learning_rate": 7.391667846148697e-07, "loss": 0.0069, "num_input_tokens_seen": 123051376, "step": 57020 }, { "epoch": 9.302610114192497, "grad_norm": 3.106509208679199, "learning_rate": 7.374497053619423e-07, "loss": 0.1187, "num_input_tokens_seen": 123061232, "step": 57025 }, { "epoch": 9.30342577487765, "grad_norm": 0.052906475961208344, "learning_rate": 7.357345929544485e-07, "loss": 0.2703, "num_input_tokens_seen": 123071632, "step": 57030 }, { "epoch": 9.304241435562806, "grad_norm": 2.5314533710479736, "learning_rate": 7.340214475314244e-07, "loss": 0.0949, "num_input_tokens_seen": 123083600, "step": 57035 }, { "epoch": 9.30505709624796, "grad_norm": 0.0716528445482254, "learning_rate": 7.323102692317452e-07, "loss": 0.1539, "num_input_tokens_seen": 123093392, "step": 57040 }, { "epoch": 9.305872756933116, "grad_norm": 0.08974530547857285, "learning_rate": 7.306010581941275e-07, "loss": 0.0046, "num_input_tokens_seen": 123104784, "step": 57045 }, { "epoch": 9.30668841761827, "grad_norm": 0.23656074702739716, "learning_rate": 7.288938145571328e-07, "loss": 0.207, "num_input_tokens_seen": 123115344, "step": 57050 }, { "epoch": 9.307504078303426, "grad_norm": 0.40098971128463745, "learning_rate": 7.271885384591503e-07, "loss": 0.0064, "num_input_tokens_seen": 123125520, "step": 57055 }, { "epoch": 9.308319738988581, "grad_norm": 0.11197123676538467, "learning_rate": 7.25485230038428e-07, "loss": 0.0041, "num_input_tokens_seen": 123136016, "step": 57060 }, { "epoch": 9.309135399673735, "grad_norm": 0.10504305362701416, "learning_rate": 7.237838894330412e-07, "loss": 0.1315, "num_input_tokens_seen": 123147536, "step": 57065 }, { "epoch": 9.309951060358891, "grad_norm": 0.10544651001691818, "learning_rate": 7.220845167809076e-07, "loss": 0.0087, "num_input_tokens_seen": 123157904, "step": 57070 }, { "epoch": 9.310766721044045, "grad_norm": 0.08824353665113449, "learning_rate": 7.203871122197891e-07, "loss": 0.006, "num_input_tokens_seen": 123168720, "step": 57075 }, { "epoch": 9.3115823817292, "grad_norm": 0.19445312023162842, "learning_rate": 7.186916758872841e-07, "loss": 0.0123, "num_input_tokens_seen": 123180144, "step": 57080 }, { "epoch": 9.312398042414356, "grad_norm": 0.1056525856256485, "learning_rate": 7.169982079208326e-07, "loss": 0.0042, "num_input_tokens_seen": 123190896, "step": 57085 }, { "epoch": 9.31321370309951, "grad_norm": 0.20880796015262604, "learning_rate": 7.153067084577192e-07, "loss": 0.0068, "num_input_tokens_seen": 123201424, "step": 57090 }, { "epoch": 9.314029363784666, "grad_norm": 2.59110689163208, "learning_rate": 7.13617177635062e-07, "loss": 0.1177, "num_input_tokens_seen": 123211536, "step": 57095 }, { "epoch": 9.31484502446982, "grad_norm": 0.04983392357826233, "learning_rate": 7.119296155898236e-07, "loss": 0.0978, "num_input_tokens_seen": 123222896, "step": 57100 }, { "epoch": 9.315660685154976, "grad_norm": 3.8871772289276123, "learning_rate": 7.102440224588086e-07, "loss": 0.2656, "num_input_tokens_seen": 123235280, "step": 57105 }, { "epoch": 9.31647634584013, "grad_norm": 0.08402803540229797, "learning_rate": 7.085603983786576e-07, "loss": 0.1197, "num_input_tokens_seen": 123246416, "step": 57110 }, { "epoch": 9.317292006525285, "grad_norm": 0.32142174243927, "learning_rate": 7.068787434858532e-07, "loss": 0.0052, "num_input_tokens_seen": 123257200, "step": 57115 }, { "epoch": 9.318107667210441, "grad_norm": 0.09484074264764786, "learning_rate": 7.051990579167195e-07, "loss": 0.2763, "num_input_tokens_seen": 123268816, "step": 57120 }, { "epoch": 9.318923327895595, "grad_norm": 0.14092539250850677, "learning_rate": 7.035213418074227e-07, "loss": 0.1005, "num_input_tokens_seen": 123279408, "step": 57125 }, { "epoch": 9.31973898858075, "grad_norm": 0.3421323895454407, "learning_rate": 7.018455952939651e-07, "loss": 0.167, "num_input_tokens_seen": 123289232, "step": 57130 }, { "epoch": 9.320554649265905, "grad_norm": 0.09815683960914612, "learning_rate": 7.001718185121908e-07, "loss": 0.2351, "num_input_tokens_seen": 123299984, "step": 57135 }, { "epoch": 9.32137030995106, "grad_norm": 0.09825440496206284, "learning_rate": 6.98500011597783e-07, "loss": 0.017, "num_input_tokens_seen": 123310768, "step": 57140 }, { "epoch": 9.322185970636216, "grad_norm": 0.09648661315441132, "learning_rate": 6.96830174686272e-07, "loss": 0.003, "num_input_tokens_seen": 123321232, "step": 57145 }, { "epoch": 9.32300163132137, "grad_norm": 0.06510291993618011, "learning_rate": 6.951623079130192e-07, "loss": 0.0617, "num_input_tokens_seen": 123332432, "step": 57150 }, { "epoch": 9.323817292006526, "grad_norm": 0.10147050023078918, "learning_rate": 6.934964114132303e-07, "loss": 0.0987, "num_input_tokens_seen": 123344240, "step": 57155 }, { "epoch": 9.32463295269168, "grad_norm": 13.28692626953125, "learning_rate": 6.918324853219527e-07, "loss": 0.2151, "num_input_tokens_seen": 123354672, "step": 57160 }, { "epoch": 9.325448613376835, "grad_norm": 0.053556740283966064, "learning_rate": 6.901705297740729e-07, "loss": 0.2067, "num_input_tokens_seen": 123365168, "step": 57165 }, { "epoch": 9.326264274061991, "grad_norm": 0.11643710732460022, "learning_rate": 6.885105449043138e-07, "loss": 0.0099, "num_input_tokens_seen": 123375696, "step": 57170 }, { "epoch": 9.327079934747145, "grad_norm": 0.0685681700706482, "learning_rate": 6.868525308472484e-07, "loss": 0.2615, "num_input_tokens_seen": 123387344, "step": 57175 }, { "epoch": 9.3278955954323, "grad_norm": 0.09857043623924255, "learning_rate": 6.851964877372802e-07, "loss": 0.006, "num_input_tokens_seen": 123398992, "step": 57180 }, { "epoch": 9.328711256117455, "grad_norm": 2.408092737197876, "learning_rate": 6.835424157086573e-07, "loss": 0.2, "num_input_tokens_seen": 123409168, "step": 57185 }, { "epoch": 9.32952691680261, "grad_norm": 0.1575390100479126, "learning_rate": 6.818903148954642e-07, "loss": 0.0059, "num_input_tokens_seen": 123421488, "step": 57190 }, { "epoch": 9.330342577487766, "grad_norm": 0.11237485706806183, "learning_rate": 6.802401854316298e-07, "loss": 0.0151, "num_input_tokens_seen": 123432816, "step": 57195 }, { "epoch": 9.33115823817292, "grad_norm": 0.7823300957679749, "learning_rate": 6.78592027450925e-07, "loss": 0.015, "num_input_tokens_seen": 123444048, "step": 57200 }, { "epoch": 9.331973898858076, "grad_norm": 0.07418625801801682, "learning_rate": 6.769458410869595e-07, "loss": 0.1235, "num_input_tokens_seen": 123454320, "step": 57205 }, { "epoch": 9.33278955954323, "grad_norm": 0.06946953386068344, "learning_rate": 6.753016264731738e-07, "loss": 0.003, "num_input_tokens_seen": 123465424, "step": 57210 }, { "epoch": 9.333605220228385, "grad_norm": 0.12592476606369019, "learning_rate": 6.736593837428639e-07, "loss": 0.0917, "num_input_tokens_seen": 123475856, "step": 57215 }, { "epoch": 9.33442088091354, "grad_norm": 0.23111547529697418, "learning_rate": 6.720191130291514e-07, "loss": 0.4095, "num_input_tokens_seen": 123484816, "step": 57220 }, { "epoch": 9.335236541598695, "grad_norm": 0.016037456691265106, "learning_rate": 6.703808144650076e-07, "loss": 0.0044, "num_input_tokens_seen": 123495920, "step": 57225 }, { "epoch": 9.33605220228385, "grad_norm": 0.08093839138746262, "learning_rate": 6.687444881832455e-07, "loss": 0.2162, "num_input_tokens_seen": 123507344, "step": 57230 }, { "epoch": 9.336867862969005, "grad_norm": 0.8716356158256531, "learning_rate": 6.67110134316512e-07, "loss": 0.0896, "num_input_tokens_seen": 123517808, "step": 57235 }, { "epoch": 9.33768352365416, "grad_norm": 0.0805845558643341, "learning_rate": 6.654777529972928e-07, "loss": 0.2977, "num_input_tokens_seen": 123529456, "step": 57240 }, { "epoch": 9.338499184339314, "grad_norm": 9.077909469604492, "learning_rate": 6.638473443579179e-07, "loss": 0.1592, "num_input_tokens_seen": 123538960, "step": 57245 }, { "epoch": 9.33931484502447, "grad_norm": 3.288137912750244, "learning_rate": 6.622189085305597e-07, "loss": 0.2394, "num_input_tokens_seen": 123549200, "step": 57250 }, { "epoch": 9.340130505709626, "grad_norm": 0.2037453055381775, "learning_rate": 6.605924456472262e-07, "loss": 0.0729, "num_input_tokens_seen": 123560176, "step": 57255 }, { "epoch": 9.34094616639478, "grad_norm": 0.06806585192680359, "learning_rate": 6.589679558397648e-07, "loss": 0.1231, "num_input_tokens_seen": 123571280, "step": 57260 }, { "epoch": 9.341761827079935, "grad_norm": 0.15784721076488495, "learning_rate": 6.573454392398648e-07, "loss": 0.1468, "num_input_tokens_seen": 123581296, "step": 57265 }, { "epoch": 9.34257748776509, "grad_norm": 0.09280826151371002, "learning_rate": 6.557248959790596e-07, "loss": 0.1565, "num_input_tokens_seen": 123592016, "step": 57270 }, { "epoch": 9.343393148450245, "grad_norm": 0.050861794501543045, "learning_rate": 6.541063261887137e-07, "loss": 0.0804, "num_input_tokens_seen": 123602992, "step": 57275 }, { "epoch": 9.3442088091354, "grad_norm": 0.4141773283481598, "learning_rate": 6.524897300000388e-07, "loss": 0.0277, "num_input_tokens_seen": 123614384, "step": 57280 }, { "epoch": 9.345024469820554, "grad_norm": 14.976688385009766, "learning_rate": 6.508751075440856e-07, "loss": 0.0174, "num_input_tokens_seen": 123625744, "step": 57285 }, { "epoch": 9.34584013050571, "grad_norm": 3.195305347442627, "learning_rate": 6.49262458951741e-07, "loss": 0.2904, "num_input_tokens_seen": 123636816, "step": 57290 }, { "epoch": 9.346655791190864, "grad_norm": 3.126220464706421, "learning_rate": 6.476517843537395e-07, "loss": 0.1124, "num_input_tokens_seen": 123648656, "step": 57295 }, { "epoch": 9.34747145187602, "grad_norm": 0.684742271900177, "learning_rate": 6.46043083880643e-07, "loss": 0.0056, "num_input_tokens_seen": 123658672, "step": 57300 }, { "epoch": 9.348287112561174, "grad_norm": 1.4990553855895996, "learning_rate": 6.44436357662867e-07, "loss": 0.1757, "num_input_tokens_seen": 123669968, "step": 57305 }, { "epoch": 9.34910277324633, "grad_norm": 12.546418190002441, "learning_rate": 6.428316058306571e-07, "loss": 0.0582, "num_input_tokens_seen": 123680880, "step": 57310 }, { "epoch": 9.349918433931485, "grad_norm": 3.4454562664031982, "learning_rate": 6.412288285141066e-07, "loss": 0.1173, "num_input_tokens_seen": 123691536, "step": 57315 }, { "epoch": 9.350734094616639, "grad_norm": 0.20921696722507477, "learning_rate": 6.396280258431391e-07, "loss": 0.0072, "num_input_tokens_seen": 123701904, "step": 57320 }, { "epoch": 9.351549755301795, "grad_norm": 0.11067357659339905, "learning_rate": 6.38029197947529e-07, "loss": 0.0066, "num_input_tokens_seen": 123712528, "step": 57325 }, { "epoch": 9.352365415986949, "grad_norm": 0.15110088884830475, "learning_rate": 6.364323449568804e-07, "loss": 0.0124, "num_input_tokens_seen": 123723184, "step": 57330 }, { "epoch": 9.353181076672104, "grad_norm": 0.04171011224389076, "learning_rate": 6.348374670006485e-07, "loss": 0.0042, "num_input_tokens_seen": 123733968, "step": 57335 }, { "epoch": 9.35399673735726, "grad_norm": 0.12456182390451431, "learning_rate": 6.332445642081214e-07, "loss": 0.0531, "num_input_tokens_seen": 123743120, "step": 57340 }, { "epoch": 9.354812398042414, "grad_norm": 0.09891777485609055, "learning_rate": 6.316536367084236e-07, "loss": 0.0162, "num_input_tokens_seen": 123754992, "step": 57345 }, { "epoch": 9.35562805872757, "grad_norm": 0.024394137784838676, "learning_rate": 6.300646846305241e-07, "loss": 0.0053, "num_input_tokens_seen": 123765488, "step": 57350 }, { "epoch": 9.356443719412724, "grad_norm": 4.904883861541748, "learning_rate": 6.28477708103234e-07, "loss": 0.1155, "num_input_tokens_seen": 123776944, "step": 57355 }, { "epoch": 9.35725938009788, "grad_norm": 2.6884372234344482, "learning_rate": 6.268927072552028e-07, "loss": 0.0686, "num_input_tokens_seen": 123789168, "step": 57360 }, { "epoch": 9.358075040783035, "grad_norm": 0.09686939418315887, "learning_rate": 6.253096822149113e-07, "loss": 0.2114, "num_input_tokens_seen": 123799536, "step": 57365 }, { "epoch": 9.358890701468189, "grad_norm": 0.16365379095077515, "learning_rate": 6.237286331106984e-07, "loss": 0.1283, "num_input_tokens_seen": 123811536, "step": 57370 }, { "epoch": 9.359706362153345, "grad_norm": 0.18582594394683838, "learning_rate": 6.221495600707227e-07, "loss": 0.0113, "num_input_tokens_seen": 123820848, "step": 57375 }, { "epoch": 9.360522022838499, "grad_norm": 0.1327890008687973, "learning_rate": 6.205724632229987e-07, "loss": 0.0052, "num_input_tokens_seen": 123832112, "step": 57380 }, { "epoch": 9.361337683523654, "grad_norm": 0.05526406317949295, "learning_rate": 6.18997342695371e-07, "loss": 0.1159, "num_input_tokens_seen": 123842544, "step": 57385 }, { "epoch": 9.362153344208808, "grad_norm": 3.5303916931152344, "learning_rate": 6.174241986155238e-07, "loss": 0.1894, "num_input_tokens_seen": 123852816, "step": 57390 }, { "epoch": 9.362969004893964, "grad_norm": 0.06925785541534424, "learning_rate": 6.158530311109884e-07, "loss": 0.1415, "num_input_tokens_seen": 123863312, "step": 57395 }, { "epoch": 9.36378466557912, "grad_norm": 0.1369050145149231, "learning_rate": 6.142838403091322e-07, "loss": 0.0222, "num_input_tokens_seen": 123873936, "step": 57400 }, { "epoch": 9.364600326264274, "grad_norm": 13.540993690490723, "learning_rate": 6.127166263371592e-07, "loss": 0.2189, "num_input_tokens_seen": 123885648, "step": 57405 }, { "epoch": 9.36541598694943, "grad_norm": 0.09398102015256882, "learning_rate": 6.111513893221149e-07, "loss": 0.0838, "num_input_tokens_seen": 123896528, "step": 57410 }, { "epoch": 9.366231647634583, "grad_norm": 0.09548982232809067, "learning_rate": 6.095881293908867e-07, "loss": 0.0808, "num_input_tokens_seen": 123907728, "step": 57415 }, { "epoch": 9.367047308319739, "grad_norm": 0.11356570571660995, "learning_rate": 6.080268466702011e-07, "loss": 0.004, "num_input_tokens_seen": 123918128, "step": 57420 }, { "epoch": 9.367862969004895, "grad_norm": 0.31677040457725525, "learning_rate": 6.064675412866233e-07, "loss": 0.0081, "num_input_tokens_seen": 123929392, "step": 57425 }, { "epoch": 9.368678629690049, "grad_norm": 0.34641584753990173, "learning_rate": 6.049102133665552e-07, "loss": 0.1761, "num_input_tokens_seen": 123940688, "step": 57430 }, { "epoch": 9.369494290375204, "grad_norm": 0.041574303060770035, "learning_rate": 6.033548630362457e-07, "loss": 0.0078, "num_input_tokens_seen": 123950704, "step": 57435 }, { "epoch": 9.370309951060358, "grad_norm": 0.26669496297836304, "learning_rate": 6.018014904217801e-07, "loss": 0.0069, "num_input_tokens_seen": 123961552, "step": 57440 }, { "epoch": 9.371125611745514, "grad_norm": 0.10131517052650452, "learning_rate": 6.002500956490798e-07, "loss": 0.0067, "num_input_tokens_seen": 123971408, "step": 57445 }, { "epoch": 9.37194127243067, "grad_norm": 0.13997958600521088, "learning_rate": 5.987006788439109e-07, "loss": 0.0047, "num_input_tokens_seen": 123981264, "step": 57450 }, { "epoch": 9.372756933115824, "grad_norm": 0.17554813623428345, "learning_rate": 5.971532401318758e-07, "loss": 0.133, "num_input_tokens_seen": 123992944, "step": 57455 }, { "epoch": 9.37357259380098, "grad_norm": 0.18611003458499908, "learning_rate": 5.956077796384185e-07, "loss": 0.0985, "num_input_tokens_seen": 124004464, "step": 57460 }, { "epoch": 9.374388254486133, "grad_norm": 0.10970862954854965, "learning_rate": 5.940642974888195e-07, "loss": 0.0104, "num_input_tokens_seen": 124017072, "step": 57465 }, { "epoch": 9.375203915171289, "grad_norm": 0.06436091661453247, "learning_rate": 5.925227938082034e-07, "loss": 0.1005, "num_input_tokens_seen": 124027792, "step": 57470 }, { "epoch": 9.376019575856443, "grad_norm": 3.337602138519287, "learning_rate": 5.909832687215317e-07, "loss": 0.0926, "num_input_tokens_seen": 124039440, "step": 57475 }, { "epoch": 9.376835236541599, "grad_norm": 3.4454894065856934, "learning_rate": 5.894457223536071e-07, "loss": 0.1433, "num_input_tokens_seen": 124050608, "step": 57480 }, { "epoch": 9.377650897226754, "grad_norm": 0.0916593074798584, "learning_rate": 5.879101548290716e-07, "loss": 0.0908, "num_input_tokens_seen": 124061296, "step": 57485 }, { "epoch": 9.378466557911908, "grad_norm": 0.2612755298614502, "learning_rate": 5.863765662724036e-07, "loss": 0.3228, "num_input_tokens_seen": 124071920, "step": 57490 }, { "epoch": 9.379282218597064, "grad_norm": 0.09947580844163895, "learning_rate": 5.848449568079228e-07, "loss": 0.115, "num_input_tokens_seen": 124082608, "step": 57495 }, { "epoch": 9.380097879282218, "grad_norm": 0.14681026339530945, "learning_rate": 5.83315326559794e-07, "loss": 0.1171, "num_input_tokens_seen": 124094416, "step": 57500 }, { "epoch": 9.380913539967374, "grad_norm": 2.713966131210327, "learning_rate": 5.817876756520125e-07, "loss": 0.0842, "num_input_tokens_seen": 124105168, "step": 57505 }, { "epoch": 9.38172920065253, "grad_norm": 0.03372839093208313, "learning_rate": 5.80262004208415e-07, "loss": 0.0769, "num_input_tokens_seen": 124114992, "step": 57510 }, { "epoch": 9.382544861337683, "grad_norm": 2.9920592308044434, "learning_rate": 5.787383123526891e-07, "loss": 0.0075, "num_input_tokens_seen": 124125936, "step": 57515 }, { "epoch": 9.383360522022839, "grad_norm": 0.09146337956190109, "learning_rate": 5.772166002083467e-07, "loss": 0.0833, "num_input_tokens_seen": 124137616, "step": 57520 }, { "epoch": 9.384176182707993, "grad_norm": 3.874807119369507, "learning_rate": 5.756968678987451e-07, "loss": 0.1147, "num_input_tokens_seen": 124147856, "step": 57525 }, { "epoch": 9.384991843393149, "grad_norm": 0.15907549858093262, "learning_rate": 5.741791155470854e-07, "loss": 0.0058, "num_input_tokens_seen": 124159824, "step": 57530 }, { "epoch": 9.385807504078304, "grad_norm": 0.1259309947490692, "learning_rate": 5.726633432764e-07, "loss": 0.0044, "num_input_tokens_seen": 124169168, "step": 57535 }, { "epoch": 9.386623164763458, "grad_norm": 0.3803000748157501, "learning_rate": 5.711495512095682e-07, "loss": 0.0052, "num_input_tokens_seen": 124179952, "step": 57540 }, { "epoch": 9.387438825448614, "grad_norm": 0.07788604497909546, "learning_rate": 5.696377394693003e-07, "loss": 0.1984, "num_input_tokens_seen": 124189648, "step": 57545 }, { "epoch": 9.388254486133768, "grad_norm": 0.09048943966627121, "learning_rate": 5.681279081781593e-07, "loss": 0.0955, "num_input_tokens_seen": 124200336, "step": 57550 }, { "epoch": 9.389070146818923, "grad_norm": 0.638272762298584, "learning_rate": 5.66620057458539e-07, "loss": 0.1214, "num_input_tokens_seen": 124209264, "step": 57555 }, { "epoch": 9.38988580750408, "grad_norm": 0.030949106439948082, "learning_rate": 5.651141874326666e-07, "loss": 0.0039, "num_input_tokens_seen": 124219184, "step": 57560 }, { "epoch": 9.390701468189233, "grad_norm": 0.049306951463222504, "learning_rate": 5.636102982226221e-07, "loss": 0.0085, "num_input_tokens_seen": 124230800, "step": 57565 }, { "epoch": 9.391517128874389, "grad_norm": 0.09985353797674179, "learning_rate": 5.621083899503138e-07, "loss": 0.0032, "num_input_tokens_seen": 124242224, "step": 57570 }, { "epoch": 9.392332789559543, "grad_norm": 4.070796012878418, "learning_rate": 5.606084627374969e-07, "loss": 0.1413, "num_input_tokens_seen": 124252592, "step": 57575 }, { "epoch": 9.393148450244698, "grad_norm": 0.39609283208847046, "learning_rate": 5.591105167057631e-07, "loss": 0.0078, "num_input_tokens_seen": 124263760, "step": 57580 }, { "epoch": 9.393964110929852, "grad_norm": 0.14552120864391327, "learning_rate": 5.576145519765402e-07, "loss": 0.004, "num_input_tokens_seen": 124274704, "step": 57585 }, { "epoch": 9.394779771615008, "grad_norm": 0.24937283992767334, "learning_rate": 5.561205686711035e-07, "loss": 0.0061, "num_input_tokens_seen": 124284784, "step": 57590 }, { "epoch": 9.395595432300164, "grad_norm": 0.12961940467357635, "learning_rate": 5.546285669105589e-07, "loss": 0.0408, "num_input_tokens_seen": 124294512, "step": 57595 }, { "epoch": 9.396411092985318, "grad_norm": 0.31368333101272583, "learning_rate": 5.531385468158595e-07, "loss": 0.0046, "num_input_tokens_seen": 124306096, "step": 57600 }, { "epoch": 9.397226753670473, "grad_norm": 4.694768905639648, "learning_rate": 5.516505085077895e-07, "loss": 0.1096, "num_input_tokens_seen": 124317104, "step": 57605 }, { "epoch": 9.398042414355627, "grad_norm": 0.1727198362350464, "learning_rate": 5.501644521069799e-07, "loss": 0.0124, "num_input_tokens_seen": 124327792, "step": 57610 }, { "epoch": 9.398858075040783, "grad_norm": 0.04438969865441322, "learning_rate": 5.486803777338956e-07, "loss": 0.0879, "num_input_tokens_seen": 124340112, "step": 57615 }, { "epoch": 9.399673735725939, "grad_norm": 0.183514803647995, "learning_rate": 5.471982855088459e-07, "loss": 0.098, "num_input_tokens_seen": 124351184, "step": 57620 }, { "epoch": 9.400489396411093, "grad_norm": 0.06302642822265625, "learning_rate": 5.457181755519763e-07, "loss": 0.066, "num_input_tokens_seen": 124361456, "step": 57625 }, { "epoch": 9.401305057096248, "grad_norm": 6.795217514038086, "learning_rate": 5.442400479832715e-07, "loss": 0.0512, "num_input_tokens_seen": 124372848, "step": 57630 }, { "epoch": 9.402120717781402, "grad_norm": 16.359466552734375, "learning_rate": 5.427639029225551e-07, "loss": 0.0202, "num_input_tokens_seen": 124383664, "step": 57635 }, { "epoch": 9.402936378466558, "grad_norm": 0.09848824888467789, "learning_rate": 5.412897404894896e-07, "loss": 0.0053, "num_input_tokens_seen": 124394160, "step": 57640 }, { "epoch": 9.403752039151712, "grad_norm": 0.05733660236001015, "learning_rate": 5.398175608035821e-07, "loss": 0.1236, "num_input_tokens_seen": 124405584, "step": 57645 }, { "epoch": 9.404567699836868, "grad_norm": 12.071624755859375, "learning_rate": 5.38347363984168e-07, "loss": 0.0749, "num_input_tokens_seen": 124416560, "step": 57650 }, { "epoch": 9.405383360522023, "grad_norm": 0.1648532748222351, "learning_rate": 5.368791501504378e-07, "loss": 0.0059, "num_input_tokens_seen": 124426864, "step": 57655 }, { "epoch": 9.406199021207177, "grad_norm": 0.15438251197338104, "learning_rate": 5.354129194214103e-07, "loss": 0.0054, "num_input_tokens_seen": 124438384, "step": 57660 }, { "epoch": 9.407014681892333, "grad_norm": 0.1673191785812378, "learning_rate": 5.339486719159404e-07, "loss": 0.0133, "num_input_tokens_seen": 124449936, "step": 57665 }, { "epoch": 9.407830342577487, "grad_norm": 0.059828829020261765, "learning_rate": 5.324864077527331e-07, "loss": 0.2073, "num_input_tokens_seen": 124461040, "step": 57670 }, { "epoch": 9.408646003262643, "grad_norm": 0.28742295503616333, "learning_rate": 5.310261270503214e-07, "loss": 0.1346, "num_input_tokens_seen": 124472144, "step": 57675 }, { "epoch": 9.409461663947798, "grad_norm": 0.21105064451694489, "learning_rate": 5.295678299270884e-07, "loss": 0.0846, "num_input_tokens_seen": 124483792, "step": 57680 }, { "epoch": 9.410277324632952, "grad_norm": 0.1553102731704712, "learning_rate": 5.281115165012479e-07, "loss": 0.1999, "num_input_tokens_seen": 124495152, "step": 57685 }, { "epoch": 9.411092985318108, "grad_norm": 3.6352155208587646, "learning_rate": 5.266571868908582e-07, "loss": 0.1038, "num_input_tokens_seen": 124505360, "step": 57690 }, { "epoch": 9.411908646003262, "grad_norm": 0.22404715418815613, "learning_rate": 5.252048412138111e-07, "loss": 0.3082, "num_input_tokens_seen": 124515184, "step": 57695 }, { "epoch": 9.412724306688418, "grad_norm": 0.12199053168296814, "learning_rate": 5.237544795878457e-07, "loss": 0.1025, "num_input_tokens_seen": 124526416, "step": 57700 }, { "epoch": 9.413539967373573, "grad_norm": 4.170020580291748, "learning_rate": 5.22306102130532e-07, "loss": 0.1867, "num_input_tokens_seen": 124537648, "step": 57705 }, { "epoch": 9.414355628058727, "grad_norm": 0.051957301795482635, "learning_rate": 5.208597089592871e-07, "loss": 0.1987, "num_input_tokens_seen": 124547920, "step": 57710 }, { "epoch": 9.415171288743883, "grad_norm": 0.1836085319519043, "learning_rate": 5.194153001913588e-07, "loss": 0.0062, "num_input_tokens_seen": 124556624, "step": 57715 }, { "epoch": 9.415986949429037, "grad_norm": 3.2332894802093506, "learning_rate": 5.179728759438368e-07, "loss": 0.2768, "num_input_tokens_seen": 124566448, "step": 57720 }, { "epoch": 9.416802610114193, "grad_norm": 0.09326938539743423, "learning_rate": 5.165324363336582e-07, "loss": 0.0992, "num_input_tokens_seen": 124576624, "step": 57725 }, { "epoch": 9.417618270799348, "grad_norm": 0.11804001778364182, "learning_rate": 5.150939814775852e-07, "loss": 0.0694, "num_input_tokens_seen": 124587184, "step": 57730 }, { "epoch": 9.418433931484502, "grad_norm": 3.470707893371582, "learning_rate": 5.136575114922299e-07, "loss": 0.1025, "num_input_tokens_seen": 124598448, "step": 57735 }, { "epoch": 9.419249592169658, "grad_norm": 6.755199909210205, "learning_rate": 5.122230264940409e-07, "loss": 0.1097, "num_input_tokens_seen": 124610576, "step": 57740 }, { "epoch": 9.420065252854812, "grad_norm": 0.10873281955718994, "learning_rate": 5.107905265993001e-07, "loss": 0.006, "num_input_tokens_seen": 124620656, "step": 57745 }, { "epoch": 9.420880913539968, "grad_norm": 0.06490042805671692, "learning_rate": 5.09360011924137e-07, "loss": 0.066, "num_input_tokens_seen": 124631344, "step": 57750 }, { "epoch": 9.421696574225122, "grad_norm": 8.283732414245605, "learning_rate": 5.079314825845144e-07, "loss": 0.098, "num_input_tokens_seen": 124641872, "step": 57755 }, { "epoch": 9.422512234910277, "grad_norm": 0.10572148114442825, "learning_rate": 5.065049386962395e-07, "loss": 0.1146, "num_input_tokens_seen": 124652464, "step": 57760 }, { "epoch": 9.423327895595433, "grad_norm": 0.04828275740146637, "learning_rate": 5.050803803749532e-07, "loss": 0.0065, "num_input_tokens_seen": 124662896, "step": 57765 }, { "epoch": 9.424143556280587, "grad_norm": 0.14845052361488342, "learning_rate": 5.036578077361381e-07, "loss": 0.0052, "num_input_tokens_seen": 124674480, "step": 57770 }, { "epoch": 9.424959216965743, "grad_norm": 0.08852400630712509, "learning_rate": 5.022372208951131e-07, "loss": 0.0095, "num_input_tokens_seen": 124684240, "step": 57775 }, { "epoch": 9.425774877650896, "grad_norm": 0.06310316920280457, "learning_rate": 5.008186199670389e-07, "loss": 0.0028, "num_input_tokens_seen": 124696336, "step": 57780 }, { "epoch": 9.426590538336052, "grad_norm": 0.19328726828098297, "learning_rate": 4.994020050669152e-07, "loss": 0.1241, "num_input_tokens_seen": 124708432, "step": 57785 }, { "epoch": 9.427406199021208, "grad_norm": 0.09310528635978699, "learning_rate": 4.979873763095805e-07, "loss": 0.0279, "num_input_tokens_seen": 124721072, "step": 57790 }, { "epoch": 9.428221859706362, "grad_norm": 0.09432114660739899, "learning_rate": 4.965747338097099e-07, "loss": 0.0948, "num_input_tokens_seen": 124731696, "step": 57795 }, { "epoch": 9.429037520391518, "grad_norm": 0.17877328395843506, "learning_rate": 4.951640776818228e-07, "loss": 0.136, "num_input_tokens_seen": 124742416, "step": 57800 }, { "epoch": 9.429853181076671, "grad_norm": 0.1892382949590683, "learning_rate": 4.937554080402695e-07, "loss": 0.1272, "num_input_tokens_seen": 124752784, "step": 57805 }, { "epoch": 9.430668841761827, "grad_norm": 5.177409648895264, "learning_rate": 4.923487249992476e-07, "loss": 0.2363, "num_input_tokens_seen": 124763248, "step": 57810 }, { "epoch": 9.431484502446983, "grad_norm": 0.07287146896123886, "learning_rate": 4.909440286727879e-07, "loss": 0.3061, "num_input_tokens_seen": 124774576, "step": 57815 }, { "epoch": 9.432300163132137, "grad_norm": 4.4076056480407715, "learning_rate": 4.895413191747633e-07, "loss": 0.1904, "num_input_tokens_seen": 124784368, "step": 57820 }, { "epoch": 9.433115823817293, "grad_norm": 0.03391724079847336, "learning_rate": 4.881405966188801e-07, "loss": 0.1205, "num_input_tokens_seen": 124793904, "step": 57825 }, { "epoch": 9.433931484502446, "grad_norm": 0.044700562953948975, "learning_rate": 4.867418611186974e-07, "loss": 0.0039, "num_input_tokens_seen": 124804848, "step": 57830 }, { "epoch": 9.434747145187602, "grad_norm": 0.28951773047447205, "learning_rate": 4.853451127875968e-07, "loss": 0.1278, "num_input_tokens_seen": 124817840, "step": 57835 }, { "epoch": 9.435562805872756, "grad_norm": 0.12186767905950546, "learning_rate": 4.839503517388072e-07, "loss": 0.0055, "num_input_tokens_seen": 124828080, "step": 57840 }, { "epoch": 9.436378466557912, "grad_norm": 0.10500957816839218, "learning_rate": 4.825575780853964e-07, "loss": 0.005, "num_input_tokens_seen": 124839632, "step": 57845 }, { "epoch": 9.437194127243067, "grad_norm": 0.3825618326663971, "learning_rate": 4.811667919402685e-07, "loss": 0.0913, "num_input_tokens_seen": 124852176, "step": 57850 }, { "epoch": 9.438009787928221, "grad_norm": 0.18801839649677277, "learning_rate": 4.797779934161667e-07, "loss": 0.161, "num_input_tokens_seen": 124862768, "step": 57855 }, { "epoch": 9.438825448613377, "grad_norm": 6.414939880371094, "learning_rate": 4.78391182625676e-07, "loss": 0.1748, "num_input_tokens_seen": 124873936, "step": 57860 }, { "epoch": 9.439641109298531, "grad_norm": 0.04732104018330574, "learning_rate": 4.770063596812146e-07, "loss": 0.0807, "num_input_tokens_seen": 124884912, "step": 57865 }, { "epoch": 9.440456769983687, "grad_norm": 0.07606936246156693, "learning_rate": 4.7562352469504855e-07, "loss": 0.0046, "num_input_tokens_seen": 124895632, "step": 57870 }, { "epoch": 9.441272430668842, "grad_norm": 0.059153228998184204, "learning_rate": 4.7424267777927414e-07, "loss": 0.0026, "num_input_tokens_seen": 124906640, "step": 57875 }, { "epoch": 9.442088091353996, "grad_norm": 0.04881925880908966, "learning_rate": 4.728638190458323e-07, "loss": 0.0048, "num_input_tokens_seen": 124917392, "step": 57880 }, { "epoch": 9.442903752039152, "grad_norm": 0.13776183128356934, "learning_rate": 4.7148694860649765e-07, "loss": 0.005, "num_input_tokens_seen": 124927408, "step": 57885 }, { "epoch": 9.443719412724306, "grad_norm": 0.10099656879901886, "learning_rate": 4.701120665728892e-07, "loss": 0.0964, "num_input_tokens_seen": 124938160, "step": 57890 }, { "epoch": 9.444535073409462, "grad_norm": 0.04308278486132622, "learning_rate": 4.687391730564594e-07, "loss": 0.0055, "num_input_tokens_seen": 124947376, "step": 57895 }, { "epoch": 9.445350734094617, "grad_norm": 0.10519593209028244, "learning_rate": 4.673682681684999e-07, "loss": 0.0677, "num_input_tokens_seen": 124958576, "step": 57900 }, { "epoch": 9.446166394779771, "grad_norm": 0.17816418409347534, "learning_rate": 4.6599935202014943e-07, "loss": 0.2961, "num_input_tokens_seen": 124968944, "step": 57905 }, { "epoch": 9.446982055464927, "grad_norm": 0.6837834715843201, "learning_rate": 4.646324247223749e-07, "loss": 0.0211, "num_input_tokens_seen": 124980496, "step": 57910 }, { "epoch": 9.447797716150081, "grad_norm": 18.661529541015625, "learning_rate": 4.6326748638598485e-07, "loss": 0.157, "num_input_tokens_seen": 124990256, "step": 57915 }, { "epoch": 9.448613376835237, "grad_norm": 0.057231463491916656, "learning_rate": 4.619045371216324e-07, "loss": 0.1044, "num_input_tokens_seen": 125001840, "step": 57920 }, { "epoch": 9.449429037520392, "grad_norm": 0.04664487764239311, "learning_rate": 4.605435770398042e-07, "loss": 0.0549, "num_input_tokens_seen": 125012272, "step": 57925 }, { "epoch": 9.450244698205546, "grad_norm": 8.235623359680176, "learning_rate": 4.591846062508232e-07, "loss": 0.1696, "num_input_tokens_seen": 125023024, "step": 57930 }, { "epoch": 9.451060358890702, "grad_norm": 0.13234318792819977, "learning_rate": 4.578276248648594e-07, "loss": 0.0821, "num_input_tokens_seen": 125035184, "step": 57935 }, { "epoch": 9.451876019575856, "grad_norm": 0.6127078533172607, "learning_rate": 4.5647263299191113e-07, "loss": 0.0045, "num_input_tokens_seen": 125045424, "step": 57940 }, { "epoch": 9.452691680261012, "grad_norm": 9.226028442382812, "learning_rate": 4.5511963074182653e-07, "loss": 0.2673, "num_input_tokens_seen": 125055504, "step": 57945 }, { "epoch": 9.453507340946166, "grad_norm": 0.32815080881118774, "learning_rate": 4.5376861822428176e-07, "loss": 0.1972, "num_input_tokens_seen": 125065744, "step": 57950 }, { "epoch": 9.454323001631321, "grad_norm": 0.5989218354225159, "learning_rate": 4.524195955488031e-07, "loss": 0.1054, "num_input_tokens_seen": 125076720, "step": 57955 }, { "epoch": 9.455138662316477, "grad_norm": 0.24300257861614227, "learning_rate": 4.5107256282474196e-07, "loss": 0.3522, "num_input_tokens_seen": 125087504, "step": 57960 }, { "epoch": 9.455954323001631, "grad_norm": 0.14050297439098358, "learning_rate": 4.4972752016129995e-07, "loss": 0.0249, "num_input_tokens_seen": 125099408, "step": 57965 }, { "epoch": 9.456769983686787, "grad_norm": 56.48374557495117, "learning_rate": 4.483844676675092e-07, "loss": 0.1118, "num_input_tokens_seen": 125109904, "step": 57970 }, { "epoch": 9.45758564437194, "grad_norm": 0.13741572201251984, "learning_rate": 4.4704340545224934e-07, "loss": 0.0132, "num_input_tokens_seen": 125120144, "step": 57975 }, { "epoch": 9.458401305057096, "grad_norm": 3.443633794784546, "learning_rate": 4.457043336242306e-07, "loss": 0.1897, "num_input_tokens_seen": 125131056, "step": 57980 }, { "epoch": 9.459216965742252, "grad_norm": 2.717332601547241, "learning_rate": 4.443672522920078e-07, "loss": 0.1386, "num_input_tokens_seen": 125141072, "step": 57985 }, { "epoch": 9.460032626427406, "grad_norm": 0.04186335578560829, "learning_rate": 4.4303216156396933e-07, "loss": 0.0061, "num_input_tokens_seen": 125152656, "step": 57990 }, { "epoch": 9.460848287112562, "grad_norm": 0.09012457728385925, "learning_rate": 4.416990615483396e-07, "loss": 0.115, "num_input_tokens_seen": 125164880, "step": 57995 }, { "epoch": 9.461663947797716, "grad_norm": 0.19651956856250763, "learning_rate": 4.4036795235319617e-07, "loss": 0.0072, "num_input_tokens_seen": 125175792, "step": 58000 }, { "epoch": 9.462479608482871, "grad_norm": 0.5376098155975342, "learning_rate": 4.390388340864415e-07, "loss": 0.0346, "num_input_tokens_seen": 125187152, "step": 58005 }, { "epoch": 9.463295269168025, "grad_norm": 0.05495506897568703, "learning_rate": 4.377117068558201e-07, "loss": 0.0045, "num_input_tokens_seen": 125197072, "step": 58010 }, { "epoch": 9.464110929853181, "grad_norm": 5.769318580627441, "learning_rate": 4.363865707689152e-07, "loss": 0.0105, "num_input_tokens_seen": 125208144, "step": 58015 }, { "epoch": 9.464926590538337, "grad_norm": 0.17950724065303802, "learning_rate": 4.350634259331465e-07, "loss": 0.0974, "num_input_tokens_seen": 125218448, "step": 58020 }, { "epoch": 9.46574225122349, "grad_norm": 2.7524445056915283, "learning_rate": 4.33742272455781e-07, "loss": 0.2416, "num_input_tokens_seen": 125229168, "step": 58025 }, { "epoch": 9.466557911908646, "grad_norm": 0.229463130235672, "learning_rate": 4.324231104439136e-07, "loss": 0.0923, "num_input_tokens_seen": 125240496, "step": 58030 }, { "epoch": 9.4673735725938, "grad_norm": 0.1134629175066948, "learning_rate": 4.3110594000448365e-07, "loss": 0.1163, "num_input_tokens_seen": 125251408, "step": 58035 }, { "epoch": 9.468189233278956, "grad_norm": 0.019398687407374382, "learning_rate": 4.29790761244267e-07, "loss": 0.1041, "num_input_tokens_seen": 125262128, "step": 58040 }, { "epoch": 9.469004893964112, "grad_norm": 0.08468424528837204, "learning_rate": 4.2847757426988097e-07, "loss": 0.0041, "num_input_tokens_seen": 125272816, "step": 58045 }, { "epoch": 9.469820554649266, "grad_norm": 0.3412284553050995, "learning_rate": 4.271663791877767e-07, "loss": 0.1515, "num_input_tokens_seen": 125283792, "step": 58050 }, { "epoch": 9.470636215334421, "grad_norm": 4.713772773742676, "learning_rate": 4.258571761042468e-07, "loss": 0.1922, "num_input_tokens_seen": 125294960, "step": 58055 }, { "epoch": 9.471451876019575, "grad_norm": 0.0300301481038332, "learning_rate": 4.2454996512542033e-07, "loss": 0.054, "num_input_tokens_seen": 125304816, "step": 58060 }, { "epoch": 9.47226753670473, "grad_norm": 0.27174460887908936, "learning_rate": 4.2324474635727085e-07, "loss": 0.0666, "num_input_tokens_seen": 125314448, "step": 58065 }, { "epoch": 9.473083197389887, "grad_norm": 0.14359116554260254, "learning_rate": 4.219415199056026e-07, "loss": 0.0037, "num_input_tokens_seen": 125325584, "step": 58070 }, { "epoch": 9.47389885807504, "grad_norm": 0.27878323197364807, "learning_rate": 4.2064028587606163e-07, "loss": 0.0049, "num_input_tokens_seen": 125337328, "step": 58075 }, { "epoch": 9.474714518760196, "grad_norm": 0.22504855692386627, "learning_rate": 4.19341044374133e-07, "loss": 0.0062, "num_input_tokens_seen": 125348336, "step": 58080 }, { "epoch": 9.47553017944535, "grad_norm": 0.06329631060361862, "learning_rate": 4.180437955051436e-07, "loss": 0.1611, "num_input_tokens_seen": 125359152, "step": 58085 }, { "epoch": 9.476345840130506, "grad_norm": 0.08869102597236633, "learning_rate": 4.167485393742482e-07, "loss": 0.0036, "num_input_tokens_seen": 125371792, "step": 58090 }, { "epoch": 9.477161500815662, "grad_norm": 0.058582451194524765, "learning_rate": 4.1545527608645163e-07, "loss": 0.0945, "num_input_tokens_seen": 125382960, "step": 58095 }, { "epoch": 9.477977161500815, "grad_norm": 4.101239204406738, "learning_rate": 4.1416400574659233e-07, "loss": 0.485, "num_input_tokens_seen": 125392752, "step": 58100 }, { "epoch": 9.478792822185971, "grad_norm": 0.1705857366323471, "learning_rate": 4.128747284593448e-07, "loss": 0.0626, "num_input_tokens_seen": 125404080, "step": 58105 }, { "epoch": 9.479608482871125, "grad_norm": 2.991602659225464, "learning_rate": 4.1158744432922835e-07, "loss": 0.1067, "num_input_tokens_seen": 125415568, "step": 58110 }, { "epoch": 9.48042414355628, "grad_norm": 0.2598673105239868, "learning_rate": 4.1030215346059e-07, "loss": 0.0563, "num_input_tokens_seen": 125426832, "step": 58115 }, { "epoch": 9.481239804241435, "grad_norm": 3.7980761528015137, "learning_rate": 4.090188559576269e-07, "loss": 0.105, "num_input_tokens_seen": 125436656, "step": 58120 }, { "epoch": 9.48205546492659, "grad_norm": 0.20062430202960968, "learning_rate": 4.077375519243698e-07, "loss": 0.0139, "num_input_tokens_seen": 125447440, "step": 58125 }, { "epoch": 9.482871125611746, "grad_norm": 7.75474214553833, "learning_rate": 4.0645824146468834e-07, "loss": 0.1233, "num_input_tokens_seen": 125458928, "step": 58130 }, { "epoch": 9.4836867862969, "grad_norm": 0.10313055664300919, "learning_rate": 4.0518092468228297e-07, "loss": 0.0067, "num_input_tokens_seen": 125469520, "step": 58135 }, { "epoch": 9.484502446982056, "grad_norm": 0.1118944063782692, "learning_rate": 4.0390560168070966e-07, "loss": 0.0042, "num_input_tokens_seen": 125480016, "step": 58140 }, { "epoch": 9.48531810766721, "grad_norm": 0.06879203021526337, "learning_rate": 4.026322725633441e-07, "loss": 0.0038, "num_input_tokens_seen": 125490608, "step": 58145 }, { "epoch": 9.486133768352365, "grad_norm": 0.12433100491762161, "learning_rate": 4.0136093743341485e-07, "loss": 0.0892, "num_input_tokens_seen": 125501392, "step": 58150 }, { "epoch": 9.486949429037521, "grad_norm": 0.09972847253084183, "learning_rate": 4.000915963939783e-07, "loss": 0.1006, "num_input_tokens_seen": 125510448, "step": 58155 }, { "epoch": 9.487765089722675, "grad_norm": 0.1359276920557022, "learning_rate": 3.988242495479383e-07, "loss": 0.1287, "num_input_tokens_seen": 125521968, "step": 58160 }, { "epoch": 9.48858075040783, "grad_norm": 0.08717767894268036, "learning_rate": 3.9755889699802926e-07, "loss": 0.1284, "num_input_tokens_seen": 125533360, "step": 58165 }, { "epoch": 9.489396411092985, "grad_norm": 0.10583186894655228, "learning_rate": 3.962955388468248e-07, "loss": 0.0046, "num_input_tokens_seen": 125544400, "step": 58170 }, { "epoch": 9.49021207177814, "grad_norm": 0.04262809455394745, "learning_rate": 3.950341751967457e-07, "loss": 0.0052, "num_input_tokens_seen": 125555568, "step": 58175 }, { "epoch": 9.491027732463296, "grad_norm": 0.1110820323228836, "learning_rate": 3.9377480615003794e-07, "loss": 0.2439, "num_input_tokens_seen": 125566224, "step": 58180 }, { "epoch": 9.49184339314845, "grad_norm": 0.1875482201576233, "learning_rate": 3.9251743180879483e-07, "loss": 0.0306, "num_input_tokens_seen": 125576976, "step": 58185 }, { "epoch": 9.492659053833606, "grad_norm": 0.2513222396373749, "learning_rate": 3.9126205227494605e-07, "loss": 0.0071, "num_input_tokens_seen": 125587856, "step": 58190 }, { "epoch": 9.49347471451876, "grad_norm": 6.3517632484436035, "learning_rate": 3.900086676502601e-07, "loss": 0.1194, "num_input_tokens_seen": 125598832, "step": 58195 }, { "epoch": 9.494290375203915, "grad_norm": 0.08826222270727158, "learning_rate": 3.8875727803634186e-07, "loss": 0.1277, "num_input_tokens_seen": 125610832, "step": 58200 }, { "epoch": 9.49510603588907, "grad_norm": 0.5476817488670349, "learning_rate": 3.8750788353463243e-07, "loss": 0.0902, "num_input_tokens_seen": 125621104, "step": 58205 }, { "epoch": 9.495921696574225, "grad_norm": 0.0348920039832592, "learning_rate": 3.862604842464201e-07, "loss": 0.0948, "num_input_tokens_seen": 125632752, "step": 58210 }, { "epoch": 9.49673735725938, "grad_norm": 0.1593414843082428, "learning_rate": 3.8501508027281865e-07, "loss": 0.0044, "num_input_tokens_seen": 125642832, "step": 58215 }, { "epoch": 9.497553017944535, "grad_norm": 0.04932843893766403, "learning_rate": 3.8377167171479154e-07, "loss": 0.0838, "num_input_tokens_seen": 125653840, "step": 58220 }, { "epoch": 9.49836867862969, "grad_norm": 0.16821761429309845, "learning_rate": 3.825302586731333e-07, "loss": 0.091, "num_input_tokens_seen": 125665104, "step": 58225 }, { "epoch": 9.499184339314844, "grad_norm": 0.13988712430000305, "learning_rate": 3.8129084124848e-07, "loss": 0.0044, "num_input_tokens_seen": 125675824, "step": 58230 }, { "epoch": 9.5, "grad_norm": 3.213824510574341, "learning_rate": 3.800534195413069e-07, "loss": 0.1249, "num_input_tokens_seen": 125686064, "step": 58235 }, { "epoch": 9.5, "eval_loss": 0.19253920018672943, "eval_runtime": 568.4597, "eval_samples_per_second": 4.794, "eval_steps_per_second": 1.2, "num_input_tokens_seen": 125686064, "step": 58235 }, { "epoch": 9.500815660685156, "grad_norm": 3.043863296508789, "learning_rate": 3.788179936519226e-07, "loss": 0.1592, "num_input_tokens_seen": 125697296, "step": 58240 }, { "epoch": 9.50163132137031, "grad_norm": 0.08366536349058151, "learning_rate": 3.775845636804776e-07, "loss": 0.0057, "num_input_tokens_seen": 125708688, "step": 58245 }, { "epoch": 9.502446982055465, "grad_norm": 0.06130436807870865, "learning_rate": 3.7635312972696404e-07, "loss": 0.0042, "num_input_tokens_seen": 125719792, "step": 58250 }, { "epoch": 9.50326264274062, "grad_norm": 0.10451815277338028, "learning_rate": 3.751236918912021e-07, "loss": 0.0058, "num_input_tokens_seen": 125729680, "step": 58255 }, { "epoch": 9.504078303425775, "grad_norm": 0.03382956609129906, "learning_rate": 3.7389625027285936e-07, "loss": 0.171, "num_input_tokens_seen": 125739472, "step": 58260 }, { "epoch": 9.50489396411093, "grad_norm": 0.05498937889933586, "learning_rate": 3.726708049714367e-07, "loss": 0.0024, "num_input_tokens_seen": 125751952, "step": 58265 }, { "epoch": 9.505709624796085, "grad_norm": 19.4271183013916, "learning_rate": 3.714473560862797e-07, "loss": 0.0378, "num_input_tokens_seen": 125762992, "step": 58270 }, { "epoch": 9.50652528548124, "grad_norm": 0.08409254997968674, "learning_rate": 3.702259037165617e-07, "loss": 0.2106, "num_input_tokens_seen": 125774480, "step": 58275 }, { "epoch": 9.507340946166394, "grad_norm": 10.254300117492676, "learning_rate": 3.690064479613009e-07, "loss": 0.0538, "num_input_tokens_seen": 125784944, "step": 58280 }, { "epoch": 9.50815660685155, "grad_norm": 4.610869407653809, "learning_rate": 3.67788988919357e-07, "loss": 0.2103, "num_input_tokens_seen": 125795120, "step": 58285 }, { "epoch": 9.508972267536706, "grad_norm": 10.56911849975586, "learning_rate": 3.665735266894177e-07, "loss": 0.1609, "num_input_tokens_seen": 125806640, "step": 58290 }, { "epoch": 9.50978792822186, "grad_norm": 18.840635299682617, "learning_rate": 3.653600613700209e-07, "loss": 0.036, "num_input_tokens_seen": 125817712, "step": 58295 }, { "epoch": 9.510603588907015, "grad_norm": 0.27261725068092346, "learning_rate": 3.6414859305952955e-07, "loss": 0.2522, "num_input_tokens_seen": 125829008, "step": 58300 }, { "epoch": 9.51141924959217, "grad_norm": 3.41131329536438, "learning_rate": 3.629391218561512e-07, "loss": 0.1678, "num_input_tokens_seen": 125840016, "step": 58305 }, { "epoch": 9.512234910277325, "grad_norm": 0.09870193898677826, "learning_rate": 3.6173164785794076e-07, "loss": 0.0045, "num_input_tokens_seen": 125850608, "step": 58310 }, { "epoch": 9.513050570962479, "grad_norm": 9.098423957824707, "learning_rate": 3.605261711627728e-07, "loss": 0.0976, "num_input_tokens_seen": 125863312, "step": 58315 }, { "epoch": 9.513866231647635, "grad_norm": 0.09959303587675095, "learning_rate": 3.593226918683745e-07, "loss": 0.0034, "num_input_tokens_seen": 125874960, "step": 58320 }, { "epoch": 9.51468189233279, "grad_norm": 0.18624134361743927, "learning_rate": 3.5812121007230414e-07, "loss": 0.006, "num_input_tokens_seen": 125885264, "step": 58325 }, { "epoch": 9.515497553017944, "grad_norm": 0.0898330882191658, "learning_rate": 3.569217258719587e-07, "loss": 0.1345, "num_input_tokens_seen": 125896624, "step": 58330 }, { "epoch": 9.5163132137031, "grad_norm": 3.7100512981414795, "learning_rate": 3.557242393645771e-07, "loss": 0.1631, "num_input_tokens_seen": 125907056, "step": 58335 }, { "epoch": 9.517128874388254, "grad_norm": 0.24952083826065063, "learning_rate": 3.5452875064723445e-07, "loss": 0.2797, "num_input_tokens_seen": 125917616, "step": 58340 }, { "epoch": 9.51794453507341, "grad_norm": 0.2295496165752411, "learning_rate": 3.5333525981683937e-07, "loss": 0.2073, "num_input_tokens_seen": 125927984, "step": 58345 }, { "epoch": 9.518760195758565, "grad_norm": 4.289388656616211, "learning_rate": 3.521437669701422e-07, "loss": 0.106, "num_input_tokens_seen": 125938736, "step": 58350 }, { "epoch": 9.51957585644372, "grad_norm": 9.160137176513672, "learning_rate": 3.5095427220373513e-07, "loss": 0.2638, "num_input_tokens_seen": 125948816, "step": 58355 }, { "epoch": 9.520391517128875, "grad_norm": 0.06078990548849106, "learning_rate": 3.497667756140438e-07, "loss": 0.0036, "num_input_tokens_seen": 125959504, "step": 58360 }, { "epoch": 9.521207177814029, "grad_norm": 0.1819821298122406, "learning_rate": 3.4858127729733015e-07, "loss": 0.0325, "num_input_tokens_seen": 125971504, "step": 58365 }, { "epoch": 9.522022838499185, "grad_norm": 0.08073264360427856, "learning_rate": 3.473977773496978e-07, "loss": 0.0682, "num_input_tokens_seen": 125981840, "step": 58370 }, { "epoch": 9.522838499184338, "grad_norm": 0.015772346407175064, "learning_rate": 3.462162758670895e-07, "loss": 0.1356, "num_input_tokens_seen": 125992432, "step": 58375 }, { "epoch": 9.523654159869494, "grad_norm": 3.9206371307373047, "learning_rate": 3.4503677294527857e-07, "loss": 0.3046, "num_input_tokens_seen": 126002576, "step": 58380 }, { "epoch": 9.52446982055465, "grad_norm": 0.09783914685249329, "learning_rate": 3.438592686798886e-07, "loss": 0.0091, "num_input_tokens_seen": 126011920, "step": 58385 }, { "epoch": 9.525285481239804, "grad_norm": 0.06690984964370728, "learning_rate": 3.4268376316636816e-07, "loss": 0.0039, "num_input_tokens_seen": 126022864, "step": 58390 }, { "epoch": 9.52610114192496, "grad_norm": 0.031618401408195496, "learning_rate": 3.4151025650001056e-07, "loss": 0.0023, "num_input_tokens_seen": 126034960, "step": 58395 }, { "epoch": 9.526916802610113, "grad_norm": 0.13895615935325623, "learning_rate": 3.4033874877595074e-07, "loss": 0.0151, "num_input_tokens_seen": 126046672, "step": 58400 }, { "epoch": 9.52773246329527, "grad_norm": 0.06268240511417389, "learning_rate": 3.3916924008915163e-07, "loss": 0.0068, "num_input_tokens_seen": 126058000, "step": 58405 }, { "epoch": 9.528548123980425, "grad_norm": 0.09085869044065475, "learning_rate": 3.3800173053442354e-07, "loss": 0.003, "num_input_tokens_seen": 126068496, "step": 58410 }, { "epoch": 9.529363784665579, "grad_norm": 0.11956500262022018, "learning_rate": 3.3683622020640736e-07, "loss": 0.0331, "num_input_tokens_seen": 126078544, "step": 58415 }, { "epoch": 9.530179445350734, "grad_norm": 0.46510857343673706, "learning_rate": 3.356727091995859e-07, "loss": 0.318, "num_input_tokens_seen": 126089712, "step": 58420 }, { "epoch": 9.530995106035888, "grad_norm": 0.032259371131658554, "learning_rate": 3.3451119760828374e-07, "loss": 0.0792, "num_input_tokens_seen": 126100656, "step": 58425 }, { "epoch": 9.531810766721044, "grad_norm": 0.1584005057811737, "learning_rate": 3.333516855266533e-07, "loss": 0.2102, "num_input_tokens_seen": 126111376, "step": 58430 }, { "epoch": 9.5326264274062, "grad_norm": 0.07590462267398834, "learning_rate": 3.321941730486916e-07, "loss": 0.0088, "num_input_tokens_seen": 126122640, "step": 58435 }, { "epoch": 9.533442088091354, "grad_norm": 0.33584555983543396, "learning_rate": 3.3103866026823473e-07, "loss": 0.0088, "num_input_tokens_seen": 126132432, "step": 58440 }, { "epoch": 9.53425774877651, "grad_norm": 0.05497417226433754, "learning_rate": 3.2988514727895217e-07, "loss": 0.0028, "num_input_tokens_seen": 126142576, "step": 58445 }, { "epoch": 9.535073409461663, "grad_norm": 0.08528464287519455, "learning_rate": 3.287336341743524e-07, "loss": 0.0036, "num_input_tokens_seen": 126153552, "step": 58450 }, { "epoch": 9.535889070146819, "grad_norm": 0.0931963175535202, "learning_rate": 3.275841210477887e-07, "loss": 0.1672, "num_input_tokens_seen": 126165072, "step": 58455 }, { "epoch": 9.536704730831975, "grad_norm": 0.11097992956638336, "learning_rate": 3.264366079924419e-07, "loss": 0.007, "num_input_tokens_seen": 126175696, "step": 58460 }, { "epoch": 9.537520391517129, "grad_norm": 0.05439147725701332, "learning_rate": 3.252910951013349e-07, "loss": 0.0553, "num_input_tokens_seen": 126187088, "step": 58465 }, { "epoch": 9.538336052202284, "grad_norm": 0.1262970119714737, "learning_rate": 3.2414758246733234e-07, "loss": 0.1925, "num_input_tokens_seen": 126198480, "step": 58470 }, { "epoch": 9.539151712887438, "grad_norm": 0.46631112694740295, "learning_rate": 3.2300607018312944e-07, "loss": 0.0052, "num_input_tokens_seen": 126209904, "step": 58475 }, { "epoch": 9.539967373572594, "grad_norm": 0.08133397996425629, "learning_rate": 3.2186655834126335e-07, "loss": 0.0042, "num_input_tokens_seen": 126219792, "step": 58480 }, { "epoch": 9.540783034257748, "grad_norm": 0.1267090141773224, "learning_rate": 3.207290470341101e-07, "loss": 0.0105, "num_input_tokens_seen": 126230256, "step": 58485 }, { "epoch": 9.541598694942904, "grad_norm": 0.06662649661302567, "learning_rate": 3.1959353635388214e-07, "loss": 0.1066, "num_input_tokens_seen": 126241136, "step": 58490 }, { "epoch": 9.54241435562806, "grad_norm": 0.4028434753417969, "learning_rate": 3.1846002639263074e-07, "loss": 0.0069, "num_input_tokens_seen": 126252048, "step": 58495 }, { "epoch": 9.543230016313213, "grad_norm": 0.09449943155050278, "learning_rate": 3.17328517242238e-07, "loss": 0.1113, "num_input_tokens_seen": 126262320, "step": 58500 }, { "epoch": 9.544045676998369, "grad_norm": 4.737239837646484, "learning_rate": 3.161990089944389e-07, "loss": 0.3361, "num_input_tokens_seen": 126272464, "step": 58505 }, { "epoch": 9.544861337683523, "grad_norm": 0.17989328503608704, "learning_rate": 3.150715017407907e-07, "loss": 0.0036, "num_input_tokens_seen": 126283984, "step": 58510 }, { "epoch": 9.545676998368679, "grad_norm": 0.09410925954580307, "learning_rate": 3.1394599557269534e-07, "loss": 0.0155, "num_input_tokens_seen": 126295024, "step": 58515 }, { "epoch": 9.546492659053834, "grad_norm": 0.03972543403506279, "learning_rate": 3.128224905813965e-07, "loss": 0.0909, "num_input_tokens_seen": 126305296, "step": 58520 }, { "epoch": 9.547308319738988, "grad_norm": 0.05499481409788132, "learning_rate": 3.1170098685796565e-07, "loss": 0.0051, "num_input_tokens_seen": 126317264, "step": 58525 }, { "epoch": 9.548123980424144, "grad_norm": 0.32111480832099915, "learning_rate": 3.1058148449331914e-07, "loss": 0.0893, "num_input_tokens_seen": 126326640, "step": 58530 }, { "epoch": 9.548939641109298, "grad_norm": 0.15600375831127167, "learning_rate": 3.09463983578212e-07, "loss": 0.007, "num_input_tokens_seen": 126337840, "step": 58535 }, { "epoch": 9.549755301794454, "grad_norm": 15.249553680419922, "learning_rate": 3.0834848420323305e-07, "loss": 0.1863, "num_input_tokens_seen": 126347792, "step": 58540 }, { "epoch": 9.550570962479608, "grad_norm": 0.20458751916885376, "learning_rate": 3.0723498645880976e-07, "loss": 0.0073, "num_input_tokens_seen": 126359344, "step": 58545 }, { "epoch": 9.551386623164763, "grad_norm": 0.1945304274559021, "learning_rate": 3.061234904352089e-07, "loss": 0.1916, "num_input_tokens_seen": 126370064, "step": 58550 }, { "epoch": 9.552202283849919, "grad_norm": 0.18202432990074158, "learning_rate": 3.0501399622253344e-07, "loss": 0.0049, "num_input_tokens_seen": 126380080, "step": 58555 }, { "epoch": 9.553017944535073, "grad_norm": 0.618599534034729, "learning_rate": 3.0390650391072527e-07, "loss": 0.0072, "num_input_tokens_seen": 126390864, "step": 58560 }, { "epoch": 9.553833605220229, "grad_norm": 0.07423996925354004, "learning_rate": 3.028010135895598e-07, "loss": 0.0576, "num_input_tokens_seen": 126402224, "step": 58565 }, { "epoch": 9.554649265905383, "grad_norm": 0.09729321300983429, "learning_rate": 3.016975253486598e-07, "loss": 0.0036, "num_input_tokens_seen": 126412464, "step": 58570 }, { "epoch": 9.555464926590538, "grad_norm": 0.2405720204114914, "learning_rate": 3.0059603927747313e-07, "loss": 0.0124, "num_input_tokens_seen": 126423600, "step": 58575 }, { "epoch": 9.556280587275694, "grad_norm": 0.2697793245315552, "learning_rate": 2.9949655546529785e-07, "loss": 0.064, "num_input_tokens_seen": 126434928, "step": 58580 }, { "epoch": 9.557096247960848, "grad_norm": 0.12202146649360657, "learning_rate": 2.9839907400125986e-07, "loss": 0.0051, "num_input_tokens_seen": 126444944, "step": 58585 }, { "epoch": 9.557911908646004, "grad_norm": 0.0535927414894104, "learning_rate": 2.973035949743269e-07, "loss": 0.1052, "num_input_tokens_seen": 126456208, "step": 58590 }, { "epoch": 9.558727569331158, "grad_norm": 0.14638705551624298, "learning_rate": 2.9621011847330293e-07, "loss": 0.0034, "num_input_tokens_seen": 126467312, "step": 58595 }, { "epoch": 9.559543230016313, "grad_norm": 0.13557329773902893, "learning_rate": 2.951186445868337e-07, "loss": 0.0099, "num_input_tokens_seen": 126476656, "step": 58600 }, { "epoch": 9.560358890701469, "grad_norm": 0.12493931502103806, "learning_rate": 2.940291734034012e-07, "loss": 0.2067, "num_input_tokens_seen": 126487920, "step": 58605 }, { "epoch": 9.561174551386623, "grad_norm": 0.0669429674744606, "learning_rate": 2.929417050113181e-07, "loss": 0.0046, "num_input_tokens_seen": 126499216, "step": 58610 }, { "epoch": 9.561990212071779, "grad_norm": 0.11747512966394424, "learning_rate": 2.918562394987445e-07, "loss": 0.0045, "num_input_tokens_seen": 126509936, "step": 58615 }, { "epoch": 9.562805872756933, "grad_norm": 4.202183723449707, "learning_rate": 2.907727769536683e-07, "loss": 0.1327, "num_input_tokens_seen": 126521776, "step": 58620 }, { "epoch": 9.563621533442088, "grad_norm": 0.12939080595970154, "learning_rate": 2.8969131746392763e-07, "loss": 0.0051, "num_input_tokens_seen": 126532464, "step": 58625 }, { "epoch": 9.564437194127244, "grad_norm": 0.08323273807764053, "learning_rate": 2.886118611171884e-07, "loss": 0.0843, "num_input_tokens_seen": 126542992, "step": 58630 }, { "epoch": 9.565252854812398, "grad_norm": 3.485731363296509, "learning_rate": 2.875344080009529e-07, "loss": 0.1096, "num_input_tokens_seen": 126553680, "step": 58635 }, { "epoch": 9.566068515497554, "grad_norm": 0.17163684964179993, "learning_rate": 2.8645895820257065e-07, "loss": 0.0046, "num_input_tokens_seen": 126563440, "step": 58640 }, { "epoch": 9.566884176182707, "grad_norm": 0.3211774230003357, "learning_rate": 2.8538551180921913e-07, "loss": 0.1721, "num_input_tokens_seen": 126573904, "step": 58645 }, { "epoch": 9.567699836867863, "grad_norm": 3.841701030731201, "learning_rate": 2.8431406890792045e-07, "loss": 0.2517, "num_input_tokens_seen": 126584560, "step": 58650 }, { "epoch": 9.568515497553017, "grad_norm": 0.06320230662822723, "learning_rate": 2.8324462958552735e-07, "loss": 0.0046, "num_input_tokens_seen": 126595632, "step": 58655 }, { "epoch": 9.569331158238173, "grad_norm": 0.043477654457092285, "learning_rate": 2.821771939287371e-07, "loss": 0.0056, "num_input_tokens_seen": 126606800, "step": 58660 }, { "epoch": 9.570146818923329, "grad_norm": 19.05152702331543, "learning_rate": 2.811117620240833e-07, "loss": 0.102, "num_input_tokens_seen": 126617808, "step": 58665 }, { "epoch": 9.570962479608482, "grad_norm": 0.1006511002779007, "learning_rate": 2.800483339579274e-07, "loss": 0.2518, "num_input_tokens_seen": 126628720, "step": 58670 }, { "epoch": 9.571778140293638, "grad_norm": 0.11229012906551361, "learning_rate": 2.789869098164838e-07, "loss": 0.0066, "num_input_tokens_seen": 126639664, "step": 58675 }, { "epoch": 9.572593800978792, "grad_norm": 0.061839509755373, "learning_rate": 2.779274896857947e-07, "loss": 0.0032, "num_input_tokens_seen": 126650736, "step": 58680 }, { "epoch": 9.573409461663948, "grad_norm": 9.058664321899414, "learning_rate": 2.768700736517416e-07, "loss": 0.16, "num_input_tokens_seen": 126662224, "step": 58685 }, { "epoch": 9.574225122349104, "grad_norm": 0.0872805118560791, "learning_rate": 2.7581466180004454e-07, "loss": 0.0052, "num_input_tokens_seen": 126673040, "step": 58690 }, { "epoch": 9.575040783034257, "grad_norm": 0.10303211212158203, "learning_rate": 2.747612542162603e-07, "loss": 0.0087, "num_input_tokens_seen": 126683824, "step": 58695 }, { "epoch": 9.575856443719413, "grad_norm": 0.07268112152814865, "learning_rate": 2.737098509857816e-07, "loss": 0.0068, "num_input_tokens_seen": 126693488, "step": 58700 }, { "epoch": 9.576672104404567, "grad_norm": 3.7061920166015625, "learning_rate": 2.726604521938458e-07, "loss": 0.2255, "num_input_tokens_seen": 126703888, "step": 58705 }, { "epoch": 9.577487765089723, "grad_norm": 0.11714824289083481, "learning_rate": 2.716130579255155e-07, "loss": 0.1504, "num_input_tokens_seen": 126715440, "step": 58710 }, { "epoch": 9.578303425774878, "grad_norm": 0.09520381689071655, "learning_rate": 2.7056766826570045e-07, "loss": 0.2826, "num_input_tokens_seen": 126725680, "step": 58715 }, { "epoch": 9.579119086460032, "grad_norm": 9.55907917022705, "learning_rate": 2.6952428329914956e-07, "loss": 0.0382, "num_input_tokens_seen": 126736144, "step": 58720 }, { "epoch": 9.579934747145188, "grad_norm": 0.10738665610551834, "learning_rate": 2.684829031104397e-07, "loss": 0.0664, "num_input_tokens_seen": 126747504, "step": 58725 }, { "epoch": 9.580750407830342, "grad_norm": 0.08543211966753006, "learning_rate": 2.6744352778399204e-07, "loss": 0.0955, "num_input_tokens_seen": 126757904, "step": 58730 }, { "epoch": 9.581566068515498, "grad_norm": 0.12505745887756348, "learning_rate": 2.6640615740406436e-07, "loss": 0.006, "num_input_tokens_seen": 126768400, "step": 58735 }, { "epoch": 9.582381729200652, "grad_norm": 6.947787284851074, "learning_rate": 2.6537079205475323e-07, "loss": 0.2567, "num_input_tokens_seen": 126779440, "step": 58740 }, { "epoch": 9.583197389885807, "grad_norm": 3.702768564224243, "learning_rate": 2.6433743181998316e-07, "loss": 0.1049, "num_input_tokens_seen": 126790672, "step": 58745 }, { "epoch": 9.584013050570963, "grad_norm": 0.11286371201276779, "learning_rate": 2.633060767835316e-07, "loss": 0.0074, "num_input_tokens_seen": 126799824, "step": 58750 }, { "epoch": 9.584828711256117, "grad_norm": 1.5042606592178345, "learning_rate": 2.6227672702900106e-07, "loss": 0.0064, "num_input_tokens_seen": 126811248, "step": 58755 }, { "epoch": 9.585644371941273, "grad_norm": 2.318598508834839, "learning_rate": 2.61249382639836e-07, "loss": 0.2699, "num_input_tokens_seen": 126822256, "step": 58760 }, { "epoch": 9.586460032626427, "grad_norm": 0.4412936270236969, "learning_rate": 2.6022404369931976e-07, "loss": 0.1239, "num_input_tokens_seen": 126833840, "step": 58765 }, { "epoch": 9.587275693311582, "grad_norm": 0.15247473120689392, "learning_rate": 2.592007102905719e-07, "loss": 0.0034, "num_input_tokens_seen": 126846160, "step": 58770 }, { "epoch": 9.588091353996738, "grad_norm": 0.17033638060092926, "learning_rate": 2.581793824965484e-07, "loss": 0.0056, "num_input_tokens_seen": 126857904, "step": 58775 }, { "epoch": 9.588907014681892, "grad_norm": 0.09159134328365326, "learning_rate": 2.5716006040004123e-07, "loss": 0.0447, "num_input_tokens_seen": 126869232, "step": 58780 }, { "epoch": 9.589722675367048, "grad_norm": 0.021363073959946632, "learning_rate": 2.5614274408368444e-07, "loss": 0.0054, "num_input_tokens_seen": 126880784, "step": 58785 }, { "epoch": 9.590538336052202, "grad_norm": 10.857181549072266, "learning_rate": 2.5512743362994527e-07, "loss": 0.0184, "num_input_tokens_seen": 126891600, "step": 58790 }, { "epoch": 9.591353996737357, "grad_norm": 0.3296675980091095, "learning_rate": 2.541141291211302e-07, "loss": 0.1118, "num_input_tokens_seen": 126903056, "step": 58795 }, { "epoch": 9.592169657422513, "grad_norm": 0.08449359238147736, "learning_rate": 2.5310283063938457e-07, "loss": 0.0585, "num_input_tokens_seen": 126914032, "step": 58800 }, { "epoch": 9.592985318107667, "grad_norm": 0.07261674106121063, "learning_rate": 2.5209353826668726e-07, "loss": 0.2212, "num_input_tokens_seen": 126923920, "step": 58805 }, { "epoch": 9.593800978792823, "grad_norm": 0.1694912612438202, "learning_rate": 2.510862520848589e-07, "loss": 0.0046, "num_input_tokens_seen": 126935216, "step": 58810 }, { "epoch": 9.594616639477977, "grad_norm": 0.10771431773900986, "learning_rate": 2.500809721755509e-07, "loss": 0.0042, "num_input_tokens_seen": 126944752, "step": 58815 }, { "epoch": 9.595432300163132, "grad_norm": 9.962686538696289, "learning_rate": 2.490776986202592e-07, "loss": 0.0914, "num_input_tokens_seen": 126955856, "step": 58820 }, { "epoch": 9.596247960848288, "grad_norm": 0.11950484663248062, "learning_rate": 2.480764315003159e-07, "loss": 0.0081, "num_input_tokens_seen": 126966064, "step": 58825 }, { "epoch": 9.597063621533442, "grad_norm": 0.024678900837898254, "learning_rate": 2.470771708968866e-07, "loss": 0.116, "num_input_tokens_seen": 126977424, "step": 58830 }, { "epoch": 9.597879282218598, "grad_norm": 3.2517993450164795, "learning_rate": 2.4607991689097607e-07, "loss": 0.1095, "num_input_tokens_seen": 126987728, "step": 58835 }, { "epoch": 9.598694942903752, "grad_norm": 0.09872573614120483, "learning_rate": 2.4508466956343066e-07, "loss": 0.1991, "num_input_tokens_seen": 126999536, "step": 58840 }, { "epoch": 9.599510603588907, "grad_norm": 0.10038405656814575, "learning_rate": 2.4409142899492474e-07, "loss": 0.1479, "num_input_tokens_seen": 127010384, "step": 58845 }, { "epoch": 9.600326264274061, "grad_norm": 0.21252353489398956, "learning_rate": 2.4310019526597726e-07, "loss": 0.0055, "num_input_tokens_seen": 127020176, "step": 58850 }, { "epoch": 9.601141924959217, "grad_norm": 0.216371089220047, "learning_rate": 2.4211096845694336e-07, "loss": 0.0673, "num_input_tokens_seen": 127030192, "step": 58855 }, { "epoch": 9.601957585644373, "grad_norm": 0.5040737390518188, "learning_rate": 2.411237486480145e-07, "loss": 0.0073, "num_input_tokens_seen": 127041808, "step": 58860 }, { "epoch": 9.602773246329527, "grad_norm": 0.09158005565404892, "learning_rate": 2.4013853591922097e-07, "loss": 0.089, "num_input_tokens_seen": 127052400, "step": 58865 }, { "epoch": 9.603588907014682, "grad_norm": 0.07197082042694092, "learning_rate": 2.391553303504296e-07, "loss": 0.1119, "num_input_tokens_seen": 127063056, "step": 58870 }, { "epoch": 9.604404567699836, "grad_norm": 0.12315355986356735, "learning_rate": 2.3817413202134041e-07, "loss": 0.094, "num_input_tokens_seen": 127073840, "step": 58875 }, { "epoch": 9.605220228384992, "grad_norm": 0.031999364495277405, "learning_rate": 2.3719494101149543e-07, "loss": 0.079, "num_input_tokens_seen": 127084656, "step": 58880 }, { "epoch": 9.606035889070148, "grad_norm": 0.05075620487332344, "learning_rate": 2.3621775740027553e-07, "loss": 0.008, "num_input_tokens_seen": 127095984, "step": 58885 }, { "epoch": 9.606851549755302, "grad_norm": 0.030352383852005005, "learning_rate": 2.3524258126689235e-07, "loss": 0.0222, "num_input_tokens_seen": 127105776, "step": 58890 }, { "epoch": 9.607667210440457, "grad_norm": 0.060310106724500656, "learning_rate": 2.3426941269040213e-07, "loss": 0.1309, "num_input_tokens_seen": 127117424, "step": 58895 }, { "epoch": 9.608482871125611, "grad_norm": 3.352529764175415, "learning_rate": 2.3329825174969455e-07, "loss": 0.0962, "num_input_tokens_seen": 127128240, "step": 58900 }, { "epoch": 9.609298531810767, "grad_norm": 0.132798433303833, "learning_rate": 2.3232909852349273e-07, "loss": 0.0089, "num_input_tokens_seen": 127140176, "step": 58905 }, { "epoch": 9.61011419249592, "grad_norm": 0.14223125576972961, "learning_rate": 2.3136195309036435e-07, "loss": 0.0106, "num_input_tokens_seen": 127149392, "step": 58910 }, { "epoch": 9.610929853181077, "grad_norm": 0.08127401024103165, "learning_rate": 2.303968155287134e-07, "loss": 0.0066, "num_input_tokens_seen": 127160496, "step": 58915 }, { "epoch": 9.611745513866232, "grad_norm": 1.1695650815963745, "learning_rate": 2.294336859167745e-07, "loss": 0.0062, "num_input_tokens_seen": 127170544, "step": 58920 }, { "epoch": 9.612561174551386, "grad_norm": 0.03738000616431236, "learning_rate": 2.2847256433262686e-07, "loss": 0.1002, "num_input_tokens_seen": 127182992, "step": 58925 }, { "epoch": 9.613376835236542, "grad_norm": 0.07341515272855759, "learning_rate": 2.2751345085418042e-07, "loss": 0.1272, "num_input_tokens_seen": 127194320, "step": 58930 }, { "epoch": 9.614192495921696, "grad_norm": 0.09440329670906067, "learning_rate": 2.265563455591896e-07, "loss": 0.1013, "num_input_tokens_seen": 127205776, "step": 58935 }, { "epoch": 9.615008156606851, "grad_norm": 5.581656455993652, "learning_rate": 2.2560124852523955e-07, "loss": 0.1296, "num_input_tokens_seen": 127217264, "step": 58940 }, { "epoch": 9.615823817292007, "grad_norm": 0.07413234561681747, "learning_rate": 2.246481598297573e-07, "loss": 0.1324, "num_input_tokens_seen": 127229584, "step": 58945 }, { "epoch": 9.616639477977161, "grad_norm": 0.14779232442378998, "learning_rate": 2.2369707955000318e-07, "loss": 0.126, "num_input_tokens_seen": 127239824, "step": 58950 }, { "epoch": 9.617455138662317, "grad_norm": 0.09638264775276184, "learning_rate": 2.2274800776307946e-07, "loss": 0.082, "num_input_tokens_seen": 127250672, "step": 58955 }, { "epoch": 9.61827079934747, "grad_norm": 0.26495882868766785, "learning_rate": 2.2180094454591903e-07, "loss": 0.1262, "num_input_tokens_seen": 127260496, "step": 58960 }, { "epoch": 9.619086460032626, "grad_norm": 0.05389063432812691, "learning_rate": 2.2085588997529938e-07, "loss": 0.1803, "num_input_tokens_seen": 127272336, "step": 58965 }, { "epoch": 9.619902120717782, "grad_norm": 0.4798796772956848, "learning_rate": 2.1991284412782864e-07, "loss": 0.0042, "num_input_tokens_seen": 127284368, "step": 58970 }, { "epoch": 9.620717781402936, "grad_norm": 0.043308209627866745, "learning_rate": 2.18971807079954e-07, "loss": 0.0773, "num_input_tokens_seen": 127294032, "step": 58975 }, { "epoch": 9.621533442088092, "grad_norm": 0.06999657303094864, "learning_rate": 2.1803277890796447e-07, "loss": 0.1303, "num_input_tokens_seen": 127304080, "step": 58980 }, { "epoch": 9.622349102773246, "grad_norm": 7.49120569229126, "learning_rate": 2.170957596879797e-07, "loss": 0.0539, "num_input_tokens_seen": 127315888, "step": 58985 }, { "epoch": 9.623164763458401, "grad_norm": 0.24277690052986145, "learning_rate": 2.1616074949595832e-07, "loss": 0.0056, "num_input_tokens_seen": 127326864, "step": 58990 }, { "epoch": 9.623980424143557, "grad_norm": 0.12107644230127335, "learning_rate": 2.1522774840770087e-07, "loss": 0.011, "num_input_tokens_seen": 127336368, "step": 58995 }, { "epoch": 9.624796084828711, "grad_norm": 13.174209594726562, "learning_rate": 2.1429675649883575e-07, "loss": 0.0277, "num_input_tokens_seen": 127347824, "step": 59000 }, { "epoch": 9.625611745513867, "grad_norm": 3.4110028743743896, "learning_rate": 2.1336777384484141e-07, "loss": 0.1964, "num_input_tokens_seen": 127356656, "step": 59005 }, { "epoch": 9.62642740619902, "grad_norm": 0.1018814742565155, "learning_rate": 2.1244080052101879e-07, "loss": 0.2314, "num_input_tokens_seen": 127367120, "step": 59010 }, { "epoch": 9.627243066884176, "grad_norm": 0.8009899854660034, "learning_rate": 2.115158366025133e-07, "loss": 0.0108, "num_input_tokens_seen": 127378800, "step": 59015 }, { "epoch": 9.62805872756933, "grad_norm": 3.683324098587036, "learning_rate": 2.1059288216431217e-07, "loss": 0.1045, "num_input_tokens_seen": 127390000, "step": 59020 }, { "epoch": 9.628874388254486, "grad_norm": 0.12710148096084595, "learning_rate": 2.0967193728123334e-07, "loss": 0.0032, "num_input_tokens_seen": 127400144, "step": 59025 }, { "epoch": 9.629690048939642, "grad_norm": 0.03041822463274002, "learning_rate": 2.0875300202793101e-07, "loss": 0.0743, "num_input_tokens_seen": 127411568, "step": 59030 }, { "epoch": 9.630505709624796, "grad_norm": 0.14042043685913086, "learning_rate": 2.0783607647889837e-07, "loss": 0.0704, "num_input_tokens_seen": 127421520, "step": 59035 }, { "epoch": 9.631321370309951, "grad_norm": 0.14955595135688782, "learning_rate": 2.0692116070847035e-07, "loss": 0.1032, "num_input_tokens_seen": 127433424, "step": 59040 }, { "epoch": 9.632137030995105, "grad_norm": 0.15251396596431732, "learning_rate": 2.0600825479080986e-07, "loss": 0.0048, "num_input_tokens_seen": 127443408, "step": 59045 }, { "epoch": 9.632952691680261, "grad_norm": 6.020075798034668, "learning_rate": 2.0509735879992442e-07, "loss": 0.0084, "num_input_tokens_seen": 127454256, "step": 59050 }, { "epoch": 9.633768352365417, "grad_norm": 0.0663226842880249, "learning_rate": 2.041884728096549e-07, "loss": 0.1546, "num_input_tokens_seen": 127464624, "step": 59055 }, { "epoch": 9.63458401305057, "grad_norm": 0.07296096533536911, "learning_rate": 2.0328159689368133e-07, "loss": 0.0904, "num_input_tokens_seen": 127474192, "step": 59060 }, { "epoch": 9.635399673735726, "grad_norm": 2.504868984222412, "learning_rate": 2.0237673112551704e-07, "loss": 0.0083, "num_input_tokens_seen": 127483056, "step": 59065 }, { "epoch": 9.63621533442088, "grad_norm": 0.22530889511108398, "learning_rate": 2.0147387557851727e-07, "loss": 0.1021, "num_input_tokens_seen": 127493360, "step": 59070 }, { "epoch": 9.637030995106036, "grad_norm": 0.17570574581623077, "learning_rate": 2.005730303258735e-07, "loss": 0.1274, "num_input_tokens_seen": 127503728, "step": 59075 }, { "epoch": 9.63784665579119, "grad_norm": 0.11782146990299225, "learning_rate": 1.9967419544060784e-07, "loss": 0.0038, "num_input_tokens_seen": 127514672, "step": 59080 }, { "epoch": 9.638662316476346, "grad_norm": 0.03208833560347557, "learning_rate": 1.987773709955898e-07, "loss": 0.0049, "num_input_tokens_seen": 127525200, "step": 59085 }, { "epoch": 9.639477977161501, "grad_norm": 0.13830067217350006, "learning_rate": 1.9788255706351678e-07, "loss": 0.1041, "num_input_tokens_seen": 127535824, "step": 59090 }, { "epoch": 9.640293637846655, "grad_norm": 0.050570495426654816, "learning_rate": 1.9698975371693075e-07, "loss": 0.0148, "num_input_tokens_seen": 127546960, "step": 59095 }, { "epoch": 9.641109298531811, "grad_norm": 0.31174710392951965, "learning_rate": 1.9609896102820157e-07, "loss": 0.0097, "num_input_tokens_seen": 127557680, "step": 59100 }, { "epoch": 9.641924959216965, "grad_norm": 0.3156566321849823, "learning_rate": 1.9521017906954654e-07, "loss": 0.0218, "num_input_tokens_seen": 127567696, "step": 59105 }, { "epoch": 9.64274061990212, "grad_norm": 0.08258449286222458, "learning_rate": 1.9432340791301073e-07, "loss": 0.1024, "num_input_tokens_seen": 127578736, "step": 59110 }, { "epoch": 9.643556280587276, "grad_norm": 7.100406169891357, "learning_rate": 1.9343864763048392e-07, "loss": 0.0236, "num_input_tokens_seen": 127590224, "step": 59115 }, { "epoch": 9.64437194127243, "grad_norm": 0.13057167828083038, "learning_rate": 1.925558982936865e-07, "loss": 0.0045, "num_input_tokens_seen": 127601328, "step": 59120 }, { "epoch": 9.645187601957586, "grad_norm": 0.20511381328105927, "learning_rate": 1.916751599741806e-07, "loss": 0.0041, "num_input_tokens_seen": 127612464, "step": 59125 }, { "epoch": 9.64600326264274, "grad_norm": 0.11308741569519043, "learning_rate": 1.90796432743362e-07, "loss": 0.0048, "num_input_tokens_seen": 127622864, "step": 59130 }, { "epoch": 9.646818923327896, "grad_norm": 6.885258674621582, "learning_rate": 1.8991971667246533e-07, "loss": 0.0504, "num_input_tokens_seen": 127634000, "step": 59135 }, { "epoch": 9.647634584013051, "grad_norm": 0.5002412796020508, "learning_rate": 1.8904501183256152e-07, "loss": 0.0101, "num_input_tokens_seen": 127645264, "step": 59140 }, { "epoch": 9.648450244698205, "grad_norm": 0.28994885087013245, "learning_rate": 1.8817231829455773e-07, "loss": 0.0049, "num_input_tokens_seen": 127655376, "step": 59145 }, { "epoch": 9.649265905383361, "grad_norm": 0.05022817477583885, "learning_rate": 1.8730163612920015e-07, "loss": 0.2531, "num_input_tokens_seen": 127665584, "step": 59150 }, { "epoch": 9.650081566068515, "grad_norm": 0.08764076977968216, "learning_rate": 1.8643296540707121e-07, "loss": 0.1174, "num_input_tokens_seen": 127675920, "step": 59155 }, { "epoch": 9.65089722675367, "grad_norm": 0.13821817934513092, "learning_rate": 1.855663061985896e-07, "loss": 0.005, "num_input_tokens_seen": 127686928, "step": 59160 }, { "epoch": 9.651712887438826, "grad_norm": 0.11403993517160416, "learning_rate": 1.8470165857401023e-07, "loss": 0.0042, "num_input_tokens_seen": 127697296, "step": 59165 }, { "epoch": 9.65252854812398, "grad_norm": 0.06376594305038452, "learning_rate": 1.8383902260342422e-07, "loss": 0.1246, "num_input_tokens_seen": 127707312, "step": 59170 }, { "epoch": 9.653344208809136, "grad_norm": 0.2896687388420105, "learning_rate": 1.8297839835676456e-07, "loss": 0.0052, "num_input_tokens_seen": 127719056, "step": 59175 }, { "epoch": 9.65415986949429, "grad_norm": 0.22857573628425598, "learning_rate": 1.8211978590379486e-07, "loss": 0.092, "num_input_tokens_seen": 127730864, "step": 59180 }, { "epoch": 9.654975530179446, "grad_norm": 10.07347297668457, "learning_rate": 1.8126318531412056e-07, "loss": 0.0627, "num_input_tokens_seen": 127740944, "step": 59185 }, { "epoch": 9.655791190864601, "grad_norm": 0.03637094795703888, "learning_rate": 1.8040859665718057e-07, "loss": 0.0032, "num_input_tokens_seen": 127752016, "step": 59190 }, { "epoch": 9.656606851549755, "grad_norm": 0.07079402357339859, "learning_rate": 1.795560200022528e-07, "loss": 0.0033, "num_input_tokens_seen": 127762480, "step": 59195 }, { "epoch": 9.65742251223491, "grad_norm": 0.0924726203083992, "learning_rate": 1.7870545541845418e-07, "loss": 0.095, "num_input_tokens_seen": 127773232, "step": 59200 }, { "epoch": 9.658238172920065, "grad_norm": 0.10256163775920868, "learning_rate": 1.7785690297473234e-07, "loss": 0.0082, "num_input_tokens_seen": 127784112, "step": 59205 }, { "epoch": 9.65905383360522, "grad_norm": 0.09814684092998505, "learning_rate": 1.770103627398767e-07, "loss": 0.1416, "num_input_tokens_seen": 127795600, "step": 59210 }, { "epoch": 9.659869494290374, "grad_norm": 6.013543605804443, "learning_rate": 1.7616583478251013e-07, "loss": 0.1384, "num_input_tokens_seen": 127805648, "step": 59215 }, { "epoch": 9.66068515497553, "grad_norm": 1.4875679016113281, "learning_rate": 1.7532331917109457e-07, "loss": 0.014, "num_input_tokens_seen": 127816560, "step": 59220 }, { "epoch": 9.661500815660686, "grad_norm": 0.11682610213756561, "learning_rate": 1.7448281597393368e-07, "loss": 0.1847, "num_input_tokens_seen": 127828784, "step": 59225 }, { "epoch": 9.66231647634584, "grad_norm": 0.35796356201171875, "learning_rate": 1.736443252591563e-07, "loss": 0.0053, "num_input_tokens_seen": 127839088, "step": 59230 }, { "epoch": 9.663132137030995, "grad_norm": 0.2201497107744217, "learning_rate": 1.7280784709473862e-07, "loss": 0.0703, "num_input_tokens_seen": 127848016, "step": 59235 }, { "epoch": 9.66394779771615, "grad_norm": 0.05457500368356705, "learning_rate": 1.719733815484903e-07, "loss": 0.0819, "num_input_tokens_seen": 127858320, "step": 59240 }, { "epoch": 9.664763458401305, "grad_norm": 0.2189728021621704, "learning_rate": 1.7114092868805443e-07, "loss": 0.2399, "num_input_tokens_seen": 127869936, "step": 59245 }, { "epoch": 9.66557911908646, "grad_norm": 0.20777510106563568, "learning_rate": 1.7031048858091313e-07, "loss": 0.008, "num_input_tokens_seen": 127881040, "step": 59250 }, { "epoch": 9.666394779771615, "grad_norm": 0.15500149130821228, "learning_rate": 1.6948206129439037e-07, "loss": 0.2172, "num_input_tokens_seen": 127891792, "step": 59255 }, { "epoch": 9.66721044045677, "grad_norm": 3.3623745441436768, "learning_rate": 1.6865564689564074e-07, "loss": 0.2944, "num_input_tokens_seen": 127903376, "step": 59260 }, { "epoch": 9.668026101141924, "grad_norm": 0.15240967273712158, "learning_rate": 1.6783124545165785e-07, "loss": 0.0104, "num_input_tokens_seen": 127914576, "step": 59265 }, { "epoch": 9.66884176182708, "grad_norm": 0.05557018890976906, "learning_rate": 1.6700885702926882e-07, "loss": 0.0044, "num_input_tokens_seen": 127924784, "step": 59270 }, { "epoch": 9.669657422512234, "grad_norm": 0.7313627600669861, "learning_rate": 1.6618848169514533e-07, "loss": 0.0936, "num_input_tokens_seen": 127934800, "step": 59275 }, { "epoch": 9.67047308319739, "grad_norm": 0.08608107268810272, "learning_rate": 1.6537011951578974e-07, "loss": 0.1346, "num_input_tokens_seen": 127945584, "step": 59280 }, { "epoch": 9.671288743882545, "grad_norm": 0.16615554690361023, "learning_rate": 1.645537705575406e-07, "loss": 0.1261, "num_input_tokens_seen": 127956272, "step": 59285 }, { "epoch": 9.6721044045677, "grad_norm": 0.07968626916408539, "learning_rate": 1.6373943488657562e-07, "loss": 0.0086, "num_input_tokens_seen": 127966672, "step": 59290 }, { "epoch": 9.672920065252855, "grad_norm": 4.198758125305176, "learning_rate": 1.6292711256891134e-07, "loss": 0.321, "num_input_tokens_seen": 127978032, "step": 59295 }, { "epoch": 9.673735725938009, "grad_norm": 0.4226551949977875, "learning_rate": 1.6211680367039793e-07, "loss": 0.008, "num_input_tokens_seen": 127989072, "step": 59300 }, { "epoch": 9.674551386623165, "grad_norm": 0.027734851464629173, "learning_rate": 1.6130850825672173e-07, "loss": 0.0023, "num_input_tokens_seen": 127999568, "step": 59305 }, { "epoch": 9.67536704730832, "grad_norm": 0.19730962812900543, "learning_rate": 1.6050222639340807e-07, "loss": 0.0055, "num_input_tokens_seen": 128010256, "step": 59310 }, { "epoch": 9.676182707993474, "grad_norm": 0.058019958436489105, "learning_rate": 1.5969795814581856e-07, "loss": 0.0036, "num_input_tokens_seen": 128020176, "step": 59315 }, { "epoch": 9.67699836867863, "grad_norm": 0.13013321161270142, "learning_rate": 1.5889570357915108e-07, "loss": 0.0796, "num_input_tokens_seen": 128032496, "step": 59320 }, { "epoch": 9.677814029363784, "grad_norm": 0.06711160391569138, "learning_rate": 1.5809546275843968e-07, "loss": 0.0038, "num_input_tokens_seen": 128043632, "step": 59325 }, { "epoch": 9.67862969004894, "grad_norm": 0.03470923751592636, "learning_rate": 1.572972357485575e-07, "loss": 0.1232, "num_input_tokens_seen": 128054608, "step": 59330 }, { "epoch": 9.679445350734095, "grad_norm": 0.10599584132432938, "learning_rate": 1.5650102261421107e-07, "loss": 0.0508, "num_input_tokens_seen": 128065360, "step": 59335 }, { "epoch": 9.68026101141925, "grad_norm": 0.054533615708351135, "learning_rate": 1.557068234199488e-07, "loss": 0.1016, "num_input_tokens_seen": 128076464, "step": 59340 }, { "epoch": 9.681076672104405, "grad_norm": 6.840791702270508, "learning_rate": 1.5491463823014697e-07, "loss": 0.0148, "num_input_tokens_seen": 128086320, "step": 59345 }, { "epoch": 9.681892332789559, "grad_norm": 0.11100902408361435, "learning_rate": 1.5412446710902917e-07, "loss": 0.0075, "num_input_tokens_seen": 128096208, "step": 59350 }, { "epoch": 9.682707993474715, "grad_norm": 0.47080928087234497, "learning_rate": 1.5333631012064698e-07, "loss": 0.1193, "num_input_tokens_seen": 128107376, "step": 59355 }, { "epoch": 9.68352365415987, "grad_norm": 0.08434358239173889, "learning_rate": 1.5255016732889648e-07, "loss": 0.0066, "num_input_tokens_seen": 128118800, "step": 59360 }, { "epoch": 9.684339314845024, "grad_norm": 0.2086295783519745, "learning_rate": 1.5176603879750173e-07, "loss": 0.2222, "num_input_tokens_seen": 128131152, "step": 59365 }, { "epoch": 9.68515497553018, "grad_norm": 0.1977551281452179, "learning_rate": 1.509839245900313e-07, "loss": 0.0075, "num_input_tokens_seen": 128142480, "step": 59370 }, { "epoch": 9.685970636215334, "grad_norm": 1.4557702541351318, "learning_rate": 1.5020382476988726e-07, "loss": 0.0072, "num_input_tokens_seen": 128152816, "step": 59375 }, { "epoch": 9.68678629690049, "grad_norm": 0.22527346014976501, "learning_rate": 1.4942573940030791e-07, "loss": 0.0064, "num_input_tokens_seen": 128163632, "step": 59380 }, { "epoch": 9.687601957585644, "grad_norm": 0.10114660859107971, "learning_rate": 1.4864966854437056e-07, "loss": 0.0034, "num_input_tokens_seen": 128174640, "step": 59385 }, { "epoch": 9.6884176182708, "grad_norm": 6.902266502380371, "learning_rate": 1.4787561226498048e-07, "loss": 0.1922, "num_input_tokens_seen": 128184144, "step": 59390 }, { "epoch": 9.689233278955955, "grad_norm": 0.05543965846300125, "learning_rate": 1.4710357062489577e-07, "loss": 0.0826, "num_input_tokens_seen": 128192976, "step": 59395 }, { "epoch": 9.690048939641109, "grad_norm": 0.04904047027230263, "learning_rate": 1.4633354368669694e-07, "loss": 0.0056, "num_input_tokens_seen": 128204144, "step": 59400 }, { "epoch": 9.690864600326265, "grad_norm": 0.5166468024253845, "learning_rate": 1.4556553151280628e-07, "loss": 0.0083, "num_input_tokens_seen": 128215600, "step": 59405 }, { "epoch": 9.691680261011419, "grad_norm": 0.11157098412513733, "learning_rate": 1.447995341654851e-07, "loss": 0.1073, "num_input_tokens_seen": 128226672, "step": 59410 }, { "epoch": 9.692495921696574, "grad_norm": 3.7563095092773438, "learning_rate": 1.4403555170682816e-07, "loss": 0.1257, "num_input_tokens_seen": 128237936, "step": 59415 }, { "epoch": 9.69331158238173, "grad_norm": 0.1909668743610382, "learning_rate": 1.4327358419876646e-07, "loss": 0.1063, "num_input_tokens_seen": 128248688, "step": 59420 }, { "epoch": 9.694127243066884, "grad_norm": 0.1627446413040161, "learning_rate": 1.4251363170307008e-07, "loss": 0.0057, "num_input_tokens_seen": 128259440, "step": 59425 }, { "epoch": 9.69494290375204, "grad_norm": 0.10831242799758911, "learning_rate": 1.4175569428134527e-07, "loss": 0.0613, "num_input_tokens_seen": 128268880, "step": 59430 }, { "epoch": 9.695758564437194, "grad_norm": 0.07561370730400085, "learning_rate": 1.4099977199503178e-07, "loss": 0.0046, "num_input_tokens_seen": 128277744, "step": 59435 }, { "epoch": 9.69657422512235, "grad_norm": 17.639007568359375, "learning_rate": 1.4024586490540837e-07, "loss": 0.094, "num_input_tokens_seen": 128288176, "step": 59440 }, { "epoch": 9.697389885807503, "grad_norm": 0.12820494174957275, "learning_rate": 1.3949397307359557e-07, "loss": 0.0041, "num_input_tokens_seen": 128298672, "step": 59445 }, { "epoch": 9.698205546492659, "grad_norm": 0.12352553755044937, "learning_rate": 1.3874409656054189e-07, "loss": 0.0053, "num_input_tokens_seen": 128309264, "step": 59450 }, { "epoch": 9.699021207177815, "grad_norm": 0.5875067114830017, "learning_rate": 1.3799623542703478e-07, "loss": 0.1419, "num_input_tokens_seen": 128320656, "step": 59455 }, { "epoch": 9.699836867862969, "grad_norm": 0.15500937402248383, "learning_rate": 1.3725038973370076e-07, "loss": 0.0062, "num_input_tokens_seen": 128332016, "step": 59460 }, { "epoch": 9.700652528548124, "grad_norm": 4.622019290924072, "learning_rate": 1.3650655954100532e-07, "loss": 0.009, "num_input_tokens_seen": 128342448, "step": 59465 }, { "epoch": 9.701468189233278, "grad_norm": 0.28856053948402405, "learning_rate": 1.3576474490924195e-07, "loss": 0.0933, "num_input_tokens_seen": 128352368, "step": 59470 }, { "epoch": 9.702283849918434, "grad_norm": 8.22559928894043, "learning_rate": 1.3502494589855142e-07, "loss": 0.3689, "num_input_tokens_seen": 128361488, "step": 59475 }, { "epoch": 9.70309951060359, "grad_norm": 0.09286815673112869, "learning_rate": 1.3428716256889962e-07, "loss": 0.215, "num_input_tokens_seen": 128371632, "step": 59480 }, { "epoch": 9.703915171288743, "grad_norm": 0.10720881074666977, "learning_rate": 1.3355139498009706e-07, "loss": 0.2728, "num_input_tokens_seen": 128380688, "step": 59485 }, { "epoch": 9.7047308319739, "grad_norm": 0.21968218684196472, "learning_rate": 1.3281764319179046e-07, "loss": 0.205, "num_input_tokens_seen": 128390480, "step": 59490 }, { "epoch": 9.705546492659053, "grad_norm": 0.3598906397819519, "learning_rate": 1.320859072634628e-07, "loss": 0.2241, "num_input_tokens_seen": 128401712, "step": 59495 }, { "epoch": 9.706362153344209, "grad_norm": 0.06498785316944122, "learning_rate": 1.3135618725442778e-07, "loss": 0.0063, "num_input_tokens_seen": 128413040, "step": 59500 }, { "epoch": 9.707177814029365, "grad_norm": 0.0967356488108635, "learning_rate": 1.3062848322384357e-07, "loss": 0.1098, "num_input_tokens_seen": 128421744, "step": 59505 }, { "epoch": 9.707993474714518, "grad_norm": 0.4873754382133484, "learning_rate": 1.2990279523069916e-07, "loss": 0.1651, "num_input_tokens_seen": 128432528, "step": 59510 }, { "epoch": 9.708809135399674, "grad_norm": 0.04376742243766785, "learning_rate": 1.291791233338252e-07, "loss": 0.0069, "num_input_tokens_seen": 128443664, "step": 59515 }, { "epoch": 9.709624796084828, "grad_norm": 0.03959466516971588, "learning_rate": 1.2845746759188314e-07, "loss": 0.0029, "num_input_tokens_seen": 128454256, "step": 59520 }, { "epoch": 9.710440456769984, "grad_norm": 0.044352855533361435, "learning_rate": 1.277378280633762e-07, "loss": 0.0193, "num_input_tokens_seen": 128466768, "step": 59525 }, { "epoch": 9.71125611745514, "grad_norm": 0.2385054975748062, "learning_rate": 1.27020204806641e-07, "loss": 0.1477, "num_input_tokens_seen": 128477264, "step": 59530 }, { "epoch": 9.712071778140293, "grad_norm": 11.07192325592041, "learning_rate": 1.2630459787985326e-07, "loss": 0.1438, "num_input_tokens_seen": 128489040, "step": 59535 }, { "epoch": 9.71288743882545, "grad_norm": 0.10194293409585953, "learning_rate": 1.2559100734102214e-07, "loss": 0.0041, "num_input_tokens_seen": 128500176, "step": 59540 }, { "epoch": 9.713703099510603, "grad_norm": 0.15271548926830292, "learning_rate": 1.248794332479958e-07, "loss": 0.1518, "num_input_tokens_seen": 128510096, "step": 59545 }, { "epoch": 9.714518760195759, "grad_norm": 0.10981497168540955, "learning_rate": 1.2416987565845861e-07, "loss": 0.0938, "num_input_tokens_seen": 128520976, "step": 59550 }, { "epoch": 9.715334420880914, "grad_norm": 0.0959286168217659, "learning_rate": 1.2346233462992852e-07, "loss": 0.2037, "num_input_tokens_seen": 128532080, "step": 59555 }, { "epoch": 9.716150081566068, "grad_norm": 0.12493567168712616, "learning_rate": 1.2275681021976515e-07, "loss": 0.154, "num_input_tokens_seen": 128544176, "step": 59560 }, { "epoch": 9.716965742251224, "grad_norm": 0.1780611276626587, "learning_rate": 1.220533024851589e-07, "loss": 0.0787, "num_input_tokens_seen": 128555472, "step": 59565 }, { "epoch": 9.717781402936378, "grad_norm": 0.04792303219437599, "learning_rate": 1.213518114831419e-07, "loss": 0.0298, "num_input_tokens_seen": 128565968, "step": 59570 }, { "epoch": 9.718597063621534, "grad_norm": 4.539177417755127, "learning_rate": 1.206523372705798e-07, "loss": 0.3807, "num_input_tokens_seen": 128576528, "step": 59575 }, { "epoch": 9.719412724306688, "grad_norm": 0.1179857924580574, "learning_rate": 1.199548799041772e-07, "loss": 0.0076, "num_input_tokens_seen": 128586928, "step": 59580 }, { "epoch": 9.720228384991843, "grad_norm": 34.01148223876953, "learning_rate": 1.1925943944047225e-07, "loss": 0.0785, "num_input_tokens_seen": 128599280, "step": 59585 }, { "epoch": 9.721044045676999, "grad_norm": 0.12337164580821991, "learning_rate": 1.1856601593583928e-07, "loss": 0.005, "num_input_tokens_seen": 128609424, "step": 59590 }, { "epoch": 9.721859706362153, "grad_norm": 0.12772507965564728, "learning_rate": 1.1787460944649443e-07, "loss": 0.0034, "num_input_tokens_seen": 128620368, "step": 59595 }, { "epoch": 9.722675367047309, "grad_norm": 0.09744003415107727, "learning_rate": 1.1718522002848175e-07, "loss": 0.0339, "num_input_tokens_seen": 128631792, "step": 59600 }, { "epoch": 9.723491027732463, "grad_norm": 0.20774321258068085, "learning_rate": 1.164978477376899e-07, "loss": 0.0056, "num_input_tokens_seen": 128641584, "step": 59605 }, { "epoch": 9.724306688417618, "grad_norm": 0.044551603496074677, "learning_rate": 1.1581249262984096e-07, "loss": 0.327, "num_input_tokens_seen": 128651824, "step": 59610 }, { "epoch": 9.725122349102774, "grad_norm": 0.12571464478969574, "learning_rate": 1.1512915476049325e-07, "loss": 0.2225, "num_input_tokens_seen": 128661616, "step": 59615 }, { "epoch": 9.725938009787928, "grad_norm": 0.07284899055957794, "learning_rate": 1.1444783418503857e-07, "loss": 0.0051, "num_input_tokens_seen": 128671952, "step": 59620 }, { "epoch": 9.726753670473084, "grad_norm": 0.2515014708042145, "learning_rate": 1.1376853095871332e-07, "loss": 0.0033, "num_input_tokens_seen": 128682928, "step": 59625 }, { "epoch": 9.727569331158238, "grad_norm": 4.521650314331055, "learning_rate": 1.1309124513657899e-07, "loss": 0.3166, "num_input_tokens_seen": 128692592, "step": 59630 }, { "epoch": 9.728384991843393, "grad_norm": 5.1966423988342285, "learning_rate": 1.124159767735472e-07, "loss": 0.0091, "num_input_tokens_seen": 128703280, "step": 59635 }, { "epoch": 9.729200652528547, "grad_norm": 0.18649420142173767, "learning_rate": 1.1174272592435197e-07, "loss": 0.0031, "num_input_tokens_seen": 128713904, "step": 59640 }, { "epoch": 9.730016313213703, "grad_norm": 0.287245512008667, "learning_rate": 1.1107149264357186e-07, "loss": 0.0097, "num_input_tokens_seen": 128723696, "step": 59645 }, { "epoch": 9.730831973898859, "grad_norm": 0.11232049763202667, "learning_rate": 1.1040227698562445e-07, "loss": 0.006, "num_input_tokens_seen": 128735120, "step": 59650 }, { "epoch": 9.731647634584013, "grad_norm": 0.04480063170194626, "learning_rate": 1.0973507900475521e-07, "loss": 0.1014, "num_input_tokens_seen": 128745232, "step": 59655 }, { "epoch": 9.732463295269168, "grad_norm": 0.13746652007102966, "learning_rate": 1.0906989875505425e-07, "loss": 0.0428, "num_input_tokens_seen": 128755856, "step": 59660 }, { "epoch": 9.733278955954322, "grad_norm": 0.21175557374954224, "learning_rate": 1.0840673629044228e-07, "loss": 0.0038, "num_input_tokens_seen": 128766768, "step": 59665 }, { "epoch": 9.734094616639478, "grad_norm": 12.107427597045898, "learning_rate": 1.0774559166467912e-07, "loss": 0.0743, "num_input_tokens_seen": 128778448, "step": 59670 }, { "epoch": 9.734910277324634, "grad_norm": 0.08206488937139511, "learning_rate": 1.0708646493135799e-07, "loss": 0.049, "num_input_tokens_seen": 128788176, "step": 59675 }, { "epoch": 9.735725938009788, "grad_norm": 0.2523277997970581, "learning_rate": 1.0642935614391392e-07, "loss": 0.0056, "num_input_tokens_seen": 128799280, "step": 59680 }, { "epoch": 9.736541598694943, "grad_norm": 0.05426796153187752, "learning_rate": 1.0577426535561541e-07, "loss": 0.0036, "num_input_tokens_seen": 128808752, "step": 59685 }, { "epoch": 9.737357259380097, "grad_norm": 0.173715740442276, "learning_rate": 1.0512119261956999e-07, "loss": 0.0432, "num_input_tokens_seen": 128819728, "step": 59690 }, { "epoch": 9.738172920065253, "grad_norm": 0.21712566912174225, "learning_rate": 1.0447013798871308e-07, "loss": 0.0774, "num_input_tokens_seen": 128831440, "step": 59695 }, { "epoch": 9.738988580750409, "grad_norm": 2.1026999950408936, "learning_rate": 1.0382110151582469e-07, "loss": 0.0072, "num_input_tokens_seen": 128841200, "step": 59700 }, { "epoch": 9.739804241435563, "grad_norm": 0.03553512692451477, "learning_rate": 1.0317408325352107e-07, "loss": 0.0052, "num_input_tokens_seen": 128852112, "step": 59705 }, { "epoch": 9.740619902120718, "grad_norm": 0.10768935084342957, "learning_rate": 1.0252908325425192e-07, "loss": 0.0048, "num_input_tokens_seen": 128864592, "step": 59710 }, { "epoch": 9.741435562805872, "grad_norm": 0.013585901819169521, "learning_rate": 1.018861015703032e-07, "loss": 0.0045, "num_input_tokens_seen": 128876272, "step": 59715 }, { "epoch": 9.742251223491028, "grad_norm": 0.31762149930000305, "learning_rate": 1.0124513825379989e-07, "loss": 0.0062, "num_input_tokens_seen": 128887056, "step": 59720 }, { "epoch": 9.743066884176184, "grad_norm": 3.006070613861084, "learning_rate": 1.0060619335669764e-07, "loss": 0.1659, "num_input_tokens_seen": 128898128, "step": 59725 }, { "epoch": 9.743882544861338, "grad_norm": 0.08332651108503342, "learning_rate": 9.996926693079945e-08, "loss": 0.0051, "num_input_tokens_seen": 128909040, "step": 59730 }, { "epoch": 9.744698205546493, "grad_norm": 0.043319880962371826, "learning_rate": 9.93343590277307e-08, "loss": 0.1583, "num_input_tokens_seen": 128918288, "step": 59735 }, { "epoch": 9.745513866231647, "grad_norm": 19.856128692626953, "learning_rate": 9.870146969896688e-08, "loss": 0.0231, "num_input_tokens_seen": 128929584, "step": 59740 }, { "epoch": 9.746329526916803, "grad_norm": 0.1288125216960907, "learning_rate": 9.807059899580861e-08, "loss": 0.0092, "num_input_tokens_seen": 128940208, "step": 59745 }, { "epoch": 9.747145187601957, "grad_norm": 6.128967761993408, "learning_rate": 9.744174696939834e-08, "loss": 0.1955, "num_input_tokens_seen": 128950224, "step": 59750 }, { "epoch": 9.747960848287113, "grad_norm": 0.06756886094808578, "learning_rate": 9.681491367071193e-08, "loss": 0.004, "num_input_tokens_seen": 128961072, "step": 59755 }, { "epoch": 9.748776508972268, "grad_norm": 0.10933032631874084, "learning_rate": 9.619009915056987e-08, "loss": 0.0045, "num_input_tokens_seen": 128972528, "step": 59760 }, { "epoch": 9.749592169657422, "grad_norm": 0.08776938915252686, "learning_rate": 9.556730345961773e-08, "loss": 0.3243, "num_input_tokens_seen": 128982480, "step": 59765 }, { "epoch": 9.750407830342578, "grad_norm": 0.15287411212921143, "learning_rate": 9.494652664834292e-08, "loss": 0.0811, "num_input_tokens_seen": 128992304, "step": 59770 }, { "epoch": 9.751223491027732, "grad_norm": 6.290401458740234, "learning_rate": 9.432776876707183e-08, "loss": 0.1513, "num_input_tokens_seen": 129003344, "step": 59775 }, { "epoch": 9.752039151712887, "grad_norm": 8.094135284423828, "learning_rate": 9.371102986595881e-08, "loss": 0.0618, "num_input_tokens_seen": 129014224, "step": 59780 }, { "epoch": 9.752854812398043, "grad_norm": 0.020119240507483482, "learning_rate": 9.309630999500551e-08, "loss": 0.0683, "num_input_tokens_seen": 129024016, "step": 59785 }, { "epoch": 9.753670473083197, "grad_norm": 0.09110990911722183, "learning_rate": 9.248360920404154e-08, "loss": 0.0056, "num_input_tokens_seen": 129033840, "step": 59790 }, { "epoch": 9.754486133768353, "grad_norm": 0.28636953234672546, "learning_rate": 9.187292754273269e-08, "loss": 0.0074, "num_input_tokens_seen": 129044848, "step": 59795 }, { "epoch": 9.755301794453507, "grad_norm": 0.13501136004924774, "learning_rate": 9.126426506058938e-08, "loss": 0.0106, "num_input_tokens_seen": 129055664, "step": 59800 }, { "epoch": 9.756117455138662, "grad_norm": 0.07010824978351593, "learning_rate": 9.06576218069527e-08, "loss": 0.2729, "num_input_tokens_seen": 129066288, "step": 59805 }, { "epoch": 9.756933115823816, "grad_norm": 0.11294228583574295, "learning_rate": 9.005299783099441e-08, "loss": 0.0641, "num_input_tokens_seen": 129076016, "step": 59810 }, { "epoch": 9.757748776508972, "grad_norm": 0.10147662460803986, "learning_rate": 8.945039318173365e-08, "loss": 0.0031, "num_input_tokens_seen": 129086704, "step": 59815 }, { "epoch": 9.758564437194128, "grad_norm": 0.13516418635845184, "learning_rate": 8.884980790801745e-08, "loss": 0.0098, "num_input_tokens_seen": 129096560, "step": 59820 }, { "epoch": 9.759380097879282, "grad_norm": 3.2705533504486084, "learning_rate": 8.825124205853463e-08, "loss": 0.079, "num_input_tokens_seen": 129107984, "step": 59825 }, { "epoch": 9.760195758564437, "grad_norm": 0.07122524827718735, "learning_rate": 8.76546956818075e-08, "loss": 0.0819, "num_input_tokens_seen": 129118352, "step": 59830 }, { "epoch": 9.761011419249591, "grad_norm": 0.20543478429317474, "learning_rate": 8.706016882619461e-08, "loss": 0.0078, "num_input_tokens_seen": 129129072, "step": 59835 }, { "epoch": 9.761827079934747, "grad_norm": 0.13996347784996033, "learning_rate": 8.646766153989072e-08, "loss": 0.01, "num_input_tokens_seen": 129140496, "step": 59840 }, { "epoch": 9.762642740619903, "grad_norm": 0.15542426705360413, "learning_rate": 8.587717387092686e-08, "loss": 0.0041, "num_input_tokens_seen": 129151952, "step": 59845 }, { "epoch": 9.763458401305057, "grad_norm": 0.19635987281799316, "learning_rate": 8.528870586717308e-08, "loss": 0.1406, "num_input_tokens_seen": 129165008, "step": 59850 }, { "epoch": 9.764274061990212, "grad_norm": 3.898505687713623, "learning_rate": 8.470225757633565e-08, "loss": 0.1201, "num_input_tokens_seen": 129176272, "step": 59855 }, { "epoch": 9.765089722675366, "grad_norm": 0.15890909731388092, "learning_rate": 8.411782904594879e-08, "loss": 0.0254, "num_input_tokens_seen": 129187504, "step": 59860 }, { "epoch": 9.765905383360522, "grad_norm": 0.06403691321611404, "learning_rate": 8.3535420323394e-08, "loss": 0.0052, "num_input_tokens_seen": 129198928, "step": 59865 }, { "epoch": 9.766721044045678, "grad_norm": 0.14090733230113983, "learning_rate": 8.295503145588357e-08, "loss": 0.0036, "num_input_tokens_seen": 129211024, "step": 59870 }, { "epoch": 9.767536704730832, "grad_norm": 5.934608459472656, "learning_rate": 8.237666249046593e-08, "loss": 0.0678, "num_input_tokens_seen": 129222096, "step": 59875 }, { "epoch": 9.768352365415987, "grad_norm": 3.298285484313965, "learning_rate": 8.180031347402583e-08, "loss": 0.2141, "num_input_tokens_seen": 129232880, "step": 59880 }, { "epoch": 9.769168026101141, "grad_norm": 0.13178037106990814, "learning_rate": 8.122598445328699e-08, "loss": 0.0047, "num_input_tokens_seen": 129244560, "step": 59885 }, { "epoch": 9.769983686786297, "grad_norm": 0.1279880404472351, "learning_rate": 8.065367547480384e-08, "loss": 0.0919, "num_input_tokens_seen": 129255824, "step": 59890 }, { "epoch": 9.770799347471453, "grad_norm": 0.9681717753410339, "learning_rate": 8.008338658497538e-08, "loss": 0.0922, "num_input_tokens_seen": 129267248, "step": 59895 }, { "epoch": 9.771615008156607, "grad_norm": 5.033680438995361, "learning_rate": 7.95151178300313e-08, "loss": 0.0105, "num_input_tokens_seen": 129277616, "step": 59900 }, { "epoch": 9.772430668841762, "grad_norm": 0.08865257352590561, "learning_rate": 7.894886925603473e-08, "loss": 0.0049, "num_input_tokens_seen": 129288112, "step": 59905 }, { "epoch": 9.773246329526916, "grad_norm": 0.807728111743927, "learning_rate": 7.838464090889342e-08, "loss": 0.2076, "num_input_tokens_seen": 129298064, "step": 59910 }, { "epoch": 9.774061990212072, "grad_norm": 0.055666111409664154, "learning_rate": 7.782243283434299e-08, "loss": 0.0025, "num_input_tokens_seen": 129307408, "step": 59915 }, { "epoch": 9.774877650897226, "grad_norm": 0.2182152420282364, "learning_rate": 7.726224507795809e-08, "loss": 0.1182, "num_input_tokens_seen": 129318128, "step": 59920 }, { "epoch": 9.775693311582382, "grad_norm": 0.09525197744369507, "learning_rate": 7.67040776851552e-08, "loss": 0.0035, "num_input_tokens_seen": 129328080, "step": 59925 }, { "epoch": 9.776508972267537, "grad_norm": 0.36281150579452515, "learning_rate": 7.614793070117865e-08, "loss": 0.1037, "num_input_tokens_seen": 129339120, "step": 59930 }, { "epoch": 9.777324632952691, "grad_norm": 9.132209777832031, "learning_rate": 7.559380417111184e-08, "loss": 0.1949, "num_input_tokens_seen": 129349648, "step": 59935 }, { "epoch": 9.778140293637847, "grad_norm": 0.17699457705020905, "learning_rate": 7.504169813987716e-08, "loss": 0.0053, "num_input_tokens_seen": 129360400, "step": 59940 }, { "epoch": 9.778955954323001, "grad_norm": 3.648254871368408, "learning_rate": 7.449161265223048e-08, "loss": 0.1654, "num_input_tokens_seen": 129373168, "step": 59945 }, { "epoch": 9.779771615008157, "grad_norm": 0.08858548104763031, "learning_rate": 7.394354775276391e-08, "loss": 0.1162, "num_input_tokens_seen": 129382736, "step": 59950 }, { "epoch": 9.780587275693312, "grad_norm": 0.2097587138414383, "learning_rate": 7.339750348590857e-08, "loss": 0.1244, "num_input_tokens_seen": 129394160, "step": 59955 }, { "epoch": 9.781402936378466, "grad_norm": 0.1020960733294487, "learning_rate": 7.285347989592628e-08, "loss": 0.0699, "num_input_tokens_seen": 129405744, "step": 59960 }, { "epoch": 9.782218597063622, "grad_norm": 0.2437201291322708, "learning_rate": 7.231147702692065e-08, "loss": 0.0067, "num_input_tokens_seen": 129416368, "step": 59965 }, { "epoch": 9.783034257748776, "grad_norm": 0.17274110019207, "learning_rate": 7.177149492282876e-08, "loss": 0.0042, "num_input_tokens_seen": 129426928, "step": 59970 }, { "epoch": 9.783849918433932, "grad_norm": 3.8773226737976074, "learning_rate": 7.123353362742391e-08, "loss": 0.168, "num_input_tokens_seen": 129438896, "step": 59975 }, { "epoch": 9.784665579119086, "grad_norm": 0.029082465916872025, "learning_rate": 7.069759318431567e-08, "loss": 0.0073, "num_input_tokens_seen": 129449808, "step": 59980 }, { "epoch": 9.785481239804241, "grad_norm": 0.15257468819618225, "learning_rate": 7.016367363694986e-08, "loss": 0.0472, "num_input_tokens_seen": 129461232, "step": 59985 }, { "epoch": 9.786296900489397, "grad_norm": 4.954206943511963, "learning_rate": 6.963177502861129e-08, "loss": 0.1873, "num_input_tokens_seen": 129471984, "step": 59990 }, { "epoch": 9.78711256117455, "grad_norm": 0.19747355580329895, "learning_rate": 6.910189740241269e-08, "loss": 0.0062, "num_input_tokens_seen": 129483088, "step": 59995 }, { "epoch": 9.787928221859707, "grad_norm": 0.11289242655038834, "learning_rate": 6.857404080131691e-08, "loss": 0.0078, "num_input_tokens_seen": 129492144, "step": 60000 }, { "epoch": 9.78874388254486, "grad_norm": 0.0487457811832428, "learning_rate": 6.804820526810917e-08, "loss": 0.0402, "num_input_tokens_seen": 129502128, "step": 60005 }, { "epoch": 9.789559543230016, "grad_norm": 0.14821499586105347, "learning_rate": 6.75243908454165e-08, "loss": 0.1213, "num_input_tokens_seen": 129512848, "step": 60010 }, { "epoch": 9.790375203915172, "grad_norm": 2.631269931793213, "learning_rate": 6.700259757570216e-08, "loss": 0.0793, "num_input_tokens_seen": 129522992, "step": 60015 }, { "epoch": 9.791190864600326, "grad_norm": 0.058667488396167755, "learning_rate": 6.648282550126562e-08, "loss": 0.0076, "num_input_tokens_seen": 129533328, "step": 60020 }, { "epoch": 9.792006525285482, "grad_norm": 0.10266431421041489, "learning_rate": 6.59650746642454e-08, "loss": 0.0035, "num_input_tokens_seen": 129542768, "step": 60025 }, { "epoch": 9.792822185970635, "grad_norm": 0.1482907086610794, "learning_rate": 6.544934510660794e-08, "loss": 0.1718, "num_input_tokens_seen": 129554448, "step": 60030 }, { "epoch": 9.793637846655791, "grad_norm": 5.6294684410095215, "learning_rate": 6.493563687016424e-08, "loss": 0.009, "num_input_tokens_seen": 129566736, "step": 60035 }, { "epoch": 9.794453507340947, "grad_norm": 0.07283642888069153, "learning_rate": 6.442394999655599e-08, "loss": 0.0081, "num_input_tokens_seen": 129577392, "step": 60040 }, { "epoch": 9.7952691680261, "grad_norm": 0.25296127796173096, "learning_rate": 6.391428452726389e-08, "loss": 0.0058, "num_input_tokens_seen": 129588080, "step": 60045 }, { "epoch": 9.796084828711257, "grad_norm": 0.08539183437824249, "learning_rate": 6.340664050360767e-08, "loss": 0.1011, "num_input_tokens_seen": 129599312, "step": 60050 }, { "epoch": 9.79690048939641, "grad_norm": 3.745346784591675, "learning_rate": 6.29010179667322e-08, "loss": 0.1297, "num_input_tokens_seen": 129609488, "step": 60055 }, { "epoch": 9.797716150081566, "grad_norm": 4.010807991027832, "learning_rate": 6.239741695763246e-08, "loss": 0.0944, "num_input_tokens_seen": 129620944, "step": 60060 }, { "epoch": 9.798531810766722, "grad_norm": 0.1772913634777069, "learning_rate": 6.189583751712857e-08, "loss": 0.0069, "num_input_tokens_seen": 129631920, "step": 60065 }, { "epoch": 9.799347471451876, "grad_norm": 0.0517205074429512, "learning_rate": 6.139627968588524e-08, "loss": 0.0051, "num_input_tokens_seen": 129643696, "step": 60070 }, { "epoch": 9.800163132137031, "grad_norm": 0.1302974820137024, "learning_rate": 6.089874350439506e-08, "loss": 0.1105, "num_input_tokens_seen": 129655184, "step": 60075 }, { "epoch": 9.800978792822185, "grad_norm": 0.10189558565616608, "learning_rate": 6.040322901299245e-08, "loss": 0.0031, "num_input_tokens_seen": 129665008, "step": 60080 }, { "epoch": 9.801794453507341, "grad_norm": 0.028916558250784874, "learning_rate": 5.990973625184526e-08, "loss": 0.0027, "num_input_tokens_seen": 129675344, "step": 60085 }, { "epoch": 9.802610114192497, "grad_norm": 3.744006633758545, "learning_rate": 5.9418265260960394e-08, "loss": 0.2226, "num_input_tokens_seen": 129687120, "step": 60090 }, { "epoch": 9.80342577487765, "grad_norm": 0.07970011234283447, "learning_rate": 5.892881608017819e-08, "loss": 0.0097, "num_input_tokens_seen": 129697936, "step": 60095 }, { "epoch": 9.804241435562806, "grad_norm": 0.13731755316257477, "learning_rate": 5.844138874917526e-08, "loss": 0.0071, "num_input_tokens_seen": 129707312, "step": 60100 }, { "epoch": 9.80505709624796, "grad_norm": 0.0961083248257637, "learning_rate": 5.795598330746721e-08, "loss": 0.0053, "num_input_tokens_seen": 129718256, "step": 60105 }, { "epoch": 9.805872756933116, "grad_norm": 0.39364707469940186, "learning_rate": 5.747259979440034e-08, "loss": 0.0087, "num_input_tokens_seen": 129728336, "step": 60110 }, { "epoch": 9.80668841761827, "grad_norm": 0.16876554489135742, "learning_rate": 5.699123824916275e-08, "loss": 0.0075, "num_input_tokens_seen": 129738960, "step": 60115 }, { "epoch": 9.807504078303426, "grad_norm": 3.1732735633850098, "learning_rate": 5.6511898710776e-08, "loss": 0.1868, "num_input_tokens_seen": 129748752, "step": 60120 }, { "epoch": 9.808319738988581, "grad_norm": 0.09069392085075378, "learning_rate": 5.603458121809513e-08, "loss": 0.0052, "num_input_tokens_seen": 129758864, "step": 60125 }, { "epoch": 9.809135399673735, "grad_norm": 15.149872779846191, "learning_rate": 5.555928580981418e-08, "loss": 0.1611, "num_input_tokens_seen": 129768112, "step": 60130 }, { "epoch": 9.809951060358891, "grad_norm": 0.11818689107894897, "learning_rate": 5.5086012524466216e-08, "loss": 0.0076, "num_input_tokens_seen": 129779440, "step": 60135 }, { "epoch": 9.810766721044045, "grad_norm": 0.18144717812538147, "learning_rate": 5.4614761400414996e-08, "loss": 0.0782, "num_input_tokens_seen": 129790320, "step": 60140 }, { "epoch": 9.8115823817292, "grad_norm": 0.2294866293668747, "learning_rate": 5.414553247586329e-08, "loss": 0.1056, "num_input_tokens_seen": 129799792, "step": 60145 }, { "epoch": 9.812398042414356, "grad_norm": 7.488786220550537, "learning_rate": 5.367832578884735e-08, "loss": 0.18, "num_input_tokens_seen": 129811472, "step": 60150 }, { "epoch": 9.81321370309951, "grad_norm": 0.09118147194385529, "learning_rate": 5.3213141377245205e-08, "loss": 0.0047, "num_input_tokens_seen": 129822512, "step": 60155 }, { "epoch": 9.814029363784666, "grad_norm": 0.48315927386283875, "learning_rate": 5.2749979278762794e-08, "loss": 0.0061, "num_input_tokens_seen": 129834608, "step": 60160 }, { "epoch": 9.81484502446982, "grad_norm": 0.08426214009523392, "learning_rate": 5.228883953094788e-08, "loss": 0.1253, "num_input_tokens_seen": 129845008, "step": 60165 }, { "epoch": 9.815660685154976, "grad_norm": 3.5015861988067627, "learning_rate": 5.182972217118165e-08, "loss": 0.1164, "num_input_tokens_seen": 129854928, "step": 60170 }, { "epoch": 9.81647634584013, "grad_norm": 7.503712177276611, "learning_rate": 5.137262723668712e-08, "loss": 0.0292, "num_input_tokens_seen": 129865904, "step": 60175 }, { "epoch": 9.817292006525285, "grad_norm": 0.10699860006570816, "learning_rate": 5.0917554764515206e-08, "loss": 0.1108, "num_input_tokens_seen": 129878288, "step": 60180 }, { "epoch": 9.818107667210441, "grad_norm": 0.04898899421095848, "learning_rate": 5.0464504791553066e-08, "loss": 0.0695, "num_input_tokens_seen": 129890512, "step": 60185 }, { "epoch": 9.818923327895595, "grad_norm": 0.100347600877285, "learning_rate": 5.001347735453521e-08, "loss": 0.1822, "num_input_tokens_seen": 129900560, "step": 60190 }, { "epoch": 9.81973898858075, "grad_norm": 0.02564556896686554, "learning_rate": 4.95644724900185e-08, "loss": 0.0029, "num_input_tokens_seen": 129912272, "step": 60195 }, { "epoch": 9.820554649265905, "grad_norm": 0.06976073980331421, "learning_rate": 4.91174902344016e-08, "loss": 0.1059, "num_input_tokens_seen": 129923120, "step": 60200 }, { "epoch": 9.82137030995106, "grad_norm": 0.08213542401790619, "learning_rate": 4.867253062391941e-08, "loss": 0.0031, "num_input_tokens_seen": 129935280, "step": 60205 }, { "epoch": 9.822185970636216, "grad_norm": 0.016145285218954086, "learning_rate": 4.822959369464586e-08, "loss": 0.0025, "num_input_tokens_seen": 129946032, "step": 60210 }, { "epoch": 9.82300163132137, "grad_norm": 0.3220311403274536, "learning_rate": 4.7788679482485556e-08, "loss": 0.0115, "num_input_tokens_seen": 129956784, "step": 60215 }, { "epoch": 9.823817292006526, "grad_norm": 0.16524042189121246, "learning_rate": 4.734978802318213e-08, "loss": 0.0046, "num_input_tokens_seen": 129967504, "step": 60220 }, { "epoch": 9.82463295269168, "grad_norm": 0.40293651819229126, "learning_rate": 4.69129193523099e-08, "loss": 0.1095, "num_input_tokens_seen": 129977840, "step": 60225 }, { "epoch": 9.825448613376835, "grad_norm": 0.07762454450130463, "learning_rate": 4.6478073505290544e-08, "loss": 0.0092, "num_input_tokens_seen": 129988080, "step": 60230 }, { "epoch": 9.826264274061991, "grad_norm": 0.15140385925769806, "learning_rate": 4.6045250517370854e-08, "loss": 0.0033, "num_input_tokens_seen": 129999728, "step": 60235 }, { "epoch": 9.827079934747145, "grad_norm": 0.12313911318778992, "learning_rate": 4.561445042363666e-08, "loss": 0.0932, "num_input_tokens_seen": 130010352, "step": 60240 }, { "epoch": 9.8278955954323, "grad_norm": 0.23499034345149994, "learning_rate": 4.518567325901279e-08, "loss": 0.1038, "num_input_tokens_seen": 130021776, "step": 60245 }, { "epoch": 9.828711256117455, "grad_norm": 0.03810376301407814, "learning_rate": 4.475891905825758e-08, "loss": 0.0047, "num_input_tokens_seen": 130032304, "step": 60250 }, { "epoch": 9.82952691680261, "grad_norm": 0.076126828789711, "learning_rate": 4.4334187855968326e-08, "loss": 0.1281, "num_input_tokens_seen": 130041840, "step": 60255 }, { "epoch": 9.830342577487766, "grad_norm": 0.05613238364458084, "learning_rate": 4.391147968657028e-08, "loss": 0.0048, "num_input_tokens_seen": 130051216, "step": 60260 }, { "epoch": 9.83115823817292, "grad_norm": 17.134361267089844, "learning_rate": 4.3490794584336024e-08, "loss": 0.0249, "num_input_tokens_seen": 130061872, "step": 60265 }, { "epoch": 9.831973898858076, "grad_norm": 0.05618003383278847, "learning_rate": 4.307213258336606e-08, "loss": 0.0035, "num_input_tokens_seen": 130072944, "step": 60270 }, { "epoch": 9.83278955954323, "grad_norm": 20.586030960083008, "learning_rate": 4.2655493717597137e-08, "loss": 0.1361, "num_input_tokens_seen": 130082704, "step": 60275 }, { "epoch": 9.833605220228385, "grad_norm": 0.5808132886886597, "learning_rate": 4.224087802080778e-08, "loss": 0.0055, "num_input_tokens_seen": 130093936, "step": 60280 }, { "epoch": 9.83442088091354, "grad_norm": 0.12000591307878494, "learning_rate": 4.182828552660722e-08, "loss": 0.0049, "num_input_tokens_seen": 130103888, "step": 60285 }, { "epoch": 9.835236541598695, "grad_norm": 0.11502066254615784, "learning_rate": 4.141771626844093e-08, "loss": 0.005, "num_input_tokens_seen": 130114864, "step": 60290 }, { "epoch": 9.83605220228385, "grad_norm": 0.12205211073160172, "learning_rate": 4.100917027959617e-08, "loss": 0.1241, "num_input_tokens_seen": 130125136, "step": 60295 }, { "epoch": 9.836867862969005, "grad_norm": 0.12927758693695068, "learning_rate": 4.0602647593185325e-08, "loss": 0.0072, "num_input_tokens_seen": 130136240, "step": 60300 }, { "epoch": 9.83768352365416, "grad_norm": 0.07437407970428467, "learning_rate": 4.0198148242168163e-08, "loss": 0.1205, "num_input_tokens_seen": 130146864, "step": 60305 }, { "epoch": 9.838499184339314, "grad_norm": 0.05137110501527786, "learning_rate": 3.979567225933234e-08, "loss": 0.0067, "num_input_tokens_seen": 130156560, "step": 60310 }, { "epoch": 9.83931484502447, "grad_norm": 0.12136679887771606, "learning_rate": 3.939521967730731e-08, "loss": 0.2939, "num_input_tokens_seen": 130167664, "step": 60315 }, { "epoch": 9.840130505709626, "grad_norm": 0.06857368350028992, "learning_rate": 3.8996790528555985e-08, "loss": 0.1693, "num_input_tokens_seen": 130178992, "step": 60320 }, { "epoch": 9.84094616639478, "grad_norm": 0.08583206683397293, "learning_rate": 3.860038484537476e-08, "loss": 0.1008, "num_input_tokens_seen": 130188944, "step": 60325 }, { "epoch": 9.841761827079935, "grad_norm": 0.19351406395435333, "learning_rate": 3.820600265989904e-08, "loss": 0.0969, "num_input_tokens_seen": 130199408, "step": 60330 }, { "epoch": 9.84257748776509, "grad_norm": 0.27275794744491577, "learning_rate": 3.78136440040977e-08, "loss": 0.0939, "num_input_tokens_seen": 130211152, "step": 60335 }, { "epoch": 9.843393148450245, "grad_norm": 0.14788496494293213, "learning_rate": 3.742330890978141e-08, "loss": 0.0939, "num_input_tokens_seen": 130222192, "step": 60340 }, { "epoch": 9.844208809135399, "grad_norm": 0.17083828151226044, "learning_rate": 3.703499740859151e-08, "loss": 0.0094, "num_input_tokens_seen": 130232912, "step": 60345 }, { "epoch": 9.845024469820554, "grad_norm": 0.0933980867266655, "learning_rate": 3.6648709532002835e-08, "loss": 0.0043, "num_input_tokens_seen": 130242704, "step": 60350 }, { "epoch": 9.84584013050571, "grad_norm": 0.12900620698928833, "learning_rate": 3.6264445311334774e-08, "loss": 0.0423, "num_input_tokens_seen": 130254224, "step": 60355 }, { "epoch": 9.846655791190864, "grad_norm": 0.12694202363491058, "learning_rate": 3.588220477773463e-08, "loss": 0.1178, "num_input_tokens_seen": 130265392, "step": 60360 }, { "epoch": 9.84747145187602, "grad_norm": 0.23511680960655212, "learning_rate": 3.5501987962191505e-08, "loss": 0.0045, "num_input_tokens_seen": 130276048, "step": 60365 }, { "epoch": 9.848287112561174, "grad_norm": 0.042450256645679474, "learning_rate": 3.5123794895522425e-08, "loss": 0.0026, "num_input_tokens_seen": 130287472, "step": 60370 }, { "epoch": 9.84910277324633, "grad_norm": 0.5879544615745544, "learning_rate": 3.4747625608391735e-08, "loss": 0.0049, "num_input_tokens_seen": 130298832, "step": 60375 }, { "epoch": 9.849918433931485, "grad_norm": 0.1835554540157318, "learning_rate": 3.4373480131288936e-08, "loss": 0.0038, "num_input_tokens_seen": 130308432, "step": 60380 }, { "epoch": 9.850734094616639, "grad_norm": 0.15611405670642853, "learning_rate": 3.400135849454811e-08, "loss": 0.1483, "num_input_tokens_seen": 130318544, "step": 60385 }, { "epoch": 9.851549755301795, "grad_norm": 0.11268353462219238, "learning_rate": 3.363126072833123e-08, "loss": 0.2281, "num_input_tokens_seen": 130328176, "step": 60390 }, { "epoch": 9.852365415986949, "grad_norm": 11.306412696838379, "learning_rate": 3.326318686264485e-08, "loss": 0.1049, "num_input_tokens_seen": 130338896, "step": 60395 }, { "epoch": 9.853181076672104, "grad_norm": 0.05320761352777481, "learning_rate": 3.2897136927323436e-08, "loss": 0.0933, "num_input_tokens_seen": 130350160, "step": 60400 }, { "epoch": 9.85399673735726, "grad_norm": 0.08861350268125534, "learning_rate": 3.253311095204048e-08, "loss": 0.004, "num_input_tokens_seen": 130360976, "step": 60405 }, { "epoch": 9.854812398042414, "grad_norm": 0.07633858174085617, "learning_rate": 3.2171108966308486e-08, "loss": 0.0029, "num_input_tokens_seen": 130371184, "step": 60410 }, { "epoch": 9.85562805872757, "grad_norm": 0.1442979872226715, "learning_rate": 3.1811130999473415e-08, "loss": 0.0056, "num_input_tokens_seen": 130381168, "step": 60415 }, { "epoch": 9.856443719412724, "grad_norm": 0.22508582472801208, "learning_rate": 3.145317708071194e-08, "loss": 0.0039, "num_input_tokens_seen": 130390544, "step": 60420 }, { "epoch": 9.85725938009788, "grad_norm": 0.05943860858678818, "learning_rate": 3.1097247239048057e-08, "loss": 0.0036, "num_input_tokens_seen": 130401232, "step": 60425 }, { "epoch": 9.858075040783035, "grad_norm": 0.08935709297657013, "learning_rate": 3.074334150333091e-08, "loss": 0.0031, "num_input_tokens_seen": 130412560, "step": 60430 }, { "epoch": 9.858890701468189, "grad_norm": 0.06829848140478134, "learning_rate": 3.039145990225145e-08, "loss": 0.0875, "num_input_tokens_seen": 130423472, "step": 60435 }, { "epoch": 9.859706362153345, "grad_norm": 0.30248787999153137, "learning_rate": 3.0041602464334076e-08, "loss": 0.0062, "num_input_tokens_seen": 130433008, "step": 60440 }, { "epoch": 9.860522022838499, "grad_norm": 4.402347087860107, "learning_rate": 2.9693769217942203e-08, "loss": 0.1105, "num_input_tokens_seen": 130445776, "step": 60445 }, { "epoch": 9.861337683523654, "grad_norm": 3.2330546379089355, "learning_rate": 2.9347960191269952e-08, "loss": 0.2335, "num_input_tokens_seen": 130456720, "step": 60450 }, { "epoch": 9.86215334420881, "grad_norm": 0.17700782418251038, "learning_rate": 2.900417541235323e-08, "loss": 0.0091, "num_input_tokens_seen": 130468176, "step": 60455 }, { "epoch": 9.862969004893964, "grad_norm": 0.05328657478094101, "learning_rate": 2.8662414909058634e-08, "loss": 0.0154, "num_input_tokens_seen": 130479344, "step": 60460 }, { "epoch": 9.86378466557912, "grad_norm": 0.07495290786027908, "learning_rate": 2.8322678709094553e-08, "loss": 0.0159, "num_input_tokens_seen": 130490672, "step": 60465 }, { "epoch": 9.864600326264274, "grad_norm": 0.06120337173342705, "learning_rate": 2.7984966839997294e-08, "loss": 0.0036, "num_input_tokens_seen": 130502160, "step": 60470 }, { "epoch": 9.86541598694943, "grad_norm": 0.04699868708848953, "learning_rate": 2.7649279329142185e-08, "loss": 0.0029, "num_input_tokens_seen": 130514160, "step": 60475 }, { "epoch": 9.866231647634583, "grad_norm": 0.05974680557847023, "learning_rate": 2.7315616203749118e-08, "loss": 0.1149, "num_input_tokens_seen": 130524336, "step": 60480 }, { "epoch": 9.867047308319739, "grad_norm": 0.057295531034469604, "learning_rate": 2.6983977490860345e-08, "loss": 0.171, "num_input_tokens_seen": 130535440, "step": 60485 }, { "epoch": 9.867862969004895, "grad_norm": 0.044656090438365936, "learning_rate": 2.6654363217362698e-08, "loss": 0.1006, "num_input_tokens_seen": 130546832, "step": 60490 }, { "epoch": 9.868678629690049, "grad_norm": 0.09445744752883911, "learning_rate": 2.632677340997647e-08, "loss": 0.0058, "num_input_tokens_seen": 130557168, "step": 60495 }, { "epoch": 9.869494290375204, "grad_norm": 0.26121169328689575, "learning_rate": 2.6001208095258188e-08, "loss": 0.0071, "num_input_tokens_seen": 130567664, "step": 60500 }, { "epoch": 9.870309951060358, "grad_norm": 0.3801686465740204, "learning_rate": 2.5677667299597863e-08, "loss": 0.0044, "num_input_tokens_seen": 130577520, "step": 60505 }, { "epoch": 9.871125611745514, "grad_norm": 0.1790764480829239, "learning_rate": 2.5356151049221734e-08, "loss": 0.0052, "num_input_tokens_seen": 130589328, "step": 60510 }, { "epoch": 9.87194127243067, "grad_norm": 5.141573905944824, "learning_rate": 2.5036659370197836e-08, "loss": 0.0888, "num_input_tokens_seen": 130599760, "step": 60515 }, { "epoch": 9.872756933115824, "grad_norm": 0.644772469997406, "learning_rate": 2.4719192288424896e-08, "loss": 0.008, "num_input_tokens_seen": 130610576, "step": 60520 }, { "epoch": 9.87357259380098, "grad_norm": 0.12314771860837936, "learning_rate": 2.440374982963789e-08, "loss": 0.063, "num_input_tokens_seen": 130621168, "step": 60525 }, { "epoch": 9.874388254486133, "grad_norm": 0.024125229567289352, "learning_rate": 2.409033201940525e-08, "loss": 0.0063, "num_input_tokens_seen": 130630736, "step": 60530 }, { "epoch": 9.875203915171289, "grad_norm": 0.2039998471736908, "learning_rate": 2.3778938883139977e-08, "loss": 0.0713, "num_input_tokens_seen": 130641840, "step": 60535 }, { "epoch": 9.876019575856443, "grad_norm": 0.05761853605508804, "learning_rate": 2.3469570446080223e-08, "loss": 0.0062, "num_input_tokens_seen": 130652816, "step": 60540 }, { "epoch": 9.876835236541599, "grad_norm": 0.11537981778383255, "learning_rate": 2.3162226733305925e-08, "loss": 0.0074, "num_input_tokens_seen": 130663248, "step": 60545 }, { "epoch": 9.877650897226754, "grad_norm": 0.06408042460680008, "learning_rate": 2.2856907769736037e-08, "loss": 0.0311, "num_input_tokens_seen": 130675184, "step": 60550 }, { "epoch": 9.878466557911908, "grad_norm": 3.4832675457000732, "learning_rate": 2.255361358011465e-08, "loss": 0.0079, "num_input_tokens_seen": 130686576, "step": 60555 }, { "epoch": 9.879282218597064, "grad_norm": 0.15520964562892914, "learning_rate": 2.2252344189033213e-08, "loss": 0.0038, "num_input_tokens_seen": 130696240, "step": 60560 }, { "epoch": 9.880097879282218, "grad_norm": 3.5604052543640137, "learning_rate": 2.1953099620911076e-08, "loss": 0.0101, "num_input_tokens_seen": 130706064, "step": 60565 }, { "epoch": 9.880913539967374, "grad_norm": 0.12187746912240982, "learning_rate": 2.165587990000939e-08, "loss": 0.0063, "num_input_tokens_seen": 130716720, "step": 60570 }, { "epoch": 9.88172920065253, "grad_norm": 0.27157095074653625, "learning_rate": 2.1360685050419994e-08, "loss": 0.1446, "num_input_tokens_seen": 130726288, "step": 60575 }, { "epoch": 9.882544861337683, "grad_norm": 0.08113939315080643, "learning_rate": 2.106751509607374e-08, "loss": 0.0055, "num_input_tokens_seen": 130736368, "step": 60580 }, { "epoch": 9.883360522022839, "grad_norm": 0.10311729460954666, "learning_rate": 2.0776370060737737e-08, "loss": 0.1237, "num_input_tokens_seen": 130746160, "step": 60585 }, { "epoch": 9.884176182707993, "grad_norm": 0.1000794768333435, "learning_rate": 2.0487249968012546e-08, "loss": 0.0123, "num_input_tokens_seen": 130755728, "step": 60590 }, { "epoch": 9.884991843393149, "grad_norm": 0.04848021641373634, "learning_rate": 2.020015484133497e-08, "loss": 0.0051, "num_input_tokens_seen": 130767088, "step": 60595 }, { "epoch": 9.885807504078304, "grad_norm": 0.1445668786764145, "learning_rate": 1.9915084703980845e-08, "loss": 0.0079, "num_input_tokens_seen": 130776944, "step": 60600 }, { "epoch": 9.886623164763458, "grad_norm": 0.26100313663482666, "learning_rate": 1.9632039579053907e-08, "loss": 0.0172, "num_input_tokens_seen": 130786480, "step": 60605 }, { "epoch": 9.887438825448614, "grad_norm": 0.058276962488889694, "learning_rate": 1.935101948950524e-08, "loss": 0.0161, "num_input_tokens_seen": 130797968, "step": 60610 }, { "epoch": 9.888254486133768, "grad_norm": 10.515925407409668, "learning_rate": 1.9072024458113847e-08, "loss": 0.0844, "num_input_tokens_seen": 130808272, "step": 60615 }, { "epoch": 9.889070146818923, "grad_norm": 0.08804245293140411, "learning_rate": 1.8795054507494967e-08, "loss": 0.1519, "num_input_tokens_seen": 130817488, "step": 60620 }, { "epoch": 9.88988580750408, "grad_norm": 0.04314444214105606, "learning_rate": 1.852010966010287e-08, "loss": 0.0036, "num_input_tokens_seen": 130828432, "step": 60625 }, { "epoch": 9.890701468189233, "grad_norm": 0.06376510858535767, "learning_rate": 1.8247189938225274e-08, "loss": 0.129, "num_input_tokens_seen": 130838416, "step": 60630 }, { "epoch": 9.891517128874389, "grad_norm": 0.2041330337524414, "learning_rate": 1.7976295363988927e-08, "loss": 0.0052, "num_input_tokens_seen": 130847920, "step": 60635 }, { "epoch": 9.892332789559543, "grad_norm": 4.164166450500488, "learning_rate": 1.7707425959348488e-08, "loss": 0.2132, "num_input_tokens_seen": 130859184, "step": 60640 }, { "epoch": 9.893148450244698, "grad_norm": 0.05904705449938774, "learning_rate": 1.744058174610319e-08, "loss": 0.0067, "num_input_tokens_seen": 130869808, "step": 60645 }, { "epoch": 9.893964110929852, "grad_norm": 0.11198320239782333, "learning_rate": 1.7175762745885727e-08, "loss": 0.0046, "num_input_tokens_seen": 130880912, "step": 60650 }, { "epoch": 9.894779771615008, "grad_norm": 6.100296974182129, "learning_rate": 1.6912968980162257e-08, "loss": 0.0115, "num_input_tokens_seen": 130892336, "step": 60655 }, { "epoch": 9.895595432300164, "grad_norm": 0.1114816814661026, "learning_rate": 1.665220047023519e-08, "loss": 0.0078, "num_input_tokens_seen": 130900752, "step": 60660 }, { "epoch": 9.896411092985318, "grad_norm": 0.062797412276268, "learning_rate": 1.639345723724872e-08, "loss": 0.1296, "num_input_tokens_seen": 130911600, "step": 60665 }, { "epoch": 9.897226753670473, "grad_norm": 0.08252307027578354, "learning_rate": 1.6136739302169412e-08, "loss": 0.0045, "num_input_tokens_seen": 130922640, "step": 60670 }, { "epoch": 9.898042414355627, "grad_norm": 14.04305362701416, "learning_rate": 1.588204668581672e-08, "loss": 0.0549, "num_input_tokens_seen": 130933136, "step": 60675 }, { "epoch": 9.898858075040783, "grad_norm": 0.21598735451698303, "learning_rate": 1.5629379408832468e-08, "loss": 0.02, "num_input_tokens_seen": 130944336, "step": 60680 }, { "epoch": 9.899673735725939, "grad_norm": 0.047149281948804855, "learning_rate": 1.537873749169749e-08, "loss": 0.1066, "num_input_tokens_seen": 130952848, "step": 60685 }, { "epoch": 9.900489396411093, "grad_norm": 0.06601323932409286, "learning_rate": 1.513012095473443e-08, "loss": 0.1251, "num_input_tokens_seen": 130963536, "step": 60690 }, { "epoch": 9.901305057096248, "grad_norm": 0.13944591581821442, "learning_rate": 1.4883529818096598e-08, "loss": 0.23, "num_input_tokens_seen": 130973840, "step": 60695 }, { "epoch": 9.902120717781402, "grad_norm": 0.1133158877491951, "learning_rate": 1.4638964101773568e-08, "loss": 0.0043, "num_input_tokens_seen": 130984304, "step": 60700 }, { "epoch": 9.902936378466558, "grad_norm": 0.11855597794055939, "learning_rate": 1.4396423825588367e-08, "loss": 0.0057, "num_input_tokens_seen": 130995504, "step": 60705 }, { "epoch": 9.903752039151712, "grad_norm": 3.568680763244629, "learning_rate": 1.4155909009205826e-08, "loss": 0.1106, "num_input_tokens_seen": 131005968, "step": 60710 }, { "epoch": 9.904567699836868, "grad_norm": 0.0719594955444336, "learning_rate": 1.3917419672124233e-08, "loss": 0.0043, "num_input_tokens_seen": 131018160, "step": 60715 }, { "epoch": 9.905383360522023, "grad_norm": 0.3674751818180084, "learning_rate": 1.368095583367257e-08, "loss": 0.0835, "num_input_tokens_seen": 131029008, "step": 60720 }, { "epoch": 9.906199021207177, "grad_norm": 0.3260965645313263, "learning_rate": 1.3446517513021617e-08, "loss": 0.0512, "num_input_tokens_seen": 131038960, "step": 60725 }, { "epoch": 9.907014681892333, "grad_norm": 0.09662462770938873, "learning_rate": 1.321410472917839e-08, "loss": 0.0058, "num_input_tokens_seen": 131050160, "step": 60730 }, { "epoch": 9.907830342577487, "grad_norm": 11.745221138000488, "learning_rate": 1.2983717500977822e-08, "loss": 0.0256, "num_input_tokens_seen": 131060048, "step": 60735 }, { "epoch": 9.908646003262643, "grad_norm": 5.630153656005859, "learning_rate": 1.275535584710219e-08, "loss": 0.1848, "num_input_tokens_seen": 131069360, "step": 60740 }, { "epoch": 9.909461663947798, "grad_norm": 0.08159533888101578, "learning_rate": 1.2529019786061691e-08, "loss": 0.0065, "num_input_tokens_seen": 131079888, "step": 60745 }, { "epoch": 9.910277324632952, "grad_norm": 0.3844931721687317, "learning_rate": 1.2304709336205533e-08, "loss": 0.0058, "num_input_tokens_seen": 131090512, "step": 60750 }, { "epoch": 9.911092985318108, "grad_norm": 0.07297611981630325, "learning_rate": 1.2082424515713619e-08, "loss": 0.1885, "num_input_tokens_seen": 131101680, "step": 60755 }, { "epoch": 9.911908646003262, "grad_norm": 0.09410180151462555, "learning_rate": 1.1862165342607645e-08, "loss": 0.0615, "num_input_tokens_seen": 131112336, "step": 60760 }, { "epoch": 9.912724306688418, "grad_norm": 0.14489737153053284, "learning_rate": 1.1643931834745548e-08, "loss": 0.0077, "num_input_tokens_seen": 131123088, "step": 60765 }, { "epoch": 9.913539967373573, "grad_norm": 0.6937752366065979, "learning_rate": 1.1427724009813179e-08, "loss": 0.0498, "num_input_tokens_seen": 131132368, "step": 60770 }, { "epoch": 9.914355628058727, "grad_norm": 0.08811195194721222, "learning_rate": 1.1213541885340962e-08, "loss": 0.0106, "num_input_tokens_seen": 131143600, "step": 60775 }, { "epoch": 9.915171288743883, "grad_norm": 0.07232557237148285, "learning_rate": 1.1001385478692783e-08, "loss": 0.0036, "num_input_tokens_seen": 131153872, "step": 60780 }, { "epoch": 9.915986949429037, "grad_norm": 0.056486278772354126, "learning_rate": 1.0791254807063223e-08, "loss": 0.0977, "num_input_tokens_seen": 131164848, "step": 60785 }, { "epoch": 9.916802610114193, "grad_norm": 0.14884202182292938, "learning_rate": 1.0583149887488653e-08, "loss": 0.0057, "num_input_tokens_seen": 131175280, "step": 60790 }, { "epoch": 9.917618270799348, "grad_norm": 0.3710552752017975, "learning_rate": 1.0377070736838912e-08, "loss": 0.0042, "num_input_tokens_seen": 131185776, "step": 60795 }, { "epoch": 9.918433931484502, "grad_norm": 12.366984367370605, "learning_rate": 1.017301737182008e-08, "loss": 0.0479, "num_input_tokens_seen": 131196944, "step": 60800 }, { "epoch": 9.919249592169658, "grad_norm": 0.05777287483215332, "learning_rate": 9.970989808974485e-09, "loss": 0.0048, "num_input_tokens_seen": 131208752, "step": 60805 }, { "epoch": 9.920065252854812, "grad_norm": 0.15197744965553284, "learning_rate": 9.77098806467791e-09, "loss": 0.007, "num_input_tokens_seen": 131217360, "step": 60810 }, { "epoch": 9.920880913539968, "grad_norm": 0.06549471616744995, "learning_rate": 9.573012155145166e-09, "loss": 0.003, "num_input_tokens_seen": 131228304, "step": 60815 }, { "epoch": 9.921696574225122, "grad_norm": 0.08914361149072647, "learning_rate": 9.37706209642175e-09, "loss": 0.0324, "num_input_tokens_seen": 131239568, "step": 60820 }, { "epoch": 9.922512234910277, "grad_norm": 3.147571563720703, "learning_rate": 9.183137904397732e-09, "loss": 0.1684, "num_input_tokens_seen": 131250576, "step": 60825 }, { "epoch": 9.923327895595433, "grad_norm": 0.07605230063199997, "learning_rate": 8.991239594788315e-09, "loss": 0.0946, "num_input_tokens_seen": 131261616, "step": 60830 }, { "epoch": 9.924143556280587, "grad_norm": 0.07768086344003677, "learning_rate": 8.801367183153276e-09, "loss": 0.1195, "num_input_tokens_seen": 131272400, "step": 60835 }, { "epoch": 9.924959216965743, "grad_norm": 0.1955137997865677, "learning_rate": 8.61352068488308e-09, "loss": 0.1882, "num_input_tokens_seen": 131283280, "step": 60840 }, { "epoch": 9.925774877650896, "grad_norm": 0.11185144633054733, "learning_rate": 8.427700115207216e-09, "loss": 0.005, "num_input_tokens_seen": 131293808, "step": 60845 }, { "epoch": 9.926590538336052, "grad_norm": 0.0732840970158577, "learning_rate": 8.243905489185855e-09, "loss": 0.0054, "num_input_tokens_seen": 131304016, "step": 60850 }, { "epoch": 9.927406199021208, "grad_norm": 0.13591451942920685, "learning_rate": 8.062136821723742e-09, "loss": 0.1467, "num_input_tokens_seen": 131314032, "step": 60855 }, { "epoch": 9.928221859706362, "grad_norm": 0.18943621218204498, "learning_rate": 7.882394127550763e-09, "loss": 0.1236, "num_input_tokens_seen": 131324112, "step": 60860 }, { "epoch": 9.929037520391518, "grad_norm": 0.09203652292490005, "learning_rate": 7.704677421238593e-09, "loss": 0.1006, "num_input_tokens_seen": 131334352, "step": 60865 }, { "epoch": 9.929853181076671, "grad_norm": 0.5394014120101929, "learning_rate": 7.528986717195152e-09, "loss": 0.2849, "num_input_tokens_seen": 131344656, "step": 60870 }, { "epoch": 9.930668841761827, "grad_norm": 0.402437686920166, "learning_rate": 7.355322029661826e-09, "loss": 0.0323, "num_input_tokens_seen": 131355984, "step": 60875 }, { "epoch": 9.931484502446983, "grad_norm": 0.08685159683227539, "learning_rate": 7.183683372719019e-09, "loss": 0.0141, "num_input_tokens_seen": 131367056, "step": 60880 }, { "epoch": 9.932300163132137, "grad_norm": 2.5764238834381104, "learning_rate": 7.0140707602805995e-09, "loss": 0.0777, "num_input_tokens_seen": 131378192, "step": 60885 }, { "epoch": 9.933115823817293, "grad_norm": 0.03751058876514435, "learning_rate": 6.846484206091131e-09, "loss": 0.2534, "num_input_tokens_seen": 131389040, "step": 60890 }, { "epoch": 9.933931484502446, "grad_norm": 0.20254509150981903, "learning_rate": 6.6809237237425156e-09, "loss": 0.0498, "num_input_tokens_seen": 131399664, "step": 60895 }, { "epoch": 9.934747145187602, "grad_norm": 0.194112628698349, "learning_rate": 6.517389326651801e-09, "loss": 0.0091, "num_input_tokens_seen": 131409680, "step": 60900 }, { "epoch": 9.935562805872756, "grad_norm": 0.3252966105937958, "learning_rate": 6.3558810280778254e-09, "loss": 0.006, "num_input_tokens_seen": 131420240, "step": 60905 }, { "epoch": 9.936378466557912, "grad_norm": 1.819018840789795, "learning_rate": 6.196398841112893e-09, "loss": 0.0073, "num_input_tokens_seen": 131429488, "step": 60910 }, { "epoch": 9.937194127243067, "grad_norm": 0.1418514996767044, "learning_rate": 6.038942778685553e-09, "loss": 0.1406, "num_input_tokens_seen": 131439792, "step": 60915 }, { "epoch": 9.938009787928221, "grad_norm": 0.05585205927491188, "learning_rate": 5.883512853557816e-09, "loss": 0.0762, "num_input_tokens_seen": 131448976, "step": 60920 }, { "epoch": 9.938825448613377, "grad_norm": 0.11939923465251923, "learning_rate": 5.730109078330714e-09, "loss": 0.0109, "num_input_tokens_seen": 131459088, "step": 60925 }, { "epoch": 9.939641109298531, "grad_norm": 0.08607755601406097, "learning_rate": 5.578731465444298e-09, "loss": 0.0796, "num_input_tokens_seen": 131470096, "step": 60930 }, { "epoch": 9.940456769983687, "grad_norm": 3.4439589977264404, "learning_rate": 5.429380027163755e-09, "loss": 0.1152, "num_input_tokens_seen": 131479376, "step": 60935 }, { "epoch": 9.941272430668842, "grad_norm": 0.13130417466163635, "learning_rate": 5.2820547755988434e-09, "loss": 0.0122, "num_input_tokens_seen": 131491792, "step": 60940 }, { "epoch": 9.942088091353996, "grad_norm": 0.037019938230514526, "learning_rate": 5.1367557226927875e-09, "loss": 0.0021, "num_input_tokens_seen": 131502512, "step": 60945 }, { "epoch": 9.942903752039152, "grad_norm": 3.488393783569336, "learning_rate": 4.9934828802250535e-09, "loss": 0.2115, "num_input_tokens_seen": 131514224, "step": 60950 }, { "epoch": 9.943719412724306, "grad_norm": 0.09522642195224762, "learning_rate": 4.852236259805798e-09, "loss": 0.096, "num_input_tokens_seen": 131525840, "step": 60955 }, { "epoch": 9.944535073409462, "grad_norm": 0.09075871109962463, "learning_rate": 4.7130158728925236e-09, "loss": 0.0079, "num_input_tokens_seen": 131536208, "step": 60960 }, { "epoch": 9.945350734094617, "grad_norm": 3.328155994415283, "learning_rate": 4.575821730765095e-09, "loss": 0.1114, "num_input_tokens_seen": 131547344, "step": 60965 }, { "epoch": 9.946166394779771, "grad_norm": 3.054372549057007, "learning_rate": 4.440653844545173e-09, "loss": 0.0839, "num_input_tokens_seen": 131558544, "step": 60970 }, { "epoch": 9.946982055464927, "grad_norm": 0.0750637948513031, "learning_rate": 4.307512225196209e-09, "loss": 0.1256, "num_input_tokens_seen": 131569136, "step": 60975 }, { "epoch": 9.947797716150081, "grad_norm": 0.23475918173789978, "learning_rate": 4.176396883504019e-09, "loss": 0.1248, "num_input_tokens_seen": 131580848, "step": 60980 }, { "epoch": 9.948613376835237, "grad_norm": 0.05277956649661064, "learning_rate": 4.047307830101765e-09, "loss": 0.0022, "num_input_tokens_seen": 131591664, "step": 60985 }, { "epoch": 9.949429037520392, "grad_norm": 0.23914973437786102, "learning_rate": 3.9202450754533e-09, "loss": 0.1028, "num_input_tokens_seen": 131602032, "step": 60990 }, { "epoch": 9.950244698205546, "grad_norm": 3.7414486408233643, "learning_rate": 3.79520862985594e-09, "loss": 0.1077, "num_input_tokens_seen": 131613520, "step": 60995 }, { "epoch": 9.951060358890702, "grad_norm": 0.11124800145626068, "learning_rate": 3.6721985034515738e-09, "loss": 0.0221, "num_input_tokens_seen": 131624624, "step": 61000 }, { "epoch": 9.951876019575856, "grad_norm": 0.16320769488811493, "learning_rate": 3.5512147062072287e-09, "loss": 0.0057, "num_input_tokens_seen": 131634416, "step": 61005 }, { "epoch": 9.952691680261012, "grad_norm": 0.25731217861175537, "learning_rate": 3.4322572479345005e-09, "loss": 0.0025, "num_input_tokens_seen": 131645232, "step": 61010 }, { "epoch": 9.953507340946166, "grad_norm": 0.12489716708660126, "learning_rate": 3.3153261382729008e-09, "loss": 0.1842, "num_input_tokens_seen": 131655824, "step": 61015 }, { "epoch": 9.954323001631321, "grad_norm": 0.24719005823135376, "learning_rate": 3.2004213867009582e-09, "loss": 0.0044, "num_input_tokens_seen": 131666640, "step": 61020 }, { "epoch": 9.955138662316477, "grad_norm": 0.08298831433057785, "learning_rate": 3.0875430025362197e-09, "loss": 0.0048, "num_input_tokens_seen": 131675120, "step": 61025 }, { "epoch": 9.955954323001631, "grad_norm": 0.08427087217569351, "learning_rate": 2.9766909949296983e-09, "loss": 0.08, "num_input_tokens_seen": 131686096, "step": 61030 }, { "epoch": 9.956769983686787, "grad_norm": 0.05212075263261795, "learning_rate": 2.8678653728658746e-09, "loss": 0.0041, "num_input_tokens_seen": 131697968, "step": 61035 }, { "epoch": 9.95758564437194, "grad_norm": 0.06858661770820618, "learning_rate": 2.761066145168245e-09, "loss": 0.1066, "num_input_tokens_seen": 131709168, "step": 61040 }, { "epoch": 9.958401305057096, "grad_norm": 0.09093804657459259, "learning_rate": 2.656293320490999e-09, "loss": 0.2007, "num_input_tokens_seen": 131720656, "step": 61045 }, { "epoch": 9.959216965742252, "grad_norm": 0.3559302091598511, "learning_rate": 2.5535469073301176e-09, "loss": 0.0086, "num_input_tokens_seen": 131731536, "step": 61050 }, { "epoch": 9.960032626427406, "grad_norm": 0.1537299007177353, "learning_rate": 2.4528269140150497e-09, "loss": 0.0055, "num_input_tokens_seen": 131742928, "step": 61055 }, { "epoch": 9.960848287112562, "grad_norm": 0.0762186124920845, "learning_rate": 2.354133348711485e-09, "loss": 0.0622, "num_input_tokens_seen": 131752496, "step": 61060 }, { "epoch": 9.961663947797716, "grad_norm": 0.09247704595327377, "learning_rate": 2.2574662194158047e-09, "loss": 0.0055, "num_input_tokens_seen": 131764048, "step": 61065 }, { "epoch": 9.962479608482871, "grad_norm": 0.3819979727268219, "learning_rate": 2.1628255339689596e-09, "loss": 0.0688, "num_input_tokens_seen": 131774352, "step": 61070 }, { "epoch": 9.963295269168025, "grad_norm": 0.5185974836349487, "learning_rate": 2.0702113000425903e-09, "loss": 0.0081, "num_input_tokens_seen": 131783696, "step": 61075 }, { "epoch": 9.964110929853181, "grad_norm": 0.06221344321966171, "learning_rate": 1.979623525141805e-09, "loss": 0.0128, "num_input_tokens_seen": 131795696, "step": 61080 }, { "epoch": 9.964926590538337, "grad_norm": 0.1394263356924057, "learning_rate": 1.891062216610728e-09, "loss": 0.0053, "num_input_tokens_seen": 131805616, "step": 61085 }, { "epoch": 9.96574225122349, "grad_norm": 5.402780055999756, "learning_rate": 1.804527381629728e-09, "loss": 0.1396, "num_input_tokens_seen": 131815056, "step": 61090 }, { "epoch": 9.966557911908646, "grad_norm": 0.06656648963689804, "learning_rate": 1.7200190272126382e-09, "loss": 0.0044, "num_input_tokens_seen": 131825392, "step": 61095 }, { "epoch": 9.9673735725938, "grad_norm": 5.79808235168457, "learning_rate": 1.6375371602123103e-09, "loss": 0.0852, "num_input_tokens_seen": 131836144, "step": 61100 }, { "epoch": 9.968189233278956, "grad_norm": 0.22231650352478027, "learning_rate": 1.5570817873122868e-09, "loss": 0.0615, "num_input_tokens_seen": 131846672, "step": 61105 }, { "epoch": 9.969004893964112, "grad_norm": 3.6045851707458496, "learning_rate": 1.4786529150379036e-09, "loss": 0.3261, "num_input_tokens_seen": 131858544, "step": 61110 }, { "epoch": 9.969820554649266, "grad_norm": 0.045092545449733734, "learning_rate": 1.4022505497424122e-09, "loss": 0.097, "num_input_tokens_seen": 131869296, "step": 61115 }, { "epoch": 9.970636215334421, "grad_norm": 0.08182383328676224, "learning_rate": 1.3278746976236322e-09, "loss": 0.0165, "num_input_tokens_seen": 131879376, "step": 61120 }, { "epoch": 9.971451876019575, "grad_norm": 0.1155458465218544, "learning_rate": 1.255525364710075e-09, "loss": 0.0055, "num_input_tokens_seen": 131890768, "step": 61125 }, { "epoch": 9.97226753670473, "grad_norm": 0.12539751827716827, "learning_rate": 1.1852025568637183e-09, "loss": 0.0039, "num_input_tokens_seen": 131901104, "step": 61130 }, { "epoch": 9.973083197389887, "grad_norm": 0.03399152681231499, "learning_rate": 1.116906279791108e-09, "loss": 0.2061, "num_input_tokens_seen": 131912208, "step": 61135 }, { "epoch": 9.97389885807504, "grad_norm": 5.785745620727539, "learning_rate": 1.0506365390211547e-09, "loss": 0.1843, "num_input_tokens_seen": 131923472, "step": 61140 }, { "epoch": 9.974714518760196, "grad_norm": 4.4891533851623535, "learning_rate": 9.863933399328895e-10, "loss": 0.3352, "num_input_tokens_seen": 131934384, "step": 61145 }, { "epoch": 9.97553017944535, "grad_norm": 0.08644963800907135, "learning_rate": 9.241766877304825e-10, "loss": 0.1085, "num_input_tokens_seen": 131944784, "step": 61150 }, { "epoch": 9.976345840130506, "grad_norm": 3.782694101333618, "learning_rate": 8.639865874571218e-10, "loss": 0.0992, "num_input_tokens_seen": 131955088, "step": 61155 }, { "epoch": 9.977161500815662, "grad_norm": 0.06311924755573273, "learning_rate": 8.058230439950132e-10, "loss": 0.1173, "num_input_tokens_seen": 131966384, "step": 61160 }, { "epoch": 9.977977161500815, "grad_norm": 0.0469924658536911, "learning_rate": 7.496860620570534e-10, "loss": 0.0584, "num_input_tokens_seen": 131977200, "step": 61165 }, { "epoch": 9.978792822185971, "grad_norm": 0.06673408299684525, "learning_rate": 6.955756461951568e-10, "loss": 0.0748, "num_input_tokens_seen": 131988848, "step": 61170 }, { "epoch": 9.979608482871125, "grad_norm": 8.973872184753418, "learning_rate": 6.434918007947044e-10, "loss": 0.066, "num_input_tokens_seen": 132000752, "step": 61175 }, { "epoch": 9.98042414355628, "grad_norm": 0.1769551783800125, "learning_rate": 5.934345300773192e-10, "loss": 0.0055, "num_input_tokens_seen": 132010544, "step": 61180 }, { "epoch": 9.981239804241435, "grad_norm": 0.06666838377714157, "learning_rate": 5.454038381008664e-10, "loss": 0.0827, "num_input_tokens_seen": 132021552, "step": 61185 }, { "epoch": 9.98205546492659, "grad_norm": 0.1354232281446457, "learning_rate": 4.993997287622287e-10, "loss": 0.2025, "num_input_tokens_seen": 132032752, "step": 61190 }, { "epoch": 9.982871125611746, "grad_norm": 0.06022237241268158, "learning_rate": 4.554222057889801e-10, "loss": 0.0046, "num_input_tokens_seen": 132043536, "step": 61195 }, { "epoch": 9.9836867862969, "grad_norm": 0.4063657522201538, "learning_rate": 4.1347127274493635e-10, "loss": 0.0163, "num_input_tokens_seen": 132053840, "step": 61200 }, { "epoch": 9.984502446982056, "grad_norm": 0.08528797328472137, "learning_rate": 3.735469330301555e-10, "loss": 0.0111, "num_input_tokens_seen": 132064400, "step": 61205 }, { "epoch": 9.98531810766721, "grad_norm": 0.09119515120983124, "learning_rate": 3.356491898837133e-10, "loss": 0.1199, "num_input_tokens_seen": 132073552, "step": 61210 }, { "epoch": 9.986133768352365, "grad_norm": 0.06901831179857254, "learning_rate": 2.997780463753763e-10, "loss": 0.0062, "num_input_tokens_seen": 132084400, "step": 61215 }, { "epoch": 9.986949429037521, "grad_norm": 0.12367760390043259, "learning_rate": 2.659335054139289e-10, "loss": 0.0056, "num_input_tokens_seen": 132095856, "step": 61220 }, { "epoch": 9.987765089722675, "grad_norm": 0.05554777756333351, "learning_rate": 2.341155697471731e-10, "loss": 0.0772, "num_input_tokens_seen": 132108368, "step": 61225 }, { "epoch": 9.98858075040783, "grad_norm": 0.19484113156795502, "learning_rate": 2.043242419452751e-10, "loss": 0.1241, "num_input_tokens_seen": 132120432, "step": 61230 }, { "epoch": 9.989396411092985, "grad_norm": 0.039668235927820206, "learning_rate": 1.7655952443129675e-10, "loss": 0.024, "num_input_tokens_seen": 132131568, "step": 61235 }, { "epoch": 9.99021207177814, "grad_norm": 0.15951573848724365, "learning_rate": 1.5082141945343963e-10, "loss": 0.083, "num_input_tokens_seen": 132142992, "step": 61240 }, { "epoch": 9.991027732463294, "grad_norm": 0.0305704977363348, "learning_rate": 1.2710992909892306e-10, "loss": 0.0029, "num_input_tokens_seen": 132153520, "step": 61245 }, { "epoch": 9.99184339314845, "grad_norm": 0.09939110279083252, "learning_rate": 1.0542505528565727e-10, "loss": 0.0069, "num_input_tokens_seen": 132164368, "step": 61250 }, { "epoch": 9.992659053833606, "grad_norm": 0.0912177562713623, "learning_rate": 8.57667997788969e-11, "loss": 0.0032, "num_input_tokens_seen": 132174992, "step": 61255 }, { "epoch": 9.99347471451876, "grad_norm": 0.19032734632492065, "learning_rate": 6.813516416626087e-11, "loss": 0.0041, "num_input_tokens_seen": 132186320, "step": 61260 }, { "epoch": 9.994290375203915, "grad_norm": 0.11631111800670624, "learning_rate": 5.2530149877161315e-11, "loss": 0.0359, "num_input_tokens_seen": 132198000, "step": 61265 }, { "epoch": 9.99510603588907, "grad_norm": 0.09231505542993546, "learning_rate": 3.895175818002805e-11, "loss": 0.0831, "num_input_tokens_seen": 132209840, "step": 61270 }, { "epoch": 9.995921696574225, "grad_norm": 0.0757303237915039, "learning_rate": 2.7399990173981872e-11, "loss": 0.0034, "num_input_tokens_seen": 132220848, "step": 61275 }, { "epoch": 9.99673735725938, "grad_norm": 3.7855224609375, "learning_rate": 1.7874846797161228e-11, "loss": 0.2133, "num_input_tokens_seen": 132230384, "step": 61280 }, { "epoch": 9.997553017944535, "grad_norm": 0.0497620664536953, "learning_rate": 1.0376328818395564e-11, "loss": 0.1674, "num_input_tokens_seen": 132241200, "step": 61285 }, { "epoch": 9.99836867862969, "grad_norm": 4.879983901977539, "learning_rate": 4.90443684553199e-12, "loss": 0.2153, "num_input_tokens_seen": 132251664, "step": 61290 }, { "epoch": 9.999184339314844, "grad_norm": 0.039522819221019745, "learning_rate": 1.4591713254352712e-12, "loss": 0.0041, "num_input_tokens_seen": 132262576, "step": 61295 }, { "epoch": 10.0, "grad_norm": 0.16737177968025208, "learning_rate": 4.053253843672167e-14, "loss": 0.0941, "num_input_tokens_seen": 132272272, "step": 61300 }, { "epoch": 10.0, "eval_loss": 0.1923980414867401, "eval_runtime": 568.4728, "eval_samples_per_second": 4.794, "eval_steps_per_second": 1.2, "num_input_tokens_seen": 132272272, "step": 61300 }, { "epoch": 10.0, "num_input_tokens_seen": 132272272, "step": 61300, "total_flos": 5.956371922343952e+18, "train_loss": 0.11942618750450527, "train_runtime": 66936.7162, "train_samples_per_second": 3.663, "train_steps_per_second": 0.916 } ], "logging_steps": 5, "max_steps": 61300, "num_input_tokens_seen": 132272272, "num_train_epochs": 10, "save_steps": 3065, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.956371922343952e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }